diff --git a/docs/doc/03-develop/04-rust.md b/docs/doc/03-develop/04-rust.md index d86302c097eb..26e25c122bb3 100644 --- a/docs/doc/03-develop/04-rust.md +++ b/docs/doc/03-develop/04-rust.md @@ -66,23 +66,23 @@ async fn main() { let sql_db_create = "CREATE DATABASE IF NOT EXISTS book_db;"; conn.exec(sql_db_create).await.unwrap(); - let sql_table_create = "CREATE TABLE books ( + let sql_table_create = "CREATE TABLE book_db.books ( title VARCHAR, author VARCHAR, date VARCHAR );"; conn.exec(sql_table_create).await.unwrap(); - let sql_insert = "INSERT INTO books VALUES ('mybook', 'author', '2022');"; + let sql_insert = "INSERT INTO book_db.books VALUES ('mybook', 'author', '2022');"; conn.exec(sql_insert).await.unwrap(); - let mut rows = conn.query_iter("SELECT * FROM books;").await.unwrap(); + let mut rows = conn.query_iter("SELECT * FROM book_db.books;").await.unwrap(); while let Some(row) = rows.next().await { let (title, author, date): (String, String, String) = row.unwrap().try_into().unwrap(); println!("{} {} {}", title, author, date); } - let sql_table_drop = "DROP TABLE books;"; + let sql_table_drop = "DROP TABLE book_db.books;"; conn.exec(sql_table_drop).await.unwrap(); let sql_db_drop = "DROP DATABASE book_db;"; diff --git a/src/meta/api/src/id_generator.rs b/src/meta/api/src/id_generator.rs index d97337faca21..9b6d6c921fc5 100644 --- a/src/meta/api/src/id_generator.rs +++ b/src/meta/api/src/id_generator.rs @@ -16,6 +16,7 @@ use common_meta_kvapi::kvapi; use crate::background_api_keys::ID_GEN_BACKGROUND_JOB; use crate::data_mask_api_keys::ID_GEN_DATA_MASK; +use crate::schema_api_keys::ID_GEN_CATALOG; use crate::schema_api_keys::ID_GEN_DATABASE; use crate::schema_api_keys::ID_GEN_INDEX; use crate::schema_api_keys::ID_GEN_TABLE; @@ -86,6 +87,13 @@ impl IdGenerator { resource: ID_GEN_BACKGROUND_JOB.to_string(), } } + + /// Create a key for generating catalog id with kvapi::KVApi + pub fn catalog_id() -> Self { + Self { + resource: ID_GEN_CATALOG.to_string(), + } 
+ } } impl kvapi::Key for IdGenerator { diff --git a/src/meta/api/src/schema_api.rs b/src/meta/api/src/schema_api.rs index 576d0f0aa6f2..110d1f2720a7 100644 --- a/src/meta/api/src/schema_api.rs +++ b/src/meta/api/src/schema_api.rs @@ -16,6 +16,8 @@ use std::sync::Arc; use common_meta_app::schema::CountTablesReply; use common_meta_app::schema::CountTablesReq; +use common_meta_app::schema::CreateCatalogReply; +use common_meta_app::schema::CreateCatalogReq; use common_meta_app::schema::CreateDatabaseReply; use common_meta_app::schema::CreateDatabaseReq; use common_meta_app::schema::CreateIndexReply; @@ -216,5 +218,8 @@ pub trait SchemaApi: Send + Sync { async fn delete_table_lock_rev(&self, req: DeleteTableLockRevReq) -> Result<(), KVAppError>; + async fn create_catalog(&self, req: CreateCatalogReq) + -> Result; + fn name(&self) -> String; } diff --git a/src/meta/api/src/schema_api_impl.rs b/src/meta/api/src/schema_api_impl.rs index 93bdbd4e7caa..688112aa732e 100644 --- a/src/meta/api/src/schema_api_impl.rs +++ b/src/meta/api/src/schema_api_impl.rs @@ -19,6 +19,7 @@ use std::sync::Arc; use chrono::DateTime; use chrono::Utc; use common_meta_app::app_error::AppError; +use common_meta_app::app_error::CatalogAlreadyExists; use common_meta_app::app_error::CreateDatabaseWithDropTime; use common_meta_app::app_error::CreateIndexWithDropTime; use common_meta_app::app_error::CreateTableWithDropTime; @@ -45,9 +46,13 @@ use common_meta_app::app_error::UnknownTableId; use common_meta_app::app_error::VirtualColumnAlreadyExists; use common_meta_app::app_error::WrongShare; use common_meta_app::app_error::WrongShareObject; +use common_meta_app::schema::CatalogId; +use common_meta_app::schema::CatalogIdToName; use common_meta_app::schema::CountTablesKey; use common_meta_app::schema::CountTablesReply; use common_meta_app::schema::CountTablesReq; +use common_meta_app::schema::CreateCatalogReply; +use common_meta_app::schema::CreateCatalogReq; use 
common_meta_app::schema::CreateDatabaseReply; use common_meta_app::schema::CreateDatabaseReq; use common_meta_app::schema::CreateIndexReply; @@ -2990,6 +2995,83 @@ impl> SchemaApi for KV { Ok(()) } + async fn create_catalog( + &self, + req: CreateCatalogReq, + ) -> Result { + debug!(req = debug(&req), "SchemaApi: {}", func_name!()); + + let name_key = &req.name_ident; + + let ctx = &func_name!(); + + let mut trials = txn_trials(None, ctx); + + let catalog_id = loop { + trials.next().unwrap()?; + + // Get catalog by name to ensure absence + let (catalog_id_seq, catalog_id) = get_u64_value(self, name_key).await?; + debug!(catalog_id_seq, catalog_id, ?name_key, "get_catalog"); + + if catalog_id_seq > 0 { + return if req.if_not_exists { + Ok(CreateCatalogReply { catalog_id }) + } else { + Err(KVAppError::AppError(AppError::CatalogAlreadyExists( + CatalogAlreadyExists::new( + &name_key.catalog_name, + format!("create catalog: tenant: {}", name_key.tenant), + ), + ))) + }; + } + + // Create catalog by inserting these record: + // (tenant, catalog_name) -> catalog_id + // (catalog_id) -> catalog_meta + // (catalog_id) -> (tenant, catalog_name) + let catalog_id = fetch_id(self, IdGenerator::catalog_id()).await?; + let id_key = CatalogId { catalog_id }; + let id_to_name_key = CatalogIdToName { catalog_id }; + + debug!(catalog_id, name_key = debug(&name_key), "new catalog id"); + + { + let condition = vec![ + txn_cond_seq(name_key, Eq, 0), + txn_cond_seq(&id_to_name_key, Eq, 0), + ]; + let if_then = vec![ + txn_op_put(name_key, serialize_u64(catalog_id)?), /* (tenant, catalog_name) -> catalog_id */ + txn_op_put(&id_key, serialize_struct(&req.meta)?), /* (catalog_id) -> catalog_meta */ + txn_op_put(&id_to_name_key, serialize_struct(name_key)?), /* __fd_catalog_id_to_name/ -> (tenant,catalog_name) */ + ]; + + let txn_req = TxnRequest { + condition, + if_then, + else_then: vec![], + }; + + let (succ, _) = send_txn(self, txn_req).await?; + + debug!( + name = debug(&name_key), + 
id = debug(&id_key), + succ = display(succ), + "create_catalog" + ); + + if succ { + break catalog_id; + } + } + }; + + Ok(CreateCatalogReply { catalog_id }) + } + fn name(&self) -> String { "SchemaApiImpl".to_string() } diff --git a/src/meta/api/src/schema_api_keys.rs b/src/meta/api/src/schema_api_keys.rs index f78ec8f2d7c1..48f78e33bcb1 100644 --- a/src/meta/api/src/schema_api_keys.rs +++ b/src/meta/api/src/schema_api_keys.rs @@ -18,3 +18,5 @@ pub(crate) const ID_GEN_TABLE: &str = "table_id"; pub(crate) const ID_GEN_DATABASE: &str = "database_id"; pub(crate) const ID_GEN_TABLE_LOCK: &str = "table_lock_id"; pub(crate) const ID_GEN_INDEX: &str = "index_id"; + +pub(crate) const ID_GEN_CATALOG: &str = "catalog_id"; diff --git a/src/meta/app/src/app_error.rs b/src/meta/app/src/app_error.rs index be3851a65db7..141032ec33f3 100644 --- a/src/meta/app/src/app_error.rs +++ b/src/meta/app/src/app_error.rs @@ -42,6 +42,22 @@ impl DatabaseAlreadyExists { } } +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq, thiserror::Error)] +#[error("CatalogAlreadyExists: `{catalog_name}` while `{context}`")] +pub struct CatalogAlreadyExists { + catalog_name: String, + context: String, +} + +impl CatalogAlreadyExists { + pub fn new(catalog_name: impl Into, context: impl Into) -> Self { + Self { + catalog_name: catalog_name.into(), + context: context.into(), + } + } +} + #[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq, thiserror::Error)] #[error("DatamaskAlreadyExists: `{name}` while `{context}`")] pub struct DatamaskAlreadyExists { @@ -760,6 +776,9 @@ pub enum AppError { #[error(transparent)] DatabaseAlreadyExists(#[from] DatabaseAlreadyExists), + #[error(transparent)] + CatalogAlreadyExists(#[from] CatalogAlreadyExists), + #[error(transparent)] CreateDatabaseWithDropTime(#[from] CreateDatabaseWithDropTime), @@ -891,6 +910,12 @@ impl AppErrorMessage for DatabaseAlreadyExists { } } +impl AppErrorMessage for CatalogAlreadyExists { + fn message(&self) -> String { + 
format!("Catalog '{}' already exists", self.catalog_name) + } +} + impl AppErrorMessage for CreateDatabaseWithDropTime { fn message(&self) -> String { format!("Create database '{}' with drop time", self.db_name) @@ -1153,6 +1178,7 @@ impl From for ErrorCode { AppError::UnknownTableId(err) => ErrorCode::UnknownTableId(err.message()), AppError::UnknownTable(err) => ErrorCode::UnknownTable(err.message()), AppError::DatabaseAlreadyExists(err) => ErrorCode::DatabaseAlreadyExists(err.message()), + AppError::CatalogAlreadyExists(err) => ErrorCode::CatalogAlreadyExists(err.message()), AppError::CreateDatabaseWithDropTime(err) => { ErrorCode::CreateDatabaseWithDropTime(err.message()) } diff --git a/src/meta/app/src/principal/user_stage.rs b/src/meta/app/src/principal/user_stage.rs index b38441d28689..205b9e8f7206 100644 --- a/src/meta/app/src/principal/user_stage.rs +++ b/src/meta/app/src/principal/user_stage.rs @@ -61,7 +61,7 @@ pub enum StageType { /// LegacyInternal will be deprecated. /// /// Please never use this variant except in `proto_conv`. We keep this - /// stage type for backword compatible. + /// stage type for backward compatible. /// /// TODO(xuanwo): remove this when we are releasing v0.9. 
LegacyInternal, @@ -135,7 +135,7 @@ impl FromStr for StageFileCompression { "xz" => Ok(StageFileCompression::Xz), "none" => Ok(StageFileCompression::None), _ => Err("Unknown file compression type, must one of { auto | gzip | bz2 | brotli | zstd | deflate | raw_deflate | lzo | snappy | xz | none }" - .to_string()), + .to_string()), } } } diff --git a/src/meta/app/src/schema/catalog.rs b/src/meta/app/src/schema/catalog.rs index 1555afd212a9..b0405cd31694 100644 --- a/src/meta/app/src/schema/catalog.rs +++ b/src/meta/app/src/schema/catalog.rs @@ -36,46 +36,77 @@ impl Display for CatalogType { } } -/// Option for creating a iceberg catalog -#[derive(Clone, Debug, PartialEq, Eq)] -pub struct IcebergCatalogOption { - pub storage_params: Box, -} - /// different options for creating catalogs -#[derive(Debug, Clone, PartialEq, Eq)] +#[derive(serde::Serialize, serde::Deserialize, Debug, Clone, PartialEq, Eq)] pub enum CatalogOption { // hms_address - Hive(String), + Hive(HiveCatalogOption), // Uri location for iceberg Iceberg(IcebergCatalogOption), } -#[derive(Clone, Debug, PartialEq, Eq)] +/// Option for creating a iceberg catalog +#[derive(serde::Serialize, serde::Deserialize, Clone, Debug, PartialEq, Eq)] +pub struct HiveCatalogOption { + pub address: String, +} + +/// Option for creating a iceberg catalog +#[derive(serde::Serialize, serde::Deserialize, Clone, Debug, PartialEq, Eq)] +pub struct IcebergCatalogOption { + pub storage_params: Box, +} + +#[derive(serde::Serialize, serde::Deserialize, Clone, Debug, PartialEq, Eq)] pub struct CatalogMeta { pub catalog_option: CatalogOption, pub created_on: DateTime, } -#[derive(Clone, Debug, PartialEq, Eq)] +#[derive(serde::Serialize, serde::Deserialize, Clone, Debug, PartialEq, Eq)] pub struct CatalogNameIdent { pub tenant: String, pub catalog_name: String, } +#[derive(serde::Serialize, serde::Deserialize, Clone, Debug, Default, Eq, PartialEq)] +pub struct CatalogId { + pub catalog_id: u64, +} + impl Display for 
CatalogNameIdent { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { write!(f, "'{}'/'{}'", self.tenant, self.catalog_name) } } -#[derive(Clone, Debug, PartialEq, Eq)] +#[derive(serde::Serialize, serde::Deserialize, Clone, Debug, Default, Eq, PartialEq)] +pub struct CatalogIdToName { + pub catalog_id: u64, +} + +impl Display for CatalogIdToName { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{}", self.catalog_id) + } +} + +#[derive(serde::Serialize, serde::Deserialize, Clone, Debug, PartialEq, Eq)] pub struct CreateCatalogReq { pub if_not_exists: bool, pub name_ident: CatalogNameIdent, pub meta: CatalogMeta, } +impl CreateCatalogReq { + pub fn tenant(&self) -> &str { + &self.name_ident.tenant + } + pub fn catalog_name(&self) -> &str { + &self.name_ident.catalog_name + } +} + impl Display for CreateCatalogReq { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { write!( @@ -86,6 +117,11 @@ impl Display for CreateCatalogReq { } } +#[derive(serde::Serialize, serde::Deserialize, Clone, Debug, Eq, PartialEq)] +pub struct CreateCatalogReply { + pub catalog_id: u64, +} + #[derive(Clone, Debug, PartialEq, Eq)] pub struct DropCatalogReq { pub if_exists: bool, @@ -101,3 +137,79 @@ impl Display for DropCatalogReq { ) } } + +mod kvapi_key_impl { + use common_meta_kvapi::kvapi; + + use super::CatalogId; + use super::CatalogIdToName; + use super::CatalogNameIdent; + use crate::schema::PREFIX_CATALOG; + use crate::schema::PREFIX_CATALOG_BY_ID; + use crate::schema::PREFIX_CATALOG_ID_TO_NAME; + + /// __fd_catalog// -> + impl kvapi::Key for CatalogNameIdent { + const PREFIX: &'static str = PREFIX_CATALOG; + + fn to_string_key(&self) -> String { + kvapi::KeyBuilder::new_prefixed(Self::PREFIX) + .push_str(&self.tenant) + .push_str(&self.catalog_name) + .done() + } + + fn from_str_key(s: &str) -> Result { + let mut p = kvapi::KeyParser::new_prefixed(s, Self::PREFIX)?; + + let tenant = p.next_str()?; + let 
catalog_name = p.next_str()?; + p.done()?; + + Ok(CatalogNameIdent { + tenant, + catalog_name, + }) + } + } + + /// "__fd_catalog_by_id/" + impl kvapi::Key for CatalogId { + const PREFIX: &'static str = PREFIX_CATALOG_BY_ID; + + fn to_string_key(&self) -> String { + kvapi::KeyBuilder::new_prefixed(Self::PREFIX) + .push_u64(self.catalog_id) + .done() + } + + fn from_str_key(s: &str) -> Result { + let mut p = kvapi::KeyParser::new_prefixed(s, Self::PREFIX)?; + + let catalog_id = p.next_u64()?; + p.done()?; + + Ok(CatalogId { catalog_id }) + } + } + + /// "__fd_catalog_id_to_name/ -> CatalogNameIdent" + impl kvapi::Key for CatalogIdToName { + const PREFIX: &'static str = PREFIX_CATALOG_ID_TO_NAME; + + fn to_string_key(&self) -> String { + kvapi::KeyBuilder::new_prefixed(Self::PREFIX) + .push_u64(self.catalog_id) + .done() + } + + fn from_str_key(s: &str) -> Result { + let mut p = kvapi::KeyParser::new_prefixed(s, Self::PREFIX)?; + + let catalog_id = p.next_u64()?; + p.done()?; + + Ok(CatalogIdToName { catalog_id }) + } + } +} diff --git a/src/meta/app/src/schema/mod.rs b/src/meta/app/src/schema/mod.rs index 94e4ee02d9e4..e05085a25694 100644 --- a/src/meta/app/src/schema/mod.rs +++ b/src/meta/app/src/schema/mod.rs @@ -20,13 +20,7 @@ mod index; mod table; mod virtual_column; -pub use catalog::CatalogMeta; -pub use catalog::CatalogNameIdent; -pub use catalog::CatalogOption; -pub use catalog::CatalogType; -pub use catalog::CreateCatalogReq; -pub use catalog::DropCatalogReq; -pub use catalog::IcebergCatalogOption; +pub use catalog::*; pub use database::CreateDatabaseReply; pub use database::CreateDatabaseReq; pub use database::DatabaseId; @@ -101,6 +95,10 @@ pub use virtual_column::UpdateVirtualColumnReq; pub use virtual_column::VirtualColumnMeta; pub use virtual_column::VirtualColumnNameIdent; +const PREFIX_CATALOG: &str = "__fd_catalog"; +const PREFIX_CATALOG_BY_ID: &str = "__fd_catalog_by_id"; +const PREFIX_CATALOG_ID_TO_NAME: &str = "__fd_catalog_id_to_name"; + const 
PREFIX_DB_ID_LIST: &str = "__fd_db_id_list"; const PREFIX_DATABASE: &str = "__fd_database"; const PREFIX_DATABASE_BY_ID: &str = "__fd_database_by_id"; diff --git a/src/meta/proto-conv/src/catalog_from_to_protobuf_impl.rs b/src/meta/proto-conv/src/catalog_from_to_protobuf_impl.rs new file mode 100644 index 000000000000..5db784cfede1 --- /dev/null +++ b/src/meta/proto-conv/src/catalog_from_to_protobuf_impl.rs @@ -0,0 +1,129 @@ +// Copyright 2021 Datafuse Labs +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//! This mod is the key point about compatibility. +//! Everytime update anything in this file, update the `VER` and let the tests pass. 
+ +use chrono::DateTime; +use chrono::Utc; +use common_meta_app::schema as mt; +use common_meta_app::schema::CatalogOption; +use common_meta_app::schema::HiveCatalogOption; +use common_meta_app::schema::IcebergCatalogOption; +use common_meta_app::storage::StorageParams; +use common_protos::pb; + +use crate::reader_check_msg; +use crate::FromToProto; +use crate::Incompatible; +use crate::MIN_READER_VER; +use crate::VER; + +impl FromToProto for mt::CatalogNameIdent { + type PB = pb::CatalogNameIdent; + fn get_pb_ver(p: &Self::PB) -> u64 { + p.ver + } + fn from_pb(p: pb::CatalogNameIdent) -> Result { + reader_check_msg(p.ver, p.min_reader_ver)?; + + let v = Self { + tenant: p.tenant, + catalog_name: p.catalog_name, + }; + Ok(v) + } + + fn to_pb(&self) -> Result { + let p = pb::CatalogNameIdent { + ver: VER, + min_reader_ver: MIN_READER_VER, + tenant: self.tenant.clone(), + catalog_name: self.catalog_name.clone(), + }; + Ok(p) + } +} + +impl FromToProto for mt::CatalogMeta { + type PB = pb::CatalogMeta; + + fn get_pb_ver(p: &Self::PB) -> u64 { + p.ver + } + + fn from_pb(p: pb::CatalogMeta) -> Result { + reader_check_msg(p.ver, p.min_reader_ver)?; + + let option = p + .option + .ok_or_else(|| Incompatible { + reason: "CatalogMeta.option is None".to_string(), + })? 
+ .catalog_option + .ok_or_else(|| Incompatible { + reason: "CatalogMeta.option.catalog_option is None".to_string(), + })?; + + let v = Self { + catalog_option: match option { + pb::catalog_option::CatalogOption::Hive(v) => { + CatalogOption::Hive(HiveCatalogOption { address: v.address }) + } + pb::catalog_option::CatalogOption::Iceberg(v) => { + CatalogOption::Iceberg(IcebergCatalogOption { + storage_params: Box::new(StorageParams::from_pb( + v.storage_params.ok_or_else(|| Incompatible { + reason: "CatalogMeta.option.catalog_option.iceberg.StorageParams is None".to_string(), + })?, + )?), + }) + } + }, + created_on: DateTime::::from_pb(p.created_on)?, + }; + + Ok(v) + } + + fn to_pb(&self) -> Result { + let p = pb::CatalogMeta { + ver: VER, + min_reader_ver: MIN_READER_VER, + option: match self.catalog_option.clone() { + CatalogOption::Hive(v) => Some(pb::CatalogOption { + catalog_option: Some(pb::catalog_option::CatalogOption::Hive( + pb::HiveCatalogOption { + ver: VER, + min_reader_ver: MIN_READER_VER, + address: v.address, + }, + )), + }), + CatalogOption::Iceberg(v) => Some(pb::CatalogOption { + catalog_option: Some(pb::catalog_option::CatalogOption::Iceberg( + pb::IcebergCatalogOption { + ver: VER, + min_reader_ver: MIN_READER_VER, + storage_params: Some(v.storage_params.to_pb()?), + }, + )), + }), + }, + created_on: self.created_on.to_pb()?, + }; + + Ok(p) + } +} diff --git a/src/meta/proto-conv/src/lib.rs b/src/meta/proto-conv/src/lib.rs index ba3a9fc6fa1b..c0fe0cc830b6 100644 --- a/src/meta/proto-conv/src/lib.rs +++ b/src/meta/proto-conv/src/lib.rs @@ -62,6 +62,7 @@ mod background_job_from_to_protobuf_impl; mod background_task_from_to_protobuf_impl; +mod catalog_from_to_protobuf_impl; mod config_from_to_protobuf_impl; mod data_mask_from_to_protobuf_impl; mod database_from_to_protobuf_impl; diff --git a/src/meta/proto-conv/src/util.rs b/src/meta/proto-conv/src/util.rs index 7cb1d1696653..f1bad61e31ed 100644 --- a/src/meta/proto-conv/src/util.rs +++ 
b/src/meta/proto-conv/src/util.rs @@ -75,7 +75,8 @@ const META_CHANGE_LOG: &[(u64, &str)] = &[ (43, "2023-06-05: Add fields `number_of_segments` and `number_of_blocks` to TableStatistics", ), (44, "2023-06-07: Add: metadata.proto/ComputedExpr", ), (45, "2023-06-06: Add: background_tasks.proto and background_jobs.proto", ), - (46, "2023-06-28: Add: index.proto/IndexMeta::updated_on", ) + (46, "2023-06-28: Add: index.proto/IndexMeta::updated_on", ), + (47, "2023-07-03: Add: catalog.proto/CatalogMeta",), // Dear developer: // If you're gonna add a new metadata version, you'll have to add a test for it. // You could just copy an existing test file(e.g., `../tests/it/v024_table_meta.rs`) diff --git a/src/meta/proto-conv/tests/it/main.rs b/src/meta/proto-conv/tests/it/main.rs index 6bfdba403f53..c18f0baa804e 100644 --- a/src/meta/proto-conv/tests/it/main.rs +++ b/src/meta/proto-conv/tests/it/main.rs @@ -51,3 +51,4 @@ mod v043_table_statistics; mod v044_table_meta; mod v045_background; mod v046_index_meta; +mod v047_catalog_meta; diff --git a/src/meta/proto-conv/tests/it/proto_conv.rs b/src/meta/proto-conv/tests/it/proto_conv.rs index 06977328b528..1007f789b4f5 100644 --- a/src/meta/proto-conv/tests/it/proto_conv.rs +++ b/src/meta/proto-conv/tests/it/proto_conv.rs @@ -27,8 +27,11 @@ use common_expression::TableDataType; use common_expression::TableField; use common_expression::TableSchema; use common_meta_app::schema as mt; +use common_meta_app::schema::CatalogOption; +use common_meta_app::schema::IcebergCatalogOption; use common_meta_app::schema::IndexType; use common_meta_app::share; +use common_meta_app::storage::StorageS3Config; use common_proto_conv::FromToProto; use common_proto_conv::Incompatible; use common_proto_conv::VER; @@ -281,6 +284,24 @@ fn new_table_statistics() -> common_meta_app::schema::TableStatistics { } } +fn new_catalog_meta() -> common_meta_app::schema::CatalogMeta { + common_meta_app::schema::CatalogMeta { + catalog_option: 
CatalogOption::Iceberg(IcebergCatalogOption { + storage_params: Box::new(common_meta_app::storage::StorageParams::S3( + StorageS3Config { + endpoint_url: "http://127.0.0.1:9900".to_string(), + region: "hello".to_string(), + bucket: "world".to_string(), + access_key_id: "databend_has_super_power".to_string(), + secret_access_key: "databend_has_super_power".to_string(), + ..Default::default() + }, + )), + }), + created_on: Utc.with_ymd_and_hms(2014, 11, 28, 12, 0, 9).unwrap(), + } +} + #[test] fn test_pb_from_to() -> anyhow::Result<()> { let db = new_db_meta(); @@ -460,5 +481,15 @@ fn test_build_pb_buf() -> anyhow::Result<()> { println!("table statistics:{:?}", buf); } + // catalog meta + { + let catalog_meta = new_catalog_meta(); + let p = catalog_meta.to_pb()?; + + let mut buf = vec![]; + common_protos::prost::Message::encode(&p, &mut buf)?; + println!("catalog catalog_meta:{:?}", buf); + } + Ok(()) } diff --git a/src/meta/proto-conv/tests/it/v047_catalog_meta.rs b/src/meta/proto-conv/tests/it/v047_catalog_meta.rs new file mode 100644 index 000000000000..61dbcaf0a750 --- /dev/null +++ b/src/meta/proto-conv/tests/it/v047_catalog_meta.rs @@ -0,0 +1,65 @@ +// Copyright 2021 Datafuse Labs. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use chrono::TimeZone; +use chrono::Utc; +use common_meta_app::schema::CatalogOption; +use common_meta_app::schema::IcebergCatalogOption; +use common_meta_app::storage::StorageS3Config; + +use crate::common; + +// These bytes are built when a new version in introduced, +// and are kept for backward compatibility test. +// +// ************************************************************* +// * These messages should never be updated, * +// * only be added when a new version is added, * +// * or be removed when an old version is no longer supported. * +// ************************************************************* +// +// The message bytes are built from the output of `proto_conv::test_build_pb_buf()` +#[test] +fn test_decode_v47_catalog() -> anyhow::Result<()> { + let catalog_v047 = vec![ + 18, 107, 26, 105, 10, 97, 10, 95, 10, 5, 104, 101, 108, 108, 111, 18, 21, 104, 116, 116, + 112, 58, 47, 47, 49, 50, 55, 46, 48, 46, 48, 46, 49, 58, 57, 57, 48, 48, 26, 24, 100, 97, + 116, 97, 98, 101, 110, 100, 95, 104, 97, 115, 95, 115, 117, 112, 101, 114, 95, 112, 111, + 119, 101, 114, 34, 24, 100, 97, 116, 97, 98, 101, 110, 100, 95, 104, 97, 115, 95, 115, 117, + 112, 101, 114, 95, 112, 111, 119, 101, 114, 42, 5, 119, 111, 114, 108, 100, 160, 6, 47, + 168, 6, 24, 160, 6, 47, 168, 6, 24, 162, 1, 23, 50, 48, 49, 52, 45, 49, 49, 45, 50, 56, 32, + 49, 50, 58, 48, 48, 58, 48, 57, 32, 85, 84, 67, 160, 6, 47, 168, 6, 24, + ]; + + let want = || common_meta_app::schema::CatalogMeta { + catalog_option: CatalogOption::Iceberg(IcebergCatalogOption { + storage_params: Box::new(common_meta_app::storage::StorageParams::S3( + StorageS3Config { + endpoint_url: "http://127.0.0.1:9900".to_string(), + region: "hello".to_string(), + bucket: "world".to_string(), + access_key_id: "databend_has_super_power".to_string(), + secret_access_key: "databend_has_super_power".to_string(), + ..Default::default() + }, + )), + }), + created_on: Utc.with_ymd_and_hms(2014, 11, 28, 12, 0, 9).unwrap(), + }; + + 
common::test_pb_from_to(func_name!(), want())?; + common::test_load_old(func_name!(), catalog_v047.as_slice(), 47, want())?; + + Ok(()) +} diff --git a/src/meta/protos/proto/catalog.proto b/src/meta/protos/proto/catalog.proto new file mode 100644 index 000000000000..0efb490cd806 --- /dev/null +++ b/src/meta/protos/proto/catalog.proto @@ -0,0 +1,63 @@ +// Copyright 2022 Datafuse Labs. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +syntax = "proto3"; + +package databend_proto; + +import "config.proto"; + +message CatalogNameIdent { + uint64 ver = 100; + uint64 min_reader_ver = 101; + + // The user this db belongs to + string tenant = 1; + + // Catalog name + string catalog_name = 2; +} + +message CatalogMeta { + uint64 ver = 100; + uint64 min_reader_ver = 101; + + // catalog options + CatalogOption option = 2; + + // The time catalog created. 
+ string created_on = 20; +} + +message CatalogOption { + oneof catalog_option { + HiveCatalogOption hive = 2; + IcebergCatalogOption iceberg = 3; + } +} + +message HiveCatalogOption { + uint64 ver = 100; + uint64 min_reader_ver = 101; + + // Hive metastore thrift uri + string address = 1; +} + +message IcebergCatalogOption { + uint64 ver = 100; + uint64 min_reader_ver = 101; + + StorageConfig storage_params = 1; +} diff --git a/src/meta/protos/proto/database.proto b/src/meta/protos/proto/database.proto index 0cf8e5218b29..a1f0a81a7fe3 100644 --- a/src/meta/protos/proto/database.proto +++ b/src/meta/protos/proto/database.proto @@ -70,4 +70,4 @@ message DbIdList { uint64 min_reader_ver = 101; repeated uint64 ids = 1; -} \ No newline at end of file +} diff --git a/src/meta/protos/src/lib.rs b/src/meta/protos/src/lib.rs index 0a2680e45676..c617f839b45d 100644 --- a/src/meta/protos/src/lib.rs +++ b/src/meta/protos/src/lib.rs @@ -13,6 +13,7 @@ // limitations under the License. #[allow(clippy::derive_partial_eq_without_eq)] +#[allow(clippy::large_enum_variant)] /// ProtoBuf generated files. 
pub mod pb { diff --git a/src/query/ast/src/ast/format/ast_format.rs b/src/query/ast/src/ast/format/ast_format.rs index 2d3929c70c58..66486b11b5c4 100644 --- a/src/query/ast/src/ast/format/ast_format.rs +++ b/src/query/ast/src/ast/format/ast_format.rs @@ -1366,10 +1366,21 @@ impl<'ast> Visitor<'ast> for AstFormatVisitor { let action_format_ctx = AstFormatContext::new(action_name); FormatTreeNode::new(action_format_ctx) } - AlterTableAction::ModifyColumn { column, action: _ } => { + AlterTableAction::ModifyColumn { column, action } => { + let child_name = match action { + ModifyColumnAction::SetMaskingPolicy(mask_name) => { + format!("Action SetMaskingPolicy {}", mask_name) + } + ModifyColumnAction::ConvertStoredComputedColumn => { + "Action ConvertStoredComputedColumn".to_string() + } + }; + let child_format_ctx = AstFormatContext::new(child_name); + let child = FormatTreeNode::new(child_format_ctx); + let action_name = format!("Action ModifyColumn column {}", column); - let action_format_ctx = AstFormatContext::new(action_name); - FormatTreeNode::new(action_format_ctx) + let action_format_ctx = AstFormatContext::with_children(action_name, 1); + FormatTreeNode::with_children(action_format_ctx, vec![child]) } AlterTableAction::DropColumn { column } => { let action_name = format!("Action Drop column {}", column); diff --git a/src/query/ast/src/ast/statements/table.rs b/src/query/ast/src/ast/statements/table.rs index 2ffc6ddc2e3b..9d9e82fda774 100644 --- a/src/query/ast/src/ast/statements/table.rs +++ b/src/query/ast/src/ast/statements/table.rs @@ -671,12 +671,14 @@ impl Display for ColumnDefinition { #[derive(Debug, Clone, PartialEq)] pub enum ModifyColumnAction { SetMaskingPolicy(String), + ConvertStoredComputedColumn, } impl Display for ModifyColumnAction { fn fmt(&self, f: &mut Formatter) -> std::fmt::Result { match &self { ModifyColumnAction::SetMaskingPolicy(name) => write!(f, "SET MASKING POLICY {}", name)?, + ModifyColumnAction::ConvertStoredComputedColumn => 
write!(f, "DROP STORED")?, } Ok(()) diff --git a/src/query/ast/src/parser/statement.rs b/src/query/ast/src/parser/statement.rs index 7fc74ebd6d85..ced8b1f587e6 100644 --- a/src/query/ast/src/parser/statement.rs +++ b/src/query/ast/src/parser/statement.rs @@ -1567,15 +1567,15 @@ pub fn column_def(i: Input) -> IResult { ), map( rule! { - AS ~ ^"(" ~ ^#subexpr(NOT_PREC) ~ ^")" ~ VIRTUAL + (GENERATED ~ ALWAYS)? ~ AS ~ ^"(" ~ ^#subexpr(NOT_PREC) ~ ^")" ~ VIRTUAL }, - |(_, _, virtual_expr, _, _)| ColumnConstraint::VirtualExpr(Box::new(virtual_expr)), + |(_, _, _, virtual_expr, _, _)| ColumnConstraint::VirtualExpr(Box::new(virtual_expr)), ), map( rule! { - AS ~ "(" ~ ^#subexpr(NOT_PREC) ~ ^")" ~ STORED + (GENERATED ~ ALWAYS)? ~ AS ~ ^"(" ~ ^#subexpr(NOT_PREC) ~ ^")" ~ STORED }, - |(_, _, stored_expr, _, _)| ColumnConstraint::StoredExpr(Box::new(stored_expr)), + |(_, _, _, stored_expr, _, _)| ColumnConstraint::StoredExpr(Box::new(stored_expr)), ), )); @@ -1860,6 +1860,15 @@ pub fn alter_table_action(i: Input) -> IResult { action: ModifyColumnAction::SetMaskingPolicy(mask_name.to_string()), }, ); + let convert_stored_computed_column = map( + rule! { + MODIFY ~ COLUMN ~ #ident ~ DROP ~ STORED + }, + |(_, _, column, _, _)| AlterTableAction::ModifyColumn { + column, + action: ModifyColumnAction::ConvertStoredComputedColumn, + }, + ); let drop_column = map( rule! 
{ DROP ~ COLUMN ~ #ident @@ -1910,6 +1919,7 @@ pub fn alter_table_action(i: Input) -> IResult { | #add_column | #drop_column | #modify_column + | #convert_stored_computed_column | #alter_table_cluster_key | #drop_table_cluster_key | #recluster_table diff --git a/src/query/ast/src/parser/token.rs b/src/query/ast/src/parser/token.rs index 6c83a6cf16d4..da64d17cff46 100644 --- a/src/query/ast/src/parser/token.rs +++ b/src/query/ast/src/parser/token.rs @@ -303,6 +303,8 @@ pub enum TokenKind { SOME, #[token("ALTER", ignore(ascii_case))] ALTER, + #[token("ALWAYS", ignore(ascii_case))] + ALWAYS, #[token("ANALYZE", ignore(ascii_case))] ANALYZE, #[token("AND", ignore(ascii_case))] @@ -527,6 +529,8 @@ pub enum TokenKind { FUSE, #[token("GENERATE", ignore(ascii_case))] GENERATE, + #[token("GENERATED", ignore(ascii_case))] + GENERATED, #[token("GLOBAL", ignore(ascii_case))] GLOBAL, #[token("GRAPH", ignore(ascii_case))] diff --git a/src/query/ast/tests/it/parser.rs b/src/query/ast/tests/it/parser.rs index 8f1613984b33..82b314921a79 100644 --- a/src/query/ast/tests/it/parser.rs +++ b/src/query/ast/tests/it/parser.rs @@ -89,7 +89,7 @@ fn test_statement() { r#"create table if not exists a.b (c integer default 1 not null, b varchar) as select * from t;"#, r#"create table if not exists a.b (c tuple(m integer, n string), d tuple(integer, string));"#, r#"create table if not exists a.b (a string, b string, c string as (concat(a, ' ', b)) stored );"#, - r#"create table if not exists a.b (a int, b int, c int as (a + b) virtual );"#, + r#"create table if not exists a.b (a int, b int, c int generated always as (a + b) virtual );"#, r#"create table a.b like c.d;"#, r#"create table t like t2 engine = memory;"#, r#"create table if not exists a.b (a int) 's3://testbucket/admin/data/' connection=(aws_key_id='minioadmin' aws_secret_key='minioadmin' endpoint_url='http://127.0.0.1:9900');"#, diff --git a/src/query/ast/tests/it/testdata/statement-error.txt 
b/src/query/ast/tests/it/testdata/statement-error.txt index 3838d29545e5..a99a48d1ba70 100644 --- a/src/query/ast/tests/it/testdata/statement-error.txt +++ b/src/query/ast/tests/it/testdata/statement-error.txt @@ -5,7 +5,7 @@ error: --> SQL:1:38 | 1 | create table a.b (c integer not null 1, b float(10)) - | ------ ^ expected `)`, `NULL`, `NOT`, `DEFAULT`, `AS`, `COMMENT`, or 1 more ... + | ------ ^ expected `)`, `NULL`, `NOT`, `DEFAULT`, `GENERATED`, `AS`, or 2 more ... | | | while parsing `CREATE TABLE [IF NOT EXISTS] [.] [] []` @@ -17,7 +17,7 @@ error: --> SQL:1:24 | 1 | create table a (c float(10)) - | ------ ^ expected `)`, `NULL`, `NOT`, `DEFAULT`, `AS`, `COMMENT`, or 1 more ... + | ------ ^ expected `)`, `NULL`, `NOT`, `DEFAULT`, `GENERATED`, `AS`, or 2 more ... | | | while parsing `CREATE TABLE [IF NOT EXISTS] [.]
[] []` diff --git a/src/query/ast/tests/it/testdata/statement.txt b/src/query/ast/tests/it/testdata/statement.txt index bda1e896b867..ea0f27d6ecfe 100644 --- a/src/query/ast/tests/it/testdata/statement.txt +++ b/src/query/ast/tests/it/testdata/statement.txt @@ -1024,7 +1024,7 @@ CreateTable( ---------- Input ---------- -create table if not exists a.b (a int, b int, c int as (a + b) virtual ); +create table if not exists a.b (a int, b int, c int generated always as (a + b) virtual ); ---------- Output --------- CREATE TABLE IF NOT EXISTS a.b (a Int32 NOT NULL, b Int32 NOT NULL, c Int32 NOT NULL AS ((a + b)) VIRTUAL) ---------- AST ------------ @@ -1088,12 +1088,12 @@ CreateTable( Virtual( BinaryOp { span: Some( - 58..59, + 75..76, ), op: Plus, left: ColumnRef { span: Some( - 56..57, + 73..74, ), database: None, table: None, @@ -1102,14 +1102,14 @@ CreateTable( name: "a", quote: None, span: Some( - 56..57, + 73..74, ), }, ), }, right: ColumnRef { span: Some( - 60..61, + 77..78, ), database: None, table: None, @@ -1118,7 +1118,7 @@ CreateTable( name: "b", quote: None, span: Some( - 60..61, + 77..78, ), }, ), diff --git a/src/query/functions/src/lib.rs b/src/query/functions/src/lib.rs index 4b8e6035df4e..fa90e063a6b2 100644 --- a/src/query/functions/src/lib.rs +++ b/src/query/functions/src/lib.rs @@ -35,7 +35,7 @@ pub fn is_builtin_function(name: &str) -> bool { #[ctor] pub static BUILTIN_FUNCTIONS: FunctionRegistry = builtin_functions(); -pub const GENERAL_WINDOW_FUNCTIONS: [&str; 12] = [ +pub const GENERAL_WINDOW_FUNCTIONS: [&str; 13] = [ "row_number", "rank", "dense_rank", @@ -48,6 +48,7 @@ pub const GENERAL_WINDOW_FUNCTIONS: [&str; 12] = [ "last", "nth_value", "ntile", + "cume_dist", ]; fn builtin_functions() -> FunctionRegistry { diff --git a/src/query/service/src/catalogs/catalog_manager.rs b/src/query/service/src/catalogs/catalog_manager.rs index 0462aff172fd..5bccae5dc5bf 100644 --- a/src/query/service/src/catalogs/catalog_manager.rs +++ 
b/src/query/service/src/catalogs/catalog_manager.rs @@ -122,7 +122,7 @@ impl CatalogManagerHelper for CatalogManager { // when compiling without `hive` feature enabled // `address` will be seem as unused, which is not intentional #[allow(unused)] - CatalogOption::Hive(address) => { + CatalogOption::Hive(cfg) => { #[cfg(not(feature = "hive"))] { Err(ErrorCode::CatalogNotSupported( @@ -131,7 +131,7 @@ impl CatalogManagerHelper for CatalogManager { } #[cfg(feature = "hive")] { - let catalog: Arc = Arc::new(HiveCatalog::try_create(address)?); + let catalog: Arc = Arc::new(HiveCatalog::try_create(cfg.address)?); let ctl_name = &req.name_ident.catalog_name; let if_not_exists = req.if_not_exists; diff --git a/src/query/service/src/interpreters/interpreter_table_modify_column.rs b/src/query/service/src/interpreters/interpreter_table_modify_column.rs index a67d596e7e97..0efd3a72a5d9 100644 --- a/src/query/service/src/interpreters/interpreter_table_modify_column.rs +++ b/src/query/service/src/interpreters/interpreter_table_modify_column.rs @@ -19,6 +19,9 @@ use common_ast::ast::ModifyColumnAction; use common_catalog::table::Table; use common_exception::ErrorCode; use common_exception::Result; +use common_expression::ComputedExpr; +use common_expression::TableSchema; +use common_license::license::Feature::ComputedColumn; use common_license::license::Feature::DataMask; use common_license::license_manager::get_license_manager; use common_meta_app::schema::DatabaseType; @@ -93,6 +96,46 @@ impl ModifyTableColumnInterpreter { new_table_meta.column_mask_policy = Some(column_mask_policy); Ok(new_table_meta) } + + async fn do_convert_stored_computed_column( + &self, + table: &Arc, + table_meta: TableMeta, + ) -> Result { + let license_manager = get_license_manager(); + license_manager.manager.check_enterprise_enabled( + &self.ctx.get_settings(), + self.ctx.get_tenant(), + ComputedColumn, + )?; + + let schema = table.schema(); + let new_schema = if let Some((i, field)) = 
schema.column_with_name(&self.plan.column) { + match field.computed_expr { + Some(ComputedExpr::Stored(_)) => {} + _ => { + return Err(ErrorCode::UnknownColumn(format!( + "Column '{}' is not a stored computed column", + self.plan.column + ))); + } + } + let mut new_field = field.clone(); + new_field.computed_expr = None; + let mut fields = schema.fields().clone(); + fields[i] = new_field; + TableSchema::new_from(fields, schema.metadata.clone()) + } else { + return Err(ErrorCode::UnknownColumn(format!( + "Cannot find column {}", + self.plan.column + ))); + }; + + let mut new_table_meta = table_meta; + new_table_meta.schema = new_schema.into(); + Ok(new_table_meta) + } } #[async_trait::async_trait] @@ -144,6 +187,10 @@ impl Interpreter for ModifyTableColumnInterpreter { self.do_set_data_mask_policy(table, table_meta, mask_name.clone()) .await? } + ModifyColumnAction::ConvertStoredComputedColumn => { + self.do_convert_stored_computed_column(table, table_meta) + .await? + } }; let table_id = table_info.ident.table_id; diff --git a/src/query/service/src/pipelines/pipeline_builder.rs b/src/query/service/src/pipelines/pipeline_builder.rs index d5f983d646c2..dd50f56c3fb6 100644 --- a/src/query/service/src/pipelines/pipeline_builder.rs +++ b/src/query/service/src/pipelines/pipeline_builder.rs @@ -1223,12 +1223,15 @@ impl PipelineBuilder { fn build_join_probe(&mut self, join: &HashJoin, state: Arc) -> Result<()> { self.build_pipeline(&join.probe)?; + let max_block_size = self.ctx.get_settings().get_max_block_size()? 
as usize; + let func_ctx = self.ctx.get_function_context()?; self.main_pipeline.add_transform(|input, output| { let transform = TransformHashJoinProbe::create( - self.ctx.clone(), input, output, TransformHashJoinProbe::attach(state.clone())?, + max_block_size, + func_ctx.clone(), &join.join_type, !join.non_equi_conditions.is_empty(), )?; diff --git a/src/query/service/src/pipelines/processors/transforms/hash_join/common.rs b/src/query/service/src/pipelines/processors/transforms/hash_join/common.rs index 05ce65c56195..cfa199fad1d9 100644 --- a/src/query/service/src/pipelines/processors/transforms/hash_join/common.rs +++ b/src/query/service/src/pipelines/processors/transforms/hash_join/common.rs @@ -34,7 +34,6 @@ use common_hashtable::HashJoinHashtableLike; use common_hashtable::RowPtr; use common_sql::executor::cast_expr_to_non_null_boolean; -use super::desc::JOIN_MAX_BLOCK_SIZE; use super::desc::MARKER_KIND_FALSE; use super::desc::MARKER_KIND_NULL; use super::desc::MARKER_KIND_TRUE; @@ -73,6 +72,7 @@ impl JoinHashTable { } #[inline] + #[allow(clippy::too_many_arguments)] pub(crate) fn probe_key<'a, H: HashJoinHashtableLike>( &self, hash_table: &'a H, @@ -81,9 +81,10 @@ impl JoinHashTable { i: usize, vec_ptr: *mut RowPtr, occupied: usize, + max_block_size: usize, ) -> (usize, u64) { if valids.as_ref().map_or(true, |v| v.get_bit(i)) { - return hash_table.probe_hash_table(key, vec_ptr, occupied, JOIN_MAX_BLOCK_SIZE); + return hash_table.probe_hash_table(key, vec_ptr, occupied, max_block_size); } (0, 0) } diff --git a/src/query/service/src/pipelines/processors/transforms/hash_join/desc.rs b/src/query/service/src/pipelines/processors/transforms/hash_join/desc.rs index 0f37344fa07e..2fc304780682 100644 --- a/src/query/service/src/pipelines/processors/transforms/hash_join/desc.rs +++ b/src/query/service/src/pipelines/processors/transforms/hash_join/desc.rs @@ -23,7 +23,6 @@ use parking_lot::RwLock; use crate::sql::plans::JoinType; -pub const JOIN_MAX_BLOCK_SIZE: usize = 
65536; pub const MARKER_KIND_TRUE: u8 = 0; pub const MARKER_KIND_FALSE: u8 = 1; pub const MARKER_KIND_NULL: u8 = 2; diff --git a/src/query/service/src/pipelines/processors/transforms/hash_join/hash_join_state_impl.rs b/src/query/service/src/pipelines/processors/transforms/hash_join/hash_join_state_impl.rs index 9996516ddf6e..10428aed8f29 100644 --- a/src/query/service/src/pipelines/processors/transforms/hash_join/hash_join_state_impl.rs +++ b/src/query/service/src/pipelines/processors/transforms/hash_join/hash_join_state_impl.rs @@ -44,7 +44,6 @@ use ethnum::U256; use super::ProbeState; use crate::pipelines::processors::transforms::hash_join::desc::JoinState; -use crate::pipelines::processors::transforms::hash_join::desc::JOIN_MAX_BLOCK_SIZE; use crate::pipelines::processors::transforms::hash_join::desc::MARKER_KIND_FALSE; use crate::pipelines::processors::transforms::hash_join::desc::MARKER_KIND_NULL; use crate::pipelines::processors::transforms::hash_join::desc::MARKER_KIND_TRUE; @@ -63,7 +62,7 @@ impl HashJoinState for JoinHashTable { let mut buffer = self.row_space.buffer.write(); buffer.push(input); let buffer_row_size = buffer.iter().fold(0, |acc, x| acc + x.num_rows()); - if buffer_row_size < *self.data_block_size_limit { + if buffer_row_size < *self.build_side_block_size_limit { Ok(()) } else { let data_block = DataBlock::concat(buffer.as_slice())?; @@ -525,6 +524,7 @@ impl HashJoinState for JoinHashTable { task: usize, state: &mut ProbeState, ) -> Result> { + let max_block_size = state.max_block_size; let true_validity = &state.true_validity; let build_indexes = &mut state.build_indexes; let mut build_indexes_occupied = 0; @@ -551,7 +551,7 @@ impl HashJoinState for JoinHashTable { let outer_map_len = outer_map.len(); let mut row_index = 0; while row_index < outer_map_len { - while row_index < outer_map_len && build_indexes_occupied < JOIN_MAX_BLOCK_SIZE { + while row_index < outer_map_len && build_indexes_occupied < max_block_size { if 
!outer_map[row_index] { build_indexes[build_indexes_occupied].chunk_index = chunk_index as u32; build_indexes[build_indexes_occupied].row_index = row_index as u32; @@ -567,7 +567,7 @@ impl HashJoinState for JoinHashTable { if self.hash_join_desc.join_type == JoinType::Full { let num_rows = unmatched_build_block.num_rows(); - let nullable_unmatched_build_columns = if num_rows == JOIN_MAX_BLOCK_SIZE { + let nullable_unmatched_build_columns = if num_rows == max_block_size { unmatched_build_block .columns() .iter() @@ -601,6 +601,7 @@ impl HashJoinState for JoinHashTable { } fn right_semi_outer_scan(&self, task: usize, state: &mut ProbeState) -> Result> { + let max_block_size = state.max_block_size; let build_indexes = &mut state.build_indexes; let mut build_indexes_occupied = 0; let mut result_blocks = vec![]; @@ -626,7 +627,7 @@ impl HashJoinState for JoinHashTable { let outer_map_len = outer_map.len(); let mut row_index = 0; while row_index < outer_map_len { - while row_index < outer_map_len && build_indexes_occupied < JOIN_MAX_BLOCK_SIZE { + while row_index < outer_map_len && build_indexes_occupied < max_block_size { if outer_map[row_index] { build_indexes[build_indexes_occupied].chunk_index = chunk_index as u32; build_indexes[build_indexes_occupied].row_index = row_index as u32; @@ -645,6 +646,7 @@ impl HashJoinState for JoinHashTable { } fn right_anti_outer_scan(&self, task: usize, state: &mut ProbeState) -> Result> { + let max_block_size = state.max_block_size; let build_indexes = &mut state.build_indexes; let mut build_indexes_occupied = 0; let mut result_blocks = vec![]; @@ -670,7 +672,7 @@ impl HashJoinState for JoinHashTable { let outer_map_len = outer_map.len(); let mut row_index = 0; while row_index < outer_map_len { - while row_index < outer_map_len && build_indexes_occupied < JOIN_MAX_BLOCK_SIZE { + while row_index < outer_map_len && build_indexes_occupied < max_block_size { if !outer_map[row_index] { build_indexes[build_indexes_occupied].chunk_index = 
chunk_index as u32; build_indexes[build_indexes_occupied].row_index = row_index as u32; @@ -693,6 +695,7 @@ impl HashJoinState for JoinHashTable { } fn left_mark_scan(&self, task: usize, state: &mut ProbeState) -> Result> { + let max_block_size = state.max_block_size; let build_indexes = &mut state.build_indexes; let mut build_indexes_occupied = 0; let mut result_blocks = vec![]; @@ -720,7 +723,7 @@ impl HashJoinState for JoinHashTable { let markers_len = markers.len(); let mut row_index = 0; while row_index < markers_len { - let block_size = std::cmp::min(markers_len - row_index, JOIN_MAX_BLOCK_SIZE); + let block_size = std::cmp::min(markers_len - row_index, max_block_size); let mut validity = MutableBitmap::with_capacity(block_size); let mut boolean_bit_map = MutableBitmap::with_capacity(block_size); while build_indexes_occupied < block_size { diff --git a/src/query/service/src/pipelines/processors/transforms/hash_join/join_hash_table.rs b/src/query/service/src/pipelines/processors/transforms/hash_join/join_hash_table.rs index 6b1e5ed319ff..f5698b1ee96b 100644 --- a/src/query/service/src/pipelines/processors/transforms/hash_join/join_hash_table.rs +++ b/src/query/service/src/pipelines/processors/transforms/hash_join/join_hash_table.rs @@ -83,7 +83,7 @@ pub enum HashJoinHashTable { pub struct JoinHashTable { pub(crate) ctx: Arc, - pub(crate) data_block_size_limit: Arc, + pub(crate) build_side_block_size_limit: Arc, /// Reference count pub(crate) build_count: Mutex, pub(crate) finalize_count: Mutex, @@ -161,7 +161,9 @@ impl JoinHashTable { } Ok(Self { row_space: RowSpace::new(ctx.clone(), build_data_schema)?, - data_block_size_limit: Arc::new(ctx.get_settings().get_max_block_size()? as usize * 16), + build_side_block_size_limit: Arc::new( + ctx.get_settings().get_max_block_size()? 
as usize * 16, + ), ctx, build_count: Mutex::new(0), finalize_count: Mutex::new(0), diff --git a/src/query/service/src/pipelines/processors/transforms/hash_join/probe_join/inner_join.rs b/src/query/service/src/pipelines/processors/transforms/hash_join/probe_join/inner_join.rs index f20cb2c40d3f..d0055788853b 100644 --- a/src/query/service/src/pipelines/processors/transforms/hash_join/probe_join/inner_join.rs +++ b/src/query/service/src/pipelines/processors/transforms/hash_join/probe_join/inner_join.rs @@ -26,7 +26,6 @@ use common_functions::BUILTIN_FUNCTIONS; use common_hashtable::HashJoinHashtableLike; use common_sql::executor::cast_expr_to_non_null_boolean; -use crate::pipelines::processors::transforms::hash_join::desc::JOIN_MAX_BLOCK_SIZE; use crate::pipelines::processors::transforms::hash_join::ProbeState; use crate::pipelines::processors::JoinHashTable; @@ -42,6 +41,7 @@ impl JoinHashTable { IT: Iterator + TrustedLen, H::Key: 'a, { + let max_block_size = probe_state.max_block_size; let valids = &probe_state.valids; // The inner join will return multiple data blocks of similar size. let mut occupied = 0; @@ -62,14 +62,20 @@ impl JoinHashTable { for (i, key) in keys_iter.enumerate() { // If the join is derived from correlated subquery, then null equality is safe. 
- let (mut match_count, mut incomplete_ptr) = if self - .hash_join_desc - .from_correlated_subquery - { - hash_table.probe_hash_table(key, build_indexes_ptr, occupied, JOIN_MAX_BLOCK_SIZE) - } else { - self.probe_key(hash_table, key, valids, i, build_indexes_ptr, occupied) - }; + let (mut match_count, mut incomplete_ptr) = + if self.hash_join_desc.from_correlated_subquery { + hash_table.probe_hash_table(key, build_indexes_ptr, occupied, max_block_size) + } else { + self.probe_key( + hash_table, + key, + valids, + i, + build_indexes_ptr, + occupied, + max_block_size, + ) + }; if match_count == 0 { continue; } @@ -77,7 +83,7 @@ impl JoinHashTable { occupied += match_count; probe_indexes[probe_indexes_len] = (i as u32, match_count as u32); probe_indexes_len += 1; - if occupied >= JOIN_MAX_BLOCK_SIZE { + if occupied >= max_block_size { loop { probed_blocks.push( self.merge_eq_block( @@ -103,7 +109,7 @@ impl JoinHashTable { incomplete_ptr, build_indexes_ptr, occupied, - JOIN_MAX_BLOCK_SIZE, + max_block_size, ); if match_count == 0 { break; @@ -113,7 +119,7 @@ impl JoinHashTable { probe_indexes[probe_indexes_len] = (i as u32, match_count as u32); probe_indexes_len += 1; - if occupied < JOIN_MAX_BLOCK_SIZE { + if occupied < max_block_size { break; } } diff --git a/src/query/service/src/pipelines/processors/transforms/hash_join/probe_join/left_join.rs b/src/query/service/src/pipelines/processors/transforms/hash_join/probe_join/left_join.rs index 25827c59c1d3..b5c7f33136d4 100644 --- a/src/query/service/src/pipelines/processors/transforms/hash_join/probe_join/left_join.rs +++ b/src/query/service/src/pipelines/processors/transforms/hash_join/probe_join/left_join.rs @@ -25,7 +25,6 @@ use common_expression::Scalar; use common_expression::Value; use common_hashtable::HashJoinHashtableLike; -use crate::pipelines::processors::transforms::hash_join::desc::JOIN_MAX_BLOCK_SIZE; use crate::pipelines::processors::transforms::hash_join::ProbeState; use 
crate::pipelines::processors::JoinHashTable; use crate::sql::plans::JoinType; @@ -43,16 +42,13 @@ impl JoinHashTable { H::Key: 'a, { let input_num_rows = input.num_rows(); + let max_block_size = probe_state.max_block_size; let valids = &probe_state.valids; let true_validity = &probe_state.true_validity; let probe_indexes = &mut probe_state.probe_indexes; let local_build_indexes = &mut probe_state.build_indexes; let local_build_indexes_ptr = local_build_indexes.as_mut_ptr(); // Safe to unwrap. - if input_num_rows > probe_state.probe_unmatched_indexes.as_ref().unwrap().len() { - probe_state.probe_unmatched_indexes = Some(vec![(0, 0); input_num_rows]); - } - // Safe to unwrap. let probe_unmatched_indexes = probe_state.probe_unmatched_indexes.as_mut().unwrap(); let mut matched_num = 0; @@ -78,7 +74,7 @@ impl JoinHashTable { key, local_build_indexes_ptr, matched_num, - JOIN_MAX_BLOCK_SIZE, + max_block_size, ) } else { self.probe_key( @@ -88,6 +84,7 @@ impl JoinHashTable { i, local_build_indexes_ptr, matched_num, + max_block_size, ) }; let mut total_probe_matched = 0; @@ -104,8 +101,16 @@ impl JoinHashTable { } else { probe_unmatched_indexes[probe_unmatched_indexes_occupied] = (i as u32, 1); probe_unmatched_indexes_occupied += 1; + if probe_unmatched_indexes_occupied >= max_block_size { + result_blocks.push(self.create_left_join_null_block( + input, + probe_unmatched_indexes, + probe_unmatched_indexes_occupied, + )?); + probe_unmatched_indexes_occupied = 0; + } } - if matched_num >= JOIN_MAX_BLOCK_SIZE || i == input_num_rows - 1 { + if matched_num >= max_block_size || i == input_num_rows - 1 { loop { if self.interrupt.load(Ordering::Relaxed) { return Err(ErrorCode::AbortedQuery( @@ -137,14 +142,14 @@ impl JoinHashTable { .collect::>(), matched_num, ) - } else if matched_num == JOIN_MAX_BLOCK_SIZE { + } else if matched_num == max_block_size { ( build_block .columns() .iter() - .map(|c| Self::set_validity(c, JOIN_MAX_BLOCK_SIZE, true_validity)) + .map(|c| 
Self::set_validity(c, max_block_size, true_validity)) .collect::>(), - JOIN_MAX_BLOCK_SIZE, + max_block_size, ) } else { let mut validity = MutableBitmap::new(); @@ -163,11 +168,11 @@ impl JoinHashTable { // For full join, wrap nullable for probe block if self.hash_join_desc.join_type == JoinType::Full { - let nullable_probe_columns = if matched_num == JOIN_MAX_BLOCK_SIZE { + let nullable_probe_columns = if matched_num == max_block_size { probe_block .columns() .iter() - .map(|c| Self::set_validity(c, JOIN_MAX_BLOCK_SIZE, true_validity)) + .map(|c| Self::set_validity(c, max_block_size, true_validity)) .collect::>() } else { let mut validity = MutableBitmap::new(); @@ -205,21 +210,24 @@ impl JoinHashTable { incomplete_ptr, local_build_indexes_ptr, matched_num, - JOIN_MAX_BLOCK_SIZE, + max_block_size, ); - total_probe_matched += probe_matched; - if self.hash_join_desc.join_type == JoinType::Single && total_probe_matched > 1 - { - return Err(ErrorCode::Internal( - "Scalar subquery can't return more than one row", - )); + if probe_matched > 0 { + total_probe_matched += probe_matched; + if self.hash_join_desc.join_type == JoinType::Single + && total_probe_matched > 1 + { + return Err(ErrorCode::Internal( + "Scalar subquery can't return more than one row", + )); + } + matched_num += probe_matched; + probe_indexes[probe_indexes_occupied] = (i as u32, probe_matched as u32); + probe_indexes_occupied += 1; } - probe_indexes[probe_indexes_occupied] = (i as u32, probe_matched as u32); - probe_indexes_occupied += 1; - - if matched_num < JOIN_MAX_BLOCK_SIZE && i != input_num_rows - 1 { + if matched_num < max_block_size && i != input_num_rows - 1 { break; } } @@ -229,43 +237,11 @@ impl JoinHashTable { if probe_unmatched_indexes_occupied == 0 { return Ok(result_blocks); } - - let null_build_block = DataBlock::new( - self.row_space - .data_schema - .fields() - .iter() - .map(|df| BlockEntry { - data_type: df.data_type().clone(), - value: Value::Scalar(Scalar::Null), - }) - 
.collect(), - probe_unmatched_indexes_occupied, - ); - - let mut probe_block = DataBlock::take_compacted_indices( + result_blocks.push(self.create_left_join_null_block( input, - &probe_unmatched_indexes[0..probe_unmatched_indexes_occupied], + probe_unmatched_indexes, probe_unmatched_indexes_occupied, - )?; - - // For full join, wrap nullable for probe block - if self.hash_join_desc.join_type == JoinType::Full { - let nullable_probe_columns = probe_block - .columns() - .iter() - .map(|c| { - let mut probe_validity = MutableBitmap::new(); - probe_validity.extend_constant(probe_unmatched_indexes_occupied, true); - let probe_validity: Bitmap = probe_validity.into(); - Self::set_validity(c, probe_unmatched_indexes_occupied, &probe_validity) - }) - .collect::>(); - probe_block = DataBlock::new(nullable_probe_columns, probe_unmatched_indexes_occupied); - } - - result_blocks.push(self.merge_eq_block(&null_build_block, &probe_block)?); - + )?); Ok(result_blocks) } @@ -281,6 +257,7 @@ impl JoinHashTable { H::Key: 'a, { let input_num_rows = input.num_rows(); + let max_block_size = probe_state.max_block_size; let valids = &probe_state.valids; let true_validity = &probe_state.true_validity; let probe_indexes = &mut probe_state.probe_indexes; @@ -319,7 +296,7 @@ impl JoinHashTable { key, local_build_indexes_ptr, matched_num, - JOIN_MAX_BLOCK_SIZE, + max_block_size, ) } else { self.probe_key( @@ -329,6 +306,7 @@ impl JoinHashTable { i, local_build_indexes_ptr, matched_num, + max_block_size, ) }; let mut total_probe_matched = 0; @@ -348,7 +326,7 @@ impl JoinHashTable { probe_indexes[probe_indexes_occupied] = (i as u32, probe_matched as u32); probe_indexes_occupied += 1; } - if matched_num >= JOIN_MAX_BLOCK_SIZE || i == input_num_rows - 1 { + if matched_num >= max_block_size || i == input_num_rows - 1 { loop { if self.interrupt.load(Ordering::Relaxed) { return Err(ErrorCode::AbortedQuery( @@ -380,14 +358,14 @@ impl JoinHashTable { .collect::>(), matched_num, ) - } else if 
matched_num == JOIN_MAX_BLOCK_SIZE { + } else if matched_num == max_block_size { ( build_block .columns() .iter() - .map(|c| Self::set_validity(c, JOIN_MAX_BLOCK_SIZE, true_validity)) + .map(|c| Self::set_validity(c, max_block_size, true_validity)) .collect::>(), - JOIN_MAX_BLOCK_SIZE, + max_block_size, ) } else { let mut validity = MutableBitmap::new(); @@ -406,11 +384,11 @@ impl JoinHashTable { // For full join, wrap nullable for probe block if self.hash_join_desc.join_type == JoinType::Full { - let nullable_probe_columns = if matched_num == JOIN_MAX_BLOCK_SIZE { + let nullable_probe_columns = if matched_num == max_block_size { probe_block .columns() .iter() - .map(|c| Self::set_validity(c, JOIN_MAX_BLOCK_SIZE, true_validity)) + .map(|c| Self::set_validity(c, max_block_size, true_validity)) .collect::>() } else { let mut validity = MutableBitmap::new(); @@ -490,26 +468,29 @@ impl JoinHashTable { incomplete_ptr, local_build_indexes_ptr, matched_num, - JOIN_MAX_BLOCK_SIZE, + max_block_size, ); - total_probe_matched += probe_matched; - if self.hash_join_desc.join_type == JoinType::Single && total_probe_matched > 1 - { - return Err(ErrorCode::Internal( - "Scalar subquery can't return more than one row", - )); - } + if probe_matched > 0 { + total_probe_matched += probe_matched; + if self.hash_join_desc.join_type == JoinType::Single + && total_probe_matched > 1 + { + return Err(ErrorCode::Internal( + "Scalar subquery can't return more than one row", + )); + } - row_state[i] += probe_matched; - for _ in 0..probe_matched { - row_state_indexes[matched_num] = i; - matched_num += 1; + row_state[i] += probe_matched; + for _ in 0..probe_matched { + row_state_indexes[matched_num] = i; + matched_num += 1; + } + probe_indexes[probe_indexes_occupied] = (i as u32, probe_matched as u32); + probe_indexes_occupied += 1; } - probe_indexes[probe_indexes_occupied] = (i as u32, probe_matched as u32); - probe_indexes_occupied += 1; - if matched_num < JOIN_MAX_BLOCK_SIZE && i != 
input_num_rows - 1 { + if matched_num < max_block_size && i != input_num_rows - 1 { break; } } @@ -522,6 +503,14 @@ impl JoinHashTable { if row_state[idx] == 0 { probe_indexes[probe_indexes_occupied] = (idx as u32, 1); probe_indexes_occupied += 1; + if probe_indexes_occupied >= max_block_size { + result_blocks.push(self.create_left_join_null_block( + input, + probe_indexes, + probe_indexes_occupied, + )?); + probe_indexes_occupied = 0; + } } row_state[idx] = 0; idx += 1; @@ -530,7 +519,20 @@ impl JoinHashTable { if probe_indexes_occupied == 0 { return Ok(result_blocks); } + result_blocks.push(self.create_left_join_null_block( + input, + probe_indexes, + probe_indexes_occupied, + )?); + Ok(result_blocks) + } + fn create_left_join_null_block( + &self, + input: &DataBlock, + indexes: &[(u32, u32)], + occupied: usize, + ) -> Result { let null_build_block = DataBlock::new( self.row_space .data_schema @@ -541,33 +543,27 @@ impl JoinHashTable { value: Value::Scalar(Scalar::Null), }) .collect(), - probe_indexes_occupied, + occupied, ); - let mut probe_block = DataBlock::take_compacted_indices( - input, - &probe_indexes[0..probe_indexes_occupied], - probe_indexes_occupied, - )?; + let mut probe_block = + DataBlock::take_compacted_indices(input, &indexes[0..occupied], occupied)?; // For full join, wrap nullable for probe block if self.hash_join_desc.join_type == JoinType::Full { - let num_rows = probe_block.num_rows(); let nullable_probe_columns = probe_block .columns() .iter() .map(|c| { let mut probe_validity = MutableBitmap::new(); - probe_validity.extend_constant(num_rows, true); + probe_validity.extend_constant(occupied, true); let probe_validity: Bitmap = probe_validity.into(); - Self::set_validity(c, num_rows, &probe_validity) + Self::set_validity(c, occupied, &probe_validity) }) .collect::>(); - probe_block = DataBlock::new(nullable_probe_columns, num_rows); + probe_block = DataBlock::new(nullable_probe_columns, occupied); } - 
result_blocks.push(self.merge_eq_block(&null_build_block, &probe_block)?); - - Ok(result_blocks) + self.merge_eq_block(&null_build_block, &probe_block) } } diff --git a/src/query/service/src/pipelines/processors/transforms/hash_join/probe_join/left_mark.rs b/src/query/service/src/pipelines/processors/transforms/hash_join/probe_join/left_mark.rs index a35406f330b9..4efe692f9d61 100644 --- a/src/query/service/src/pipelines/processors/transforms/hash_join/probe_join/left_mark.rs +++ b/src/query/service/src/pipelines/processors/transforms/hash_join/probe_join/left_mark.rs @@ -24,7 +24,6 @@ use common_expression::types::ValueType; use common_expression::DataBlock; use common_hashtable::HashJoinHashtableLike; -use crate::pipelines::processors::transforms::hash_join::desc::JOIN_MAX_BLOCK_SIZE; use crate::pipelines::processors::transforms::hash_join::desc::MARKER_KIND_FALSE; use crate::pipelines::processors::transforms::hash_join::desc::MARKER_KIND_NULL; use crate::pipelines::processors::transforms::hash_join::desc::MARKER_KIND_TRUE; @@ -43,8 +42,8 @@ impl JoinHashTable { IT: Iterator + TrustedLen, H::Key: 'a, { + let mut max_block_size = probe_state.max_block_size; let valids = &probe_state.valids; - let mut block_size = JOIN_MAX_BLOCK_SIZE; // `probe_column` is the subquery result column. 
// For sql: select * from t1 where t1.a in (select t2.a from t2); t2.a is the `probe_column`, let probe_column = input.get_by_offset(0).value.as_column().unwrap(); @@ -61,8 +60,8 @@ impl JoinHashTable { let mark_scan_map = unsafe { &mut *self.mark_scan_map.get() }; for (i, key) in keys_iter.enumerate() { - if (i & block_size) == 0 { - block_size <<= 1; + if (i & max_block_size) == 0 { + max_block_size <<= 1; if self.interrupt.load(Ordering::Relaxed) { return Err(ErrorCode::AbortedQuery( @@ -75,13 +74,18 @@ impl JoinHashTable { .hash_join_desc .from_correlated_subquery { - true => hash_table.probe_hash_table( + true => { + hash_table.probe_hash_table(key, build_indexes_ptr, occupied, max_block_size) + } + false => self.probe_key( + hash_table, key, + valids, + i, build_indexes_ptr, occupied, - JOIN_MAX_BLOCK_SIZE, + max_block_size, ), - false => self.probe_key(hash_table, key, valids, i, build_indexes_ptr, occupied), }; if match_count == 0 { continue; @@ -101,7 +105,7 @@ impl JoinHashTable { incomplete_ptr, build_indexes_ptr, occupied, - JOIN_MAX_BLOCK_SIZE, + max_block_size, ); if match_count == 0 { break; @@ -124,8 +128,8 @@ impl JoinHashTable { IT: Iterator + TrustedLen, H::Key: 'a, { + let max_block_size = probe_state.max_block_size; let valids = &probe_state.valids; - // `probe_column` is the subquery result column. 
// For sql: select * from t1 where t1.a in (select t2.a from t2); t2.a is the `probe_column`, let probe_column = input.get_by_offset(0).value.as_column().unwrap(); @@ -157,14 +161,20 @@ impl JoinHashTable { let _mark_scan_map_lock = self.mark_scan_map_lock.lock(); for (i, key) in keys_iter.enumerate() { - let (mut match_count, mut incomplete_ptr) = if self - .hash_join_desc - .from_correlated_subquery - { - hash_table.probe_hash_table(key, build_indexes_ptr, occupied, JOIN_MAX_BLOCK_SIZE) - } else { - self.probe_key(hash_table, key, valids, i, build_indexes_ptr, occupied) - }; + let (mut match_count, mut incomplete_ptr) = + if self.hash_join_desc.from_correlated_subquery { + hash_table.probe_hash_table(key, build_indexes_ptr, occupied, max_block_size) + } else { + self.probe_key( + hash_table, + key, + valids, + i, + build_indexes_ptr, + occupied, + max_block_size, + ) + }; if match_count == 0 { continue; } @@ -172,7 +182,7 @@ impl JoinHashTable { occupied += match_count; probe_indexes[probe_indexes_len] = (i as u32, match_count as u32); probe_indexes_len += 1; - if occupied >= JOIN_MAX_BLOCK_SIZE { + if occupied >= max_block_size { loop { if self.interrupt.load(Ordering::Relaxed) { return Err(ErrorCode::AbortedQuery( @@ -222,7 +232,7 @@ impl JoinHashTable { incomplete_ptr, build_indexes_ptr, occupied, - JOIN_MAX_BLOCK_SIZE, + max_block_size, ); if match_count == 0 { break; @@ -232,7 +242,7 @@ impl JoinHashTable { probe_indexes[probe_indexes_len] = (i as u32, match_count as u32); probe_indexes_len += 1; - if occupied < JOIN_MAX_BLOCK_SIZE { + if occupied < max_block_size { break; } } diff --git a/src/query/service/src/pipelines/processors/transforms/hash_join/probe_join/left_semi_join.rs b/src/query/service/src/pipelines/processors/transforms/hash_join/probe_join/left_semi_join.rs index 1a19789e30d6..2225a6e0806c 100644 --- a/src/query/service/src/pipelines/processors/transforms/hash_join/probe_join/left_semi_join.rs +++ 
b/src/query/service/src/pipelines/processors/transforms/hash_join/probe_join/left_semi_join.rs @@ -22,7 +22,6 @@ use common_expression::DataBlock; use common_hashtable::HashJoinHashtableLike; use common_hashtable::RowPtr; -use crate::pipelines::processors::transforms::hash_join::desc::JOIN_MAX_BLOCK_SIZE; use crate::pipelines::processors::transforms::hash_join::ProbeState; use crate::pipelines::processors::JoinHashTable; @@ -89,6 +88,7 @@ impl JoinHashTable { { // If there is no build key, the result is input // Eg: select * from onecolumn as a right semi join twocolumn as b on true order by b.x + let max_block_size = probe_state.max_block_size; let valids = &probe_state.valids; let probe_indexes = &mut probe_state.probe_indexes; let mut probe_indexes_occupied = 0; @@ -105,7 +105,7 @@ impl JoinHashTable { (true, true) | (false, false) => { probe_indexes[probe_indexes_occupied] = (i as u32, 1); probe_indexes_occupied += 1; - if probe_indexes_occupied >= JOIN_MAX_BLOCK_SIZE { + if probe_indexes_occupied >= max_block_size { let probe_block = DataBlock::take_compacted_indices( input, &probe_indexes[0..probe_indexes_occupied], @@ -140,6 +140,7 @@ impl JoinHashTable { IT: Iterator + TrustedLen, H::Key: 'a, { + let max_block_size = probe_state.max_block_size; let valids = &probe_state.valids; // The semi join will return multiple data chunks of similar size. 
let mut occupied = 0; @@ -167,14 +168,20 @@ impl JoinHashTable { }]; for (i, key) in keys_iter.enumerate() { - let (mut match_count, mut incomplete_ptr) = if self - .hash_join_desc - .from_correlated_subquery - { - hash_table.probe_hash_table(key, build_indexes_ptr, occupied, JOIN_MAX_BLOCK_SIZE) - } else { - self.probe_key(hash_table, key, valids, i, build_indexes_ptr, occupied) - }; + let (mut match_count, mut incomplete_ptr) = + if self.hash_join_desc.from_correlated_subquery { + hash_table.probe_hash_table(key, build_indexes_ptr, occupied, max_block_size) + } else { + self.probe_key( + hash_table, + key, + valids, + i, + build_indexes_ptr, + occupied, + max_block_size, + ) + }; let true_match_count = match_count; match match_count > 0 { @@ -202,7 +209,7 @@ impl JoinHashTable { occupied += match_count; probe_indexes[probe_indexes_len] = (i as u32, match_count as u32); probe_indexes_len += 1; - if occupied >= JOIN_MAX_BLOCK_SIZE { + if occupied >= max_block_size { loop { if self.interrupt.load(Ordering::Relaxed) { return Err(ErrorCode::AbortedQuery( @@ -260,7 +267,7 @@ impl JoinHashTable { incomplete_ptr, build_indexes_ptr, occupied, - JOIN_MAX_BLOCK_SIZE, + max_block_size, ); if match_count == 0 { break; @@ -274,7 +281,7 @@ impl JoinHashTable { probe_indexes[probe_indexes_len] = (i as u32, match_count as u32); probe_indexes_len += 1; - if occupied < JOIN_MAX_BLOCK_SIZE { + if occupied < max_block_size { break; } } diff --git a/src/query/service/src/pipelines/processors/transforms/hash_join/probe_join/right_anti_join.rs b/src/query/service/src/pipelines/processors/transforms/hash_join/probe_join/right_anti_join.rs index c5978a0d13bc..d7eeefb7804c 100644 --- a/src/query/service/src/pipelines/processors/transforms/hash_join/probe_join/right_anti_join.rs +++ b/src/query/service/src/pipelines/processors/transforms/hash_join/probe_join/right_anti_join.rs @@ -20,7 +20,6 @@ use common_exception::Result; use common_expression::DataBlock; use 
common_hashtable::HashJoinHashtableLike; -use crate::pipelines::processors::transforms::hash_join::desc::JOIN_MAX_BLOCK_SIZE; use crate::pipelines::processors::transforms::hash_join::ProbeState; use crate::pipelines::processors::JoinHashTable; @@ -35,6 +34,7 @@ impl JoinHashTable { IT: Iterator + TrustedLen, H::Key: 'a, { + let max_block_size = probe_state.max_block_size; let valids = &probe_state.valids; let mut occupied = 0; let local_build_indexes = &mut probe_state.build_indexes; @@ -50,12 +50,13 @@ impl JoinHashTable { i, local_build_indexes_ptr, occupied, + max_block_size, ); if match_count == 0 { continue; } occupied += match_count; - if occupied >= JOIN_MAX_BLOCK_SIZE { + if occupied >= max_block_size { loop { if self.interrupt.load(Ordering::Relaxed) { return Err(ErrorCode::AbortedQuery( @@ -78,7 +79,7 @@ impl JoinHashTable { incomplete_ptr, local_build_indexes_ptr, occupied, - JOIN_MAX_BLOCK_SIZE, + max_block_size, ); if match_count == 0 { break; @@ -86,7 +87,7 @@ impl JoinHashTable { occupied += match_count; - if occupied < JOIN_MAX_BLOCK_SIZE { + if occupied < max_block_size { break; } } @@ -111,6 +112,7 @@ impl JoinHashTable { IT: Iterator + TrustedLen, H::Key: 'a, { + let max_block_size = probe_state.max_block_size; let valids = &probe_state.valids; // The right join will return multiple data blocks of similar size. 
let mut occupied = 0; @@ -137,6 +139,7 @@ impl JoinHashTable { i, local_build_indexes_ptr, occupied, + max_block_size, ); if match_count == 0 { continue; @@ -144,7 +147,7 @@ impl JoinHashTable { occupied += match_count; local_probe_indexes[probe_indexes_len] = (i as u32, match_count as u32); probe_indexes_len += 1; - if occupied >= JOIN_MAX_BLOCK_SIZE { + if occupied >= max_block_size { loop { if self.interrupt.load(Ordering::Relaxed) { return Err(ErrorCode::AbortedQuery( @@ -199,7 +202,7 @@ impl JoinHashTable { incomplete_ptr, local_build_indexes_ptr, occupied, - JOIN_MAX_BLOCK_SIZE, + max_block_size, ); if match_count == 0 { break; @@ -209,7 +212,7 @@ impl JoinHashTable { local_probe_indexes[probe_indexes_len] = (i as u32, match_count as u32); probe_indexes_len += 1; - if occupied < JOIN_MAX_BLOCK_SIZE { + if occupied < max_block_size { break; } } diff --git a/src/query/service/src/pipelines/processors/transforms/hash_join/probe_join/right_join.rs b/src/query/service/src/pipelines/processors/transforms/hash_join/probe_join/right_join.rs index 1514b99d6ac1..6c3c3b6b479b 100644 --- a/src/query/service/src/pipelines/processors/transforms/hash_join/probe_join/right_join.rs +++ b/src/query/service/src/pipelines/processors/transforms/hash_join/probe_join/right_join.rs @@ -22,7 +22,6 @@ use common_exception::Result; use common_expression::DataBlock; use common_hashtable::HashJoinHashtableLike; -use crate::pipelines::processors::transforms::hash_join::desc::JOIN_MAX_BLOCK_SIZE; use crate::pipelines::processors::transforms::hash_join::ProbeState; use crate::pipelines::processors::JoinHashTable; @@ -38,6 +37,7 @@ impl JoinHashTable { IT: Iterator + TrustedLen, H::Key: 'a, { + let max_block_size = probe_state.max_block_size; let valids = &probe_state.valids; let true_validity = &probe_state.true_validity; let local_probe_indexes = &mut probe_state.probe_indexes; @@ -66,6 +66,7 @@ impl JoinHashTable { i, local_build_indexes_ptr, matched_num, + max_block_size, ); if 
match_count == 0 { continue; @@ -73,10 +74,10 @@ impl JoinHashTable { matched_num += match_count; local_probe_indexes[probe_indexes_len] = (i as u32, match_count as u32); probe_indexes_len += 1; - if matched_num >= JOIN_MAX_BLOCK_SIZE { + if matched_num >= max_block_size { loop { - // The matched_num must be equal to JOIN_MAX_BLOCK_SIZE. - debug_assert_eq!(matched_num, JOIN_MAX_BLOCK_SIZE); + // The matched_num must be equal to max_block_size. + debug_assert_eq!(matched_num, max_block_size); if self.interrupt.load(Ordering::Relaxed) { return Err(ErrorCode::AbortedQuery( "Aborted query, because the server is shutting down or the query was killed.", @@ -91,16 +92,16 @@ impl JoinHashTable { let mut probe_block = DataBlock::take_compacted_indices( input, &local_probe_indexes[0..probe_indexes_len], - JOIN_MAX_BLOCK_SIZE, + max_block_size, )?; // The join type is right join, we need to wrap nullable for probe side. let nullable_columns = probe_block .columns() .iter() - .map(|c| Self::set_validity(c, JOIN_MAX_BLOCK_SIZE, true_validity)) + .map(|c| Self::set_validity(c, max_block_size, true_validity)) .collect::>(); - probe_block = DataBlock::new(nullable_columns, JOIN_MAX_BLOCK_SIZE); + probe_block = DataBlock::new(nullable_columns, max_block_size); if !probe_block.is_empty() { let merged_block = self.merge_eq_block(&build_block, &probe_block)?; @@ -126,7 +127,7 @@ impl JoinHashTable { // Safe to unwrap. 
let validity = bm.unwrap(); let mut idx = 0; - while idx < JOIN_MAX_BLOCK_SIZE { + while idx < max_block_size { let valid = unsafe { validity.get_bit_unchecked(idx) }; if valid { outer_scan_map @@ -153,7 +154,7 @@ impl JoinHashTable { incomplete_ptr, local_build_indexes_ptr, matched_num, - JOIN_MAX_BLOCK_SIZE, + max_block_size, ); if match_count == 0 { break; @@ -163,7 +164,7 @@ impl JoinHashTable { local_probe_indexes[probe_indexes_len] = (i as u32, match_count as u32); probe_indexes_len += 1; - if matched_num < JOIN_MAX_BLOCK_SIZE { + if matched_num < max_block_size { break; } } diff --git a/src/query/service/src/pipelines/processors/transforms/hash_join/probe_join/right_mark.rs b/src/query/service/src/pipelines/processors/transforms/hash_join/probe_join/right_mark.rs index 987f51e8895d..af105f2fb1bb 100644 --- a/src/query/service/src/pipelines/processors/transforms/hash_join/probe_join/right_mark.rs +++ b/src/query/service/src/pipelines/processors/transforms/hash_join/probe_join/right_mark.rs @@ -24,7 +24,6 @@ use common_expression::types::ValueType; use common_expression::DataBlock; use common_hashtable::HashJoinHashtableLike; -use crate::pipelines::processors::transforms::hash_join::desc::JOIN_MAX_BLOCK_SIZE; use crate::pipelines::processors::transforms::hash_join::desc::MARKER_KIND_FALSE; use crate::pipelines::processors::transforms::hash_join::desc::MARKER_KIND_NULL; use crate::pipelines::processors::transforms::hash_join::desc::MARKER_KIND_TRUE; @@ -74,6 +73,7 @@ impl JoinHashTable { IT: Iterator + TrustedLen, H::Key: 'a, { + let max_block_size = probe_state.max_block_size; let valids = &probe_state.valids; let has_null = *self.hash_join_desc.marker_join_desc.has_null.read(); let cols = input @@ -103,14 +103,20 @@ impl JoinHashTable { .fold(0, |acc, chunk| acc + chunk.num_rows()); for (i, key) in keys_iter.enumerate() { - let (mut match_count, mut incomplete_ptr) = if self - .hash_join_desc - .from_correlated_subquery - { - hash_table.probe_hash_table(key, 
build_indexes_ptr, occupied, JOIN_MAX_BLOCK_SIZE) - } else { - self.probe_key(hash_table, key, valids, i, build_indexes_ptr, occupied) - }; + let (mut match_count, mut incomplete_ptr) = + if self.hash_join_desc.from_correlated_subquery { + hash_table.probe_hash_table(key, build_indexes_ptr, occupied, max_block_size) + } else { + self.probe_key( + hash_table, + key, + valids, + i, + build_indexes_ptr, + occupied, + max_block_size, + ) + }; if match_count == 0 { continue; } @@ -118,7 +124,7 @@ impl JoinHashTable { occupied += match_count; probe_indexes[probe_indexes_len] = (i as u32, match_count as u32); probe_indexes_len += 1; - if occupied >= JOIN_MAX_BLOCK_SIZE { + if occupied >= max_block_size { loop { if self.interrupt.load(Ordering::Relaxed) { return Err(ErrorCode::AbortedQuery( @@ -171,7 +177,7 @@ impl JoinHashTable { incomplete_ptr, build_indexes_ptr, occupied, - JOIN_MAX_BLOCK_SIZE, + max_block_size, ); if match_count == 0 { break; @@ -181,7 +187,7 @@ impl JoinHashTable { probe_indexes[probe_indexes_len] = (i as u32, match_count as u32); probe_indexes_len += 1; - if occupied < JOIN_MAX_BLOCK_SIZE { + if occupied < max_block_size { break; } } diff --git a/src/query/service/src/pipelines/processors/transforms/hash_join/probe_join/right_semi_join.rs b/src/query/service/src/pipelines/processors/transforms/hash_join/probe_join/right_semi_join.rs index e9f23e2a75dc..9b067fccd2e3 100644 --- a/src/query/service/src/pipelines/processors/transforms/hash_join/probe_join/right_semi_join.rs +++ b/src/query/service/src/pipelines/processors/transforms/hash_join/probe_join/right_semi_join.rs @@ -20,7 +20,6 @@ use common_exception::Result; use common_expression::DataBlock; use common_hashtable::HashJoinHashtableLike; -use crate::pipelines::processors::transforms::hash_join::desc::JOIN_MAX_BLOCK_SIZE; use crate::pipelines::processors::transforms::hash_join::ProbeState; use crate::pipelines::processors::JoinHashTable; @@ -35,6 +34,7 @@ impl JoinHashTable { IT: Iterator + 
TrustedLen, H::Key: 'a, { + let max_block_size = probe_state.max_block_size; let valids = &probe_state.valids; // The right join will return multiple data blocks of similar size. let mut occupied = 0; @@ -50,12 +50,13 @@ impl JoinHashTable { i, local_build_indexes_ptr, occupied, + max_block_size, ); if match_count == 0 { continue; } occupied += match_count; - if occupied >= JOIN_MAX_BLOCK_SIZE { + if occupied >= max_block_size { loop { if self.interrupt.load(Ordering::Relaxed) { return Err(ErrorCode::AbortedQuery( @@ -78,7 +79,7 @@ impl JoinHashTable { incomplete_ptr, local_build_indexes_ptr, occupied, - JOIN_MAX_BLOCK_SIZE, + max_block_size, ); if match_count == 0 { break; @@ -86,7 +87,7 @@ impl JoinHashTable { occupied += match_count; - if occupied < JOIN_MAX_BLOCK_SIZE { + if occupied < max_block_size { break; } } @@ -111,6 +112,7 @@ impl JoinHashTable { IT: Iterator + TrustedLen, H::Key: 'a, { + let max_block_size = probe_state.max_block_size; let valids = &probe_state.valids; // The right join will return multiple data blocks of similar size. 
let mut occupied = 0; @@ -137,6 +139,7 @@ impl JoinHashTable { i, local_build_indexes_ptr, occupied, + max_block_size, ); if match_count == 0 { continue; @@ -144,7 +147,7 @@ impl JoinHashTable { occupied += match_count; local_probe_indexes[probe_indexes_len] = (i as u32, match_count as u32); probe_indexes_len += 1; - if occupied >= JOIN_MAX_BLOCK_SIZE { + if occupied >= max_block_size { loop { if self.interrupt.load(Ordering::Relaxed) { return Err(ErrorCode::AbortedQuery( @@ -199,7 +202,7 @@ impl JoinHashTable { incomplete_ptr, local_build_indexes_ptr, occupied, - JOIN_MAX_BLOCK_SIZE, + max_block_size, ); if match_count == 0 { break; @@ -209,7 +212,7 @@ impl JoinHashTable { local_probe_indexes[probe_indexes_len] = (i as u32, match_count as u32); probe_indexes_len += 1; - if occupied < JOIN_MAX_BLOCK_SIZE { + if occupied < max_block_size { break; } } diff --git a/src/query/service/src/pipelines/processors/transforms/hash_join/probe_state.rs b/src/query/service/src/pipelines/processors/transforms/hash_join/probe_state.rs index 214e67a5b34e..5fa94f4f04fe 100644 --- a/src/query/service/src/pipelines/processors/transforms/hash_join/probe_state.rs +++ b/src/query/service/src/pipelines/processors/transforms/hash_join/probe_state.rs @@ -18,12 +18,12 @@ use common_expression::FunctionContext; use common_hashtable::RowPtr; use super::desc::MARKER_KIND_FALSE; -use crate::pipelines::processors::transforms::hash_join::desc::JOIN_MAX_BLOCK_SIZE; use crate::sql::plans::JoinType; /// ProbeState used for probe phase of hash join. /// We may need some reusable state for probe phase. 
pub struct ProbeState { + pub(crate) max_block_size: usize, pub(crate) probe_indexes: Vec<(u32, u32)>, pub(crate) build_indexes: Vec, pub(crate) valids: Option, @@ -44,41 +44,47 @@ impl ProbeState { self.valids = None; } - pub fn create(join_type: &JoinType, with_conjunct: bool, func_ctx: FunctionContext) -> Self { + pub fn create( + max_block_size: usize, + join_type: &JoinType, + with_conjunct: bool, + func_ctx: FunctionContext, + ) -> Self { let mut true_validity = MutableBitmap::new(); - true_validity.extend_constant(JOIN_MAX_BLOCK_SIZE, true); + true_validity.extend_constant(max_block_size, true); let true_validity: Bitmap = true_validity.into(); let (row_state, row_state_indexes, probe_unmatched_indexes) = match &join_type { JoinType::Left | JoinType::Single | JoinType::Full => { if with_conjunct { ( - Some(vec![0; JOIN_MAX_BLOCK_SIZE]), - Some(vec![0; JOIN_MAX_BLOCK_SIZE]), + Some(vec![0; max_block_size]), + Some(vec![0; max_block_size]), None, ) } else { ( - Some(vec![0; JOIN_MAX_BLOCK_SIZE]), + Some(vec![0; max_block_size]), None, - Some(vec![(0, 0); JOIN_MAX_BLOCK_SIZE]), + Some(vec![(0, 0); max_block_size]), ) } } _ => (None, None, None), }; let markers = if matches!(&join_type, JoinType::RightMark) { - Some(vec![MARKER_KIND_FALSE; JOIN_MAX_BLOCK_SIZE]) + Some(vec![MARKER_KIND_FALSE; max_block_size]) } else { None }; ProbeState { - probe_indexes: vec![(0, 0); JOIN_MAX_BLOCK_SIZE], + max_block_size, + probe_indexes: vec![(0, 0); max_block_size], build_indexes: vec![ RowPtr { chunk_index: 0, row_index: 0, }; - JOIN_MAX_BLOCK_SIZE + max_block_size ], valids: None, true_validity, diff --git a/src/query/service/src/pipelines/processors/transforms/transform_hash_join.rs b/src/query/service/src/pipelines/processors/transforms/transform_hash_join.rs index b4165d38f364..49d329dadadc 100644 --- a/src/query/service/src/pipelines/processors/transforms/transform_hash_join.rs +++ b/src/query/service/src/pipelines/processors/transforms/transform_hash_join.rs @@ -19,6 
+19,7 @@ use std::sync::Arc; use common_exception::ErrorCode; use common_exception::Result; use common_expression::DataBlock; +use common_expression::FunctionContext; use common_sql::plans::JoinType; use super::hash_join::ProbeState; @@ -27,8 +28,6 @@ use crate::pipelines::processors::port::OutputPort; use crate::pipelines::processors::processor::Event; use crate::pipelines::processors::transforms::hash_join::HashJoinState; use crate::pipelines::processors::Processor; -use crate::sessions::QueryContext; -use crate::sessions::TableContext; enum HashJoinStep { Build, @@ -47,20 +46,20 @@ pub struct TransformHashJoinProbe { step: HashJoinStep, join_state: Arc, probe_state: ProbeState, - block_size: u64, + max_block_size: usize, outer_scan_finished: bool, } impl TransformHashJoinProbe { pub fn create( - ctx: Arc, input_port: Arc, output_port: Arc, join_state: Arc, + max_block_size: usize, + func_ctx: FunctionContext, join_type: &JoinType, with_conjunct: bool, ) -> Result> { - let default_block_size = ctx.get_settings().get_max_block_size()?; Ok(Box::new(TransformHashJoinProbe { input_data: VecDeque::new(), output_data_blocks: VecDeque::new(), @@ -68,8 +67,8 @@ impl TransformHashJoinProbe { output_port, step: HashJoinStep::Build, join_state, - probe_state: ProbeState::create(join_type, with_conjunct, ctx.get_function_context()?), - block_size: default_block_size, + probe_state: ProbeState::create(max_block_size, join_type, with_conjunct, func_ctx), + max_block_size, outer_scan_finished: false, })) } @@ -140,7 +139,7 @@ impl Processor for TransformHashJoinProbe { if self.input_port.has_data() { let data = self.input_port.pull_data().unwrap()?; // Split data to `block_size` rows per sub block. 
- let (sub_blocks, remain_block) = data.split_by_rows(self.block_size as usize); + let (sub_blocks, remain_block) = data.split_by_rows(self.max_block_size); self.input_data.extend(sub_blocks); if let Some(remain) = remain_block { self.input_data.push_back(remain); diff --git a/src/query/service/src/pipelines/processors/transforms/window/frame_bound.rs b/src/query/service/src/pipelines/processors/transforms/window/frame_bound.rs index abf8382839b1..49fa853066b6 100644 --- a/src/query/service/src/pipelines/processors/transforms/window/frame_bound.rs +++ b/src/query/service/src/pipelines/processors/transforms/window/frame_bound.rs @@ -20,7 +20,7 @@ use common_expression::types::Number; use common_expression::Scalar; use common_sql::plans::WindowFuncFrameBound; -#[derive(PartialEq)] +#[derive(Debug, PartialEq)] pub enum FrameBound { CurrentRow, Preceding(Option), diff --git a/src/query/service/src/pipelines/processors/transforms/window/transform_window.rs b/src/query/service/src/pipelines/processors/transforms/window/transform_window.rs index f667804623e8..a1e9fb6da907 100644 --- a/src/query/service/src/pipelines/processors/transforms/window/transform_window.rs +++ b/src/query/service/src/pipelines/processors/transforms/window/transform_window.rs @@ -124,6 +124,9 @@ pub struct TransformWindow { /// For ROWS frame, it is the same as `current_row`. 
/// For RANGE frame, `peer_group_start` <= `current_row` peer_group_start: RowPtr, + peer_group_end: RowPtr, + peer_group_ended: bool, + need_peer: bool, // Used for row_number current_row_in_partition: usize, @@ -357,6 +360,29 @@ impl TransformWindow { row } + fn advance_peer_group_end(&mut self, mut row: RowPtr) { + if !self.need_peer { + return; + } + + let current_row = row; + while row < self.partition_end { + row = self.advance_row(row); + if row == self.partition_end { + break; + } + if self.are_peers(¤t_row, &row, false) { + continue; + } else { + self.peer_group_end = row; + self.peer_group_ended = true; + return; + } + } + self.peer_group_ended = self.partition_ended; + self.peer_group_end = self.partition_end; + } + /// If the two rows are within the same peer group. fn are_peers(&self, lhs: &RowPtr, rhs: &RowPtr, for_computing_bound: bool) -> bool { if lhs == rhs { @@ -570,6 +596,26 @@ impl TransformWindow { let bucket = ntile.compute_nitle(self.current_row_in_partition, num_partition_rows); builder.push(ScalarRef::Number(NumberScalar::UInt64(bucket as u64))); } + WindowFunctionImpl::CumeDist => { + let cume_rows = { + let mut rows = 0; + let mut row = self.partition_start; + while row < self.peer_group_end { + row = self.advance_row(row); + rows += 1; + } + rows + }; + + let builder = &mut self.blocks[self.current_row.block - self.first_block].builder; + + let cume_dist = if self.partition_size > 0 { + cume_rows as f64 / self.partition_size as f64 + } else { + 0_f64 + }; + builder.push(ScalarRef::Number(NumberScalar::Float64(cume_dist.into()))); + } }; Ok(()) @@ -642,6 +688,9 @@ impl TransformWindow { prev_frame_start: RowPtr::default(), prev_frame_end: RowPtr::default(), peer_group_start: RowPtr::default(), + peer_group_end: RowPtr::default(), + peer_group_ended: false, + need_peer: false, current_row: RowPtr::default(), current_row_in_partition: 1, current_rank: 1, @@ -678,6 +727,8 @@ where T: Number + ResultTypeOfUnary false }; + let need_peer = 
matches!(func, WindowFunctionImpl::CumeDist); + Ok(Self { input, output, @@ -707,6 +758,9 @@ where T: Number + ResultTypeOfUnary prev_frame_start: RowPtr::default(), prev_frame_end: RowPtr::default(), peer_group_start: RowPtr::default(), + peer_group_end: RowPtr::default(), + peer_group_ended: false, + need_peer, current_row: RowPtr::default(), current_row_in_partition: 1, current_rank: 1, @@ -882,9 +936,14 @@ where T: Number + ResultTypeOfUnary while self.current_row < self.partition_end { if !self.are_peers(&self.peer_group_start, &self.current_row, false) { self.peer_group_start = self.current_row; + self.peer_group_end = self.current_row; + self.peer_group_ended = false; self.current_dense_rank += 1; self.current_rank = self.current_row_in_partition; + // peer changed, re-calculate peer end. + self.advance_peer_group_end(self.peer_group_start); + // If current peer group is a null frame, there will be no null frame in this partition again; // if current peer group is not a null frame, we may need to check it in the codes below. self.is_null_frame = false; @@ -893,6 +952,21 @@ where T: Number + ResultTypeOfUnary self.need_check_null_frame = false; } + // execute only once for each partition. + if self.peer_group_start == self.partition_start { + self.advance_peer_group_end(self.current_row); + } + + if self.need_peer && self.partition_ended { + self.peer_group_ended = true; + } + + if self.need_peer && !self.peer_group_ended { + debug_assert!(!self.input_is_finished); + debug_assert!(!self.partition_ended); + break; + } + // 2. if self.need_check_null_frame { self.is_null_frame = self.is_in_null_frame(); @@ -966,6 +1040,7 @@ where T: Number + ResultTypeOfUnary // reset peer group self.peer_group_start = self.partition_start; + self.peer_group_end = self.partition_start; // reset row number, rank, ... 
self.current_row_in_partition = 1; diff --git a/src/query/service/src/pipelines/processors/transforms/window/window_function.rs b/src/query/service/src/pipelines/processors/transforms/window/window_function.rs index 94c06e0f28d3..49216baf0da6 100644 --- a/src/query/service/src/pipelines/processors/transforms/window/window_function.rs +++ b/src/query/service/src/pipelines/processors/transforms/window/window_function.rs @@ -41,6 +41,7 @@ pub enum WindowFunctionInfo { LagLead(WindowFuncLagLeadImpl), NthValue(WindowFuncNthValueImpl), Ntile(WindowFuncNtileImpl), + CumeDist, } pub struct WindowFuncAggImpl { @@ -174,6 +175,7 @@ pub enum WindowFunctionImpl { LagLead(WindowFuncLagLeadImpl), NthValue(WindowFuncNthValueImpl), Ntile(WindowFuncNtileImpl), + CumeDist, } impl WindowFunctionInfo { @@ -226,6 +228,7 @@ impl WindowFunctionInfo { n: func.n as usize, return_type: func.return_type.clone(), }), + WindowFunction::CumeDist => Self::CumeDist, }) } } @@ -255,6 +258,7 @@ impl WindowFunctionImpl { WindowFunctionInfo::LagLead(ll) => Self::LagLead(ll), WindowFunctionInfo::NthValue(func) => Self::NthValue(func), WindowFunctionInfo::Ntile(func) => Self::Ntile(func), + WindowFunctionInfo::CumeDist => Self::CumeDist, }) } @@ -264,7 +268,7 @@ impl WindowFunctionImpl { Self::RowNumber | Self::Rank | Self::DenseRank => { DataType::Number(NumberDataType::UInt64) } - Self::PercentRank => DataType::Number(NumberDataType::Float64), + Self::PercentRank | Self::CumeDist => DataType::Number(NumberDataType::Float64), Self::LagLead(f) => f.return_type.clone(), Self::NthValue(f) => f.return_type.clone(), Self::Ntile(f) => f.return_type.clone(), diff --git a/src/query/service/tests/it/storages/fuse/operations/mutation/recluster_mutator.rs b/src/query/service/tests/it/storages/fuse/operations/mutation/recluster_mutator.rs index 0a900d826b82..5370b19f2c9f 100644 --- a/src/query/service/tests/it/storages/fuse/operations/mutation/recluster_mutator.rs +++ 
b/src/query/service/tests/it/storages/fuse/operations/mutation/recluster_mutator.rs @@ -128,7 +128,7 @@ async fn test_recluster_mutator_block_select() -> Result<()> { let segment_locations = base_snapshot.segments.clone(); let segment_locations = create_segment_location_vector(segment_locations, None); let block_metas = FusePruner::create(&ctx, data_accessor.clone(), schema, &None)? - .pruning(segment_locations) + .read_pruning(segment_locations) .await?; let mut blocks_map: BTreeMap)>> = BTreeMap::new(); block_metas.iter().for_each(|(idx, b)| { diff --git a/src/query/service/tests/it/storages/fuse/pruning.rs b/src/query/service/tests/it/storages/fuse/pruning.rs index c7b8e92f3aee..8e0ee19a47c3 100644 --- a/src/query/service/tests/it/storages/fuse/pruning.rs +++ b/src/query/service/tests/it/storages/fuse/pruning.rs @@ -61,7 +61,7 @@ async fn apply_block_pruning( let segment_locs = table_snapshot.segments.clone(); let segment_locs = create_segment_location_vector(segment_locs, None); FusePruner::create(&ctx, op, schema, push_down)? 
- .pruning(segment_locs) + .read_pruning(segment_locs) .await .map(|v| v.into_iter().map(|(_, v)| v).collect()) } diff --git a/src/query/sql/src/executor/physical_plan.rs b/src/query/sql/src/executor/physical_plan.rs index a135ba658150..e666da6ec4b8 100644 --- a/src/query/sql/src/executor/physical_plan.rs +++ b/src/query/sql/src/executor/physical_plan.rs @@ -327,6 +327,7 @@ pub enum WindowFunction { LagLead(LagLeadFunctionDesc), NthValue(NthValueFunctionDesc), Ntile(NtileFunctionDesc), + CumeDist, } impl WindowFunction { @@ -336,7 +337,9 @@ impl WindowFunction { WindowFunction::RowNumber | WindowFunction::Rank | WindowFunction::DenseRank => { Ok(DataType::Number(NumberDataType::UInt64)) } - WindowFunction::PercentRank => Ok(DataType::Number(NumberDataType::Float64)), + WindowFunction::PercentRank | WindowFunction::CumeDist => { + Ok(DataType::Number(NumberDataType::Float64)) + } WindowFunction::LagLead(f) => Ok(f.return_type.clone()), WindowFunction::NthValue(f) => Ok(f.return_type.clone()), WindowFunction::Ntile(f) => Ok(f.return_type.clone()), @@ -356,6 +359,7 @@ impl Display for WindowFunction { WindowFunction::LagLead(_) => write!(f, "lead"), WindowFunction::NthValue(_) => write!(f, "nth_value"), WindowFunction::Ntile(_) => write!(f, "ntile"), + WindowFunction::CumeDist => write!(f, "cume_dist"), } } } diff --git a/src/query/sql/src/executor/physical_plan_builder.rs b/src/query/sql/src/executor/physical_plan_builder.rs index f43ea982a8ef..201f84b691d9 100644 --- a/src/query/sql/src/executor/physical_plan_builder.rs +++ b/src/query/sql/src/executor/physical_plan_builder.rs @@ -1288,6 +1288,7 @@ impl PhysicalPlanBuilder { WindowFuncType::Rank => WindowFunction::Rank, WindowFuncType::DenseRank => WindowFunction::DenseRank, WindowFuncType::PercentRank => WindowFunction::PercentRank, + WindowFuncType::CumeDist => WindowFunction::CumeDist, }; Ok(PhysicalPlan::Window(Window { diff --git a/src/query/sql/src/planner/binder/ddl/catalog.rs 
b/src/query/sql/src/planner/binder/ddl/catalog.rs index ade29eefbb60..65a3b80ddd8b 100644 --- a/src/query/sql/src/planner/binder/ddl/catalog.rs +++ b/src/query/sql/src/planner/binder/ddl/catalog.rs @@ -30,6 +30,7 @@ use common_expression::DataSchemaRefExt; use common_meta_app::schema::CatalogMeta; use common_meta_app::schema::CatalogOption; use common_meta_app::schema::CatalogType; +use common_meta_app::schema::HiveCatalogOption; use common_meta_app::schema::IcebergCatalogOption; use url::Url; @@ -148,7 +149,9 @@ impl Binder { .get("address") .ok_or_else(|| ErrorCode::InvalidArgument("expected field: ADDRESS"))?; - CatalogOption::Hive(address.to_string()) + CatalogOption::Hive(HiveCatalogOption { + address: address.to_string(), + }) } CatalogType::Iceberg => { let mut catalog_options = options.clone(); diff --git a/src/query/sql/src/planner/optimizer/hyper_dp/dphyp.rs b/src/query/sql/src/planner/optimizer/hyper_dp/dphyp.rs index 98816daca74c..dc34c155b99f 100644 --- a/src/query/sql/src/planner/optimizer/hyper_dp/dphyp.rs +++ b/src/query/sql/src/planner/optimizer/hyper_dp/dphyp.rs @@ -147,14 +147,19 @@ impl DPhpy { } let mut left_is_subquery = false; let mut right_is_subquery = false; - // Fixme: If join's child is EvalScalar, we think it is a subquery. - // Check join's child is filter or scan let left_op = s_expr.child(0)?.plan.as_ref(); let right_op = s_expr.child(1)?.plan.as_ref(); - if matches!(left_op, RelOperator::EvalScalar(_)) { + // Eager aggregate will be executed after dphyp, so if join's child is aggregate, we should treat it as subquery. 
+ if matches!( + left_op, + RelOperator::EvalScalar(_) | RelOperator::Aggregate(_) + ) { left_is_subquery = true; } - if matches!(right_op, RelOperator::EvalScalar(_)) { + if matches!( + right_op, + RelOperator::EvalScalar(_) | RelOperator::Aggregate(_) + ) { right_is_subquery = true; } // Add join conditions diff --git a/src/query/sql/src/planner/plans/window.rs b/src/query/sql/src/planner/plans/window.rs index a17e7579ed5e..a3c5f8b89d86 100644 --- a/src/query/sql/src/planner/plans/window.rs +++ b/src/query/sql/src/planner/plans/window.rs @@ -224,6 +224,7 @@ pub enum WindowFuncType { LagLead(LagLeadFunction), NthValue(NthValueFunction), Ntile(NtileFunction), + CumeDist, } impl WindowFuncType { @@ -233,6 +234,7 @@ impl WindowFuncType { "rank" => Ok(WindowFuncType::Rank), "dense_rank" => Ok(WindowFuncType::DenseRank), "percent_rank" => Ok(WindowFuncType::PercentRank), + "cume_dist" => Ok(WindowFuncType::CumeDist), _ => Err(ErrorCode::UnknownFunction(format!( "Unknown window function: {}", name @@ -251,6 +253,7 @@ impl WindowFuncType { WindowFuncType::LagLead(_) => "lead".to_string(), WindowFuncType::NthValue(_) => "nth_value".to_string(), WindowFuncType::Ntile(_) => "ntile".to_string(), + WindowFuncType::CumeDist => "cume_dist".to_string(), } } @@ -279,7 +282,9 @@ impl WindowFuncType { WindowFuncType::RowNumber | WindowFuncType::Rank | WindowFuncType::DenseRank => { DataType::Number(NumberDataType::UInt64) } - WindowFuncType::PercentRank => DataType::Number(NumberDataType::Float64), + WindowFuncType::PercentRank | WindowFuncType::CumeDist => { + DataType::Number(NumberDataType::Float64) + } WindowFuncType::LagLead(lag_lead) => *lag_lead.return_type.clone(), WindowFuncType::NthValue(nth_value) => *nth_value.return_type.clone(), WindowFuncType::Ntile(buckets) => *buckets.return_type.clone(), diff --git a/src/query/storages/fuse/src/metrics/fuse_metrics.rs b/src/query/storages/fuse/src/metrics/fuse_metrics.rs index 1c6eda6ecacc..35712f9604c2 100644 --- 
a/src/query/storages/fuse/src/metrics/fuse_metrics.rs +++ b/src/query/storages/fuse/src/metrics/fuse_metrics.rs @@ -173,6 +173,13 @@ pub fn metrics_inc_deletion_block_range_pruned_nums(c: u64) { increment_gauge!(key!("deletion_block_range_pruned_nums"), c as f64); } +pub fn metrics_inc_deletion_segment_range_purned_whole_segment_nums(c: u64) { + increment_gauge!( + key!("deletion_segment_range_pruned_whole_segment_nums"), + c as f64 + ); +} + pub fn metrics_inc_deletion_block_range_pruned_whole_block_nums(c: u64) { increment_gauge!( key!("deletion_block_range_pruned_whole_block_nums"), @@ -226,4 +233,7 @@ pub fn metrics_reset() { gauge!(key!("bytes_block_range_pruning_after"), c); gauge!(key!("deletion_block_range_pruned_nums"), c); gauge!(key!("deletion_block_range_pruned_whole_block_nums"), c); + + // segment metrics + gauge!(key!("deletion_segment_range_pruned_whole_segment_nums"), c); } diff --git a/src/query/storages/fuse/src/operations/common/mutation_accumulator.rs b/src/query/storages/fuse/src/operations/common/mutation_accumulator.rs index aa9402dedcbd..4364d9c0a314 100644 --- a/src/query/storages/fuse/src/operations/common/mutation_accumulator.rs +++ b/src/query/storages/fuse/src/operations/common/mutation_accumulator.rs @@ -41,6 +41,7 @@ use crate::operations::common::MutationLogEntry; use crate::operations::common::Replacement; use crate::operations::common::ReplacementLogEntry; use crate::operations::mutation::BlockIndex; +use crate::operations::mutation::MutationDeletedSegment; use crate::operations::mutation::SegmentIndex; use crate::statistics::reducers::deduct_statistics_mut; use crate::statistics::reducers::merge_statistics_mut; @@ -84,6 +85,7 @@ pub struct MutationAccumulator { thresholds: BlockThresholds, mutations: HashMap, + deleted_segments: Vec, // (path, segment_info) appended_segments: Vec<(String, Arc, FormatVersion)>, base_segments: Vec, @@ -113,6 +115,7 @@ impl MutationAccumulator { base_segments, abort_operation: 
AbortOperation::default(), summary, + deleted_segments: vec![], } } @@ -136,10 +139,14 @@ impl MutationAccumulator { self.abort_operation.add_block(block_meta); } Replacement::Deleted => { - self.mutations - .entry(meta.index.segment_idx) - .and_modify(|v| v.push_deleted(meta.index.block_idx)) - .or_insert(BlockMutations::new_deletion(meta.index.block_idx)); + if let Some(deleted_segment) = &meta.deleted_segment { + self.deleted_segments.push(deleted_segment.clone()) + } else { + self.mutations + .entry(meta.index.segment_idx) + .and_modify(|v| v.push_deleted(meta.index.block_idx)) + .or_insert(BlockMutations::new_deletion(meta.index.block_idx)); + } } Replacement::DoNothing => (), } @@ -164,7 +171,18 @@ impl MutationAccumulator { impl MutationAccumulator { pub async fn apply(&mut self) -> Result { let mut recalc_stats = false; - if self.mutations.len() == self.base_segments.len() { + let segment_locations = self.base_segments.clone(); + let mut segments_editor = + BTreeMap::<_, _>::from_iter(segment_locations.into_iter().enumerate()); + // clean deleted segments' summary + for mutation_deleted_segment in &self.deleted_segments { + deduct_statistics_mut( + &mut self.summary, + &mutation_deleted_segment.deleted_segment.segment_info.1, + ); + segments_editor.remove(&mutation_deleted_segment.deleted_segment.index); + } + if self.mutations.len() == self.base_segments.len() - self.deleted_segments.len() { self.summary = Statistics::default(); recalc_stats = true; } @@ -172,10 +190,6 @@ impl MutationAccumulator { let start = Instant::now(); let mut count = 0; - let segment_locations = self.base_segments.clone(); - let mut segments_editor = - BTreeMap::<_, _>::from_iter(segment_locations.into_iter().enumerate()); - let chunk_size = self.ctx.get_settings().get_max_storage_io_requests()? 
as usize; let segment_indices = self.mutations.keys().cloned().collect::>(); for chunk in segment_indices.chunks(chunk_size) { diff --git a/src/query/storages/fuse/src/operations/common/mutation_log.rs b/src/query/storages/fuse/src/operations/common/mutation_log.rs index 01f22008cf7f..b27180b620dd 100644 --- a/src/query/storages/fuse/src/operations/common/mutation_log.rs +++ b/src/query/storages/fuse/src/operations/common/mutation_log.rs @@ -26,6 +26,7 @@ use storages_common_table_meta::meta::SegmentInfo; use storages_common_table_meta::meta::Statistics; use crate::operations::common::AbortOperation; +use crate::operations::mutation::MutationDeletedSegment; #[derive(serde::Serialize, serde::Deserialize, Clone, Debug, PartialEq, Default)] pub struct MutationLogs { @@ -42,6 +43,8 @@ pub enum MutationLogEntry { pub struct ReplacementLogEntry { pub index: BlockMetaIndex, pub op: Replacement, + // for delete operation, delete whole segment + pub deleted_segment: Option, } #[derive(serde::Serialize, serde::Deserialize, Clone, Debug, PartialEq, Eq)] diff --git a/src/query/storages/fuse/src/operations/delete.rs b/src/query/storages/fuse/src/operations/delete.rs index 07c3c368e3ef..a59763f4dd92 100644 --- a/src/query/storages/fuse/src/operations/delete.rs +++ b/src/query/storages/fuse/src/operations/delete.rs @@ -48,8 +48,10 @@ use storages_common_table_meta::meta::StatisticsOfColumns; use storages_common_table_meta::meta::TableSnapshot; use tracing::info; +use super::mutation::MutationDeletedSegment; use crate::metrics::metrics_inc_deletion_block_range_pruned_nums; use crate::metrics::metrics_inc_deletion_block_range_pruned_whole_block_nums; +use crate::metrics::metrics_inc_deletion_segment_range_purned_whole_segment_nums; use crate::operations::mutation::MutationAction; use crate::operations::mutation::MutationPartInfo; use crate::operations::mutation::MutationSource; @@ -133,6 +135,7 @@ impl FuseTable { projection, &snapshot, true, + true, ) .await?; info!( @@ -279,6 
+282,7 @@ impl FuseTable { ctx.set_status_info(&status); info!(status); } + let (parts, part_info) = self .do_mutation_block_pruning( ctx.clone(), @@ -287,6 +291,7 @@ impl FuseTable { projection.clone(), base_snapshot, with_origin, + false, // for update ) .await?; ctx.set_partitions(parts)?; @@ -305,6 +310,7 @@ impl FuseTable { } #[async_backtrace::framed] + #[allow(clippy::too_many_arguments)] pub async fn do_mutation_block_pruning( &self, ctx: Arc, @@ -313,6 +319,7 @@ impl FuseTable { projection: Projection, base_snapshot: &TableSnapshot, with_origin: bool, + is_delete: bool, ) -> Result<(Partitions, MutationTaskInfo)> { let push_down = Some(PushDownInfo { projection: Some(projection), @@ -321,7 +328,7 @@ impl FuseTable { }); let segment_locations = base_snapshot.segments.clone(); - let pruner = FusePruner::create( + let mut pruner = FusePruner::create( &ctx, self.operator.clone(), self.table_info.schema(), @@ -329,34 +336,39 @@ impl FuseTable { )?; let segment_locations = create_segment_location_vector(segment_locations, None); - let block_metas = pruner.pruning(segment_locations).await?; + + if let Some(inverse) = inverted_filter { + // now the `block_metas` refers to the blocks that need to be deleted completely or partially. + // + // let's try pruning the blocks further to get the blocks that need to be deleted completely, so that + // later during mutation, we need not to load the data of these blocks: + // + // 1. invert the filter expression + // 2. apply the inverse filter expression to the block metas, utilizing range index + // - for those blocks that need to be deleted completely, they will be filtered out. + // - for those blocks that need to be deleted partially, they will NOT be filtered out. 
+ // + let inverse = inverse.as_expr(&BUILTIN_FUNCTIONS); + let func_ctx = ctx.get_function_context()?; + let range_index = RangeIndex::try_create( + func_ctx, + &inverse, + self.table_info.schema(), + StatisticsOfColumns::default(), // TODO default values + )?; + pruner.set_inverse_range_index(range_index); + } + + let block_metas = if is_delete { + pruner.delete_pruning(segment_locations).await? + } else { + pruner.read_pruning(segment_locations).await? + }; let mut whole_block_deletions = std::collections::HashSet::new(); if !block_metas.is_empty() { - if let Some(inverse) = inverted_filter { - // now the `block_metas` refers to the blocks that need to be deleted completely or partially. - // - // let's try pruning the blocks further to get the blocks that need to be deleted completely, so that - // later during mutation, we need not to load the data of these blocks: - // - // 1. invert the filter expression - // 2. apply the inverse filter expression to the block metas, utilizing range index - // - for those blocks that need to be deleted completely, they will be filtered out. - // - for those blocks that need to be deleted partially, they will NOT be filtered out. 
- // - - let inverse = inverse.as_expr(&BUILTIN_FUNCTIONS); - - let func_ctx = ctx.get_function_context()?; - - let range_index = RangeIndex::try_create( - func_ctx, - &inverse, - self.table_info.schema(), - StatisticsOfColumns::default(), // TODO default values - )?; - + if let Some(range_index) = pruner.get_inverse_range_index() { for (block_meta_idx, block_meta) in &block_metas { if !range_index.should_keep(&block_meta.as_ref().col_stats, None) { // this block should be deleted completely @@ -382,7 +394,7 @@ impl FuseTable { PruningStatistics::default(), )?; - let parts = Partitions::create_nolazy( + let mut parts = Partitions::create_nolazy( PartitionsShuffleKind::Mod, block_metas .into_iter() @@ -405,13 +417,24 @@ impl FuseTable { .collect(), ); - let part_num = parts.len(); - - let num_whole_block_mutation = whole_block_deletions.len(); + let mut part_num = parts.len(); + let mut num_whole_block_mutation = whole_block_deletions.len(); + let segment_num = pruner.deleted_segments.len(); + // now try to add deleted_segment + for deleted_segment in pruner.deleted_segments { + part_num += deleted_segment.segment_info.1.block_count as usize; + num_whole_block_mutation += deleted_segment.segment_info.1.block_count as usize; + parts + .partitions + .push(Arc::new(Box::new(MutationDeletedSegment::create( + deleted_segment, + )))); + } let block_nums = base_snapshot.summary.block_count; metrics_inc_deletion_block_range_pruned_nums(block_nums - part_num as u64); metrics_inc_deletion_block_range_pruned_whole_block_nums(num_whole_block_mutation as u64); + metrics_inc_deletion_segment_range_purned_whole_segment_nums(segment_num as u64); Ok((parts, MutationTaskInfo { total_tasks: part_num, num_whole_block_mutation, diff --git a/src/query/storages/fuse/src/operations/mutation/mod.rs b/src/query/storages/fuse/src/operations/mutation/mod.rs index 792743af1324..96e23e7e94ae 100644 --- a/src/query/storages/fuse/src/operations/mutation/mod.rs +++ 
b/src/query/storages/fuse/src/operations/mutation/mod.rs @@ -27,6 +27,7 @@ pub use compact::SegmentCompactMutator; pub use compact::SegmentCompactionState; pub use compact::SegmentCompactor; pub use mutation_meta::SerializeDataMeta; +pub use mutation_part::MutationDeletedSegment; pub use mutation_part::MutationPartInfo; pub use mutation_source::MutationAction; pub use mutation_source::MutationSource; diff --git a/src/query/storages/fuse/src/operations/mutation/mutation_meta.rs b/src/query/storages/fuse/src/operations/mutation/mutation_meta.rs index 5a4eba9a106d..e94bc9f7d4e6 100644 --- a/src/query/storages/fuse/src/operations/mutation/mutation_meta.rs +++ b/src/query/storages/fuse/src/operations/mutation/mutation_meta.rs @@ -19,6 +19,7 @@ use common_expression::BlockMetaInfoDowncast; use common_expression::BlockMetaInfoPtr; use storages_common_table_meta::meta::ClusterStatistics; +use super::MutationDeletedSegment; use crate::operations::common::BlockMetaIndex; #[derive(serde::Serialize, serde::Deserialize, Clone, Debug, PartialEq, Eq)] @@ -31,6 +32,7 @@ pub enum ClusterStatsGenType { pub struct SerializeDataMeta { pub index: BlockMetaIndex, pub stats_type: ClusterStatsGenType, + pub deleted_segment: Option, } #[typetag::serde(name = "serialize_data_meta")] @@ -53,6 +55,20 @@ impl BlockMetaInfo for SerializeDataMeta { impl SerializeDataMeta { pub fn create(index: BlockMetaIndex, stats_type: ClusterStatsGenType) -> BlockMetaInfoPtr { - Box::new(SerializeDataMeta { index, stats_type }) + Box::new(SerializeDataMeta { + index, + stats_type, + deleted_segment: None, + }) + } + + pub fn create_with_deleted_segment( + deleted_segment: MutationDeletedSegment, + ) -> BlockMetaInfoPtr { + Box::new(SerializeDataMeta { + index: BlockMetaIndex::default(), // default value + stats_type: ClusterStatsGenType::Generally, // default value + deleted_segment: Some(deleted_segment), + }) } } diff --git a/src/query/storages/fuse/src/operations/mutation/mutation_part.rs 
b/src/query/storages/fuse/src/operations/mutation/mutation_part.rs index 15b42e58fbdb..787aa2764633 100644 --- a/src/query/storages/fuse/src/operations/mutation/mutation_part.rs +++ b/src/query/storages/fuse/src/operations/mutation/mutation_part.rs @@ -22,6 +22,46 @@ use common_exception::Result; use storages_common_pruner::BlockMetaIndex; use storages_common_table_meta::meta::ClusterStatistics; +use crate::pruning::DeletedSegmentInfo; + +#[derive(serde::Serialize, serde::Deserialize, PartialEq, Clone, Debug)] +pub struct MutationDeletedSegment { + pub deleted_segment: DeletedSegmentInfo, +} + +#[typetag::serde(name = "mutation_delete_segment")] +impl PartInfo for MutationDeletedSegment { + fn as_any(&self) -> &dyn Any { + self + } + + fn equals(&self, info: &Box) -> bool { + match info.as_any().downcast_ref::() { + None => false, + Some(other) => self == other, + } + } + + fn hash(&self) -> u64 { + self.deleted_segment.hash() + } +} + +impl MutationDeletedSegment { + pub fn create(deleted_segment: DeletedSegmentInfo) -> Self { + MutationDeletedSegment { deleted_segment } + } + + pub fn from_part(info: &PartInfoPtr) -> Result<&MutationDeletedSegment> { + match info.as_any().downcast_ref::() { + Some(part_ref) => Ok(part_ref), + None => Err(ErrorCode::Internal( + "Cannot downcast from PartInfo to MutationDeletedSegment.", + )), + } + } +} + #[derive(serde::Serialize, serde::Deserialize, PartialEq)] pub struct MutationPartInfo { pub index: BlockMetaIndex, diff --git a/src/query/storages/fuse/src/operations/mutation/mutation_source.rs b/src/query/storages/fuse/src/operations/mutation/mutation_source.rs index fb18743b5bfc..c91a5ce50d58 100644 --- a/src/query/storages/fuse/src/operations/mutation/mutation_source.rs +++ b/src/query/storages/fuse/src/operations/mutation/mutation_source.rs @@ -35,6 +35,7 @@ use common_expression::ROW_ID_COL_NAME; use common_functions::BUILTIN_FUNCTIONS; use common_sql::evaluator::BlockOperator; +use super::MutationDeletedSegment; use 
crate::fuse_part::FusePartInfo; use crate::io::BlockReader; use crate::io::ReadSettings; @@ -60,13 +61,13 @@ enum State { ReadRemain { part: PartInfoPtr, data_block: DataBlock, - filter: Value, + filter: Option>, }, MergeRemain { part: PartInfoPtr, merged_io_read_result: MergeIOReadResult, data_block: DataBlock, - filter: Value, + filter: Option>, }, PerformOperator(DataBlock), Output(Option, DataBlock), @@ -260,23 +261,23 @@ impl Processor for MutationSource { self.state = State::ReadRemain { part, data_block, - filter: Value::Column(filter), + filter: Some(Value::Column(filter)), } } } } MutationAction::Update => { + data_block.add_column(BlockEntry::new( + DataType::Boolean, + Value::upcast(predicates), + )); if self.remain_reader.is_none() { - data_block.add_column(BlockEntry::new( - DataType::Boolean, - Value::upcast(predicates), - )); self.state = State::PerformOperator(data_block); } else { self.state = State::ReadRemain { part, data_block, - filter: predicates, + filter: None, }; } } @@ -309,22 +310,15 @@ impl Processor for MutationSource { &self.storage_format, )?; - match self.action { - MutationAction::Deletion => { - let remain_block = remain_block.filter_boolean_value(&filter)?; - for col in remain_block.columns() { - data_block.add_column(col.clone()); - } - } - MutationAction::Update => { - for col in remain_block.columns() { - data_block.add_column(col.clone()); - } - data_block.add_column(BlockEntry::new( - DataType::Boolean, - Value::upcast(filter), - )); - } + let remain_block = if let Some(filter) = filter { + // for deletion. + remain_block.filter_boolean_value(&filter)? + } else { + remain_block + }; + + for col in remain_block.columns() { + data_block.add_column(col.clone()); } } else { return Err(ErrorCode::Internal("It's a bug. 
Need remain reader")); @@ -351,40 +345,61 @@ impl Processor for MutationSource { match std::mem::replace(&mut self.state, State::Finish) { State::ReadData(Some(part)) => { let settings = ReadSettings::from_ctx(&self.ctx)?; - let part = MutationPartInfo::from_part(&part)?; - - self.index = BlockMetaIndex { - segment_idx: part.index.segment_idx, - block_idx: part.index.block_idx, - }; - if matches!(self.action, MutationAction::Deletion) { - self.stats_type = ClusterStatsGenType::WithOrigin(part.cluster_stats.clone()); - } + let res = MutationPartInfo::from_part(&part); + if let Ok(part) = res { + self.index = BlockMetaIndex { + segment_idx: part.index.segment_idx, + block_idx: part.index.block_idx, + }; + if matches!(self.action, MutationAction::Deletion) { + self.stats_type = + ClusterStatsGenType::WithOrigin(part.cluster_stats.clone()); + } - let inner_part = part.inner_part.clone(); - let fuse_part = FusePartInfo::from_part(&inner_part)?; + let inner_part = part.inner_part.clone(); + let fuse_part = FusePartInfo::from_part(&inner_part)?; - if part.whole_block_mutation && matches!(self.action, MutationAction::Deletion) { - // whole block deletion. + if part.whole_block_mutation && matches!(self.action, MutationAction::Deletion) + { + // whole block deletion. + let progress_values = ProgressValues { + rows: fuse_part.nums_rows, + bytes: 0, + }; + self.ctx.get_write_progress().incr(&progress_values); + let meta = + SerializeDataMeta::create(self.index.clone(), self.stats_type.clone()); + self.state = State::Output( + self.ctx.get_partition(), + DataBlock::empty_with_meta(meta), + ); + } else { + let read_res = self + .block_reader + .read_columns_data_by_merge_io( + &settings, + &fuse_part.location, + &fuse_part.columns_meta, + ) + .await?; + self.state = State::FilterData(inner_part, read_res); + } + } else { + // it could be a deleted_segment info + // we can make sure this is Mutation::Delete + // only delete operation will have deleted segments, not for update. 
+ let deleted_segment = MutationDeletedSegment::from_part(&part)?; let progress_values = ProgressValues { - rows: fuse_part.nums_rows, + rows: deleted_segment.deleted_segment.segment_info.1.row_count as usize, bytes: 0, }; self.ctx.get_write_progress().incr(&progress_values); - let meta = - SerializeDataMeta::create(self.index.clone(), self.stats_type.clone()); - self.state = - State::Output(self.ctx.get_partition(), DataBlock::empty_with_meta(meta)); - } else { - let read_res = self - .block_reader - .read_columns_data_by_merge_io( - &settings, - &fuse_part.location, - &fuse_part.columns_meta, - ) - .await?; - self.state = State::FilterData(inner_part, read_res); + self.state = State::Output( + self.ctx.get_partition(), + DataBlock::empty_with_meta(SerializeDataMeta::create_with_deleted_segment( + deleted_segment.clone(), + )), + ) } } State::ReadRemain { diff --git a/src/query/storages/fuse/src/operations/mutation/recluster_mutator.rs b/src/query/storages/fuse/src/operations/mutation/recluster_mutator.rs index f26c0dce21b5..bcedfa29d90a 100644 --- a/src/query/storages/fuse/src/operations/mutation/recluster_mutator.rs +++ b/src/query/storages/fuse/src/operations/mutation/recluster_mutator.rs @@ -103,6 +103,7 @@ impl ReclusterMutator { let entry = ReplacementLogEntry { index: block_idx, op: Replacement::Deleted, + deleted_segment: None, }; self.mutation_logs .entries @@ -183,6 +184,7 @@ impl ReclusterMutator { let entry = ReplacementLogEntry { index: block_idx, op: Replacement::Deleted, + deleted_segment: None, }; self.mutation_logs .entries diff --git a/src/query/storages/fuse/src/operations/mutation/transform_serialize_data.rs b/src/query/storages/fuse/src/operations/mutation/transform_serialize_data.rs index a6d9c45a81a2..a930c60fd937 100644 --- a/src/query/storages/fuse/src/operations/mutation/transform_serialize_data.rs +++ b/src/query/storages/fuse/src/operations/mutation/transform_serialize_data.rs @@ -25,6 +25,7 @@ use 
common_pipeline_core::processors::port::InputPort; use common_pipeline_core::processors::processor::ProcessorPtr; use opendal::Operator; +use super::MutationDeletedSegment; use crate::io::write_data; use crate::io::BlockBuilder; use crate::io::BlockSerialization; @@ -65,6 +66,8 @@ pub struct SerializeDataTransform { dal: Operator, index: BlockMetaIndex, + + deleted_segment: Option, } impl SerializeDataTransform { @@ -91,6 +94,7 @@ impl SerializeDataTransform { block_builder, dal: table.get_operator(), index: BlockMetaIndex::default(), + deleted_segment: None, }))) } } @@ -141,11 +145,18 @@ impl Processor for SerializeDataTransform { let meta = input_data.take_meta(); if let Some(meta) = meta { let meta = SerializeDataMeta::downcast_ref_from(&meta).unwrap(); - self.index = meta.index.clone(); - if input_data.is_empty() { + // delete a whole segment, segment level + if let Some(deleted_segment) = &meta.deleted_segment { self.state = State::Output(Replacement::Deleted); + self.deleted_segment = Some(deleted_segment.clone()); } else { - self.state = State::NeedSerialize(input_data, meta.stats_type.clone()); + // block level + self.index = meta.index.clone(); + if input_data.is_empty() { + self.state = State::Output(Replacement::Deleted); + } else { + self.state = State::NeedSerialize(input_data, meta.stats_type.clone()); + } } } else { self.state = State::Output(Replacement::DoNothing); @@ -173,6 +184,7 @@ impl Processor for SerializeDataTransform { let entry = ReplacementLogEntry { index: self.index.clone(), op, + deleted_segment: self.deleted_segment.take(), }; let meta = MutationLogs { entries: vec![MutationLogEntry::Replacement(entry)], diff --git a/src/query/storages/fuse/src/operations/read_partitions.rs b/src/query/storages/fuse/src/operations/read_partitions.rs index 28c130db98a8..c6f32a7a4b0f 100644 --- a/src/query/storages/fuse/src/operations/read_partitions.rs +++ b/src/query/storages/fuse/src/operations/read_partitions.rs @@ -165,7 +165,7 @@ impl FuseTable 
{ } } - let pruner = if !self.is_native() || self.cluster_key_meta.is_none() { + let mut pruner = if !self.is_native() || self.cluster_key_meta.is_none() { FusePruner::create(&ctx, dal.clone(), table_info.schema(), &push_downs)? } else { let cluster_keys = self.cluster_keys(ctx.clone()); @@ -180,7 +180,7 @@ impl FuseTable { )? }; - let block_metas = pruner.pruning(segments_location).await?; + let block_metas = pruner.read_pruning(segments_location).await?; let pruning_stats = pruner.pruning_stats(); info!( diff --git a/src/query/storages/fuse/src/operations/recluster.rs b/src/query/storages/fuse/src/operations/recluster.rs index 9a8f6b685021..b56535bfe344 100644 --- a/src/query/storages/fuse/src/operations/recluster.rs +++ b/src/query/storages/fuse/src/operations/recluster.rs @@ -69,8 +69,8 @@ impl FuseTable { let schema = self.table_info.schema(); let segment_locations = snapshot.segments.clone(); let segment_locations = create_segment_location_vector(segment_locations, None); - let pruner = FusePruner::create(&ctx, self.operator.clone(), schema, &push_downs)?; - let block_metas = pruner.pruning(segment_locations).await?; + let mut pruner = FusePruner::create(&ctx, self.operator.clone(), schema, &push_downs)?; + let block_metas = pruner.read_pruning(segment_locations).await?; let default_cluster_key_id = self.cluster_key_meta.clone().unwrap().0; let mut blocks_map: BTreeMap)>> = BTreeMap::new(); diff --git a/src/query/storages/fuse/src/operations/replace_into/mutator/merge_into_mutator.rs b/src/query/storages/fuse/src/operations/replace_into/mutator/merge_into_mutator.rs index 06c5bf1c4bce..6e192cd5978d 100644 --- a/src/query/storages/fuse/src/operations/replace_into/mutator/merge_into_mutator.rs +++ b/src/query/storages/fuse/src/operations/replace_into/mutator/merge_into_mutator.rs @@ -367,6 +367,7 @@ impl AggregationContext { block_idx: block_index, }, op: Replacement::Deleted, + deleted_segment: None, }; return Ok(Some(mutation)); @@ -437,6 +438,7 @@ impl 
AggregationContext { block_idx: block_index, }, op: Replacement::Replaced(Arc::new(new_block_meta)), + deleted_segment: None, }; Ok(Some(mutation)) diff --git a/src/query/storages/fuse/src/operations/update.rs b/src/query/storages/fuse/src/operations/update.rs index a949edd32341..d8be31e5c555 100644 --- a/src/query/storages/fuse/src/operations/update.rs +++ b/src/query/storages/fuse/src/operations/update.rs @@ -152,6 +152,9 @@ impl FuseTable { .map(|index| schema.fields()[*index].clone()) .collect(); + fields.push(TableField::new("_predicate", TableDataType::Boolean)); + pos += 1; + let remain_col_indices: Vec = all_column_indices .into_iter() .filter(|index| !col_indices.contains(index)) @@ -171,9 +174,6 @@ impl FuseTable { remain_reader = Some((*reader).clone()); } - fields.push(TableField::new("_predicate", TableDataType::Boolean)); - pos += 1; - ( Projection::Columns(col_indices.clone()), Arc::new(TableSchema::new(fields)), @@ -185,9 +185,8 @@ impl FuseTable { cap += 1; } let mut ops = Vec::with_capacity(cap); - let mut exprs = Vec::with_capacity(update_list.len()); - let mut computed_exprs = Vec::with_capacity(computed_list.len()); + let mut exprs = Vec::with_capacity(update_list.len()); for (id, remote_expr) in update_list.into_iter() { let expr = remote_expr .as_expr(&BUILTIN_FUNCTIONS) @@ -196,6 +195,11 @@ impl FuseTable { offset_map.insert(id, pos); pos += 1; } + if !exprs.is_empty() { + ops.push(BlockOperator::Map { exprs }); + } + + let mut computed_exprs = Vec::with_capacity(computed_list.len()); for (id, remote_expr) in computed_list.into_iter() { let expr = remote_expr .as_expr(&BUILTIN_FUNCTIONS) @@ -208,15 +212,13 @@ impl FuseTable { offset_map.insert(id, pos); pos += 1; } - if !exprs.is_empty() { - ops.push(BlockOperator::Map { exprs }); - } // regenerate related stored computed columns. 
if !computed_exprs.is_empty() { ops.push(BlockOperator::Map { exprs: computed_exprs, }); } + ops.push(BlockOperator::Project { projection: offset_map.values().cloned().collect(), }); diff --git a/src/query/storages/fuse/src/pruning/fuse_pruner.rs b/src/query/storages/fuse/src/pruning/fuse_pruner.rs index 6c911fd86b3c..c7ae4479268e 100644 --- a/src/query/storages/fuse/src/pruning/fuse_pruner.rs +++ b/src/query/storages/fuse/src/pruning/fuse_pruner.rs @@ -12,6 +12,9 @@ // See the License for the specific language governing permissions and // limitations under the License. +use std::collections::hash_map::DefaultHasher; +use std::hash::Hash; +use std::hash::Hasher; use std::sync::Arc; use common_base::base::tokio::sync::Semaphore; @@ -27,6 +30,7 @@ use common_expression::SEGMENT_NAME_COL_NAME; use common_functions::BUILTIN_FUNCTIONS; use common_sql::field_default_value; use opendal::Operator; +use storages_common_index::RangeIndex; use storages_common_pruner::BlockMetaIndex; use storages_common_pruner::InternalColumnPruner; use storages_common_pruner::Limiter; @@ -39,6 +43,8 @@ use storages_common_pruner::TopNPrunner; use storages_common_table_meta::meta::BlockMeta; use storages_common_table_meta::meta::ClusterKey; use storages_common_table_meta::meta::ColumnStatistics; +use storages_common_table_meta::meta::Location; +use storages_common_table_meta::meta::Statistics; use storages_common_table_meta::meta::StatisticsOfColumns; use tracing::warn; @@ -63,12 +69,30 @@ pub struct PruningContext { pub pruning_stats: Arc, } +#[derive(serde::Serialize, serde::Deserialize, PartialEq, Clone, Debug)] +pub struct DeletedSegmentInfo { + // segment index. + pub index: usize, + // deleted segment location and summary. 
+ // location is used for hash + pub segment_info: (Location, Statistics), +} + +impl DeletedSegmentInfo { + pub fn hash(&self) -> u64 { + let mut s = DefaultHasher::new(); + self.segment_info.0.hash(&mut s); + s.finish() + } +} pub struct FusePruner { max_concurrency: usize, pub table_schema: TableSchemaRef, pub pruning_ctx: Arc, pub push_down: Option, + pub inverse_range_index: Option, + pub deleted_segments: Vec, } impl FusePruner { @@ -199,15 +223,33 @@ impl FusePruner { table_schema, push_down: push_down.clone(), pruning_ctx, + inverse_range_index: None, + deleted_segments: vec![], }) } + #[async_backtrace::framed] + pub async fn read_pruning( + &mut self, + segment_locs: Vec, + ) -> Result)>> { + self.pruning(segment_locs, false).await + } + + #[async_backtrace::framed] + pub async fn delete_pruning( + &mut self, + segment_locs: Vec, + ) -> Result)>> { + self.pruning(segment_locs, true).await + } // Pruning chain: // segment pruner -> block pruner -> topn pruner #[async_backtrace::framed] pub async fn pruning( - &self, + &mut self, mut segment_locs: Vec, + delete_purning: bool, ) -> Result)>> { // Segment pruner. 
let segment_pruner = @@ -224,7 +266,7 @@ impl FusePruner { remain -= gap_size; let mut batch = segment_locs.drain(0..batch_size).collect::>(); - + let inverse_range_index = self.get_inverse_range_index(); works.push(self.pruning_ctx.pruning_runtime.spawn({ let block_pruner = block_pruner.clone(); let segment_pruner = segment_pruner.clone(); @@ -243,12 +285,39 @@ impl FusePruner { } let mut res = vec![]; + let mut deleted_segments = vec![]; let pruned_segments = segment_pruner.pruning(batch).await?; - for (location, info) in pruned_segments { - res.extend(block_pruner.pruning(location, &info).await?); - } - Result::<_, ErrorCode>::Ok(res) + if delete_purning { + // inverse purn + for (segment_location, compact_segment_info) in &pruned_segments { + // for delete_purn + if !inverse_range_index + .as_ref() + .unwrap() + .should_keep(&compact_segment_info.summary.col_stats, None) + { + deleted_segments.push(DeletedSegmentInfo { + index: segment_location.segment_idx, + segment_info: ( + segment_location.location.clone(), + compact_segment_info.summary.clone(), + ), + }) + } else { + res.extend( + block_pruner + .pruning(segment_location.clone(), compact_segment_info) + .await?, + ); + } + } + } else { + for (location, info) in pruned_segments { + res.extend(block_pruner.pruning(location, &info).await?); + } + } + Result::<_, ErrorCode>::Ok((res, deleted_segments)) } })); } @@ -261,11 +330,18 @@ impl FusePruner { Ok(workers) => { let mut metas = vec![]; for worker in workers { - metas.extend(worker?); + let mut res = worker?; + metas.extend(res.0); + self.deleted_segments.append(&mut res.1); + } + if delete_purning { + Ok(metas) + } else { + // Todo:: for now, all operation (contains other mutation other than delete, like select,update etc.) + // will get here, we can prevent other mutations like update and so on. + // TopN pruner. + self.topn_pruning(metas) } - - // TopN pruner. 
- self.topn_pruning(metas) } } } @@ -314,4 +390,12 @@ impl FusePruner { blocks_bloom_pruning_after, } } + + pub fn set_inverse_range_index(&mut self, index: RangeIndex) { + self.inverse_range_index = Some(index) + } + + pub fn get_inverse_range_index(&self) -> Option { + self.inverse_range_index.clone() + } } diff --git a/src/query/storages/fuse/src/pruning/mod.rs b/src/query/storages/fuse/src/pruning/mod.rs index a45a1d2a6b27..34f9e16974aa 100644 --- a/src/query/storages/fuse/src/pruning/mod.rs +++ b/src/query/storages/fuse/src/pruning/mod.rs @@ -22,6 +22,7 @@ mod segment_pruner; pub use block_pruner::BlockPruner; pub use bloom_pruner::BloomPruner; pub use bloom_pruner::BloomPrunerCreator; +pub use fuse_pruner::DeletedSegmentInfo; pub use fuse_pruner::FusePruner; pub use fuse_pruner::PruningContext; pub use pruner_location::create_segment_location_vector; diff --git a/tests/sqllogictests/suites/base/03_common/03_0025_delete_from b/tests/sqllogictests/suites/base/03_common/03_0025_delete_from index aec2e69d9260..fb18f6802ab2 100644 --- a/tests/sqllogictests/suites/base/03_common/03_0025_delete_from +++ b/tests/sqllogictests/suites/base/03_common/03_0025_delete_from @@ -264,9 +264,137 @@ select * from t order by c; + statement ok drop table t all +#################################### +# delete pruning, whole segments # +#################################### + statement ok -DROP DATABASE db1 +create table t (c int); + +# add the first segment +statement ok +insert into t values(1),(2),(3); + +# add the second segment +statement ok +insert into t values(4),(5),(6); + +# add the third segment +statement ok +insert into t values(7),(8),(9); + +#clear metrics +statement ok +truncate table system.metrics; + +statement ok +delete from t where c > 3 and c < 8; + +# expects 1 block pruned and 1 whole segment pruned +query I +select value from system.metrics where metric = 
'fuse_deletion_segment_range_pruned_whole_segment_nums'; +---- +1.0 + +query I +select * from t order by c; +---- +1 +2 +3 +8 +9 + +# expects 1 whole block deletion: the block of value 2 +query I +select value from system.metrics where metric = 'fuse_deletion_block_range_pruned_whole_block_nums'; +---- +1.0 + +statement ok +drop table t all + +# test large data +statement ok +create table t (c int) row_per_block = 100000 block_per_segment = 1000; + +statement ok +set max_threads = 1; + +# 0-1000000 +statement ok +create table t_number(c int); + +statement ok +insert into t_number select * from numbers(1000000); + +# insert data +statement ok +insert into t select c from t_number; + +query I +select block_count from fuse_snapshot('db1','t'); +---- +10 + +query I +select segment_count from fuse_snapshot('db1','t'); +---- +1 + +statement ok +insert into t select c + 1000000 from t_number; + +query I +select block_count from fuse_snapshot('db1','t') order by block_count; +---- +10 +20 + +query I +select segment_count from fuse_snapshot('db1','t') order by segment_count; +---- +1 +2 + +#clear metrics +statement ok +truncate table system.metrics; + +statement ok +delete from t where c >= 0 and c < 1500000; + +query I +select value from system.metrics where metric = 'fuse_deletion_block_range_pruned_nums'; +---- +5.0 + +query I +select value from system.metrics where metric = 'fuse_deletion_segment_range_pruned_whole_segment_nums'; +---- +1.0 +query I +select value from system.metrics where metric = 'fuse_deletion_block_range_pruned_whole_block_nums'; +---- +15.0 + +query I +select count(*) from t; +---- +500000 + +statement ok +drop table t all + +statement ok +DROP DATABASE db1 diff --git a/tests/sqllogictests/suites/ee/01_ee_system/01_0001_computed_column b/tests/sqllogictests/suites/ee/01_ee_system/01_0001_computed_column index f0b8b8ab16de..960c36ebe4e0 100644 --- a/tests/sqllogictests/suites/ee/01_ee_system/01_0001_computed_column +++ 
b/tests/sqllogictests/suites/ee/01_ee_system/01_0001_computed_column @@ -101,7 +101,7 @@ statement ok drop table if exists t_virtual statement ok -create table t_virtual(a string null default 'a', b string null as (concat(a, '-', c)) stored, c string null default 'c') +create table t_virtual(a string null default 'a', b string null as (concat(a, '-', c)) virtual, c string null default 'c') statement ok insert into t_virtual values ('a1', 'c1'), ('a2', 'c2') diff --git a/tests/sqllogictests/suites/ee/05_ee_ddl/05_0003_ddl_create_add_computed_column b/tests/sqllogictests/suites/ee/05_ee_ddl/05_0003_ddl_create_add_computed_column index 025f2922ce47..1d29a6664ccb 100644 --- a/tests/sqllogictests/suites/ee/05_ee_ddl/05_0003_ddl_create_add_computed_column +++ b/tests/sqllogictests/suites/ee/05_ee_ddl/05_0003_ddl_create_add_computed_column @@ -48,6 +48,15 @@ alter table t1 rename column a to x statement ok alter table t1 drop column x +statement ok +create table t2(a string, b string generated always as (upper(a)) stored, c string generated always as (lower(a)) virtual) + +statement error 1058 +alter table t2 modify column c drop stored + +statement ok +alter table t2 modify column b drop stored + statement ok USE default diff --git a/tests/sqllogictests/suites/query/join.test b/tests/sqllogictests/suites/query/join.test index eb0a88cfc3df..6c184524cf01 100644 --- a/tests/sqllogictests/suites/query/join.test +++ b/tests/sqllogictests/suites/query/join.test @@ -630,3 +630,62 @@ select * from (select * from numbers(10)) n full join t1 on n.number = t1.a orde statement ok drop table t1; + +query III +select a.number, pt,register_at from ( select number, to_yyyymmdd(to_timestamp(number)) as pt + from numbers(10) where number > 5 +) a join ( select distinct number , to_yyyymmdd(to_timestamp(number)) as register_at from numbers(10) where number > 5 +) b on a.number=b.number order by a.number; +---- +6 19700101 19700101 +7 19700101 19700101 +8 19700101 19700101 +9 19700101 
19700101 + +# https://github.com/datafuselabs/databend/pull/11950 +statement ok +set max_block_size = 2; + +statement ok +drop table if exists t1; + +statement ok +create table t1(a int, b int) + +statement ok +insert into t1 values(1, 2), (2, 4), (3, 6), (4, 8), (5, 10) + +statement ok +drop table if exists t2 + +statement ok +create table t2(a int, b int) + +statement ok +insert into t2 values(1, 2), (1, 4), (1, 6), (1, 8), (1, 10); + +query I +select * from t1 left join t2 on t1.a = t2.a order by t1.a, t2.a +---- +1 2 1 10 +1 2 1 8 +1 2 1 6 +1 2 1 4 +1 2 1 2 +2 4 NULL NULL +3 6 NULL NULL +4 8 NULL NULL +5 10 NULL NULL + +# left join with conjunct +query II +select * from t1 left join t2 on t1.a = t2.a and t1.b > t2.b order by t1.a, t2.a +---- +1 2 NULL NULL +2 4 NULL NULL +3 6 NULL NULL +4 8 NULL NULL +5 10 NULL NULL + +statement ok +set max_block_size = 65536; \ No newline at end of file diff --git a/tests/sqllogictests/suites/query/window_function/window_wisconsin.test b/tests/sqllogictests/suites/query/window_function/window_wisconsin.test index 70882da10445..7f645bf96e26 100644 --- a/tests/sqllogictests/suites/query/window_function/window_wisconsin.test +++ b/tests/sqllogictests/suites/query/window_function/window_wisconsin.test @@ -143,6 +143,21 @@ SELECT ntile(5) OVER (ORDER BY ten, four) nn FROM tenk1 ORDER BY ten, four, nn 5 5 +# cume_dist +query I +SELECT CAST(cume_dist() OVER (PARTITION BY four ORDER BY ten)*10 as INT) FROM tenk1 WHERE unique2 < 10 order by four, ten +---- +6 +6 +10 +5 +5 +7 +10 +10 +5 +10 + # lead/lag query I SELECT lag(ten) OVER (PARTITION BY four ORDER BY ten) lt FROM tenk1 order by four, ten, lt nulls first diff --git a/website/blog/2023-07-02-databend-weekly-100.md b/website/blog/2023-07-02-databend-weekly-100.md new file mode 100644 index 000000000000..76ca1d3470a4 --- /dev/null +++ b/website/blog/2023-07-02-databend-weekly-100.md @@ -0,0 +1,154 @@ +--- +title: "This Week in Databend #100" +date: 2023-07-02 +slug: 
2023-07-02-databend-weekly +cover_url: 'weekly/weekly-100.png' +image: 'weekly/weekly-100.png' +tags: [weekly] +description: "Stay up to date with the latest weekly developments on Databend!" +contributors: + - name: ariesdevil + - name: b41sh + - name: BohuTANG + - name: dantengsky + - name: Dousir9 + - name: drmingdrmer + - name: everpcpc + - name: JackTan25 + - name: leiysky + - name: lichuang + - name: PsiACE + - name: RinChanNOWWW + - name: soyeric128 + - name: sundy-li + - name: TCeason + - name: Xuanwo + - name: xudong963 + - name: youngsofun + - name: zhang2014 + - name: ZhiHanZ + - name: zhyass +authors: + - name: PsiACE + url: https://github.com/psiace + image_url: https://github.com/psiace.png +--- + +[Databend](https://github.com/datafuselabs/databend) is a modern cloud data warehouse, serving your massive-scale analytics needs at low cost and complexity. Open source alternative to Snowflake. Also available in the cloud: <https://app.databend.com>. + +> For security reasons, the Root user is no longer available out of the box. You must configure it before use. Learn more at <https://databend.rs/doc/sql-clients/admin-users>. + +## What's On In Databend + +Stay connected with the latest news about Databend. + +### Announcing Databend v1.2! Data + AI + +Databend v1.2 was officially released on June 29, 2023! Thanks to all the community partners who participated and to everyone who contributed to making Databend better! + + +- New Data Type: `BITMAP` +- Direct Query of CSV/TSV/NDJSON Files Using Column Position +- New Hash Table: Improved Hash Join Performance +- AI Functions +- Computed Columns +- `VACUUM TABLE` +- Serverless Background Service +- Bind `databend` into Python +- BendSQL - Databend Native Command Line Tool +- Integration with Apache DolphinScheduler, Apache Flink CDC and Tableau + +If you are interested in learning more, please check out the resources listed below. 
+ +- [What's Fresh in Databend v1.2 | Blog | Databend](/blog/databend-changelog-1-2) + +## Code Corner + +Discover some fascinating code snippets or projects that showcase our work or learning journey. + +### Databend Long Run Tests + +Databend's long run tests the correctness and performance of the system under heavy load and concurrency. This includes concurrent large-scale data ingestion, table maintenance (optimization, re-clustering, and vacuuming), as well as querying. + +The test will run a series of SQL and validation commands to verify the results. It will begin by executing the pre-test scripts (`_before.sh`), followed by repeatedly running concurrent test scripts, and finally executing post-test scripts (`_after.sh`). All event logs will be stored in a table on Databend for further analysis. + +Databend conducts long run tests to verify the correctness and performance of the system under heavy load and concurrency. These tests involve concurrent ingestion of large-scale data, table maintenance (optimization, re-clustering, and vacuuming), as well as querying. + +During the testing process, a series of SQL commands and validation checks will be performed to ensure accurate results. The testing process will start by running pre-test scripts (`_before.sh`), followed by repeated execution of concurrent test scripts, and finally executing post-test scripts (`_after.sh`). All event logs will be stored in a Databend table for further analysis. 
+ +```lua + +-------------------+ + | Long Run | + +-------------------+ + | + | + v + +-----------------------+ + | Before Test Scripts | + +-----------------------+ + | + | + v + +----------------------------------+ + | Concurrent Test Scripts | + +----------------------------------+ + | | | + | | | + v v v ++----------------+ +----------------+ +----------------+ +| Test Script 1 | | Test Script 2 | | Test Script 3 | ++----------------+ +----------------+ +----------------+ + | + | + v + +-----------------------+ + | After Test Scripts | + +-----------------------+ + +``` + +If you are interested in learning more, please check out the resources listed below: + +- [Databend Long Run Tests](https://github.com/datafuselabs/databend/tree/main/tests/longrun) + +## Highlights + +We have also made these improvements to Databend that we hope you will find helpful: + +- Added more execution information in `system.query_profile`, which makes it easier than ever to profile your queries. +- Added the basic read support for Iceberg table. +- Added support for `ntile` window function. +- Added support for distributed copy into (first version). +- Read documents [Docs | Loading Data with Tools - Addax](https://databend.rs/doc/load-data/load-db/addax) and [Docs | Loading Data with Tools - DataX](https://databend.rs/doc/load-data/load-db/datax) to learn how to import data efficiently and conveniently. +- Read document [Docs | Working with Stages - Staging Files](https://databend.rs/doc/load-data/stage/stage-files) to learn how to use the Presigned URL method for uploading files to the stage. + +## What's Up Next + +We're always open to cutting-edge technologies and innovative ideas. You're more than welcome to join the community and bring them to Databend. + +### Release Proposal: Nightly v1.3 + +Databend v1.3 is scheduled for release on August 1st and will primarily focus on enhancing stability. 
+ +| Task | Status | +| -------------------------------------------------------------------------------------------- | ----------- | +| (Query) [JSON indexing#6994](https://github.com/datafuselabs/databend/issues/6994) | IN PROGRESS | +| (Query+Storage) Create index feature | IN PROGRESS | +| (Query+Storage)[Distributed COPY#8594](https://github.com/datafuselabs/databend/issues/8594) | IN PROGRESS | +| (Query+Storage) Distributed REPLACE | PLAN | +| [COPY returns more status](https://github.com/datafuselabs/databend/issues/7730) | PLAN | +| (Query+Storage) Query apache/iceberg | IN PROGRESS | +| (Processor) OrderBy Spill | IN PROGRESS | +| (Stability) Fast update/delete with fuse engine | IN PROGRESS | +| (Stability) Query profiling | IN PROGRESS | +| (Test) Longrun framework:BendRun | IN PROGRESS | + +[Issue #11868 | Release proposal: Nightly v1.3](https://github.com/datafuselabs/databend/issues/11868) + +Please let us know if you're interested in contributing to this issue, or pick up a good first issue at <https://link.databend.rs/i-m-feeling-lucky> to get started. + +## Changelog + +You can check the changelog of Databend Nightly for details about our latest developments. + +**Full Changelog**: diff --git a/website/static/img/blog/weekly/weekly-100.png b/website/static/img/blog/weekly/weekly-100.png new file mode 100644 index 000000000000..ec7f7a1d0af4 Binary files /dev/null and b/website/static/img/blog/weekly/weekly-100.png differ