Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(subscription): Support specified pk read log store #19274

Open
wants to merge 23 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
36 changes: 36 additions & 0 deletions e2e_test/subscription/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -543,6 +543,41 @@ def test_order_multi_pk():
check_rows_data([17,17,17,17],row[4],"Insert")
drop_table_subscription()

def test_explain_cursor():
    """Check that EXPLAIN FETCH NEXT on a subscription cursor produces the
    expected batch plan as the cursor advances through the log store.

    Three phases are asserted:
      1. Before any fetch: a plain snapshot BatchScan with no scan_ranges.
      2. After one fetch: a BatchScan whose scan_ranges resume strictly after
         the last-read pk (v1, v2) > (1, 1).
      3. After the snapshot rows are exhausted: a BatchLogSeqScan over the log
         store, again bounded by the last-read pk (v1, v2) > (3, 3).
    """
    print("test_explain_cursor")
    create_table_subscription()
    conn = psycopg2.connect(
        host="localhost",
        port="4566",
        user="root",
        database="dev",
    )
    # Rows inserted before the cursor is declared are served from the snapshot.
    execute_insert("insert into t5 values(1,1,1,1)", conn)
    execute_insert("flush", conn)
    execute_insert("insert into t5 values(2,2,2,2)", conn)
    execute_insert("flush", conn)
    execute_insert("declare cur subscription cursor for sub5 full", conn)
    # Rows inserted after declaration can only be observed via the log store.
    execute_insert("insert into t5 values(3,3,3,3)", conn)
    execute_insert("flush", conn)
    execute_insert("insert into t5 values(4,4,4,4)", conn)
    execute_insert("flush", conn)

    # Phase 1: nothing fetched yet -> full snapshot scan, no scan_ranges.
    plan = execute_query("explain fetch next from cur", conn)
    assert plan[0][0] == "BatchExchange { order: [t5.v1 ASC, t5.v2 ASC], dist: Single }"
    assert plan[1][0] == "└─BatchScan { table: t5, columns: [v1, v2, v3, v4] }"

    # Phase 2: one row consumed -> snapshot scan resumes after pk (1, 1).
    execute_query("fetch next from cur", conn)
    plan = execute_query("explain fetch next from cur", conn)
    assert plan[0][0] == "BatchExchange { order: [t5.v1 ASC, t5.v2 ASC], dist: Single }"
    assert plan[1][0] == "└─BatchScan { table: t5, columns: [v1, v2, v3, v4], scan_ranges: [(v1, v2) > (Int32(1), Int32(1))] }"

    # Phase 3: snapshot exhausted -> log-store scan resumes after pk (3, 3).
    execute_query("fetch next from cur", conn)
    execute_query("fetch next from cur", conn)
    plan = execute_query("explain fetch next from cur", conn)
    assert plan[0][0] == "BatchExchange { order: [t5.v1 ASC, t5.v2 ASC], dist: Single }"
    assert "└─BatchLogSeqScan { table: t5, columns: [v1, v2, v3, v4, op]" in plan[1][0]
    assert "scan_ranges: [(v1, v2) > (Int32(3), Int32(3))] }" in plan[1][0]
    execute_query("fetch next from cur", conn)
    drop_table_subscription()

if __name__ == "__main__":
test_cursor_snapshot()
test_cursor_op()
Expand All @@ -559,3 +594,4 @@ def test_order_multi_pk():
test_order_mv()
test_order_multi_pk()
test_block_cursor()
test_explain_cursor()
1 change: 1 addition & 0 deletions proto/batch_plan.proto
Original file line number Diff line number Diff line change
Expand Up @@ -140,6 +140,7 @@ message LogRowSeqScanNode {
common.BatchQueryEpoch old_epoch = 4;
common.BatchQueryEpoch new_epoch = 5;
bool ordered = 6;
repeated ScanRange scan_ranges = 7;
}

message InsertNode {
Expand Down
49 changes: 34 additions & 15 deletions src/batch/executors/src/executor/log_row_seq_scan.rs
Original file line number Diff line number Diff line change
Expand Up @@ -33,8 +33,11 @@ use risingwave_storage::table::batch_table::storage_table::StorageTable;
use risingwave_storage::table::collect_data_chunk;
use risingwave_storage::{dispatch_state_store, StateStore};

use super::{BoxedDataChunkStream, BoxedExecutor, BoxedExecutorBuilder, Executor, ExecutorBuilder};
use super::{
BoxedDataChunkStream, BoxedExecutor, BoxedExecutorBuilder, Executor, ExecutorBuilder, ScanRange,
};
use crate::error::{BatchError, Result};
use crate::executor::build_scan_ranges_from_pb;
use crate::monitor::BatchMetrics;

pub struct LogRowSeqScanExecutor<S: StateStore> {
Expand All @@ -52,6 +55,7 @@ pub struct LogRowSeqScanExecutor<S: StateStore> {
new_epoch: u64,
version_id: HummockVersionId,
ordered: bool,
scan_ranges: Vec<ScanRange>,
}

impl<S: StateStore> LogRowSeqScanExecutor<S> {
Expand All @@ -64,6 +68,7 @@ impl<S: StateStore> LogRowSeqScanExecutor<S> {
identity: String,
metrics: Option<BatchMetrics>,
ordered: bool,
scan_ranges: Vec<ScanRange>,
) -> Self {
let mut schema = table.schema().clone();
schema.fields.push(Field::with_name(
Expand All @@ -80,6 +85,7 @@ impl<S: StateStore> LogRowSeqScanExecutor<S> {
new_epoch,
version_id,
ordered,
scan_ranges,
}
}
}
Expand Down Expand Up @@ -137,6 +143,9 @@ impl BoxedExecutorBuilder for LogStoreRowSeqScanExecutorBuilder {
let old_epoch = old_epoch.epoch;
let new_epoch = new_epoch.epoch;

let scan_ranges =
build_scan_ranges_from_pb(&log_store_seq_scan_node.scan_ranges, table_desc)?;

dispatch_state_store!(source.context().state_store(), state_store, {
let table = StorageTable::new_partial(state_store, column_ids, vnodes, table_desc);
Ok(Box::new(LogRowSeqScanExecutor::new(
Expand All @@ -148,6 +157,7 @@ impl BoxedExecutorBuilder for LogStoreRowSeqScanExecutorBuilder {
source.plan_node().get_identity().clone(),
metrics,
log_store_seq_scan_node.ordered,
scan_ranges,
)))
})
}
Expand Down Expand Up @@ -178,6 +188,7 @@ impl<S: StateStore> LogRowSeqScanExecutor<S> {
version_id,
schema,
ordered,
scan_ranges,
..
} = *self;
let table = std::sync::Arc::new(table);
Expand All @@ -189,20 +200,23 @@ impl<S: StateStore> LogRowSeqScanExecutor<S> {
// Range Scan
// WARN: DO NOT use `select` to execute range scans concurrently
// it can consume too much memory if there're too many ranges.
let stream = Self::execute_range(
table.clone(),
old_epoch,
new_epoch,
version_id,
chunk_size,
histogram,
Arc::new(schema.clone()),
ordered,
);
#[for_await]
for chunk in stream {
let chunk = chunk?;
yield chunk;
for range in scan_ranges {
let stream = Self::execute_range(
table.clone(),
old_epoch,
new_epoch,
version_id,
chunk_size,
histogram,
Arc::new(schema.clone()),
ordered,
range,
);
#[for_await]
for chunk in stream {
let chunk = chunk?;
yield chunk;
}
}
}

Expand All @@ -216,13 +230,18 @@ impl<S: StateStore> LogRowSeqScanExecutor<S> {
histogram: Option<impl Deref<Target = Histogram>>,
schema: Arc<Schema>,
ordered: bool,
scan_range: ScanRange,
) {
let pk_prefix = scan_range.pk_prefix.clone();
let range_bounds = scan_range.convert_to_range_bounds(&table);
// Range Scan.
let iter = table
.batch_iter_log_with_pk_bounds(
old_epoch,
HummockReadEpoch::BatchQueryCommitted(new_epoch, version_id),
ordered,
range_bounds,
pk_prefix,
)
.await?
.flat_map(|r| {
Expand Down
158 changes: 8 additions & 150 deletions src/batch/executors/src/executor/row_seq_scan.rs
Original file line number Diff line number Diff line change
Expand Up @@ -11,33 +11,31 @@
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use std::ops::{Bound, Deref};
use std::ops::Deref;
use std::sync::Arc;

use futures::{pin_mut, StreamExt};
use futures_async_stream::try_stream;
use itertools::Itertools;
use prometheus::Histogram;
use risingwave_common::array::DataChunk;
use risingwave_common::bitmap::Bitmap;
use risingwave_common::catalog::{ColumnId, Schema};
use risingwave_common::hash::VnodeCountCompat;
use risingwave_common::row::{OwnedRow, Row};
use risingwave_common::types::DataType;
use risingwave_common::util::chunk_coalesce::DataChunkBuilder;
use risingwave_common::util::value_encoding::deserialize_datum;
use risingwave_pb::batch_plan::plan_node::NodeBody;
use risingwave_pb::batch_plan::{scan_range, PbScanRange};
use risingwave_pb::common::BatchQueryEpoch;
use risingwave_pb::plan_common::as_of::AsOfType;
use risingwave_pb::plan_common::{as_of, PbAsOf, StorageTableDesc};
use risingwave_storage::store::PrefetchOptions;
use risingwave_storage::table::batch_table::storage_table::StorageTable;
use risingwave_storage::{dispatch_state_store, StateStore};

use super::ScanRange;
use crate::error::{BatchError, Result};
use crate::executor::{
BoxedDataChunkStream, BoxedExecutor, BoxedExecutorBuilder, Executor, ExecutorBuilder,
build_scan_ranges_from_pb, BoxedDataChunkStream, BoxedExecutor, BoxedExecutorBuilder, Executor,
ExecutorBuilder,
};
use crate::monitor::BatchMetrics;

Expand All @@ -57,17 +55,6 @@ pub struct RowSeqScanExecutor<S: StateStore> {
limit: Option<u64>,
as_of: Option<AsOf>,
}

/// Range for batch scan.
#[derive(Debug)]
pub struct ScanRange {
/// The prefix of the primary key.
pub pk_prefix: OwnedRow,

/// The range bounds of the next column.
pub next_col_bounds: (Bound<OwnedRow>, Bound<OwnedRow>),
}

#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub struct AsOf {
pub timestamp: i64,
Expand Down Expand Up @@ -98,71 +85,6 @@ impl From<&AsOf> for PbAsOf {
}
}

impl ScanRange {
    /// Create a scan range from the prost representation.
    ///
    /// `pk_types` lists the data types of the primary-key columns in pk order.
    /// The first `eq_conds.len()` entries type the equality prefix; the entry
    /// immediately after the prefix types the bounded "next" pk column.
    ///
    /// # Panics
    /// Panics if `pk_types` has fewer entries than the encoded values require
    /// (the `.unwrap()` on `pk_types.get(index)`).
    pub fn new(scan_range: PbScanRange, pk_types: Vec<DataType>) -> Result<Self> {
        // `index` walks `pk_types` in step with the serialized values: the
        // eq_conds consume a prefix of the pk columns, and the range bounds
        // (if any) continue from the column right after that prefix.
        let mut index = 0;
        let pk_prefix = OwnedRow::new(
            scan_range
                .eq_conds
                .iter()
                .map(|v| {
                    let ty = pk_types.get(index).unwrap();
                    index += 1;
                    deserialize_datum(v.as_slice(), ty)
                })
                .try_collect()?,
        );
        // No bounds at all: this is an equality-prefix (or full-table) scan.
        if scan_range.lower_bound.is_none() && scan_range.upper_bound.is_none() {
            return Ok(Self {
                pk_prefix,
                ..Self::full()
            });
        }

        // Deserialize one protobuf bound into a `Bound<OwnedRow>`. `index` is
        // taken by value so each bound restarts at the same pk column — lower
        // and upper bounds constrain the same column(s) after the prefix.
        let build_bound = |bound: &scan_range::Bound, mut index| -> Result<Bound<OwnedRow>> {
            let next_col_bounds = OwnedRow::new(
                bound
                    .value
                    .iter()
                    .map(|v| {
                        let ty = pk_types.get(index).unwrap();
                        index += 1;
                        deserialize_datum(v.as_slice(), ty)
                    })
                    .try_collect()?,
            );
            if bound.inclusive {
                Ok(Bound::Included(next_col_bounds))
            } else {
                Ok(Bound::Excluded(next_col_bounds))
            }
        };

        let next_col_bounds: (Bound<OwnedRow>, Bound<OwnedRow>) = match (
            scan_range.lower_bound.as_ref(),
            scan_range.upper_bound.as_ref(),
        ) {
            (Some(lb), Some(ub)) => (build_bound(lb, index)?, build_bound(ub, index)?),
            (None, Some(ub)) => (Bound::Unbounded, build_bound(ub, index)?),
            (Some(lb), None) => (build_bound(lb, index)?, Bound::Unbounded),
            // Unreachable: the no-bounds case already returned above.
            (None, None) => unreachable!(),
        };
        Ok(Self {
            pk_prefix,
            next_col_bounds,
        })
    }

    /// Create a scan range for full table scan: empty pk prefix and
    /// unbounded range on both sides.
    pub fn full() -> Self {
        Self {
            pk_prefix: OwnedRow::default(),
            next_col_bounds: (Bound::Unbounded, Bound::Unbounded),
        }
    }
}

impl<S: StateStore> RowSeqScanExecutor<S> {
pub fn new(
table: StorageTable<S>,
Expand Down Expand Up @@ -219,31 +141,7 @@ impl BoxedExecutorBuilder for RowSeqScanExecutorBuilder {
None => Some(Bitmap::ones(table_desc.vnode_count()).into()),
};

let scan_ranges = {
let scan_ranges = &seq_scan_node.scan_ranges;
if scan_ranges.is_empty() {
vec![ScanRange::full()]
} else {
scan_ranges
.iter()
.map(|scan_range| {
let pk_types = table_desc
.pk
.iter()
.map(|order| {
DataType::from(
table_desc.columns[order.column_index as usize]
.column_type
.as_ref()
.unwrap(),
)
})
.collect_vec();
ScanRange::new(scan_range.clone(), pk_types)
})
.try_collect()?
}
};
let scan_ranges = build_scan_ranges_from_pb(&seq_scan_node.scan_ranges, table_desc)?;

let ordered = seq_scan_node.ordered;

Expand Down Expand Up @@ -429,55 +327,15 @@ impl<S: StateStore> RowSeqScanExecutor<S> {
limit: Option<u64>,
histogram: Option<impl Deref<Target = Histogram>>,
) {
let ScanRange {
pk_prefix,
next_col_bounds,
} = scan_range;

// The len of a valid pk_prefix should be less than or equal pk's num.
let order_type = table.pk_serializer().get_order_types()[pk_prefix.len()];
let (start_bound, end_bound) = if order_type.is_ascending() {
(next_col_bounds.0, next_col_bounds.1)
} else {
(next_col_bounds.1, next_col_bounds.0)
};

let start_bound_is_bounded = !matches!(start_bound, Bound::Unbounded);
let end_bound_is_bounded = !matches!(end_bound, Bound::Unbounded);

let build_bound = |other_bound_is_bounded: bool, bound, order_type_nulls| {
match bound {
Bound::Unbounded => {
if other_bound_is_bounded && order_type_nulls {
// `NULL`s are at the start bound side, we should exclude them to meet SQL semantics.
Bound::Excluded(OwnedRow::new(vec![None]))
} else {
// Both start and end are unbounded, so we need to select all rows.
Bound::Unbounded
}
}
Bound::Included(x) => Bound::Included(x),
Bound::Excluded(x) => Bound::Excluded(x),
}
};
let start_bound = build_bound(
end_bound_is_bounded,
start_bound,
order_type.nulls_are_first(),
);
let end_bound = build_bound(
start_bound_is_bounded,
end_bound,
order_type.nulls_are_last(),
);

let pk_prefix = scan_range.pk_prefix.clone();
let range_bounds = scan_range.convert_to_range_bounds(&table);
// Range Scan.
assert!(pk_prefix.len() < table.pk_indices().len());
let iter = table
.batch_chunk_iter_with_pk_bounds(
epoch.into(),
&pk_prefix,
(start_bound, end_bound),
range_bounds,
ordered,
chunk_size,
PrefetchOptions::new(limit.is_none(), true),
Expand Down
Loading
Loading