Skip to content

Commit

Permalink
Merge remote-tracking branch 'origin' into tab/drop-table-connector
Browse files Browse the repository at this point in the history
  • Loading branch information
tabversion committed Jan 7, 2025
2 parents f008d79 + f530c10 commit 6d2f2d5
Show file tree
Hide file tree
Showing 52 changed files with 2,101 additions and 735 deletions.
274 changes: 59 additions & 215 deletions Cargo.lock

Large diffs are not rendered by default.

1 change: 0 additions & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -377,7 +377,6 @@ tokio-postgres = { git = "https://github.com/madsim-rs/rust-postgres.git", rev =
# sqlx version: v0.8.2
# patch diffs: https://github.com/madsim-rs/sqlx/pull/4
sqlx = { git = "https://github.com/madsim-rs/sqlx.git", rev = "3efe6d0065963db2a2b7f30dee69f647e28dec81" }
futures-timer = { git = "https://github.com/madsim-rs/futures-timer.git", rev = "05b33b4" }
# patch to remove preserve_order from serde_json
bson = { git = "https://github.com/risingwavelabs/bson-rust", rev = "e5175ec" }
# TODO: unpatch after PR merged https://github.com/tokio-rs/prost/pull/1210
Expand Down
6 changes: 6 additions & 0 deletions ci/scripts/check.sh
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,12 @@ configure_static_openssl
echo "--- Run trailing spaces check"
scripts/check/check-trailing-spaces.sh

echo "--- Check protobuf code format && Lint protobuf"
cd proto
buf format -d --exit-code
buf lint
cd ..

echo "--- Rust cargo-sort check"
cargo sort --check --workspace --grouped

Expand Down
5 changes: 5 additions & 0 deletions ci/scripts/common.sh
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,11 @@ export RW_SECRET_STORE_PRIVATE_KEY_HEX="0123456789abcdef0123456789abcdef"
unset LANG

function dump_diagnose_info() {
ret=$?
if [ $ret -eq 0 ]; then
exit 0
fi

echo "^^^ +++"
echo "--- Failed to run command! Dumping diagnose info..."
if [ -f .risingwave/config/risedev-env ]; then
Expand Down
2 changes: 1 addition & 1 deletion ci/scripts/e2e-kafka-sink-test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -148,7 +148,7 @@ rpk topic delete test-rw-sink-debezium

# test different encoding
echo "preparing confluent schema registry"
python3 -m pip install --break-system-packages requests confluent-kafka
python3 -m pip install --break-system-packages -r e2e_test/requirements.txt

echo "testing protobuf"
risedev slt 'e2e_test/sink/kafka/protobuf.slt'
Expand Down
11 changes: 0 additions & 11 deletions ci/scripts/misc-check.sh

This file was deleted.

7 changes: 6 additions & 1 deletion ci/scripts/run-backfill-tests.sh
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ else
RUNTIME_CLUSTER_PROFILE='ci-backfill-3cn-1fe-with-monitoring'
MINIO_RATE_LIMIT_CLUSTER_PROFILE='ci-backfill-3cn-1fe-with-monitoring-and-minio-rate-limit'
fi
export RUST_LOG="info,risingwave_stream=info,risingwave_batch=info,risingwave_storage=info" \
export RUST_LOG="info,risingwave_stream=info,risingwave_stream::executor::backfill=debug,risingwave_batch=info,risingwave_storage=info,risingwave_meta::barrier=debug" \

run_sql_file() {
psql -h localhost -p 4566 -d dev -U root -f "$@"
Expand Down Expand Up @@ -304,6 +304,11 @@ test_snapshot_backfill() {

wait

TEST_NAME=nexmark_q3 sqllogictest -p 4566 -d dev 'e2e_test/backfill/snapshot_backfill/scale.slt' &
TEST_NAME=nexmark_q7 sqllogictest -p 4566 -d dev 'e2e_test/backfill/snapshot_backfill/scale.slt' &

wait

TEST_NAME=nexmark_q3 sqllogictest -p 4566 -d dev 'e2e_test/backfill/snapshot_backfill/drop_mv.slt' &
TEST_NAME=nexmark_q7 sqllogictest -p 4566 -d dev 'e2e_test/backfill/snapshot_backfill/drop_mv.slt' &

Expand Down
15 changes: 0 additions & 15 deletions ci/workflows/pull-request.yml
Original file line number Diff line number Diff line change
Expand Up @@ -656,21 +656,6 @@ steps:
cancel_on_build_failing: true
retry: *auto-retry

- label: "misc check"
command: "ci/scripts/misc-check.sh"
if: |
!(build.pull_request.labels includes "ci/pr/run-selected") && build.env("CI_STEPS") == null
|| build.pull_request.labels includes "ci/run-misc-check"
|| build.env("CI_STEPS") =~ /(^|,)misc-check(,|$$)/
plugins:
- docker-compose#v5.5.0:
run: rw-build-env
config: ci/docker-compose.yml
- shellcheck#v1.2.0:
files: ./**/*.sh
timeout_in_minutes: 5
retry: *auto-retry

# The following jobs are triggered only when PR has corresponding labels.

# Generates cpu flamegraph env
Expand Down
2 changes: 1 addition & 1 deletion docker/dashboards/risingwave-dev-dashboard.json

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion docker/dashboards/risingwave-user-dashboard.json

Large diffs are not rendered by default.

2 changes: 0 additions & 2 deletions docs/dev/src/tests/intro.md
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,6 @@ RisingWave requires all code to pass fmt, clippy, sort and hakari checks. Run th
./risedev c # Run all checks. Shortcut for ./risedev check
```

There are also some miscellaneous checks. See `ci/scripts/misc-check.sh`.

## Unit and integration tests

RiseDev runs unit tests with cargo-nextest. To run unit tests:
Expand Down
15 changes: 15 additions & 0 deletions e2e_test/backfill/snapshot_backfill/scale.slt
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
control substitution on

statement ok
alter materialized view ${TEST_NAME}_mv set parallelism to 1;

sleep 3s

include ./check_data_equal.slt.part

statement ok
alter materialized view ${TEST_NAME}_mv set parallelism to 4;

sleep 3s

include ./check_data_equal.slt.part
55 changes: 55 additions & 0 deletions e2e_test/s3/file_sink.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,46 @@ def do_test(config, file_num, item_num_per_file, prefix):
def _table():
return 's3_test_parquet'

print("test table function file scan")
cur.execute(f'''
SELECT
id,
name,
sex,
mark,
test_int,
test_int8,
test_uint8,
test_uint16,
test_uint32,
test_uint64,
test_float_16,
test_real,
test_double_precision,
test_varchar,
test_bytea,
test_date,
test_time,
test_timestamp_s,
test_timestamp_ms,
test_timestamp_us,
test_timestamp_ns,
test_timestamptz_s,
test_timestamptz_ms,
test_timestamptz_us,
test_timestamptz_ns
FROM file_scan(
'parquet',
's3',
'http://127.0.0.1:9301',
'hummockadmin',
'hummockadmin',
's3://hummock001/test_file_scan/test_file_scan.parquet'
);''')
result = cur.fetchone()
assert result[0] == 0, f'file scan assertion failed: the first column is {result[0]}, expect 0.'

print("file scan test pass")
# Execute a SELECT statement
cur.execute(f'''CREATE TABLE {_table()}(
id bigint primary key,
Expand Down Expand Up @@ -491,6 +531,21 @@ def _assert_greater(field, got, expect):
_s3(idx),
_local(idx)
)
# put parquet file to test table function file scan
if data:
first_file_data = data[0]
first_table = pa.Table.from_pandas(pd.DataFrame(first_file_data))

first_file_name = f"test_file_scan.parquet"
first_file_path = f"test_file_scan/{first_file_name}"

pq.write_table(first_table, "data_0.parquet")

client.fput_object(
"hummock001",
first_file_path,
"data_0.parquet"
)

# do test
do_test(config, FILE_NUM, ITEM_NUM_PER_FILE, run_id)
Expand Down
4 changes: 2 additions & 2 deletions e2e_test/source_inline/pubsub/prepare-data.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,8 @@
---cargo
[dependencies]
anyhow = "1"
google-cloud-googleapis = { version = "0.13", features = ["pubsub"] }
google-cloud-pubsub = "0.25"
google-cloud-googleapis = { version = "0.16", features = ["pubsub"] }
google-cloud-pubsub = "0.30"
tokio = { version = "0.2", package = "madsim-tokio", features = [
"rt",
"rt-multi-thread",
Expand Down
108 changes: 108 additions & 0 deletions grafana/risingwave-dev-dashboard.dashboard.py
Original file line number Diff line number Diff line change
Expand Up @@ -4661,6 +4661,113 @@ def section_udf(outer_panels):
)
]

def section_alert_overview(panels):
return [
panels.row("Alert Overview"),
panels.timeseries_count(
"Alerts",
"""Alerts in the system group by type:
- Too Many Barriers: there are too many uncommitted barriers generated. This means the streaming graph is stuck or under heavy load. Check 'Barrier Latency' panel.
- Recovery Triggered: cluster recovery is triggered. Check 'Errors by Type' / 'Node Count' panels.
- Lagging Version: the checkpointed or pinned version id is lagging behind the current version id. Check 'Hummock Manager' section in dev dashboard.
- Lagging Compaction: there are too many ssts in L0. This can be caused by compactor failure or lag of compactor resource. Check 'Compaction' section in dev dashboard, and take care of the type of 'Commit Flush Bytes' and 'Compaction Throughput', whether the throughput is too low.
- Lagging Vacuum: there are too many stale files waiting to be cleaned. This can be caused by compactor failure or lag of compactor resource. Check 'Compaction' section in dev dashboard.
- Abnormal Meta Cache Memory: the meta cache memory usage is too large, exceeding the expected 10 percent.
- Abnormal Block Cache Memory: the block cache memory usage is too large, exceeding the expected 10 percent.
- Abnormal Uploading Memory Usage: uploading memory is more than 70 percent of the expected, and is about to spill.
- Write Stall: Compaction cannot keep up. Stall foreground write, Check 'Compaction' section in dev dashboard.
- Abnormal Version Size: the size of the version is too large, exceeding the expected 300MB. Check 'Hummock Manager' section in dev dashboard.
- Abnormal Delta Log Number: the number of delta logs is too large, exceeding the expected 5000. Check 'Hummock Manager' and `Compaction` section in dev dashboard and take care of the type of 'Compaction Success Count', whether the number of trivial-move tasks spiking.
- Abnormal Pending Event Number: the number of pending events is too large, exceeding the expected 10000000. Check 'Hummock Write' section in dev dashboard and take care of the 'Event handle latency', whether the time consumed exceeds the barrier latency.
- Abnormal Object Storage Failure: the number of object storage failures is too large, exceeding the expected 50. Check 'Object Storage' section in dev dashboard and take care of the 'Object Storage Failure Rate', whether the rate is too high.
""",
[
panels.target(
f"{metric('all_barrier_nums')} >= bool 200",
"Too Many Barriers {{database_id}}",
),
panels.target(
f"sum(rate({metric('recovery_latency_count')}[$__rate_interval])) > bool 0 + sum({metric('recovery_failure_cnt')}) > bool 0",
"Recovery Triggered",
),
panels.target(
f"(({metric('storage_current_version_id')} - {metric('storage_checkpoint_version_id')}) >= bool 100) + "
+ f"(({metric('storage_current_version_id')} - {metric('storage_min_pinned_version_id')}) >= bool 100)",
"Lagging Version",
),
panels.target(
f"sum(label_replace({metric('storage_level_total_file_size')}, 'L0', 'L0', 'level_index', '.*_L0') unless "
+ f"{metric('storage_level_total_file_size')}) by (L0) >= bool 52428800",
"Lagging Compaction",
),
panels.target(
f"{metric('storage_stale_object_count')} >= bool 200",
"Lagging Vacuum",
),
panels.target(
f"{metric('state_store_meta_cache_usage_ratio')} >= bool 1.1",
"Abnormal Meta Cache Memory",
),
panels.target(
f"{metric('state_store_block_cache_usage_ratio')} >= bool 1.1",
"Abnormal Block Cache Memory",
),
panels.target(
f"{metric('state_store_uploading_memory_usage_ratio')} >= bool 0.7",
"Abnormal Uploading Memory Usage",
),
panels.target(
f"{metric('storage_write_stop_compaction_groups')} > bool 0",
"Write Stall",
),
panels.target(
f"{metric('storage_version_size')} >= bool 314572800",
"Abnormal Version Size",
),
panels.target(
f"{metric('storage_delta_log_count')} >= bool 5000",
"Abnormal Delta Log Number",
),
panels.target(
f"{metric('state_store_event_handler_pending_event')} >= bool 10000000",
"Abnormal Pending Event Number",
),
panels.target(
f"{metric('object_store_failure_count')} >= bool 50",
"Abnormal Object Storage Failure",
),
],
["last"],
),
panels.timeseries_count(
"Errors",
"Errors in the system group by type",
[
panels.target(
f"sum({metric('user_compute_error')}) by (error_type, executor_name, fragment_id)",
"{{error_type}} @ {{executor_name}} (fragment_id={{fragment_id}})",
),
panels.target(
f"sum({metric('user_source_error')}) by (error_type, source_id, source_name, fragment_id)",
"{{error_type}} @ {{source_name}} (source_id={{source_id}} fragment_id={{fragment_id}})",
),
panels.target(
f"sum({metric('user_sink_error')}) by (error_type, sink_id, sink_name, fragment_id)",
"{{error_type}} @ {{sink_name}} (sink_id={{sink_id}} fragment_id={{fragment_id}})",
),
panels.target(
f"{metric('source_status_is_up')} == 0",
"source error: source_id={{source_id}}, source_name={{source_name}} @ {{%s}}"
% NODE_LABEL,
),
panels.target(
f"sum(rate({metric('object_store_failure_count')}[$__rate_interval])) by ({NODE_LABEL}, {COMPONENT_LABEL}, type)",
"remote storage error {{type}}: {{%s}} @ {{%s}}"
% (COMPONENT_LABEL, NODE_LABEL),
),
],
),
]

templating_list = []
if dynamic_source_enabled:
Expand Down Expand Up @@ -4840,5 +4947,6 @@ def section_udf(outer_panels):
*section_network_connection(panels),
*section_iceberg_metrics(panels),
*section_udf(panels),
*section_alert_overview(panels),
],
).auto_panel_ids()
2 changes: 1 addition & 1 deletion grafana/risingwave-dev-dashboard.json

Large diffs are not rendered by default.

33 changes: 24 additions & 9 deletions grafana/risingwave-user-dashboard.dashboard.py
Original file line number Diff line number Diff line change
Expand Up @@ -131,13 +131,16 @@ def section_overview(panels):
- Too Many Barriers: there are too many uncommitted barriers generated. This means the streaming graph is stuck or under heavy load. Check 'Barrier Latency' panel.
- Recovery Triggered: cluster recovery is triggered. Check 'Errors by Type' / 'Node Count' panels.
- Lagging Version: the checkpointed or pinned version id is lagging behind the current version id. Check 'Hummock Manager' section in dev dashboard.
- Lagging Epoch: the pinned or safe epoch is lagging behind the current max committed epoch. Check 'Hummock Manager' section in dev dashboard.
- Lagging Compaction: there are too many files in L0. This can be caused by compactor failure or lag of compactor resource. Check 'Compaction' section in dev dashboard.
- Lagging Compaction: there are too many ssts in L0. This can be caused by compactor failure or lag of compactor resource. Check 'Compaction' section in dev dashboard, and take care of the type of 'Commit Flush Bytes' and 'Compaction Throughput', whether the throughput is too low.
- Lagging Vacuum: there are too many stale files waiting to be cleaned. This can be caused by compactor failure or lag of compactor resource. Check 'Compaction' section in dev dashboard.
- Abnormal Meta Cache Memory: the meta cache memory usage is too large, exceeding the expected 10 percent.
- Abnormal Block Cache Memory: the block cache memory usage is too large, exceeding the expected 10 percent.
- Abnormal Uploading Memory Usage: uploading memory is more than 70 percent of the expected, and is about to spill.
- Write Stall: Compaction cannot keep up. Stall foreground write.
- Write Stall: Compaction cannot keep up. Stall foreground write, Check 'Compaction' section in dev dashboard.
- Abnormal Version Size: the size of the version is too large, exceeding the expected 300MB. Check 'Hummock Manager' section in dev dashboard.
- Abnormal Delta Log Number: the number of delta logs is too large, exceeding the expected 5000. Check 'Hummock Manager' and `Compaction` section in dev dashboard and take care of the type of 'Compaction Success Count', whether the number of trivial-move tasks spiking.
- Abnormal Pending Event Number: the number of pending events is too large, exceeding the expected 10000000. Check 'Hummock Write' section in dev dashboard and take care of the 'Event handle latency', whether the time consumed exceeds the barrier latency.
- Abnormal Object Storage Failure: the number of object storage failures is too large, exceeding the expected 50. Check 'Object Storage' section in dev dashboard and take care of the 'Object Storage Failure Rate', whether the rate is too high.
""",
[
panels.target(
Expand All @@ -154,12 +157,8 @@ def section_overview(panels):
"Lagging Version",
),
panels.target(
f"(({metric('storage_max_committed_epoch')} - {metric('storage_min_pinned_epoch')}) >= bool 6553600000 unless + {metric('storage_min_pinned_epoch')} == 0)",
"Lagging Epoch",
),
panels.target(
f"sum(label_replace({metric('storage_level_sst_num')}, 'L0', 'L0', 'level_index', '.*_L0') unless "
+ f"{metric('storage_level_sst_num')}) by (L0) >= bool 200",
f"sum(label_replace({metric('storage_level_total_file_size')}, 'L0', 'L0', 'level_index', '.*_L0') unless "
+ f"{metric('storage_level_total_file_size')}) by (L0) >= bool 52428800",
"Lagging Compaction",
),
panels.target(
Expand All @@ -182,6 +181,22 @@ def section_overview(panels):
f"{metric('storage_write_stop_compaction_groups')} > bool 0",
"Write Stall",
),
panels.target(
f"{metric('storage_version_size')} >= bool 314572800",
"Abnormal Version Size",
),
panels.target(
f"{metric('storage_delta_log_count')} >= bool 5000",
"Abnormal Delta Log Number",
),
panels.target(
f"{metric('state_store_event_handler_pending_event')} >= bool 10000000",
"Abnormal Pending Event Number",
),
panels.target(
f"{metric('object_store_failure_count')} >= bool 50",
"Abnormal Object Storage Failure",
),
],
["last"],
),
Expand Down
2 changes: 1 addition & 1 deletion grafana/risingwave-user-dashboard.json

Large diffs are not rendered by default.

Loading

0 comments on commit 6d2f2d5

Please sign in to comment.