Skip to content

Commit

Permalink
[release-16.0] Upgrade-Downgrade Fix: Schema-initialization stuck on …
Browse files Browse the repository at this point in the history
…semi-sync ACKs while upgrading (#13411) (#13441)
  • Loading branch information
GuptaManan100 authored Jul 5, 2023
1 parent 7251cb3 commit 99d39f9
Show file tree
Hide file tree
Showing 9 changed files with 347 additions and 433 deletions.
12 changes: 1 addition & 11 deletions .github/workflows/upgrade_downgrade_test_backups_manual.yml
Original file line number Diff line number Diff line change
Expand Up @@ -269,14 +269,6 @@ jobs:
source build.env ; cd examples/backups
./take_backups.sh
# Stopping the tablets so we can perform the upgrade.
- name: Stop tablets
if: steps.skip-workflow.outputs.skip-workflow == 'false' && steps.changes.outputs.end_to_end == 'true'
timeout-minutes: 10
run: |
source build.env ; cd examples/backups
./stop_tablets.sh
# We upgrade: we swap binaries and use the version N of the tablet.
- name: Upgrade - Swap binaries, use VTTablet N
if: steps.skip-workflow.outputs.skip-workflow == 'false' && steps.changes.outputs.end_to_end == 'true'
Expand All @@ -293,9 +285,7 @@ jobs:
timeout-minutes: 10
run: |
source build.env ; cd examples/backups
./restart_tablets.sh
# give enough time to the tablets to restore the backup
sleep 90
./upgrade_cluster.sh
# We count the number of rows in every table to check that the restore step was successful.
- name: Assert the number of rows in every table
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -272,14 +272,6 @@ jobs:
source build.env ; cd examples/backups
./take_backups.sh
# Stopping the tablets so we can perform the upgrade.
- name: Stop tablets
if: steps.skip-workflow.outputs.skip-workflow == 'false' && steps.changes.outputs.end_to_end == 'true'
timeout-minutes: 10
run: |
source build.env ; cd examples/backups
./stop_tablets.sh
# We upgrade: we swap binaries and use the version N of the tablet.
- name: Upgrade - Swap binaries, use VTTablet N
if: steps.skip-workflow.outputs.skip-workflow == 'false' && steps.changes.outputs.end_to_end == 'true'
Expand All @@ -296,9 +288,7 @@ jobs:
timeout-minutes: 10
run: |
source build.env ; cd examples/backups
./restart_tablets.sh
# give enough time to the tablets to restore the backup
sleep 90
./upgrade_cluster.sh
# We count the number of rows in every table to check that the restore step was successful.
- name: Assert the number of rows in every table
Expand Down
6 changes: 3 additions & 3 deletions examples/backups/restart_tablets.sh
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,6 @@ for i in 101 201 301; do
exit 1
done

vtctldclient InitShardPrimary --force commerce/0 zone1-100
vtctldclient InitShardPrimary --force customer/-80 zone1-200
vtctldclient InitShardPrimary --force customer/80- zone1-300
vtctldclient PlannedReparentShard commerce/0 --new-primary "zone1-100"
vtctldclient PlannedReparentShard customer/-80 --new-primary "zone1-200"
vtctldclient PlannedReparentShard customer/80- --new-primary "zone1-300"
10 changes: 7 additions & 3 deletions examples/backups/start_cluster.sh
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,8 @@ fi
# start vtctld
CELL=zone1 ../common/scripts/vtctld-up.sh

# Create keyspace and set the semi_sync durability policy.
vtctldclient CreateKeyspace --durability-policy=semi_sync commerce || fail "Failed to create and configure the commerce keyspace"

# start vttablets for keyspace commerce
for i in 100 101 102; do
Expand All @@ -39,12 +41,14 @@ for i in 100 101 102; do
done

# set one of the replicas to primary
vtctldclient InitShardPrimary --force commerce/0 zone1-100
vtctldclient PlannedReparentShard commerce/0 --new-primary "zone1-100"

# create the schema for commerce
vtctlclient ApplySchema -- --sql-file ./create_commerce_schema.sql commerce || fail "Could not apply schema for the commerce keyspace"
vtctlclient ApplyVSchema -- --vschema_file ../local/vschema_commerce_seq.json commerce || fail "Could not apply vschema for the commerce keyspace"

# Create keyspace and set the semi_sync durability policy.
vtctldclient CreateKeyspace --durability-policy=semi_sync customer || fail "Failed to create and configure the customer keyspace"

# start vttablets for keyspace customer
for i in 200 201 202; do
Expand All @@ -57,8 +61,8 @@ for i in 300 301 302; do
done

# set one of the replicas to primary
vtctldclient InitShardPrimary --force customer/-80 zone1-200
vtctldclient InitShardPrimary --force customer/80- zone1-300
vtctldclient PlannedReparentShard customer/-80 --new-primary "zone1-200"
vtctldclient PlannedReparentShard customer/80- --new-primary "zone1-300"

for shard in "-80" "80-"; do
wait_for_healthy_shard customer "${shard}" || exit 1
Expand Down
97 changes: 97 additions & 0 deletions examples/backups/upgrade_cluster.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
#!/bin/bash

# Copyright 2023 The Vitess Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# This script upgrades a running cluster: it restarts the replica tablets so
# that they come up with the new vttablet version, promotes them to primary,
# and then restarts the old primary tablets on the new version as well.

source ../common/env.sh

# Restart the replica tablets so that they come up with new vttablet versions

# Tear down one replica tablet and start it again from an empty directory.
#
# The tablet and its mysql are shut down, the tablet record is deleted with
# vtctlclient, the tablet directory under VTDATAROOT is removed, and then
# mysql and vttablet are started again.
#
# Globals:
#   VTDATAROOT (read) - must be set and non-empty; the script aborts otherwise
#                       rather than running rm against a path under "/".
# Arguments:
#   $1 - tablet UID (e.g. 101)
#   $2 - keyspace handed to vttablet-up.sh as KEYSPACE
#   $3 - optional shard handed to vttablet-up.sh as SHARD; when omitted,
#        SHARD is not set at all for vttablet-up.sh
recreate_replica() {
  local uid=$1
  local keyspace=$2
  local shard=${3:-}

  echo "Shutting down tablet zone1-$uid"
  CELL=zone1 TABLET_UID=$uid ../common/scripts/vttablet-down.sh
  echo "Shutting down mysql zone1-$uid"
  CELL=zone1 TABLET_UID=$uid ../common/scripts/mysqlctl-down.sh
  echo "Removing tablet directory zone1-$uid"
  vtctlclient DeleteTablet -- --allow_primary=true "zone1-$uid"
  # Quoted and guarded: never let an empty VTDATAROOT turn this into "/vt_...".
  rm -Rf -- "${VTDATAROOT:?VTDATAROOT must be set}/vt_0000000$uid"
  echo "Starting tablet zone1-$uid again"
  CELL=zone1 TABLET_UID=$uid ../common/scripts/mysqlctl-up.sh
  if [[ -n "$shard" ]]; then
    SHARD=$shard CELL=zone1 KEYSPACE=$keyspace TABLET_UID=$uid ../common/scripts/vttablet-up.sh
  else
    CELL=zone1 KEYSPACE=$keyspace TABLET_UID=$uid ../common/scripts/vttablet-up.sh
  fi
}

# commerce keyspace (no SHARD is passed to vttablet-up.sh)
for i in 101 102; do
  recreate_replica "$i" commerce
done

# customer keyspace, shard -80
for i in 201 202; do
  recreate_replica "$i" customer "-80"
done

# customer keyspace, shard 80-
for i in 301 302; do
  recreate_replica "$i" customer "80-"
done

# Wait for all the replica tablets to be in the serving state before reparenting to them.
# The poll budget is shared by all three tablets: at most 600 polls in total,
# with a 0.1s pause after every unsuccessful poll.
polls_left=600
for uid in 101 201 301; do
  until [ "$polls_left" -le 0 ]; do
    page=$(curl "http://$hostname:15$uid/debug/status_details")
    if echo "$page" | grep "REPLICA: Serving"; then
      # This tablet is serving; move on to the next one.
      break
    fi
    polls_left=$((polls_left - 1))
    sleep 0.1
  done
done

# Check that all the replica tablets have reached REPLICA: Serving state
# Each status page is fetched once more; the first tablet that is not
# serving aborts the script with exit status 1.
for uid in 101 201 301; do
  page=$(curl "http://$hostname:15$uid/debug/status_details")
  if ! echo "$page" | grep "REPLICA: Serving"; then
    echo "tablet-$uid did not reach REPLICA: Serving state. Exiting due to failure."
    exit 1
  fi
done

# Promote the replica tablets to primary

# Run PlannedReparentShard for one shard and abort the script if it fails.
# The steps after the promotions shut down the vttablets of the old
# primaries, so continuing past a failed reparent is not safe.
#
# Arguments:
#   $1 - keyspace/shard to reparent (e.g. commerce/0)
#   $2 - tablet alias to promote (e.g. zone1-101)
promote_replica() {
  local shard=$1
  local tablet=$2

  if ! vtctldclient PlannedReparentShard "$shard" --new-primary "$tablet"; then
    echo "Failed to reparent $shard to $tablet. Exiting due to failure."
    exit 1
  fi
}

promote_replica commerce/0 zone1-101
promote_replica customer/-80 zone1-201
promote_replica customer/80- zone1-301

# Restart the old primary tablets so that they are on the latest version of vttablet too.
# Only vttablet is bounced here. Each tablet is brought back with the
# keyspace/shard settings it needs; for tablet 100 no SHARD is passed.
for uid in 100 200 300; do
  echo "Restarting tablet zone1-$uid"
  CELL=zone1 TABLET_UID=$uid ../common/scripts/vttablet-down.sh
  case "$uid" in
    100)
      CELL=zone1 KEYSPACE=commerce TABLET_UID=$uid ../common/scripts/vttablet-up.sh
      ;;
    200)
      SHARD=-80 CELL=zone1 KEYSPACE=customer TABLET_UID=$uid ../common/scripts/vttablet-up.sh
      ;;
    300)
      SHARD=80- CELL=zone1 KEYSPACE=customer TABLET_UID=$uid ../common/scripts/vttablet-up.sh
      ;;
  esac
done
1 change: 0 additions & 1 deletion examples/common/scripts/vttablet-up.sh
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,6 @@ vttablet \
--service_map 'grpc-queryservice,grpc-tabletmanager,grpc-updatestream' \
--pid_file $VTDATAROOT/$tablet_dir/vttablet.pid \
--vtctld_addr http://$hostname:$vtctld_web_port/ \
--disable_active_reparents \
> $VTDATAROOT/$tablet_dir/vttablet.out 2>&1 &

# Block waiting for the tablet to be listening
Expand Down
Loading

0 comments on commit 99d39f9

Please sign in to comment.