Skip to content

Commit

Permalink
change(state): Prepare for in-place database format upgrades, but don…
Browse files Browse the repository at this point in the history
…'t make any format changes yet (#7031)

* Move format upgrades to their own module and enum

* Launch a format change thread if needed, and shut it down during shutdown

* Add some TODOs and remove a redundant timer

* Regularly check for panics in the state upgrade task

* Only run example upgrade once, change version field names

* Increment database format to 25.0.2: add format change task

* Log the running and initial disk database format versions on startup

* Add initial disk and running state versions to cached state images in CI

* Fix missing imports

* Fix typo in logs workflow command

* Add a force_save_to_disk argument to the CI workflow

* Move use_internet_connection into zebrad_config()

* fastmod can_spawn_zebrad_for_rpc can_spawn_zebrad_for_test_type zebra*

* Add a spawn_zebrad_without_rpc() function

* Remove unused copy_state() test code

* Assert that upgrades and downgrades happen with the correct versions

* Add a kill_and_return_output() method for tests

* Add a test for new_state_format() versions (no upgrades or downgrades)

* Add use_internet_connection to can_spawn_zebrad_for_test_type()

* Fix workflow parameter passing

* Check that reopening a new database doesn't upgrade (or downgrade) the format

* Allow ephemeral to be set to false even if we don't have a cached state

* Add a test type that will accept any kind of state

* When re-using a directory, configure the state test config with that path

* Actually mark newly created databases with their format versions

* Wait for the state to be opened before testing the format

* Run state format tests on mainnet and testnet configs (no network access)

* run multiple reopens in tests

* Test upgrades run correctly

* Test that version downgrades work as expected (best effort)

* Add a TODO for testing partial updates

* Fix missing test arguments

* clippy if chain

* Fix typo

* another typo

* Pass a database instance to the format upgrade task

* Fix a timing issue in the tests

* Fix version matching in CI

* Use correct env var reference

* Use correct github env file

* Wait for the database to be written before killing Zebra

* Use correct workflow syntax

* Version changes aren't always upgrades

---------

Co-authored-by: mergify[bot] <37929162+mergify[bot]@users.noreply.github.com>
  • Loading branch information
teor2345 and mergify[bot] authored Jul 13, 2023
1 parent 3b34e48 commit be5cfad
Show file tree
Hide file tree
Showing 21 changed files with 1,260 additions and 250 deletions.
11 changes: 11 additions & 0 deletions .github/workflows/continous-integration-docker.yml
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,11 @@ on:
default: false
description: 'Just run a lightwalletd full sync and update tip disks'
required: true
force_save_to_disk:
required: false
type: boolean
default: false
description: 'Force tests to always create a cached state disk, if they already create disks'
no_cache:
description: 'Disable the Docker cache for this build'
required: false
Expand Down Expand Up @@ -337,6 +342,7 @@ jobs:
network: ${{ inputs.network || vars.ZCASH_NETWORK }}
needs_zebra_state: false
saves_to_disk: true
force_save_to_disk: ${{ inputs.force_save_to_disk || false }}
disk_suffix: checkpoint
height_grep_text: 'flushing database to disk .*height.*=.*Height.*\('
secrets: inherit
Expand Down Expand Up @@ -399,6 +405,7 @@ jobs:
is_long_test: true
needs_zebra_state: false
saves_to_disk: true
force_save_to_disk: ${{ inputs.force_save_to_disk || false }}
disk_suffix: tip
height_grep_text: 'current_height.*=.*Height.*\('
secrets: inherit
Expand Down Expand Up @@ -439,6 +446,7 @@ jobs:
needs_zebra_state: true
# update the disk on every PR, to increase CI speed
saves_to_disk: true
force_save_to_disk: ${{ inputs.force_save_to_disk || false }}
disk_suffix: tip
root_state_path: '/var/cache'
zebra_state_dir: 'zebrad-cache'
Expand Down Expand Up @@ -511,6 +519,7 @@ jobs:
is_long_test: true
needs_zebra_state: false
saves_to_disk: true
force_save_to_disk: ${{ inputs.force_save_to_disk || false }}
disk_suffix: tip
height_grep_text: 'current_height.*=.*Height.*\('
secrets: inherit
Expand Down Expand Up @@ -554,6 +563,7 @@ jobs:
# update the disk on every PR, to increase CI speed
# we don't have a test-update-sync-testnet job, so we need to update the disk here
saves_to_disk: true
force_save_to_disk: ${{ inputs.force_save_to_disk || false }}
disk_suffix: tip
root_state_path: '/var/cache'
zebra_state_dir: 'zebrad-cache'
Expand Down Expand Up @@ -587,6 +597,7 @@ jobs:
needs_zebra_state: true
needs_lwd_state: false
saves_to_disk: true
force_save_to_disk: ${{ inputs.force_save_to_disk || false }}
disk_prefix: lwd-cache
disk_suffix: tip
root_state_path: '/var/cache'
Expand Down
113 changes: 106 additions & 7 deletions .github/workflows/deploy-gcp-tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,12 @@ on:
saves_to_disk:
required: true
type: boolean
description: 'Does the test create a new cached state disk?'
description: 'Can this test create new or updated cached state disks?'
force_save_to_disk:
required: false
type: boolean
default: false
description: 'Force this test to create a new or updated cached state disk'
app_name:
required: false
type: string
Expand Down Expand Up @@ -1702,7 +1707,7 @@ jobs:
# We run exactly one of without-cached-state or with-cached-state, and we always skip the other one.
# Normally, if a job is skipped, all the jobs that depend on it are also skipped.
# So we need to override the default success() check to make this job run.
if: ${{ !cancelled() && !failure() && inputs.saves_to_disk }}
if: ${{ !cancelled() && !failure() && (inputs.saves_to_disk || inputs.force_save_to_disk) }}
permissions:
contents: 'read'
id-token: 'write'
Expand Down Expand Up @@ -1791,6 +1796,96 @@ jobs:
echo "UPDATE_SUFFIX=$UPDATE_SUFFIX" >> "$GITHUB_ENV"
echo "TIME_SUFFIX=$TIME_SUFFIX" >> "$GITHUB_ENV"
# Get the full initial and running database versions from the test logs.
# These versions are used as part of the disk description and labels.
#
# If these versions are missing from the logs, the job fails.
#
# Typically, the database versions are around line 20 in the logs..
# But we check the first 1000 log lines, just in case the test harness recompiles all the
# dependencies before running the test. (This can happen if the cache is invalid.)
#
# Passes the versions to subsequent steps using the $INITIAL_DISK_DB_VERSION,
# $RUNNING_DB_VERSION, and $DB_VERSION_SUMMARY env variables.
- name: Get database versions from logs
run: |
INITIAL_DISK_DB_VERSION=""
RUNNING_DB_VERSION=""
DB_VERSION_SUMMARY=""
DOCKER_LOGS=$( \
gcloud compute ssh ${{ inputs.test_id }}-${{ env.GITHUB_REF_SLUG_URL }}-${{ env.GITHUB_SHA_SHORT }} \
--zone ${{ vars.GCP_ZONE }} \
--ssh-flag="-o ServerAliveInterval=5" \
--ssh-flag="-o ConnectionAttempts=20" \
--ssh-flag="-o ConnectTimeout=5" \
--command=" \
sudo docker logs ${{ inputs.test_id }} | head -1000 \
")
# either a semantic version or "creating new database"
INITIAL_DISK_DB_VERSION=$( \
echo "$DOCKER_LOGS" | \
grep --extended-regexp --only-matching 'initial disk state version: [0-9a-z\.]+' | \
grep --extended-regexp --only-matching '[0-9a-z\.]+' | \
tail -1 || \
[[ $? == 1 ]] \
)
if [[ -z "$INITIAL_DISK_DB_VERSION" ]]; then
echo "Checked logs:"
echo ""
echo "$DOCKER_LOGS"
echo ""
echo "Missing initial disk database version in logs: $INITIAL_DISK_DB_VERSION"
# Fail the tests, because Zebra didn't log the initial disk database version,
# or the regex in this step is wrong.
false
fi
if [[ "$INITIAL_DISK_DB_VERSION" = "creating.new.database" ]]; then
INITIAL_DISK_DB_VERSION="new"
else
INITIAL_DISK_DB_VERSION="v${INITIAL_DISK_DB_VERSION//./-}"
fi
echo "Found initial disk database version in logs: $INITIAL_DISK_DB_VERSION"
echo "INITIAL_DISK_DB_VERSION=$INITIAL_DISK_DB_VERSION" >> "$GITHUB_ENV"
RUNNING_DB_VERSION=$( \
echo "$DOCKER_LOGS" | \
grep --extended-regexp --only-matching 'running state version: [0-9\.]+' | \
grep --extended-regexp --only-matching '[0-9\.]+' | \
tail -1 || \
[[ $? == 1 ]] \
)
if [[ -z "$RUNNING_DB_VERSION" ]]; then
echo "Checked logs:"
echo ""
echo "$DOCKER_LOGS"
echo ""
echo "Missing running database version in logs: $RUNNING_DB_VERSION"
# Fail the tests, because Zebra didn't log the running database version,
# or the regex in this step is wrong.
false
fi
RUNNING_DB_VERSION="v${RUNNING_DB_VERSION//./-}"
echo "Found running database version in logs: $RUNNING_DB_VERSION"
echo "RUNNING_DB_VERSION=$RUNNING_DB_VERSION" >> "$GITHUB_ENV"
if [[ "$INITIAL_DISK_DB_VERSION" = "$RUNNING_DB_VERSION" ]]; then
DB_VERSION_SUMMARY="$RUNNING_DB_VERSION"
elif [[ "$INITIAL_DISK_DB_VERSION" = "new" ]]; then
DB_VERSION_SUMMARY="$RUNNING_DB_VERSION in new database"
else
DB_VERSION_SUMMARY="$INITIAL_DISK_DB_VERSION changing to $RUNNING_DB_VERSION"
fi
echo "Summarised database versions from logs: $DB_VERSION_SUMMARY"
echo "DB_VERSION_SUMMARY=$DB_VERSION_SUMMARY" >> "$GITHUB_ENV"
# Get the sync height from the test logs, which is later used as part of the
# disk description and labels.
#
Expand All @@ -1800,7 +1895,7 @@ jobs:
#
# If the sync height is missing from the logs, the job fails.
#
# Passes the sync height to subsequent steps using $SYNC_HEIGHT env variable.
# Passes the sync height to subsequent steps using the $SYNC_HEIGHT env variable.
- name: Get sync height from logs
run: |
SYNC_HEIGHT=""
Expand All @@ -1818,12 +1913,16 @@ jobs:
SYNC_HEIGHT=$( \
echo "$DOCKER_LOGS" | \
grep --extended-regexp --only-matching '${{ inputs.height_grep_text }}[0-9]+' | \
grep --extended-regexp --only-matching '[0-9]+' | \
grep --extended-regexp --only-matching '[0-9]+' | \
tail -1 || \
[[ $? == 1 ]] \
)
if [[ -z "$SYNC_HEIGHT" ]]; then
echo "Checked logs:"
echo ""
echo "$DOCKER_LOGS"
echo ""
echo "Missing sync height in logs: $SYNC_HEIGHT"
# Fail the tests, because Zebra and lightwalletd didn't log their sync heights,
# or the CI workflow sync height regex is wrong.
Expand Down Expand Up @@ -1885,15 +1984,15 @@ jobs:
- name: Create image from state disk
run: |
MINIMUM_UPDATE_HEIGHT=$((ORIGINAL_HEIGHT+CACHED_STATE_UPDATE_LIMIT))
if [[ -z "$UPDATE_SUFFIX" ]] || [[ "$SYNC_HEIGHT" -gt "$MINIMUM_UPDATE_HEIGHT" ]]; then
if [[ -z "$UPDATE_SUFFIX" ]] || [[ "$SYNC_HEIGHT" -gt "$MINIMUM_UPDATE_HEIGHT" ]] || [[ "${{ inputs.force_save_to_disk }}" == "true" ]]; then
gcloud compute images create \
"${{ inputs.disk_prefix }}-${SHORT_GITHUB_REF}-${{ env.GITHUB_SHA_SHORT }}-v${{ env.STATE_VERSION }}-${NETWORK}-${{ inputs.disk_suffix }}${UPDATE_SUFFIX}-${TIME_SUFFIX}" \
--force \
--source-disk=${{ inputs.test_id }}-${{ env.GITHUB_SHA_SHORT }} \
--source-disk-zone=${{ vars.GCP_ZONE }} \
--storage-location=us \
--description="Created from commit ${{ env.GITHUB_SHA_SHORT }} with height ${{ env.SYNC_HEIGHT }}" \
--labels="height=${{ env.SYNC_HEIGHT }},purpose=${{ inputs.disk_prefix }},commit=${{ env.GITHUB_SHA_SHORT }},state-version=${{ env.STATE_VERSION }},network=${NETWORK},target-height-kind=${{ inputs.disk_suffix }},update-flag=${UPDATE_SUFFIX},updated-from-height=${ORIGINAL_HEIGHT},test-id=${{ inputs.test_id }},app-name=${{ inputs.app_name }}"
--description="Created from commit ${{ env.GITHUB_SHA_SHORT }} with height ${{ env.SYNC_HEIGHT }} and database format ${{ env.DB_VERSION_SUMMARY }}" \
--labels="height=${{ env.SYNC_HEIGHT }},purpose=${{ inputs.disk_prefix }},commit=${{ env.GITHUB_SHA_SHORT }},state-version=${{ env.STATE_VERSION }},state-running-version=${RUNNING_DB_VERSION},initial-state-disk-version=${INITIAL_DISK_DB_VERSION},network=${NETWORK},target-height-kind=${{ inputs.disk_suffix }},update-flag=${UPDATE_SUFFIX},force-save=${{ inputs.force_save_to_disk }},updated-from-height=${ORIGINAL_HEIGHT},test-id=${{ inputs.test_id }},app-name=${{ inputs.app_name }}"
else
echo "Skipped cached state update because the new sync height $SYNC_HEIGHT was less than $CACHED_STATE_UPDATE_LIMIT blocks above the original height $ORIGINAL_HEIGHT"
fi
Expand Down
24 changes: 19 additions & 5 deletions zebra-state/src/config.rs
Original file line number Diff line number Diff line change
Expand Up @@ -175,6 +175,7 @@ impl Default for Config {
}

// Cleaning up old database versions
// TODO: put this in a different module?

/// Spawns a task that checks if there are old database folders,
/// and deletes them from the filesystem.
Expand Down Expand Up @@ -292,6 +293,8 @@ fn parse_version_number(dir_name: &str) -> Option<u64> {
.and_then(|version| version.parse().ok())
}

// TODO: move these to the format upgrade module

/// Returns the full semantic version of the currently running database format code.
///
/// This is the version implemented by the Zebra code that's currently running,
Expand Down Expand Up @@ -336,29 +339,40 @@ pub fn database_format_version_on_disk(
)))
}

/// Writes the currently running semantic database version to the on-disk database.
/// Writes `changed_version` to the on-disk database after the format is changed.
/// (Or a new database is created.)
///
/// # Correctness
///
/// This should only be called after all running format upgrades are complete.
/// This should only be called:
/// - after each format upgrade is complete,
/// - when creating a new database, or
/// - when an older Zebra version opens a newer database.
///
/// # Concurrency
///
/// This must only be called while RocksDB has an open database for `config`.
/// Otherwise, multiple Zebra processes could write the version at the same time,
/// corrupting the file.
///
/// # Panics
///
/// If the major versions do not match. (The format is incompatible.)
pub fn write_database_format_version_to_disk(
changed_version: &Version,
config: &Config,
network: Network,
) -> Result<(), BoxError> {
let version_path = config.version_file_path(network);

// The major version is already in the directory path.
let version = format!(
"{}.{}",
DATABASE_FORMAT_MINOR_VERSION, DATABASE_FORMAT_PATCH_VERSION
assert_eq!(
changed_version.major, DATABASE_FORMAT_VERSION,
"tried to do in-place database format change to an incompatible version"
);

let version = format!("{}.{}", changed_version.minor, changed_version.patch);

// # Concurrency
//
// The caller handles locking for this file write.
Expand Down
16 changes: 9 additions & 7 deletions zebra-state/src/constants.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ use regex::Regex;

// For doc comment links
#[allow(unused_imports)]
use crate::config::{database_format_version_in_code, database_format_version_on_disk};
use crate::config::{self, Config};

pub use zebra_chain::transparent::MIN_TRANSPARENT_COINBASE_MATURITY;

Expand Down Expand Up @@ -37,9 +37,9 @@ pub const MAX_BLOCK_REORG_HEIGHT: u32 = MIN_TRANSPARENT_COINBASE_MATURITY - 1;
/// - we previously added compatibility code, and
/// - it's available in all supported Zebra versions.
///
/// Use [`database_format_version_in_code()`] or [`database_format_version_on_disk()`]
/// to get the full semantic format version.
pub const DATABASE_FORMAT_VERSION: u64 = 25;
/// Use [`config::database_format_version_in_code()`] or
/// [`config::database_format_version_on_disk()`] to get the full semantic format version.
pub(crate) const DATABASE_FORMAT_VERSION: u64 = 25;

/// The database format minor version, incremented each time the on-disk database format has a
/// significant data format change.
Expand All @@ -48,14 +48,16 @@ pub const DATABASE_FORMAT_VERSION: u64 = 25;
/// - adding new column families,
/// - changing the format of a column family in a compatible way, or
/// - breaking changes with compatibility code in all supported Zebra versions.
pub const DATABASE_FORMAT_MINOR_VERSION: u64 = 0;
pub(crate) const DATABASE_FORMAT_MINOR_VERSION: u64 = 0;

/// The database format patch version, incremented each time the on-disk database format has a
/// significant format compatibility fix.
pub const DATABASE_FORMAT_PATCH_VERSION: u64 = 1;
pub(crate) const DATABASE_FORMAT_PATCH_VERSION: u64 = 2;

/// The name of the file containing the minor and patch database versions.
pub const DATABASE_FORMAT_VERSION_FILE_NAME: &str = "version";
///
/// Use [`Config::version_file_path()`] to get the path to this file.
pub(crate) const DATABASE_FORMAT_VERSION_FILE_NAME: &str = "version";

/// The maximum number of blocks to check for NU5 transactions,
/// before we assume we are on a pre-NU5 legacy chain.
Expand Down
8 changes: 7 additions & 1 deletion zebra-state/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,10 @@ mod service;
#[cfg(test)]
mod tests;

pub use config::{check_and_delete_old_databases, Config};
pub use config::{
check_and_delete_old_databases, database_format_version_in_code,
database_format_version_on_disk, Config,
};
pub use constants::MAX_BLOCK_REORG_HEIGHT;
pub use error::{
BoxError, CloneError, CommitSemanticallyVerifiedError, DuplicateNullifierError,
Expand Down Expand Up @@ -57,4 +60,7 @@ pub use service::{
init_test, init_test_services, ReadStateService,
};

#[cfg(any(test, feature = "proptest-impl"))]
pub use config::write_database_format_version_to_disk;

pub(crate) use request::ContextuallyVerifiedBlock;
8 changes: 6 additions & 2 deletions zebra-state/src/service.rs
Original file line number Diff line number Diff line change
Expand Up @@ -263,6 +263,7 @@ impl Drop for ReadStateService {
// The read state service shares the state,
// so dropping it should check if we can shut down.

// TODO: move this into a try_shutdown() method
if let Some(block_write_task) = self.block_write_task.take() {
if let Some(block_write_task_handle) = Arc::into_inner(block_write_task) {
// We're the last database user, so we can tell it to shut down (blocking):
Expand All @@ -280,6 +281,7 @@ impl Drop for ReadStateService {
#[cfg(test)]
debug!("waiting for the block write task to finish");

// TODO: move this into a check_for_panics() method
if let Err(thread_panic) = block_write_task_handle.join() {
std::panic::resume_unwind(thread_panic);
} else {
Expand Down Expand Up @@ -343,9 +345,7 @@ impl StateService {
.tip_block()
.map(CheckpointVerifiedBlock::from)
.map(ChainTipBlock::from);
timer.finish(module_path!(), line!(), "fetching database tip");

let timer = CodeTimer::start();
let (chain_tip_sender, latest_chain_tip, chain_tip_change) =
ChainTipSender::new(initial_tip, network);

Expand Down Expand Up @@ -1161,6 +1161,8 @@ impl Service<ReadRequest> for ReadStateService {

fn poll_ready(&mut self, _: &mut Context<'_>) -> Poll<Result<(), Self::Error>> {
// Check for panics in the block write task
//
// TODO: move into a check_for_panics() method
let block_write_task = self.block_write_task.take();

if let Some(block_write_task) = block_write_task {
Expand All @@ -1177,6 +1179,8 @@ impl Service<ReadRequest> for ReadStateService {
}
}

self.db.check_for_panics();

Poll::Ready(Ok(()))
}

Expand Down
Loading

0 comments on commit be5cfad

Please sign in to comment.