change(state): Prepare for in-place database format upgrades, but don…

…'t make any format changes yet (#7031) * Move format upgrades to their own module and enum * Launch a format change thread if needed, and shut it down during shutdown * Add some TODOs and remove a redundant timer * Regularly check for panics in the state upgrade task * Only run example upgrade once, change version field names * Increment database format to 25.0.2: add format change task * Log the running and initial disk database format versions on startup * Add initial disk and running state versions to cached state images in CI * Fix missing imports * Fix typo in logs workflow command * Add a force_save_to_disk argument to the CI workflow * Move use_internet_connection into zebrad_config() * fastmod can_spawn_zebrad_for_rpc can_spawn_zebrad_for_test_type zebra* * Add a spawn_zebrad_without_rpc() function * Remove unused copy_state() test code * Assert that upgrades and downgrades happen with the correct versions * Add a kill_and_return_output() method for tests * Add a test for new_state_format() versions (no upgrades or downgrades) * Add use_internet_connection to can_spawn_zebrad_for_test_type() * Fix workflow parameter passing * Check that reopening a new database doesn't upgrade (or downgrade) the format * Allow ephemeral to be set to false even if we don't have a cached state * Add a test type that will accept any kind of state * When re-using a directory, configure the state test config with that path * Actually mark newly created databases with their format versions * Wait for the state to be opened before testing the format * Run state format tests on mainnet and testnet configs (no network access) * run multiple reopens in tests * Test upgrades run correctly * Test that version downgrades work as expected (best effort) * Add a TODO for testing partial updates * Fix missing test arguments * clippy if chain * Fix typo * another typo * Pass a database instance to the format upgrade task * Fix a timing issue in the tests * Fix version matching in CI * Use correct env var reference * Use correct github env file * Wait for the database to be written before killing Zebra * Use correct workflow syntax * Version changes aren't always upgrades --------- Co-authored-by: mergify[bot] <37929162+mergify[bot]@users.noreply.github.com>
ZcashFoundation · Jul 13, 2023 · be5cfad · be5cfad
1 parent 3b34e48
commit be5cfad
Show file tree

Hide file tree

Showing 21 changed files with 1,260 additions and 250 deletions.
diff --git a/.github/workflows/continous-integration-docker.yml b/.github/workflows/continous-integration-docker.yml
@@ -35,6 +35,11 @@ on:
         default: false
         description: 'Just run a lightwalletd full sync and update tip disks'
         required: true
+      force_save_to_disk:
+        required: false
+        type: boolean
+        default: false
+        description: 'Force tests to always create a cached state disk, if they already create disks'
       no_cache:
         description: 'Disable the Docker cache for this build'
         required: false
@@ -337,6 +342,7 @@ jobs:
       network: ${{ inputs.network || vars.ZCASH_NETWORK }}
       needs_zebra_state: false
       saves_to_disk: true
+      force_save_to_disk: ${{ inputs.force_save_to_disk || false }}
       disk_suffix: checkpoint
       height_grep_text: 'flushing database to disk .*height.*=.*Height.*\('
     secrets: inherit
@@ -399,6 +405,7 @@ jobs:
       is_long_test: true
       needs_zebra_state: false
       saves_to_disk: true
+      force_save_to_disk: ${{ inputs.force_save_to_disk || false }}
       disk_suffix: tip
       height_grep_text: 'current_height.*=.*Height.*\('
     secrets: inherit
@@ -439,6 +446,7 @@ jobs:
       needs_zebra_state: true
       # update the disk on every PR, to increase CI speed
       saves_to_disk: true
+      force_save_to_disk: ${{ inputs.force_save_to_disk || false }}
       disk_suffix: tip
       root_state_path: '/var/cache'
       zebra_state_dir: 'zebrad-cache'
@@ -511,6 +519,7 @@ jobs:
       is_long_test: true
       needs_zebra_state: false
       saves_to_disk: true
+      force_save_to_disk: ${{ inputs.force_save_to_disk || false }}
       disk_suffix: tip
       height_grep_text: 'current_height.*=.*Height.*\('
     secrets: inherit
@@ -554,6 +563,7 @@ jobs:
       # update the disk on every PR, to increase CI speed
       # we don't have a test-update-sync-testnet job, so we need to update the disk here
       saves_to_disk: true
+      force_save_to_disk: ${{ inputs.force_save_to_disk || false }}
       disk_suffix: tip
       root_state_path: '/var/cache'
       zebra_state_dir: 'zebrad-cache'
@@ -587,6 +597,7 @@ jobs:
       needs_zebra_state: true
       needs_lwd_state: false
       saves_to_disk: true
+      force_save_to_disk: ${{ inputs.force_save_to_disk || false }}
       disk_prefix: lwd-cache
       disk_suffix: tip
       root_state_path: '/var/cache'

diff --git a/.github/workflows/deploy-gcp-tests.yml b/.github/workflows/deploy-gcp-tests.yml
@@ -79,7 +79,12 @@ on:
       saves_to_disk:
         required: true
         type: boolean
-        description: 'Does the test create a new cached state disk?'
+        description: 'Can this test create new or updated cached state disks?'
+      force_save_to_disk:
+        required: false
+        type: boolean
+        default: false
+        description: 'Force this test to create a new or updated cached state disk'
       app_name:
         required: false
         type: string
@@ -1702,7 +1707,7 @@ jobs:
     # We run exactly one of without-cached-state or with-cached-state, and we always skip the other one.
     # Normally, if a job is skipped, all the jobs that depend on it are also skipped.
     # So we need to override the default success() check to make this job run.
-    if: ${{ !cancelled() && !failure() && inputs.saves_to_disk }}
+    if: ${{ !cancelled() && !failure() && (inputs.saves_to_disk || inputs.force_save_to_disk) }}
     permissions:
       contents: 'read'
       id-token: 'write'
@@ -1791,6 +1796,96 @@ jobs:
           echo "UPDATE_SUFFIX=$UPDATE_SUFFIX" >> "$GITHUB_ENV"
           echo "TIME_SUFFIX=$TIME_SUFFIX" >> "$GITHUB_ENV"
 
+      # Get the full initial and running database versions from the test logs.
+      # These versions are used as part of the disk description and labels.
+      #
+      # If these versions are missing from the logs, the job fails.
+      #
+      # Typically, the database versions are around line 20 in the logs..
+      # But we check the first 1000 log lines, just in case the test harness recompiles all the
+      # dependencies before running the test. (This can happen if the cache is invalid.)
+      #
+      # Passes the versions to subsequent steps using the $INITIAL_DISK_DB_VERSION,
+      # $RUNNING_DB_VERSION, and $DB_VERSION_SUMMARY env variables.
+      - name: Get database versions from logs
+        run: |
+          INITIAL_DISK_DB_VERSION=""
+          RUNNING_DB_VERSION=""
+          DB_VERSION_SUMMARY=""
+
+          DOCKER_LOGS=$( \
+          gcloud compute ssh ${{ inputs.test_id }}-${{ env.GITHUB_REF_SLUG_URL }}-${{ env.GITHUB_SHA_SHORT }} \
+          --zone ${{ vars.GCP_ZONE }} \
+          --ssh-flag="-o ServerAliveInterval=5" \
+          --ssh-flag="-o ConnectionAttempts=20" \
+          --ssh-flag="-o ConnectTimeout=5" \
+          --command=" \
+          sudo docker logs ${{ inputs.test_id }} | head -1000 \
+          ")
+
+          # either a semantic version or "creating new database"
+          INITIAL_DISK_DB_VERSION=$( \
+          echo "$DOCKER_LOGS" | \
+          grep --extended-regexp --only-matching 'initial disk state version: [0-9a-z\.]+' | \
+          grep --extended-regexp --only-matching '[0-9a-z\.]+' | \
+          tail -1 || \
+          [[ $? == 1 ]] \
+          )
+
+          if [[ -z "$INITIAL_DISK_DB_VERSION" ]]; then
+              echo "Checked logs:"
+              echo ""
+              echo "$DOCKER_LOGS"
+              echo ""
+              echo "Missing initial disk database version in logs: $INITIAL_DISK_DB_VERSION"
+              # Fail the tests, because Zebra didn't log the initial disk database version,
+              # or the regex in this step is wrong.
+              false
+          fi
+
+          if [[ "$INITIAL_DISK_DB_VERSION" = "creating.new.database" ]]; then
+              INITIAL_DISK_DB_VERSION="new"
+          else
+              INITIAL_DISK_DB_VERSION="v${INITIAL_DISK_DB_VERSION//./-}"
+          fi
+
+          echo "Found initial disk database version in logs: $INITIAL_DISK_DB_VERSION"
+          echo "INITIAL_DISK_DB_VERSION=$INITIAL_DISK_DB_VERSION" >> "$GITHUB_ENV"
+
+          RUNNING_DB_VERSION=$( \
+          echo "$DOCKER_LOGS" | \
+          grep --extended-regexp --only-matching 'running state version: [0-9\.]+' | \
+          grep --extended-regexp --only-matching '[0-9\.]+' | \
+          tail -1 || \
+          [[ $? == 1 ]] \
+          )
+
+          if [[ -z "$RUNNING_DB_VERSION" ]]; then
+              echo "Checked logs:"
+              echo ""
+              echo "$DOCKER_LOGS"
+              echo ""
+              echo "Missing running database version in logs: $RUNNING_DB_VERSION"
+              # Fail the tests, because Zebra didn't log the running database version,
+              # or the regex in this step is wrong.
+              false
+          fi
+
+          RUNNING_DB_VERSION="v${RUNNING_DB_VERSION//./-}"
+          echo "Found running database version in logs: $RUNNING_DB_VERSION"
+          echo "RUNNING_DB_VERSION=$RUNNING_DB_VERSION" >> "$GITHUB_ENV"
+
+          if [[ "$INITIAL_DISK_DB_VERSION" = "$RUNNING_DB_VERSION" ]]; then
+              DB_VERSION_SUMMARY="$RUNNING_DB_VERSION"
+          elif [[ "$INITIAL_DISK_DB_VERSION" = "new" ]]; then
+              DB_VERSION_SUMMARY="$RUNNING_DB_VERSION in new database"
+          else
+              DB_VERSION_SUMMARY="$INITIAL_DISK_DB_VERSION changing to $RUNNING_DB_VERSION"
+          fi
+
+          echo "Summarised database versions from logs: $DB_VERSION_SUMMARY"
+          echo "DB_VERSION_SUMMARY=$DB_VERSION_SUMMARY" >> "$GITHUB_ENV"
+
       # Get the sync height from the test logs, which is later used as part of the
       # disk description and labels.
       #
@@ -1800,7 +1895,7 @@ jobs:
       #
       # If the sync height is missing from the logs, the job fails.
       #
-      # Passes the sync height to subsequent steps using $SYNC_HEIGHT env variable.
+      # Passes the sync height to subsequent steps using the $SYNC_HEIGHT env variable.
       - name: Get sync height from logs
         run: |
           SYNC_HEIGHT=""
@@ -1818,12 +1913,16 @@ jobs:
           SYNC_HEIGHT=$( \
           echo "$DOCKER_LOGS" | \
           grep --extended-regexp --only-matching '${{ inputs.height_grep_text }}[0-9]+' | \
-          grep --extended-regexp --only-matching  '[0-9]+' | \
+          grep --extended-regexp --only-matching '[0-9]+' | \
           tail -1 || \
           [[ $? == 1 ]] \
           )
 
           if [[ -z "$SYNC_HEIGHT" ]]; then
+              echo "Checked logs:"
+              echo ""
+              echo "$DOCKER_LOGS"
+              echo ""
               echo "Missing sync height in logs: $SYNC_HEIGHT"
               # Fail the tests, because Zebra and lightwalletd didn't log their sync heights,
               # or the CI workflow sync height regex is wrong.
@@ -1885,15 +1984,15 @@ jobs:
       - name: Create image from state disk
         run: |
           MINIMUM_UPDATE_HEIGHT=$((ORIGINAL_HEIGHT+CACHED_STATE_UPDATE_LIMIT))
-          if [[ -z "$UPDATE_SUFFIX" ]] || [[ "$SYNC_HEIGHT" -gt "$MINIMUM_UPDATE_HEIGHT" ]]; then
+          if [[ -z "$UPDATE_SUFFIX" ]] || [[ "$SYNC_HEIGHT" -gt "$MINIMUM_UPDATE_HEIGHT" ]] || [[ "${{ inputs.force_save_to_disk }}" == "true" ]]; then
              gcloud compute images create \
               "${{ inputs.disk_prefix }}-${SHORT_GITHUB_REF}-${{ env.GITHUB_SHA_SHORT }}-v${{ env.STATE_VERSION }}-${NETWORK}-${{ inputs.disk_suffix }}${UPDATE_SUFFIX}-${TIME_SUFFIX}" \
               --force \
               --source-disk=${{ inputs.test_id }}-${{ env.GITHUB_SHA_SHORT }} \
               --source-disk-zone=${{ vars.GCP_ZONE }} \
               --storage-location=us \
-              --description="Created from commit ${{ env.GITHUB_SHA_SHORT }} with height ${{ env.SYNC_HEIGHT }}" \
-              --labels="height=${{ env.SYNC_HEIGHT }},purpose=${{ inputs.disk_prefix }},commit=${{ env.GITHUB_SHA_SHORT }},state-version=${{ env.STATE_VERSION }},network=${NETWORK},target-height-kind=${{ inputs.disk_suffix }},update-flag=${UPDATE_SUFFIX},updated-from-height=${ORIGINAL_HEIGHT},test-id=${{ inputs.test_id }},app-name=${{ inputs.app_name }}"
+              --description="Created from commit ${{ env.GITHUB_SHA_SHORT }} with height ${{ env.SYNC_HEIGHT }} and database format ${{ env.DB_VERSION_SUMMARY }}" \
+              --labels="height=${{ env.SYNC_HEIGHT }},purpose=${{ inputs.disk_prefix }},commit=${{ env.GITHUB_SHA_SHORT }},state-version=${{ env.STATE_VERSION }},state-running-version=${RUNNING_DB_VERSION},initial-state-disk-version=${INITIAL_DISK_DB_VERSION},network=${NETWORK},target-height-kind=${{ inputs.disk_suffix }},update-flag=${UPDATE_SUFFIX},force-save=${{ inputs.force_save_to_disk }},updated-from-height=${ORIGINAL_HEIGHT},test-id=${{ inputs.test_id }},app-name=${{ inputs.app_name }}"
           else
               echo "Skipped cached state update because the new sync height $SYNC_HEIGHT was less than $CACHED_STATE_UPDATE_LIMIT blocks above the original height $ORIGINAL_HEIGHT"
           fi

diff --git a/zebra-state/src/config.rs b/zebra-state/src/config.rs
@@ -175,6 +175,7 @@ impl Default for Config {
 }
 
 // Cleaning up old database versions
+// TODO: put this in a different module?
 
 /// Spawns a task that checks if there are old database folders,
 /// and deletes them from the filesystem.
@@ -292,6 +293,8 @@ fn parse_version_number(dir_name: &str) -> Option<u64> {
         .and_then(|version| version.parse().ok())
 }
 
+// TODO: move these to the format upgrade module
+
 /// Returns the full semantic version of the currently running database format code.
 ///
 /// This is the version implemented by the Zebra code that's currently running,
@@ -336,29 +339,40 @@ pub fn database_format_version_on_disk(
     )))
 }
 
-/// Writes the currently running semantic database version to the on-disk database.
+/// Writes `changed_version` to the on-disk database after the format is changed.
+/// (Or a new database is created.)
 ///
 /// # Correctness
 ///
-/// This should only be called after all running format upgrades are complete.
+/// This should only be called:
+/// - after each format upgrade is complete,
+/// - when creating a new database, or
+/// - when an older Zebra version opens a newer database.
 ///
 /// # Concurrency
 ///
 /// This must only be called while RocksDB has an open database for `config`.
 /// Otherwise, multiple Zebra processes could write the version at the same time,
 /// corrupting the file.
+///
+/// # Panics
+///
+/// If the major versions do not match. (The format is incompatible.)
 pub fn write_database_format_version_to_disk(
+    changed_version: &Version,
     config: &Config,
     network: Network,
 ) -> Result<(), BoxError> {
     let version_path = config.version_file_path(network);
 
     // The major version is already in the directory path.
-    let version = format!(
-        "{}.{}",
-        DATABASE_FORMAT_MINOR_VERSION, DATABASE_FORMAT_PATCH_VERSION
+    assert_eq!(
+        changed_version.major, DATABASE_FORMAT_VERSION,
+        "tried to do in-place database format change to an incompatible version"
     );
 
+    let version = format!("{}.{}", changed_version.minor, changed_version.patch);
+
     // # Concurrency
     //
     // The caller handles locking for this file write.

diff --git a/zebra-state/src/constants.rs b/zebra-state/src/constants.rs
@@ -5,7 +5,7 @@ use regex::Regex;
 
 // For doc comment links
 #[allow(unused_imports)]
-use crate::config::{database_format_version_in_code, database_format_version_on_disk};
+use crate::config::{self, Config};
 
 pub use zebra_chain::transparent::MIN_TRANSPARENT_COINBASE_MATURITY;
 
@@ -37,9 +37,9 @@ pub const MAX_BLOCK_REORG_HEIGHT: u32 = MIN_TRANSPARENT_COINBASE_MATURITY - 1;
 /// - we previously added compatibility code, and
 /// - it's available in all supported Zebra versions.
 ///
-/// Use [`database_format_version_in_code()`] or [`database_format_version_on_disk()`]
-/// to get the full semantic format version.
-pub const DATABASE_FORMAT_VERSION: u64 = 25;
+/// Use [`config::database_format_version_in_code()`] or
+/// [`config::database_format_version_on_disk()`] to get the full semantic format version.
+pub(crate) const DATABASE_FORMAT_VERSION: u64 = 25;
 
 /// The database format minor version, incremented each time the on-disk database format has a
 /// significant data format change.
@@ -48,14 +48,16 @@ pub const DATABASE_FORMAT_VERSION: u64 = 25;
 /// - adding new column families,
 /// - changing the format of a column family in a compatible way, or
 /// - breaking changes with compatibility code in all supported Zebra versions.
-pub const DATABASE_FORMAT_MINOR_VERSION: u64 = 0;
+pub(crate) const DATABASE_FORMAT_MINOR_VERSION: u64 = 0;
 
 /// The database format patch version, incremented each time the on-disk database format has a
 /// significant format compatibility fix.
-pub const DATABASE_FORMAT_PATCH_VERSION: u64 = 1;
+pub(crate) const DATABASE_FORMAT_PATCH_VERSION: u64 = 2;
 
 /// The name of the file containing the minor and patch database versions.
-pub const DATABASE_FORMAT_VERSION_FILE_NAME: &str = "version";
+///
+/// Use [`Config::version_file_path()`] to get the path to this file.
+pub(crate) const DATABASE_FORMAT_VERSION_FILE_NAME: &str = "version";
 
 /// The maximum number of blocks to check for NU5 transactions,
 /// before we assume we are on a pre-NU5 legacy chain.

diff --git a/zebra-state/src/lib.rs b/zebra-state/src/lib.rs
@@ -29,7 +29,10 @@ mod service;
 #[cfg(test)]
 mod tests;
 
-pub use config::{check_and_delete_old_databases, Config};
+pub use config::{
+    check_and_delete_old_databases, database_format_version_in_code,
+    database_format_version_on_disk, Config,
+};
 pub use constants::MAX_BLOCK_REORG_HEIGHT;
 pub use error::{
     BoxError, CloneError, CommitSemanticallyVerifiedError, DuplicateNullifierError,
@@ -57,4 +60,7 @@ pub use service::{
     init_test, init_test_services, ReadStateService,
 };
 
+#[cfg(any(test, feature = "proptest-impl"))]
+pub use config::write_database_format_version_to_disk;
+
 pub(crate) use request::ContextuallyVerifiedBlock;
diff --git a/zebra-state/src/service.rs b/zebra-state/src/service.rs
@@ -263,6 +263,7 @@ impl Drop for ReadStateService {
         // The read state service shares the state,
         // so dropping it should check if we can shut down.
 
+        // TODO: move this into a try_shutdown() method
         if let Some(block_write_task) = self.block_write_task.take() {
             if let Some(block_write_task_handle) = Arc::into_inner(block_write_task) {
                 // We're the last database user, so we can tell it to shut down (blocking):
@@ -280,6 +281,7 @@ impl Drop for ReadStateService {
                 #[cfg(test)]
                 debug!("waiting for the block write task to finish");
 
+                // TODO: move this into a check_for_panics() method
                 if let Err(thread_panic) = block_write_task_handle.join() {
                     std::panic::resume_unwind(thread_panic);
                 } else {
@@ -343,9 +345,7 @@ impl StateService {
             .tip_block()
             .map(CheckpointVerifiedBlock::from)
             .map(ChainTipBlock::from);
-        timer.finish(module_path!(), line!(), "fetching database tip");
 
-        let timer = CodeTimer::start();
         let (chain_tip_sender, latest_chain_tip, chain_tip_change) =
             ChainTipSender::new(initial_tip, network);
 
@@ -1161,6 +1161,8 @@ impl Service<ReadRequest> for ReadStateService {
 
     fn poll_ready(&mut self, _: &mut Context<'_>) -> Poll<Result<(), Self::Error>> {
         // Check for panics in the block write task
+        //
+        // TODO: move into a check_for_panics() method
         let block_write_task = self.block_write_task.take();
 
         if let Some(block_write_task) = block_write_task {
@@ -1177,6 +1179,8 @@ impl Service<ReadRequest> for ReadStateService {
             }
         }
 
+        self.db.check_for_panics();
+
         Poll::Ready(Ok(()))
     }