diff --git a/Cargo.lock b/Cargo.lock index 4ebcc29d4d4b5..0d2f8010dfc39 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2904,10 +2904,12 @@ dependencies = [ "itertools 0.10.5", "maplit", "once_cell", + "ordered-float 3.9.2", "pin-project 1.1.3", "proptest", "proptest-derive 0.4.0", "rand 0.7.3", + "rand 0.8.5", "rand_core 0.5.1", "serde", "serde_bytes", @@ -7923,6 +7925,22 @@ dependencies = [ "miniz_oxide", ] +[[package]] +name = "flexi_logger" +version = "0.27.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "469e584c031833564840fb0cdbce99bdfe946fd45480a188545e73a76f45461c" +dependencies = [ + "chrono", + "glob", + "is-terminal", + "lazy_static", + "log", + "nu-ansi-term 0.49.0", + "regex", + "thiserror", +] + [[package]] name = "float-cmp" version = "0.9.0" @@ -10649,8 +10667,10 @@ dependencies = [ "codespan-reporting", "datatest-stable", "ethnum", + "flexi_logger", "im", "itertools 0.10.5", + "log", "move-binary-format", "move-bytecode-source-map", "move-command-line-common", @@ -10674,6 +10694,7 @@ version = "0.1.0" dependencies = [ "datatest-stable", "move-command-line-common", + "move-compiler-v2", "move-transactional-test-runner", "once_cell", ] @@ -11565,6 +11586,15 @@ dependencies = [ "winapi 0.3.9", ] +[[package]] +name = "nu-ansi-term" +version = "0.49.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c073d3c1930d0751774acf49e66653acecb416c3a54c6ec095a9b11caddb5a68" +dependencies = [ + "windows-sys 0.48.0", +] + [[package]] name = "num" version = "0.3.1" @@ -16114,7 +16144,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ad0f048c97dbd9faa9b7df56362b8ebcaa52adb06b498c050d2f4e32f90a7a8b" dependencies = [ "matchers", - "nu-ansi-term", + "nu-ansi-term 0.46.0", "once_cell", "regex", "serde", diff --git a/aptos-move/aptos-aggregator/src/delayed_change.rs b/aptos-move/aptos-aggregator/src/delayed_change.rs index 2507441a379a0..df1fef613eec4 100644 --- 
a/aptos-move/aptos-aggregator/src/delayed_change.rs +++ b/aptos-move/aptos-aggregator/src/delayed_change.rs @@ -3,11 +3,9 @@ use crate::{ delta_change_set::{DeltaOp, DeltaWithMax}, - types::{ - code_invariant_error, DelayedFieldValue, DelayedFieldsSpeculativeError, PanicOr, - SnapshotToStringFormula, - }, + types::{code_invariant_error, DelayedFieldValue, DelayedFieldsSpeculativeError, PanicOr}, }; +use aptos_types::delayed_fields::SnapshotToStringFormula; #[derive(Clone, Debug, Eq, PartialEq)] pub enum DelayedApplyChange { @@ -400,7 +398,7 @@ mod test { delta: DeltaWithMax::new(SignedU128::Positive(3), 100), }); let snapshot_change_2 = Apply(SnapshotDelta { - base_aggregator: DelayedFieldID::new(1), + base_aggregator: DelayedFieldID::new_for_test_for_u64(1), delta: DeltaWithMax::new(SignedU128::Positive(2), 100), }); @@ -411,7 +409,7 @@ mod test { assert_eq!( result.unwrap(), Apply(SnapshotDelta { - base_aggregator: DelayedFieldID::new(1), + base_aggregator: DelayedFieldID::new_for_test_for_u64(1), delta: DeltaWithMax::new(SignedU128::Positive(5), 100) }) ); diff --git a/aptos-move/aptos-aggregator/src/delayed_field_extension.rs b/aptos-move/aptos-aggregator/src/delayed_field_extension.rs index 8fdf77aafcf29..ce4c3dd7fbd44 100644 --- a/aptos-move/aptos-aggregator/src/delayed_field_extension.rs +++ b/aptos-move/aptos-aggregator/src/delayed_field_extension.rs @@ -8,10 +8,13 @@ use crate::{ resolver::DelayedFieldResolver, types::{ code_invariant_error, expect_ok, DelayedFieldID, DelayedFieldValue, - DelayedFieldsSpeculativeError, PanicOr, ReadPosition, SnapshotToStringFormula, - SnapshotValue, + DelayedFieldsSpeculativeError, PanicOr, ReadPosition, }, }; +use aptos_types::delayed_fields::{ + calculate_width_for_constant_string, calculate_width_for_integer_embeded_string, + SnapshotToStringFormula, +}; use move_binary_format::errors::PartialVMResult; use std::collections::{btree_map::Entry, BTreeMap}; @@ -163,6 +166,7 @@ impl DelayedFieldData { &mut self, 
aggregator_id: DelayedFieldID, max_value: u128, + width: u32, resolver: &dyn DelayedFieldResolver, ) -> PartialVMResult { let aggregator = self.delayed_fields.get(&aggregator_id); @@ -199,36 +203,62 @@ impl DelayedFieldData { }, }; - let snapshot_id = resolver.generate_delayed_field_id(); + let snapshot_id = resolver.generate_delayed_field_id(width); self.delayed_fields.insert(snapshot_id, change); Ok(snapshot_id) } pub fn create_new_snapshot( &mut self, - value: SnapshotValue, + value: u128, + width: u32, resolver: &dyn DelayedFieldResolver, ) -> DelayedFieldID { - let change = DelayedChange::Create(value.into()); - let snapshot_id = resolver.generate_delayed_field_id(); + let change = DelayedChange::Create(DelayedFieldValue::Snapshot(value)); + let snapshot_id = resolver.generate_delayed_field_id(width); self.delayed_fields.insert(snapshot_id, change); snapshot_id } + pub fn create_new_derived( + &mut self, + value: Vec, + resolver: &dyn DelayedFieldResolver, + ) -> PartialVMResult { + // cast shouldn't fail because we assert on low limit for value before this call. + let width = + u32::try_from(calculate_width_for_constant_string(value.len())).map_err(|_| { + code_invariant_error("Calculated DerivedStringSnapshot width exceeds u32") + })?; + let change = DelayedChange::Create(DelayedFieldValue::Derived(value)); + let snapshot_id = resolver.generate_delayed_field_id(width); + + self.delayed_fields.insert(snapshot_id, change); + Ok(snapshot_id) + } + pub fn read_snapshot( &mut self, snapshot_id: DelayedFieldID, resolver: &dyn DelayedFieldResolver, - ) -> PartialVMResult { - Ok(SnapshotValue::try_from(self.read_value( - snapshot_id, - resolver, - ReadPosition::AfterCurrentTxn, - )?)?) + ) -> PartialVMResult { + Ok(self + .read_value(snapshot_id, resolver, ReadPosition::AfterCurrentTxn)? + .into_snapshot_value()?) 
} - pub fn string_concat( + pub fn read_derived( + &mut self, + snapshot_id: DelayedFieldID, + resolver: &dyn DelayedFieldResolver, + ) -> PartialVMResult> { + Ok(self + .read_value(snapshot_id, resolver, ReadPosition::AfterCurrentTxn)? + .into_derived_value()?) + } + + pub fn derive_string_concat( &mut self, snapshot_id: DelayedFieldID, prefix: Vec, @@ -236,6 +266,12 @@ impl DelayedFieldData { resolver: &dyn DelayedFieldResolver, ) -> PartialVMResult { let snapshot = self.delayed_fields.get(&snapshot_id); + // cast shouldn't fail because we assert on low limit for prefix and suffix before this call. + let width = u32::try_from(calculate_width_for_integer_embeded_string( + prefix.len() + suffix.len(), + snapshot_id, + )?) + .map_err(|_| code_invariant_error("Calculated DerivedStringSnapshot width exceeds u32"))?; let formula = SnapshotToStringFormula::Concat { prefix, suffix }; let change = match snapshot { @@ -257,7 +293,7 @@ impl DelayedFieldData { }, }; - let new_id = resolver.generate_delayed_field_id(); + let new_id = resolver.generate_delayed_field_id(width); self.delayed_fields.insert(new_id, change); Ok(new_id) } @@ -280,7 +316,7 @@ mod test { fn test_aggregator_not_in_storage() { let resolver = FakeAggregatorView::default(); let mut data = DelayedFieldData::default(); - let id = DelayedFieldID::new(200); + let id = DelayedFieldID::new_for_test_for_u64(200); let max_value = 700; assert_err!(data.read_aggregator(id, &resolver)); @@ -299,7 +335,7 @@ mod test { fn test_operations_on_new_aggregator() { let resolver = FakeAggregatorView::default(); let mut data = DelayedFieldData::default(); - let id = DelayedFieldID::new(200); + let id = DelayedFieldID::new_for_test_for_u64(200); let max_value = 200; data.create_new_aggregator(id); @@ -354,7 +390,7 @@ mod test { fn test_successful_operations_in_delta_mode() { let mut resolver = FakeAggregatorView::default(); let mut data = DelayedFieldData::default(); - let id = DelayedFieldID::new(200); + let id = 
DelayedFieldID::new_for_test_for_u64(200); let max_value = 600; resolver.set_from_aggregator_id(id, 100); @@ -383,7 +419,7 @@ mod test { fn test_aggregator_overflows() { let mut resolver = FakeAggregatorView::default(); let mut data = DelayedFieldData::default(); - let id = DelayedFieldID::new(600); + let id = DelayedFieldID::new_for_test_for_u64(600); let max_value = 600; resolver.set_from_aggregator_id(id, 100); @@ -418,7 +454,7 @@ mod test { fn test_aggregator_underflows() { let mut resolver = FakeAggregatorView::default(); let mut data = DelayedFieldData::default(); - let id = DelayedFieldID::new(600); + let id = DelayedFieldID::new_for_test_for_u64(600); let max_value = 600; resolver.set_from_aggregator_id(id, 200); diff --git a/aptos-move/aptos-aggregator/src/delta_change_set.rs b/aptos-move/aptos-aggregator/src/delta_change_set.rs index c893821cc3205..8fe9d5b213e05 100644 --- a/aptos-move/aptos-aggregator/src/delta_change_set.rs +++ b/aptos-move/aptos-aggregator/src/delta_change_set.rs @@ -219,7 +219,7 @@ mod test { FakeAggregatorView, }; use aptos_types::{ - aggregator::PanicError, + delayed_fields::PanicError, state_store::{state_key::StateKey, state_value::StateValue}, write_set::WriteOp, }; @@ -532,7 +532,7 @@ mod test { Err(code_invariant_error("Error message from BadStorage.").into()) } - fn generate_delayed_field_id(&self) -> Self::Identifier { + fn generate_delayed_field_id(&self, _width: u32) -> Self::Identifier { unimplemented!("Irrelevant for the test") } diff --git a/aptos-move/aptos-aggregator/src/lib.rs b/aptos-move/aptos-aggregator/src/lib.rs index dbb43af4b028d..072864b0a8c31 100644 --- a/aptos-move/aptos-aggregator/src/lib.rs +++ b/aptos-move/aptos-aggregator/src/lib.rs @@ -9,7 +9,6 @@ pub mod delta_change_set; pub mod delta_math; pub mod resolver; pub mod types; -pub mod utils; #[cfg(any(test, feature = "testing"))] pub mod tests; diff --git a/aptos-move/aptos-aggregator/src/resolver.rs b/aptos-move/aptos-aggregator/src/resolver.rs index 
907e89aa2f73d..4b5145255f365 100644 --- a/aptos-move/aptos-aggregator/src/resolver.rs +++ b/aptos-move/aptos-aggregator/src/resolver.rs @@ -11,7 +11,7 @@ use crate::{ }, }; use aptos_types::{ - aggregator::PanicError, + delayed_fields::PanicError, state_store::{ state_key::StateKey, state_value::{StateValue, StateValueMetadata}, @@ -164,7 +164,7 @@ pub trait TDelayedFieldView { /// Returns a unique per-block identifier that can be used when creating a /// new aggregator V2. - fn generate_delayed_field_id(&self) -> Self::Identifier; + fn generate_delayed_field_id(&self, width: u32) -> Self::Identifier; /// Validate that given value (from aggregator structure) is a valid delayed field identifier, /// and convert it to Self::Identifier if so. @@ -251,7 +251,7 @@ where /// Returns a unique per-block identifier that can be used when creating a /// new aggregator V2. - fn generate_delayed_field_id(&self) -> Self::Identifier { + fn generate_delayed_field_id(&self, _width: u32) -> Self::Identifier { unimplemented!("generate_delayed_field_id not implemented") } diff --git a/aptos-move/aptos-aggregator/src/tests/identifier_mappings.rs b/aptos-move/aptos-aggregator/src/tests/identifier_mappings.rs index 73f32acaa6ff9..1ae71a6d27d87 100644 --- a/aptos-move/aptos-aggregator/src/tests/identifier_mappings.rs +++ b/aptos-move/aptos-aggregator/src/tests/identifier_mappings.rs @@ -1,8 +1,8 @@ // Copyright © Aptos Foundation // SPDX-License-Identifier: Apache-2.0 -use super::super::utils::bytes_to_string; use crate::types::{DelayedFieldID, DelayedFieldValue, TryFromMoveValue, TryIntoMoveValue}; +use aptos_types::delayed_fields::bytes_and_width_to_derived_string_struct; use claims::{assert_err, assert_ok}; use move_core_types::value::{ IdentifierMappingKind, @@ -16,15 +16,23 @@ use test_case::test_case; use DelayedFieldValue as A; use IdentifierMappingKind as K; -static STRING: Lazy = Lazy::new(|| Struct(Runtime(vec![Vector(Box::new(U8))]))); +static DERIVED_STRING: Lazy = 
Lazy::new(|| { + Struct(Runtime(vec![ + // String value + Struct(Runtime(vec![Vector(Box::new(U8))])), + // Vec padding + Vector(Box::new(U8)), + ])) +}); -#[test_case(&U64)] -#[test_case(&U128)] -#[test_case(&*STRING)] -fn test_aggregator_id_roundtrip_ok(layout: &MoveTypeLayout) { - let value = assert_ok!(DelayedFieldID::new(100).try_into_move_value(layout)); - let id = assert_ok!(DelayedFieldID::try_from_move_value(layout, value, &())); - assert_eq!(id, DelayedFieldID::new(100)); +#[test_case(&U64, 8)] +#[test_case(&U128, 16)] +#[test_case(&*DERIVED_STRING, 20)] +fn test_aggregator_id_roundtrip_ok(layout: &MoveTypeLayout, width: u32) { + let input = DelayedFieldID::new_with_width(100, width); + let value = assert_ok!(input.try_into_move_value(layout)); + let (id, _) = assert_ok!(DelayedFieldID::try_from_move_value(layout, value, &())); + assert_eq!(id, input); } #[test_case(&U8)] @@ -32,7 +40,7 @@ fn test_aggregator_id_roundtrip_ok(layout: &MoveTypeLayout) { #[test_case(&Address)] #[test_case(&Vector(Box::new(U8)))] fn test_aggregator_id_to_value_err(layout: &MoveTypeLayout) { - assert_err!(DelayedFieldID::new(100).try_into_move_value(layout)); + assert_err!(DelayedFieldID::new_with_width(100, 8).try_into_move_value(layout)); } #[test_case(&U64, Value::u8(1))] @@ -43,29 +51,30 @@ fn test_aggregator_id_from_value_err(layout: &MoveTypeLayout, value: Value) { assert_err!(DelayedFieldID::try_from_move_value(layout, value, &())); } -#[test_case(A::Aggregator(10), &U64, K::Aggregator)] -#[test_case(A::Aggregator(10), &U128, K::Aggregator)] -#[test_case(A::Snapshot(10), &U64, K::Snapshot)] -#[test_case(A::Snapshot(10), &U128, K::Snapshot)] -#[test_case(A::Derived(vec![0, 1]), &*STRING, K::Snapshot)] +#[test_case(A::Aggregator(10), &U64, K::Aggregator, 8)] +#[test_case(A::Aggregator(10), &U128, K::Aggregator, 16)] +#[test_case(A::Snapshot(10), &U64, K::Snapshot, 8)] +#[test_case(A::Snapshot(10), &U128, K::Snapshot, 16)] +#[test_case(A::Derived(vec![0, 1]), 
&*DERIVED_STRING, K::DerivedString, 20)] fn test_aggregator_value_roundtrip_ok( aggregator_value: DelayedFieldValue, layout: &MoveTypeLayout, kind: IdentifierMappingKind, + width: u32, ) { - let value = assert_ok!(aggregator_value.clone().try_into_move_value(layout)); - let a = assert_ok!(DelayedFieldValue::try_from_move_value(layout, value, &kind)); + let value = assert_ok!(aggregator_value.clone().try_into_move_value(layout, width)); + let (a, _) = assert_ok!(DelayedFieldValue::try_from_move_value(layout, value, &kind)); assert_eq!(a, aggregator_value); } -#[test_case(&U8)] -#[test_case(&Bool)] -#[test_case(&Address)] -#[test_case(&Vector(Box::new(U8)))] -fn test_aggregator_value_to_value_err(layout: &MoveTypeLayout) { - assert_err!(DelayedFieldValue::Aggregator(0).try_into_move_value(layout)); - assert_err!(DelayedFieldValue::Snapshot(1).try_into_move_value(layout)); - assert_err!(DelayedFieldValue::Derived(vec![3]).try_into_move_value(layout)); +#[test_case(&U8, 1)] +#[test_case(&Bool, 1)] +#[test_case(&Address, 20)] +#[test_case(&Vector(Box::new(U8)), 5)] +fn test_aggregator_value_to_value_err(layout: &MoveTypeLayout, width: u32) { + assert_err!(DelayedFieldValue::Aggregator(0).try_into_move_value(layout, width)); + assert_err!(DelayedFieldValue::Snapshot(1).try_into_move_value(layout, width)); + assert_err!(DelayedFieldValue::Derived(vec![3]).try_into_move_value(layout, width)); } #[test_case(&U64, Value::u8(1), K::Aggregator)] @@ -73,7 +82,7 @@ fn test_aggregator_value_to_value_err(layout: &MoveTypeLayout) { #[test_case(&U8, Value::u8(1), K::Snapshot)] #[test_case(&Bool, Value::u8(1), K::Snapshot)] #[test_case(&Vector(Box::new(U8)), Value::vector_u8(vec![0, 1]), K::Snapshot)] -#[test_case(&*STRING, bytes_to_string(vec![1,2]), K::Aggregator)] +#[test_case(&*DERIVED_STRING, bytes_and_width_to_derived_string_struct(vec![1,2], 20).unwrap(), K::Aggregator)] fn test_aggregator_value_from_value_err( layout: &MoveTypeLayout, value: Value, diff --git 
a/aptos-move/aptos-aggregator/src/tests/types.rs b/aptos-move/aptos-aggregator/src/tests/types.rs index 5c185acfa1762..b4befda3c6d04 100644 --- a/aptos-move/aptos-aggregator/src/tests/types.rs +++ b/aptos-move/aptos-aggregator/src/tests/types.rs @@ -12,7 +12,7 @@ use crate::{ }, }; use aptos_types::{ - aggregator::PanicError, + delayed_fields::PanicError, state_store::{state_key::StateKey, state_value::StateValue}, write_set::WriteOp, }; @@ -112,11 +112,11 @@ impl TDelayedFieldView for FakeAggregatorView { Ok(math.unsigned_add_delta(base, delta).is_ok()) } - fn generate_delayed_field_id(&self) -> Self::Identifier { + fn generate_delayed_field_id(&self, width: u32) -> Self::Identifier { let mut counter = self.counter.borrow_mut(); - let id = Self::Identifier::new(*counter as u64); + let id = *counter; *counter += 1; - id + DelayedFieldID::new_with_width(id, width) } fn validate_and_convert_delayed_field_id( @@ -138,7 +138,7 @@ impl TDelayedFieldView for FakeAggregatorView { ))); } - Ok(Self::Identifier::new(id)) + Ok(Self::Identifier::from(id)) } fn get_reads_needing_exchange( diff --git a/aptos-move/aptos-aggregator/src/types.rs b/aptos-move/aptos-aggregator/src/types.rs index 8d6c578d17bf8..6276f1049e88d 100644 --- a/aptos-move/aptos-aggregator/src/types.rs +++ b/aptos-move/aptos-aggregator/src/types.rs @@ -1,13 +1,16 @@ // Copyright © Aptos Foundation // SPDX-License-Identifier: Apache-2.0 -use crate::{ - bounded_math::SignedU128, - utils::{bytes_to_string, is_string_layout, string_to_bytes}, -}; +use crate::bounded_math::SignedU128; use aptos_logger::error; +use aptos_types::delayed_fields::{ + bytes_and_width_to_derived_string_struct, derived_string_struct_to_bytes_and_length, + is_derived_string_struct_layout, +}; // TODO[agg_v2](cleanup): After aggregators_v2 branch land, consolidate these, instead of using alias here -pub use aptos_types::aggregator::{DelayedFieldID, PanicError, TryFromMoveValue, TryIntoMoveValue}; +pub use aptos_types::delayed_fields::{ + 
DelayedFieldID, PanicError, TryFromMoveValue, TryIntoMoveValue, +}; use move_binary_format::errors::PartialVMError; use move_core_types::{ value::{IdentifierMappingKind, MoveTypeLayout}, @@ -200,19 +203,33 @@ impl DelayedFieldValue { )), } } -} - -impl TryIntoMoveValue for DelayedFieldValue { - type Error = PartialVMError; - fn try_into_move_value(self, layout: &MoveTypeLayout) -> Result { + pub fn try_into_move_value( + self, + layout: &MoveTypeLayout, + width: u32, + ) -> Result { use DelayedFieldValue::*; use MoveTypeLayout::*; Ok(match (self, layout) { - (Aggregator(v) | Snapshot(v), U64) => Value::u64(v as u64), - (Aggregator(v) | Snapshot(v), U128) => Value::u128(v), - (Derived(bytes), layout) if is_string_layout(layout) => bytes_to_string(bytes), + (Aggregator(v) | Snapshot(v), U64) => { + if width != 8 { + return Err(PartialVMError::new(StatusCode::VM_EXTENSION_ERROR) + .with_message(format!("Expected width 8 for U64, got {}", width))); + } + Value::u64(v as u64) + }, + (Aggregator(v) | Snapshot(v), U128) => { + if width != 16 { + return Err(PartialVMError::new(StatusCode::VM_EXTENSION_ERROR) + .with_message(format!("Expected width 16 for U128, got {}", width))); + } + Value::u128(v) + }, + (Derived(bytes), layout) if is_derived_string_struct_layout(layout) => { + bytes_and_width_to_derived_string_struct(bytes, width as usize)? + }, (value, layout) => { return Err( PartialVMError::new(StatusCode::VM_EXTENSION_ERROR).with_message(format!( @@ -236,19 +253,20 @@ impl TryFromMoveValue for DelayedFieldValue { layout: &MoveTypeLayout, value: Value, hint: &Self::Hint, - ) -> Result { + ) -> Result<(Self, u32), Self::Error> { use DelayedFieldValue::*; use IdentifierMappingKind as K; use MoveTypeLayout as L; Ok(match (hint, layout) { - (K::Aggregator, L::U64) => Aggregator(value.value_as::()? as u128), - (K::Aggregator, L::U128) => Aggregator(value.value_as::()?), - (K::Snapshot, L::U64) => Snapshot(value.value_as::()? 
as u128), - (K::Snapshot, L::U128) => Snapshot(value.value_as::()?), - (K::Snapshot, layout) if is_string_layout(layout) => { - let bytes = string_to_bytes(value.value_as::()?)?; - Derived(bytes) + (K::Aggregator, L::U64) => (Aggregator(value.value_as::()? as u128), 8), + (K::Aggregator, L::U128) => (Aggregator(value.value_as::()?), 16), + (K::Snapshot, L::U64) => (Snapshot(value.value_as::()? as u128), 8), + (K::Snapshot, L::U128) => (Snapshot(value.value_as::()?), 16), + (K::DerivedString, layout) if is_derived_string_struct_layout(layout) => { + let (bytes, width) = + derived_string_struct_to_bytes_and_length(value.value_as::()?)?; + (Derived(bytes), width) }, _ => { return Err( @@ -262,69 +280,6 @@ impl TryFromMoveValue for DelayedFieldValue { } } -// TODO[agg_v2](cleanup) see if we need both AggregatorValue and SnapshotValue. -// Or alternatively, maybe they should be nested (i.e. DelayedFieldValue::Snapshot(SnapshotValue)) -#[derive(Debug, Clone, PartialEq, Eq)] -pub enum SnapshotValue { - Integer(u128), - String(Vec), -} - -impl SnapshotValue { - pub fn into_aggregator_value(self) -> Result { - match self { - SnapshotValue::Integer(value) => Ok(value), - SnapshotValue::String(_) => Err(code_invariant_error( - "Tried calling into_aggregator_value on String SnapshotValue", - )), - } - } -} - -impl TryFrom for SnapshotValue { - type Error = PanicError; - - fn try_from(value: DelayedFieldValue) -> Result { - match value { - DelayedFieldValue::Aggregator(_) => Err(code_invariant_error( - "Tried calling SnapshotValue::try_from on AggregatorValue(Aggregator)", - )), - DelayedFieldValue::Snapshot(v) => Ok(SnapshotValue::Integer(v)), - DelayedFieldValue::Derived(v) => Ok(SnapshotValue::String(v)), - } - } -} - -impl From for DelayedFieldValue { - fn from(value: SnapshotValue) -> DelayedFieldValue { - match value { - SnapshotValue::Integer(v) => DelayedFieldValue::Snapshot(v), - SnapshotValue::String(v) => DelayedFieldValue::Derived(v), - } - } -} - -#[derive(Clone, 
Debug, Eq, PartialEq)] -pub enum SnapshotToStringFormula { - Concat { prefix: Vec, suffix: Vec }, -} - -impl SnapshotToStringFormula { - pub fn apply_to(&self, base: u128) -> Vec { - match self { - SnapshotToStringFormula::Concat { prefix, suffix } => { - let middle_string = base.to_string(); - let middle = middle_string.as_bytes(); - let mut result = Vec::with_capacity(prefix.len() + middle.len() + suffix.len()); - result.extend(prefix); - result.extend(middle); - result.extend(suffix); - result - }, - } - } -} - pub enum ReadPosition { BeforeCurrentTxn, AfterCurrentTxn, diff --git a/aptos-move/aptos-aggregator/src/utils.rs b/aptos-move/aptos-aggregator/src/utils.rs deleted file mode 100644 index 48c002fe24257..0000000000000 --- a/aptos-move/aptos-aggregator/src/utils.rs +++ /dev/null @@ -1,59 +0,0 @@ -// Copyright © Aptos Foundation -// SPDX-License-Identifier: Apache-2.0 - -use move_binary_format::errors::{PartialVMError, PartialVMResult}; -use move_core_types::{value::MoveTypeLayout, vm_status::StatusCode}; -use move_vm_types::values::{Struct, Value}; -use std::str::FromStr; - -/// Returns true if the type layout corresponds to a String, which should be a -/// struct with a single byte vector field. 
-pub(crate) fn is_string_layout(layout: &MoveTypeLayout) -> bool { - use MoveTypeLayout::*; - if let Struct(move_struct) = layout { - if let [Vector(elem)] = move_struct.fields().iter().as_slice() { - if let U8 = elem.as_ref() { - return true; - } - } - } - false -} - -pub fn bytes_to_string(bytes: Vec) -> Value { - Value::struct_(Struct::pack(vec![Value::vector_u8(bytes)])) -} - -pub fn string_to_bytes(value: Struct) -> PartialVMResult> { - value.unpack()?.collect::>().pop().map_or_else( - || { - Err(PartialVMError::new(StatusCode::VM_EXTENSION_ERROR) - .with_message("Unable to extract bytes from String".to_string())) - }, - |v| v.value_as::>(), - ) -} - -pub fn to_utf8_bytes(value: impl ToString) -> Vec { - value.to_string().into_bytes() -} - -pub fn from_utf8_bytes(bytes: Vec) -> PartialVMResult { - String::from_utf8(bytes) - .map_err(|e| { - PartialVMError::new(StatusCode::UNKNOWN_INVARIANT_VIOLATION_ERROR) - .with_message(format!("Unable to convert bytes to string: {}", e)) - })? - .parse::() - .map_err(|_| { - PartialVMError::new(StatusCode::UNKNOWN_INVARIANT_VIOLATION_ERROR) - .with_message("Unable to parse string".to_string()) - }) -} - -pub fn u128_to_u64(value: u128) -> PartialVMResult { - u64::try_from(value).map_err(|_| { - PartialVMError::new(StatusCode::UNKNOWN_INVARIANT_VIOLATION_ERROR) - .with_message("Cannot cast u128 into u64".to_string()) - }) -} diff --git a/aptos-move/aptos-vm-types/src/change_set.rs b/aptos-move/aptos-vm-types/src/change_set.rs index c619cf953edd4..13116ffa1d8f2 100644 --- a/aptos-move/aptos-vm-types/src/change_set.rs +++ b/aptos-move/aptos-vm-types/src/change_set.rs @@ -15,8 +15,8 @@ use aptos_aggregator::{ types::{code_invariant_error, DelayedFieldID}, }; use aptos_types::{ - aggregator::PanicError, contract_event::ContractEvent, + delayed_fields::PanicError, state_store::{ state_key::{StateKey, StateKeyInner}, state_value::StateValueMetadata, @@ -232,22 +232,22 @@ impl VMChangeSet { /// Builds a new change set from the 
storage representation. /// + /// **WARNING**: this creates a write set that assumes dynamic change set optimizations to be disabled. + /// this needs to be applied directly to storage, you cannot get appropriate reads from this in a + /// dynamic change set optimization enabled context. + /// We have two dynamic change set optimizations, both there to reduce conflicts between transactions: + /// - exchanging delayed fields and leaving their materialization to happen at the end + /// - unpacking resource groups and treating each resource inside it separately + /// /// **WARNING**: Has complexity O(#write_ops) because we need to iterate /// over blobs and split them into resources or modules. Only used to /// support transactions with write-set payload. /// /// Note: does not separate out individual resource group updates. - pub fn try_from_storage_change_set( + pub fn try_from_storage_change_set_with_delayed_field_optimization_disabled( change_set: StorageChangeSet, checker: &dyn CheckChangeSet, - // Pass in within which resolver context are we creating this change set. - // Used to eagerly reject changes created in an incompatible way. - is_delayed_field_optimization_capable: bool, ) -> VMResult { - assert!( - !is_delayed_field_optimization_capable, - "try_from_storage_change_set can only be called in non-is_delayed_field_optimization_capable context, as it doesn't support delayed field changes (type layout) and resource groups"); - let (write_set, events) = change_set.into_inner(); // There should be no aggregator writes if we have a change set from @@ -271,9 +271,6 @@ impl VMChangeSet { // We can set layout to None, as we are not in the is_delayed_field_optimization_capable context let events = events.into_iter().map(|event| (event, None)).collect(); let change_set = Self { - // TODO[agg_v2](fix): do we use same or different capable flag for resource groups? 
- // We should skip unpacking resource groups, as we are not in the is_delayed_field_optimization_capable - // context (i.e. if dynamic_change_set_optimizations_enabled is disabled) resource_write_set, module_write_set, delayed_field_change_set: BTreeMap::new(), diff --git a/aptos-move/aptos-vm-types/src/output.rs b/aptos-move/aptos-vm-types/src/output.rs index 499a1fd52519f..56b22af00faac 100644 --- a/aptos-move/aptos-vm-types/src/output.rs +++ b/aptos-move/aptos-vm-types/src/output.rs @@ -4,8 +4,8 @@ use crate::change_set::VMChangeSet; use aptos_aggregator::{resolver::AggregatorV1Resolver, types::code_invariant_error}; use aptos_types::{ - aggregator::PanicError, contract_event::ContractEvent, //contract_event::ContractEvent, + delayed_fields::PanicError, fee_statement::FeeStatement, state_store::state_key::StateKey, transaction::{TransactionOutput, TransactionStatus}, diff --git a/aptos-move/aptos-vm-types/src/resolver.rs b/aptos-move/aptos-vm-types/src/resolver.rs index 31cdfd5b1266c..9c6084680f82d 100644 --- a/aptos-move/aptos-vm-types/src/resolver.rs +++ b/aptos-move/aptos-vm-types/src/resolver.rs @@ -6,6 +6,7 @@ use aptos_aggregator::{ types::DelayedFieldID, }; use aptos_types::{ + serde_helper::bcs_utils::size_u32_as_uleb128, state_store::{ errors::StateviewError, state_key::StateKey, @@ -295,16 +296,6 @@ pub enum ResourceGroupSize { }, } -pub fn size_u32_as_uleb128(mut value: usize) -> usize { - let mut len = 1; - while value >= 0x80 { - // 7 (lowest) bits of data get written in a single byte. 
- len += 1; - value >>= 7; - } - len -} - impl ResourceGroupSize { pub fn zero_combined() -> Self { Self::Combined { @@ -333,12 +324,3 @@ impl ResourceGroupSize { } } } - -#[test] -fn test_size_u32_as_uleb128() { - assert_eq!(size_u32_as_uleb128(0), 1); - assert_eq!(size_u32_as_uleb128(127), 1); - assert_eq!(size_u32_as_uleb128(128), 2); - assert_eq!(size_u32_as_uleb128(128 * 128 - 1), 2); - assert_eq!(size_u32_as_uleb128(128 * 128), 3); -} diff --git a/aptos-move/aptos-vm-types/src/resource_group_adapter.rs b/aptos-move/aptos-vm-types/src/resource_group_adapter.rs index c6c23b3acea69..0912b8dad2cce 100644 --- a/aptos-move/aptos-vm-types/src/resource_group_adapter.rs +++ b/aptos-move/aptos-vm-types/src/resource_group_adapter.rs @@ -1,10 +1,10 @@ // Copyright © Aptos Foundation // SPDX-License-Identifier: Apache-2.0 -use crate::resolver::{ - size_u32_as_uleb128, ResourceGroupSize, ResourceGroupView, TResourceGroupView, TResourceView, +use crate::resolver::{ResourceGroupSize, ResourceGroupView, TResourceGroupView, TResourceView}; +use aptos_types::{ + serde_helper::bcs_utils::bcs_size_of_byte_array, state_store::state_key::StateKey, }; -use aptos_types::state_store::state_key::StateKey; use bytes::Bytes; use move_binary_format::errors::{PartialVMError, PartialVMResult}; use move_core_types::{language_storage::StructTag, value::MoveTypeLayout, vm_status::StatusCode}; @@ -52,8 +52,7 @@ pub fn group_tagged_resource_size( "Tag serialization error for tag {:?}: {:?}", tag, e )) - })? + value_byte_len - + size_u32_as_uleb128(value_byte_len)) as u64) + })? + bcs_size_of_byte_array(value_byte_len)) as u64) } /// Utility method to compute the size of the group as GroupSizeKind::AsSum. 
diff --git a/aptos-move/aptos-vm-types/src/tests/test_change_set.rs b/aptos-move/aptos-vm-types/src/tests/test_change_set.rs index 9b265b689411e..775850abcedc0 100644 --- a/aptos-move/aptos-vm-types/src/tests/test_change_set.rs +++ b/aptos-move/aptos-vm-types/src/tests/test_change_set.rs @@ -15,11 +15,11 @@ use aptos_aggregator::{ bounded_math::SignedU128, delayed_change::{DelayedApplyChange, DelayedChange}, delta_change_set::DeltaWithMax, - types::{DelayedFieldID, SnapshotToStringFormula}, + types::DelayedFieldID, }; use aptos_types::{ access_path::AccessPath, - aggregator::PanicError, + delayed_fields::{PanicError, SnapshotToStringFormula}, state_store::state_key::StateKey, transaction::ChangeSet as StorageChangeSet, write_set::{WriteOp, WriteSetMut}, @@ -386,11 +386,12 @@ fn test_roundtrip_to_storage_change_set() { .unwrap(); let storage_change_set_before = StorageChangeSet::new(write_set, vec![]); - let change_set = assert_ok!(VMChangeSet::try_from_storage_change_set( - storage_change_set_before.clone(), - &MockChangeSetChecker, - false, - )); + let change_set = assert_ok!( + VMChangeSet::try_from_storage_change_set_with_delayed_field_optimization_disabled( + storage_change_set_before.clone(), + &MockChangeSetChecker, + ) + ); let storage_change_set_after = assert_ok!(change_set.try_into_storage_change_set()); assert_eq!(storage_change_set_before, storage_change_set_after) } @@ -427,7 +428,7 @@ fn test_aggregator_v2_snapshots_and_derived() { use DelayedChange::*; let agg_changes_1 = vec![( - DelayedFieldID::new(1), + DelayedFieldID::new_for_test_for_u64(1), Apply(AggregatorDelta { delta: DeltaWithMax::new(SignedU128::Positive(3), 100), }), @@ -438,22 +439,22 @@ fn test_aggregator_v2_snapshots_and_derived() { let agg_changes_2 = vec![ ( - DelayedFieldID::new(1), + DelayedFieldID::new_for_test_for_u64(1), Apply(AggregatorDelta { delta: DeltaWithMax::new(SignedU128::Positive(5), 100), }), ), ( - DelayedFieldID::new(2), + DelayedFieldID::new_for_test_for_u64(2), 
Apply(SnapshotDelta { - base_aggregator: DelayedFieldID::new(1), + base_aggregator: DelayedFieldID::new_for_test_for_u64(1), delta: DeltaWithMax::new(SignedU128::Positive(2), 100), }), ), ( - DelayedFieldID::new(3), + DelayedFieldID::new_for_test_for_u64(3), Apply(SnapshotDerived { - base_snapshot: DelayedFieldID::new(2), + base_snapshot: DelayedFieldID::new_for_test_for_u64(2), formula: SnapshotToStringFormula::Concat { prefix: "p".as_bytes().to_vec(), suffix: "s".as_bytes().to_vec(), @@ -470,22 +471,22 @@ fn test_aggregator_v2_snapshots_and_derived() { let output_map = change_set_1.delayed_field_change_set(); assert_eq!(output_map.len(), 3); assert_some_eq!( - output_map.get(&DelayedFieldID::new(1)), + output_map.get(&DelayedFieldID::new_for_test_for_u64(1)), &Apply(AggregatorDelta { delta: DeltaWithMax::new(SignedU128::Positive(8), 100) }) ); assert_some_eq!( - output_map.get(&DelayedFieldID::new(2)), + output_map.get(&DelayedFieldID::new_for_test_for_u64(2)), &Apply(SnapshotDelta { - base_aggregator: DelayedFieldID::new(1), + base_aggregator: DelayedFieldID::new_for_test_for_u64(1), delta: DeltaWithMax::new(SignedU128::Positive(5), 100) }) ); assert_some_eq!( - output_map.get(&DelayedFieldID::new(3)), + output_map.get(&DelayedFieldID::new_for_test_for_u64(3)), &Apply(SnapshotDerived { - base_snapshot: DelayedFieldID::new(2), + base_snapshot: DelayedFieldID::new_for_test_for_u64(2), formula: SnapshotToStringFormula::Concat { prefix: "p".as_bytes().to_vec(), suffix: "s".as_bytes().to_vec() diff --git a/aptos-move/aptos-vm/Cargo.toml b/aptos-move/aptos-vm/Cargo.toml index 17362472fd3c5..79ff63345615d 100644 --- a/aptos-move/aptos-vm/Cargo.toml +++ b/aptos-move/aptos-vm/Cargo.toml @@ -66,13 +66,13 @@ tracing = { workspace = true } [dev-dependencies] aptos-aggregator = { workspace = true, features = ["testing"] } aptos-language-e2e-tests = { workspace = true } -aptos-types = { workspace = true } +aptos-types = { workspace = true, features = ["fuzzing"] } claims = { 
workspace = true } proptest = { workspace = true } rand_core = { workspace = true } [features] default = [] -fuzzing = ["move-core-types/fuzzing", "move-binary-format/fuzzing", "move-vm-types/fuzzing", "aptos-framework/fuzzing"] +fuzzing = ["move-core-types/fuzzing", "move-binary-format/fuzzing", "move-vm-types/fuzzing", "aptos-framework/fuzzing", "aptos-types/fuzzing"] failpoints = ["fail/failpoints", "move-vm-runtime/failpoints"] testing = ["move-unit-test", "aptos-framework/testing"] diff --git a/aptos-move/aptos-vm/src/aptos_vm.rs b/aptos-move/aptos-vm/src/aptos_vm.rs index 1ca09f430e2ef..e3bb0fbe552ae 100644 --- a/aptos-move/aptos-vm/src/aptos_vm.rs +++ b/aptos-move/aptos-vm/src/aptos_vm.rs @@ -1648,12 +1648,29 @@ impl AptosVM { ChangeSetConfigs::unlimited_at_gas_feature_version(self.gas_feature_version); match write_set_payload { - WriteSetPayload::Direct(change_set) => VMChangeSet::try_from_storage_change_set( - change_set.clone(), - &change_set_configs, - resolver.is_delayed_field_optimization_capable(), - ) - .map_err(|e| e.into_vm_status()), + WriteSetPayload::Direct(change_set) => { + // this transaction is never delayed field capable. + // it requires restarting execution afterwards, + // which allows it to be used as last transaction in delayed_field_enabled context. + let change = VMChangeSet::try_from_storage_change_set_with_delayed_field_optimization_disabled( + change_set.clone(), + &change_set_configs, + ) + .map_err(|e| e.into_vm_status())?; + + // validate_waypoint_change_set checks that this is true, so we only log here. + if !Self::should_restart_execution(&change) { + // This invariant needs to hold irrespectively, so we log error always. + // but if we are in delayed_field_optimization_capable context, we cannot execute any transaction after this. 
+ // as transaction afterwards would be executed assuming delayed fields are exchanged and + // resource groups are split, but WriteSetPayload::Direct has materialized writes, + // and so after executing this transaction versioned state is inconsistent. + error!( + "[aptos_vm] direct write set finished without requiring should_restart_execution"); + } + + Ok(change) + }, WriteSetPayload::Script { script, execute_as } => { let mut tmp_session = self.new_session(resolver, session_id); let senders = match txn_sender { @@ -2006,10 +2023,9 @@ impl AptosVM { } } - pub fn should_restart_execution(vm_output: &VMOutput) -> bool { + pub fn should_restart_execution(vm_change_set: &VMChangeSet) -> bool { let new_epoch_event_key = new_epoch_event_key(); - vm_output - .change_set() + vm_change_set .events() .iter() .any(|(event, _)| event.event_key() == Some(&new_epoch_event_key)) diff --git a/aptos-move/aptos-vm/src/block_executor/vm_wrapper.rs b/aptos-move/aptos-vm/src/block_executor/vm_wrapper.rs index 9c729e4d4a37b..bfcd9716ecf0a 100644 --- a/aptos-move/aptos-vm/src/block_executor/vm_wrapper.rs +++ b/aptos-move/aptos-vm/src/block_executor/vm_wrapper.rs @@ -48,13 +48,6 @@ impl<'a, S: 'a + StateView + Sync> ExecutorTask for AptosExecutorTask<'a, S> { txn: &SignatureVerifiedTransaction, txn_idx: TxnIndex, ) -> ExecutionStatus { - if (executor_with_group_view.is_delayed_field_optimization_capable() - || executor_with_group_view.is_resource_group_split_in_change_set_capable()) - && !Self::is_transaction_dynamic_change_set_capable(txn) - { - return ExecutionStatus::DirectWriteSetTransactionNotCapableError; - } - let log_context = AdapterLogSchema::new(self.base_view.id(), txn_idx as usize); let resolver = self .vm @@ -90,13 +83,17 @@ impl<'a, S: 'a + StateView + Sync> ExecutorTask for AptosExecutorTask<'a, S> { ExecutionStatus::DelayedFieldsCodeInvariantError( vm_status.message().cloned().unwrap_or_default(), ) - } else if AptosVM::should_restart_execution(&vm_output) { + } else 
if AptosVM::should_restart_execution(vm_output.change_set()) { speculative_info!( &log_context, "Reconfiguration occurred: restart required".into() ); ExecutionStatus::SkipRest(AptosTransactionOutput::new(vm_output)) } else { + assert!( + Self::is_transaction_dynamic_change_set_capable(txn), + "DirectWriteSet should always create SkipRest transaction, validate_waypoint_change_set provides this guarantee" + ); ExecutionStatus::Success(AptosTransactionOutput::new(vm_output)) } }, diff --git a/aptos-move/aptos-vm/src/data_cache.rs b/aptos-move/aptos-vm/src/data_cache.rs index b2f390ff39853..9652dcc9f26cf 100644 --- a/aptos-move/aptos-vm/src/data_cache.rs +++ b/aptos-move/aptos-vm/src/data_cache.rs @@ -18,7 +18,7 @@ use aptos_aggregator::{ use aptos_table_natives::{TableHandle, TableResolver}; use aptos_types::{ access_path::AccessPath, - aggregator::PanicError, + delayed_fields::PanicError, on_chain_config::{ConfigStorage, Features, OnChainConfig}, state_store::{ errors::StateviewError, state_key::StateKey, state_storage_usage::StateStorageUsage, @@ -273,8 +273,8 @@ impl<'e, E: ExecutorView> TDelayedFieldView for StorageAdapter<'e, E> { .delayed_field_try_add_delta_outcome(id, base_delta, delta, max_value) } - fn generate_delayed_field_id(&self) -> Self::Identifier { - self.executor_view.generate_delayed_field_id() + fn generate_delayed_field_id(&self, width: u32) -> Self::Identifier { + self.executor_view.generate_delayed_field_id(width) } fn validate_and_convert_delayed_field_id( diff --git a/aptos-move/aptos-vm/src/move_vm_ext/respawned_session.rs b/aptos-move/aptos-vm/src/move_vm_ext/respawned_session.rs index 3c0c5170a34e5..98e5312aca03a 100644 --- a/aptos-move/aptos-vm/src/move_vm_ext/respawned_session.rs +++ b/aptos-move/aptos-vm/src/move_vm_ext/respawned_session.rs @@ -18,7 +18,7 @@ use aptos_aggregator::{ }; use aptos_gas_algebra::Fee; use aptos_types::{ - aggregator::PanicError, + delayed_fields::PanicError, state_store::{ errors::StateviewError, 
state_key::StateKey, @@ -263,8 +263,8 @@ impl<'r> TDelayedFieldView for ExecutorViewWithChangeSet<'r> { } } - fn generate_delayed_field_id(&self) -> Self::Identifier { - self.base_executor_view.generate_delayed_field_id() + fn generate_delayed_field_id(&self, width: u32) -> Self::Identifier { + self.base_executor_view.generate_delayed_field_id(width) } fn validate_and_convert_delayed_field_id( diff --git a/aptos-move/aptos-vm/src/natives.rs b/aptos-move/aptos-vm/src/natives.rs index bc839634c94ae..4be6d9c6d051b 100644 --- a/aptos-move/aptos-vm/src/natives.rs +++ b/aptos-move/aptos-vm/src/natives.rs @@ -26,8 +26,8 @@ use aptos_types::{ }; #[cfg(feature = "testing")] use aptos_types::{ - aggregator::PanicError, chain_id::ChainId, + delayed_fields::PanicError, state_store::{state_key::StateKey, state_value::StateValue}, write_set::WriteOp, }; @@ -104,7 +104,7 @@ impl TDelayedFieldView for AptosBlankStorage { unreachable!() } - fn generate_delayed_field_id(&self) -> Self::Identifier { + fn generate_delayed_field_id(&self, _width: u32) -> Self::Identifier { unreachable!() } diff --git a/aptos-move/aptos-vm/src/tests/mock_view.rs b/aptos-move/aptos-vm/src/tests/mock_view.rs index 5fed9c4cd5ab7..1d7c2d4b3359a 100644 --- a/aptos-move/aptos-vm/src/tests/mock_view.rs +++ b/aptos-move/aptos-vm/src/tests/mock_view.rs @@ -1,10 +1,13 @@ // Copyright © Aptos Foundation // SPDX-License-Identifier: Apache-2.0 -use aptos_aggregator::types::{DelayedFieldID, TryFromMoveValue, TryIntoMoveValue}; +use aptos_aggregator::types::{ + DelayedFieldID, DelayedFieldValue, TryFromMoveValue, TryIntoMoveValue, +}; use aptos_table_natives::{TableHandle, TableResolver}; use aptos_types::{access_path::AccessPath, state_store::state_key::StateKey}; use bytes::Bytes; +use claims::assert_some; use move_binary_format::errors::PartialVMError; use move_core_types::{ account_address::AccountAddress, @@ -44,7 +47,7 @@ impl MockDB { /// . 3. Actual storage backend. 
#[derive(Debug, Default)] pub(crate) struct MockStateView { - mapping: RefCell>, + mapping: RefCell>, in_memory_cache: BTreeMap, db: MockDB, } @@ -58,9 +61,9 @@ impl MockStateView { self.db.store_bytes(state_key, blob.into()); } - pub(crate) fn add_mapping(&self, identifier: u64, v: Value) { + pub(crate) fn add_mapping(&self, unique_index: u32, width: u32, v: Value) { let mut mapping = self.mapping.borrow_mut(); - mapping.insert(identifier, v); + mapping.insert(DelayedFieldID::new_with_width(unique_index, width), v); } pub(crate) fn add_to_in_memory_cache( @@ -77,25 +80,39 @@ impl MockStateView { self.in_memory_cache.insert(state_key, blob.into()); } - pub(crate) fn assert_mapping_equal_at(&self, identifier: u64, expected_value: Value) { - assert!(self - .mapping - .borrow() - .get(&identifier) - .is_some_and(|actual_value| { actual_value.equals(&expected_value).unwrap() })); + pub(crate) fn assert_mapping_equal_at( + &self, + unique_index: u32, + width: u32, + expected_value: Value, + ) { + let mapping = self.mapping.borrow(); + let actual_value = + assert_some!(mapping.get(&DelayedFieldID::new_with_width(unique_index, width))); + + assert!( + actual_value.equals(&expected_value).unwrap(), + "actual_value: {:?}, expected_value: {:?}", + actual_value, + expected_value + ); } } impl ValueToIdentifierMapping for MockStateView { fn value_to_identifier( &self, - _kind: &IdentifierMappingKind, + kind: &IdentifierMappingKind, layout: &MoveTypeLayout, value: Value, ) -> TransformationResult { + let (_base_value, width) = + DelayedFieldValue::try_from_move_value(layout, value.copy_value()?, kind)?; + let mut mapping = self.mapping.borrow_mut(); - let identifier = mapping.len() as u64; - let identifier_value = DelayedFieldID::new(identifier) + let unique_index = mapping.len() as u32; + let identifier = DelayedFieldID::new_with_width(unique_index, width); + let identifier_value = identifier .try_into_move_value(layout) .map_err(PartialVMError::from)?; @@ -109,9 +126,9 @@ 
impl ValueToIdentifierMapping for MockStateView { identifier: Value, ) -> TransformationResult { let mapping = self.mapping.borrow(); - let identifier = DelayedFieldID::try_from_move_value(layout, identifier, &()) - .map_err(PartialVMError::from)? - .as_u64(); + let (identifier, width) = DelayedFieldID::try_from_move_value(layout, identifier, &()) + .map_err(PartialVMError::from)?; + assert_eq!(identifier.extract_width(), width); Ok(mapping .get(&identifier) diff --git a/aptos-move/aptos-vm/src/tests/test_resolver_with_identifier_mapping.rs b/aptos-move/aptos-vm/src/tests/test_resolver_with_identifier_mapping.rs index d33f677408227..d560b7b85c2a5 100644 --- a/aptos-move/aptos-vm/src/tests/test_resolver_with_identifier_mapping.rs +++ b/aptos-move/aptos-vm/src/tests/test_resolver_with_identifier_mapping.rs @@ -2,9 +2,14 @@ // SPDX-License-Identifier: Apache-2.0 use crate::tests::mock_view::MockStateView; -use aptos_aggregator::utils::{bytes_to_string, to_utf8_bytes}; use aptos_table_natives::{TableHandle, TableResolver}; -use aptos_types::{access_path::AccessPath, state_store::state_key::StateKey}; +use aptos_types::{ + access_path::AccessPath, + delayed_fields::{ + bytes_and_width_to_derived_string_struct, bytes_to_string, to_utf8_bytes, DelayedFieldID, + }, + state_store::state_key::StateKey, +}; use move_core_types::{ account_address::AccountAddress, language_storage::StructTag, @@ -15,6 +20,8 @@ use move_vm_types::values::{Struct, Value}; use once_cell::sync::Lazy; use std::{clone::Clone, str::FromStr}; +const DERIVED_STRING_TEST_WIDTH: u32 = 40; + macro_rules! test_struct { ($a:expr, $b:expr, $c:expr, $d:expr, $e:expr, $f:expr) => { Value::struct_(Struct::pack(vec![ @@ -23,10 +30,25 @@ macro_rules! test_struct { Value::u128($c), Value::u128($d), bytes_to_string(to_utf8_bytes($e)), - bytes_to_string(to_utf8_bytes($f)), + bytes_and_width_to_derived_string_struct(to_utf8_bytes($f), DERIVED_STRING_TEST_WIDTH as usize) + .unwrap(), + ])) + }; +} + +macro_rules! 
test_struct_with_id { + ($a:expr, $b:expr, $c:expr, $d:expr, $e:expr, $f:expr) => { + Value::struct_(Struct::pack(vec![ + Value::u64($a), + Value::u64($b), + Value::u128($c), + Value::u128($d), + bytes_to_string(to_utf8_bytes($e)), + $f.into_derived_string_struct().unwrap(), ])) }; } + static TEST_LAYOUT: Lazy = Lazy::new(|| { MoveTypeLayout::Struct(MoveStructLayout::Runtime(vec![ MoveTypeLayout::U64, @@ -43,8 +65,11 @@ static TEST_LAYOUT: Lazy = Lazy::new(|| { Box::new(MoveTypeLayout::U8), )])), MoveTypeLayout::Tagged( - LayoutTag::IdentifierMapping(IdentifierMappingKind::Aggregator), + LayoutTag::IdentifierMapping(IdentifierMappingKind::DerivedString), Box::new(MoveTypeLayout::Struct(MoveStructLayout::Runtime(vec![ + MoveTypeLayout::Struct(MoveStructLayout::Runtime(vec![MoveTypeLayout::Vector( + Box::new(MoveTypeLayout::U8), + )])), MoveTypeLayout::Vector(Box::new(MoveTypeLayout::U8)), ]))), ), @@ -91,11 +116,31 @@ fn test_resource_in_storage() { ) .unwrap(); let actual_value = Value::simple_deserialize(&blob.unwrap(), &TEST_LAYOUT).unwrap(); - let expected_value = test_struct!(100, 0, 300, 1, "foo", "00000000000000000002"); - assert!(actual_value.equals(&expected_value).unwrap()); - view.assert_mapping_equal_at(0, Value::u64(200)); - view.assert_mapping_equal_at(1, Value::u128(400)); - view.assert_mapping_equal_at(2, bytes_to_string(to_utf8_bytes("bar"))); + let expected_value = test_struct_with_id!( + 100, + DelayedFieldID::new_with_width(0, 8).as_u64(), + 300, + DelayedFieldID::new_with_width(1, 16).as_u64() as u128, + "foo", + DelayedFieldID::new_with_width(2, DERIVED_STRING_TEST_WIDTH) + ); + assert!( + actual_value.equals(&expected_value).unwrap(), + "actual_value: {:?}, expected_value: {:?}", + actual_value, + expected_value + ); + view.assert_mapping_equal_at(0, 8, Value::u64(200)); + view.assert_mapping_equal_at(1, 16, Value::u128(400)); + view.assert_mapping_equal_at( + 2, + DERIVED_STRING_TEST_WIDTH, + bytes_and_width_to_derived_string_struct( + 
to_utf8_bytes("bar"), + DERIVED_STRING_TEST_WIDTH as usize, + ) + .unwrap(), + ); } #[test] @@ -123,11 +168,31 @@ fn test_table_item_in_storage() { ) .unwrap(); let actual_value = Value::simple_deserialize(&blob.unwrap(), &TEST_LAYOUT).unwrap(); - let expected_value = test_struct!(100, 0, 300, 1, "foo", "00000000000000000002"); - assert!(actual_value.equals(&expected_value).unwrap()); - view.assert_mapping_equal_at(0, Value::u64(200)); - view.assert_mapping_equal_at(1, Value::u128(400)); - view.assert_mapping_equal_at(2, bytes_to_string(to_utf8_bytes("bar"))); + let expected_value = test_struct_with_id!( + 100, + DelayedFieldID::new_with_width(0, 8).as_u64(), + 300, + DelayedFieldID::new_with_width(1, 16).as_u64() as u128, + "foo", + DelayedFieldID::new_with_width(2, DERIVED_STRING_TEST_WIDTH) + ); + assert!( + actual_value.equals(&expected_value).unwrap(), + "actual_value: {:?}, expected_value: {:?}", + actual_value, + expected_value + ); + view.assert_mapping_equal_at(0, 8, Value::u64(200)); + view.assert_mapping_equal_at(1, 16, Value::u128(400)); + view.assert_mapping_equal_at( + 2, + DERIVED_STRING_TEST_WIDTH, + bytes_and_width_to_derived_string_struct( + to_utf8_bytes("bar"), + DERIVED_STRING_TEST_WIDTH as usize, + ) + .unwrap(), + ); } #[test] @@ -139,12 +204,20 @@ fn test_resource_in_memory_cache() { test_struct, (*TEST_LAYOUT).clone(), ); - view.add_mapping(0, Value::u64(200)); - view.add_mapping(1, Value::u128(400)); - view.add_mapping(2, bytes_to_string(to_utf8_bytes("bar"))); - view.assert_mapping_equal_at(0, Value::u64(200)); - view.assert_mapping_equal_at(1, Value::u128(400)); - view.assert_mapping_equal_at(2, bytes_to_string(to_utf8_bytes("bar"))); + view.add_mapping(0, 8, Value::u64(200)); + view.add_mapping(1, 16, Value::u128(400)); + view.add_mapping( + 2, + DERIVED_STRING_TEST_WIDTH, + bytes_to_string(to_utf8_bytes("bar")), + ); + view.assert_mapping_equal_at(0, 8, Value::u64(200)); + view.assert_mapping_equal_at(1, 16, Value::u128(400)); + 
view.assert_mapping_equal_at( + 2, + DERIVED_STRING_TEST_WIDTH, + bytes_to_string(to_utf8_bytes("bar")), + ); let (blob, _) = view .get_resource_bytes_with_metadata_and_layout(&TEST_ADDRESS, &TEST_RESOURCE_TAG, &[], None) @@ -175,12 +248,20 @@ fn test_table_item_in_memory_cache() { test_struct, (*TEST_LAYOUT).clone(), ); - view.add_mapping(0, Value::u64(200)); - view.add_mapping(1, Value::u128(400)); - view.add_mapping(2, bytes_to_string(to_utf8_bytes("bar"))); - view.assert_mapping_equal_at(0, Value::u64(200)); - view.assert_mapping_equal_at(1, Value::u128(400)); - view.assert_mapping_equal_at(2, bytes_to_string(to_utf8_bytes("bar"))); + view.add_mapping(0, 8, Value::u64(200)); + view.add_mapping(1, 16, Value::u128(400)); + view.add_mapping( + 2, + DERIVED_STRING_TEST_WIDTH, + bytes_to_string(to_utf8_bytes("bar")), + ); + view.assert_mapping_equal_at(0, 8, Value::u64(200)); + view.assert_mapping_equal_at(1, 16, Value::u128(400)); + view.assert_mapping_equal_at( + 2, + DERIVED_STRING_TEST_WIDTH, + bytes_to_string(to_utf8_bytes("bar")), + ); let blob = view .resolve_table_entry_bytes_with_layout(&TEST_TABLE_HANDLE, &TEST_TABLE_KEY, None) diff --git a/aptos-move/aptos-vm/src/tests/test_value_to_identifier_mapping.rs b/aptos-move/aptos-vm/src/tests/test_value_to_identifier_mapping.rs index 4921a64d4b791..0ff3d8a6b057c 100644 --- a/aptos-move/aptos-vm/src/tests/test_value_to_identifier_mapping.rs +++ b/aptos-move/aptos-vm/src/tests/test_value_to_identifier_mapping.rs @@ -2,6 +2,7 @@ // SPDX-License-Identifier: Apache-2.0 use crate::tests::mock_view::MockStateView; +use aptos_types::delayed_fields::DelayedFieldID; use claims::assert_none; use move_core_types::value::{ IdentifierMappingKind, LayoutTag, MoveStructLayout::Runtime, MoveTypeLayout, @@ -99,8 +100,10 @@ fn test_exchange_u64() { &layout, ) .unwrap(); - exchange.assert_mapping_equal_at(0, Value::u64(200)); - assert!(patched_value.equals(&Value::u64(0)).unwrap()); + exchange.assert_mapping_equal_at(0, 8, 
Value::u64(200)); + assert!(patched_value + .equals(&Value::u64(DelayedFieldID::new_with_width(0, 8).as_u64())) + .unwrap()); assert!(unpatched_value.equals(&input_value).unwrap()); } @@ -121,8 +124,12 @@ fn test_exchange_u128() { &layout, ) .unwrap(); - exchange.assert_mapping_equal_at(0, Value::u128(300)); - assert!(patched_value.equals(&Value::u128(0)).unwrap()); + exchange.assert_mapping_equal_at(0, 16, Value::u128(300)); + assert!(patched_value + .equals(&Value::u128( + DelayedFieldID::new_with_width(0, 16).as_u64() as u128 + )) + .unwrap()); assert!(unpatched_value.equals(&input_value).unwrap()); } @@ -155,12 +162,12 @@ fn test_exchange_works_inside_struct() { &layout, ) .unwrap(); - exchange.assert_mapping_equal_at(0, Value::u64(500)); - exchange.assert_mapping_equal_at(1, Value::u128(600)); + exchange.assert_mapping_equal_at(0, 8, Value::u64(500)); + exchange.assert_mapping_equal_at(1, 16, Value::u128(600)); let expected_patched_value = Value::struct_(Struct::pack(vec![ Value::u64(400), - Value::u64(0), - Value::u128(1), + Value::u64(DelayedFieldID::new_with_width(0, 8).as_u64()), + Value::u128(DelayedFieldID::new_with_width(1, 16).as_u64() as u128), ])); assert!(patched_value.equals(&expected_patched_value).unwrap()); assert!(unpatched_value.equals(&input_value).unwrap()); diff --git a/aptos-move/aptos-vm/src/validator_txns/dummy.rs b/aptos-move/aptos-vm/src/validator_txns/dummy.rs deleted file mode 100644 index 3bc40caa15887..0000000000000 --- a/aptos-move/aptos-vm/src/validator_txns/dummy.rs +++ /dev/null @@ -1,38 +0,0 @@ -// Copyright © Aptos Foundation - -use crate::{ - move_vm_ext::{AptosMoveResolver, SessionId}, - AptosVM, -}; -use aptos_types::{ - transaction::{ExecutionStatus, TransactionStatus}, - validator_txn::DummyValidatorTransaction, -}; -use aptos_vm_logging::log_schema::AdapterLogSchema; -use aptos_vm_types::output::VMOutput; -use move_core_types::vm_status::{AbortLocation, StatusCode, VMStatus}; - -impl AptosVM { - pub(crate) fn 
process_dummy_validator_txn( - &self, - _resolver: &impl AptosMoveResolver, - _log_context: &AdapterLogSchema, - _session_id: SessionId, - dummy_vtxn: DummyValidatorTransaction, - ) -> anyhow::Result<(VMStatus, VMOutput), VMStatus> { - let DummyValidatorTransaction { valid, .. } = dummy_vtxn; - if valid { - Ok(( - VMStatus::Executed, - VMOutput::empty_with_status(TransactionStatus::Keep(ExecutionStatus::Success)), - )) - } else { - Ok(( - VMStatus::MoveAbort(AbortLocation::Script, 0), - VMOutput::empty_with_status(TransactionStatus::Discard( - StatusCode::INVALID_SIGNATURE, - )), - )) - } - } -} diff --git a/aptos-move/aptos-vm/src/validator_txns/jwk.rs b/aptos-move/aptos-vm/src/validator_txns/jwk.rs index cdd9088d4e2dd..f302e3ee7e2fa 100644 --- a/aptos-move/aptos-vm/src/validator_txns/jwk.rs +++ b/aptos-move/aptos-vm/src/validator_txns/jwk.rs @@ -14,9 +14,7 @@ use crate::{ }, AptosVM, }; -use aptos_bitvec::BitVec; use aptos_types::{ - aggregate_signature::AggregateSignature, fee_statement::FeeStatement, jwks, jwks::{Issuer, ObservedJWKs, ProviderJWKs, QuorumCertifiedUpdate}, @@ -94,7 +92,6 @@ impl AptosVM { let verifier = ValidatorVerifier::from(&validator_set); let QuorumCertifiedUpdate { - authors, update: observed, multi_sig, } = update; @@ -104,27 +101,18 @@ impl AptosVM { return Err(Expected(IncorrectVersion)); } - let signer_bit_vec = BitVec::from( - verifier - .get_ordered_account_addresses() - .into_iter() - .map(|addr| authors.contains(&addr)) - .collect::>(), - ); - - // Verify multi-sig. - verifier - .verify_multi_signatures( - &observed, - &AggregateSignature::new(signer_bit_vec, Some(multi_sig)), - ) - .map_err(|_| Expected(MultiSigVerificationFailed))?; + let authors = multi_sig.get_signers_addresses(&verifier.get_ordered_account_addresses()); // Check voting power. verifier .check_voting_power(authors.iter(), true) .map_err(|_| Expected(NotEnoughVotingPower))?; + // Verify multi-sig. 
+ verifier + .verify_multi_signatures(&observed, &multi_sig) + .map_err(|_| Expected(MultiSigVerificationFailed))?; + // All verification passed. Apply the `observed`. let mut gas_meter = UnmeteredGasMeter; let mut session = self.new_session(resolver, session_id); diff --git a/aptos-move/aptos-vm/src/validator_txns/mod.rs b/aptos-move/aptos-vm/src/validator_txns/mod.rs index eea258ffcf08f..567fc9081c2cd 100644 --- a/aptos-move/aptos-vm/src/validator_txns/mod.rs +++ b/aptos-move/aptos-vm/src/validator_txns/mod.rs @@ -24,13 +24,9 @@ impl AptosVM { ValidatorTransaction::ObservedJWKUpdate(jwk_update) => { self.process_jwk_update(resolver, log_context, session_id, jwk_update) }, - ValidatorTransaction::DummyTopic1(dummy) | ValidatorTransaction::DummyTopic2(dummy) => { - self.process_dummy_validator_txn(resolver, log_context, session_id, dummy) - }, } } } mod dkg; -mod dummy; mod jwk; diff --git a/aptos-move/block-executor/src/captured_reads.rs b/aptos-move/block-executor/src/captured_reads.rs index f96606645ea7d..f12c690b3abc9 100644 --- a/aptos-move/block-executor/src/captured_reads.rs +++ b/aptos-move/block-executor/src/captured_reads.rs @@ -20,7 +20,7 @@ use aptos_mvhashmap::{ versioned_group_data::VersionedGroupData, }; use aptos_types::{ - aggregator::PanicError, state_store::state_value::StateValueMetadata, + delayed_fields::PanicError, state_store::state_value::StateValueMetadata, transaction::BlockExecutableTransaction as Transaction, write_set::TransactionWrite, }; use aptos_vm_types::resolver::ResourceGroupSize; diff --git a/aptos-move/block-executor/src/errors.rs b/aptos-move/block-executor/src/errors.rs index 8b23c93efe567..f88c279055e99 100644 --- a/aptos-move/block-executor/src/errors.rs +++ b/aptos-move/block-executor/src/errors.rs @@ -4,7 +4,7 @@ use aptos_aggregator::types::PanicOr; use aptos_mvhashmap::types::TxnIndex; -use aptos_types::aggregator::PanicError; +use aptos_types::delayed_fields::PanicError; #[derive(Clone, Debug, PartialEq, Eq)] pub enum 
IntentionalFallbackToSequential { diff --git a/aptos-move/block-executor/src/executor.rs b/aptos-move/block-executor/src/executor.rs index 06a34b415ef92..7e48e9b1b5f2c 100644 --- a/aptos-move/block-executor/src/executor.rs +++ b/aptos-move/block-executor/src/executor.rs @@ -32,9 +32,9 @@ use aptos_mvhashmap::{ MVHashMap, }; use aptos_types::{ - aggregator::PanicError, block_executor::config::BlockExecutorConfig, contract_event::TransactionEvent, + delayed_fields::PanicError, executable::Executable, on_chain_config::BlockGasLimitType, state_store::TStateView, @@ -242,10 +242,6 @@ where // Record the status indicating abort. ExecutionStatus::Abort(BlockExecutionError::FatalVMError((err, idx_to_execute))) }, - ExecutionStatus::DirectWriteSetTransactionNotCapableError => { - // TODO[agg_v2](fix) decide how to handle/propagate. - panic!("PayloadWriteSet::Direct transaction not alone in a block"); - }, ExecutionStatus::SpeculativeExecutionAbortError(msg) => { read_set.capture_delayed_field_read_error(&PanicOr::Or( MVDelayedFieldsError::DeltaApplicationFailure, @@ -637,7 +633,9 @@ where { Ok((bytes, _)) => bytes, Err(_) => { - unreachable!("Failed to replace identifiers with values") + unreachable!( + "Failed to replace identifiers with values, {layout:?}" + ) }, }; let mut patched_write_op = write_op; @@ -898,11 +896,6 @@ where ExecutionStatus::Abort(_) => { txn_commit_listener.on_execution_aborted(txn_idx); }, - ExecutionStatus::DirectWriteSetTransactionNotCapableError => { - // This should already be handled and fallback to sequential called, - // such a transaction should never reach this point. 
- panic!("Cannot be materializing with DirectWriteSetTransactionNotCapableError"); - }, ExecutionStatus::SpeculativeExecutionAbortError(msg) | ExecutionStatus::DelayedFieldsCodeInvariantError(msg) => { panic!("Cannot be materializing with {}", msg); @@ -916,11 +909,6 @@ where final_results[txn_idx as usize] = t; }, ExecutionStatus::Abort(_) => (), - ExecutionStatus::DirectWriteSetTransactionNotCapableError => { - panic!("Cannot be materializing with DirectWriteSetTransactionNotCapableError"); - // This should already be handled and fallback to sequential called, - // such a transaction should never reach this point. - }, ExecutionStatus::SpeculativeExecutionAbortError(msg) | ExecutionStatus::DelayedFieldsCodeInvariantError(msg) => { panic!("Cannot be materializing with {}", msg); @@ -1217,7 +1205,6 @@ where executor_arguments: E::Argument, signature_verified_block: &[T], base_view: &S, - dynamic_change_set_optimizations_enabled: bool, ) -> BlockExecutionResult, E::Error> { let num_txns = signature_verified_block.len(); let init_timer = VM_INIT_SECONDS.start_timer(); @@ -1239,12 +1226,7 @@ where for (idx, txn) in signature_verified_block.iter().enumerate() { let latest_view = LatestView::::new( base_view, - ViewState::Unsync(SequentialState::new( - &unsync_map, - start_counter, - &counter, - dynamic_change_set_optimizations_enabled, - )), + ViewState::Unsync(SequentialState::new(&unsync_map, start_counter, &counter)), idx as TxnIndex, ); let res = executor.execute_transaction(&latest_view, txn, idx as TxnIndex); @@ -1310,7 +1292,8 @@ where // TODO[agg_v2](fix): return code invariant error if dynamic change set optimizations disabled. 
Self::apply_output_sequential(&unsync_map, &output)?; - if dynamic_change_set_optimizations_enabled { + // If dynamic change set materialization part (indented for clarity/variable scope): + { let group_metadata_ops = output.resource_group_metadata_ops(); let mut finalized_groups = Vec::with_capacity(group_metadata_ops.len()); for (group_key, group_metadata_op) in group_metadata_ops.into_iter() { @@ -1394,9 +1377,9 @@ where .collect(), patched_events, ); - } else { - output.set_txn_output_for_non_dynamic_change_set(); } + // If dynamic change set is disabled, this can be used to assert nothing needs patching instead: + // output.set_txn_output_for_non_dynamic_change_set(); if latest_view.is_incorrect_use() { panic!("Incorrect use in sequential execution") @@ -1414,9 +1397,6 @@ where // Record the status indicating abort. return Err(BlockExecutionError::FatalVMError((err, idx as TxnIndex))); }, - ExecutionStatus::DirectWriteSetTransactionNotCapableError => { - panic!("PayloadWriteSet::Direct transaction not alone in a block, in sequential execution") - }, ExecutionStatus::SpeculativeExecutionAbortError(msg) => { panic!( "Sequential execution must not have SpeculativeExecutionAbortError: {:?}", @@ -1460,12 +1440,7 @@ where signature_verified_block: &[T], base_view: &S, ) -> BlockExecutionResult, E::Error> { - let dynamic_change_set_optimizations_enabled = signature_verified_block.len() != 1 - || E::is_transaction_dynamic_change_set_capable(&signature_verified_block[0]); - - let mut ret = if self.config.local.concurrency_level > 1 - && dynamic_change_set_optimizations_enabled - { + let mut ret = if self.config.local.concurrency_level > 1 { self.execute_transactions_parallel( executor_arguments, signature_verified_block, @@ -1476,7 +1451,6 @@ where executor_arguments, signature_verified_block, base_view, - dynamic_change_set_optimizations_enabled, ) }; @@ -1512,7 +1486,6 @@ where executor_arguments, signature_verified_block, base_view, - 
dynamic_change_set_optimizations_enabled, ); } } diff --git a/aptos-move/block-executor/src/limit_processor.rs b/aptos-move/block-executor/src/limit_processor.rs index d5b09662a47da..b2b1780acae2c 100644 --- a/aptos-move/block-executor/src/limit_processor.rs +++ b/aptos-move/block-executor/src/limit_processor.rs @@ -261,7 +261,7 @@ mod test { proptest_types::types::{KeyType, MockEvent, MockTransaction}, types::InputOutputKey, }; - use aptos_types::aggregator::DelayedFieldID; + use aptos_types::delayed_fields::DelayedFieldID; use std::collections::HashSet; // TODO: add tests for accumulate_fee_statement / compute_conflict_multiplier for different BlockGasLimitType configs @@ -351,9 +351,7 @@ mod test { .map(|key| match key { InputOutputKey::Resource(k) => InputOutputKey::Resource(KeyType(*k, false)), InputOutputKey::Group(k, t) => InputOutputKey::Group(KeyType(*k, false), *t), - InputOutputKey::DelayedField(i) => { - InputOutputKey::DelayedField(DelayedFieldID::new(*i)) - }, + InputOutputKey::DelayedField(i) => InputOutputKey::DelayedField((*i).into()), }) .collect() } diff --git a/aptos-move/block-executor/src/proptest_types/tests.rs b/aptos-move/block-executor/src/proptest_types/tests.rs index b51c877e223db..d2b54cbb68ced 100644 --- a/aptos-move/block-executor/src/proptest_types/tests.rs +++ b/aptos-move/block-executor/src/proptest_types/tests.rs @@ -573,7 +573,7 @@ fn non_empty_group( executor_thread_pool.clone(), None, ) - .execute_transactions_sequential((), &transactions, &data_view, true); + .execute_transactions_sequential((), &transactions, &data_view); // TODO: test dynamic disabled as well. 
BaselineOutput::generate(&transactions, None).assert_output(&output); diff --git a/aptos-move/block-executor/src/task.rs b/aptos-move/block-executor/src/task.rs index a49766eee6d97..8c6e4b0ee23ec 100644 --- a/aptos-move/block-executor/src/task.rs +++ b/aptos-move/block-executor/src/task.rs @@ -30,8 +30,6 @@ pub enum ExecutionStatus { /// Transaction was executed successfully, but will skip the execution of the trailing /// transactions in the list SkipRest(O), - /// There is a DirectWriteTransaction with resolver not capable to handle it. - DirectWriteSetTransactionNotCapableError, /// Transaction detected that it is in inconsistent state due to speculative /// reads it did, and needs to be re-executed. SpeculativeExecutionAbortError(String), diff --git a/aptos-move/block-executor/src/txn_last_input_output.rs b/aptos-move/block-executor/src/txn_last_input_output.rs index 131b5cdca92e0..d00efb903816a 100644 --- a/aptos-move/block-executor/src/txn_last_input_output.rs +++ b/aptos-move/block-executor/src/txn_last_input_output.rs @@ -133,7 +133,6 @@ impl, E: Debug + Send + Clone> output.module_write_set() }, ExecutionStatus::Abort(_) - | ExecutionStatus::DirectWriteSetTransactionNotCapableError | ExecutionStatus::SpeculativeExecutionAbortError(_) | ExecutionStatus::DelayedFieldsCodeInvariantError(_) => BTreeMap::new(), }; @@ -281,7 +280,6 @@ impl, E: Debug + Send + Clone> ), ), ExecutionStatus::Abort(_) - | ExecutionStatus::DirectWriteSetTransactionNotCapableError | ExecutionStatus::SpeculativeExecutionAbortError(_) | ExecutionStatus::DelayedFieldsCodeInvariantError(_) => None, }) @@ -298,7 +296,6 @@ impl, E: Debug + Send + Clone> Some(t.resource_write_set()) }, ExecutionStatus::Abort(_) - | ExecutionStatus::DirectWriteSetTransactionNotCapableError | ExecutionStatus::SpeculativeExecutionAbortError(_) | ExecutionStatus::DelayedFieldsCodeInvariantError(_) => None, }) @@ -316,7 +313,6 @@ impl, E: Debug + Send + Clone> Some(t.delayed_field_change_set().into_keys()) }, 
ExecutionStatus::Abort(_) - | ExecutionStatus::DirectWriteSetTransactionNotCapableError | ExecutionStatus::SpeculativeExecutionAbortError(_) | ExecutionStatus::DelayedFieldsCodeInvariantError(_) => None, }) @@ -334,7 +330,6 @@ impl, E: Debug + Send + Clone> Some(t.reads_needing_delayed_field_exchange()) }, ExecutionStatus::Abort(_) - | ExecutionStatus::DirectWriteSetTransactionNotCapableError | ExecutionStatus::SpeculativeExecutionAbortError(_) | ExecutionStatus::DelayedFieldsCodeInvariantError(_) => None, }) @@ -352,7 +347,6 @@ impl, E: Debug + Send + Clone> Some(t.group_reads_needing_delayed_field_exchange()) }, ExecutionStatus::Abort(_) - | ExecutionStatus::DirectWriteSetTransactionNotCapableError | ExecutionStatus::SpeculativeExecutionAbortError(_) | ExecutionStatus::DelayedFieldsCodeInvariantError(_) => None, }) @@ -366,7 +360,6 @@ impl, E: Debug + Send + Clone> t.aggregator_v1_delta_set().into_keys().collect() }, ExecutionStatus::Abort(_) - | ExecutionStatus::DirectWriteSetTransactionNotCapableError | ExecutionStatus::SpeculativeExecutionAbortError(_) | ExecutionStatus::DelayedFieldsCodeInvariantError(_) => vec![], }, @@ -381,7 +374,6 @@ impl, E: Debug + Send + Clone> t.resource_group_metadata_ops() }, ExecutionStatus::Abort(_) - | ExecutionStatus::DirectWriteSetTransactionNotCapableError | ExecutionStatus::SpeculativeExecutionAbortError(_) | ExecutionStatus::DelayedFieldsCodeInvariantError(_) => vec![], }, @@ -400,7 +392,6 @@ impl, E: Debug + Send + Clone> Box::new(events.into_iter()) }, ExecutionStatus::Abort(_) - | ExecutionStatus::DirectWriteSetTransactionNotCapableError | ExecutionStatus::SpeculativeExecutionAbortError(_) | ExecutionStatus::DelayedFieldsCodeInvariantError(_) => { Box::new(empty::<(T::Event, Option)>()) @@ -447,7 +438,6 @@ impl, E: Debug + Send + Clone> ); }, ExecutionStatus::Abort(_) - | ExecutionStatus::DirectWriteSetTransactionNotCapableError | ExecutionStatus::SpeculativeExecutionAbortError(_) | 
ExecutionStatus::DelayedFieldsCodeInvariantError(_) => {}, }; @@ -472,7 +462,6 @@ impl, E: Debug + Send + Clone> { ExecutionStatus::Success(t) | ExecutionStatus::SkipRest(t) => t.get_write_summary(), ExecutionStatus::Abort(_) - | ExecutionStatus::DirectWriteSetTransactionNotCapableError | ExecutionStatus::SpeculativeExecutionAbortError(_) | ExecutionStatus::DelayedFieldsCodeInvariantError(_) => HashSet::new(), } diff --git a/aptos-move/block-executor/src/view.rs b/aptos-move/block-executor/src/view.rs index 96f8cc9103e21..3a429e4b0f572 100644 --- a/aptos-move/block-executor/src/view.rs +++ b/aptos-move/block-executor/src/view.rs @@ -33,7 +33,7 @@ use aptos_mvhashmap::{ MVHashMap, }; use aptos_types::{ - aggregator::PanicError, + delayed_fields::{ExtractUniqueIndex, PanicError}, executable::{Executable, ModulePath}, state_store::{ errors::StateviewError, @@ -759,7 +759,6 @@ pub(crate) struct SequentialState<'a, T: Transaction, X: Executable> { pub(crate) read_set: RefCell>, pub(crate) start_counter: u32, pub(crate) counter: &'a RefCell, - pub(crate) dynamic_change_set_optimizations_enabled: bool, pub(crate) incorrect_use: RefCell, } @@ -768,14 +767,12 @@ impl<'a, T: Transaction, X: Executable> SequentialState<'a, T, X> { unsync_map: &'a UnsyncMap, start_counter: u32, counter: &'a RefCell, - dynamic_change_set_optimizations_enabled: bool, ) -> Self { Self { unsync_map, read_set: RefCell::new(UnsyncReadSet::default()), start_counter, counter, - dynamic_change_set_optimizations_enabled, incorrect_use: RefCell::new(false), } } @@ -1570,7 +1567,7 @@ impl<'a, T: Transaction, S: TStateView, X: Executable> TResourceGr fn is_resource_group_split_in_change_set_capable(&self) -> bool { match &self.latest_view { ViewState::Sync(_) => true, - ViewState::Unsync(state) => state.dynamic_change_set_optimizations_enabled, + ViewState::Unsync(_) => true, } } } @@ -1659,7 +1656,7 @@ impl<'a, T: Transaction, S: TStateView, X: Executable> TDelayedFie fn 
is_delayed_field_optimization_capable(&self) -> bool { match &self.latest_view { ViewState::Sync(_) => true, - ViewState::Unsync(state) => state.dynamic_change_set_optimizations_enabled, + ViewState::Unsync(_) => true, } } @@ -1721,31 +1718,37 @@ impl<'a, T: Transaction, S: TStateView, X: Executable> TDelayedFie } } - fn generate_delayed_field_id(&self) -> Self::Identifier { - match &self.latest_view { - ViewState::Sync(state) => (state.counter.fetch_add(1, Ordering::SeqCst) as u64).into(), + fn generate_delayed_field_id(&self, width: u32) -> Self::Identifier { + let index = match &self.latest_view { + ViewState::Sync(state) => state.counter.fetch_add(1, Ordering::SeqCst), ViewState::Unsync(state) => { let mut counter = state.counter.borrow_mut(); - let id = (*counter as u64).into(); + let id = *counter; *counter += 1; id }, - } + }; + + (index, width).into() } fn validate_and_convert_delayed_field_id( &self, id: u64, ) -> Result { + let result: Self::Identifier = id.into(); + + let unique_index = result.extract_unique_index(); + let start_counter = match &self.latest_view { ViewState::Sync(state) => state.start_counter, ViewState::Unsync(state) => state.start_counter, }; - if id < start_counter as u64 { + if unique_index < start_counter { return Err(code_invariant_error(format!( - "Invalid delayed field id: {}, we've started from {}", - id, start_counter + "Invalid delayed field id: {}, index: {}, we've started from {}", + id, unique_index, start_counter ))); } @@ -1754,14 +1757,14 @@ impl<'a, T: Transaction, S: TStateView, X: Executable> TDelayedFie ViewState::Unsync(state) => *state.counter.borrow(), }; - if id > current as u64 { + if unique_index > current { return Err(code_invariant_error(format!( - "Invalid delayed field id: {}, we've only reached to {}", - id, current + "Invalid delayed field id: {}, index: {}, we've only reached to {}", + id, unique_index, current ))); } - Ok(id.into()) + Ok(result) } // TODO[agg_v2](cleanup) - update comment. 
@@ -1844,8 +1847,8 @@ impl<'a, T: Transaction, S: TStateView, X: Executable> } } - fn generate_delayed_field_id(&self) -> T::Identifier { - self.latest_view.generate_delayed_field_id() + fn generate_delayed_field_id(&self, width: u32) -> T::Identifier { + self.latest_view.generate_delayed_field_id(width) } pub fn into_inner(self) -> HashSet { @@ -1865,8 +1868,8 @@ impl<'a, T: Transaction, S: TStateView, X: Executable> ValueToIden layout: &MoveTypeLayout, value: Value, ) -> TransformationResult { - let id = self.generate_delayed_field_id(); - let base_value = DelayedFieldValue::try_from_move_value(layout, value, kind)?; + let (base_value, width) = DelayedFieldValue::try_from_move_value(layout, value, kind)?; + let id = self.generate_delayed_field_id(width); match &self.latest_view.latest_view { ViewState::Sync(state) => state.set_delayed_field_value(id, base_value), ViewState::Unsync(state) => { @@ -1883,21 +1886,20 @@ impl<'a, T: Transaction, S: TStateView, X: Executable> ValueToIden layout: &MoveTypeLayout, identifier_value: Value, ) -> TransformationResult { - let id = T::Identifier::try_from_move_value(layout, identifier_value, &()) + let (id, width) = T::Identifier::try_from_move_value(layout, identifier_value, &()) .map_err(|e| TransformationError(format!("{:?}", e)))?; self.delayed_field_keys.borrow_mut().insert(id); - match &self.latest_view.latest_view { - ViewState::Sync(state) => Ok(state + Ok(match &self.latest_view.latest_view { + ViewState::Sync(state) => state .versioned_map .delayed_fields() .read_latest_committed_value(&id, self.txn_idx, ReadPosition::AfterCurrentTxn) - .expect("Committed value for ID must always exist") - .try_into_move_value(layout)?), - ViewState::Unsync(state) => Ok(state + .expect("Committed value for ID must always exist"), + ViewState::Unsync(state) => state .read_delayed_field(id) - .expect("Delayed field value for ID must always exist in sequential execution") - .try_into_move_value(layout)?), + .expect("Delayed field value 
for ID must always exist in sequential execution"), } + .try_into_move_value(layout, width)?) } } @@ -1926,7 +1928,7 @@ impl ValueToIdentifierMapping for TemporaryExtractIdentifiersMap layout: &MoveTypeLayout, value: Value, ) -> TransformationResult { - let id = T::Identifier::try_from_move_value(layout, value, &()) + let (id, _) = T::Identifier::try_from_move_value(layout, value, &()) .map_err(|e| TransformationError(format!("{:?}", e)))?; self.delayed_field_keys.borrow_mut().insert(id); id.try_into_move_value(layout) @@ -1938,7 +1940,7 @@ impl ValueToIdentifierMapping for TemporaryExtractIdentifiersMap layout: &MoveTypeLayout, identifier_value: Value, ) -> TransformationResult { - let id = T::Identifier::try_from_move_value(layout, identifier_value, &()) + let (id, _) = T::Identifier::try_from_move_value(layout, identifier_value, &()) .map_err(|e| TransformationError(format!("{:?}", e)))?; self.delayed_field_keys.borrow_mut().insert(id); id.try_into_move_value(layout) @@ -1970,7 +1972,7 @@ mod test { MVHashMap, }; use aptos_types::{ - aggregator::DelayedFieldID, + delayed_fields::{bytes_and_width_to_derived_string_struct, to_utf8_bytes, DelayedFieldID}, executable::Executable, state_store::{ errors::StateviewError, state_storage_usage::StateStorageUsage, @@ -2057,7 +2059,7 @@ mod test { let mut view = FakeVersionedDelayedFieldView::default(); let captured_reads = RefCell::new(CapturedReads::::new()); let wait_for = FakeWaitForDependency(); - let id = DelayedFieldID::new(600); + let id = DelayedFieldID::new_for_test_for_u64(600); let max_value = 600; let math = BoundedMath::new(max_value); let txn_idx = 1; @@ -2196,7 +2198,7 @@ mod test { let mut view = FakeVersionedDelayedFieldView::default(); let captured_reads = RefCell::new(CapturedReads::::new()); let wait_for = FakeWaitForDependency(); - let id = DelayedFieldID::new(600); + let id = DelayedFieldID::new_for_test_for_u64(600); let max_value = 600; let math = BoundedMath::new(max_value); let txn_idx = 1; @@ 
-2335,7 +2337,7 @@ mod test { let mut view = FakeVersionedDelayedFieldView::default(); let captured_reads = RefCell::new(CapturedReads::::new()); let wait_for = FakeWaitForDependency(); - let id = DelayedFieldID::new(600); + let id = DelayedFieldID::new_for_test_for_u64(600); let max_value = 600; let math = BoundedMath::new(max_value); let txn_idx = 1; @@ -2474,7 +2476,7 @@ mod test { let mut view = FakeVersionedDelayedFieldView::default(); let captured_reads = RefCell::new(CapturedReads::::new()); let wait_for = FakeWaitForDependency(); - let id = DelayedFieldID::new(600); + let id = DelayedFieldID::new_for_test_for_u64(600); let max_value = 600; let txn_idx = 1; let storage_value = 200; @@ -2546,6 +2548,16 @@ mod test { )])) } + fn create_derived_string_layout() -> MoveTypeLayout { + MoveTypeLayout::Tagged( + LayoutTag::IdentifierMapping(IdentifierMappingKind::DerivedString), + Box::new(MoveTypeLayout::Struct(MoveStructLayout::new(vec![ + create_string_layout(), + create_vector_layout(MoveTypeLayout::U8), + ]))), + ) + } + fn create_string_layout() -> MoveTypeLayout { MoveTypeLayout::Struct(MoveStructLayout::Runtime(vec![MoveTypeLayout::Vector( Box::new(MoveTypeLayout::U8), @@ -2560,6 +2572,10 @@ mod test { Value::struct_(Struct::pack(vec![value])) } + fn create_derived_value(value: impl ToString, width: usize) -> Value { + bytes_and_width_to_derived_string_struct(to_utf8_bytes(value), width).unwrap() + } + fn create_struct_value(inner: Value) -> Value { Value::struct_(Struct::pack(vec![inner])) } @@ -2568,12 +2584,6 @@ mod test { Value::vector_for_testing_only(inner) } - fn create_string_value(value: &str) -> Value { - Value::struct_(Struct::pack(vec![Value::vector_u8( - bcs::to_bytes(value).unwrap().to_vec(), - )])) - } - fn create_state_value(value: &Value, layout: &MoveTypeLayout) -> StateValue { StateValue::new_legacy(value.simple_serialize(layout).unwrap().into()) } @@ -2622,9 +2632,10 @@ mod test { let unsync_map = UnsyncMap::new(); let counter = 
RefCell::new(5); let base_view = MockStateView::new(HashMap::new()); + let start_counter = 5; let latest_view = LatestView::::new( &base_view, - ViewState::Unsync(SequentialState::new(&unsync_map, 5, &counter, true)), + ViewState::Unsync(SequentialState::new(&unsync_map, start_counter, &counter)), 1, ); @@ -2673,7 +2684,7 @@ mod test { "One identifier should have been replaced in this case" ); assert!( - identifiers.contains(&DelayedFieldID::new(5)), + identifiers.contains(&DelayedFieldID::new_with_width(5, 8)), "The value 25 should have been replaced in the identifier 5" ); let (final_state_value, identifiers) = latest_view @@ -2710,9 +2721,18 @@ mod test { ); let patched_value = Value::struct_(Struct::pack(vec![Value::vector_for_testing_only(vec![ - Value::struct_(Struct::pack(vec![Value::u64(6), Value::u64(50)])), - Value::struct_(Struct::pack(vec![Value::u64(7), Value::u64(65)])), - Value::struct_(Struct::pack(vec![Value::u64(8), Value::u64(20)])), + Value::struct_(Struct::pack(vec![ + Value::u64(DelayedFieldID::new_with_width(6, 8).as_u64()), + Value::u64(50), + ])), + Value::struct_(Struct::pack(vec![ + Value::u64(DelayedFieldID::new_with_width(7, 8).as_u64()), + Value::u64(65), + ])), + Value::struct_(Struct::pack(vec![ + Value::u64(DelayedFieldID::new_with_width(8, 8).as_u64()), + Value::u64(20), + ])), ])])); assert_eq!( patched_state_value, @@ -2754,9 +2774,15 @@ mod test { ); let patched_value = Value::struct_(Struct::pack(vec![Value::vector_for_testing_only(vec![ - create_snapshot_value(Value::u128(9)), - create_snapshot_value(Value::u128(10)), - create_snapshot_value(Value::u128(11)), + create_snapshot_value(Value::u128( + DelayedFieldID::new_with_width(9, 16).as_u64() as u128 + )), + create_snapshot_value(Value::u128( + DelayedFieldID::new_with_width(10, 16).as_u64() as u128 + )), + create_snapshot_value(Value::u128( + DelayedFieldID::new_with_width(11, 16).as_u64() as u128 + )), ])])); assert_eq!( patched_state_value, @@ -2774,16 +2800,14 @@ mod test 
{ /* layout = Struct { - snap: vec![AggregatorSnapshot] + snap: vec![DerivedStringSnapshot] } */ - let layout = create_struct_layout(create_vector_layout(create_snapshot_layout( - create_string_layout(), - ))); + let layout = create_struct_layout(create_vector_layout(create_derived_string_layout())); let value = create_struct_value(create_vector_value(vec![ - create_snapshot_value(create_string_value("hello")), - create_snapshot_value(create_string_value("ab")), - create_snapshot_value(create_string_value("c")), + create_derived_value("hello", 60), + create_derived_value("ab", 55), + create_derived_value("c", 50), ])); let state_value = StateValue::new_legacy(value.simple_serialize(&layout).unwrap().into()); let (patched_state_value, identifiers) = latest_view @@ -2797,17 +2821,23 @@ mod test { counter == RefCell::new(15), "The counter should have been updated to 15" ); - // TODO: This assertion is failing. The replaced identifier is not BCS encoded. - // let patched_value = Value::struct_(Struct::pack(vec![ - // Value::vector_for_testing_only(vec![ - // Value::struct_(Struct::pack(vec![Value::struct_(Struct::pack(vec![Value::vector_u8(bcs::to_bytes("12").unwrap().to_vec())]))])), - // Value::struct_(Struct::pack(vec![Value::struct_(Struct::pack(vec![Value::vector_u8(bcs::to_bytes("13").unwrap().to_vec())]))])), - // Value::struct_(Struct::pack(vec![Value::struct_(Struct::pack(vec![Value::vector_u8(bcs::to_bytes("14").unwrap().to_vec())]))])), - // ])])); - // assert_eq!( - // patched_state_value, - // StateValue::new_legacy(patched_value.simple_serialize(&layout).unwrap().into()) - // ); + + let patched_value = + Value::struct_(Struct::pack(vec![Value::vector_for_testing_only(vec![ + DelayedFieldID::new_with_width(12, 60) + .into_derived_string_struct() + .unwrap(), + DelayedFieldID::new_with_width(13, 55) + .into_derived_string_struct() + .unwrap(), + DelayedFieldID::new_with_width(14, 50) + .into_derived_string_struct() + .unwrap(), + ])])); + assert_eq!( + 
patched_state_value, + StateValue::new_legacy(patched_value.simple_serialize(&layout).unwrap().into()) + ); let (final_state_value, identifiers2) = latest_view .replace_identifiers_with_values(patched_state_value.bytes(), &layout) .unwrap(); @@ -2840,15 +2870,9 @@ mod test { fn create_sequential_latest_view<'a>( h: &'a Holder, - dynamic_change_set_optimizations_enabled: bool, ) -> LatestView<'a, TestTransactionType, MockStateView, MockExecutable> { let sequential_state: SequentialState<'a, TestTransactionType, MockExecutable> = - SequentialState::new( - &h.unsync_map, - *h.counter.borrow(), - &h.counter, - dynamic_change_set_optimizations_enabled, - ); + SequentialState::new(&h.unsync_map, *h.counter.borrow(), &h.counter); LatestView::<'a, TestTransactionType, MockStateView, MockExecutable>::new( &h.base_view, @@ -2885,7 +2909,7 @@ mod test { } fn new_view(&self) -> ViewsComparison<'_> { - let latest_view_seq = create_sequential_latest_view(&self.holder, true); + let latest_view_seq = create_sequential_latest_view(&self.holder); let latest_view_par = LatestView::::new( &self.base_view, @@ -3096,10 +3120,13 @@ mod test { let state_value = create_state_value(&value, &layout); let data = HashMap::from([(KeyType::(1, false), state_value.clone())]); - let holder = ComparisonHolder::new(data, 1000); + let start_counter = 1000; + let id = DelayedFieldID::new_with_width(start_counter, 8); + + let holder = ComparisonHolder::new(data, start_counter); let views = holder.new_view(); - let patched_value = create_struct_value(create_aggregator_value_u64(1000, 30)); + let patched_value = create_struct_value(create_aggregator_value_u64(id.as_u64(), 30)); let patched_state_value = create_state_value(&patched_value, &layout); match check_metadata { @@ -3119,10 +3146,7 @@ mod test { Some(patched_state_value.clone()) ); assert!(views - .get_reads_needing_exchange( - &HashSet::from([DelayedFieldID::new(1000)]), - &HashSet::new() - ) + .get_reads_needing_exchange(&HashSet::from([id]), 
&HashSet::new()) .unwrap() .contains_key(&KeyType(1, false))); assert_fetch_eq( @@ -3151,7 +3175,9 @@ mod test { let state_value_4 = StateValue::new_legacy(value.simple_serialize(&layout).unwrap().into()); data.insert(KeyType::(4, false), state_value_4); - let holder = ComparisonHolder::new(data, 1000); + let start_counter = 1000; + let id = DelayedFieldID::new_with_width(start_counter, 8); + let holder = ComparisonHolder::new(data, start_counter); let views = holder.new_view(); assert_eq!( @@ -3173,7 +3199,7 @@ mod test { Some(state_value_3.clone()) ); - //TODO: This is printing Ok(Versioned(Err(StorageVersion), ValueType { bytes: Some(b"!0\0\0\0\0\0\0"), metadata: None }, None)) + // TODO[agg_v2](fix): This is printing Ok(Versioned(Err(StorageVersion), ValueType { bytes: Some(b"!0\0\0\0\0\0\0"), metadata: None }, None)) // Is Err(StorageVersion) expected here? println!( "data: {:?}", @@ -3183,7 +3209,7 @@ mod test { .fetch_data(&KeyType::(3, false), 1) ); - let patched_value = create_struct_value(create_aggregator_value_u64(1000, 30)); + let patched_value = create_struct_value(create_aggregator_value_u64(id.as_u64(), 30)); let state_value_4 = StateValue::new_legacy(patched_value.simple_serialize(&layout).unwrap().into()); assert_eq!( @@ -3195,12 +3221,12 @@ mod test { // When we throw exception, it is not required read summaries to match, as they will not be used // assert_err_eq!( - // views.get_delayed_field_value(&DelayedFieldID::new(1005)), - // PanicOr::Or(DelayedFieldsSpeculativeError::NotFound(DelayedFieldID::new(1005))), + // views.get_delayed_field_value(&DelayedFieldID::new_for_test_for_u64(1005)), + // PanicOr::Or(DelayedFieldsSpeculativeError::NotFound(DelayedFieldID::new_for_test_for_u64(1005))), // ); assert_ok_eq!( - views.get_delayed_field_value(&DelayedFieldID::new(1000)), + views.get_delayed_field_value(&id), DelayedFieldValue::Aggregator(25), ); @@ -3208,14 +3234,14 @@ mod test { 
assert!(captured_reads.validate_data_reads(holder.versioned_map.data(), 1)); let read_set_with_delayed_fields = captured_reads.get_read_values_with_delayed_fields(); - // TODO: This prints + // TODO[agg_v2](fix): This prints // read: (KeyType(4, false), Versioned(Err(StorageVersion), Some(Struct(Runtime([Struct(Runtime([Tagged(IdentifierMapping(Aggregator), U64), U64]))]))))) // read: (KeyType(2, false), Versioned(Err(StorageVersion), Some(Struct(Runtime([Struct(Runtime([Tagged(IdentifierMapping(Aggregator), U64), U64]))]))))) for read in read_set_with_delayed_fields { println!("read: {:?}", read); } - // TODO: This assertion fails. + // TODO[agg_v2](fix): This assertion fails. // let data_read = DataRead::Versioned(Ok((1,0)), Arc::new(TransactionWrite::from_state_value(Some(state_value_4))), Some(Arc::new(layout))); // assert!(read_set_with_delayed_fields.any(|x| x == (&KeyType::(4, false), &data_read))); } diff --git a/aptos-move/e2e-move-tests/proptest-regressions/tests/aggregator_v2.txt b/aptos-move/e2e-move-tests/proptest-regressions/tests/aggregator_v2.txt new file mode 100644 index 0000000000000..200c42d0eb167 --- /dev/null +++ b/aptos-move/e2e-move-tests/proptest-regressions/tests/aggregator_v2.txt @@ -0,0 +1,7 @@ +# Seeds for failure cases proptest has generated in the past. It is +# automatically read and these particular cases re-run before any +# novel cases are generated. +# +# It is recommended to check this file in to source control so that +# everyone who runs the test benefits from these saved cases. 
+cc 9e3841acf3fe31aa057dcaee1fffe59461ac64f0637761cf45090eebdd55e802 # shrinks to test_env = TestEnvConfig { executor_mode: BothComparison, aggregator_execution_mode: DisabledOnly, block_split: Whole } diff --git a/aptos-move/e2e-move-tests/src/aggregator_v2.rs b/aptos-move/e2e-move-tests/src/aggregator_v2.rs index 54ce101c6057f..c0f9fa33e5173 100644 --- a/aptos-move/e2e-move-tests/src/aggregator_v2.rs +++ b/aptos-move/e2e-move-tests/src/aggregator_v2.rs @@ -163,6 +163,12 @@ impl AggregatorLocation { } } +pub enum StructType { + Aggregator, + Snapshot, + DerivedString, +} + impl AggV2TestHarness { pub fn run_block_in_parts_and_check( &mut self, @@ -210,17 +216,15 @@ impl AggV2TestHarness { account: Option<&Account>, use_type: UseType, element_type: ElementType, - aggregator: bool, + struct_type: StructType, ) -> SignedTransaction { self.harness.create_entry_function( account.unwrap_or(&self.account), - str::parse( - if aggregator { - "0x1::aggregator_v2_test::init_aggregator" - } else { - "0x1::aggregator_v2_test::init_snapshot" - }, - ) + str::parse(match struct_type { + StructType::Aggregator => "0x1::aggregator_v2_test::init_aggregator", + StructType::Snapshot => "0x1::aggregator_v2_test::init_snapshot", + StructType::DerivedString => "0x1::aggregator_v2_test::init_derived_string", + }) .unwrap(), vec![element_type.get_type_tag()], vec![bcs::to_bytes(&(use_type as u32)).unwrap()], @@ -267,6 +271,16 @@ impl AggV2TestHarness { ) } + pub fn check_derived( + &mut self, + snap_loc: &AggregatorLocation, + expected: u128, + ) -> SignedTransaction { + self.create_entry_agg_func_with_args("0x1::aggregator_v2_test::check_derived", snap_loc, &[ + expected, + ]) + } + #[allow(clippy::new_ret_no_self)] pub fn new(&mut self, agg_loc: &AggregatorLocation, max_value: u128) -> SignedTransaction { self.create_entry_agg_func_with_args("0x1::aggregator_v2_test::new", agg_loc, &[max_value]) @@ -452,27 +466,6 @@ impl AggV2TestHarness { } // idempotent verify functions: - - pub fn 
verify_copy_snapshot(&mut self) -> SignedTransaction { - self.txn_index += 1; - self.harness.create_entry_function( - &self.txn_accounts[self.txn_index % self.txn_accounts.len()], - str::parse("0x1::aggregator_v2_test::verify_copy_snapshot").unwrap(), - vec![], - vec![], - ) - } - - pub fn verify_copy_string_snapshot(&mut self) -> SignedTransaction { - self.txn_index += 1; - self.harness.create_entry_function( - &self.txn_accounts[self.txn_index % self.txn_accounts.len()], - str::parse("0x1::aggregator_v2_test::verify_copy_string_snapshot").unwrap(), - vec![], - vec![], - ) - } - pub fn verify_string_concat(&mut self) -> SignedTransaction { self.txn_index += 1; self.harness.create_entry_function( @@ -482,14 +475,4 @@ impl AggV2TestHarness { vec![], ) } - - pub fn verify_string_snapshot_concat(&mut self) -> SignedTransaction { - self.txn_index += 1; - self.harness.create_entry_function( - &self.txn_accounts[self.txn_index % self.txn_accounts.len()], - str::parse("0x1::aggregator_v2_test::verify_string_snapshot_concat").unwrap(), - vec![], - vec![], - ) - } } diff --git a/aptos-move/e2e-move-tests/src/tests/aggregator_v2.data/pack/sources/aggregator_v2_test.move b/aptos-move/e2e-move-tests/src/tests/aggregator_v2.data/pack/sources/aggregator_v2_test.move index f31cb886801c4..c9002fba848d3 100644 --- a/aptos-move/e2e-move-tests/src/tests/aggregator_v2.data/pack/sources/aggregator_v2_test.move +++ b/aptos-move/e2e-move-tests/src/tests/aggregator_v2.data/pack/sources/aggregator_v2_test.move @@ -1,5 +1,5 @@ module 0x1::aggregator_v2_test { - use aptos_framework::aggregator_v2::{Self, Aggregator, AggregatorSnapshot}; + use aptos_framework::aggregator_v2::{Self, Aggregator, AggregatorSnapshot, DerivedStringSnapshot}; use aptos_std::debug; use aptos_std::table::{Self, Table}; use std::vector; @@ -15,6 +15,12 @@ module 0x1::aggregator_v2_test { const EINVALID_ARG: u64 = 18; + const ERESOURCE_DOESNT_EXIST: u64 = 19; + const ETABLE_DOESNT_EXIST: u64 = 20; + const 
ERESOURCE_GROUP_DOESNT_EXIST: u64 = 21; + const EINDEX_DOESNT_EXIST: u64 = 22; + const EOPTION_DOESNT_EXIST: u64 = 23; + struct AggregatorInResource has key, store { data: vector>, } @@ -35,36 +41,16 @@ module 0x1::aggregator_v2_test { data: vector>, } - public entry fun verify_copy_snapshot() { - let snapshot = aggregator_v2::create_snapshot(42); - let snapshot2 = aggregator_v2::copy_snapshot(&snapshot); - assert!(aggregator_v2::read_snapshot(&snapshot) == 42, 1); - assert!(aggregator_v2::read_snapshot(&snapshot2) == 42, 2); - } - - public entry fun verify_copy_string_snapshot() { - let snapshot = aggregator_v2::create_snapshot(std::string::utf8(b"42")); - let snapshot2 = aggregator_v2::copy_snapshot(&snapshot); - assert!(aggregator_v2::read_snapshot(&snapshot) == std::string::utf8(b"42"), 3); - assert!(aggregator_v2::read_snapshot(&snapshot2) == std::string::utf8(b"42"), 4); - } - public entry fun verify_string_concat() { let snapshot = aggregator_v2::create_snapshot(42); - let snapshot2 = aggregator_v2::string_concat(std::string::utf8(b"before"), &snapshot, std::string::utf8(b"after")); - let val = aggregator_v2::read_snapshot(&snapshot2); + let snapshot2 = aggregator_v2::derive_string_concat(std::string::utf8(b"before"), &snapshot, std::string::utf8(b"after")); + let val = aggregator_v2::read_derived_string(&snapshot2); debug::print(&val); debug::print(&std::string::utf8(b"before42after")); assert!(val == std::string::utf8(b"before42after"), 5); } - public entry fun verify_string_snapshot_concat() { - let snapshot = aggregator_v2::create_snapshot(std::string::utf8(b"42")); - let snapshot2 = aggregator_v2::string_concat(std::string::utf8(b"before"), &snapshot, std::string::utf8(b"after")); - assert!(aggregator_v2::read_snapshot(&snapshot2) == std::string::utf8(b"before42after"), 6); - } - fun init(account: &signer, use_type: u32) { if (use_type == USE_RESOURCE_TYPE) { move_to(account, AggregatorInResource { data: vector::empty() }); @@ -85,12 +71,18 @@ module 
0x1::aggregator_v2_test { init>(account, use_type); } + public entry fun init_derived_string(account: &signer, use_type: u32) { + init(account, use_type); + } + fun insert(account_addr: address, use_type: u32, i: u64, e: Agg) acquires AggregatorInResource, AggregatorInTable, AggregatorInResourceGroup { assert!(use_type == USE_RESOURCE_TYPE || use_type == USE_TABLE_TYPE || use_type == USE_RESOURCE_GROUP_TYPE, EINVALID_ARG); let vector_data = if (use_type == USE_RESOURCE_TYPE) { + assert!(exists>(account_addr), ERESOURCE_DOESNT_EXIST); &mut borrow_global_mut>(account_addr).data } else if (use_type == USE_TABLE_TYPE) { + assert!(exists>(account_addr), ETABLE_DOESNT_EXIST); let data = &mut borrow_global_mut>(account_addr).data; let outer = i / 10; let inner = i % 10; @@ -101,6 +93,7 @@ module 0x1::aggregator_v2_test { table::borrow_mut(data, outer) } else { // if (use_type == USE_RESOURCE_GROUP_TYPE) { + assert!(exists>(account_addr), ERESOURCE_GROUP_DOESNT_EXIST); &mut borrow_global_mut>(account_addr).data }; @@ -115,18 +108,23 @@ module 0x1::aggregator_v2_test { inline fun for_element_ref(account_addr: address, use_type: u32, i: u64, f: |&Agg|R): R acquires AggregatorInResource, AggregatorInTable, AggregatorInResourceGroup { assert!(use_type == USE_RESOURCE_TYPE || use_type == USE_TABLE_TYPE || use_type == USE_RESOURCE_GROUP_TYPE, EINVALID_ARG); let vector_data = if (use_type == USE_RESOURCE_TYPE) { + assert!(exists>(account_addr), ERESOURCE_DOESNT_EXIST); &borrow_global>(account_addr).data } else if (use_type == USE_TABLE_TYPE) { - let data = &borrow_global>(account_addr).data; + assert!(exists>(account_addr), ETABLE_DOESNT_EXIST); + let data = &borrow_global>(account_addr).data; let outer = i / 10; let inner = i % 10; i = inner; table::borrow(data, outer) } else { // if (use_type == USE_RESOURCE_GROUP_TYPE) { + assert!(exists>(account_addr), ERESOURCE_GROUP_DOESNT_EXIST); &borrow_global>(account_addr).data }; + assert!(vector::length(vector_data) > i, 
EINDEX_DOESNT_EXIST); let option_data = vector::borrow(vector_data, i); + assert!(option::is_some(option_data), EOPTION_DOESNT_EXIST); let value = option::borrow(option_data); f(value) @@ -262,10 +260,10 @@ module 0x1::aggregator_v2_test { } public entry fun concat(_account: &signer, addr_i: address, use_type_i: u32, i: u64, addr_j: address, use_type_j: u32, j: u64, prefix: String, suffix: String) acquires AggregatorInResource, AggregatorInTable, AggregatorInResourceGroup { - let snapshot = for_element_ref, AggregatorSnapshot>(addr_i, use_type_i, i, |snapshot| { - aggregator_v2::string_concat(prefix, snapshot, suffix) + let snapshot = for_element_ref, DerivedStringSnapshot>(addr_i, use_type_i, i, |snapshot| { + aggregator_v2::derive_string_concat(prefix, snapshot, suffix) }); - insert>(addr_j, use_type_j, j, snapshot); + insert(addr_j, use_type_j, j, snapshot); } public entry fun read_snapshot(_account: &signer, addr: address, use_type: u32, i: u64) acquires AggregatorInResource, AggregatorInTable, AggregatorInResourceGroup { @@ -277,6 +275,11 @@ module 0x1::aggregator_v2_test { assert!(actual == expected, ENOT_EQUAL) } + public entry fun check_derived(_account: &signer, addr: address, use_type: u32, i: u64, expected: String) acquires AggregatorInResource, AggregatorInTable, AggregatorInResourceGroup { + let actual = for_element_ref(addr, use_type, i, |snapshot| aggregator_v2::read_derived_string(snapshot)); + assert!(actual == expected, ENOT_EQUAL) + } + public entry fun add_and_read_snapshot(_account: &signer, addr: address, use_type: u32, i: u64, value: Element) acquires AggregatorInResource, AggregatorInTable, AggregatorInResourceGroup { for_element_mut, Element>(addr, use_type, i, |aggregator| { aggregator_v2::add(aggregator, value); diff --git a/aptos-move/e2e-move-tests/src/tests/aggregator_v2.rs b/aptos-move/e2e-move-tests/src/tests/aggregator_v2.rs index 0410130b7025b..ac106efd5f895 100644 --- a/aptos-move/e2e-move-tests/src/tests/aggregator_v2.rs +++ 
b/aptos-move/e2e-move-tests/src/tests/aggregator_v2.rs @@ -4,14 +4,11 @@ use crate::{ aggregator_v2::{ initialize, initialize_enabled_disabled_comparison, AggV2TestHarness, AggregatorLocation, - ElementType, UseType, + ElementType, StructType, UseType, }, tests::common, BlockSplit, SUCCESS, }; -use aptos_framework::natives::aggregator_natives::aggregator_v2::{ - EAGGREGATOR_FUNCTION_NOT_YET_SUPPORTED, EUNSUPPORTED_AGGREGATOR_SNAPSHOT_TYPE, -}; use aptos_language_e2e_tests::executor::ExecutorMode; use proptest::prelude::*; @@ -41,26 +38,6 @@ fn setup( mod test_cases { use super::*; - #[test] - fn test_copy_snapshot() { - let mut h = setup(DEFAULT_EXECUTOR_MODE, AggregatorMode::BothComparison, 1); - let txn = h.verify_copy_snapshot(); - h.run_block_in_parts_and_check(BlockSplit::Whole, vec![( - EAGGREGATOR_FUNCTION_NOT_YET_SUPPORTED, - txn, - )]); - } - - #[test] - fn test_copy_string_snapshot() { - let mut h = setup(DEFAULT_EXECUTOR_MODE, AggregatorMode::BothComparison, 1); - let txn = h.verify_copy_string_snapshot(); - h.run_block_in_parts_and_check(BlockSplit::Whole, vec![( - EAGGREGATOR_FUNCTION_NOT_YET_SUPPORTED, - txn, - )]); - } - #[test] fn test_snapshot_concat() { let mut h = setup(DEFAULT_EXECUTOR_MODE, AggregatorMode::BothComparison, 1); @@ -68,16 +45,6 @@ mod test_cases { h.run_block_in_parts_and_check(BlockSplit::Whole, vec![(SUCCESS, txn)]); } - #[test] - fn test_string_snapshot_concat() { - let mut h = setup(DEFAULT_EXECUTOR_MODE, AggregatorMode::BothComparison, 1); - let txn = h.verify_string_snapshot_concat(); - h.run_block_in_parts_and_check(BlockSplit::Whole, vec![( - EUNSUPPORTED_AGGREGATOR_SNAPSHOT_TYPE, - txn, - )]); - } - #[test] fn test_aggregators_e2e() { println!("Testing test_aggregators_e2e"); @@ -86,7 +53,7 @@ mod test_cases { let mut h = setup(DEFAULT_EXECUTOR_MODE, AggregatorMode::BothComparison, 100); - let init_txn = h.init(None, use_type, element_type, true); + let init_txn = h.init(None, use_type, element_type, 
StructType::Aggregator); h.run_block_in_parts_and_check(BlockSplit::Whole, vec![(SUCCESS, init_txn)]); let addr = *h.account.address(); @@ -256,7 +223,7 @@ proptest! { let agg_loc = AggregatorLocation::new(*h.account.address(), element_type, use_type, 0); let txns = vec![ - (SUCCESS, h.init(None, use_type, element_type, true)), + (SUCCESS, h.init(None, use_type, element_type, StructType::Aggregator)), (SUCCESS, h.new(&agg_loc, 1500)), (SUCCESS, h.add(&agg_loc, 400)), // 400 (SUCCESS, h.materialize(&agg_loc)), @@ -306,9 +273,9 @@ proptest! { println!("agg_3_loc: {:?}", agg_3_loc); let txns = vec![ - (SUCCESS, h.init(None, use_type, element_type, true)), - (SUCCESS, h.init(Some(&acc_2), use_type, element_type, true)), - (SUCCESS, h.init(Some(&acc_3), use_type, element_type, true)), + (SUCCESS, h.init(None, use_type, element_type, StructType::Aggregator)), + (SUCCESS, h.init(Some(&acc_2), use_type, element_type, StructType::Aggregator)), + (SUCCESS, h.init(Some(&acc_3), use_type, element_type, StructType::Aggregator)), (SUCCESS, h.new_add(&agg_1_loc, 10, 5)), (SUCCESS, h.new_add(&agg_2_loc, 10, 5)), (SUCCESS, h.new_add(&agg_3_loc, 10, 5)), // 5, 5, 5 @@ -358,7 +325,7 @@ proptest! { let agg_loc = AggregatorLocation::new(*h.account.address(), element_type, use_type, 0); let txns = vec![ - (SUCCESS, h.init(None, use_type, element_type, true)), + (SUCCESS, h.init(None, use_type, element_type, StructType::Aggregator)), (SUCCESS, h.new(&agg_loc, 600)), (SUCCESS, h.add(&agg_loc, 400)), // Value dropped below zero - abort with EAGGREGATOR_UNDERFLOW. @@ -381,7 +348,7 @@ proptest! { let agg_loc = AggregatorLocation::new(*h.account.address(), element_type, use_type, 0); let txns = vec![ - (SUCCESS, h.init(None, use_type, element_type, true)), + (SUCCESS, h.init(None, use_type, element_type, StructType::Aggregator)), (SUCCESS, h.new(&agg_loc, 600)), // Underflow on materialized value leads to abort with EAGGREGATOR_UNDERFLOW. 
(EAGGREGATOR_UNDERFLOW, h.materialize_and_sub(&agg_loc, 400)), @@ -404,7 +371,7 @@ proptest! { let agg_loc = AggregatorLocation::new(*h.account.address(), element_type, use_type, 0); let txns = vec![ - (SUCCESS, h.init(None, use_type, element_type, true)), + (SUCCESS, h.init(None, use_type, element_type, StructType::Aggregator)), (SUCCESS, h.new_add(&agg_loc, 600, 400)), // Limit exceeded - abort with EAGGREGATOR_OVERFLOW. (EAGGREGATOR_OVERFLOW, h.add(&agg_loc, 201)) @@ -427,7 +394,7 @@ proptest! { let agg_loc = AggregatorLocation::new(*h.account.address(), element_type, use_type, 0); let txns = vec![ - (SUCCESS, h.init(None, use_type, element_type, true)), + (SUCCESS, h.init(None, use_type, element_type, StructType::Aggregator)), (SUCCESS, h.new(&agg_loc, 399)), // Overflow on materialized value leads to abort with EAGGREGATOR_OVERFLOW. (EAGGREGATOR_OVERFLOW, h.materialize_and_add(&agg_loc, 400)), @@ -439,31 +406,29 @@ proptest! { ); } - // TODO[agg_v2](fix) Until string snapshot serialization is fixed, this cannot work. 
- // So lines with derived_snap_loc are commented out, and 9 changed to 7 #[test] - fn test_aggregator_snapshot(test_env in arb_test_env_non_equivalent(7)) { + fn test_aggregator_snapshot(test_env in arb_test_env_non_equivalent(10)) { println!("Testing test_aggregator_snapshot {:?}", test_env); let element_type = ElementType::U64; let use_type = UseType::UseResourceType; - let mut h = setup(test_env.executor_mode, test_env.aggregator_execution_mode, 7); + let mut h = setup(test_env.executor_mode, test_env.aggregator_execution_mode, 10); let agg_loc = AggregatorLocation::new(*h.account.address(), element_type, use_type, 0); let snap_loc = AggregatorLocation::new(*h.account.address(), element_type, use_type, 0); - // let derived_snap_loc = AggregatorLocation::new(*h.account.address(), ElementType::String, use_type, 0); + let derived_snap_loc = AggregatorLocation::new(*h.account.address(), ElementType::String, use_type, 0); let txns = vec![ - (SUCCESS, h.init(None, use_type, element_type, true)), - (SUCCESS, h.init(None, use_type, element_type, false)), - // (SUCCESS, h.init(None, use_type, ElementType::String, false)), + (SUCCESS, h.init(None, use_type, element_type, StructType::Aggregator)), + (SUCCESS, h.init(None, use_type, element_type, StructType::Snapshot)), + (SUCCESS, h.init(None, use_type, ElementType::String, StructType::DerivedString)), (SUCCESS, h.new_add(&agg_loc, 400, 100)), (SUCCESS, h.snapshot(&agg_loc, &snap_loc)), (SUCCESS, h.check_snapshot(&snap_loc, 100)), (SUCCESS, h.read_snapshot(&agg_loc)), (SUCCESS, h.add_and_read_snapshot_u128(&agg_loc, 100)), - // (SUCCESS, h.concat(&snap_loc, &derived_snap_loc, "12", "13")), - // (SUCCESS, h.check_snapshot(&derived_snap_loc, 1210013)), + (SUCCESS, h.concat(&snap_loc, &derived_snap_loc, "12", "13")), + (SUCCESS, h.check_derived(&derived_snap_loc, 1210013)), ]; h.run_block_in_parts_and_check( @@ -474,8 +439,7 @@ proptest! 
{ } #[test] -#[should_panic] -fn test_aggregator_snapshot_not_equivalent_gas() { +fn test_aggregator_snapshot_equivalent_gas() { let test_env = TestEnvConfig { executor_mode: ExecutorMode::BothComparison, aggregator_execution_mode: AggregatorMode::BothComparison, @@ -498,9 +462,23 @@ fn test_aggregator_snapshot_not_equivalent_gas() { AggregatorLocation::new(*h.account.address(), ElementType::String, use_type, 0); let txns = vec![ - (0, h.init(None, use_type, element_type, true)), - (0, h.init(None, use_type, element_type, false)), - (0, h.init(None, use_type, ElementType::String, false)), + ( + 0, + h.init(None, use_type, element_type, StructType::Aggregator), + ), + ( + 0, + h.init(None, use_type, element_type, StructType::Snapshot), + ), + ( + 0, + h.init( + None, + use_type, + ElementType::String, + StructType::DerivedString, + ), + ), (0, h.new_add(&agg_loc, 400, 100)), (0, h.snapshot(&agg_loc, &snap_loc)), // string needs to be large, for gas rounding to be different diff --git a/aptos-move/e2e-testsuite/src/tests/genesis.rs b/aptos-move/e2e-testsuite/src/tests/genesis.rs index e7152687e68df..a0562143964cb 100644 --- a/aptos-move/e2e-testsuite/src/tests/genesis.rs +++ b/aptos-move/e2e-testsuite/src/tests/genesis.rs @@ -7,9 +7,10 @@ use aptos_language_e2e_tests::{ executor::FakeExecutor, }; use aptos_types::{ - transaction::{Transaction, TransactionStatus, WriteSetPayload}, + transaction::{ChangeSet, Transaction, TransactionStatus, WriteSetPayload}, write_set::TransactionWrite, }; +use move_core_types::vm_status::StatusCode; #[test] fn no_deletion_in_genesis() { @@ -30,9 +31,7 @@ fn execute_genesis_write_set() { assert!(!output.pop().unwrap().status().is_discarded()) } -// TODO[agg_v2](fix) - investigate/make BlockSTM discard instead of fail if WriteSetPayload::Direct is in the block -// #[test] -#[allow(unused)] +#[test] fn execute_genesis_and_drop_other_transaction() { let mut executor = FakeExecutor::no_genesis(); let txn = @@ -50,3 +49,18 @@ fn 
execute_genesis_and_drop_other_transaction() { assert_eq!(output.len(), 2); assert_eq!(output.pop().unwrap().status(), &TransactionStatus::Retry) } + +#[test] +fn fail_no_epoch_change_write_set() { + let mut executor = FakeExecutor::no_genesis(); + let txn = Transaction::GenesisTransaction(WriteSetPayload::Direct(ChangeSet::empty())); + + let sender = executor.create_raw_account_data(1_000_000, 10); + let receiver = executor.create_raw_account_data(100_000, 10); + let txn2 = peer_to_peer_txn(sender.account(), receiver.account(), 11, 1000, 0); + + let output_err = executor + .execute_transaction_block(vec![txn, Transaction::UserTransaction(txn2)]) + .unwrap_err(); + assert_eq!(StatusCode::INVALID_WRITE_SET, output_err.status_code()); +} diff --git a/aptos-move/framework/aptos-framework/doc/aggregator_v2.md b/aptos-move/framework/aptos-framework/doc/aggregator_v2.md index 55727d80c27ee..787882cd4179e 100644 --- a/aptos-move/framework/aptos-framework/doc/aggregator_v2.md +++ b/aptos-move/framework/aptos-framework/doc/aggregator_v2.md @@ -17,6 +17,7 @@ operation that also reduced parallelism, and should be avoided as much as possib - [Struct `Aggregator`](#0x1_aggregator_v2_Aggregator) - [Struct `AggregatorSnapshot`](#0x1_aggregator_v2_AggregatorSnapshot) +- [Struct `DerivedStringSnapshot`](#0x1_aggregator_v2_DerivedStringSnapshot) - [Constants](#@Constants_0) - [Function `max_value`](#0x1_aggregator_v2_max_value) - [Function `create_aggregator`](#0x1_aggregator_v2_create_aggregator) @@ -28,8 +29,11 @@ operation that also reduced parallelism, and should be avoided as much as possib - [Function `read`](#0x1_aggregator_v2_read) - [Function `snapshot`](#0x1_aggregator_v2_snapshot) - [Function `create_snapshot`](#0x1_aggregator_v2_create_snapshot) -- [Function `copy_snapshot`](#0x1_aggregator_v2_copy_snapshot) - [Function `read_snapshot`](#0x1_aggregator_v2_read_snapshot) +- [Function `read_derived_string`](#0x1_aggregator_v2_read_derived_string) +- [Function 
`create_derived_string`](#0x1_aggregator_v2_create_derived_string) +- [Function `derive_string_concat`](#0x1_aggregator_v2_derive_string_concat) +- [Function `copy_snapshot`](#0x1_aggregator_v2_copy_snapshot) - [Function `string_concat`](#0x1_aggregator_v2_string_concat) - [Specification](#@Specification_1) - [Function `create_aggregator`](#@Specification_1_create_aggregator) @@ -95,7 +99,34 @@ Unlike read() and storing the value directly, this enables parallel execution of while storing snapshot of aggregator state elsewhere. -
struct AggregatorSnapshot<Element> has drop, store
+
struct AggregatorSnapshot<IntElement> has drop, store
+
+ + + +
+Fields + + +
+
+value: IntElement +
+
+ +
+
+ + +
+ + + +## Struct `DerivedStringSnapshot` + + + +
struct DerivedStringSnapshot has drop, store
 
@@ -106,7 +137,13 @@ while storing snapshot of aggregator state elsewhere.
-value: Element +value: string::String +
+
+ +
+
+padding: vector<u8>
@@ -424,7 +461,7 @@ Creates a snapshot of a given value. Useful for when object is sometimes created via snapshot() or string_concat(), and sometimes directly. -
public fun create_snapshot<Element: copy, drop>(value: Element): aggregator_v2::AggregatorSnapshot<Element>
+
public fun create_snapshot<IntElement: copy, drop>(value: IntElement): aggregator_v2::AggregatorSnapshot<IntElement>
 
@@ -433,21 +470,24 @@ Useful for when object is sometimes created via snapshot() or string_concat(), a Implementation -
public native fun create_snapshot<Element: copy + drop>(value: Element): AggregatorSnapshot<Element>;
+
public native fun create_snapshot<IntElement: copy + drop>(value: IntElement): AggregatorSnapshot<IntElement>;
 
- + -## Function `copy_snapshot` +## Function `read_snapshot` -NOT YET IMPLEMENTED, always raises EAGGREGATOR_FUNCTION_NOT_YET_SUPPORTED. +Returns a value stored in this snapshot. +Note: This operation is resource-intensive, and reduces parallelism. +(Especially if called in a transaction that also modifies the aggregator, +or has other read/write conflicts) -
public fun copy_snapshot<Element: copy, drop>(snapshot: &aggregator_v2::AggregatorSnapshot<Element>): aggregator_v2::AggregatorSnapshot<Element>
+
public fun read_snapshot<IntElement>(snapshot: &aggregator_v2::AggregatorSnapshot<IntElement>): IntElement
 
@@ -456,24 +496,24 @@ NOT YET IMPLEMENTED, always raises EAGGREGATOR_FUNCTION_NOT_YET_SUPPORTED. Implementation -
public native fun copy_snapshot<Element: copy + drop>(snapshot: &AggregatorSnapshot<Element>): AggregatorSnapshot<Element>;
+
public native fun read_snapshot<IntElement>(snapshot: &AggregatorSnapshot<IntElement>): IntElement;
 
- + -## Function `read_snapshot` +## Function `read_derived_string` -Returns a value stored in this snapshot. +Returns a value stored in this DerivedStringSnapshot. Note: This operation is resource-intensive, and reduces parallelism. (Especially if called in a transaction that also modifies the aggregator, or has other read/write conflicts) -
public fun read_snapshot<Element>(snapshot: &aggregator_v2::AggregatorSnapshot<Element>): Element
+
public fun read_derived_string(snapshot: &aggregator_v2::DerivedStringSnapshot): string::String
 
@@ -482,16 +522,40 @@ or has other read/write conflicts) Implementation -
public native fun read_snapshot<Element>(snapshot: &AggregatorSnapshot<Element>): Element;
+
public native fun read_derived_string(snapshot: &DerivedStringSnapshot): String;
 
 - + + +## Function `create_derived_string` + +Creates a DerivedStringSnapshot of a given value. +Useful for when object is sometimes created via derive_string_concat(), and sometimes directly. + + +
public fun create_derived_string(value: string::String): aggregator_v2::DerivedStringSnapshot
+
+ + + +
+Implementation + + +
public native fun create_derived_string(value: String): DerivedStringSnapshot;
+
+ -## Function `string_concat` + +
+ + + +## Function `derive_string_concat` Concatenates before, snapshot and after into a single string. snapshot passed needs to have integer type - currently supported types are u64 and u128. @@ -499,7 +563,55 @@ Raises EUNSUPPORTED_AGGREGATOR_SNAPSHOT_TYPE if called with another type. If length of prefix and suffix together exceed 256 bytes, ECONCAT_STRING_LENGTH_TOO_LARGE is raised. -
public fun string_concat<IntElement>(before: string::String, snapshot: &aggregator_v2::AggregatorSnapshot<IntElement>, after: string::String): aggregator_v2::AggregatorSnapshot<string::String>
+
public fun derive_string_concat<IntElement>(before: string::String, snapshot: &aggregator_v2::AggregatorSnapshot<IntElement>, after: string::String): aggregator_v2::DerivedStringSnapshot
+
+ + + +
+Implementation + + +
public native fun derive_string_concat<IntElement>(before: String, snapshot: &AggregatorSnapshot<IntElement>, after: String): DerivedStringSnapshot;
+
+ + + +
+ + + +## Function `copy_snapshot` + +NOT YET IMPLEMENTED, always raises EAGGREGATOR_FUNCTION_NOT_YET_SUPPORTED. + + +
#[deprecated]
+public fun copy_snapshot<IntElement: copy, drop>(snapshot: &aggregator_v2::AggregatorSnapshot<IntElement>): aggregator_v2::AggregatorSnapshot<IntElement>
+
+ + + +
+Implementation + + +
public native fun copy_snapshot<IntElement: copy + drop>(snapshot: &AggregatorSnapshot<IntElement>): AggregatorSnapshot<IntElement>;
+
+ + + +
+ + + +## Function `string_concat` + +DEPRECATED, use derive_string_concat() instead. Always raises EAGGREGATOR_FUNCTION_NOT_YET_SUPPORTED. + + +
#[deprecated]
+public fun string_concat<IntElement>(before: string::String, snapshot: &aggregator_v2::AggregatorSnapshot<IntElement>, after: string::String): aggregator_v2::AggregatorSnapshot<string::String>
 
@@ -621,7 +733,7 @@ If length of prefix and suffix together exceed 256 bytes, ECONCAT_STRING_LENGTH_ ### Function `create_snapshot` -
public fun create_snapshot<Element: copy, drop>(value: Element): aggregator_v2::AggregatorSnapshot<Element>
+
public fun create_snapshot<IntElement: copy, drop>(value: IntElement): aggregator_v2::AggregatorSnapshot<IntElement>
 
@@ -637,7 +749,8 @@ If length of prefix and suffix together exceed 256 bytes, ECONCAT_STRING_LENGTH_ ### Function `copy_snapshot` -
public fun copy_snapshot<Element: copy, drop>(snapshot: &aggregator_v2::AggregatorSnapshot<Element>): aggregator_v2::AggregatorSnapshot<Element>
+
#[deprecated]
+public fun copy_snapshot<IntElement: copy, drop>(snapshot: &aggregator_v2::AggregatorSnapshot<IntElement>): aggregator_v2::AggregatorSnapshot<IntElement>
 
@@ -653,7 +766,8 @@ If length of prefix and suffix together exceed 256 bytes, ECONCAT_STRING_LENGTH_ ### Function `string_concat` -
public fun string_concat<IntElement>(before: string::String, snapshot: &aggregator_v2::AggregatorSnapshot<IntElement>, after: string::String): aggregator_v2::AggregatorSnapshot<string::String>
+
#[deprecated]
+public fun string_concat<IntElement>(before: string::String, snapshot: &aggregator_v2::AggregatorSnapshot<IntElement>, after: string::String): aggregator_v2::AggregatorSnapshot<string::String>
 
diff --git a/aptos-move/framework/aptos-framework/sources/aggregator_v2/aggregator_v2.move b/aptos-move/framework/aptos-framework/sources/aggregator_v2/aggregator_v2.move index ad620429151f9..c3fd48d2cae8f 100644 --- a/aptos-move/framework/aptos-framework/sources/aggregator_v2/aggregator_v2.move +++ b/aptos-move/framework/aptos-framework/sources/aggregator_v2/aggregator_v2.move @@ -46,8 +46,13 @@ module aptos_framework::aggregator_v2 { /// Represents a constant value, that was derived from an aggregator at given instant in time. /// Unlike read() and storing the value directly, this enables parallel execution of transactions, /// while storing snapshot of aggregator state elsewhere. - struct AggregatorSnapshot has store, drop { - value: Element, + struct AggregatorSnapshot has store, drop { + value: IntElement, + } + + struct DerivedStringSnapshot has store, drop { + value: String, + padding: vector, } /// Returns `max_value` exceeding which aggregator overflows. @@ -100,23 +105,42 @@ module aptos_framework::aggregator_v2 { /// Creates a snapshot of a given value. /// Useful for when object is sometimes created via snapshot() or string_concat(), and sometimes directly. - public native fun create_snapshot(value: Element): AggregatorSnapshot; - - /// NOT YET IMPLEMENTED, always raises EAGGREGATOR_FUNCTION_NOT_YET_SUPPORTED. - public native fun copy_snapshot(snapshot: &AggregatorSnapshot): AggregatorSnapshot; + public native fun create_snapshot(value: IntElement): AggregatorSnapshot; /// Returns a value stored in this snapshot. /// Note: This operation is resource-intensive, and reduces parallelism. /// (Especially if called in a transaction that also modifies the aggregator, /// or has other read/write conflicts) - public native fun read_snapshot(snapshot: &AggregatorSnapshot): Element; + public native fun read_snapshot(snapshot: &AggregatorSnapshot): IntElement; + + /// Returns a value stored in this DerivedStringSnapshot. 
+ /// Note: This operation is resource-intensive, and reduces parallelism. + /// (Especially if called in a transaction that also modifies the aggregator, + /// or has other read/write conflicts) + public native fun read_derived_string(snapshot: &DerivedStringSnapshot): String; + + /// Creates a DerivedStringSnapshot of a given value. + /// Useful for when object is sometimes created via derive_string_concat(), and sometimes directly. + public native fun create_derived_string(value: String): DerivedStringSnapshot; /// Concatenates `before`, `snapshot` and `after` into a single string. /// snapshot passed needs to have integer type - currently supported types are u64 and u128. /// Raises EUNSUPPORTED_AGGREGATOR_SNAPSHOT_TYPE if called with another type. /// If length of prefix and suffix together exceed 256 bytes, ECONCAT_STRING_LENGTH_TOO_LARGE is raised. + public native fun derive_string_concat(before: String, snapshot: &AggregatorSnapshot, after: String): DerivedStringSnapshot; + + // ===== DEPRECATED/NOT YET IMPLEMENTED ==== + + #[deprecated] + /// NOT YET IMPLEMENTED, always raises EAGGREGATOR_FUNCTION_NOT_YET_SUPPORTED. + public native fun copy_snapshot(snapshot: &AggregatorSnapshot): AggregatorSnapshot; + + #[deprecated] + /// DEPRECATED, use derive_string_concat() instead. Always raises EAGGREGATOR_FUNCTION_NOT_YET_SUPPORTED. 
public native fun string_concat(before: String, snapshot: &AggregatorSnapshot, after: String): AggregatorSnapshot; + // ======================================== + #[test] fun test_aggregator() { let agg = create_aggregator(10); @@ -139,8 +163,8 @@ module aptos_framework::aggregator_v2 { let snapshot = create_snapshot(42); assert!(read_snapshot(&snapshot) == 42, 0); - let snapshot = create_snapshot(std::string::utf8(b"42")); - assert!(read_snapshot(&snapshot) == std::string::utf8(b"42"), 0); + let derived = create_derived_string(std::string::utf8(b"42")); + assert!(read_derived_string(&derived) == std::string::utf8(b"42"), 0); } #[test] @@ -153,15 +177,8 @@ module aptos_framework::aggregator_v2 { #[test] fun test_string_concat1() { let snapshot = create_snapshot(42); - let snapshot2 = string_concat(std::string::utf8(b"before"), &snapshot, std::string::utf8(b"after")); - assert!(read_snapshot(&snapshot2) == std::string::utf8(b"before42after"), 0); - } - - #[test] - #[expected_failure(abort_code = 0x030005, location = Self)] - fun test_string_concat_from_string_not_supported() { - let snapshot = create_snapshot(std::string::utf8(b"42")); - string_concat(std::string::utf8(b"before"), &snapshot, std::string::utf8(b"after")); + let derived = derive_string_concat(std::string::utf8(b"before"), &snapshot, std::string::utf8(b"after")); + assert!(read_derived_string(&derived) == std::string::utf8(b"before42after"), 0); } // Tests commented out, as flag used in rust cannot be disabled. 
diff --git a/aptos-move/framework/aptos-token-objects/doc/token.md b/aptos-move/framework/aptos-token-objects/doc/token.md index 991288d8dee2f..904d5214952e5 100644 --- a/aptos-move/framework/aptos-token-objects/doc/token.md +++ b/aptos-move/framework/aptos-token-objects/doc/token.md @@ -11,6 +11,7 @@ token are: - [Resource `Token`](#0x4_token_Token) +- [Resource `TokenIdentifiers`](#0x4_token_TokenIdentifiers) - [Resource `ConcurrentTokenIdentifiers`](#0x4_token_ConcurrentTokenIdentifiers) - [Struct `BurnRef`](#0x4_token_BurnRef) - [Struct `MutatorRef`](#0x4_token_MutatorRef) @@ -86,7 +87,7 @@ Represents the common fields to all tokens. index: u64
- Deprecated in favor of index inside ConcurrentTokenIdentifiers. + Deprecated in favor of index inside TokenIdentifiers. Will be populated until concurrent_assets_enabled feature flag is enabled. Unique identifier within the collection, optional, 0 means unassigned @@ -101,7 +102,7 @@ Represents the common fields to all tokens. name: string::String
- Deprecated in favor of name inside ConcurrentTokenIdentifiers. + Deprecated in favor of name inside TokenIdentifiers. Will be populated until concurrent_assets_enabled feature flag is enabled. The name of the token, which should be unique within the collection; the length of name @@ -125,16 +126,16 @@ Represents the common fields to all tokens. - + -## Resource `ConcurrentTokenIdentifiers` +## Resource `TokenIdentifiers` Represents first addition to the common fields for all tokens Starts being populated once aggregator_v2_api_enabled is enabled.
#[resource_group_member(#[group = 0x1::object::ObjectGroup])]
-struct ConcurrentTokenIdentifiers has key
+struct TokenIdentifiers has key
 
@@ -151,7 +152,7 @@ Starts being populated once aggregator_v2_api_enabled is enabled. Unique identifier within the collection, optional, 0 means unassigned
-name: aggregator_v2::AggregatorSnapshot<string::String> +name: aggregator_v2::DerivedStringSnapshot
The name of the token, which should be unique within the collection; the length of name @@ -160,6 +161,41 @@ Starts being populated once aggregator_v2_api_enabled is enabled.
+ + + + +## Resource `ConcurrentTokenIdentifiers` + + + +
#[resource_group_member(#[group = 0x1::object::ObjectGroup])]
+#[deprecated]
+struct ConcurrentTokenIdentifiers has key
+
+ + + +
+Fields + + +
+
+_1: aggregator_v2::AggregatorSnapshot<u64> +
+
+ +
+
+_2: aggregator_v2::AggregatorSnapshot<string::String> +
+
+ +
+
+ +
@@ -414,9 +450,9 @@ The token name is over the maximum length // If create_numbered_token called us, add index to the name. let name = if (option::is_some(&name_with_index_suffix)) { - aggregator_v2::string_concat(name_prefix, &index, option::extract(&mut name_with_index_suffix)) + aggregator_v2::derive_string_concat(name_prefix, &index, option::extract(&mut name_with_index_suffix)) } else { - aggregator_v2::create_snapshot(name_prefix) + aggregator_v2::create_derived_string(name_prefix) }; // Until concurrent_assets_enabled is enabled, we still need to write to deprecated fields. @@ -430,11 +466,11 @@ The token name is over the maximum length let deprecated_name = if (concurrent_assets_enabled) { string::utf8(b"") } else { - aggregator_v2::read_snapshot(&name) + aggregator_v2::read_derived_string(&name) }; // If aggregator_api_enabled, we always populate newly added fields - let token_concurrent = ConcurrentTokenIdentifiers { + let token_concurrent = TokenIdentifiers { index, name, }; @@ -443,7 +479,7 @@ The token name is over the maximum length (deprecated_index, deprecated_name) } else { // If aggregator_api_enabled is disabled, we cannot use increment_concurrent_supply or - // create ConcurrentTokenIdentifiers, so we fallback to the old behavior. + // create TokenIdentifiers, so we fallback to the old behavior. let id = collection::increment_supply(&collection, signer::address_of(&object_signer)); let index = option::get_with_default(&mut id, 0); @@ -921,10 +957,10 @@ as that would prohibit transactions to be executed in parallel. Implementation -
public fun name<T: key>(token: Object<T>): String acquires Token, ConcurrentTokenIdentifiers {
+
public fun name<T: key>(token: Object<T>): String acquires Token, TokenIdentifiers {
     let token_address = object::object_address(&token);
-    if (exists<ConcurrentTokenIdentifiers>(token_address)) {
-        aggregator_v2::read_snapshot(&borrow_global<ConcurrentTokenIdentifiers>(token_address).name)
+    if (exists<TokenIdentifiers>(token_address)) {
+        aggregator_v2::read_derived_string(&borrow_global<TokenIdentifiers>(token_address).name)
     } else {
         borrow(&token).name
     }
@@ -1013,10 +1049,10 @@ as that would prohibit transactions to be executed in parallel.
 Implementation
 
 
-
public fun index<T: key>(token: Object<T>): u64 acquires Token, ConcurrentTokenIdentifiers {
+
public fun index<T: key>(token: Object<T>): u64 acquires Token, TokenIdentifiers {
     let token_address = object::object_address(&token);
-    if (exists<ConcurrentTokenIdentifiers>(token_address)) {
-        aggregator_v2::read_snapshot(&borrow_global<ConcurrentTokenIdentifiers>(token_address).index)
+    if (exists<TokenIdentifiers>(token_address)) {
+        aggregator_v2::read_snapshot(&borrow_global<TokenIdentifiers>(token_address).index)
     } else {
         borrow(&token).index
     }
@@ -1070,7 +1106,7 @@ as that would prohibit transactions to be executed in parallel.
 Implementation
 
 
-
public fun burn(burn_ref: BurnRef) acquires Token, ConcurrentTokenIdentifiers {
+
public fun burn(burn_ref: BurnRef) acquires Token, TokenIdentifiers {
     let addr = if (option::is_some(&burn_ref.inner)) {
         let delete_ref = option::extract(&mut burn_ref.inner);
         let addr = object::address_from_delete_ref(&delete_ref);
@@ -1093,11 +1129,11 @@ as that would prohibit transactions to be executed in parallel.
         mutation_events,
     } = move_from<Token>(addr);
 
-    let index = if (exists<ConcurrentTokenIdentifiers>(addr)) {
-        let ConcurrentTokenIdentifiers {
+    let index = if (exists<TokenIdentifiers>(addr)) {
+        let TokenIdentifiers {
             index,
             name: _,
-        } = move_from<ConcurrentTokenIdentifiers>(addr);
+        } = move_from<TokenIdentifiers>(addr);
         aggregator_v2::read_snapshot(&index)
     } else {
         deprecated_index
@@ -1161,15 +1197,15 @@ as that would prohibit transactions to be executed in parallel.
 Implementation
 
 
-
public fun set_name(mutator_ref: &MutatorRef, name: String) acquires Token, ConcurrentTokenIdentifiers {
+
public fun set_name(mutator_ref: &MutatorRef, name: String) acquires Token, TokenIdentifiers {
     assert!(string::length(&name) <= MAX_TOKEN_NAME_LENGTH, error::out_of_range(ETOKEN_NAME_TOO_LONG));
 
     let token = borrow_mut(mutator_ref);
 
-    let old_name = if (exists<ConcurrentTokenIdentifiers>(mutator_ref.self)) {
-        let token_concurrent = borrow_global_mut<ConcurrentTokenIdentifiers>(mutator_ref.self);
-        let old_name = aggregator_v2::read_snapshot(&token_concurrent.name);
-        token_concurrent.name = aggregator_v2::create_snapshot(name);
+    let old_name = if (exists<TokenIdentifiers>(mutator_ref.self)) {
+        let token_concurrent = borrow_global_mut<TokenIdentifiers>(mutator_ref.self);
+        let old_name = aggregator_v2::read_derived_string(&token_concurrent.name);
+        token_concurrent.name = aggregator_v2::create_derived_string(name);
         old_name
     } else {
         let old_name = token.name;
diff --git a/aptos-move/framework/aptos-token-objects/sources/token.move b/aptos-move/framework/aptos-token-objects/sources/token.move
index 186dd763dde6b..cd1b8ecf43784 100644
--- a/aptos-move/framework/aptos-token-objects/sources/token.move
+++ b/aptos-move/framework/aptos-token-objects/sources/token.move
@@ -11,7 +11,7 @@ module aptos_token_objects::token {
     use std::string::{Self, String};
     use std::signer;
     use std::vector;
-    use aptos_framework::aggregator_v2::{Self, AggregatorSnapshot};
+    use aptos_framework::aggregator_v2::{Self, AggregatorSnapshot, DerivedStringSnapshot};
     use aptos_framework::event;
     use aptos_framework::object::{Self, ConstructorRef, Object};
     use aptos_std::string_utils::{to_string};
@@ -43,14 +43,14 @@ module aptos_token_objects::token {
     struct Token has key {
         /// The collection from which this token resides.
         collection: Object,
-        /// Deprecated in favor of `index` inside ConcurrentTokenIdentifiers.
+        /// Deprecated in favor of `index` inside TokenIdentifiers.
         /// Will be populated until concurrent_assets_enabled feature flag is enabled.
         ///
         /// Unique identifier within the collection, optional, 0 means unassigned
         index: u64, // DEPRECATED
         /// A brief description of the token.
         description: String,
-        /// Deprecated in favor of `name` inside ConcurrentTokenIdentifiers.
+        /// Deprecated in favor of `name` inside TokenIdentifiers.
         /// Will be populated until concurrent_assets_enabled feature flag is enabled.
         ///
         /// The name of the token, which should be unique within the collection; the length of name
@@ -66,12 +66,20 @@ module aptos_token_objects::token {
     #[resource_group_member(group = aptos_framework::object::ObjectGroup)]
     /// Represents first addition to the common fields for all tokens
     /// Starts being populated once aggregator_v2_api_enabled is enabled.
-    struct ConcurrentTokenIdentifiers has key {
+    struct TokenIdentifiers has key {
         /// Unique identifier within the collection, optional, 0 means unassigned
         index: AggregatorSnapshot,
         /// The name of the token, which should be unique within the collection; the length of name
         /// should be smaller than 128, characters, eg: "Aptos Animal #1234"
-        name: AggregatorSnapshot,
+        name: DerivedStringSnapshot,
+    }
+
+    // DEPRECATED, NEVER USED
+    #[deprecated]
+    #[resource_group_member(group = aptos_framework::object::ObjectGroup)]
+    struct ConcurrentTokenIdentifiers has key {
+        _1: AggregatorSnapshot,
+        _2: AggregatorSnapshot,
     }
 
     /// This enables burning an NFT, if possible, it will also delete the object. Note, the data
@@ -135,9 +143,9 @@ module aptos_token_objects::token {
 
             // If create_numbered_token called us, add index to the name.
             let name = if (option::is_some(&name_with_index_suffix)) {
-                aggregator_v2::string_concat(name_prefix, &index, option::extract(&mut name_with_index_suffix))
+                aggregator_v2::derive_string_concat(name_prefix, &index, option::extract(&mut name_with_index_suffix))
             } else {
-                aggregator_v2::create_snapshot(name_prefix)
+                aggregator_v2::create_derived_string(name_prefix)
             };
 
             // Until concurrent_assets_enabled is enabled, we still need to write to deprecated fields.
@@ -151,11 +159,11 @@ module aptos_token_objects::token {
             let deprecated_name = if (concurrent_assets_enabled) {
                 string::utf8(b"")
             } else {
-                aggregator_v2::read_snapshot(&name)
+                aggregator_v2::read_derived_string(&name)
             };
 
             // If aggregator_api_enabled, we always populate newly added fields
-            let token_concurrent = ConcurrentTokenIdentifiers {
+            let token_concurrent = TokenIdentifiers {
                 index,
                 name,
             };
@@ -164,7 +172,7 @@ module aptos_token_objects::token {
             (deprecated_index, deprecated_name)
         } else {
             // If aggregator_api_enabled is disabled, we cannot use increment_concurrent_supply or
-            // create ConcurrentTokenIdentifiers, so we fallback to the old behavior.
+            // create TokenIdentifiers, so we fallback to the old behavior.
             let id = collection::increment_supply(&collection, signer::address_of(&object_signer));
             let index = option::get_with_default(&mut id, 0);
 
@@ -345,10 +353,10 @@ module aptos_token_objects::token {
     // within the transaction that creates it, to set additional application-specific fields.
     //
     // /// This method allows minting to happen in parallel, making it efficient.
-    // fun name_snapshot(token: &Object): AggregatorSnapshot acquires Token, ConcurrentTokenIdentifiers {
+    // fun name_snapshot(token: &Object): AggregatorSnapshot acquires Token, TokenIdentifiers {
     //     let token_address = object::object_address(token);
-    //     if (exists(token_address)) {
-    //         aggregator_v2::copy_snapshot(&borrow_global(token_address).name)
+    //     if (exists(token_address)) {
+    //         aggregator_v2::copy_snapshot(&borrow_global(token_address).name)
     //     } else {
     //         aggregator_v2::create_snapshot(borrow(token).name)
     //     }
@@ -357,10 +365,10 @@ module aptos_token_objects::token {
     #[view]
     /// Avoid this method in the same transaction as the token is minted
     /// as that would prohibit transactions to be executed in parallel.
-    public fun name(token: Object): String acquires Token, ConcurrentTokenIdentifiers {
+    public fun name(token: Object): String acquires Token, TokenIdentifiers {
         let token_address = object::object_address(&token);
-        if (exists(token_address)) {
-            aggregator_v2::read_snapshot(&borrow_global(token_address).name)
+        if (exists(token_address)) {
+            aggregator_v2::read_derived_string(&borrow_global(token_address).name)
         } else {
             borrow(&token).name
         }
@@ -390,10 +398,10 @@ module aptos_token_objects::token {
     // within the transaction that creates it, to set additional application-specific fields.
     //
     // /// This method allows minting to happen in parallel, making it efficient.
-    // fun index_snapshot(token: &Object): AggregatorSnapshot acquires Token, ConcurrentTokenIdentifiers {
+    // fun index_snapshot(token: &Object): AggregatorSnapshot acquires Token, TokenIdentifiers {
     //     let token_address = object::object_address(token);
-    //     if (exists(token_address)) {
-    //         aggregator_v2::copy_snapshot(&borrow_global(token_address).index)
+    //     if (exists(token_address)) {
+    //         aggregator_v2::copy_snapshot(&borrow_global(token_address).index)
     //     } else {
     //         aggregator_v2::create_snapshot(borrow(token).index)
     //     }
@@ -402,10 +410,10 @@ module aptos_token_objects::token {
     #[view]
     /// Avoid this method in the same transaction as the token is minted
     /// as that would prohibit transactions to be executed in parallel.
-    public fun index(token: Object): u64 acquires Token, ConcurrentTokenIdentifiers {
+    public fun index(token: Object): u64 acquires Token, TokenIdentifiers {
         let token_address = object::object_address(&token);
-        if (exists(token_address)) {
-            aggregator_v2::read_snapshot(&borrow_global(token_address).index)
+        if (exists(token_address)) {
+            aggregator_v2::read_snapshot(&borrow_global(token_address).index)
         } else {
             borrow(&token).index
         }
@@ -421,7 +429,7 @@ module aptos_token_objects::token {
         borrow_global_mut(mutator_ref.self)
     }
 
-    public fun burn(burn_ref: BurnRef) acquires Token, ConcurrentTokenIdentifiers {
+    public fun burn(burn_ref: BurnRef) acquires Token, TokenIdentifiers {
         let addr = if (option::is_some(&burn_ref.inner)) {
             let delete_ref = option::extract(&mut burn_ref.inner);
             let addr = object::address_from_delete_ref(&delete_ref);
@@ -444,11 +452,11 @@ module aptos_token_objects::token {
             mutation_events,
         } = move_from(addr);
 
-        let index = if (exists(addr)) {
-            let ConcurrentTokenIdentifiers {
+        let index = if (exists(addr)) {
+            let TokenIdentifiers {
                 index,
                 name: _,
-            } = move_from(addr);
+            } = move_from(addr);
             aggregator_v2::read_snapshot(&index)
         } else {
             deprecated_index
@@ -472,15 +480,15 @@ module aptos_token_objects::token {
         token.description = description;
     }
 
-    public fun set_name(mutator_ref: &MutatorRef, name: String) acquires Token, ConcurrentTokenIdentifiers {
+    public fun set_name(mutator_ref: &MutatorRef, name: String) acquires Token, TokenIdentifiers {
         assert!(string::length(&name) <= MAX_TOKEN_NAME_LENGTH, error::out_of_range(ETOKEN_NAME_TOO_LONG));
 
         let token = borrow_mut(mutator_ref);
 
-        let old_name = if (exists(mutator_ref.self)) {
-            let token_concurrent = borrow_global_mut(mutator_ref.self);
-            let old_name = aggregator_v2::read_snapshot(&token_concurrent.name);
-            token_concurrent.name = aggregator_v2::create_snapshot(name);
+        let old_name = if (exists(mutator_ref.self)) {
+            let token_concurrent = borrow_global_mut(mutator_ref.self);
+            let old_name = aggregator_v2::read_derived_string(&token_concurrent.name);
+            token_concurrent.name = aggregator_v2::create_derived_string(name);
             old_name
         } else {
             let old_name = token.name;
@@ -629,7 +637,7 @@ module aptos_token_objects::token {
     }
 
     #[test(creator = @0x123)]
-    fun test_set_name(creator: &signer) acquires Token, ConcurrentTokenIdentifiers {
+    fun test_set_name(creator: &signer) acquires Token, TokenIdentifiers {
         let collection_name = string::utf8(b"collection name");
         let token_name = string::utf8(b"token name");
 
@@ -663,7 +671,7 @@ module aptos_token_objects::token {
     }
 
     #[test(creator = @0x123)]
-    fun test_burn_without_royalty(creator: &signer) acquires Token, ConcurrentTokenIdentifiers {
+    fun test_burn_without_royalty(creator: &signer) acquires Token, TokenIdentifiers {
         let collection_name = string::utf8(b"collection name");
         let token_name = string::utf8(b"token name");
 
@@ -686,7 +694,7 @@ module aptos_token_objects::token {
     }
 
     #[test(creator = @0x123)]
-    fun test_burn_with_royalty(creator: &signer) acquires Token, ConcurrentTokenIdentifiers {
+    fun test_burn_with_royalty(creator: &signer) acquires Token, TokenIdentifiers {
         let collection_name = string::utf8(b"collection name");
         let token_name = string::utf8(b"token name");
 
@@ -710,7 +718,7 @@ module aptos_token_objects::token {
     }
 
     #[test(creator = @0x123)]
-    fun test_create_from_account_burn_and_delete(creator: &signer) acquires Token, ConcurrentTokenIdentifiers {
+    fun test_create_from_account_burn_and_delete(creator: &signer) acquires Token, TokenIdentifiers {
         use aptos_framework::account;
 
         let collection_name = string::utf8(b"collection name");
@@ -735,7 +743,7 @@ module aptos_token_objects::token {
     }
 
     #[test(creator = @0x123,fx = @std)]
-    fun test_create_burn_and_delete(creator: &signer, fx: signer) acquires Token, ConcurrentTokenIdentifiers {
+    fun test_create_burn_and_delete(creator: &signer, fx: signer) acquires Token, TokenIdentifiers {
         use aptos_framework::account;
         use std::features;
 
@@ -764,7 +772,7 @@ module aptos_token_objects::token {
     }
 
     #[test(fx = @aptos_framework, creator = @0x123, trader = @0x456)]
-    fun test_upgrade_to_concurrent_and_numbered_tokens(fx: &signer, creator: &signer) acquires Token, ConcurrentTokenIdentifiers {
+    fun test_upgrade_to_concurrent_and_numbered_tokens(fx: &signer, creator: &signer) acquires Token, TokenIdentifiers {
         use std::debug;
 
         let feature = features::get_concurrent_assets_feature();
diff --git a/aptos-move/framework/move-stdlib/doc/vector.md b/aptos-move/framework/move-stdlib/doc/vector.md
index d600c933fe218..1c0e6367ecb15 100644
--- a/aptos-move/framework/move-stdlib/doc/vector.md
+++ b/aptos-move/framework/move-stdlib/doc/vector.md
@@ -63,6 +63,9 @@ the return on investment didn't seem worth it for these simple functions.
 -  [Function `any`](#0x1_vector_any)
 -  [Function `all`](#0x1_vector_all)
 -  [Function `destroy`](#0x1_vector_destroy)
+-  [Function `range`](#0x1_vector_range)
+-  [Function `range_with_step`](#0x1_vector_range_with_step)
+-  [Function `slice`](#0x1_vector_slice)
 -  [Specification](#@Specification_1)
     -  [Helper Functions](#@Helper_Functions_2)
     -  [Function `singleton`](#@Specification_1_singleton)
@@ -112,6 +115,26 @@ The index into the vector is out of bounds
 
 
 
+
+
+The range in slice is invalid.
+
+
+
const EINVALID_SLICE_RANGE: u64 = 131076;
+
+ + + + + +The step provided in range is invalid, must be greater than zero. + + +
const EINVALID_STEP: u64 = 131075;
+
+ + + The length of the vectors are not equal. @@ -1580,6 +1603,96 @@ when used in the context of destroying a vector. + + + + +## Function `range` + + + +
public fun range(start: u64, end: u64): vector<u64>
+
+ + + +
+Implementation + + +
public fun range(start: u64, end: u64): vector<u64> {
+    range_with_step(start, end, 1)
+}
+
+ + + +
+ + + +## Function `range_with_step` + + + +
public fun range_with_step(start: u64, end: u64, step: u64): vector<u64>
+
+ + + +
+Implementation + + +
public fun range_with_step(start: u64, end: u64, step: u64): vector<u64> {
+    assert!(step > 0, EINVALID_STEP);
+
+    let vec = vector[];
+    while (start < end) {
+        push_back(&mut vec, start);
+        start = start + step;
+    };
+    vec
+}
+
+ + + +
+ + + +## Function `slice` + + + +
public fun slice<Element: copy>(v: &vector<Element>, start: u64, end: u64): vector<Element>
+
+ + + +
+Implementation + + +
public fun slice<Element: copy>(
+    v: &vector<Element>,
+    start: u64,
+    end: u64
+): vector<Element> {
+    assert!(start <= end && end <= length(v), EINVALID_SLICE_RANGE);
+
+    let vec = vector[];
+    while (start < end) {
+        push_back(&mut vec, *borrow(v, start));
+        start = start + 1;
+    };
+    vec
+}
+
+ + +
diff --git a/aptos-move/framework/move-stdlib/sources/vector.move b/aptos-move/framework/move-stdlib/sources/vector.move index 885737608c610..05368acf4edbf 100644 --- a/aptos-move/framework/move-stdlib/sources/vector.move +++ b/aptos-move/framework/move-stdlib/sources/vector.move @@ -18,6 +18,12 @@ module std::vector { /// The length of the vectors are not equal. const EVECTORS_LENGTH_MISMATCH: u64 = 0x20002; + /// The step provided in `range` is invalid, must be greater than zero. + const EINVALID_STEP: u64 = 0x20003; + + /// The range in `slice` is invalid. + const EINVALID_SLICE_RANGE: u64 = 0x20004; + #[bytecode_instruction] /// Create an empty vector. native public fun empty(): vector; @@ -589,6 +595,36 @@ module std::vector { for_each_reverse(v, |e| d(e)) } + public fun range(start: u64, end: u64): vector { + range_with_step(start, end, 1) + } + + public fun range_with_step(start: u64, end: u64, step: u64): vector { + assert!(step > 0, EINVALID_STEP); + + let vec = vector[]; + while (start < end) { + push_back(&mut vec, start); + start = start + step; + }; + vec + } + + public fun slice( + v: &vector, + start: u64, + end: u64 + ): vector { + assert!(start <= end && end <= length(v), EINVALID_SLICE_RANGE); + + let vec = vector[]; + while (start < end) { + push_back(&mut vec, *borrow(v, start)); + start = start + 1; + }; + vec + } + // ================================================================= // Module Specification diff --git a/aptos-move/framework/move-stdlib/tests/vector_tests.move b/aptos-move/framework/move-stdlib/tests/vector_tests.move index 8d12fe18a18c8..b8c9e19a4dd84 100644 --- a/aptos-move/framework/move-stdlib/tests/vector_tests.move +++ b/aptos-move/framework/move-stdlib/tests/vector_tests.move @@ -888,6 +888,64 @@ module std::vector_tests { vector::insert(&mut v,6, 6); } + #[test] + fun test_range() { + let result = vector::range(5, 10); + assert!(result == vector[5, 6, 7, 8, 9], 1); + } + + #[test] + fun test_range_with_step() { + let 
result = vector::range_with_step(0, 10, 2); + assert!(result == vector[0, 2, 4, 6, 8], 1); + + let empty_result = vector::range_with_step(10, 10, 2); + assert!(empty_result == vector[], 1); + + // Test with `start` greater than `end` + let reverse_result = vector::range_with_step(10, 0, 2); + assert!(reverse_result == vector[], 1); + } + + #[test] + #[expected_failure(abort_code = V::EINVALID_STEP)] + fun test_range_with_invalid_step() { + vector::range_with_step(0, 10, 0); + } + + #[test] + fun test_slice() { + let v = &vector[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]; + + let slice_beginning = vector::slice(v, 0, 3); + assert!(slice_beginning == vector[0, 1, 2], 1); + + let slice_end = vector::slice(v, 7, 10); + assert!(slice_end == vector[7, 8, 9], 1); + + let empty_slice = vector::slice(v, 5, 5); + assert!(empty_slice == vector[], 1); + let empty_slice = vector::slice(v, 0, 0); + assert!(empty_slice == vector[], 1); + + let full_slice = &vector::slice(v, 0, 10); + assert!(full_slice == v, 1); + } + + #[test] + #[expected_failure(abort_code = V::EINVALID_SLICE_RANGE)] + fun test_slice_invalid_range() { + let v = &vector[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]; + vector::slice(v, 7, 6); // start is greater than end + } + + #[test] + #[expected_failure(abort_code = V::EINVALID_SLICE_RANGE)] + fun test_slice_out_of_bounds() { + let v = &vector[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]; + vector::slice(v, 0, 11); // end is out of bounds + } + #[test_only] struct MoveOnly {} diff --git a/aptos-move/framework/src/natives/aggregator_natives/aggregator_v2.rs b/aptos-move/framework/src/natives/aggregator_natives/aggregator_v2.rs index a4877b92e05b2..7781e733d1439 100644 --- a/aptos-move/framework/src/natives/aggregator_natives/aggregator_v2.rs +++ b/aptos-move/framework/src/natives/aggregator_natives/aggregator_v2.rs @@ -1,23 +1,24 @@ // Copyright © Aptos Foundation // SPDX-License-Identifier: Apache-2.0 -use crate::natives::{ - aggregator_natives::{ - helpers_v2::{ - aggregator_snapshot_field_value, 
aggregator_snapshot_value_field_as_id, - aggregator_value_field_as_id, get_aggregator_fields_u128, get_aggregator_fields_u64, - set_aggregator_value_field, - }, - NativeAggregatorContext, +use super::{ + helpers_v1::get_struct_field, + helpers_v2::{ + AGG_MAX_VALUE_FIELD_INDEX, AGG_SNAPSHOT_VALUE_FIELD_INDEX, AGG_VALUE_FIELD_INDEX, + DERIVED_STRING_VALUE_FIELD_INDEX, + }, +}; +use crate::natives::aggregator_natives::{ + helpers_v2::{ + aggregator_snapshot_value_field_as_id, aggregator_value_field_as_id, + derived_string_value_field_as_id, set_aggregator_value_field, }, - AccountAddress, + NativeAggregatorContext, }; use aptos_aggregator::{ bounded_math::{BoundedMath, SignedU128}, delayed_field_extension::DelayedFieldData, resolver::DelayedFieldResolver, - types::{SnapshotToStringFormula, SnapshotValue}, - utils::{string_to_bytes, to_utf8_bytes, u128_to_u64}, }; use aptos_gas_algebra::NumBytes; use aptos_gas_schedule::gas_params::natives::aptos_framework::*; @@ -25,18 +26,20 @@ use aptos_native_interface::{ safely_pop_arg, RawSafeNative, SafeNativeBuilder, SafeNativeContext, SafeNativeError, SafeNativeResult, }; -use move_binary_format::errors::PartialVMError; -use move_core_types::{ - value::{MoveStructLayout, MoveTypeLayout}, - vm_status::StatusCode, +use aptos_types::delayed_fields::{ + bytes_and_width_to_derived_string_struct, calculate_width_for_constant_string, + calculate_width_for_integer_embeded_string, string_to_bytes, u128_to_u64, DelayedFieldID, + SnapshotToStringFormula, }; +use move_binary_format::errors::PartialVMError; +use move_core_types::vm_status::StatusCode; use move_vm_runtime::native_functions::NativeFunction; use move_vm_types::{ loaded_data::runtime_types::Type, values::{Struct, StructRef, Value}, }; use smallvec::{smallvec, SmallVec}; -use std::{cell::RefMut, collections::VecDeque, ops::Deref}; +use std::{cell::RefMut, collections::VecDeque}; /// The generic type supplied to aggregator snapshots is not supported. 
pub const EUNSUPPORTED_AGGREGATOR_SNAPSHOT_TYPE: u64 = 0x03_0005; @@ -55,18 +58,9 @@ pub const EINPUT_STRING_LENGTH_TOO_LARGE: u64 = 0x03_0008; /// and any calls will raise this error. pub const EAGGREGATOR_FUNCTION_NOT_YET_SUPPORTED: u64 = 0x03_0009; -pub const STRING_SNAPSHOT_INPUT_MAX_LENGTH: usize = 256; - -/// Checks if the type argument `type_arg` is a string type. -fn is_string_type(context: &SafeNativeContext, type_arg: &Type) -> SafeNativeResult { - let ty = context.deref().type_to_fully_annotated_layout(type_arg)?; - if let MoveTypeLayout::Struct(MoveStructLayout::WithTypes { type_, .. }) = ty { - return Ok(type_.name.as_str() == "String" - && type_.module.as_str() == "string" - && type_.address == AccountAddress::ONE); - } - Ok(false) -} +/// The maximum length of the input string for derived string snapshot. +/// If we want to increase this, we need to modify BITS_FOR_SIZE in types/src/delayed_fields.rs. +pub const DERIVED_STRING_INPUT_MAX_LENGTH: usize = 1024; /// Given the native function argument and a type, returns a tuple of its /// fields: (`aggregator id`, `max_value`). @@ -76,11 +70,13 @@ pub fn get_aggregator_fields_by_type( ) -> SafeNativeResult<(u128, u128)> { match ty_arg { Type::U128 => { - let (id, max_value) = get_aggregator_fields_u128(agg)?; + let id = get_struct_field(agg, AGG_VALUE_FIELD_INDEX)?.value_as::()?; + let max_value = get_struct_field(agg, AGG_MAX_VALUE_FIELD_INDEX)?.value_as::()?; Ok((id, max_value)) }, Type::U64 => { - let (id, max_value) = get_aggregator_fields_u64(agg)?; + let id = get_struct_field(agg, AGG_VALUE_FIELD_INDEX)?.value_as::()?; + let max_value = get_struct_field(agg, AGG_MAX_VALUE_FIELD_INDEX)?.value_as::()?; Ok((id as u128, max_value as u128)) }, _ => Err(SafeNativeError::Abort { @@ -89,99 +85,77 @@ pub fn get_aggregator_fields_by_type( } } -/// Given the list of native function arguments and a type, pop the next argument if it is of given type. 
-pub fn pop_value_by_type(ty_arg: &Type, args: &mut VecDeque) -> SafeNativeResult { +pub fn get_snapshot_field_by_type( + ty_arg: &Type, + agg_snapshot: &StructRef, +) -> SafeNativeResult { match ty_arg { - Type::U128 => Ok(safely_pop_arg!(args, u128)), - Type::U64 => Ok(safely_pop_arg!(args, u64) as u128), + Type::U128 => { + let value = get_struct_field(agg_snapshot, AGG_SNAPSHOT_VALUE_FIELD_INDEX)? + .value_as::()?; + Ok(value) + }, + Type::U64 => { + let value = get_struct_field(agg_snapshot, AGG_SNAPSHOT_VALUE_FIELD_INDEX)? + .value_as::()?; + Ok(value as u128) + }, _ => Err(SafeNativeError::Abort { - abort_code: EUNSUPPORTED_AGGREGATOR_TYPE, + abort_code: EUNSUPPORTED_AGGREGATOR_SNAPSHOT_TYPE, }), } } -pub fn create_value_by_type(ty_arg: &Type, value: u128) -> SafeNativeResult { +pub fn get_derived_string_field(derived_string: &StructRef) -> SafeNativeResult> { + Ok(string_to_bytes( + get_struct_field(derived_string, DERIVED_STRING_VALUE_FIELD_INDEX)?.value_as::()?, + ) + .map_err(PartialVMError::from)?) +} + +pub fn get_width_by_type(ty_arg: &Type, error_code_if_incorrect: u64) -> SafeNativeResult { match ty_arg { - Type::U128 => Ok(Value::u128(value)), - Type::U64 => Ok(Value::u64(u128_to_u64(value)?)), + Type::U128 => Ok(16), + Type::U64 => Ok(8), _ => Err(SafeNativeError::Abort { - abort_code: EUNSUPPORTED_AGGREGATOR_TYPE, + abort_code: error_code_if_incorrect, }), } } -// To avoid checking is_string_type multiple times, check type_arg only once, and convert into this enum -enum SnapshotType { - U128, - U64, - String, -} - -impl SnapshotType { - fn from_ty_arg(context: &SafeNativeContext, ty_arg: &Type) -> SafeNativeResult { - match ty_arg { - Type::U128 => Ok(Self::U128), - Type::U64 => Ok(Self::U64), - _ => { - // Check if the type is a string - if is_string_type(context, ty_arg)? 
{ - Ok(Self::String) - } else { - // If not a string, return an error - Err(SafeNativeError::Abort { - abort_code: EUNSUPPORTED_AGGREGATOR_SNAPSHOT_TYPE, - }) - } - }, - } - } - - pub fn pop_snapshot_field_by_type( - &self, - args: &mut VecDeque, - ) -> SafeNativeResult { - self.parse_snapshot_value_by_type(aggregator_snapshot_field_value(&safely_pop_arg!( - args, StructRef - ))?) - } - - pub fn pop_snapshot_value_by_type( - &self, - args: &mut VecDeque, - ) -> SafeNativeResult { - match self { - SnapshotType::U128 => Ok(SnapshotValue::Integer(safely_pop_arg!(args, u128))), - SnapshotType::U64 => Ok(SnapshotValue::Integer(safely_pop_arg!(args, u64) as u128)), - SnapshotType::String => { - let input = string_to_bytes(safely_pop_arg!(args, Struct))?; - Ok(SnapshotValue::String(input)) - }, - } +/// Given the list of native function arguments and a type, pop the next argument if it is of given type. +pub fn pop_value_by_type( + ty_arg: &Type, + args: &mut VecDeque, + error_code_if_incorrect: u64, +) -> SafeNativeResult { + match ty_arg { + Type::U128 => Ok(safely_pop_arg!(args, u128)), + Type::U64 => Ok(safely_pop_arg!(args, u64) as u128), + _ => Err(SafeNativeError::Abort { + abort_code: error_code_if_incorrect, + }), } +} - pub fn parse_snapshot_value_by_type(&self, value: Value) -> SafeNativeResult { - // Simpler to wrap to be able to reuse safely_pop_arg functions - self.pop_snapshot_value_by_type(&mut VecDeque::from([value])) +pub fn create_value_by_type( + ty_arg: &Type, + value: u128, + error_code_if_incorrect: u64, +) -> SafeNativeResult { + match ty_arg { + Type::U128 => Ok(Value::u128(value)), + Type::U64 => Ok(Value::u64( + u128_to_u64(value).map_err(PartialVMError::from)?, + )), + _ => Err(SafeNativeError::Abort { + abort_code: error_code_if_incorrect, + }), } +} - pub fn create_snapshot_value_by_type(&self, value: SnapshotValue) -> SafeNativeResult { - match (self, value) { - (SnapshotType::U128, SnapshotValue::Integer(v)) => Ok(Value::u128(v)), - 
(SnapshotType::U64, SnapshotValue::Integer(v)) => Ok(Value::u64(u128_to_u64(v)?)), - (SnapshotType::String, value) => { - Ok(Value::struct_(Struct::pack(vec![Value::vector_u8( - match value { - SnapshotValue::String(v) => v, - SnapshotValue::Integer(v) => to_utf8_bytes(v), - }, - )]))) - }, - // Type cannot be Integer, if value is String - _ => Err(SafeNativeError::Abort { - abort_code: EUNSUPPORTED_AGGREGATOR_SNAPSHOT_TYPE, - }), - } - } +pub fn create_string_value(value: Vec) -> Value { + Value::struct_(Struct::pack(vec![Value::vector_u8(value)])) } fn get_context_data<'t, 'b>( @@ -219,7 +193,8 @@ fn create_aggregator_impl( ) -> SafeNativeResult> { let value_field_value = if let Some((resolver, mut delayed_field_data)) = get_context_data(context) { - let id = resolver.generate_delayed_field_id(); + let width = get_width_by_type(ty_arg, EUNSUPPORTED_AGGREGATOR_TYPE)?; + let id = resolver.generate_delayed_field_id(width); delayed_field_data.create_new_aggregator(id); id.as_u64() as u128 } else { @@ -227,8 +202,8 @@ fn create_aggregator_impl( }; Ok(smallvec![Value::struct_(Struct::pack(vec![ - create_value_by_type(ty_arg, value_field_value)?, - create_value_by_type(ty_arg, max_value)?, + create_value_by_type(ty_arg, value_field_value, EUNSUPPORTED_AGGREGATOR_TYPE)?, + create_value_by_type(ty_arg, max_value, EUNSUPPORTED_AGGREGATOR_TYPE)?, ]))]) } @@ -247,7 +222,7 @@ fn native_create_aggregator( debug_assert_eq!(ty_args.len(), 1); context.charge(AGGREGATOR_V2_CREATE_AGGREGATOR_BASE)?; - let max_value = pop_value_by_type(&ty_args[0], &mut args)?; + let max_value = pop_value_by_type(&ty_args[0], &mut args, EUNSUPPORTED_AGGREGATOR_TYPE)?; create_aggregator_impl(context, &ty_args[0], max_value) } @@ -296,7 +271,7 @@ fn native_try_add( debug_assert_eq!(ty_args.len(), 1); context.charge(AGGREGATOR_V2_TRY_ADD_BASE)?; - let input = pop_value_by_type(&ty_args[0], &mut args)?; + let input = pop_value_by_type(&ty_args[0], &mut args, EUNSUPPORTED_AGGREGATOR_TYPE)?; let 
agg_struct = safely_pop_arg!(args, StructRef); let (agg_value, agg_max_value) = get_aggregator_fields_by_type(&ty_args[0], &agg_struct)?; @@ -312,7 +287,10 @@ fn native_try_add( let math = BoundedMath::new(agg_max_value); match math.unsigned_add(agg_value, input) { Ok(sum) => { - set_aggregator_value_field(&agg_struct, create_value_by_type(&ty_args[0], sum)?)?; + set_aggregator_value_field( + &agg_struct, + create_value_by_type(&ty_args[0], sum, EUNSUPPORTED_AGGREGATOR_TYPE)?, + )?; true }, Err(_) => false, @@ -336,7 +314,7 @@ fn native_try_sub( debug_assert_eq!(ty_args.len(), 1); context.charge(AGGREGATOR_V2_TRY_SUB_BASE)?; - let input = pop_value_by_type(&ty_args[0], &mut args)?; + let input = pop_value_by_type(&ty_args[0], &mut args, EUNSUPPORTED_AGGREGATOR_TYPE)?; let agg_struct = safely_pop_arg!(args, StructRef); let (agg_value, agg_max_value) = get_aggregator_fields_by_type(&ty_args[0], &agg_struct)?; @@ -352,7 +330,10 @@ fn native_try_sub( let math = BoundedMath::new(agg_max_value); match math.unsigned_subtract(agg_value, input) { Ok(sum) => { - set_aggregator_value_field(&agg_struct, create_value_by_type(&ty_args[0], sum)?)?; + set_aggregator_value_field( + &agg_struct, + create_value_by_type(&ty_args[0], sum, EUNSUPPORTED_AGGREGATOR_TYPE)?, + )?; true }, Err(_) => false, @@ -391,7 +372,11 @@ fn native_read( StatusCode::UNKNOWN_INVARIANT_VIOLATION_ERROR, ))); }; - Ok(smallvec![create_value_by_type(&ty_args[0], result_value)?]) + Ok(smallvec![create_value_by_type( + &ty_args[0], + result_value, + EUNSUPPORTED_AGGREGATOR_TYPE + )?]) } /*************************************************************************************************** @@ -414,20 +399,25 @@ fn native_snapshot( let result_value = if let Some((resolver, mut delayed_field_data)) = get_context_data(context) { let aggregator_id = aggregator_value_field_as_id(agg_value, resolver)?; + let width = get_width_by_type(&ty_args[0], EUNSUPPORTED_AGGREGATOR_TYPE)?; delayed_field_data - 
.snapshot(aggregator_id, agg_max_value, resolver)? + .snapshot(aggregator_id, agg_max_value, width, resolver)? .as_u64() as u128 } else { agg_value }; Ok(smallvec![Value::struct_(Struct::pack(vec![ - create_value_by_type(&ty_args[0], result_value)? + create_value_by_type( + &ty_args[0], + result_value, + EUNSUPPORTED_AGGREGATOR_SNAPSHOT_TYPE + )? ]))]) } /*************************************************************************************************** - * native fun create_snapshot(value: Element): AggregatorSnapshot + * native fun create_snapshot(value: IntElement): AggregatorSnapshot **************************************************************************************************/ fn native_create_snapshot( @@ -441,32 +431,31 @@ fn native_create_snapshot( debug_assert_eq!(args.len(), 1); context.charge(AGGREGATOR_V2_CREATE_SNAPSHOT_BASE)?; - let snapshot_type = SnapshotType::from_ty_arg(context, &ty_args[0])?; - let input = snapshot_type.pop_snapshot_value_by_type(&mut args)?; - - if let SnapshotValue::String(v) = &input { - context.charge(AGGREGATOR_V2_CREATE_SNAPSHOT_PER_BYTE * NumBytes::new(v.len() as u64))?; - if v.len() > STRING_SNAPSHOT_INPUT_MAX_LENGTH { - return Err(SafeNativeError::Abort { - abort_code: EINPUT_STRING_LENGTH_TOO_LARGE, - }); - } - } + let input = pop_value_by_type( + &ty_args[0], + &mut args, + EUNSUPPORTED_AGGREGATOR_SNAPSHOT_TYPE, + )?; let result_value = if let Some((resolver, mut delayed_field_data)) = get_context_data(context) { - let snapshot_id = delayed_field_data.create_new_snapshot(input, resolver); - SnapshotValue::Integer(snapshot_id.as_u64() as u128) + let width = get_width_by_type(&ty_args[0], EUNSUPPORTED_AGGREGATOR_TYPE)?; + let snapshot_id = delayed_field_data.create_new_snapshot(input, width, resolver); + snapshot_id.as_u64() as u128 } else { input }; Ok(smallvec![Value::struct_(Struct::pack(vec![ - snapshot_type.create_snapshot_value_by_type(result_value)? 
+ create_value_by_type( + &ty_args[0], + result_value, + EUNSUPPORTED_AGGREGATOR_SNAPSHOT_TYPE + )? ]))]) } /*************************************************************************************************** - * native fun copy_snapshot(snapshot: &AggregatorSnapshot): AggregatorSnapshot + * native fun copy_snapshot(snapshot: &AggregatorSnapshot): AggregatorSnapshot **************************************************************************************************/ fn native_copy_snapshot( @@ -502,7 +491,7 @@ fn native_copy_snapshot( } /*************************************************************************************************** - * native fun read_snapshot(snapshot: &AggregatorSnapshot): Element; + * native fun read_snapshot(snapshot: &AggregatorSnapshot): IntElement; **************************************************************************************************/ fn native_read_snapshot( @@ -516,8 +505,8 @@ fn native_read_snapshot( debug_assert_eq!(args.len(), 1); context.charge(AGGREGATOR_V2_READ_SNAPSHOT_BASE)?; - let snapshot_type = SnapshotType::from_ty_arg(context, &ty_args[0])?; - let snapshot_value = snapshot_type.pop_snapshot_field_by_type(&mut args)?; + let snapshot_value = + get_snapshot_field_by_type(&ty_args[0], &safely_pop_arg!(args, StructRef))?; let result_value = if let Some((resolver, mut delayed_field_data)) = get_context_data(context) { let aggregator_id = aggregator_snapshot_value_field_as_id(snapshot_value, resolver)?; @@ -526,56 +515,122 @@ fn native_read_snapshot( snapshot_value }; - Ok(smallvec![ - snapshot_type.create_snapshot_value_by_type(result_value)? 
- ]) + Ok(smallvec![create_value_by_type( + &ty_args[0], + result_value, + EUNSUPPORTED_AGGREGATOR_SNAPSHOT_TYPE + )?]) } /*************************************************************************************************** * native fun string_concat(before: String, snapshot: &AggregatorSnapshot, after: String): AggregatorSnapshot; **************************************************************************************************/ - fn native_string_concat( + context: &mut SafeNativeContext, + _ty_args: Vec, + _args: VecDeque, +) -> SafeNativeResult> { + abort_if_not_enabled!(context); + + // Deprecated function in favor of `derive_string_concat`. + + Err(SafeNativeError::Abort { + abort_code: EAGGREGATOR_FUNCTION_NOT_YET_SUPPORTED, + }) +} + +/*************************************************************************************************** + * native fun read_derived_string(snapshot: &DerivedStringSnapshot): String + **************************************************************************************************/ + +fn native_read_derived_string( context: &mut SafeNativeContext, ty_args: Vec, mut args: VecDeque, ) -> SafeNativeResult> { abort_if_not_enabled!(context); - debug_assert_eq!(ty_args.len(), 1); - debug_assert_eq!(args.len(), 3); - context.charge(AGGREGATOR_V2_STRING_CONCAT_BASE)?; + debug_assert_eq!(ty_args.len(), 0); + debug_assert_eq!(args.len(), 1); + context.charge(AGGREGATOR_V2_READ_SNAPSHOT_BASE)?; + + let derived_string_value = get_derived_string_field(&safely_pop_arg!(args, StructRef))?; + + let result_value = if let Some((resolver, mut delayed_field_data)) = get_context_data(context) { + let delayed_id = derived_string_value_field_as_id(derived_string_value, resolver)?; + delayed_field_data.read_derived(delayed_id, resolver)? 
+ } else { + derived_string_value + }; + + Ok(smallvec![create_string_value(result_value)]) +} + +/*************************************************************************************************** + * native fun create_derived_string(value: String): DerivedStringSnapshot + **************************************************************************************************/ - let snapshot_input_type = SnapshotType::from_ty_arg(context, &ty_args[0])?; +fn native_create_derived_string( + context: &mut SafeNativeContext, + ty_args: Vec, + mut args: VecDeque, +) -> SafeNativeResult> { + abort_if_not_enabled!(context); + + debug_assert_eq!(ty_args.len(), 0); + debug_assert_eq!(args.len(), 1); + context.charge(AGGREGATOR_V2_CREATE_SNAPSHOT_BASE)?; - // Concat works only with integer snapshot types - // This is to avoid unnecessary recursive snapshot dependencies - if !matches!(snapshot_input_type, SnapshotType::U128 | SnapshotType::U64) { + let input = string_to_bytes(safely_pop_arg!(args, Struct)).map_err(PartialVMError::from)?; + + context.charge(AGGREGATOR_V2_CREATE_SNAPSHOT_PER_BYTE * NumBytes::new(input.len() as u64))?; + if input.len() > DERIVED_STRING_INPUT_MAX_LENGTH { return Err(SafeNativeError::Abort { - abort_code: EUNSUPPORTED_AGGREGATOR_SNAPSHOT_TYPE, + abort_code: EINPUT_STRING_LENGTH_TOO_LARGE, }); } + let result_value = if let Some((resolver, mut delayed_field_data)) = get_context_data(context) { + let snapshot_id = delayed_field_data.create_new_derived(input, resolver)?; + snapshot_id + .into_derived_string_struct() + .map_err(PartialVMError::from)? + } else { + let width = calculate_width_for_constant_string(input.len()); + bytes_and_width_to_derived_string_struct(input, width).map_err(PartialVMError::from)? 
+ }; + Ok(smallvec![result_value]) +} + +/*************************************************************************************************** + * native fun derive_string_concat(before: String, snapshot: &AggregatorSnapshot, after: String): DerivedStringSnapshot; + **************************************************************************************************/ + +fn native_derive_string_concat( + context: &mut SafeNativeContext, + ty_args: Vec, + mut args: VecDeque, +) -> SafeNativeResult> { + abort_if_not_enabled!(context); + + debug_assert_eq!(ty_args.len(), 1); + debug_assert_eq!(args.len(), 3); + context.charge(AGGREGATOR_V2_STRING_CONCAT_BASE)?; + // popping arguments from the end - let suffix = string_to_bytes(safely_pop_arg!(args, Struct))?; + let suffix = string_to_bytes(safely_pop_arg!(args, Struct)).map_err(PartialVMError::from)?; context.charge(AGGREGATOR_V2_STRING_CONCAT_PER_BYTE * NumBytes::new(suffix.len() as u64))?; - let snapshot_value = match snapshot_input_type.pop_snapshot_field_by_type(&mut args)? 
{ - SnapshotValue::Integer(v) => v, - SnapshotValue::String(_) => { - return Err(SafeNativeError::Abort { - abort_code: EUNSUPPORTED_AGGREGATOR_SNAPSHOT_TYPE, - }) - }, - }; + let snapshot_value = + get_snapshot_field_by_type(&ty_args[0], &safely_pop_arg!(args, StructRef))?; - let prefix = string_to_bytes(safely_pop_arg!(args, Struct))?; + let prefix = string_to_bytes(safely_pop_arg!(args, Struct)).map_err(PartialVMError::from)?; context.charge(AGGREGATOR_V2_STRING_CONCAT_PER_BYTE * NumBytes::new(prefix.len() as u64))?; if prefix .len() .checked_add(suffix.len()) - .map_or(false, |v| v > STRING_SNAPSHOT_INPUT_MAX_LENGTH) + .map_or(false, |v| v > DERIVED_STRING_INPUT_MAX_LENGTH) { return Err(SafeNativeError::Abort { abort_code: EINPUT_STRING_LENGTH_TOO_LARGE, @@ -584,20 +639,37 @@ fn native_string_concat( let result_value = if let Some((resolver, mut delayed_field_data)) = get_context_data(context) { let base_id = aggregator_value_field_as_id(snapshot_value, resolver)?; - SnapshotValue::Integer( - delayed_field_data - .string_concat(base_id, prefix, suffix, resolver)? - .as_u64() as u128, - ) + delayed_field_data + .derive_string_concat(base_id, prefix, suffix, resolver)? + .into_derived_string_struct() + .map_err(PartialVMError::from)? } else { - SnapshotValue::String( - SnapshotToStringFormula::Concat { prefix, suffix }.apply_to(snapshot_value), + let snapshot_width = get_width_by_type(&ty_args[0], EUNSUPPORTED_AGGREGATOR_SNAPSHOT_TYPE)?; + let width = calculate_width_for_integer_embeded_string( + prefix.len() + suffix.len(), + DelayedFieldID::new_with_width(0, snapshot_width), ) + .map_err(PartialVMError::from)?; + let output = SnapshotToStringFormula::Concat { prefix, suffix }.apply_to(snapshot_value); + bytes_and_width_to_derived_string_struct(output, width).map_err(PartialVMError::from)? }; - Ok(smallvec![Value::struct_(Struct::pack(vec![ - SnapshotType::String.create_snapshot_value_by_type(result_value)? 
- ]))]) + Ok(smallvec![result_value]) +} + +#[test] +fn test_max_size_fits() { + DelayedFieldID::new_with_width( + 0, + u32::try_from( + (calculate_width_for_integer_embeded_string( + DERIVED_STRING_INPUT_MAX_LENGTH, + DelayedFieldID::new_with_width(0, 16), + )) + .unwrap(), + ) + .unwrap(), + ); } /*************************************************************************************************** @@ -624,6 +696,9 @@ pub fn make_all( ("copy_snapshot", native_copy_snapshot), ("read_snapshot", native_read_snapshot), ("string_concat", native_string_concat), + ("read_derived_string", native_read_derived_string), + ("create_derived_string", native_create_derived_string), + ("derive_string_concat", native_derive_string_concat), ]; builder.make_named_natives(natives) } diff --git a/aptos-move/framework/src/natives/aggregator_natives/context.rs b/aptos-move/framework/src/natives/aggregator_natives/context.rs index d9f327e8641c7..aa20f28b322a0 100644 --- a/aptos-move/framework/src/natives/aggregator_natives/context.rs +++ b/aptos-move/framework/src/natives/aggregator_natives/context.rs @@ -10,7 +10,9 @@ use aptos_aggregator::{ resolver::{AggregatorV1Resolver, DelayedFieldResolver}, types::DelayedFieldID, }; -use aptos_types::{aggregator::PanicError, state_store::state_key::StateKey, write_set::WriteOp}; +use aptos_types::{ + delayed_fields::PanicError, state_store::state_key::StateKey, write_set::WriteOp, +}; use better_any::{Tid, TidAble}; use move_core_types::value::MoveTypeLayout; use std::{ @@ -152,14 +154,13 @@ impl<'a> NativeAggregatorContext<'a> { mod test { use super::*; use aptos_aggregator::{ - aggregator_v1_id_for_test, aggregator_v1_state_key_for_test, - bounded_math::SignedU128, - delayed_change::DelayedApplyChange, - delta_change_set::DeltaWithMax, - delta_math::DeltaHistory, - tests::types::FAKE_AGGREGATOR_VIEW_GEN_ID_START, - types::{DelayedFieldValue, SnapshotToStringFormula}, - FakeAggregatorView, + aggregator_v1_id_for_test, 
aggregator_v1_state_key_for_test, bounded_math::SignedU128, + delayed_change::DelayedApplyChange, delta_change_set::DeltaWithMax, + delta_math::DeltaHistory, tests::types::FAKE_AGGREGATOR_VIEW_GEN_ID_START, + types::DelayedFieldValue, FakeAggregatorView, + }; + use aptos_types::delayed_fields::{ + calculate_width_for_integer_embeded_string, SnapshotToStringFormula, }; use claims::{assert_matches, assert_ok, assert_ok_eq, assert_some_eq}; @@ -168,8 +169,8 @@ mod test { state_view.set_from_state_key(aggregator_v1_state_key_for_test(500), 150); state_view.set_from_state_key(aggregator_v1_state_key_for_test(600), 100); state_view.set_from_state_key(aggregator_v1_state_key_for_test(700), 200); - state_view.set_from_aggregator_id(DelayedFieldID::new(900), 300); - state_view.set_from_aggregator_id(DelayedFieldID::new(1000), 400); + state_view.set_from_aggregator_id(DelayedFieldID::new_with_width(900, 8), 300); + state_view.set_from_aggregator_id(DelayedFieldID::new_with_width(1000, 8), 400); state_view } @@ -282,13 +283,13 @@ mod test { fn get_test_resolver_v2() -> FakeAggregatorView { let mut state_view = FakeAggregatorView::default(); - state_view.set_from_aggregator_id(DelayedFieldID::new(900), 300); - state_view.set_from_aggregator_id(DelayedFieldID::new(1000), 400); + state_view.set_from_aggregator_id(DelayedFieldID::new_with_width(900, 8), 300); + state_view.set_from_aggregator_id(DelayedFieldID::new_with_width(1000, 8), 400); state_view } - fn id_from_fake_idx(idx: u64) -> DelayedFieldID { - DelayedFieldID::new(FAKE_AGGREGATOR_VIEW_GEN_ID_START as u64 + idx) + fn id_from_fake_idx(idx: u32, width: u32) -> DelayedFieldID { + DelayedFieldID::new_with_width(FAKE_AGGREGATOR_VIEW_GEN_ID_START + idx, width) } // All aggregators are initialized deterministically based on their ID, @@ -311,7 +312,7 @@ mod test { assert_ok_eq!( delayed_field_data.try_add_delta( - DelayedFieldID::new(900), + DelayedFieldID::new_with_width(900, 8), 900, SignedU128::Positive(200), 
context.delayed_field_resolver @@ -322,24 +323,26 @@ mod test { // failed because of wrong max_value assert!(delayed_field_data .snapshot( - DelayedFieldID::new(900), + DelayedFieldID::new_with_width(900, 8), 800, - context.delayed_field_resolver + 8, + context.delayed_field_resolver, ) .is_err()); assert_ok_eq!( delayed_field_data.snapshot( - DelayedFieldID::new(900), + DelayedFieldID::new_with_width(900, 8), 900, + 8, context.delayed_field_resolver ), - id_from_fake_idx(0) + id_from_fake_idx(0, 8) ); assert_ok_eq!( delayed_field_data.try_add_delta( - DelayedFieldID::new(900), + DelayedFieldID::new_with_width(900, 8), 900, SignedU128::Negative(501), context.delayed_field_resolver @@ -349,7 +352,7 @@ mod test { assert_ok_eq!( delayed_field_data.try_add_delta( - DelayedFieldID::new(900), + DelayedFieldID::new_with_width(900, 8), 900, SignedU128::Positive(300), context.delayed_field_resolver @@ -359,16 +362,17 @@ mod test { assert_ok_eq!( delayed_field_data.snapshot( - DelayedFieldID::new(900), + DelayedFieldID::new_with_width(900, 8), 900, + 8, context.delayed_field_resolver ), - id_from_fake_idx(1) + id_from_fake_idx(1, 8) ); assert_ok_eq!( delayed_field_data.try_add_delta( - DelayedFieldID::new(900), + DelayedFieldID::new_with_width(900, 8), 900, SignedU128::Positive(100), context.delayed_field_resolver @@ -378,7 +382,7 @@ mod test { assert_ok_eq!( delayed_field_data.try_add_delta( - DelayedFieldID::new(900), + DelayedFieldID::new_with_width(900, 8), 900, SignedU128::Positive(51), context.delayed_field_resolver @@ -386,10 +390,10 @@ mod test { false ); - delayed_field_data.create_new_aggregator(DelayedFieldID::new(2000)); + delayed_field_data.create_new_aggregator(DelayedFieldID::new_with_width(2000, 8)); assert_ok_eq!( delayed_field_data.try_add_delta( - DelayedFieldID::new(2000), + DelayedFieldID::new_with_width(2000, 8), 2000, SignedU128::Positive(500), context.delayed_field_resolver @@ -399,36 +403,42 @@ mod test { assert_ok_eq!( delayed_field_data.snapshot( - 
DelayedFieldID::new(2000), + DelayedFieldID::new_with_width(2000, 8), 2000, + 8, context.delayed_field_resolver ), - id_from_fake_idx(2) + id_from_fake_idx(2, 8) ); + let derived_width = assert_ok!(calculate_width_for_integer_embeded_string( + "prefixsuffix".as_bytes().len(), + id_from_fake_idx(0, 8) + )) as u32; + assert_ok_eq!( - delayed_field_data.string_concat( - id_from_fake_idx(2), + delayed_field_data.derive_string_concat( + id_from_fake_idx(2, 8), "prefix".as_bytes().to_vec(), "suffix".as_bytes().to_vec(), context.delayed_field_resolver, ), - id_from_fake_idx(3) + id_from_fake_idx(3, derived_width), ); assert_ok_eq!( - delayed_field_data.string_concat( - id_from_fake_idx(0), + delayed_field_data.derive_string_concat( + id_from_fake_idx(0, 8), "prefix".as_bytes().to_vec(), "suffix".as_bytes().to_vec(), context.delayed_field_resolver, ), - id_from_fake_idx(4) + id_from_fake_idx(4, derived_width), ); assert_ok_eq!( delayed_field_data.try_add_delta( - DelayedFieldID::new(2000), + DelayedFieldID::new_with_width(2000, 8), 2000, SignedU128::Positive(1700), context.delayed_field_resolver @@ -437,7 +447,7 @@ mod test { ); assert_ok_eq!( delayed_field_data.try_add_delta( - DelayedFieldID::new(2000), + DelayedFieldID::new_with_width(2000, 8), 2000, SignedU128::Negative(501), context.delayed_field_resolver @@ -452,9 +462,9 @@ mod test { let context = NativeAggregatorContext::new([0; 32], &resolver, &resolver); test_set_up_v2(&context); let delayed_field_changes = context.into_delayed_fields(); - assert!(!delayed_field_changes.contains_key(&DelayedFieldID::new(1000))); + assert!(!delayed_field_changes.contains_key(&DelayedFieldID::new_with_width(1000, 8))); assert_some_eq!( - delayed_field_changes.get(&DelayedFieldID::new(900)), + delayed_field_changes.get(&DelayedFieldID::new_with_width(900, 8)), &DelayedChange::Apply(DelayedApplyChange::AggregatorDelta { delta: DeltaWithMax::new(SignedU128::Positive(600), 900) }), @@ -463,39 +473,45 @@ mod test { // So their 
validation validates full transaction, and it is not // needed to check aggregators too (i.e. when we do read_snapshot) assert_some_eq!( - delayed_field_changes.get(&id_from_fake_idx(0)), + delayed_field_changes.get(&id_from_fake_idx(0, 8)), &DelayedChange::Apply(DelayedApplyChange::SnapshotDelta { - base_aggregator: DelayedFieldID::new(900), + base_aggregator: DelayedFieldID::new_with_width(900, 8), delta: DeltaWithMax::new(SignedU128::Positive(200), 900) }), ); assert_some_eq!( - delayed_field_changes.get(&id_from_fake_idx(1)), + delayed_field_changes.get(&id_from_fake_idx(1, 8)), &DelayedChange::Apply(DelayedApplyChange::SnapshotDelta { - base_aggregator: DelayedFieldID::new(900), + base_aggregator: DelayedFieldID::new_with_width(900, 8), delta: DeltaWithMax::new(SignedU128::Positive(500), 900) }), ); assert_some_eq!( - delayed_field_changes.get(&DelayedFieldID::new(2000)), + delayed_field_changes.get(&DelayedFieldID::new_with_width(2000, 8)), &DelayedChange::Create(DelayedFieldValue::Aggregator(500)), ); assert_some_eq!( - delayed_field_changes.get(&id_from_fake_idx(2)), + delayed_field_changes.get(&id_from_fake_idx(2, 8)), &DelayedChange::Create(DelayedFieldValue::Snapshot(500)), ); + + let derived_width = assert_ok!(calculate_width_for_integer_embeded_string( + "prefixsuffix".as_bytes().len(), + id_from_fake_idx(0, 8) + )) as u32; + assert_some_eq!( - delayed_field_changes.get(&id_from_fake_idx(3)), + delayed_field_changes.get(&id_from_fake_idx(3, derived_width)), &DelayedChange::Create(DelayedFieldValue::Derived( "prefix500suffix".as_bytes().to_vec() )), ); assert_some_eq!( - delayed_field_changes.get(&id_from_fake_idx(4)), + delayed_field_changes.get(&id_from_fake_idx(4, derived_width)), &DelayedChange::Apply(DelayedApplyChange::SnapshotDerived { - base_snapshot: id_from_fake_idx(0), + base_snapshot: id_from_fake_idx(0, 8), formula: SnapshotToStringFormula::Concat { prefix: "prefix".as_bytes().to_vec(), suffix: "suffix".as_bytes().to_vec(), diff --git 
a/aptos-move/framework/src/natives/aggregator_natives/helpers_v1.rs b/aptos-move/framework/src/natives/aggregator_natives/helpers_v1.rs index a5c3e78d8f425..6082b2f2e0f89 100644 --- a/aptos-move/framework/src/natives/aggregator_natives/helpers_v1.rs +++ b/aptos-move/framework/src/natives/aggregator_natives/helpers_v1.rs @@ -34,8 +34,8 @@ pub(crate) fn get_handle(aggregator_table: &StructRef) -> PartialVMResult PartialVMResult { - let field_ref = aggregator.borrow_field(index)?.value_as::()?; +pub(crate) fn get_struct_field(value: &StructRef, index: usize) -> PartialVMResult { + let field_ref = value.borrow_field(index)?.value_as::()?; field_ref.read_ref() } @@ -51,10 +51,9 @@ pub(crate) fn set_aggregator_field( /// Returns ID and a limit of aggregator based on a reference to `Aggregator` Move struct. pub(crate) fn aggregator_info(aggregator: &StructRef) -> PartialVMResult<(AggregatorID, u128)> { - let handle = - get_aggregator_field(aggregator, HANDLE_FIELD_INDEX)?.value_as::()?; - let key = get_aggregator_field(aggregator, KEY_FIELD_INDEX)?.value_as::()?; - let limit = get_aggregator_field(aggregator, LIMIT_FIELD_INDEX)?.value_as::()?; + let handle = get_struct_field(aggregator, HANDLE_FIELD_INDEX)?.value_as::()?; + let key = get_struct_field(aggregator, KEY_FIELD_INDEX)?.value_as::()?; + let limit = get_struct_field(aggregator, LIMIT_FIELD_INDEX)?.value_as::()?; Ok((AggregatorID::new(TableHandle(handle), key), limit)) } diff --git a/aptos-move/framework/src/natives/aggregator_natives/helpers_v2.rs b/aptos-move/framework/src/natives/aggregator_natives/helpers_v2.rs index 571cb687527c7..8cca9ceacd1e2 100644 --- a/aptos-move/framework/src/natives/aggregator_natives/helpers_v2.rs +++ b/aptos-move/framework/src/natives/aggregator_natives/helpers_v2.rs @@ -1,19 +1,21 @@ // Copyright © Aptos Foundation // SPDX-License-Identifier: Apache-2.0 -use super::helpers_v1::{get_aggregator_field, set_aggregator_field}; -use aptos_aggregator::{ - resolver::DelayedFieldResolver, - 
types::{DelayedFieldID, SnapshotValue}, - utils::{from_utf8_bytes, u128_to_u64}, -}; +use super::helpers_v1::set_aggregator_field; +use aptos_aggregator::{resolver::DelayedFieldResolver, types::DelayedFieldID}; +use aptos_types::delayed_fields::{from_utf8_bytes, u128_to_u64}; use move_binary_format::errors::PartialVMResult; use move_vm_types::values::{StructRef, Value}; /// Indices of `value` and `limit` fields in the `Aggregator` Move /// struct. -const VALUE_FIELD_INDEX: usize = 0; -const LIMIT_FIELD_INDEX: usize = 1; +pub(crate) const AGG_VALUE_FIELD_INDEX: usize = 0; +pub(crate) const AGG_MAX_VALUE_FIELD_INDEX: usize = 1; + +pub(crate) const AGG_SNAPSHOT_VALUE_FIELD_INDEX: usize = 0; + +pub(crate) const DERIVED_STRING_VALUE_FIELD_INDEX: usize = 0; +// pub (crate) const DERIVED_STRING_PADDING_FIELD_INDEX: usize = 1; pub(crate) fn aggregator_value_field_as_id( value: u128, @@ -24,44 +26,23 @@ pub(crate) fn aggregator_value_field_as_id( } pub(crate) fn aggregator_snapshot_value_field_as_id( - value: SnapshotValue, + value: u128, resolver: &dyn DelayedFieldResolver, ) -> PartialVMResult { - match value { - SnapshotValue::Integer(v) => aggregator_value_field_as_id(v, resolver), - SnapshotValue::String(v) => { - let value_u64: u64 = from_utf8_bytes(v)?; - Ok(resolver.validate_and_convert_delayed_field_id(value_u64)?) - }, - } + aggregator_value_field_as_id(value, resolver) } -/// Given a reference to `Aggregator` Move struct, returns a tuple of its -/// fields: (`value`, `limit`). 
-pub(crate) fn get_aggregator_fields_u128(aggregator: &StructRef) -> PartialVMResult<(u128, u128)> { - let value = get_aggregator_field(aggregator, VALUE_FIELD_INDEX)?.value_as::()?; - let limit = get_aggregator_field(aggregator, LIMIT_FIELD_INDEX)?.value_as::()?; - Ok((value, limit)) +pub(crate) fn derived_string_value_field_as_id( + value: Vec, + resolver: &dyn DelayedFieldResolver, +) -> PartialVMResult { + let value_u64: u64 = from_utf8_bytes(value)?; + Ok(resolver.validate_and_convert_delayed_field_id(value_u64)?) } pub(crate) fn set_aggregator_value_field( aggregator: &StructRef, value: Value, ) -> PartialVMResult<()> { - set_aggregator_field(aggregator, VALUE_FIELD_INDEX, value) -} - -/// Given a reference to `Aggregator` Move struct, returns a tuple of its -/// fields: (`value`, `limit`). -pub(crate) fn get_aggregator_fields_u64(aggregator: &StructRef) -> PartialVMResult<(u64, u64)> { - let value = get_aggregator_field(aggregator, VALUE_FIELD_INDEX)?.value_as::()?; - let limit = get_aggregator_field(aggregator, LIMIT_FIELD_INDEX)?.value_as::()?; - Ok((value, limit)) -} - -/// Returns ID of aggregator snapshot based on a reference to `AggregatorSnapshot` Move struct. 
-pub(crate) fn aggregator_snapshot_field_value( - aggregator_snapshot: &StructRef, -) -> PartialVMResult { - get_aggregator_field(aggregator_snapshot, VALUE_FIELD_INDEX) + set_aggregator_field(aggregator, AGG_VALUE_FIELD_INDEX, value) } diff --git a/aptos-move/mvhashmap/src/versioned_delayed_fields.rs b/aptos-move/mvhashmap/src/versioned_delayed_fields.rs index 29f7885a57ed3..002bf21d1d809 100644 --- a/aptos-move/mvhashmap/src/versioned_delayed_fields.rs +++ b/aptos-move/mvhashmap/src/versioned_delayed_fields.rs @@ -710,11 +710,10 @@ impl TVersionedDelayedFieldView mod test { use super::*; use aptos_aggregator::{ - bounded_math::SignedU128, - delta_change_set::DeltaOp, - delta_math::DeltaHistory, - types::{DelayedFieldID, SnapshotToStringFormula}, + bounded_math::SignedU128, delta_change_set::DeltaOp, delta_math::DeltaHistory, + types::DelayedFieldID, }; + use aptos_types::delayed_fields::SnapshotToStringFormula; use claims::{assert_err_eq, assert_ok_eq, assert_some}; use test_case::test_case; @@ -767,11 +766,11 @@ mod test { delta: test_delta(), })), APPLY_SNAPSHOT => Some(VersionEntry::Apply(DelayedApplyEntry::SnapshotDelta { - base_aggregator: DelayedFieldID::new(2), + base_aggregator: DelayedFieldID::new_for_test_for_u64(2), delta: test_delta(), })), APPLY_DERIVED => Some(VersionEntry::Apply(DelayedApplyEntry::SnapshotDerived { - base_snapshot: DelayedFieldID::new(3), + base_snapshot: DelayedFieldID::new_for_test_for_u64(3), formula: test_formula(), })), ESTIMATE_NO_BYPASS => Some(VersionEntry::Estimate(EstimatedEntry::NoBypass)), @@ -849,10 +848,10 @@ mod test { assert_ok_eq!( $cond, VersionedRead::DependentApply( - DelayedFieldID::new($expected_id), + DelayedFieldID::new_for_test_for_u64($expected_id), $expected_txn_index, DelayedApplyEntry::SnapshotDelta { - base_aggregator: DelayedFieldID::new($expected_id), + base_aggregator: DelayedFieldID::new_for_test_for_u64($expected_id), delta: $expected_delta } ) @@ -865,10 +864,10 @@ mod test { assert_ok_eq!( 
$cond, VersionedRead::DependentApply( - DelayedFieldID::new($expected_id), + DelayedFieldID::new_for_test_for_u64($expected_id), $expected_txn_index, DelayedApplyEntry::SnapshotDerived { - base_snapshot: DelayedFieldID::new($expected_id), + base_snapshot: DelayedFieldID::new_for_test_for_u64($expected_id), formula: $expected_formula } ) @@ -1025,7 +1024,11 @@ mod test { let mut v = VersionedValue::new(None); v.insert_speculative_value( 8, - aggregator_entry_snapshot_value_and_delta(13, test_delta(), DelayedFieldID::new(2)), + aggregator_entry_snapshot_value_and_delta( + 13, + test_delta(), + DelayedFieldID::new_for_test_for_u64(2), + ), ) .unwrap(); @@ -1084,7 +1087,7 @@ mod test { aggregator_entry_derived_value_and_delta( vec![70, 80, 90], test_formula(), - DelayedFieldID::new(3), + DelayedFieldID::new_for_test_for_u64(3), ), ) .unwrap(); diff --git a/aptos-node/src/lib.rs b/aptos-node/src/lib.rs index 5a30624975797..5834b0e5bdab8 100644 --- a/aptos-node/src/lib.rs +++ b/aptos-node/src/lib.rs @@ -20,7 +20,9 @@ use anyhow::anyhow; use aptos_admin_service::AdminService; use aptos_api::bootstrap as bootstrap_api; use aptos_build_info::build_information; -use aptos_config::config::{merge_node_config, NodeConfig, PersistableConfig}; +use aptos_config::config::{ + merge_node_config, InitialSafetyRulesConfig, NodeConfig, PersistableConfig, +}; use aptos_dkg_runtime::start_dkg_runtime; use aptos_framework::ReleaseBundle; use aptos_jwk_consensus::start_jwk_consensus_runtime; @@ -660,7 +662,15 @@ pub fn setup_environment_and_start_node( ); // Ensure consensus key in secure DB. 
- aptos_safety_rules::safety_rules_manager::storage(&node_config.consensus.safety_rules); + if !matches!( + node_config + .consensus + .safety_rules + .initial_safety_rules_config, + InitialSafetyRulesConfig::None + ) { + aptos_safety_rules::safety_rules_manager::storage(&node_config.consensus.safety_rules); + } let vtxn_pool = VTxnPoolState::default(); let maybe_dkg_dealer_sk = diff --git a/config/src/config/network_config.rs b/config/src/config/network_config.rs index e0b006e6c4ffd..c2e9d0a280989 100644 --- a/config/src/config/network_config.rs +++ b/config/src/config/network_config.rs @@ -125,6 +125,8 @@ pub struct NetworkConfig { pub max_message_size: usize, /// The maximum number of parallel message deserialization tasks that can run (per application) pub max_parallel_deserialization_tasks: Option, + /// Whether or not to enable latency aware peer dialing + pub enable_latency_aware_dialing: bool, } impl Default for NetworkConfig { @@ -166,6 +168,7 @@ impl NetworkConfig { outbound_rx_buffer_size_bytes: None, outbound_tx_buffer_size_bytes: None, max_parallel_deserialization_tasks: None, + enable_latency_aware_dialing: true, }; // Configure the number of parallel deserialization tasks diff --git a/consensus/src/dag/bootstrap.rs b/consensus/src/dag/bootstrap.rs index 4e98dd70c801c..f671b957c9f05 100644 --- a/consensus/src/dag/bootstrap.rs +++ b/consensus/src/dag/bootstrap.rs @@ -46,7 +46,7 @@ use aptos_logger::{debug, info}; use aptos_reliable_broadcast::{RBNetworkSender, ReliableBroadcast}; use aptos_types::{ epoch_state::EpochState, - on_chain_config::{DagConsensusConfigV1, Features, ValidatorTxnConfig}, + on_chain_config::{DagConsensusConfigV1, FeatureFlag, Features, ValidatorTxnConfig}, validator_signer::ValidatorSigner, }; use async_trait::async_trait; @@ -651,6 +651,8 @@ pub(super) fn bootstrap_dag_for_test( UnboundedReceiver, ) { let (ordered_nodes_tx, ordered_nodes_rx) = futures_channel::mpsc::unbounded(); + let mut features = Features::default(); + 
features.enable(FeatureFlag::RECONFIGURE_WITH_DKG); let bootstraper = DagBootstrapper::new( self_peer, DagConsensusConfig::default(), @@ -669,7 +671,7 @@ pub(super) fn bootstrap_dag_for_test( false, ValidatorTxnConfig::default_enabled(), BoundedExecutor::new(2, Handle::current()), - Features::default(), + features, ); let (_base_state, handler, fetch_service) = bootstraper.full_bootstrap(); diff --git a/consensus/src/dag/rb_handler.rs b/consensus/src/dag/rb_handler.rs index 46cfeb76ed3fb..8f8afaa57aaab 100644 --- a/consensus/src/dag/rb_handler.rs +++ b/consensus/src/dag/rb_handler.rs @@ -102,7 +102,6 @@ impl NodeBroadcastHandler { vtxn.topic() ); } - let vtxn_total_bytes = node .validator_txns() .iter() diff --git a/consensus/src/dag/tests/integration_tests.rs b/consensus/src/dag/tests/integration_tests.rs index fc38b61557bbf..725300073db70 100644 --- a/consensus/src/dag/tests/integration_tests.rs +++ b/consensus/src/dag/tests/integration_tests.rs @@ -213,7 +213,6 @@ async fn test_dag_e2e() { let runtime = consensus_runtime(); let mut playground = NetworkPlayground::new(runtime.handle().clone()); let (signers, validators) = random_validator_verifier(num_nodes, None, false); - let (nodes, mut ordered_node_receivers) = bootstrap_nodes(&mut playground, signers, validators); for node in nodes { runtime.spawn(node.start()); @@ -229,7 +228,6 @@ async fn test_dag_e2e() { } let first = all_ordered.first().unwrap(); assert_gt!(first.len(), 0, "must order nodes"); - debug!("Nodes: {:?}", first); for a in all_ordered.iter() { assert_eq!(a.len(), first.len(), "length should match"); assert_eq!(a, first); diff --git a/consensus/src/payload_client/mixed.rs b/consensus/src/payload_client/mixed.rs index e2f920c2d8efb..a8b63521d1822 100644 --- a/consensus/src/payload_client/mixed.rs +++ b/consensus/src/payload_client/mixed.rs @@ -11,12 +11,14 @@ use crate::{ use aptos_consensus_types::common::{Payload, PayloadFilter}; use aptos_logger::debug; use aptos_types::{ + 
dkg::{DKGTranscript, DKGTranscriptMetadata}, on_chain_config::ValidatorTxnConfig, - validator_txn::{DummyValidatorTransaction, ValidatorTransaction}, + validator_txn::ValidatorTransaction, }; use aptos_validator_transaction_pool as vtxn_pool; use fail::fail_point; use futures::future::BoxFuture; +use move_core_types::account_address::AccountAddress; #[cfg(test)] use std::collections::HashSet; use std::{ @@ -49,13 +51,12 @@ impl MixedPayloadClient { /// When enabled in smoke tests, generate 2 random validator transactions, 1 valid, 1 invalid. fn extra_test_only_vtxns(&self) -> Vec { fail_point!("mixed_payload_client::extra_test_only_vtxns", |_| vec![ - ValidatorTransaction::DummyTopic1(DummyValidatorTransaction { - valid: true, - payload: b"P0".to_vec(), - }), - ValidatorTransaction::DummyTopic1(DummyValidatorTransaction { - valid: false, - payload: b"P1".to_vec(), + ValidatorTransaction::DKGResult(DKGTranscript { + metadata: DKGTranscriptMetadata { + epoch: 999, + author: AccountAddress::ZERO, + }, + transcript_bytes: vec![], }), ]); vec![] @@ -127,9 +128,9 @@ impl PayloadClient for MixedPayloadClient { #[tokio::test] async fn mixed_payload_client_should_prioritize_validator_txns() { let all_validator_txns = vec![ - ValidatorTransaction::dummy1(b"1".to_vec()), - ValidatorTransaction::dummy1(b"22".to_vec()), - ValidatorTransaction::dummy1(b"333".to_vec()), + ValidatorTransaction::dummy(b"1".to_vec()), + ValidatorTransaction::dummy(b"22".to_vec()), + ValidatorTransaction::dummy(b"333".to_vec()), ]; let all_user_txns = crate::test_utils::create_vec_signed_transactions(10); @@ -232,9 +233,9 @@ async fn mixed_payload_client_should_prioritize_validator_txns() { #[tokio::test] async fn mixed_payload_client_should_respect_validator_txn_feature_flag() { let all_validator_txns = vec![ - ValidatorTransaction::dummy1(b"1".to_vec()), - ValidatorTransaction::dummy1(b"22".to_vec()), - ValidatorTransaction::dummy1(b"333".to_vec()), + ValidatorTransaction::dummy(b"1".to_vec()), + 
ValidatorTransaction::dummy(b"22".to_vec()), + ValidatorTransaction::dummy(b"333".to_vec()), ]; let all_user_txns = crate::test_utils::create_vec_signed_transactions(10); diff --git a/consensus/src/round_manager_test.rs b/consensus/src/round_manager_test.rs index cbca9989ca93c..2cc270ce9ada4 100644 --- a/consensus/src/round_manager_test.rs +++ b/consensus/src/round_manager_test.rs @@ -2038,7 +2038,7 @@ fn no_vote_on_proposal_ext_when_feature_disabled() { let genesis_qc = certificate_for_genesis(); let invalid_block = Block::new_proposal_ext( - vec![ValidatorTransaction::dummy1(vec![0xFF]); 5], + vec![ValidatorTransaction::dummy(vec![0xFF]); 5], Payload::empty(false), 1, 1, @@ -2157,6 +2157,8 @@ fn no_vote_on_proposal_ext_when_receiving_limit_exceeded() { ..Default::default() }; + let mut features = Features::default(); + features.enable(FeatureFlag::RECONFIGURE_WITH_DKG); let mut nodes = NodeSetup::create_nodes( &mut playground, runtime.handle().clone(), @@ -2167,7 +2169,7 @@ fn no_vote_on_proposal_ext_when_receiving_limit_exceeded() { vtxn: vtxn_config, }), Some(local_config), - None, + Some(features), ); let node = &mut nodes[0]; let genesis_qc = certificate_for_genesis(); @@ -2184,7 +2186,7 @@ fn no_vote_on_proposal_ext_when_receiving_limit_exceeded() { .unwrap(); let block_too_many_vtxns = Block::new_proposal_ext( - vec![ValidatorTransaction::dummy1(vec![0xFF; 20]); 6], + vec![ValidatorTransaction::dummy(vec![0xFF; 20]); 6], Payload::DirectMempool(create_vec_signed_transactions(4)), 1, 1, @@ -2195,7 +2197,7 @@ fn no_vote_on_proposal_ext_when_receiving_limit_exceeded() { .unwrap(); let block_too_large = Block::new_proposal_ext( - vec![ValidatorTransaction::dummy1(vec![0xFF; 200]); 1], // total_bytes >= 200 * 1 = 200 + vec![ValidatorTransaction::dummy(vec![0xFF; 200]); 1], // total_bytes >= 200 * 1 = 200 Payload::DirectMempool(create_vec_signed_transactions(9)), // = total_bytes >= 69 * 9 = 621 1, 1, @@ -2206,7 +2208,7 @@ fn 
no_vote_on_proposal_ext_when_receiving_limit_exceeded() { .unwrap(); let block_vtxns_too_large = Block::new_proposal_ext( - vec![ValidatorTransaction::dummy1(vec![0xFF; 200]); 5], // total_bytes >= 200 * 5 = 1000 + vec![ValidatorTransaction::dummy(vec![0xFF; 200]); 5], // total_bytes >= 200 * 5 = 1000 Payload::empty(false), 1, 1, @@ -2217,7 +2219,7 @@ fn no_vote_on_proposal_ext_when_receiving_limit_exceeded() { .unwrap(); let valid_block = Block::new_proposal_ext( - vec![ValidatorTransaction::dummy1(vec![0xFF; 60]); 5], // total_bytes >= 60 * 5 = 300 + vec![ValidatorTransaction::dummy(vec![0xFF; 20]); 5], // total_bytes >= 60 * 5 = 300 Payload::DirectMempool(create_vec_signed_transactions(5)), // total_bytes >= 69 * 5 = 345 1, 1, diff --git a/consensus/src/state_computer_tests.rs b/consensus/src/state_computer_tests.rs index 3048de436aeb1..99ac2b25b6277 100644 --- a/consensus/src/state_computer_tests.rs +++ b/consensus/src/state_computer_tests.rs @@ -145,8 +145,8 @@ async fn schedule_compute_should_discover_validator_txns() { TransactionFilter::new(Filter::empty()), ); - let validator_txn_0 = ValidatorTransaction::dummy1(vec![0xFF; 99]); - let validator_txn_1 = ValidatorTransaction::dummy1(vec![0xFF; 999]); + let validator_txn_0 = ValidatorTransaction::dummy(vec![0xFF; 99]); + let validator_txn_1 = ValidatorTransaction::dummy(vec![0xFF; 999]); let block = Block::new_for_testing( HashValue::zero(), @@ -198,8 +198,8 @@ async fn commit_should_discover_validator_txns() { TransactionFilter::new(Filter::empty()), ); - let validator_txn_0 = ValidatorTransaction::dummy1(vec![0xFF; 99]); - let validator_txn_1 = ValidatorTransaction::dummy1(vec![0xFF; 999]); + let validator_txn_0 = ValidatorTransaction::dummy(vec![0xFF; 99]); + let validator_txn_1 = ValidatorTransaction::dummy(vec![0xFF; 999]); let block = Block::new_for_testing( HashValue::zero(), diff --git a/consensus/src/test_utils/mock_payload_manager.rs b/consensus/src/test_utils/mock_payload_manager.rs index 
cfbb1b35a1772..581f114d12520 100644 --- a/consensus/src/test_utils/mock_payload_manager.rs +++ b/consensus/src/test_utils/mock_payload_manager.rs @@ -68,7 +68,7 @@ impl PayloadClient for MockPayloadManager { ) -> Result<(Vec, Payload), QuorumStoreError> { // generate 1k txn is too slow with coverage instrumentation Ok(( - vec![ValidatorTransaction::dummy1(vec![0xFF; 1024])], + vec![ValidatorTransaction::dummy(vec![0xFF; 1024])], random_payload(10), )) } diff --git a/consensus/src/util/mod.rs b/consensus/src/util/mod.rs index 7e570eb11f190..a99fa1b51b859 100644 --- a/consensus/src/util/mod.rs +++ b/consensus/src/util/mod.rs @@ -14,7 +14,6 @@ pub mod time_service; pub fn is_vtxn_expected(features: &Features, vtxn: &ValidatorTransaction) -> bool { match vtxn { - ValidatorTransaction::DummyTopic1(_) | ValidatorTransaction::DummyTopic2(_) => true, ValidatorTransaction::DKGResult(_) => { features.is_enabled(FeatureFlag::RECONFIGURE_WITH_DKG) }, diff --git a/crates/aptos-jwk-consensus/src/epoch_manager.rs b/crates/aptos-jwk-consensus/src/epoch_manager.rs index 8b5497f6fcf44..308e145654391 100644 --- a/crates/aptos-jwk-consensus/src/epoch_manager.rs +++ b/crates/aptos-jwk-consensus/src/epoch_manager.rs @@ -1,11 +1,11 @@ // Copyright © Aptos Foundation use crate::{ - certified_update_producer::RealCertifiedUpdateProducer, jwk_manager::JWKManager, network::{IncomingRpcRequest, NetworkReceivers, NetworkSender}, network_interface::JWKConsensusNetworkClient, types::JWKConsensusMsg, + update_certifier::UpdateCertifier, }; use anyhow::Result; use aptos_bounded_executor::BoundedExecutor; @@ -174,13 +174,13 @@ impl EpochManager

{ Duration::from_millis(1000), BoundedExecutor::new(8, tokio::runtime::Handle::current()), ); - let qc_update_producer = RealCertifiedUpdateProducer::new(rb); + let update_certifier = UpdateCertifier::new(rb); let jwk_consensus_manager = JWKManager::new( self.consensus_key.clone(), self.my_addr, epoch_state.clone(), - Arc::new(qc_update_producer), + Arc::new(update_certifier), self.vtxn_pool.clone(), ); diff --git a/crates/aptos-jwk-consensus/src/jwk_manager/mod.rs b/crates/aptos-jwk-consensus/src/jwk_manager/mod.rs index b0722a1070a50..b098c6985242c 100644 --- a/crates/aptos-jwk-consensus/src/jwk_manager/mod.rs +++ b/crates/aptos-jwk-consensus/src/jwk_manager/mod.rs @@ -1,10 +1,10 @@ // Copyright © Aptos Foundation use crate::{ - certified_update_producer::CertifiedUpdateProducer, jwk_observer::JWKObserver, network::IncomingRpcRequest, types::{JWKConsensusMsg, ObservedUpdate, ObservedUpdateResponse}, + update_certifier::TUpdateCertifier, }; use anyhow::{anyhow, bail, Result}; use aptos_channels::{aptos_channel, message_queues::QueueStyle}; @@ -42,7 +42,7 @@ pub struct JWKManager { consensus_key: Arc, /// The sub-process that collects JWK updates from peers and aggregate them into a quorum-certified JWK update. - certified_update_producer: Arc, + update_certifier: Arc, /// When a quorum-certified JWK update is available, use this to put it into the validator transaction pool. vtxn_pool: VTxnPoolState, @@ -53,7 +53,8 @@ pub struct JWKManager { /// Whether a CLOSE command has been received. 
stopped: bool, - qc_update_tx: Option>, + qc_update_tx: aptos_channel::Sender, + qc_update_rx: aptos_channel::Receiver, jwk_observers: Vec, } @@ -62,18 +63,20 @@ impl JWKManager { consensus_key: Arc, my_addr: AccountAddress, epoch_state: Arc, - certified_update_producer: Arc, + update_certifier: Arc, vtxn_pool: VTxnPoolState, ) -> Self { + let (qc_update_tx, qc_update_rx) = aptos_channel::new(QueueStyle::KLAST, 1, None); Self { consensus_key, my_addr, epoch_state, - certified_update_producer, + update_certifier, vtxn_pool, states_by_issuer: HashMap::default(), stopped: false, - qc_update_tx: None, + qc_update_tx, + qc_update_rx, jwk_observers: vec![], } } @@ -88,8 +91,6 @@ impl JWKManager { ) { self.reset_with_on_chain_state(observed_jwks.unwrap_or_default().into_providers_jwks()) .unwrap(); - let (qc_update_tx, mut qc_update_rx) = aptos_channel::new(QueueStyle::FIFO, 100, None); - self.qc_update_tx = Some(qc_update_tx); let (local_observation_tx, mut local_observation_rx) = aptos_channel::new(QueueStyle::KLAST, 100, None); @@ -120,7 +121,7 @@ impl JWKManager { (_sender, msg) = rpc_req_rx.select_next_some() => { self.process_peer_request(msg) }, - qc_update = qc_update_rx.select_next_some() => { + qc_update = self.qc_update_rx.select_next_some() => { self.process_quorum_certified_update(qc_update) }, (issuer, jwks) = local_observation_rx.select_next_some() => { @@ -169,7 +170,7 @@ impl JWKManager { .consensus_key .sign(&observed) .map_err(|e| anyhow!("crypto material error occurred duing signing: {}", e))?; - let abort_handle = self.certified_update_producer.start_produce( + let abort_handle = self.update_certifier.start_produce( self.epoch_state.clone(), observed.clone(), self.qc_update_tx.clone(), @@ -201,15 +202,18 @@ impl JWKManager { .collect(); self.states_by_issuer .retain(|issuer, _| onchain_issuer_set.contains(issuer)); - for provider_jwks in on_chain_state.entries { - let x = self + for on_chain_provider_jwks in on_chain_state.entries { + let 
locally_cached = self .states_by_issuer - .get(&provider_jwks.issuer) + .get(&on_chain_provider_jwks.issuer) .and_then(|s| s.on_chain.as_ref()); - if x != Some(&provider_jwks) { + if locally_cached == Some(&on_chain_provider_jwks) { + // The on-chain update did not touch this provider. + // The corresponding local state does not have to be reset. + } else { self.states_by_issuer.insert( - provider_jwks.issuer.clone(), - PerProviderState::new(provider_jwks), + on_chain_provider_jwks.issuer.clone(), + PerProviderState::new(on_chain_provider_jwks), ); } } @@ -249,7 +253,7 @@ impl JWKManager { } } - /// Triggered once the `certified_update_producer` produced a quorum-certified update. + /// Triggered once the `update_certifier` produced a quorum-certified update. pub fn process_quorum_certified_update(&mut self, update: QuorumCertifiedUpdate) -> Result<()> { let issuer = update.update.issuer.clone(); let state = self.states_by_issuer.entry(issuer.clone()).or_default(); @@ -281,28 +285,25 @@ impl JWKManager { /// Then `JWKManager` needs to hold it. Once this resource is dropped, the corresponding QC update process will be cancelled. #[derive(Clone, Debug)] pub struct QuorumCertProcessGuard { - handle: Option, + handle: AbortHandle, } impl QuorumCertProcessGuard { pub fn new(handle: AbortHandle) -> Self { - Self { - handle: Some(handle), - } + Self { handle } } #[cfg(test)] pub fn dummy() -> Self { - Self { handle: None } + let (handle, _) = AbortHandle::new_pair(); + Self { handle } } } impl Drop for QuorumCertProcessGuard { fn drop(&mut self) { let QuorumCertProcessGuard { handle } = self; - if let Some(handle) = handle { - handle.abort(); - } + handle.abort(); } } @@ -360,6 +361,7 @@ impl ConsensusState { } } + #[cfg(test)] pub fn my_proposal_cloned(&self) -> ObservedUpdate { match self { ConsensusState::InProgress { my_proposal, .. 
} @@ -396,12 +398,6 @@ impl PerProviderState { .as_ref() .map_or(0, |provider_jwks| provider_jwks.version) } - - pub fn reset_with_onchain_state(&mut self, onchain_state: ProviderJWKs) { - if self.on_chain.as_ref() != Some(&onchain_state) { - *self = Self::new(onchain_state) - } - } } #[cfg(test)] diff --git a/crates/aptos-jwk-consensus/src/jwk_manager/tests.rs b/crates/aptos-jwk-consensus/src/jwk_manager/tests.rs index eb1c7428515ee..d113a71622b66 100644 --- a/crates/aptos-jwk-consensus/src/jwk_manager/tests.rs +++ b/crates/aptos-jwk-consensus/src/jwk_manager/tests.rs @@ -1,11 +1,12 @@ // Copyright © Aptos Foundation use crate::{ - certified_update_producer::CertifiedUpdateProducer, jwk_manager::{ConsensusState, JWKManager, PerProviderState, QuorumCertProcessGuard}, network::{DummyRpcResponseSender, IncomingRpcRequest}, types::{JWKConsensusMsg, ObservedUpdate, ObservedUpdateRequest, ObservedUpdateResponse}, + update_certifier::TUpdateCertifier, }; +use aptos_bitvec::BitVec; use aptos_channels::aptos_channel; use aptos_crypto::{ bls12381::{PrivateKey, PublicKey, Signature}, @@ -15,6 +16,7 @@ use aptos_crypto::{ use aptos_infallible::{Mutex, RwLock}; use aptos_types::{ account_address::AccountAddress, + aggregate_signature::AggregateSignature, epoch_state::EpochState, jwks::{ issuer_from_str, jwk::JWK, unsupported::UnsupportedJWK, AllProvidersJWKs, Issuer, @@ -26,7 +28,7 @@ use aptos_types::{ use aptos_validator_transaction_pool::{TransactionFilter, VTxnPoolState}; use futures_util::future::AbortHandle; use std::{ - collections::{BTreeSet, HashMap, HashSet}, + collections::{HashMap, HashSet}, sync::Arc, time::{Duration, Instant}, }; @@ -51,13 +53,13 @@ async fn test_jwk_manager_state_transition() { verifier: ValidatorVerifier::new(validator_consensus_infos.clone()), }; - let certified_update_producer = DummyCertifiedUpdateProducer::default(); + let update_certifier = DummyUpdateCertifier::default(); let vtxn_pool = VTxnPoolState::default(); let mut jwk_manager = 
JWKManager::new( private_keys[0].clone(), addrs[0], Arc::new(epoch_state), - Arc::new(certified_update_producer), + Arc::new(update_certifier), vtxn_pool.clone(), ); @@ -308,17 +310,18 @@ async fn test_jwk_manager_state_transition() { .consensus_state .my_proposal_cloned() .observed; - let multi_sig = Signature::aggregate( + let signer_bit_vec = BitVec::from(private_keys.iter().map(|_| true).collect::>()); + let sig = Signature::aggregate( private_keys .iter() .map(|sk| sk.sign(&qc_jwks_for_carl).unwrap()) .collect::>(), ) .unwrap(); + let multi_sig = AggregateSignature::new(signer_bit_vec, Some(sig)); let qc_update_for_carl = QuorumCertifiedUpdate { - authors: BTreeSet::from_iter(addrs.clone()), update: qc_jwks_for_carl, - multi_sig: multi_sig.clone(), + multi_sig, }; assert!(jwk_manager .process_quorum_certified_update(qc_update_for_carl.clone()) @@ -378,7 +381,14 @@ async fn test_jwk_manager_state_transition() { .consensus_state .my_proposal_cloned() .observed; - let multi_sig = Signature::aggregate( + let signer_bit_vec = BitVec::from( + private_keys + .iter() + .take(3) + .map(|_| true) + .collect::>(), + ); + let sig = Signature::aggregate( private_keys .iter() .take(3) @@ -386,10 +396,10 @@ async fn test_jwk_manager_state_transition() { .collect::>(), ) .unwrap(); + let multi_sig = AggregateSignature::new(signer_bit_vec, Some(sig)); let qc_update_for_alice = QuorumCertifiedUpdate { - authors: BTreeSet::from_iter(addrs[0..3].to_vec()), update: qc_jwks_for_alice, - multi_sig: multi_sig.clone(), + multi_sig, }; assert!(jwk_manager .process_quorum_certified_update(qc_update_for_alice.clone()) @@ -449,11 +459,11 @@ fn new_rpc_observation_request( } } -pub struct DummyCertifiedUpdateProducer { +pub struct DummyUpdateCertifier { pub invocations: Mutex, ProviderJWKs)>>, } -impl Default for DummyCertifiedUpdateProducer { +impl Default for DummyUpdateCertifier { fn default() -> Self { Self { invocations: Mutex::new(vec![]), @@ -461,12 +471,12 @@ impl Default for 
DummyCertifiedUpdateProducer { } } -impl CertifiedUpdateProducer for DummyCertifiedUpdateProducer { +impl TUpdateCertifier for DummyUpdateCertifier { fn start_produce( &self, epoch_state: Arc, payload: ProviderJWKs, - _agg_node_tx: Option>, + _agg_node_tx: aptos_channel::Sender, ) -> AbortHandle { self.invocations.lock().push((epoch_state, payload)); let (abort_handle, _) = AbortHandle::new_pair(); diff --git a/crates/aptos-jwk-consensus/src/jwk_observer.rs b/crates/aptos-jwk-consensus/src/jwk_observer.rs index 798869de3bf8e..8d9988ce99afa 100644 --- a/crates/aptos-jwk-consensus/src/jwk_observer.rs +++ b/crates/aptos-jwk-consensus/src/jwk_observer.rs @@ -64,7 +64,7 @@ impl JWKObserver { observation_tx: aptos_channel::Sender<(), (Issuer, Vec)>, ) -> Self { let (close_tx, close_rx) = oneshot::channel(); - let join_handle = tokio::spawn(Self::thread_main( + let join_handle = tokio::spawn(Self::start( fetch_interval, my_addr, issuer.clone(), @@ -83,7 +83,7 @@ impl JWKObserver { } } - async fn thread_main( + async fn start( fetch_interval: Duration, my_addr: AccountAddress, issuer: Issuer, diff --git a/crates/aptos-jwk-consensus/src/lib.rs b/crates/aptos-jwk-consensus/src/lib.rs index 6ed4127d58b47..2d23907e396dd 100644 --- a/crates/aptos-jwk-consensus/src/lib.rs +++ b/crates/aptos-jwk-consensus/src/lib.rs @@ -1,7 +1,8 @@ // Copyright © Aptos Foundation use crate::{ - epoch_manager::EpochManager, network::NetworkTask, network_interface::JWKConsensusNetworkClient, + epoch_manager::EpochManager, network::NetworkTask, + network_interface::JWKConsensusNetworkClient, types::JWKConsensusMsg, }; use aptos_crypto::bls12381::PrivateKey; use aptos_event_notifications::{ @@ -11,8 +12,8 @@ use aptos_network::application::interface::{NetworkClient, NetworkServiceEvents} use aptos_types::account_address::AccountAddress; use aptos_validator_transaction_pool::VTxnPoolState; use tokio::runtime::Runtime; -use types::JWKConsensusMsg; +#[allow(clippy::let_and_return)] pub fn 
start_jwk_consensus_runtime( my_addr: AccountAddress, consensus_key: PrivateKey, @@ -40,7 +41,6 @@ pub fn start_jwk_consensus_runtime( runtime } -pub mod certified_update_producer; pub mod counters; pub mod epoch_manager; pub mod jwk_manager; @@ -49,3 +49,4 @@ pub mod network; pub mod network_interface; pub mod observation_aggregation; pub mod types; +pub mod update_certifier; diff --git a/crates/aptos-jwk-consensus/src/observation_aggregation/mod.rs b/crates/aptos-jwk-consensus/src/observation_aggregation/mod.rs index ae850996fa40e..cc8dc22049404 100644 --- a/crates/aptos-jwk-consensus/src/observation_aggregation/mod.rs +++ b/crates/aptos-jwk-consensus/src/observation_aggregation/mod.rs @@ -3,30 +3,24 @@ use crate::types::{ JWKConsensusMsg, ObservedUpdate, ObservedUpdateRequest, ObservedUpdateResponse, }; -use anyhow::ensure; +use anyhow::{anyhow, ensure}; use aptos_consensus_types::common::Author; -use aptos_crypto::bls12381; use aptos_infallible::Mutex; use aptos_reliable_broadcast::BroadcastStatus; use aptos_types::{ + aggregate_signature::PartialSignatures, epoch_state::EpochState, jwks::{ProviderJWKs, QuorumCertifiedUpdate}, }; use move_core_types::account_address::AccountAddress; -use std::{collections::HashSet, sync::Arc}; +use std::{collections::BTreeSet, sync::Arc}; /// The aggregation state of reliable broadcast where a validator broadcast JWK observation requests /// and produce quorum-certified JWK updates. 
pub struct ObservationAggregationState { epoch_state: Arc, local_view: ProviderJWKs, - inner_state: Mutex, -} - -#[derive(Default)] -struct InnerState { - pub contributors: HashSet, - pub multi_sig: Option, + inner_state: Mutex, } impl ObservationAggregationState { @@ -34,7 +28,7 @@ impl ObservationAggregationState { Self { epoch_state, local_view, - inner_state: Mutex::new(InnerState::default()), + inner_state: Mutex::new(PartialSignatures::empty()), } } } @@ -64,8 +58,8 @@ impl BroadcastStatus for Arc { "adding peer observation failed with mismatched author", ); - let mut aggregator = self.inner_state.lock(); - if aggregator.contributors.contains(&sender) { + let mut partial_sigs = self.inner_state.lock(); + if partial_sigs.contains_voter(&sender) { return Ok(None); } @@ -74,33 +68,28 @@ impl BroadcastStatus for Arc { "adding peer observation failed with mismatched view" ); - // Verify the quorum-cert. + // Verify peer signature. self.epoch_state .verifier .verify(sender, &peer_view, &signature)?; // All checks passed. Aggregating. - aggregator.contributors.insert(sender); - let new_multi_sig = if let Some(existing) = aggregator.multi_sig.take() { - bls12381::Signature::aggregate(vec![existing, signature])? 
- } else { - signature - }; - - let maybe_qc_update = self + partial_sigs.add_signature(sender, signature); + let voters: BTreeSet = partial_sigs.signatures().keys().copied().collect(); + if self .epoch_state .verifier - .check_voting_power(aggregator.contributors.iter(), true) - .ok() - .map(|_| QuorumCertifiedUpdate { - authors: aggregator.contributors.clone().into_iter().collect(), - update: peer_view, - multi_sig: new_multi_sig.clone(), - }); - - aggregator.multi_sig = Some(new_multi_sig); + .check_voting_power(voters.iter(), true) + .is_err() + { + return Ok(None); + } + let multi_sig = self.epoch_state.verifier.aggregate_signatures(&partial_sigs).map_err(|e|anyhow!("adding peer observation failed with partial-to-aggregated conversion error: {e}"))?; - Ok(maybe_qc_update) + Ok(Some(QuorumCertifiedUpdate { + update: peer_view, + multi_sig, + })) } } diff --git a/crates/aptos-jwk-consensus/src/observation_aggregation/tests.rs b/crates/aptos-jwk-consensus/src/observation_aggregation/tests.rs index 60c4ca16db450..07878fbc3a654 100644 --- a/crates/aptos-jwk-consensus/src/observation_aggregation/tests.rs +++ b/crates/aptos-jwk-consensus/src/observation_aggregation/tests.rs @@ -4,11 +4,9 @@ use crate::{ observation_aggregation::ObservationAggregationState, types::{ObservedUpdate, ObservedUpdateResponse}, }; -use aptos_bitvec::BitVec; use aptos_crypto::{bls12381, SigningKey, Uniform}; use aptos_reliable_broadcast::BroadcastStatus; use aptos_types::{ - aggregate_signature::AggregateSignature, epoch_state::EpochState, jwks::{ jwk::{JWKMoveStruct, JWK}, @@ -123,19 +121,10 @@ fn test_observation_aggregation_state() { }, }); let QuorumCertifiedUpdate { - authors, update: observed, multi_sig, } = result.unwrap().unwrap(); assert_eq!(view_0, observed); - let bits: Vec = epoch_state - .verifier - .get_ordered_account_addresses() - .into_iter() - .map(|addr| authors.contains(&addr)) - .collect(); - let bit_vec = BitVec::from(bits); - let multi_sig = 
AggregateSignature::new(bit_vec, Some(multi_sig)); assert!(epoch_state .verifier .verify_multi_signatures(&observed, &multi_sig) diff --git a/crates/aptos-jwk-consensus/src/update_certifier.rs b/crates/aptos-jwk-consensus/src/update_certifier.rs new file mode 100644 index 0000000000000..b2f380f45258e --- /dev/null +++ b/crates/aptos-jwk-consensus/src/update_certifier.rs @@ -0,0 +1,64 @@ +// Copyright © Aptos Foundation + +use crate::{ + observation_aggregation::ObservationAggregationState, + types::{JWKConsensusMsg, ObservedUpdateRequest}, +}; +use aptos_channels::aptos_channel; +use aptos_reliable_broadcast::ReliableBroadcast; +use aptos_types::{ + epoch_state::EpochState, + jwks::{Issuer, ProviderJWKs, QuorumCertifiedUpdate}, +}; +use futures_util::future::{AbortHandle, Abortable}; +use std::sync::Arc; +use tokio_retry::strategy::ExponentialBackoff; + +/// A sub-process of the whole JWK consensus process. +/// Once invoked by `JWKConsensusManager` to `start_produce`, +/// it starts producing a `QuorumCertifiedUpdate` and returns an abort handle. +/// Once an `QuorumCertifiedUpdate` is available, it is sent back via a channel given earlier. 
+pub trait TUpdateCertifier: Send + Sync { + fn start_produce( + &self, + epoch_state: Arc, + payload: ProviderJWKs, + qc_update_tx: aptos_channel::Sender, + ) -> AbortHandle; +} + +pub struct UpdateCertifier { + reliable_broadcast: Arc>, +} + +impl UpdateCertifier { + pub fn new(reliable_broadcast: ReliableBroadcast) -> Self { + Self { + reliable_broadcast: Arc::new(reliable_broadcast), + } + } +} + +impl TUpdateCertifier for UpdateCertifier { + fn start_produce( + &self, + epoch_state: Arc, + payload: ProviderJWKs, + qc_update_tx: aptos_channel::Sender, + ) -> AbortHandle { + let rb = self.reliable_broadcast.clone(); + let issuer = payload.issuer.clone(); + let req = ObservedUpdateRequest { + epoch: epoch_state.epoch, + issuer: issuer.clone(), + }; + let agg_state = Arc::new(ObservationAggregationState::new(epoch_state, payload)); + let task = async move { + let qc_update = rb.broadcast(req, agg_state).await; + let _ = qc_update_tx.push(issuer, qc_update); + }; + let (abort_handle, abort_registration) = AbortHandle::new_pair(); + tokio::spawn(Abortable::new(task, abort_registration)); + abort_handle + } +} diff --git a/crates/validator-transaction-pool/src/tests.rs b/crates/validator-transaction-pool/src/tests.rs index 58e01d70de0e2..bcf0414ee9788 100644 --- a/crates/validator-transaction-pool/src/tests.rs +++ b/crates/validator-transaction-pool/src/tests.rs @@ -3,9 +3,10 @@ use crate::{TransactionFilter, VTxnPoolState}; use aptos_channels::{aptos_channel, message_queues::QueueStyle}; use aptos_crypto::hash::CryptoHash; -use aptos_types::validator_txn::{ - Topic::{DUMMY1, DUMMY2}, - ValidatorTransaction, +use aptos_types::{ + dkg::DKGTranscript, + jwks::{dummy_issuer, QuorumCertifiedUpdate}, + validator_txn::{Topic, ValidatorTransaction}, }; use futures_util::StreamExt; use std::{ @@ -19,12 +20,20 @@ use tokio::time::timeout; #[test] fn txn_pull_order_should_be_fifo_except_in_topic_overwriting() { let pool = VTxnPoolState::default(); - let txn_0 = 
ValidatorTransaction::dummy2(b"txn0".to_vec()); - let txn_1 = ValidatorTransaction::dummy1(b"txn1".to_vec()); - let txn_2 = ValidatorTransaction::dummy2(b"txn2".to_vec()); - let _guard_0 = pool.put(DUMMY2, Arc::new(txn_0.clone()), None); - let _guard_1 = pool.put(DUMMY1, Arc::new(txn_1.clone()), None); - let _guard_2 = pool.put(DUMMY2, Arc::new(txn_2.clone()), None); // txn_0 is replaced. + let txn_0 = ValidatorTransaction::DKGResult(DKGTranscript::dummy()); + let txn_1 = ValidatorTransaction::ObservedJWKUpdate(QuorumCertifiedUpdate::dummy()); + let txn_2 = ValidatorTransaction::DKGResult(DKGTranscript::dummy()); + let _guard_0 = pool.put( + Topic::JWK_CONSENSUS(dummy_issuer()), + Arc::new(txn_0.clone()), + None, + ); + let _guard_1 = pool.put(Topic::DKG, Arc::new(txn_1.clone()), None); + let _guard_2 = pool.put( + Topic::JWK_CONSENSUS(dummy_issuer()), + Arc::new(txn_2.clone()), + None, + ); // txn_0 is replaced. let pulled = pool.pull( Instant::now().add(Duration::from_secs(10)), 99, @@ -37,10 +46,14 @@ fn txn_pull_order_should_be_fifo_except_in_topic_overwriting() { #[test] fn delete_by_seq_num() { let pool = VTxnPoolState::default(); - let txn_0 = ValidatorTransaction::dummy2(b"txn0".to_vec()); - let txn_1 = ValidatorTransaction::dummy1(b"txn1".to_vec()); - let guard_0 = pool.put(DUMMY2, Arc::new(txn_0.clone()), None); - let _guard_1 = pool.put(DUMMY1, Arc::new(txn_1.clone()), None); + let txn_0 = ValidatorTransaction::ObservedJWKUpdate(QuorumCertifiedUpdate::dummy()); + let txn_1 = ValidatorTransaction::DKGResult(DKGTranscript::dummy()); + let guard_0 = pool.put( + Topic::JWK_CONSENSUS(dummy_issuer()), + Arc::new(txn_0.clone()), + None, + ); + let _guard_1 = pool.put(Topic::DKG, Arc::new(txn_1.clone()), None); drop(guard_0); let pulled = pool.pull( Instant::now().add(Duration::from_secs(10)), @@ -54,10 +67,14 @@ fn delete_by_seq_num() { #[test] fn txn_should_be_dropped_if_guard_is_dropped() { let pool = VTxnPoolState::default(); - let txn_0 = 
ValidatorTransaction::dummy2(b"txn0".to_vec()); - let txn_1 = ValidatorTransaction::dummy1(b"txn1".to_vec()); - let guard_0 = pool.put(DUMMY2, Arc::new(txn_0.clone()), None); - let guard_1 = pool.put(DUMMY1, Arc::new(txn_1.clone()), None); + let txn_0 = ValidatorTransaction::ObservedJWKUpdate(QuorumCertifiedUpdate::dummy()); + let txn_1 = ValidatorTransaction::DKGResult(DKGTranscript::dummy()); + let guard_0 = pool.put( + Topic::JWK_CONSENSUS(dummy_issuer()), + Arc::new(txn_0.clone()), + None, + ); + let guard_1 = pool.put(Topic::DKG, Arc::new(txn_1.clone()), None); drop(guard_0); drop(guard_1); let pulled = pool.pull( @@ -72,11 +89,15 @@ fn txn_should_be_dropped_if_guard_is_dropped() { #[tokio::test] async fn per_txn_pull_notification() { let pool = VTxnPoolState::default(); - let txn_0 = ValidatorTransaction::dummy2(b"txn0".to_vec()); - let txn_1 = ValidatorTransaction::dummy1(b"txn1".to_vec()); + let txn_0 = ValidatorTransaction::ObservedJWKUpdate(QuorumCertifiedUpdate::dummy()); + let txn_1 = ValidatorTransaction::DKGResult(DKGTranscript::dummy()); let (tx, mut rx) = aptos_channel::new(QueueStyle::KLAST, 1, None); - let _guard_0 = pool.put(DUMMY2, Arc::new(txn_0.clone()), None); - let _guard_1 = pool.put(DUMMY1, Arc::new(txn_1.clone()), Some(tx)); + let _guard_0 = pool.put( + Topic::JWK_CONSENSUS(dummy_issuer()), + Arc::new(txn_0.clone()), + None, + ); + let _guard_1 = pool.put(Topic::DKG, Arc::new(txn_1.clone()), Some(tx)); let notification_received = timeout(Duration::from_millis(100), rx.select_next_some()).await; assert!(notification_received.is_err()); let pulled = pool.pull( @@ -93,10 +114,14 @@ async fn per_txn_pull_notification() { #[test] fn pull_item_limit_should_be_respected() { let pool = VTxnPoolState::default(); - let txn_0 = ValidatorTransaction::dummy2(b"txn0".to_vec()); - let txn_1 = ValidatorTransaction::dummy1(b"txn1".to_vec()); - let guard_0 = pool.put(DUMMY2, Arc::new(txn_0.clone()), None); - let _guard_1 = pool.put(DUMMY1, 
Arc::new(txn_1.clone()), None); + let txn_0 = ValidatorTransaction::ObservedJWKUpdate(QuorumCertifiedUpdate::dummy()); + let txn_1 = ValidatorTransaction::DKGResult(DKGTranscript::dummy()); + let guard_0 = pool.put( + Topic::JWK_CONSENSUS(dummy_issuer()), + Arc::new(txn_0.clone()), + None, + ); + let _guard_1 = pool.put(Topic::DKG, Arc::new(txn_1.clone()), None); let pulled = pool.pull( Instant::now().add(Duration::from_secs(10)), 1, @@ -117,10 +142,14 @@ fn pull_item_limit_should_be_respected() { #[test] fn pull_size_limit_should_be_respected() { let pool = VTxnPoolState::default(); - let txn_0 = ValidatorTransaction::dummy2(vec![0xFF; 100]); - let txn_1 = ValidatorTransaction::dummy1(vec![0xFF; 100]); - let guard_0 = pool.put(DUMMY2, Arc::new(txn_0.clone()), None); - let _guard_1 = pool.put(DUMMY1, Arc::new(txn_1.clone()), None); + let txn_0 = ValidatorTransaction::dummy(vec![0xFF; 100]); + let txn_1 = ValidatorTransaction::dummy(vec![0xFF; 100]); + let guard_0 = pool.put( + Topic::JWK_CONSENSUS(dummy_issuer()), + Arc::new(txn_0.clone()), + None, + ); + let _guard_1 = pool.put(Topic::DKG, Arc::new(txn_1.clone()), None); let pulled = pool.pull( Instant::now().add(Duration::from_secs(10)), 99, @@ -141,10 +170,14 @@ fn pull_size_limit_should_be_respected() { #[test] fn pull_filter_should_be_respected() { let pool = VTxnPoolState::default(); - let txn_0 = ValidatorTransaction::dummy2(vec![0xFF; 100]); - let txn_1 = ValidatorTransaction::dummy1(vec![0xFF; 100]); - let _guard_0 = pool.put(DUMMY2, Arc::new(txn_0.clone()), None); - let _guard_1 = pool.put(DUMMY1, Arc::new(txn_1.clone()), None); + let txn_0 = ValidatorTransaction::ObservedJWKUpdate(QuorumCertifiedUpdate::dummy()); + let txn_1 = ValidatorTransaction::dummy(vec![0xFF; 100]); + let _guard_0 = pool.put( + Topic::JWK_CONSENSUS(dummy_issuer()), + Arc::new(txn_0.clone()), + None, + ); + let _guard_1 = pool.put(Topic::DKG, Arc::new(txn_1.clone()), None); let pulled = pool.pull( 
Instant::now().add(Duration::from_secs(10)), 99, diff --git a/docker/builder/docker-bake-rust-all.hcl b/docker/builder/docker-bake-rust-all.hcl index 0bd7db9079f43..83986c154fd68 100644 --- a/docker/builder/docker-bake-rust-all.hcl +++ b/docker/builder/docker-bake-rust-all.hcl @@ -70,7 +70,7 @@ target "debian-base" { dockerfile = "docker/builder/debian-base.Dockerfile" contexts = { # Run `docker buildx imagetools inspect debian:bullseye` to find the latest multi-platform hash - debian = "docker-image://debian:bullseye@sha256:71cb300d5448af821aedfe63afd55ba05f45a6a79f00dcd131b96b780bb99fe4" + debian = "docker-image://debian:bullseye@sha256:44cfd77e3dbebfa8a0545d9fd1bee6e874cf6998a3bbfb5cccf34919df4e3360" } } diff --git a/ecosystem/indexer-grpc/indexer-grpc-data-access/src/gcs.rs b/ecosystem/indexer-grpc/indexer-grpc-data-access/src/gcs.rs index 10b68c58f8ea4..4dafefabe97f1 100644 --- a/ecosystem/indexer-grpc/indexer-grpc-data-access/src/gcs.rs +++ b/ecosystem/indexer-grpc/indexer-grpc-data-access/src/gcs.rs @@ -110,13 +110,13 @@ impl From for StorageReadError { ), false => StorageReadError::PermenantError( GCS_STORAGE_NAME, - anyhow::Error::new(e).context("Failed to download object; it's permernant."), + anyhow::Error::new(e).context("Failed to download object; it's permanent."), ), }, Error::TokenSource(e) => StorageReadError::PermenantError( GCS_STORAGE_NAME, anyhow::anyhow!(e.to_string()) - .context("Failed to download object; authenication/token error."), + .context("Failed to download object; authentication/token error."), ), } } diff --git a/ecosystem/indexer-grpc/indexer-grpc-data-service/src/service.rs b/ecosystem/indexer-grpc/indexer-grpc-data-service/src/service.rs index 1a18ee381e00e..0bb2637a883f7 100644 --- a/ecosystem/indexer-grpc/indexer-grpc-data-service/src/service.rs +++ b/ecosystem/indexer-grpc/indexer-grpc-data-service/src/service.rs @@ -9,7 +9,7 @@ use crate::metrics::{ }; use anyhow::{Context, Result}; use aptos_indexer_grpc_utils::{ - 
cache_operator::{CacheBatchGetStatus, CacheOperator}, + cache_operator::{CacheBatchGetStatus, CacheCoverageStatus, CacheOperator}, chunk_transactions, compression_util::{CacheEntry, StorageFormat}, config::IndexerGrpcFileStoreConfig, @@ -17,7 +17,7 @@ use aptos_indexer_grpc_utils::{ IndexerGrpcRequestMetadata, GRPC_AUTH_TOKEN_HEADER, GRPC_REQUEST_NAME_HEADER, MESSAGE_SIZE_LIMIT, }, - counters::{log_grpc_step, IndexerGrpcStep}, + counters::{log_grpc_step, IndexerGrpcStep, NUM_MULTI_FETCH_OVERLAPPED_VERSIONS}, file_store_operator::FileStoreOperator, time_diff_since_pb_timestamp_in_secs, types::RedisUrl, @@ -29,6 +29,7 @@ use aptos_protos::{ }; use futures::Stream; use prost::Message; +use redis::Client; use std::{ collections::HashMap, pin::Pin, @@ -41,6 +42,7 @@ use tokio_stream::wrappers::ReceiverStream; use tonic::{Request, Response, Status}; use tracing::{error, info, warn}; use uuid::Uuid; + type ResponseStream = Pin> + Send>>; const MOVING_AVERAGE_WINDOW_SIZE: u64 = 10_000; @@ -65,6 +67,14 @@ const REQUEST_HEADER_APTOS_API_KEY_NAME: &str = "x-aptos-api-key-name"; const RESPONSE_HEADER_APTOS_CONNECTION_ID_HEADER: &str = "x-aptos-connection-id"; const SERVICE_TYPE: &str = "data_service"; +// Number of times to retry fetching a given txn block from the stores +pub const NUM_DATA_FETCH_RETRIES: u8 = 5; + +// Max number of tasks to reach out to TXN stores with +const MAX_FETCH_TASKS_PER_REQUEST: u64 = 5; +// The number of transactions we store per txn block; this is used to determine max num of tasks +const TRANSACTIONS_PER_STORAGE_BLOCK: u64 = 1000; + pub struct RawDataServerWrapper { pub redis_client: Arc, pub file_store_config: IndexerGrpcFileStoreConfig, @@ -124,9 +134,9 @@ impl RawData for RawDataServerWrapper { }; CONNECTION_COUNT .with_label_values(&[ - request_metadata.request_api_key_name.as_str(), - request_metadata.request_email.as_str(), - request_metadata.processor_name.as_str(), + &request_metadata.request_api_key_name, + 
&request_metadata.request_email, + &request_metadata.processor_name, ]) .inc(); let request = req.into_inner(); @@ -135,7 +145,7 @@ impl RawData for RawDataServerWrapper { // Response channel to stream the data to the client. let (tx, rx) = channel(self.data_service_response_channel_size); - let mut current_version = match &request.starting_version { + let current_version = match &request.starting_version { Some(version) => *version, None => { return Result::Err(Status::aborted("Starting version is not set")); @@ -143,6 +153,7 @@ impl RawData for RawDataServerWrapper { }; let file_store_operator: Box = self.file_store_config.create(); + let file_store_operator = Arc::new(file_store_operator); // Adds tracing context for the request. log_grpc_step( @@ -155,259 +166,483 @@ impl RawData for RawDataServerWrapper { None, None, None, - Some(request_metadata.clone()), + Some(&request_metadata), ); let redis_client = self.redis_client.clone(); let cache_storage_format = self.cache_storage_format; + let request_metadata = Arc::new(request_metadata); tokio::spawn({ let request_metadata = request_metadata.clone(); async move { - let mut connection_start_time = Some(std::time::Instant::now()); - let mut transactions_count = transactions_count; - - // Establish redis connection - let conn = match redis_client.get_tokio_connection_manager().await { - Ok(conn) => conn, - Err(e) => { - ERROR_COUNT - .with_label_values(&["redis_connection_failed"]) - .inc(); - // Connection will be dropped anyway, so we ignore the error here. - let _result = tx - .send_timeout( - Err(Status::unavailable( - "[Data Service] Cannot connect to Redis; please retry.", - )), - RESPONSE_CHANNEL_SEND_TIMEOUT, - ) - .await; - error!( - error = e.to_string(), - "[Data Service] Failed to get redis connection." 
- ); - return; - }, - }; - let mut cache_operator = CacheOperator::new(conn, cache_storage_format); - - // Validate chain id - let mut metadata = file_store_operator.get_file_store_metadata().await; - while metadata.is_none() { - metadata = file_store_operator.get_file_store_metadata().await; - tracing::warn!( - "[File worker] File store metadata not found. Waiting for {} ms.", - FILE_STORE_METADATA_WAIT_MS - ); - tokio::time::sleep(std::time::Duration::from_millis( - FILE_STORE_METADATA_WAIT_MS, - )) - .await; - } + data_fetcher_task( + redis_client, + file_store_operator, + cache_storage_format, + request_metadata, + transactions_count, + tx, + current_version, + ) + .await; + } + }); - let metadata_chain_id = metadata.unwrap().chain_id; - - // Validate redis chain id. Must be present by the time it gets here - let chain_id = match cache_operator.get_chain_id().await { - Ok(chain_id) => chain_id.unwrap(), - Err(e) => { - ERROR_COUNT - .with_label_values(&["redis_get_chain_id_failed"]) - .inc(); - // Connection will be dropped anyway, so we ignore the error here. - let _result = tx - .send_timeout( - Err(Status::unavailable( - "[Data Service] Cannot get the chain id from redis; please retry.", - )), - RESPONSE_CHANNEL_SEND_TIMEOUT, - ) - .await; - error!( - error = e.to_string(), - "[Data Service] Failed to get chain id from redis." 
- ); - return; - }, - }; - - if metadata_chain_id != chain_id { - let _result = tx - .send_timeout( - Err(Status::unavailable("[Data Service] Chain ID mismatch.")), - RESPONSE_CHANNEL_SEND_TIMEOUT, - ) - .await; - error!("[Data Service] Chain ID mismatch.",); - return; - } + let output_stream = ReceiverStream::new(rx); + let mut response = Response::new(Box::pin(output_stream) as Self::GetTransactionsStream); + + response.metadata_mut().insert( + RESPONSE_HEADER_APTOS_CONNECTION_ID_HEADER, + tonic::metadata::MetadataValue::from_str(&request_metadata.request_connection_id) + .unwrap(), + ); + Ok(response) + } +} + +enum DataFetchSubTaskResult { + BatchSuccess(Vec>), + Success(Vec), + NoResults, +} + +async fn get_data_with_tasks( + start_version: u64, + transactions_count: Option, + chain_id: u64, + cache_operator: &mut CacheOperator, + file_store_operator: Arc>, + request_metadata: Arc, + cache_storage_format: StorageFormat, +) -> DataFetchSubTaskResult { + let cache_coverage_status = cache_operator + .check_cache_coverage_status(start_version) + .await; + + let num_tasks_to_use = match cache_coverage_status { + Ok(CacheCoverageStatus::DataNotReady) => return DataFetchSubTaskResult::NoResults, + Ok(CacheCoverageStatus::CacheHit(_)) => 1, + Ok(CacheCoverageStatus::CacheEvicted) => match transactions_count { + None => MAX_FETCH_TASKS_PER_REQUEST, + Some(transactions_count) => (transactions_count / TRANSACTIONS_PER_STORAGE_BLOCK) + .max(MAX_FETCH_TASKS_PER_REQUEST), + }, + Err(_) => { + error!("[Data Service] Failed to get cache coverage status."); + panic!("Failed to get cache coverage status."); + }, + }; + + let mut tasks = tokio::task::JoinSet::new(); + let mut current_version = start_version; + + for _ in 0..num_tasks_to_use { + tasks.spawn({ + // TODO: arc this instead of cloning + let mut cache_operator = cache_operator.clone(); + let file_store_operator = file_store_operator.clone(); + let request_metadata = request_metadata.clone(); + async move { + 
get_data_in_task( + current_version, + chain_id, + &mut cache_operator, + file_store_operator, + request_metadata.clone(), + cache_storage_format, + ) + .await + } + }); + // Storage is in block of 1000: we align our current version fetch to the nearest block + current_version += TRANSACTIONS_PER_STORAGE_BLOCK; + current_version -= current_version % TRANSACTIONS_PER_STORAGE_BLOCK; + } + + let mut transactions: Vec> = vec![]; + while let Some(result) = tasks.join_next().await { + match result { + Ok(DataFetchSubTaskResult::Success(txns)) => { + transactions.push(txns); + }, + Ok(DataFetchSubTaskResult::NoResults) => {}, + Err(e) => { + error!( + error = e.to_string(), + "[Data Service] Failed to get data from cache and file store." + ); + panic!("Failed to get data from cache and file store."); + }, + Ok(_) => unreachable!("Fetching from a single task will never return a batch"), + } + } + + if transactions.is_empty() { + DataFetchSubTaskResult::NoResults + } else { + DataFetchSubTaskResult::BatchSuccess(transactions) + } +} + +async fn get_data_in_task( + start_version: u64, + chain_id: u64, + cache_operator: &mut CacheOperator, + file_store_operator: Arc>, + request_metadata: Arc, + cache_storage_format: StorageFormat, +) -> DataFetchSubTaskResult { + let current_batch_start_time = std::time::Instant::now(); + + let fetched = data_fetch( + start_version, + cache_operator, + file_store_operator, + request_metadata.clone(), + cache_storage_format, + ); + + let transaction_data = match fetched.await { + Ok(TransactionsDataStatus::Success(transactions)) => transactions, + Ok(TransactionsDataStatus::AheadOfCache) => { + info!( + start_version = start_version, + request_name = request_metadata.processor_name.as_str(), + request_email = request_metadata.request_email.as_str(), + request_api_key_name = request_metadata.request_api_key_name.as_str(), + processor_name = request_metadata.processor_name.as_str(), + connection_id = 
request_metadata.request_connection_id.as_str(), + request_user_classification = request_metadata.request_user_classification.as_str(), + duration_in_secs = current_batch_start_time.elapsed().as_secs_f64(), + service_type = SERVICE_TYPE, + "[Data Service] Requested data is ahead of cache. Sleeping for {} ms.", + AHEAD_OF_CACHE_RETRY_SLEEP_DURATION_MS, + ); + ahead_of_cache_data_handling().await; + // Retry after a short sleep. + return DataFetchSubTaskResult::NoResults; + }, + Err(e) => { + ERROR_COUNT.with_label_values(&["data_fetch_failed"]).inc(); + data_fetch_error_handling(e, start_version, chain_id).await; + // Retry after a short sleep. + return DataFetchSubTaskResult::NoResults; + }, + }; + DataFetchSubTaskResult::Success(transaction_data) +} + +// This is a task spawned off for servicing a users' request +async fn data_fetcher_task( + redis_client: Arc, + file_store_operator: Arc>, + cache_storage_format: StorageFormat, + request_metadata: Arc, + transactions_count: Option, + tx: tokio::sync::mpsc::Sender>, + mut current_version: u64, +) { + let mut connection_start_time = Some(std::time::Instant::now()); + let mut transactions_count = transactions_count; + + // Establish redis connection + let conn = match redis_client.get_tokio_connection_manager().await { + Ok(conn) => conn, + Err(e) => { + ERROR_COUNT + .with_label_values(&["redis_connection_failed"]) + .inc(); + // Connection will be dropped anyway, so we ignore the error here. + let _result = tx + .send_timeout( + Err(Status::unavailable( + "[Data Service] Cannot connect to Redis; please retry.", + )), + RESPONSE_CHANNEL_SEND_TIMEOUT, + ) + .await; + error!( + error = e.to_string(), + "[Data Service] Failed to get redis connection." 
+ ); + return; + }, + }; + let mut cache_operator = CacheOperator::new(conn, cache_storage_format); + + // Validate chain id + let mut metadata = file_store_operator.get_file_store_metadata().await; + while metadata.is_none() { + metadata = file_store_operator.get_file_store_metadata().await; + tracing::warn!( + "[File worker] File store metadata not found. Waiting for {} ms.", + FILE_STORE_METADATA_WAIT_MS + ); + tokio::time::sleep(std::time::Duration::from_millis( + FILE_STORE_METADATA_WAIT_MS, + )) + .await; + } - // Data service metrics. - let mut tps_calculator = MovingAverage::new(MOVING_AVERAGE_WINDOW_SIZE); - loop { - // 1. Fetch data from cache and file store. - let mut transaction_data = match data_fetch( - current_version, - &mut cache_operator, - file_store_operator.as_ref(), - request_metadata.clone(), - cache_storage_format, - ) - .await - { - Ok(TransactionsDataStatus::Success(transactions)) => transactions, - Ok(TransactionsDataStatus::AheadOfCache) => { - ahead_of_cache_data_handling().await; - // Retry after a short sleep. - continue; - }, - Err(e) => { - ERROR_COUNT.with_label_values(&["data_fetch_failed"]).inc(); - data_fetch_error_handling(e, current_version, chain_id).await; - // Retry after a short sleep. - continue; - }, - }; - - // TODO: Unify the truncation logic for start and end. - if let Some(count) = transactions_count { - if count == 0 { - // End the data stream. - // Since the client receives all the data it requested, we don't count it as a short conneciton. - connection_start_time = None; - break; - } else if (count as usize) < transaction_data.len() { - // Trim the data to the requested end version. - transaction_data.truncate(count as usize); - transactions_count = Some(0); - } else { - transactions_count = Some(count - transaction_data.len() as u64); - } - }; - // Note: this is the protobuf encoded transaction size. 
- let bytes_ready_to_transfer = transaction_data - .iter() - .map(|t| t.encoded_len()) - .sum::(); - BYTES_READY_TO_TRANSFER_FROM_SERVER + let metadata_chain_id = metadata.unwrap().chain_id; + + // Validate redis chain id. Must be present by the time it gets here + let chain_id = match cache_operator.get_chain_id().await { + Ok(chain_id) => chain_id.unwrap(), + Err(e) => { + ERROR_COUNT + .with_label_values(&["redis_get_chain_id_failed"]) + .inc(); + // Connection will be dropped anyway, so we ignore the error here. + let _result = tx + .send_timeout( + Err(Status::unavailable( + "[Data Service] Cannot get the chain id from redis; please retry.", + )), + RESPONSE_CHANNEL_SEND_TIMEOUT, + ) + .await; + error!( + error = e.to_string(), + "[Data Service] Failed to get chain id from redis." + ); + return; + }, + }; + + if metadata_chain_id != chain_id { + let _result = tx + .send_timeout( + Err(Status::unavailable("[Data Service] Chain ID mismatch.")), + RESPONSE_CHANNEL_SEND_TIMEOUT, + ) + .await; + error!("[Data Service] Chain ID mismatch.",); + return; + } + + // Data service metrics. + let mut tps_calculator = MovingAverage::new(MOVING_AVERAGE_WINDOW_SIZE); + + loop { + // 1. Fetch data from cache and file store. + let transaction_data = match get_data_with_tasks( + current_version, + transactions_count, + chain_id, + &mut cache_operator, + file_store_operator.clone(), + request_metadata.clone(), + cache_storage_format, + ) + .await + { + DataFetchSubTaskResult::BatchSuccess(txns) => txns, + DataFetchSubTaskResult::Success(_) => { + unreachable!("Fetching from multiple tasks will never return a single vector") + }, + DataFetchSubTaskResult::NoResults => continue, + }; + + let mut transaction_data = ensure_sequential_transactions(transaction_data); + + // TODO: Unify the truncation logic for start and end. + if let Some(count) = transactions_count { + if count == 0 { + // End the data stream. 
+ // Since the client receives all the data it requested, we don't count it as a short connection. + connection_start_time = None; + break; + } else if (count as usize) < transaction_data.len() { + // Trim the data to the requested end version. + transaction_data.truncate(count as usize); + transactions_count = Some(0); + } else { + transactions_count = Some(count - transaction_data.len() as u64); + } + }; + // Note: this is the protobuf encoded transaction size. + let bytes_ready_to_transfer = transaction_data + .iter() + .map(|t| t.encoded_len()) + .sum::(); + BYTES_READY_TO_TRANSFER_FROM_SERVER + .with_label_values(&[ + &request_metadata.request_api_key_name, + &request_metadata.request_email, + &request_metadata.processor_name, + ]) + .inc_by(bytes_ready_to_transfer as u64); + // 2. Push the data to the response channel, i.e. stream the data to the client. + let current_batch_size = transaction_data.as_slice().len(); + let end_of_batch_version = transaction_data.as_slice().last().unwrap().version; + let resp_items = get_transactions_responses_builder(transaction_data, chain_id as u32); + let data_latency_in_secs = resp_items + .last() + .unwrap() + .transactions + .last() + .unwrap() + .timestamp + .as_ref() + .map(time_diff_since_pb_timestamp_in_secs); + + match channel_send_multiple_with_timeout(resp_items, tx.clone(), request_metadata.clone()) + .await + { + Ok(_) => { + PROCESSED_BATCH_SIZE + .with_label_values(&[ + request_metadata.request_api_key_name.as_str(), + request_metadata.request_email.as_str(), + request_metadata.processor_name.as_str(), + ]) + .set(current_batch_size as i64); + // TODO: Reasses whether this metric useful + LATEST_PROCESSED_VERSION_OLD + .with_label_values(&[ + request_metadata.request_api_key_name.as_str(), + request_metadata.request_email.as_str(), + request_metadata.processor_name.as_str(), + ]) + .set(end_of_batch_version as i64); + PROCESSED_VERSIONS_COUNT + .with_label_values(&[ + 
request_metadata.request_api_key_name.as_str(), + request_metadata.request_email.as_str(), + request_metadata.processor_name.as_str(), + ]) + .inc_by(current_batch_size as u64); + if let Some(data_latency_in_secs) = data_latency_in_secs { + PROCESSED_LATENCY_IN_SECS .with_label_values(&[ request_metadata.request_api_key_name.as_str(), request_metadata.request_email.as_str(), request_metadata.processor_name.as_str(), ]) - .inc_by(bytes_ready_to_transfer as u64); - // 2. Push the data to the response channel, i.e. stream the data to the client. - let current_batch_size = transaction_data.as_slice().len(); - let end_of_batch_version = transaction_data.as_slice().last().unwrap().version; - let resp_items = - get_transactions_responses_builder(transaction_data, chain_id as u32); - let data_latency_in_secs = resp_items - .last() - .unwrap() - .transactions - .last() - .unwrap() - .timestamp - .as_ref() - .map(time_diff_since_pb_timestamp_in_secs); - - match channel_send_multiple_with_timeout( - resp_items, - tx.clone(), - request_metadata.clone(), - ) - .await - { - Ok(_) => { - PROCESSED_BATCH_SIZE - .with_label_values(&[ - request_metadata.request_api_key_name.as_str(), - request_metadata.request_email.as_str(), - request_metadata.processor_name.as_str(), - ]) - .set(current_batch_size as i64); - // TODO: Reasses whether this metric useful - LATEST_PROCESSED_VERSION_OLD - .with_label_values(&[ - request_metadata.request_api_key_name.as_str(), - request_metadata.request_email.as_str(), - request_metadata.processor_name.as_str(), - ]) - .set(end_of_batch_version as i64); - PROCESSED_VERSIONS_COUNT - .with_label_values(&[ - request_metadata.request_api_key_name.as_str(), - request_metadata.request_email.as_str(), - request_metadata.processor_name.as_str(), - ]) - .inc_by(current_batch_size as u64); - if let Some(data_latency_in_secs) = data_latency_in_secs { - PROCESSED_LATENCY_IN_SECS - .with_label_values(&[ - request_metadata.request_api_key_name.as_str(), - 
request_metadata.request_email.as_str(), - request_metadata.processor_name.as_str(), - ]) - .set(data_latency_in_secs); - PROCESSED_LATENCY_IN_SECS_ALL - .with_label_values(&[request_metadata - .request_user_classification - .as_str()]) - .observe(data_latency_in_secs); - } - }, - Err(SendTimeoutError::Timeout(_)) => { - warn!("[Data Service] Receiver is full; exiting."); - break; - }, - Err(SendTimeoutError::Closed(_)) => { - warn!("[Data Service] Receiver is closed; exiting."); - break; - }, - } - // 3. Update the current version and record current tps. - tps_calculator.tick_now(current_batch_size as u64); - current_version = end_of_batch_version + 1; + .set(data_latency_in_secs); + PROCESSED_LATENCY_IN_SECS_ALL + .with_label_values(&[request_metadata.request_user_classification.as_str()]) + .observe(data_latency_in_secs); } - info!( - request_name = request_metadata.processor_name.as_str(), - request_email = request_metadata.request_email.as_str(), - request_api_key_name = request_metadata.request_api_key_name.as_str(), - processor_name = request_metadata.processor_name.as_str(), - connection_id = request_metadata.request_connection_id.as_str(), - request_user_classification = - request_metadata.request_user_classification.as_str(), - request_user_classification = - request_metadata.request_user_classification.as_str(), - service_type = SERVICE_TYPE, - "[Data Service] Client disconnected." + }, + Err(SendTimeoutError::Timeout(_)) => { + warn!("[Data Service] Receiver is full; exiting."); + break; + }, + Err(SendTimeoutError::Closed(_)) => { + warn!("[Data Service] Receiver is closed; exiting."); + break; + }, + } + // 3. Update the current version and record current tps. 
+ tps_calculator.tick_now(current_batch_size as u64); + current_version = end_of_batch_version + 1; + } + info!( + request_name = request_metadata.processor_name.as_str(), + request_email = request_metadata.request_email.as_str(), + request_api_key_name = request_metadata.request_api_key_name.as_str(), + processor_name = request_metadata.processor_name.as_str(), + connection_id = request_metadata.request_connection_id.as_str(), + request_user_classification = request_metadata.request_user_classification.as_str(), + request_user_classification = request_metadata.request_user_classification.as_str(), + service_type = SERVICE_TYPE, + "[Data Service] Client disconnected." + ); + if let Some(start_time) = connection_start_time { + if start_time.elapsed().as_secs() < SHORT_CONNECTION_DURATION_IN_SECS { + SHORT_CONNECTION_COUNT + .with_label_values(&[ + request_metadata.request_api_key_name.as_str(), + request_metadata.request_email.as_str(), + request_metadata.processor_name.as_str(), + ]) + .inc(); + } + } +} + +/// Takes in multiple batches of transactions, and: +/// 1. De-dupes in the case of overlap (but log to prom metric) +/// 2. 
Panics in cases of gaps +fn ensure_sequential_transactions(mut batches: Vec>) -> Vec { + // If there's only one, no sorting required + if batches.len() == 1 { + return batches.pop().unwrap(); + } + + // Sort by the first version per batch, ascending + batches.sort_by(|a, b| a.first().unwrap().version.cmp(&b.first().unwrap().version)); + let first_version = batches.first().unwrap().first().unwrap().version; + let last_version = batches.last().unwrap().last().unwrap().version; + let mut transactions: Vec = vec![]; + + let mut prev_start = None; + let mut prev_end = None; + for mut batch in batches { + let mut start_version = batch.first().unwrap().version; + let end_version = batch.last().unwrap().version; + if prev_start.is_some() { + let prev_start = prev_start.unwrap(); + let prev_end = prev_end.unwrap(); + // If this batch is fully contained within the previous batch, skip it + if prev_start <= start_version && prev_end >= end_version { + NUM_MULTI_FETCH_OVERLAPPED_VERSIONS + .with_label_values(&[SERVICE_TYPE, &"full"]) + .inc_by(end_version - start_version); + continue; + } + // If this batch overlaps with the previous batch, combine them + if prev_end >= start_version { + NUM_MULTI_FETCH_OVERLAPPED_VERSIONS + .with_label_values(&[SERVICE_TYPE, &"partial"]) + .inc_by(prev_end - start_version + 1); + tracing::debug!( + batch_first_version = first_version, + batch_last_version = last_version, + start_version = start_version, + end_version = end_version, + prev_start = ?prev_start, + prev_end = prev_end, + "[Filestore] Overlapping version data" ); - if let Some(start_time) = connection_start_time { - if start_time.elapsed().as_secs() < SHORT_CONNECTION_DURATION_IN_SECS { - SHORT_CONNECTION_COUNT - .with_label_values(&[ - request_metadata.request_api_key_name.as_str(), - request_metadata.request_email.as_str(), - request_metadata.processor_name.as_str(), - ]) - .inc(); - } - } + batch.drain(0..(prev_end - start_version + 1) as usize); + start_version = 
batch.first().unwrap().version; } - }); - let output_stream = ReceiverStream::new(rx); - let mut response = Response::new(Box::pin(output_stream) as Self::GetTransactionsStream); + // Otherwise there is a gap + if prev_end + 1 != start_version { + NUM_MULTI_FETCH_OVERLAPPED_VERSIONS + .with_label_values(&[SERVICE_TYPE, &"gap"]) + .inc_by(prev_end - start_version + 1); + + tracing::error!( + batch_first_version = first_version, + batch_last_version = last_version, + start_version = start_version, + end_version = end_version, + prev_start = ?prev_start, + prev_end = prev_end, + "[Filestore] Gaps or dupes in processing version data" + ); + panic!("[Filestore] Gaps in processing data batch_first_version: {}, batch_last_version: {}, start_version: {}, end_version: {}, prev_start: {:?}, prev_end: {:?}", + first_version, + last_version, + start_version, + end_version, + prev_start, + prev_end, + ); + } + } - response.metadata_mut().insert( - RESPONSE_HEADER_APTOS_CONNECTION_ID_HEADER, - tonic::metadata::MetadataValue::from_str( - request_metadata.request_connection_id.as_str(), - ) - .unwrap(), - ); - Ok(response) + prev_start = Some(start_version); + prev_end = Some(end_version); + transactions.extend(batch); } + + transactions } /// Builds the response for the get transactions request. Partial batch is ok, i.e., a batch with transactions < 1000. @@ -425,13 +660,31 @@ fn get_transactions_responses_builder( .collect() } +// This is a CPU bound operation, so we spawn_blocking +async fn deserialize_cached_transactions( + transactions: Vec>, + storage_format: StorageFormat, +) -> anyhow::Result> { + let task = tokio::task::spawn_blocking(move || { + transactions + .into_iter() + .map(|transaction| { + let cache_entry = CacheEntry::new(transaction, storage_format); + cache_entry.into_transaction() + }) + .collect::>() + }) + .await; + task.context("Transaction bytes to CacheEntry deserialization task failed") +} + /// Fetches data from cache or the file store. 
It returns the data if it is ready in the cache or file store. /// Otherwise, it returns the status of the data fetching. async fn data_fetch( starting_version: u64, cache_operator: &mut CacheOperator, - file_store_operator: &dyn FileStoreOperator, - request_metadata: IndexerGrpcRequestMetadata, + file_store_operator: Arc>, + request_metadata: Arc, storage_format: StorageFormat, ) -> anyhow::Result { let current_batch_start_time = std::time::Instant::now(); @@ -450,13 +703,9 @@ async fn data_fetch( .sum::(); let num_of_transactions = transactions.len(); let duration_in_secs = current_batch_start_time.elapsed().as_secs_f64(); - let transactions = transactions - .into_iter() - .map(|transaction| { - let cache_entry = CacheEntry::new(transaction, storage_format); - cache_entry.into_transaction() - }) - .collect::>(); + + let transactions = + deserialize_cached_transactions(transactions, storage_format).await?; let start_version_timestamp = transactions.first().unwrap().timestamp.as_ref(); let end_version_timestamp = transactions.last().unwrap().timestamp.as_ref(); @@ -470,7 +719,7 @@ async fn data_fetch( Some(duration_in_secs), Some(size_in_bytes), Some(num_of_transactions as i64), - Some(request_metadata.clone()), + Some(&request_metadata), ); log_grpc_step( SERVICE_TYPE, @@ -482,53 +731,64 @@ async fn data_fetch( Some(decoding_start_time.elapsed().as_secs_f64()), Some(size_in_bytes), Some(num_of_transactions as i64), - Some(request_metadata.clone()), + Some(&request_metadata), ); Ok(TransactionsDataStatus::Success(transactions)) }, Ok(CacheBatchGetStatus::EvictedFromCache) => { - // Data is evicted from the cache. Fetch from file store. 
- let (transactions, io_duration, decoding_duration) = file_store_operator - .get_transactions_with_durations(starting_version) - .await?; - let size_in_bytes = transactions - .iter() - .map(|transaction| transaction.encoded_len()) - .sum::(); - let num_of_transactions = transactions.len(); - let start_version_timestamp = transactions.first().unwrap().timestamp.as_ref(); - let end_version_timestamp = transactions.last().unwrap().timestamp.as_ref(); - log_grpc_step( - SERVICE_TYPE, - IndexerGrpcStep::DataServiceDataFetchedFilestore, - Some(starting_version as i64), - Some(starting_version as i64 + num_of_transactions as i64 - 1), - start_version_timestamp, - end_version_timestamp, - Some(io_duration), - Some(size_in_bytes), - Some(num_of_transactions as i64), - Some(request_metadata.clone()), - ); - log_grpc_step( - SERVICE_TYPE, - IndexerGrpcStep::DataServiceTxnsDecoded, - Some(starting_version as i64), - Some(starting_version as i64 + num_of_transactions as i64 - 1), - start_version_timestamp, - end_version_timestamp, - Some(decoding_duration), - Some(size_in_bytes), - Some(num_of_transactions as i64), - Some(request_metadata.clone()), - ); + let transactions = + data_fetch_from_filestore(starting_version, file_store_operator, request_metadata) + .await?; Ok(TransactionsDataStatus::Success(transactions)) }, Err(e) => Err(e), } } +async fn data_fetch_from_filestore( + starting_version: u64, + file_store_operator: Arc>, + request_metadata: Arc, +) -> anyhow::Result> { + // Data is evicted from the cache. Fetch from file store. 
+ let (transactions, io_duration, decoding_duration) = file_store_operator + .get_transactions_with_durations(starting_version, NUM_DATA_FETCH_RETRIES) + .await?; + let size_in_bytes = transactions + .iter() + .map(|transaction| transaction.encoded_len()) + .sum::(); + let num_of_transactions = transactions.len(); + let start_version_timestamp = transactions.first().unwrap().timestamp.as_ref(); + let end_version_timestamp = transactions.last().unwrap().timestamp.as_ref(); + log_grpc_step( + SERVICE_TYPE, + IndexerGrpcStep::DataServiceDataFetchedFilestore, + Some(starting_version as i64), + Some(starting_version as i64 + num_of_transactions as i64 - 1), + start_version_timestamp, + end_version_timestamp, + Some(io_duration), + Some(size_in_bytes), + Some(num_of_transactions as i64), + Some(&request_metadata), + ); + log_grpc_step( + SERVICE_TYPE, + IndexerGrpcStep::DataServiceTxnsDecoded, + Some(starting_version as i64), + Some(starting_version as i64 + num_of_transactions as i64 - 1), + start_version_timestamp, + end_version_timestamp, + Some(decoding_duration), + Some(size_in_bytes), + Some(num_of_transactions as i64), + Some(&request_metadata), + ); + Ok(transactions) +} + /// Handles the case when the data is not ready in the cache, i.e., beyond the current head. async fn ahead_of_cache_data_handling() { // TODO: add exponential backoff. 
@@ -591,7 +851,7 @@ fn get_request_metadata( async fn channel_send_multiple_with_timeout( resp_items: Vec, tx: tokio::sync::mpsc::Sender>, - request_metadata: IndexerGrpcRequestMetadata, + request_metadata: Arc, ) -> Result<(), SendTimeoutError>> { let overall_send_start_time = Instant::now(); let overall_size_in_bytes = resp_items @@ -642,7 +902,7 @@ async fn channel_send_multiple_with_timeout( Some(send_start_time.elapsed().as_secs_f64()), Some(response_size), Some(num_of_transactions as i64), - Some(request_metadata.clone()), + Some(&request_metadata), ); } @@ -656,8 +916,53 @@ async fn channel_send_multiple_with_timeout( Some(overall_send_start_time.elapsed().as_secs_f64()), Some(overall_size_in_bytes), Some((overall_end_version - overall_start_version + 1) as i64), - Some(request_metadata.clone()), + Some(&request_metadata), ); Ok(()) } + +#[test] +fn test_ensure_sequential_transactions_merges_and_sorts() { + let transactions1 = (1..5) + .map(|i| Transaction { + version: i, + ..Default::default() + }) + .collect(); + let transactions2 = (5..10) + .map(|i| Transaction { + version: i, + ..Default::default() + }) + .collect(); + // No overlap, just normal fetching flow + let transactions1 = ensure_sequential_transactions(vec![transactions1, transactions2]); + assert_eq!(transactions1.len(), 9); + assert_eq!(transactions1.first().unwrap().version, 1); + assert_eq!(transactions1.last().unwrap().version, 9); + + // This is a full overlap + let transactions2 = (5..7) + .map(|i| Transaction { + version: i, + ..Default::default() + }) + .collect(); + let transactions1 = ensure_sequential_transactions(vec![transactions1, transactions2]); + assert_eq!(transactions1.len(), 9); + assert_eq!(transactions1.first().unwrap().version, 1); + assert_eq!(transactions1.last().unwrap().version, 9); + + // Partial overlap + let transactions2 = (5..12) + .map(|i| Transaction { + version: i, + ..Default::default() + }) + .collect(); + let transactions1 = 
ensure_sequential_transactions(vec![transactions1, transactions2]); + assert_eq!(transactions1.len(), 11); + assert_eq!(transactions1.first().unwrap().version, 1); + assert_eq!(transactions1.last().unwrap().version, 11); +} diff --git a/ecosystem/indexer-grpc/indexer-grpc-utils/src/cache_operator.rs b/ecosystem/indexer-grpc/indexer-grpc-utils/src/cache_operator.rs index fd663303d2f9e..7aad300d5730e 100644 --- a/ecosystem/indexer-grpc/indexer-grpc-utils/src/cache_operator.rs +++ b/ecosystem/indexer-grpc/indexer-grpc-utils/src/cache_operator.rs @@ -11,7 +11,7 @@ use redis::{AsyncCommands, RedisResult}; // Configurations for cache. // Cache entries that are present. -const CACHE_SIZE_ESTIMATION: u64 = 250_000_u64; +pub const CACHE_SIZE_ESTIMATION: u64 = 250_000_u64; pub const MAX_CACHE_FETCH_SIZE: u64 = 1000_u64; @@ -77,7 +77,7 @@ pub enum CacheUpdateStatus { } #[derive(Debug, Clone, Copy, PartialEq, Eq)] -pub(crate) enum CacheCoverageStatus { +pub enum CacheCoverageStatus { /// Requested version is not processed by cache worker yet. DataNotReady, /// Requested version is cached. @@ -188,7 +188,7 @@ impl CacheOperator { } // Internal function to get the latest version from cache. 
- pub(crate) async fn check_cache_coverage_status( + pub async fn check_cache_coverage_status( &mut self, requested_version: u64, ) -> anyhow::Result { @@ -313,6 +313,10 @@ impl CacheOperator { } } + // Fetching from cache + // Requested version x + // Cache hit x + + // TODO: Remove this pub async fn batch_get_encoded_proto_data( &mut self, diff --git a/ecosystem/indexer-grpc/indexer-grpc-utils/src/counters.rs b/ecosystem/indexer-grpc/indexer-grpc-utils/src/counters.rs index fe1e21f50fff4..bddcf58458c9e 100644 --- a/ecosystem/indexer-grpc/indexer-grpc-utils/src/counters.rs +++ b/ecosystem/indexer-grpc/indexer-grpc-utils/src/counters.rs @@ -8,32 +8,54 @@ use once_cell::sync::Lazy; use prometheus::{register_int_counter_vec, IntCounterVec}; pub enum IndexerGrpcStep { - DataServiceNewRequestReceived, // [Data Service] New request received. - DataServiceWaitingForCacheData, // [Data Service] Waiting for data from cache. - DataServiceDataFetchedCache, // [Data Service] Fetched data from Redis cache. - DataServiceDataFetchedFilestore, // [Data Service] Fetched data from Filestore. - DataServiceTxnsDecoded, // [Data Service] Decoded transactions. - DataServiceChunkSent, // [Data Service] One chunk of transactions sent to GRPC response channel. - DataServiceAllChunksSent, // [Data Service] All chunks of transactions sent to GRPC response channel. Current batch finished. + // [Data Service] New request received. + DataServiceNewRequestReceived, + // [Data Service] Waiting for data from cache. + DataServiceWaitingForCacheData, + // [Data Service] Fetched data from Redis cache. + DataServiceDataFetchedCache, + // [Data Service] Fetched data from Filestore. + DataServiceDataFetchedFilestore, + // [Data Service] Decoded transactions. + DataServiceTxnsDecoded, + // [Data Service] One chunk of transactions sent to GRPC response channel. + DataServiceChunkSent, + // [Data Service] All chunks of transactions sent to GRPC response channel. Current batch finished. 
+ DataServiceAllChunksSent, - CacheWorkerReceivedTxns, // [Indexer Cache] Received transactions from fullnode. - CacheWorkerTxnEncoded, // [Indexer Cache] Encoded transactions. - CacheWorkerTxnsProcessed, // [Indexer Cache] Processed transactions in a batch. - CacheWorkerBatchProcessed, // [Indexer Cache] Successfully process current batch. + // [Indexer Cache] Received transactions from fullnode. + CacheWorkerReceivedTxns, + // [Indexer Cache] Encoded transactions. + CacheWorkerTxnEncoded, + // [Indexer Cache] Processed transactions in a batch. + CacheWorkerTxnsProcessed, + // [Indexer Cache] Successfully process current batch. + CacheWorkerBatchProcessed, - FilestoreFetchTxns, // [File worker] Fetch transactions from cache. - FilestoreUploadTxns, // [File worker] Upload transactions to filestore. - FilestoreUpdateMetadata, // [File worker] Update metadata to filestore. - FilestoreProcessedBatch, // [File worker] Successfully process current batch. - FileStoreEncodedTxns, // [File worker] Encoded transactions. + // [File worker] Fetch transactions from cache. + FilestoreFetchTxns, + // [File worker] Upload transactions to filestore. + FilestoreUploadTxns, + // [File worker] Update metadata to filestore. + FilestoreUpdateMetadata, + // [File worker] Successfully process current batch. + FilestoreProcessedBatch, + // [File worker] Encoded transactions. 
+ FileStoreEncodedTxns, - FullnodeFetchedBatch, // [Indexer Fullnode] Fetched batch of transactions from fullnode - FullnodeDecodedBatch, // [Indexer Fullnode] Decoded batch of transactions from fullnode - FullnodeProcessedBatch, // [Indexer Fullnode] Processed batch of transactions from fullnode - FullnodeSentBatch, // [Indexer Fullnode] Sent batch successfully + // [Indexer Fullnode] Fetched batch of transactions from fullnode + FullnodeFetchedBatch, + // [Indexer Fullnode] Decoded batch of transactions from fullnode + FullnodeDecodedBatch, + // [Indexer Fullnode] Processed batch of transactions from fullnode + FullnodeProcessedBatch, + // [Indexer Fullnode] Sent batch successfully + FullnodeSentBatch, - TableInfoProcessedBatch, // [Indexer Table Info] Processed batch of transactions from fullnode - TableInfoProcessed, // [Indexer Table Info] Processed transactions from fullnode + // [Indexer Table Info] Processed batch of transactions from fullnode + TableInfoProcessedBatch, + // [Indexer Table Info] Processed transactions from fullnode + TableInfoProcessed, } impl IndexerGrpcStep { @@ -74,7 +96,7 @@ impl IndexerGrpcStep { // Data service steps IndexerGrpcStep::DataServiceNewRequestReceived => { "[Data Service] New request received." - }, + } IndexerGrpcStep::DataServiceWaitingForCacheData => { "[Data Service] Waiting for data from cache." 
} @@ -104,10 +126,10 @@ impl IndexerGrpcStep { // Table info service steps IndexerGrpcStep::TableInfoProcessedBatch => { "[Indexer Table Info] Processed batch successfully" - }, + } IndexerGrpcStep::TableInfoProcessed => { "[Indexer Table Info] Processed successfully" - }, + } } } } @@ -142,6 +164,26 @@ pub static NUM_TRANSACTIONS_COUNT: Lazy = Lazy::new(|| { .unwrap() }); +/// Number of versions that were overlapped in a multi-task fetch pull +pub static NUM_MULTI_FETCH_OVERLAPPED_VERSIONS: Lazy = Lazy::new(|| { + register_int_counter_vec!( + "indexer_grpc_num_multi_thread_fetch_overlapped_versions", + "Number of versions that were overlapped in a multi-task fetch pull", + &["service_type", "overlap_type"], + ) + .unwrap() +}); + +/// Number of times we internally retry fetching a transaction/block +pub static TRANSACTION_STORE_FETCH_RETRIES: Lazy = Lazy::new(|| { + register_int_counter_vec!( + "indexer_grpc_num_transaction_store_fetch_retries", + "Number of times we internally retry fetching a transaction/block", + &["store"], + ) + .unwrap() +}); + /// Generic duration metric pub static DURATION_IN_SECS: Lazy = Lazy::new(|| { register_gauge_vec!("indexer_grpc_duration_in_secs", "Duration in seconds", &[ @@ -174,7 +216,7 @@ pub fn log_grpc_step( duration_in_secs: Option, size_in_bytes: Option, num_transactions: Option, - request_metadata: Option, + request_metadata: Option<&IndexerGrpcRequestMetadata>, ) { if let Some(duration_in_secs) = duration_in_secs { DURATION_IN_SECS @@ -220,6 +262,7 @@ pub fn log_grpc_step( step.get_label(), ); } else { + let request_metadata = request_metadata.unwrap(); tracing::info!( start_version, end_version, @@ -229,23 +272,12 @@ pub fn log_grpc_step( duration_in_secs, size_in_bytes, // Request metadata variables - request_name = request_metadata.clone().unwrap().processor_name.as_str(), - request_email = request_metadata.clone().unwrap().request_email.as_str(), - request_api_key_name = request_metadata - .clone() - .unwrap() - 
.request_api_key_name - .as_str(), - processor_name = request_metadata.clone().unwrap().processor_name.as_str(), - connection_id = request_metadata - .clone() - .unwrap() - .request_connection_id - .as_str(), - request_user_classification = request_metadata - .unwrap() - .request_user_classification - .as_str(), + request_name = &request_metadata.processor_name, + request_email = &request_metadata.request_email, + request_api_key_name = &request_metadata.request_api_key_name, + processor_name = &request_metadata.processor_name, + connection_id = &request_metadata.request_connection_id, + request_user_classification = &request_metadata.request_user_classification, service_type, step = step.get_step(), "{}", diff --git a/ecosystem/indexer-grpc/indexer-grpc-utils/src/file_store_operator/gcs.rs b/ecosystem/indexer-grpc/indexer-grpc-utils/src/file_store_operator/gcs.rs index 6350671ba6119..05578bccdf898 100644 --- a/ecosystem/indexer-grpc/indexer-grpc-utils/src/file_store_operator/gcs.rs +++ b/ecosystem/indexer-grpc/indexer-grpc-utils/src/file_store_operator/gcs.rs @@ -61,6 +61,10 @@ impl FileStoreOperator for GcsFileStoreOperator { self.storage_format } + fn store_name(&self) -> &str { + "GCS" + } + async fn get_raw_file(&self, version: u64) -> anyhow::Result> { let file_entry_key = FileEntry::build_key(version, self.storage_format).to_string(); match Object::download(&self.bucket_name, file_entry_key.as_str()).await { @@ -70,7 +74,7 @@ impl FileStoreOperator for GcsFileStoreOperator { anyhow::bail!("[Indexer File] Transactions file not found. Gap might happen between cache and file store. {}", err) } else { anyhow::bail!( - "[Indexer File] Error happens when transaction file. {}", + "[Indexer File] Error happens when downloading transaction file. 
{}", err ); } diff --git a/ecosystem/indexer-grpc/indexer-grpc-utils/src/file_store_operator/local.rs b/ecosystem/indexer-grpc/indexer-grpc-utils/src/file_store_operator/local.rs index 5931cea8cc298..c3dfd359acb25 100644 --- a/ecosystem/indexer-grpc/indexer-grpc-utils/src/file_store_operator/local.rs +++ b/ecosystem/indexer-grpc/indexer-grpc-utils/src/file_store_operator/local.rs @@ -51,6 +51,10 @@ impl FileStoreOperator for LocalFileStoreOperator { self.storage_format } + fn store_name(&self) -> &str { + "local" + } + async fn get_raw_file(&self, version: u64) -> anyhow::Result> { let file_entry_key = FileEntry::build_key(version, self.storage_format).to_string(); let file_path = self.path.join(file_entry_key); diff --git a/ecosystem/indexer-grpc/indexer-grpc-utils/src/file_store_operator/mod.rs b/ecosystem/indexer-grpc/indexer-grpc-utils/src/file_store_operator/mod.rs index d0ae6f72964bb..e0bd22efa2407 100644 --- a/ecosystem/indexer-grpc/indexer-grpc-utils/src/file_store_operator/mod.rs +++ b/ecosystem/indexer-grpc/indexer-grpc-utils/src/file_store_operator/mod.rs @@ -4,12 +4,13 @@ use crate::compression_util::{ FileEntry, FileStoreMetadata, StorageFormat, FILE_ENTRY_TRANSACTION_COUNT, }; -use anyhow::Result; -use aptos_protos::{indexer::v1::TransactionsInStorage, transaction::v1::Transaction}; +use anyhow::{Context, Result}; +use aptos_protos::transaction::v1::Transaction; pub mod gcs; pub use gcs::*; pub mod local; +use crate::counters::TRANSACTION_STORE_FETCH_RETRIES; pub use local::*; const METADATA_FILE_NAME: &str = "metadata.json"; @@ -22,24 +23,56 @@ pub trait FileStoreOperator: Send + Sync { fn storage_format(&self) -> StorageFormat; + /// The name of the store, for logging. Ex: "GCS", "Redis", etc + fn store_name(&self) -> &str; + /// Gets the transactions files from the file store. version has to be a multiple of BLOB_STORAGE_SIZE. 
- async fn get_transactions(&self, version: u64) -> Result> { - let (transactions, _, _) = self.get_transactions_with_durations(version).await?; + async fn get_transactions(&self, version: u64, retries: u8) -> Result> { + let (transactions, _, _) = self + .get_transactions_with_durations(version, retries) + .await?; Ok(transactions) } async fn get_raw_file(&self, version: u64) -> Result>; + async fn get_raw_file_with_retries(&self, version: u64, retries: u8) -> Result> { + let mut retries = retries; + loop { + match self.get_raw_file(version).await { + Ok(bytes) => return Ok(bytes), + Err(err) => { + TRANSACTION_STORE_FETCH_RETRIES + .with_label_values(&[self.store_name()]) + .inc_by(1); + + if retries == 0 { + return Err(err); + } + retries -= 1; + tokio::time::sleep(std::time::Duration::from_millis(10)).await; + }, + } + } + } + async fn get_transactions_with_durations( &self, version: u64, + retries: u8, ) -> Result<(Vec, f64, f64)> { let io_start_time = std::time::Instant::now(); - let bytes = self.get_raw_file(version).await?; + let bytes = self.get_raw_file_with_retries(version, retries).await?; let io_duration = io_start_time.elapsed().as_secs_f64(); let decoding_start_time = std::time::Instant::now(); - let transactions_in_storage: TransactionsInStorage = - FileEntry::new(bytes, self.storage_format()).into_transactions_in_storage(); + let storage_format = self.storage_format(); + + let transactions_in_storage = tokio::task::spawn_blocking(move || { + FileEntry::new(bytes, storage_format).into_transactions_in_storage() + }) + .await + .context("Converting storage bytes to FileEntry transactions thread panicked")?; + let decoding_duration = decoding_start_time.elapsed().as_secs_f64(); Ok(( transactions_in_storage diff --git a/ecosystem/nft-metadata-crawler-parser/migrations/2024-01-31-221845_add_not_parsable_column/down.sql b/ecosystem/nft-metadata-crawler-parser/migrations/2024-01-31-221845_add_not_parsable_column/down.sql index cdf49e8755b12..271ce0e95d00e 
100644 --- a/ecosystem/nft-metadata-crawler-parser/migrations/2024-01-31-221845_add_not_parsable_column/down.sql +++ b/ecosystem/nft-metadata-crawler-parser/migrations/2024-01-31-221845_add_not_parsable_column/down.sql @@ -1 +1 @@ -ALTER TABLE nft_metadata_crawler.parsed_asset_uris DROP COLUMN do_not_parse; +ALTER TABLE IF EXISTS nft_metadata_crawler.parsed_asset_uris DROP COLUMN do_not_parse; diff --git a/ecosystem/nft-metadata-crawler-parser/migrations/2024-01-31-221845_add_not_parsable_column/up.sql b/ecosystem/nft-metadata-crawler-parser/migrations/2024-01-31-221845_add_not_parsable_column/up.sql index 1e480c0d5d43f..59313b9765afb 100644 --- a/ecosystem/nft-metadata-crawler-parser/migrations/2024-01-31-221845_add_not_parsable_column/up.sql +++ b/ecosystem/nft-metadata-crawler-parser/migrations/2024-01-31-221845_add_not_parsable_column/up.sql @@ -1 +1 @@ -ALTER TABLE nft_metadata_crawler.parsed_asset_uris ADD COLUMN do_not_parse BOOLEAN NOT NULL DEFAULT FALSE; +ALTER TABLE IF NOT EXISTS nft_metadata_crawler.parsed_asset_uris ADD COLUMN do_not_parse BOOLEAN NOT NULL DEFAULT FALSE; diff --git a/ecosystem/nft-metadata-crawler-parser/src/config.rs b/ecosystem/nft-metadata-crawler-parser/src/config.rs new file mode 100644 index 0000000000000..75a33c479a478 --- /dev/null +++ b/ecosystem/nft-metadata-crawler-parser/src/config.rs @@ -0,0 +1,243 @@ +// Copyright © Aptos Foundation + +use crate::{ + utils::{ + counters::{ + GOT_CONNECTION_COUNT, PARSER_FAIL_COUNT, PARSER_INVOCATIONS_COUNT, + PUBSUB_ACK_SUCCESS_COUNT, SKIP_URI_COUNT, UNABLE_TO_GET_CONNECTION_COUNT, + }, + database::{check_or_update_chain_id, establish_connection_pool, run_migrations}, + }, + worker::Worker, +}; +use aptos_indexer_grpc_server_framework::RunnableConfig; +use bytes::Bytes; +use diesel::{ + r2d2::{ConnectionManager, Pool}, + PgConnection, +}; +use google_cloud_storage::client::{Client as GCSClient, ClientConfig as GCSClientConfig}; +use serde::{Deserialize, Serialize}; +use std::sync::Arc; 
+use tracing::{error, info, warn}; +use warp::Filter; + +/// Structs to hold config from YAML +#[derive(Clone, Debug, Deserialize, Serialize)] +#[serde(deny_unknown_fields)] +pub struct ParserConfig { + pub google_application_credentials: Option, + pub bucket: String, + pub database_url: String, + pub cdn_prefix: String, + pub ipfs_prefix: String, + pub ipfs_auth_key: Option, + pub max_file_size_bytes: Option, + pub image_quality: Option, // Quality up to 100 + pub max_image_dimensions: Option, + pub ack_parsed_uris: Option, + pub uri_blacklist: Option>, + pub server_port: u16, +} + +#[async_trait::async_trait] +impl RunnableConfig for ParserConfig { + /// Main driver function that establishes a connection to Pubsub and parses the Pubsub entries in parallel + async fn run(&self) -> anyhow::Result<()> { + info!( + "[NFT Metadata Crawler] Starting parser with config: {:?}", + self + ); + + info!("[NFT Metadata Crawler] Connecting to database"); + let pool = establish_connection_pool(self.database_url.clone()); + info!("[NFT Metadata Crawler] Database connection successful"); + + info!("[NFT Metadata Crawler] Running migrations"); + run_migrations(&pool); + info!("[NFT Metadata Crawler] Finished migrations"); + + if let Some(google_application_credentials) = self.google_application_credentials.clone() { + std::env::set_var( + "GOOGLE_APPLICATION_CREDENTIALS", + google_application_credentials, + ); + } + + // Establish GCS client + let gcs_config = GCSClientConfig::default() + .with_auth() + .await + .unwrap_or_else(|e| { + error!( + error = ?e, + "[NFT Metadata Crawler] Failed to create gRPC client config" + ); + panic!(); + }); + + // Create request context + let context = Arc::new(ServerContext { + parser_config: self.clone(), + pool, + gcs_client: GCSClient::new(gcs_config), + }); + + // Create web server + let route = warp::post() + .and(warp::path::end()) + .and(warp::body::bytes()) + .and(warp::any().map(move || context.clone())) + .and_then(handle_root); + 
warp::serve(route) + .run(([0, 0, 0, 0], self.server_port)) + .await; + Ok(()) + } + + fn get_server_name(&self) -> String { + "parser".to_string() + } +} + +/// Struct to hold context required for parsing +#[derive(Clone)] +pub struct ServerContext { + pub parser_config: ParserConfig, + pub pool: Pool>, + pub gcs_client: GCSClient, +} + +/// Repeatedly pulls workers from Channel and perform parsing operations +async fn spawn_parser( + parser_config: ParserConfig, + msg_base64: Bytes, + pool: Pool>, + gcs_client: GCSClient, +) { + PARSER_INVOCATIONS_COUNT.inc(); + let pubsub_message = String::from_utf8(msg_base64.to_vec()).unwrap_or_else(|e| { + error!( + error = ?e, + "[NFT Metadata Crawler] Failed to parse PubSub message" + ); + panic!(); + }); + + info!( + pubsub_message = pubsub_message, + "[NFT Metadata Crawler] Received message from PubSub" + ); + + // Skips message if it does not have 5 commas (likely malformed URI) + if pubsub_message.matches(',').count() != 5 { + // Sends ack to PubSub only if ack_parsed_uris flag is true + info!("[NFT Metadata Crawler] More than 5 commas, skipping message"); + SKIP_URI_COUNT.with_label_values(&["invalid"]).inc(); + return; + } + + // Parse PubSub message + let parts: Vec<&str> = pubsub_message.split(',').collect(); + + // Perform chain id check + // If chain id is not set, set it + let mut conn = pool.get().unwrap_or_else(|e| { + error!( + pubsub_message = pubsub_message, + error = ?e, + "[NFT Metadata Crawler] Failed to get DB connection from pool"); + UNABLE_TO_GET_CONNECTION_COUNT.inc(); + panic!(); + }); + GOT_CONNECTION_COUNT.inc(); + + let grpc_chain_id = parts[4].parse::().unwrap_or_else(|e| { + error!( + error = ?e, + "[NFT Metadata Crawler] Failed to parse chain id from PubSub message" + ); + panic!(); + }); + + // Panic if chain id of PubSub message does not match chain id in DB + check_or_update_chain_id(&mut conn, grpc_chain_id as i64).expect("Chain id should match"); + + // Spawn worker + let mut worker = 
Worker::new( + parser_config.clone(), + conn, + gcs_client.clone(), + pubsub_message.clone(), + parts[0].to_string(), + parts[1].to_string(), + parts[2].to_string().parse().unwrap_or_else(|e|{ + error!( + error = ?e, + "[NFT Metadata Crawler] Failed to parse last transaction version from PubSub message" + ); + panic!(); + }), + chrono::NaiveDateTime::parse_from_str(parts[3], "%Y-%m-%d %H:%M:%S %Z").unwrap_or( + chrono::NaiveDateTime::parse_from_str(parts[3], "%Y-%m-%d %H:%M:%S%.f %Z").unwrap_or_else( + |e| { + error!( + error = ?e, + "[NFT Metadata Crawler] Failed to parse timestamp from PubSub message" + ); + panic!(); + }, + ), + ), + parts[5].parse::().unwrap_or(false), + ); + + info!( + pubsub_message = pubsub_message, + "[NFT Metadata Crawler] Starting worker" + ); + + if let Err(e) = worker.parse().await { + warn!( + pubsub_message = pubsub_message, + error = ?e, + "[NFT Metadata Crawler] Parsing failed" + ); + PARSER_FAIL_COUNT.inc(); + } + + info!( + pubsub_message = pubsub_message, + "[NFT Metadata Crawler] Worker finished" + ); +} + +/// Handles calling parser for the root endpoint +async fn handle_root( + msg: Bytes, + context: Arc, +) -> Result { + let to_ack = context.parser_config.ack_parsed_uris.unwrap_or(false); + + // Use spawn_blocking to run the function on a separate thread. 
+ let _ = tokio::spawn(spawn_parser( + context.parser_config.clone(), + msg, + context.pool.clone(), + context.gcs_client.clone(), + )) + .await; + + if !to_ack { + return Ok(warp::reply::with_status( + warp::reply(), + warp::http::StatusCode::BAD_REQUEST, + )); + } + + PUBSUB_ACK_SUCCESS_COUNT.inc(); + Ok(warp::reply::with_status( + warp::reply(), + warp::http::StatusCode::OK, + )) +} diff --git a/ecosystem/nft-metadata-crawler-parser/src/lib.rs b/ecosystem/nft-metadata-crawler-parser/src/lib.rs index 1bc1279c88c27..a3372bb673229 100644 --- a/ecosystem/nft-metadata-crawler-parser/src/lib.rs +++ b/ecosystem/nft-metadata-crawler-parser/src/lib.rs @@ -1,5 +1,6 @@ // Copyright © Aptos Foundation +pub mod config; pub mod models; pub mod schema; pub mod utils; diff --git a/ecosystem/nft-metadata-crawler-parser/src/main.rs b/ecosystem/nft-metadata-crawler-parser/src/main.rs index 6f51f0e1950d3..58a0af6c6a3f6 100644 --- a/ecosystem/nft-metadata-crawler-parser/src/main.rs +++ b/ecosystem/nft-metadata-crawler-parser/src/main.rs @@ -1,7 +1,7 @@ // Copyright © Aptos Foundation use aptos_indexer_grpc_server_framework::ServerArgs; -use aptos_nft_metadata_crawler_parser::worker::ParserConfig; +use aptos_nft_metadata_crawler_parser::config::ParserConfig; #[tokio::main] async fn main() -> anyhow::Result<()> { diff --git a/ecosystem/nft-metadata-crawler-parser/src/models/nft_metadata_crawler_uris.rs b/ecosystem/nft-metadata-crawler-parser/src/models/nft_metadata_crawler_uris.rs index 4316b1f6b4797..898e4f3a69eb8 100644 --- a/ecosystem/nft-metadata-crawler-parser/src/models/nft_metadata_crawler_uris.rs +++ b/ecosystem/nft-metadata-crawler-parser/src/models/nft_metadata_crawler_uris.rs @@ -1,6 +1,9 @@ // Copyright © Aptos Foundation -use crate::schema::nft_metadata_crawler::parsed_asset_uris; +use crate::{ + models::nft_metadata_crawler_uris_query::NFTMetadataCrawlerURIsQuery, + schema::nft_metadata_crawler::parsed_asset_uris, +}; use diesel::prelude::*; use field_count::FieldCount; 
use serde::{Deserialize, Serialize}; @@ -153,3 +156,20 @@ impl NFTMetadataCrawlerURIs { self.do_not_parse = do_not_parse; } } + +impl From for NFTMetadataCrawlerURIs { + fn from(query: NFTMetadataCrawlerURIsQuery) -> Self { + Self { + asset_uri: query.asset_uri, + raw_image_uri: query.raw_image_uri, + raw_animation_uri: query.raw_animation_uri, + cdn_json_uri: query.cdn_json_uri, + cdn_image_uri: query.cdn_image_uri, + cdn_animation_uri: query.cdn_animation_uri, + json_parser_retry_count: query.json_parser_retry_count, + image_optimizer_retry_count: query.image_optimizer_retry_count, + animation_optimizer_retry_count: query.animation_optimizer_retry_count, + do_not_parse: query.do_not_parse, + } + } +} diff --git a/ecosystem/nft-metadata-crawler-parser/src/utils/constants.rs b/ecosystem/nft-metadata-crawler-parser/src/utils/constants.rs index 7bae24d8cb476..f13a6008eb601 100644 --- a/ecosystem/nft-metadata-crawler-parser/src/utils/constants.rs +++ b/ecosystem/nft-metadata-crawler-parser/src/utils/constants.rs @@ -12,6 +12,9 @@ pub const MAX_JSON_REQUEST_RETRY_SECONDS: u64 = 30; /// Allocate 90 seconds for downloading large image files pub const MAX_IMAGE_REQUEST_RETRY_SECONDS: u64 = 90; +/// Max number of retries for a given asset_uri +pub const MAX_NUM_PARSE_RETRIES: i32 = 3; + /// Default 15 MB maximum file size for files to be downloaded pub const DEFAULT_MAX_FILE_SIZE_BYTES: u32 = 15_000_000; diff --git a/ecosystem/nft-metadata-crawler-parser/src/worker.rs b/ecosystem/nft-metadata-crawler-parser/src/worker.rs index 7b2631468b5b0..d2e4eb63d47dd 100644 --- a/ecosystem/nft-metadata-crawler-parser/src/worker.rs +++ b/ecosystem/nft-metadata-crawler-parser/src/worker.rs @@ -1,6 +1,7 @@ // Copyright © Aptos Foundation use crate::{ + config::ParserConfig, models::{ nft_metadata_crawler_uris::NFTMetadataCrawlerURIs, nft_metadata_crawler_uris_query::NFTMetadataCrawlerURIsQuery, @@ -8,257 +9,29 @@ use crate::{ utils::{ constants::{ DEFAULT_IMAGE_QUALITY, 
DEFAULT_MAX_FILE_SIZE_BYTES, DEFAULT_MAX_IMAGE_DIMENSIONS, + MAX_NUM_PARSE_RETRIES, }, counters::{ DUPLICATE_ASSET_URI_COUNT, DUPLICATE_RAW_ANIMATION_URI_COUNT, - DUPLICATE_RAW_IMAGE_URI_COUNT, GOT_CONNECTION_COUNT, OPTIMIZE_IMAGE_TYPE_COUNT, - PARSER_FAIL_COUNT, PARSER_INVOCATIONS_COUNT, PARSER_SUCCESSES_COUNT, - PARSE_URI_TYPE_COUNT, PUBSUB_ACK_SUCCESS_COUNT, SKIP_URI_COUNT, - UNABLE_TO_GET_CONNECTION_COUNT, - }, - database::{ - check_or_update_chain_id, establish_connection_pool, run_migrations, upsert_uris, + DUPLICATE_RAW_IMAGE_URI_COUNT, OPTIMIZE_IMAGE_TYPE_COUNT, PARSER_SUCCESSES_COUNT, + PARSE_URI_TYPE_COUNT, SKIP_URI_COUNT, }, + database::upsert_uris, gcs::{write_image_to_gcs, write_json_to_gcs}, image_optimizer::ImageOptimizer, json_parser::JSONParser, uri_parser::URIParser, }, }; -use aptos_indexer_grpc_server_framework::RunnableConfig; -use bytes::Bytes; use diesel::{ - r2d2::{ConnectionManager, Pool, PooledConnection}, + r2d2::{ConnectionManager, PooledConnection}, PgConnection, }; -use google_cloud_storage::client::{Client as GCSClient, ClientConfig as GCSClientConfig}; +use google_cloud_storage::client::Client as GCSClient; use image::ImageFormat; -use serde::{Deserialize, Serialize}; use serde_json::Value; -use std::sync::Arc; use tracing::{error, info, warn}; use url::Url; -use warp::Filter; - -/// Structs to hold config from YAML -#[derive(Clone, Debug, Deserialize, Serialize)] -#[serde(deny_unknown_fields)] -pub struct ParserConfig { - pub google_application_credentials: Option, - pub bucket: String, - pub database_url: String, - pub cdn_prefix: String, - pub ipfs_prefix: String, - pub ipfs_auth_key: Option, - pub max_file_size_bytes: Option, - pub image_quality: Option, // Quality up to 100 - pub max_image_dimensions: Option, - pub ack_parsed_uris: Option, - pub uri_blacklist: Option>, - pub server_port: u16, -} - -/// Struct to hold context required for parsing -#[derive(Clone)] -pub struct ServerContext { - pub parser_config: ParserConfig, - 
pub pool: Pool>, - pub gcs_client: GCSClient, -} - -/// Repeatedly pulls workers from Channel and perform parsing operations -async fn spawn_parser( - parser_config: ParserConfig, - msg_base64: Bytes, - pool: Pool>, - gcs_client: GCSClient, -) { - PARSER_INVOCATIONS_COUNT.inc(); - let pubsub_message = String::from_utf8(msg_base64.to_vec()).unwrap_or_else(|e| { - error!( - error = ?e, - "[NFT Metadata Crawler] Failed to parse PubSub message" - ); - panic!(); - }); - - info!( - pubsub_message = pubsub_message, - "[NFT Metadata Crawler] Received message from PubSub" - ); - - // Skips message if it does not have 5 commas (likely malformed URI) - if pubsub_message.matches(',').count() != 5 { - // Sends ack to PubSub only if ack_parsed_uris flag is true - info!("[NFT Metadata Crawler] More than 5 commas, skipping message"); - SKIP_URI_COUNT.with_label_values(&["invalid"]).inc(); - return; - } - - // Parse PubSub message - let parts: Vec<&str> = pubsub_message.split(',').collect(); - - // Perform chain id check - // If chain id is not set, set it - let mut conn = pool.get().unwrap_or_else(|e| { - error!( - pubsub_message = pubsub_message, - error = ?e, - "[NFT Metadata Crawler] Failed to get DB connection from pool"); - UNABLE_TO_GET_CONNECTION_COUNT.inc(); - panic!(); - }); - GOT_CONNECTION_COUNT.inc(); - - let grpc_chain_id = parts[4].parse::().unwrap_or_else(|e| { - error!( - error = ?e, - "[NFT Metadata Crawler] Failed to parse chain id from PubSub message" - ); - panic!(); - }); - - // Panic if chain id of PubSub message does not match chain id in DB - check_or_update_chain_id(&mut conn, grpc_chain_id as i64).expect("Chain id should match"); - - // Spawn worker - let mut worker = Worker::new( - parser_config.clone(), - conn, - gcs_client.clone(), - pubsub_message.clone(), - parts[0].to_string(), - parts[1].to_string(), - parts[2].to_string().parse().unwrap_or_else(|e|{ - error!( - error = ?e, - "[NFT Metadata Crawler] Failed to parse last transaction version from 
PubSub message" - ); - panic!(); - }), - chrono::NaiveDateTime::parse_from_str(parts[3], "%Y-%m-%d %H:%M:%S %Z").unwrap_or( - chrono::NaiveDateTime::parse_from_str(parts[3], "%Y-%m-%d %H:%M:%S%.f %Z").unwrap_or_else( - |e| { - error!( - error = ?e, - "[NFT Metadata Crawler] Failed to parse timestamp from PubSub message" - ); - panic!(); - }, - ), - ), - parts[5].parse::().unwrap_or(false), - ); - - info!( - pubsub_message = pubsub_message, - "[NFT Metadata Crawler] Starting worker" - ); - - if let Err(e) = worker.parse().await { - warn!( - pubsub_message = pubsub_message, - error = ?e, - "[NFT Metadata Crawler] Parsing failed" - ); - PARSER_FAIL_COUNT.inc(); - } - - info!( - pubsub_message = pubsub_message, - "[NFT Metadata Crawler] Worker finished" - ); -} - -/// Handles calling parser for the root endpoint -async fn handle_root( - msg: Bytes, - context: Arc, -) -> Result { - let to_ack = context.parser_config.ack_parsed_uris.unwrap_or(false); - - // Use spawn_blocking to run the function on a separate thread. 
- let _ = tokio::spawn(spawn_parser( - context.parser_config.clone(), - msg, - context.pool.clone(), - context.gcs_client.clone(), - )) - .await; - - if !to_ack { - return Ok(warp::reply::with_status( - warp::reply(), - warp::http::StatusCode::BAD_REQUEST, - )); - } - - PUBSUB_ACK_SUCCESS_COUNT.inc(); - Ok(warp::reply::with_status( - warp::reply(), - warp::http::StatusCode::OK, - )) -} - -#[async_trait::async_trait] -impl RunnableConfig for ParserConfig { - /// Main driver function that establishes a connection to Pubsub and parses the Pubsub entries in parallel - async fn run(&self) -> anyhow::Result<()> { - info!( - "[NFT Metadata Crawler] Starting parser with config: {:?}", - self - ); - - info!("[NFT Metadata Crawler] Connecting to database"); - let pool = establish_connection_pool(self.database_url.clone()); - info!("[NFT Metadata Crawler] Database connection successful"); - - info!("[NFT Metadata Crawler] Running migrations"); - run_migrations(&pool); - info!("[NFT Metadata Crawler] Finished migrations"); - - if let Some(google_application_credentials) = self.google_application_credentials.clone() { - std::env::set_var( - "GOOGLE_APPLICATION_CREDENTIALS", - google_application_credentials, - ); - } - - // Establish GCS client - let gcs_config = GCSClientConfig::default() - .with_auth() - .await - .unwrap_or_else(|e| { - error!( - error = ?e, - "[NFT Metadata Crawler] Failed to create gRPC client config" - ); - panic!(); - }); - - // Create request context - let context = Arc::new(ServerContext { - parser_config: self.clone(), - pool, - gcs_client: GCSClient::new(gcs_config), - }); - - // Create web server - let route = warp::post() - .and(warp::path::end()) - .and(warp::body::bytes()) - .and(warp::any().map(move || context.clone())) - .and_then(handle_root); - warp::serve(route) - .run(([0, 0, 0, 0], self.server_port)) - .await; - Ok(()) - } - - fn get_server_name(&self) -> String { - "parser".to_string() - } -} /// Stuct that represents a parser for a single 
entry from queue pub struct Worker { @@ -308,15 +81,14 @@ impl Worker { // Exit if not force or if asset_uri has already been parsed let prev_model = NFTMetadataCrawlerURIsQuery::get_by_asset_uri(self.asset_uri.clone(), &mut self.conn); - if !self.force && prev_model.is_some() { - self.log_info("Duplicate asset_uri found, skipping parse"); + if let Some(pm) = prev_model { DUPLICATE_ASSET_URI_COUNT.inc(); - return Ok(()); - } - - if prev_model.unwrap_or_default().do_not_parse { - self.log_info("do_not_parse is true, skipping parse"); - return Ok(()); + if !self.force && pm.do_not_parse { + self.log_info("asset_uri has been marked as do_not_parse, skipping parse"); + SKIP_URI_COUNT.with_label_values(&["do_not_parse"]).inc(); + return Ok(()); + } + self.model = pm.into(); } // Skip if asset_uri contains any of the uris in URI_SKIP_LIST @@ -339,87 +111,90 @@ impl Worker { return Ok(()); } - // Parse asset_uri - self.log_info("Parsing asset_uri"); - let json_uri = URIParser::parse( - self.config.ipfs_prefix.clone(), - self.model.get_asset_uri(), - self.config.ipfs_auth_key.clone(), - ) - .unwrap_or_else(|_| { - self.log_warn("Failed to parse asset_uri", None); - PARSE_URI_TYPE_COUNT.with_label_values(&["other"]).inc(); - self.model.get_asset_uri() - }); - - // Parse JSON for raw_image_uri and raw_animation_uri - self.log_info("Starting JSON parsing"); - let (raw_image_uri, raw_animation_uri, json) = JSONParser::parse( - json_uri, - self.config - .max_file_size_bytes - .unwrap_or(DEFAULT_MAX_FILE_SIZE_BYTES), - ) - .await - .unwrap_or_else(|e| { - // Increment retry count if JSON parsing fails - self.log_warn("JSON parsing failed", Some(&e)); - self.model.increment_json_parser_retry_count(); - (None, None, Value::Null) - }); - - self.model.set_raw_image_uri(raw_image_uri); - self.model.set_raw_animation_uri(raw_animation_uri); - - // Save parsed JSON to GCS - if json != Value::Null { - self.log_info("Writing JSON to GCS"); - let cdn_json_uri_result = write_json_to_gcs( - 
self.config.bucket.clone(), - self.asset_data_id.clone(), - &json, - &self.gcs_client, + if self.force || self.model.get_cdn_json_uri().is_none() { + // Parse asset_uri + self.log_info("Parsing asset_uri"); + let json_uri = URIParser::parse( + self.config.ipfs_prefix.clone(), + self.model.get_asset_uri(), + self.config.ipfs_auth_key.clone(), ) - .await; + .unwrap_or_else(|_| { + self.log_warn("Failed to parse asset_uri", None); + PARSE_URI_TYPE_COUNT.with_label_values(&["other"]).inc(); + self.model.get_asset_uri() + }); - if let Err(e) = cdn_json_uri_result.as_ref() { - self.log_warn( - "Failed to write JSON to GCS, maybe upload timed out?", - Some(e), - ); - } + // Parse JSON for raw_image_uri and raw_animation_uri + self.log_info("Starting JSON parsing"); + let (raw_image_uri, raw_animation_uri, json) = JSONParser::parse( + json_uri, + self.config + .max_file_size_bytes + .unwrap_or(DEFAULT_MAX_FILE_SIZE_BYTES), + ) + .await + .unwrap_or_else(|e| { + // Increment retry count if JSON parsing fails + self.log_warn("JSON parsing failed", Some(&e)); + self.model.increment_json_parser_retry_count(); + (None, None, Value::Null) + }); - let cdn_json_uri = cdn_json_uri_result - .map(|value| format!("{}{}", self.config.cdn_prefix, value)) - .ok(); - self.model.set_cdn_json_uri(cdn_json_uri); - } + self.model.set_raw_image_uri(raw_image_uri); + self.model.set_raw_animation_uri(raw_animation_uri); - // Commit model to Postgres - self.log_info("Committing JSON to Postgres"); - if let Err(e) = upsert_uris(&mut self.conn, &self.model) { - self.log_error("Commit to Postgres failed", &e); + // Save parsed JSON to GCS + if json != Value::Null { + self.log_info("Writing JSON to GCS"); + let cdn_json_uri_result = write_json_to_gcs( + self.config.bucket.clone(), + self.asset_data_id.clone(), + &json, + &self.gcs_client, + ) + .await; + + if let Err(e) = cdn_json_uri_result.as_ref() { + self.log_warn( + "Failed to write JSON to GCS, maybe upload timed out?", + Some(e), + ); + } + + 
let cdn_json_uri = cdn_json_uri_result + .map(|value| format!("{}{}", self.config.cdn_prefix, value)) + .ok(); + self.model.set_cdn_json_uri(cdn_json_uri); + } + + // Commit model to Postgres + self.log_info("Committing JSON to Postgres"); + if let Err(e) = upsert_uris(&mut self.conn, &self.model) { + self.log_error("Commit to Postgres failed", &e); + } } // Deduplicate raw_image_uri // Proceed with image optimization of force or if raw_image_uri has not been parsed // Since we default to asset_uri, this check works if raw_image_uri is null because deduplication for asset_uri has already taken place - if self.force - || self.model.get_raw_image_uri().map_or(true, |uri_option| { - match NFTMetadataCrawlerURIsQuery::get_by_raw_image_uri( - self.asset_uri.clone(), - uri_option, - &mut self.conn, - ) { - Some(uris) => { - self.log_info("Duplicate raw_image_uri found"); - DUPLICATE_RAW_IMAGE_URI_COUNT.inc(); - self.model.set_cdn_image_uri(uris.cdn_image_uri); - false - }, - None => true, - } - }) + if (self.force || self.model.get_cdn_image_uri().is_none()) + && (self.model.get_cdn_image_uri().is_some() + || self.model.get_raw_image_uri().map_or(true, |uri_option| { + match NFTMetadataCrawlerURIsQuery::get_by_raw_image_uri( + self.asset_uri.clone(), + uri_option, + &mut self.conn, + ) { + Some(uris) => { + self.log_info("Duplicate raw_image_uri found"); + DUPLICATE_RAW_IMAGE_URI_COUNT.inc(); + self.model.set_cdn_image_uri(uris.cdn_image_uri); + false + }, + None => true, + } + })) { // Parse raw_image_uri, use asset_uri if parsing fails self.log_info("Parsing raw_image_uri"); @@ -461,8 +236,8 @@ impl Worker { (vec![], ImageFormat::Png) }); - if image.is_empty() && json == Value::Null { - self.log_info("Image and JSON are empty, skipping parse, marking as do_not_parse"); + if image.is_empty() { + self.log_info("Image is empty, skipping parse"); self.model.set_do_not_parse(true); SKIP_URI_COUNT.with_label_values(&["empty"]).inc(); if let Err(e) = upsert_uris(&mut 
self.conn, &self.model) { @@ -495,33 +270,34 @@ impl Worker { .ok(); self.model.set_cdn_image_uri(cdn_image_uri); } - } - // Commit model to Postgres - self.log_info("Committing image to Postgres"); - if let Err(e) = upsert_uris(&mut self.conn, &self.model) { - self.log_error("Commit to Postgres failed", &e); + // Commit model to Postgres + self.log_info("Committing image to Postgres"); + if let Err(e) = upsert_uris(&mut self.conn, &self.model) { + self.log_error("Commit to Postgres failed", &e); + } } // Deduplicate raw_animation_uri // Set raw_animation_uri_option to None if not force and raw_animation_uri already exists let mut raw_animation_uri_option = self.model.get_raw_animation_uri(); - if !self.force - && raw_animation_uri_option.clone().map_or(true, |uri| { - match NFTMetadataCrawlerURIsQuery::get_by_raw_animation_uri( - self.asset_uri.clone(), - uri, - &mut self.conn, - ) { - Some(uris) => { - self.log_info("Duplicate raw_animation_uri found"); - DUPLICATE_RAW_ANIMATION_URI_COUNT.inc(); - self.model.set_cdn_animation_uri(uris.cdn_animation_uri); - true - }, - None => true, - } - }) + if self.model.get_cdn_animation_uri().is_some() + || !self.force + && raw_animation_uri_option.clone().map_or(true, |uri| { + match NFTMetadataCrawlerURIsQuery::get_by_raw_animation_uri( + self.asset_uri.clone(), + uri, + &mut self.conn, + ) { + Some(uris) => { + self.log_info("Duplicate raw_animation_uri found"); + DUPLICATE_RAW_ANIMATION_URI_COUNT.inc(); + self.model.set_cdn_animation_uri(uris.cdn_animation_uri); + true + }, + None => true, + } + }) { raw_animation_uri_option = None; } @@ -584,12 +360,23 @@ impl Worker { .ok(); self.model.set_cdn_animation_uri(cdn_animation_uri); } + + // Commit model to Postgres + self.log_info("Committing animation to Postgres"); + if let Err(e) = upsert_uris(&mut self.conn, &self.model) { + self.log_error("Commit to Postgres failed", &e); + } } - // Commit model to Postgres - self.log_info("Committing animation to Postgres"); - if let 
Err(e) = upsert_uris(&mut self.conn, &self.model) { - self.log_error("Commit to Postgres failed", &e); + if self.model.get_json_parser_retry_count() > MAX_NUM_PARSE_RETRIES + || self.model.get_image_optimizer_retry_count() > MAX_NUM_PARSE_RETRIES + || self.model.get_animation_optimizer_retry_count() > MAX_NUM_PARSE_RETRIES + { + self.log_info("Retry count exceeded, marking as do_not_parse"); + self.model.set_do_not_parse(true); + if let Err(e) = upsert_uris(&mut self.conn, &self.model) { + self.log_error("Commit to Postgres failed", &e); + } } PARSER_SUCCESSES_COUNT.inc(); diff --git a/network/builder/src/builder.rs b/network/builder/src/builder.rs index 09b76c724acd9..238f4aa07dc33 100644 --- a/network/builder/src/builder.rs +++ b/network/builder/src/builder.rs @@ -158,6 +158,7 @@ impl NetworkBuilder { CONNECTIVITY_CHECK_INTERVAL_MS, NETWORK_CHANNEL_SIZE, mutual_authentication, + true, /* enable_latency_aware_dialing */ ); builder @@ -224,6 +225,7 @@ impl NetworkBuilder { config.connectivity_check_interval_ms, config.network_channel_size, config.mutual_authentication, + config.enable_latency_aware_dialing, ); network_builder.discovery_listeners = Some(Vec::new()); @@ -342,6 +344,7 @@ impl NetworkBuilder { connectivity_check_interval_ms: u64, channel_size: usize, mutual_authentication: bool, + enable_latency_aware_dialing: bool, ) -> &mut Self { let pm_conn_mgr_notifs_rx = self.peer_manager_builder.add_connection_event_listener(); let outbound_connection_limit = if !self.network_context.network_id().is_validator_network() @@ -364,6 +367,7 @@ impl NetworkBuilder { pm_conn_mgr_notifs_rx, outbound_connection_limit, mutual_authentication, + enable_latency_aware_dialing, )); self } diff --git a/network/framework/Cargo.toml b/network/framework/Cargo.toml index 92da1769e93bd..e2baf89a9513c 100644 --- a/network/framework/Cargo.toml +++ b/network/framework/Cargo.toml @@ -43,10 +43,14 @@ hex = { workspace = true } itertools = { workspace = true } maplit = { workspace = true 
} once_cell = { workspace = true } +ordered-float = { workspace = true } pin-project = { workspace = true } proptest ={ workspace = true, optional = true } proptest-derive = { workspace = true, optional = true } rand = { workspace = true, features = ["small_rng"] } +# Note: we cannot rely on the workspace version of rand. So we use this workaround. See: +# https://github.com/aptos-labs/aptos-core/blob/main/state-sync/aptos-data-client/Cargo.toml#L41. +rand_latest = { package = "rand", version = "0.8.5" } serde = { workspace = true } serde_bytes = { workspace = true } serde_json = { workspace = true } diff --git a/network/framework/src/connectivity_manager/builder.rs b/network/framework/src/connectivity_manager/builder.rs index 5c41895a8d892..26e25fad5c971 100644 --- a/network/framework/src/connectivity_manager/builder.rs +++ b/network/framework/src/connectivity_manager/builder.rs @@ -35,6 +35,7 @@ impl ConnectivityManagerBuilder { connection_notifs_rx: conn_notifs_channel::Receiver, outbound_connection_limit: Option, mutual_authentication: bool, + enable_latency_aware_dialing: bool, ) -> Self { let (conn_mgr_reqs_tx, conn_mgr_reqs_rx) = aptos_channels::new( channel_size, @@ -56,6 +57,7 @@ impl ConnectivityManagerBuilder { Duration::from_millis(max_connection_delay_ms), outbound_connection_limit, mutual_authentication, + enable_latency_aware_dialing, )), } } diff --git a/network/framework/src/connectivity_manager/mod.rs b/network/framework/src/connectivity_manager/mod.rs index 51a73e0a2b83e..920c274835fe9 100644 --- a/network/framework/src/connectivity_manager/mod.rs +++ b/network/framework/src/connectivity_manager/mod.rs @@ -39,6 +39,7 @@ use aptos_config::{ network_id::NetworkContext, }; use aptos_crypto::x25519; +use aptos_infallible::RwLock; use aptos_logger::prelude::*; use aptos_netcore::transport::ConnectionOrigin; use aptos_num_variants::NumVariants; @@ -50,21 +51,24 @@ use futures::{ future::{BoxFuture, FutureExt}, stream::{FuturesUnordered, StreamExt}, }; 
-use rand::{ - prelude::{SeedableRng, SmallRng}, - seq::SliceRandom, -}; +use futures_util::future::join_all; +use itertools::Itertools; +use ordered_float::OrderedFloat; +use rand_latest::Rng; use serde::Serialize; use std::{ cmp::{min, Ordering}, collections::{hash_map::Entry, HashMap, HashSet}, fmt, + net::{Shutdown, TcpStream, ToSocketAddrs}, sync::Arc, - time::{Duration, SystemTime}, + time::{Duration, Instant, SystemTime}, }; +use tokio::task::JoinHandle; use tokio_retry::strategy::jitter; pub mod builder; +mod selection; #[cfg(test)] mod test; @@ -76,6 +80,13 @@ mod test; /// around the same time at startup. const MAX_CONNECTION_DELAY_JITTER: Duration = Duration::from_millis(100); +/// The maximum amount of time to wait before timing out a connection attempt. +/// This should be relatively small to avoid blocking dials for too long. +const MAX_CONNECTION_TIMEOUT_SECS: u64 = 2; + +/// The maximum number of socket addresses to ping for a single address +const MAX_SOCKET_ADDRESSES_TO_PING: usize = 2; + /// The amount of time to try other peers until dialing this peer again. /// /// It's currently set to 5 minutes to ensure rotation through all (or most) peers @@ -91,7 +102,7 @@ pub struct ConnectivityManager { /// PeerId and address of remote peers to which this peer is connected. connected: HashMap, /// All information about peers from discovery sources. - discovered_peers: DiscoveredPeerSet, + discovered_peers: Arc>, /// Channel to send connection requests to PeerManager. connection_reqs_tx: ConnectionRequestSender, /// Channel to receive notifications from PeerManager. @@ -115,10 +126,10 @@ pub struct ConnectivityManager { event_id: u32, /// A way to limit the number of connected peers by outgoing dials. 
outbound_connection_limit: Option, - /// Random for shuffling which peers will be dialed - rng: SmallRng, /// Whether we are using mutual authentication or not mutual_authentication: bool, + /// Whether or not to enable latency aware peer dialing + enable_latency_aware_dialing: bool, } /// Different sources for peer addresses, ordered by priority (Onchain=highest, @@ -163,37 +174,58 @@ pub enum ConnectivityRequest { } #[derive(Clone, Debug, Default, PartialEq, Serialize)] -struct DiscoveredPeerSet(HashMap); +struct DiscoveredPeerSet { + peer_set: HashMap, +} impl DiscoveredPeerSet { - fn get_mut(&mut self, peer_id: &PeerId) -> Option<&mut DiscoveredPeer> { - self.0.get_mut(peer_id) - } - - fn try_remove_empty(&mut self, peer_id: &PeerId) -> bool { - match self.0.entry(*peer_id) { - Entry::Occupied(entry) => { - let peer = entry.get(); - if peer.addrs.is_empty() && peer.keys.is_empty() { - entry.remove(); - true - } else { - false - } - }, - Entry::Vacant(_) => true, - } + #[cfg(test)] + /// Creates a new discovered peer set from the + /// specified peer set. This is used for testing. 
+ pub fn new_from_peer_set(peer_set: HashMap) -> Self { + Self { peer_set } } - /// Converts `DiscoveredPeerSet` into a `PeerSet`, however disregards the source of discovery - /// TODO: Provide smarter merging based on discovery source - pub fn to_eligible_peers(&self) -> PeerSet { - self.0 + /// Gets the eligible peers from the discovered peer set + fn get_eligible_peers(&self) -> PeerSet { + self.peer_set .iter() .filter(|(_, peer)| peer.is_eligible()) .map(|(peer_id, peer)| (*peer_id, peer.into())) .collect() } + + /// Removes the specified peer from the set if the state is empty + fn remove_peer_if_empty(&mut self, peer_id: &PeerId) { + if let Entry::Occupied(entry) = self.peer_set.entry(*peer_id) { + if entry.get().is_empty() { + entry.remove(); + } + } + } + + /// Updates the last dial time for the specified peer (if one was found) + fn update_last_dial_time(&mut self, peer_id: &PeerId) { + if let Some(discovered_peer) = self.peer_set.get_mut(peer_id) { + discovered_peer.update_last_dial_time() + } + } + + /// Returns the ping latency for the specified peer (if one was found) + fn get_ping_latency_secs(&self, peer_id: &PeerId) -> Option { + if let Some(discovered_peer) = self.peer_set.get(peer_id) { + discovered_peer.ping_latency_secs + } else { + None + } + } + + /// Updates the ping latency for the specified peer (if one was found) + fn update_ping_latency_secs(&mut self, peer_id: &PeerId, latency_secs: f64) { + if let Some(discovered_peer) = self.peer_set.get_mut(peer_id) { + discovered_peer.set_ping_latency_secs(latency_secs) + } + } } /// Represents all the information for a discovered peer @@ -204,6 +236,8 @@ struct DiscoveredPeer { keys: PublicKeys, /// The last time the node was dialed last_dial_time: SystemTime, + /// The calculated peer ping latency (secs) + ping_latency_secs: Option, } impl DiscoveredPeer { @@ -213,6 +247,7 @@ impl DiscoveredPeer { addrs: Addresses::default(), keys: PublicKeys::default(), last_dial_time: SystemTime::UNIX_EPOCH, + 
ping_latency_secs: None, } } @@ -226,15 +261,25 @@ impl DiscoveredPeer { self.is_eligible() && !self.addrs.is_empty() } + /// Returns true iff the peer's addresses and keys are empty + pub fn is_empty(&self) -> bool { + self.addrs.is_empty() && self.keys.is_empty() + } + /// Updates the last time we tried to connect to this node - pub fn set_last_dial_time(&mut self, time: SystemTime) { - self.last_dial_time = time; + pub fn update_last_dial_time(&mut self) { + self.last_dial_time = SystemTime::now(); + } + + /// Updates the ping latency for this peer + pub fn set_ping_latency_secs(&mut self, latency_secs: f64) { + self.ping_latency_secs = Some(latency_secs); } /// Based on input, backoff on amount of time to dial a peer again - pub fn has_dialed_recently(&self, backoff_duration: Duration) -> bool { + pub fn has_dialed_recently(&self) -> bool { if let Ok(duration_since_last_dial) = self.last_dial_time.elapsed() { - duration_since_last_dial < backoff_duration + duration_since_last_dial < TRY_DIAL_BACKOFF_TIME } else { false } @@ -243,8 +288,8 @@ impl DiscoveredPeer { impl PartialOrd for DiscoveredPeer { fn partial_cmp(&self, other: &Self) -> Option { - let self_dialed_recently = self.has_dialed_recently(TRY_DIAL_BACKOFF_TIME); - let other_dialed_recently = other.has_dialed_recently(TRY_DIAL_BACKOFF_TIME); + let self_dialed_recently = self.has_dialed_recently(); + let other_dialed_recently = other.has_dialed_recently(); // Less recently dialed is prioritized over recently dialed if !self_dialed_recently && other_dialed_recently { @@ -313,6 +358,7 @@ where max_delay: Duration, outbound_connection_limit: Option, mutual_authentication: bool, + enable_latency_aware_dialing: bool, ) -> Self { // Verify that the trusted peers set exists and that it is empty let trusted_peers = peers_and_metadata @@ -336,7 +382,7 @@ where time_service, peers_and_metadata, connected: HashMap::new(), - discovered_peers: DiscoveredPeerSet::default(), + discovered_peers: 
Arc::new(RwLock::new(DiscoveredPeerSet::default())), connection_reqs_tx, connection_notifs_rx, requests_rx, @@ -347,11 +393,11 @@ where max_delay, event_id: 0, outbound_connection_limit, - rng: SmallRng::from_entropy(), mutual_authentication, + enable_latency_aware_dialing, }; - // set the initial config addresses and pubkeys + // Set the initial seed config addresses and public keys connmgr.handle_update_discovered_peers(DiscoverySource::Config, seeds); connmgr } @@ -510,68 +556,163 @@ where } } - fn dial_eligible_peers<'a>( + /// Identifies a set of peers to dial and queues them for dialing + async fn dial_eligible_peers<'a>( &'a mut self, pending_dials: &'a mut FuturesUnordered>, ) { - let to_connect = self.choose_peers_to_dial(); - for (peer_id, peer) in to_connect { + for (peer_id, peer) in self.choose_peers_to_dial().await { self.queue_dial_peer(peer_id, peer, pending_dials); } } - fn choose_peers_to_dial(&mut self) -> Vec<(PeerId, DiscoveredPeer)> { + /// Selects a set of peers to dial + async fn choose_peers_to_dial(&mut self) -> Vec<(PeerId, DiscoveredPeer)> { + // Get the eligible peers to dial let network_id = self.network_context.network_id(); let role = self.network_context.role(); let roles_to_dial = network_id.upstream_roles(&role); - let mut eligible: Vec<_> = self - .discovered_peers - .0 - .iter() + let discovered_peers = self.discovered_peers.read().peer_set.clone(); + let eligible_peers: Vec<_> = discovered_peers + .into_iter() .filter(|(peer_id, peer)| { peer.is_eligible_to_be_dialed() // The node is eligible to dial - && !self.connected.contains_key(peer_id) // The node is not already connected. - && !self.dial_queue.contains_key(peer_id) // There is no pending dial to this node. 
+ && !self.connected.contains_key(peer_id) // The node is not already connected + && !self.dial_queue.contains_key(peer_id) // There is no pending dial to this node && roles_to_dial.contains(&peer.role) // We can dial this role }) .collect(); - // Prioritize by PeerRole - // Shuffle so we don't get stuck on certain peers - eligible.shuffle(&mut self.rng); - - // Sort by peer priority - eligible - .sort_by(|(_, peer), (_, other)| peer.partial_cmp(other).unwrap_or(Ordering::Equal)); - - // Limit the number of dialed connections from a Full Node - // This does not limit the number of incoming connections - // It enforces that a full node cannot have more outgoing connections than `connection_limit` - // including in flight dials. - let num_eligible = eligible.len(); - let to_connect = if let Some(conn_limit) = self.outbound_connection_limit { - let outbound_connections = self - .connected - .iter() - .filter(|(_, metadata)| metadata.origin == ConnectionOrigin::Outbound) - .count(); - min( - conn_limit - .saturating_sub(outbound_connections.saturating_add(self.dial_queue.len())), - num_eligible, + // Initialize the dial state for any new peers + for (peer_id, _) in &eligible_peers { + self.dial_states + .entry(*peer_id) + .or_insert_with(|| DialState::new(self.backoff_strategy.clone())); + } + + // Limit the number of dialed connections from a fullnode. Note: this does not + // limit the number of incoming connections. It only enforces that a fullnode + // cannot have more outgoing connections than the limit (including in-flight dials). 
+ let num_eligible_peers = eligible_peers.len(); + let num_peers_to_dial = + if let Some(outbound_connection_limit) = self.outbound_connection_limit { + // Get the number of outbound connections + let num_outbound_connections = self + .connected + .iter() + .filter(|(_, metadata)| metadata.origin == ConnectionOrigin::Outbound) + .count(); + + // Add any pending dials to the count + let total_outbound_connections = + num_outbound_connections.saturating_add(self.dial_queue.len()); + + // Calculate the potential number of peers to dial + let num_peers_to_dial = + outbound_connection_limit.saturating_sub(total_outbound_connections); + + // Limit the number of peers to dial by the total number of eligible peers + min(num_peers_to_dial, num_eligible_peers) + } else { + num_eligible_peers // Otherwise, we attempt to dial all eligible peers + }; + + // If we have no peers to dial, return early + if num_peers_to_dial == 0 { + return vec![]; + } + + // Prioritize the eligible peers and select the peers to dial + if selection::should_select_peers_by_latency( + &self.network_context, + self.enable_latency_aware_dialing, + ) { + // Ping the eligible peers (so that we can fetch missing ping latency information) + self.ping_eligible_peers(eligible_peers.clone()).await; + + // Choose the peers to dial (weighted by ping latency) + selection::choose_random_peers_by_ping_latency( + self.network_context, + eligible_peers, + num_peers_to_dial, + self.discovered_peers.clone(), ) } else { - num_eligible - }; + // Choose the peers randomly + selection::choose_peers_to_dial_randomly(eligible_peers, num_peers_to_dial) + } + } - // Take peers to connect to in priority order - eligible - .iter() - .take(to_connect) - .map(|(peer_id, peer)| (**peer_id, (*peer).clone())) - .collect() + /// Pings the eligible peers to calculate their ping latencies + /// and updates the discovered peer state accordingly. 
+ async fn ping_eligible_peers(&mut self, eligible_peers: Vec<(PeerId, DiscoveredPeer)>) { + // Identify the eligible peers that don't already have latency information + let peers_to_ping = eligible_peers + .into_iter() + .filter(|(_, peer)| peer.ping_latency_secs.is_none()) + .collect::>(); + + // If there are no peers to ping, return early + let num_peers_to_ping = peers_to_ping.len(); + if num_peers_to_ping == 0 { + return; + } + + // Spawn a task that pings each peer concurrently + let ping_start_time = Instant::now(); + let mut ping_tasks = vec![]; + for (peer_id, peer) in peers_to_ping.into_iter() { + // Get the network address for the peer + let network_context = self.network_context; + let network_address = match self.dial_states.get(&peer_id) { + Some(dial_state) => match dial_state.random_addr(&peer.addrs) { + Some(network_address) => network_address.clone(), + None => { + warn!( + NetworkSchema::new(&network_context), + "Peer {} does not have a network address!", + peer_id.short_str() + ); + continue; // Continue onto the next peer + }, + }, + None => { + warn!( + NetworkSchema::new(&network_context), + "Peer {} does not have a dial state!", + peer_id.short_str() + ); + continue; // Continue onto the next peer + }, + }; + + // Ping the peer + let ping_task = spawn_latency_ping_task( + network_context, + peer_id, + network_address, + self.discovered_peers.clone(), + ); + + // Add the task to the list of ping tasks + ping_tasks.push(ping_task); + } + + // Wait for all the ping tasks to complete (or timeout) + let num_ping_tasks = ping_tasks.len(); + join_all(ping_tasks).await; + + // Log the peer ping latencies + log_peer_ping_latencies( + self.network_context, + self.discovered_peers.clone(), + num_peers_to_ping, + num_ping_tasks, + ping_start_time, + ); } + /// Queues a dial to the specified peer fn queue_dial_peer<'a>( &'a mut self, peer_id: PeerId, @@ -582,19 +723,36 @@ where // newly eligible, but not connected to peers, have their counter initialized 
properly. counters::peer_connected(&self.network_context, &peer_id, 0); - let connection_reqs_tx = self.connection_reqs_tx.clone(); - // The initial dial state; it has zero dial delay and uses the first - // address. - let init_dial_state = DialState::new(self.backoff_strategy.clone()); - let dial_state = self - .dial_states - .entry(peer_id) - .or_insert_with(|| init_dial_state); + // Get the peer's dial state + let dial_state = match self.dial_states.get_mut(&peer_id) { + Some(dial_state) => dial_state, + None => { + // The peer should have a dial state! If not, log an error and return. + error!( + NetworkSchema::new(&self.network_context).remote_peer(&peer_id), + "{} Peer {} does not have a dial state!", + self.network_context, + peer_id.short_str() + ); + return; + }, + }; // Choose the next addr to dial for this peer. Currently, we just // round-robin the selection, i.e., try the sequence: // addr[0], .., addr[len-1], addr[0], .. - let addr = dial_state.next_addr(&peer.addrs).clone(); + let addr = match dial_state.next_addr(&peer.addrs) { + Some(addr) => addr.clone(), + None => { + warn!( + NetworkSchema::new(&self.network_context).remote_peer(&peer_id), + "{} Peer {} does not have any network addresses!", + self.network_context, + peer_id.short_str() + ); + return; + }, + }; // Using the DialState's backoff strategy, compute the delay until // the next dial attempt for this peer. @@ -606,6 +764,7 @@ where let network_context = self.network_context; // Create future which completes by either dialing after calculated // delay or on cancellation. + let connection_reqs_tx = self.connection_reqs_tx.clone(); let f = async move { // We dial after a delay. The dial can be canceled by sending to or dropping // `cancel_rx`. 
@@ -634,9 +793,9 @@ where pending_dials.push(f.boxed()); // Update last dial time - if let Some(discovered_peer) = self.discovered_peers.get_mut(&peer_id) { - discovered_peer.set_last_dial_time(SystemTime::now()) - } + self.discovered_peers + .write() + .update_last_dial_time(&peer_id); self.dial_queue.insert(peer_id, cancel_tx); } @@ -658,7 +817,7 @@ where info!( NetworkSchema::new(&self.network_context), discovered_peers = ?self.discovered_peers, - "Current eligible peers" + "Active discovered peers" ) }); @@ -668,12 +827,28 @@ where self.close_stale_connections().await; // Dial peers which are eligible but are neither connected nor queued for dialing in the // future. - self.dial_eligible_peers(pending_dials); + self.dial_eligible_peers(pending_dials).await; + + // Update the metrics for any peer ping latencies + self.update_ping_latency_metrics(); } - fn reset_dial_state(&mut self, peer_id: &PeerId) { - if let Some(dial_state) = self.dial_states.get_mut(peer_id) { - *dial_state = DialState::new(self.backoff_strategy.clone()); + /// Updates the metrics for tracking pre-dial and connected peer ping latencies + fn update_ping_latency_metrics(&self) { + // Update the pre-dial peer ping latencies + for (_, peer) in self.discovered_peers.read().peer_set.iter() { + if let Some(ping_latency_secs) = peer.ping_latency_secs { + counters::observe_pre_dial_ping_time(&self.network_context, ping_latency_secs); + } + } + + // Update the connected peer ping latencies + for peer_id in self.connected.keys() { + if let Some(ping_latency_secs) = + self.discovered_peers.read().get_ping_latency_secs(peer_id) + { + counters::observe_connected_ping_time(&self.network_context, ping_latency_secs); + } } } @@ -704,11 +879,14 @@ where } } + /// Handles an update for newly discovered peers. This typically + /// occurs at node startup, and on epoch changes. 
fn handle_update_discovered_peers( &mut self, src: DiscoverySource, new_discovered_peers: PeerSet, ) { + // Log the update event info!( NetworkSchema::new(&self.network_context), "{} Received updated list of discovered peers! Source: {:?}, num peers: {:?}", @@ -717,13 +895,10 @@ where new_discovered_peers.len() ); - let self_peer_id = self.network_context.peer_id(); + // Remove peers that no longer have relevant network information let mut keys_updated = false; - let mut peers_to_check_remove = Vec::new(); - - // Remove peer info that no longer have information to use them - for (peer_id, peer) in self.discovered_peers.0.iter_mut() { + for (peer_id, peer) in self.discovered_peers.write().peer_set.iter_mut() { let new_peer = new_discovered_peers.get(peer_id); let check_remove = if let Some(new_peer) = new_peer { if new_peer.keys.is_empty() { @@ -745,24 +920,25 @@ where // Remove peers that no longer have state for peer_id in peers_to_check_remove { - self.discovered_peers.try_remove_empty(&peer_id); + self.discovered_peers.write().remove_peer_if_empty(&peer_id); } // Make updates to the peers accordingly for (peer_id, discovered_peer) in new_discovered_peers { // Don't include ourselves, because we don't need to dial ourselves - if peer_id == self_peer_id { + if peer_id == self.network_context.peer_id() { continue; } // Create the new `DiscoveredPeer`, role is set when a `Peer` is first discovered - let peer = self - .discovered_peers - .0 + let mut discovered_peers = self.discovered_peers.write(); + let peer = discovered_peers + .peer_set .entry(peer_id) .or_insert_with(|| DiscoveredPeer::new(discovered_peer.role)); + + // Update the peer's pubkeys let mut peer_updated = false; - // Update peer's pubkeys if peer.keys.update(src, discovered_peer.keys) { info!( NetworkSchema::new(&self.network_context) @@ -777,7 +953,7 @@ where peer_updated = true; } - // Update peer's addresses + // Update the peer's addresses if peer.addrs.update(src, discovered_peer.addresses) { 
info!( NetworkSchema::new(&self.network_context).remote_peer(&peer_id), @@ -797,7 +973,9 @@ where // fresh backoff (since the current backoff delay might be maxed // out if we can't reach any of their previous addresses). if peer_updated { - self.reset_dial_state(&peer_id) + if let Some(dial_state) = self.dial_states.get_mut(&peer_id) { + *dial_state = DialState::new(self.backoff_strategy.clone()); + } } } @@ -805,7 +983,7 @@ where if keys_updated { // For each peer, union all of the pubkeys from each discovery source // to generate the new eligible peers set. - let new_eligible = self.discovered_peers.to_eligible_peers(); + let new_eligible = self.discovered_peers.read().get_eligible_peers(); // Swap in the new eligible peers set if let Err(error) = self @@ -933,6 +1111,119 @@ fn log_dial_result( } } +/// Logs the total and individual ping latencies +fn log_peer_ping_latencies( + network_context: NetworkContext, + discovered_peers: Arc>, + total_peers_to_ping: usize, + num_peers_pinged: usize, + ping_start_time: Instant, +) { + // Log the total ping latency time + let ping_latency_duration = Instant::now().duration_since(ping_start_time); + info!( + NetworkSchema::new(&network_context), + "Finished pinging eligible peers! 
Total peers to ping: {}, num peers pinged: {}, time: {} secs", + total_peers_to_ping, + num_peers_pinged, + ping_latency_duration.as_secs_f64() + ); + + // Log the ping latencies for the eligible peers (sorted by latency) + let eligible_peers = discovered_peers.read().peer_set.clone(); + let eligible_peers_and_latencies = eligible_peers + .into_iter() + .map(|(peer_id, peer)| (peer_id, peer.ping_latency_secs)) + .collect::>(); + let sorted_eligible_peers_and_latencies = eligible_peers_and_latencies + .iter() + .sorted_by_key(|(_, ping_latency_secs)| ping_latency_secs.map(OrderedFloat)) + .collect::>(); + info!( + NetworkSchema::new(&network_context), + "Sorted eligible peers with recorded ping latencies: {:?}", + sorted_eligible_peers_and_latencies + ); +} + +/// Spawns a task that pings the peer at the specified +/// network address and updates the peer's ping latency. +fn spawn_latency_ping_task( + network_context: NetworkContext, + peer_id: AccountAddress, + network_address: NetworkAddress, + discovered_peers: Arc>, +) -> JoinHandle<()> { + tokio::task::spawn_blocking(move || { + // Extract the socket addresses from the network address + let socket_addresses = match network_address.to_socket_addrs() { + Ok(socket_addresses) => socket_addresses.collect::>(), + Err(error) => { + warn!( + NetworkSchema::new(&network_context), + "Failed to resolve network address {:?}: {}", network_address, error + ); + return; + }, + }; + + // If no socket addresses were found, log an error and return + if socket_addresses.is_empty() { + warn!( + NetworkSchema::new(&network_context), + "Peer {} does not have any socket addresses for network address {:?}!", + peer_id.short_str(), + network_address, + ); + return; + } + + // Limit the number of socket addresses we'll try to connect to + let socket_addresses = socket_addresses + .iter() + .take(MAX_SOCKET_ADDRESSES_TO_PING) + .collect::>(); + + // Attempt to connect to the socket addresses over TCP and time the connection + for 
socket_address in socket_addresses { + // Start the ping timer + let start_time = Instant::now(); + + // Attempt to connect to the socket address + if let Ok(tcp_stream) = TcpStream::connect_timeout( + socket_address, + Duration::from_secs(MAX_CONNECTION_TIMEOUT_SECS), + ) { + // We connected successfully, update the peer's ping latency + let ping_latency_secs = start_time.elapsed().as_secs_f64(); + discovered_peers + .write() + .update_ping_latency_secs(&peer_id, ping_latency_secs); + + // Attempt to terminate the TCP stream cleanly + if let Err(error) = tcp_stream.shutdown(Shutdown::Both) { + warn!( + NetworkSchema::new(&network_context), + "Failed to terminate TCP stream to peer {} after pinging: {}", + peer_id.short_str(), + error + ); + } + + return; + } else { + // Log an error if we failed to connect to the socket address + info!( + NetworkSchema::new(&network_context), + "Failed to ping peer {} at socket address {:?} after pinging", + peer_id.short_str(), + socket_address + ); + } + } + }) +} + ///////////////////// // DiscoverySource // ///////////////////// @@ -1058,13 +1349,27 @@ where } } - fn next_addr<'a>(&mut self, addrs: &'a Addresses) -> &'a NetworkAddress { - assert!(!addrs.is_empty()); + /// Returns the address to dial (specified by the index) for this peer + fn get_addr_at_index<'a>( + &self, + addr_index: usize, + addrs: &'a Addresses, + ) -> Option<&'a NetworkAddress> { + addrs.get(addr_index % addrs.len()) + } - let addr_idx = self.addr_idx; + /// Returns the current address to dial for this peer and updates + /// the internal state to point to the next address. 
+ fn next_addr<'a>(&mut self, addrs: &'a Addresses) -> Option<&'a NetworkAddress> { + let curr_addr = self.get_addr_at_index(self.addr_idx, addrs); self.addr_idx = self.addr_idx.wrapping_add(1); + curr_addr + } - addrs.get(addr_idx % addrs.len()).unwrap() + /// Returns a random address to dial for this peer + fn random_addr<'a>(&self, addrs: &'a Addresses) -> Option<&'a NetworkAddress> { + let addr_index = ::rand_latest::thread_rng().gen_range(0..addrs.len()); + self.get_addr_at_index(addr_index, addrs) } fn next_backoff_delay(&mut self, max_delay: Duration) -> Duration { diff --git a/network/framework/src/connectivity_manager/selection.rs b/network/framework/src/connectivity_manager/selection.rs new file mode 100644 index 0000000000000..f31d5759f58d6 --- /dev/null +++ b/network/framework/src/connectivity_manager/selection.rs @@ -0,0 +1,755 @@ +// Copyright © Aptos Foundation +// SPDX-License-Identifier: Apache-2.0 + +use crate::{ + connectivity_manager::{DiscoveredPeer, DiscoveredPeerSet}, + logging::NetworkSchema, +}; +use aptos_config::network_id::NetworkContext; +use aptos_infallible::RwLock; +use aptos_logger::error; +use aptos_types::PeerId; +use maplit::hashset; +use ordered_float::OrderedFloat; +use rand_latest::prelude::*; +use std::{cmp::Ordering, collections::HashSet, sync::Arc}; + +/// Chooses peers to dial randomly from the given list of eligible +/// peers. We take last dial times into account to ensure that we +/// don't dial the same peers too frequently. 
+pub fn choose_peers_to_dial_randomly( + mut eligible_peers: Vec<(PeerId, DiscoveredPeer)>, + num_peers_to_dial: usize, +) -> Vec<(PeerId, DiscoveredPeer)> { + // Shuffle the peers (so that we don't always dial the same ones first) + eligible_peers.shuffle(&mut ::rand_latest::thread_rng()); + + // Sort the peers by priority (this takes into account last dial times) + eligible_peers + .sort_by(|(_, peer), (_, other)| peer.partial_cmp(other).unwrap_or(Ordering::Equal)); + + // Select the peers to dial + eligible_peers.into_iter().take(num_peers_to_dial).collect() +} + +/// Chooses peers randomly weighted by latency from the given list of peers +pub fn choose_random_peers_by_ping_latency( + network_context: NetworkContext, + eligible_peers: Vec<(PeerId, DiscoveredPeer)>, + num_peers_to_choose: usize, + discovered_peers: Arc>, +) -> Vec<(PeerId, DiscoveredPeer)> { + // Get all eligible peer IDs + let eligible_peer_ids = eligible_peers + .iter() + .map(|(peer_id, _)| *peer_id) + .collect::>(); + + // Identify the peer IDs that haven't been dialed recently + let non_recently_dialed_peer_ids = eligible_peers + .iter() + .filter(|(_, peer)| !peer.has_dialed_recently()) + .map(|(peer_id, _)| *peer_id) + .collect::>(); + + // Choose peers (weighted by latency) from the non-recently dialed peers + let mut selected_peer_ids = choose_peers_by_ping_latency( + &network_context, + &non_recently_dialed_peer_ids, + num_peers_to_choose, + discovered_peers.clone(), + ); + + // If not enough peers were selected, choose additional peers weighted by latency + let num_selected_peer_ids = selected_peer_ids.len(); + if num_selected_peer_ids < num_peers_to_choose { + // Filter out the already selected peers + let unselected_peer_ids = get_unselected_peer_ids(&eligible_peer_ids, &selected_peer_ids); + + // Choose the remaining peers weighted by latency + let num_remaining_peers = num_peers_to_choose.saturating_sub(num_selected_peer_ids); + let remaining_selected_peer_ids = 
choose_peers_by_ping_latency( + &network_context, + &unselected_peer_ids, + num_remaining_peers, + discovered_peers.clone(), + ); + + // Extend the selected peers with the remaining peers + selected_peer_ids.extend(remaining_selected_peer_ids); + } + + // Extend the selected peers with random peers (if necessary) + let selected_peer_ids = + extend_with_random_peers(selected_peer_ids, &eligible_peer_ids, num_peers_to_choose); + + // Return the selected peers + get_discovered_peers_for_ids(selected_peer_ids, discovered_peers) +} + +/// Returns true iff peers should be selected by ping latency. Note: this only +/// makes sense for the public network, as the validator and VFN networks +/// establish all-to-all connections. +pub fn should_select_peers_by_latency( + network_context: &NetworkContext, + enable_latency_aware_dialing: bool, +) -> bool { + network_context.network_id().is_public_network() && enable_latency_aware_dialing +} + +/// Selects the specified number of peers from the list of potential +/// peers. Peer selection is weighted by peer latencies (i.e., the +/// lower the ping latency, the higher the probability of selection). 
+fn choose_peers_by_ping_latency( + network_context: &NetworkContext, + peer_ids: &HashSet, + num_peers_to_choose: usize, + discovered_peers: Arc>, +) -> HashSet { + // If no peers can be chosen, return an empty list + if num_peers_to_choose == 0 || peer_ids.is_empty() { + return hashset![]; + } + + // Gather the latency weights for all peers + let mut peer_ids_and_latency_weights = vec![]; + for peer_id in peer_ids { + if let Some(ping_latency_secs) = discovered_peers.read().get_ping_latency_secs(peer_id) { + let latency_weight = convert_latency_to_weight(ping_latency_secs); + peer_ids_and_latency_weights.push((peer_id, OrderedFloat(latency_weight))); + } + } + + // Get the random peers by weight + let weighted_selected_peers = peer_ids_and_latency_weights + .choose_multiple_weighted( + &mut ::rand_latest::thread_rng(), + num_peers_to_choose, + |peer| peer.1, + ) + .map(|peers| peers.into_iter().map(|peer| *peer.0).collect::>()); + + // Return the random peers by weight + weighted_selected_peers + .unwrap_or_else(|error| { + // We failed to select any peers + error!( + NetworkSchema::new(network_context), + "Failed to choose peers by latency for network context: {:?}. Error: {:?}", + network_context, + error + ); + vec![] + }) + .into_iter() + .collect::>() +} + +/// Converts the given latency measurement to a weight. The weight +/// is calculated as the inverse of the latency, with a scaling +/// factor to ensure that low latency peers are highly weighted. +fn convert_latency_to_weight(latency_secs: f64) -> f64 { + // If the latency is <= 0, something has gone wrong, so return 0. 
+ if latency_secs <= 0.0 { + return 0.0; + } + + // Invert the latency to get the weight + let mut weight = 1.0 / latency_secs; + + // For every 25ms of latency, reduce the weight by 1/2 (to + // ensure that low latency peers are highly weighted) + let num_reductions = (latency_secs / 0.025) as usize; + for _ in 0..num_reductions { + weight /= 2.0; + } + + weight +} + +/// If the number of selected peers is less than the number of required peers, +/// select remaining peers from the serviceable peers (at random). +fn extend_with_random_peers( + mut selected_peer_ids: HashSet, + peer_ids: &HashSet, + num_required_peers: usize, +) -> HashSet { + // Only select random peers if we don't have enough peers + let num_selected_peers = selected_peer_ids.len(); + if num_selected_peers < num_required_peers { + // Filter out the already selected peers + let unselected_peer_ids = get_unselected_peer_ids(peer_ids, &selected_peer_ids); + + // Randomly select the remaining peers + let num_remaining_peers = num_required_peers.saturating_sub(num_selected_peers); + let remaining_peer_ids = unselected_peer_ids + .into_iter() + .choose_multiple(&mut ::rand_latest::thread_rng(), num_remaining_peers); + + // Add the remaining peers to the selected peers + selected_peer_ids.extend(remaining_peer_ids); + } + + selected_peer_ids +} + +/// Returns the discovered peer states for the given peer ids +fn get_discovered_peers_for_ids( + peer_ids: HashSet, + discovered_peers: Arc>, +) -> Vec<(PeerId, DiscoveredPeer)> { + peer_ids + .into_iter() + .filter_map(|peer_id| { + discovered_peers + .read() + .peer_set + .get(&peer_id) + .map(|peer| (peer_id, peer.clone())) + }) + .collect() +} + +/// Returns the unselected peer IDs from the given set of eligible and selected peer IDs +fn get_unselected_peer_ids( + eligible_peer_ids: &HashSet, + selected_peer_ids: &HashSet, +) -> HashSet { + eligible_peer_ids + .difference(selected_peer_ids) + .cloned() + .collect() +} + +#[cfg(test)] +mod test { + use 
super::*; + use aptos_config::{ + config::{PeerRole, RoleType}, + network_id::NetworkId, + }; + use aptos_types::account_address::AccountAddress; + use rand::Rng; + use std::collections::{BinaryHeap, HashMap}; + + #[test] + fn test_choose_random_peers() { + // Create an empty eligible peers set + let eligible_peers = vec![]; + + // Choose several peers randomly and verify none are selected + let selected_peers = choose_peers_to_dial_randomly(eligible_peers, 5); + assert!(selected_peers.is_empty()); + + // Create a large set of eligible peers + let eligible_peers = create_eligible_peers(100); + + // Choose several peers randomly and verify the number of selected peers + let num_peers_to_dial = 5; + let selected_peers = choose_peers_to_dial_randomly(eligible_peers, num_peers_to_dial); + assert_eq!(selected_peers.len(), num_peers_to_dial); + + // Create a small set of eligible peers + let num_eligible_peers = 5; + let eligible_peers = create_eligible_peers(num_eligible_peers); + + // Choose many peers randomly and verify the number of selected peers + let selected_peers = choose_peers_to_dial_randomly(eligible_peers, 20); + assert_eq!(selected_peers.len(), num_eligible_peers); + } + + #[test] + fn test_choose_random_peers_shuffle() { + // Create a set of 10 eligible peers + let num_eligible_peers = 10; + let eligible_peers = create_eligible_peers(num_eligible_peers); + + // Choose all the peers randomly and verify the number of selected peers + let selected_peers_1 = + choose_peers_to_dial_randomly(eligible_peers.clone(), num_eligible_peers); + assert_eq!(selected_peers_1.len(), num_eligible_peers); + + // Choose all the peers randomly again and verify the number of selected peers + let selected_peers_2 = choose_peers_to_dial_randomly(eligible_peers, num_eligible_peers); + assert_eq!(selected_peers_2.len(), num_eligible_peers); + + // Verify the selected peer sets are identical + for peer in selected_peers_1.clone() { + assert!(selected_peers_2.contains(&peer)); + } + 
+ // Verify that the peer orders are different (the peers were shuffled randomly!) + assert_ne!(selected_peers_1, selected_peers_2); + } + + #[test] + fn test_choose_random_peers_recently_dialed() { + // Create a set of eligible peers + let mut eligible_peers = vec![]; + + // Add peers that have not been dialed recently + let num_non_dialed_peers = 20; + let non_dialed_peers = insert_non_dialed_peers(num_non_dialed_peers, &mut eligible_peers); + + // Add peers that have been dialed recently + let num_dialed_peers = 60; + let dialed_peers = insert_dialed_peers(num_dialed_peers, &mut eligible_peers); + + // Choose various peers randomly (until the max non-dialed peers) and verify the selection + for num_peers_to_dial in 1..=num_non_dialed_peers { + // Choose peers randomly and verify the number of selected peers + let selected_peers = + choose_peers_to_dial_randomly(eligible_peers.clone(), num_peers_to_dial); + assert_eq!(selected_peers.len(), num_peers_to_dial); + + // Verify that all of the selected peers were not dialed recently + for (peer_id, _) in selected_peers { + assert!(non_dialed_peers.contains(&peer_id)); + assert!(!dialed_peers.contains(&peer_id)); + } + } + + // Choose various peers randomly (beyond the max non-dialed peers) and verify the selection + let mut non_dialed_peer_selected = false; + let mut dialed_peer_selected = false; + let total_num_peers = num_non_dialed_peers + num_dialed_peers; + for num_peers_to_dial in num_non_dialed_peers + 1..=total_num_peers { + // Choose peers randomly and verify the number of selected peers + let selected_peers = + choose_peers_to_dial_randomly(eligible_peers.clone(), num_peers_to_dial); + assert_eq!(selected_peers.len(), num_peers_to_dial); + + // Update the selected peer flags + for (peer_id, _) in selected_peers { + if non_dialed_peers.contains(&peer_id) { + non_dialed_peer_selected = true; + } + if dialed_peers.contains(&peer_id) { + dialed_peer_selected = true; + } + } + + // Verify that at least one of 
each peer type was selected + assert!(non_dialed_peer_selected); + assert!(dialed_peer_selected); + } + } + + #[test] + fn test_choose_peers_by_latency_dialed() { + // Create a set of eligible peers + let mut eligible_peers = vec![]; + + // Add peers that have not been dialed recently + let num_non_dialed_peers = 30; + let non_dialed_peers = insert_non_dialed_peers(num_non_dialed_peers, &mut eligible_peers); + + // Add peers that have been dialed recently + let num_dialed_peers = 30; + let dialed_peers = insert_dialed_peers(num_dialed_peers, &mut eligible_peers); + + // Create the discovered peer set + let discovered_peers = create_discovered_peers(eligible_peers.clone(), true); + + // Choose peers by latency (until the max non-dialed peers) and verify the selection + for num_peers_to_dial in 1..=num_non_dialed_peers { + // Choose peers by latency and verify the number of selected peers + let selected_peers = choose_random_peers_by_ping_latency( + NetworkContext::mock(), + eligible_peers.clone(), + num_peers_to_dial, + discovered_peers.clone(), + ); + assert_eq!(selected_peers.len(), num_peers_to_dial); + + // Verify that all of the selected peers were not dialed recently + for (peer_id, _) in selected_peers { + assert!(non_dialed_peers.contains(&peer_id)); + assert!(!dialed_peers.contains(&peer_id)); + } + } + + // Choose peers by latency (beyond the max non-dialed peers) and verify the selection + let total_num_peers = num_non_dialed_peers + num_dialed_peers; + for num_peers_to_dial in num_non_dialed_peers + 1..=total_num_peers { + // Choose peers by latency and verify the number of selected peers + let selected_peers = choose_random_peers_by_ping_latency( + NetworkContext::mock(), + eligible_peers.clone(), + num_peers_to_dial, + discovered_peers.clone(), + ); + assert_eq!(selected_peers.len(), num_peers_to_dial); + + // Get the selected peer IDs + let selected_peer_ids = selected_peers + .iter() + .map(|(peer_id, _)| *peer_id) + .collect::>(); + + // Verify the 
peer selection + for non_dialed_peer in non_dialed_peers.clone() { + assert!(selected_peer_ids.contains(&non_dialed_peer)); + } + + // Verify that at least some dialed peers were selected + let dialed_selected_peers = non_dialed_peers + .difference(&selected_peer_ids) + .cloned() + .collect::>(); + assert!(dialed_peers.is_superset(&dialed_selected_peers)); + } + } + + #[test] + fn test_choose_peers_by_latency_missing_pings() { + // Create an empty set of eligible peers + let mut eligible_peers = vec![]; + + // Choose several peers by latency and verify none are selected + let network_context = NetworkContext::mock(); + let discovered_peers = Arc::new(RwLock::new(DiscoveredPeerSet::default())); + let selected_peers = choose_random_peers_by_ping_latency( + network_context, + eligible_peers.clone(), + 5, + discovered_peers.clone(), + ); + assert!(selected_peers.is_empty()); + + // Add peers that have not been dialed recently + let num_non_dialed_peers = 30; + let _ = insert_non_dialed_peers(num_non_dialed_peers, &mut eligible_peers); + + // Create the discovered peer set (without ping latencies) + let discovered_peers = create_discovered_peers(eligible_peers.clone(), false); + + // Choose several peers by latency and verify the number of selected peers + let num_peers_to_choose = 5; + let selected_peers = choose_random_peers_by_ping_latency( + network_context, + eligible_peers.clone(), + num_peers_to_choose, + discovered_peers.clone(), + ); + assert_eq!(selected_peers.len(), num_peers_to_choose); + + // Choose all peers by latency and verify the number of selected peers + let selected_peers = choose_random_peers_by_ping_latency( + network_context, + eligible_peers.clone(), + num_non_dialed_peers, + discovered_peers.clone(), + ); + assert_eq!(selected_peers.len(), num_non_dialed_peers); + + // Choose more peers by latency than are available and verify the number of selected peers + let selected_peers = choose_random_peers_by_ping_latency( + network_context, + 
eligible_peers.clone(), + num_non_dialed_peers + 1, + discovered_peers.clone(), + ); + assert_eq!(selected_peers.len(), num_non_dialed_peers); + + // Add peers that have been dialed recently (with no ping latencies) + let num_dialed_peers = 30; + let _ = insert_dialed_peers(num_dialed_peers, &mut eligible_peers); + + // Create the discovered peer set (without ping latencies) + let discovered_peers = create_discovered_peers(eligible_peers.clone(), false); + + // Choose more peers than non dialed-peers and verify the number of selected peers + let num_peers_to_choose = num_non_dialed_peers + 10; + let selected_peers = choose_random_peers_by_ping_latency( + network_context, + eligible_peers.clone(), + num_peers_to_choose, + discovered_peers.clone(), + ); + assert_eq!(selected_peers.len(), num_peers_to_choose); + + // Choose all peers by latency and verify the number of selected peers + let num_peers_to_choose = num_non_dialed_peers + num_dialed_peers; + let selected_peers = choose_random_peers_by_ping_latency( + network_context, + eligible_peers.clone(), + num_peers_to_choose, + discovered_peers.clone(), + ); + assert_eq!(selected_peers.len(), num_peers_to_choose); + + // Choose more peers than are available and verify the number of selected peers + let num_total_peers = num_non_dialed_peers + num_dialed_peers; + let selected_peers = choose_random_peers_by_ping_latency( + network_context, + eligible_peers.clone(), + num_total_peers + 10, + discovered_peers.clone(), + ); + assert_eq!(selected_peers.len(), num_total_peers); + } + + #[test] + fn test_choose_peers_by_latency_prioritized_dialed() { + // Create a set of eligible peers + let mut eligible_peers = vec![]; + + // Add peers that have been dialed recently + let num_dialed_peers = 100; + let dialed_peers = insert_dialed_peers(num_dialed_peers, &mut eligible_peers); + + // Create the discovered peer set + let discovered_peers = create_discovered_peers(eligible_peers.clone(), true); + + // Add peers that have not 
been dialed recently (with no ping latencies) + let num_non_dialed_peers = 100; + let non_dialed_peers = insert_non_dialed_peers(num_non_dialed_peers, &mut eligible_peers); + + // Choose peers by latency (multiple times) and verify the selection + let mut peer_selection_counts = HashMap::new(); + for _ in 0..5000 { + // Choose a single peer by latency and verify the number of selected peers + let num_peers_to_dial = 1; + let selected_peers = choose_random_peers_by_ping_latency( + NetworkContext::mock(), + eligible_peers.clone(), + num_peers_to_dial, + discovered_peers.clone(), + ); + assert_eq!(selected_peers.len(), num_peers_to_dial); + + // Verify the selection and update the peer selection counts + for (peer_id, _) in selected_peers { + // Verify that the peer was dialed recently + assert!(!non_dialed_peers.contains(&peer_id)); + assert!(dialed_peers.contains(&peer_id)); + + // Update the peer selection counts + let count = peer_selection_counts.entry(peer_id).or_insert(0); + *count += 1; + } + } + + // Verify the top 10% of selected peers are the lowest latency peers + verify_highest_peer_selection_latencies(discovered_peers.clone(), &peer_selection_counts); + } + + #[test] + fn test_choose_peers_by_latency_prioritized_non_dialed() { + // Create a set of eligible peers + let mut eligible_peers = vec![]; + + // Add peers that have not been dialed recently + let num_non_dialed_peers = 100; + let non_dialed_peers = insert_non_dialed_peers(num_non_dialed_peers, &mut eligible_peers); + + // Add peers that have been dialed recently + let num_dialed_peers = 100; + let dialed_peers = insert_dialed_peers(num_dialed_peers, &mut eligible_peers); + + // Create the discovered peer set (with ping latencies) + let discovered_peers = create_discovered_peers(eligible_peers.clone(), true); + + // Choose peers by latency (multiple times) and verify the selection + let mut peer_selection_counts = HashMap::new(); + for _ in 0..5000 { + // Choose a single peer by latency and verify 
the number of selected peers + let num_peers_to_dial = 1; + let selected_peers = choose_random_peers_by_ping_latency( + NetworkContext::mock(), + eligible_peers.clone(), + num_peers_to_dial, + discovered_peers.clone(), + ); + assert_eq!(selected_peers.len(), num_peers_to_dial); + + // Verify the selection and update the peer selection counts + for (peer_id, _) in selected_peers { + // Verify that the peer was not dialed recently + assert!(non_dialed_peers.contains(&peer_id)); + assert!(!dialed_peers.contains(&peer_id)); + + // Update the peer selection counts + let count = peer_selection_counts.entry(peer_id).or_insert(0); + *count += 1; + } + } + + // Verify the top 10% of selected peers are the lowest latency peers + verify_highest_peer_selection_latencies(discovered_peers.clone(), &peer_selection_counts); + } + + #[test] + fn test_latency_to_weights() { + // Verify that a latency of 0 has a weight of 0 + assert_eq!(convert_latency_to_weight(0.0), 0.0); + + // Verify that latencies are scaled exponentially + assert_eq!(convert_latency_to_weight(0.001), 1000.0); + assert_eq!(convert_latency_to_weight(0.005), 200.0); + assert_eq!(convert_latency_to_weight(0.01), 100.0); + assert_eq!(convert_latency_to_weight(0.02), 50.0); + assert_eq!(convert_latency_to_weight(0.025), 20.0); + assert_eq!(convert_latency_to_weight(0.05), 5.0); + assert_eq!(convert_latency_to_weight(0.1), 0.625); + assert_eq!(convert_latency_to_weight(0.2), 0.01953125); + } + + #[test] + fn test_should_select_peers_by_latency() { + // Create a validator network context + let validator_network_context = + NetworkContext::new(RoleType::Validator, NetworkId::Validator, PeerId::random()); + + // Verify that we don't select peers by latency for the validator network + let enable_latency_aware_dialing = true; + assert!(!should_select_peers_by_latency( + &validator_network_context, + enable_latency_aware_dialing + )); + + // Create a VFN network context + let vfn_network_context = + 
NetworkContext::new(RoleType::FullNode, NetworkId::Vfn, PeerId::random()); + + // Verify that we don't select peers by latency for the VFN network + let enable_latency_aware_dialing = true; + assert!(!should_select_peers_by_latency( + &vfn_network_context, + enable_latency_aware_dialing + )); + + // Create a public network context + let public_network_context = + NetworkContext::new(RoleType::FullNode, NetworkId::Public, PeerId::random()); + + // Verify that we select peers by latency for the public network + let enable_latency_aware_dialing = true; + assert!(should_select_peers_by_latency( + &public_network_context, + enable_latency_aware_dialing + )); + + // Disable peer ping latencies and verify that we don't select peers by latency + let enable_latency_aware_dialing = false; + assert!(!should_select_peers_by_latency( + &public_network_context, + enable_latency_aware_dialing + )); + } + + /// Creates a set of discovered peers from the given eligible + /// peers. If `set_ping_latencies` is true, random ping latencies + /// are set for each peer. 
+ fn create_discovered_peers( + eligible_peers: Vec<(PeerId, DiscoveredPeer)>, + set_ping_latencies: bool, + ) -> Arc> { + // Create a new discovered peer set + let mut peer_set = HashMap::new(); + for (peer_id, mut peer) in eligible_peers { + // Set a random ping latency between 1 and 1000 ms (if required) + if set_ping_latencies { + let ping_latency_ms = rand::thread_rng().gen_range(1, 1000); + let ping_latency_secs = ping_latency_ms as f64 / 1000.0; + peer.set_ping_latency_secs(ping_latency_secs); + } + + // Insert the peer into the set + peer_set.insert(peer_id, peer.clone()); + } + + // Create and return the discovered peers + Arc::new(RwLock::new(DiscoveredPeerSet::new_from_peer_set(peer_set))) + } + + /// Creates a set of eligible peers (as specified by the number of peers) + fn create_eligible_peers(num_eligible_peers: usize) -> Vec<(PeerId, DiscoveredPeer)> { + let mut eligible_peers = vec![]; + for _ in 0..num_eligible_peers { + eligible_peers.push(( + AccountAddress::random(), + DiscoveredPeer::new(PeerRole::PreferredUpstream), + )); + } + eligible_peers + } + + /// Creates and inserts a set of dialed peers into the eligible peers + /// set, and returns the set of dialed peer IDs. + fn insert_dialed_peers( + num_dialed_peers: usize, + eligible_peers: &mut Vec<(PeerId, DiscoveredPeer)>, + ) -> HashSet { + let mut dialed_peers = hashset![]; + for _ in 0..num_dialed_peers { + // Create a dialed peer + let peer_id = AccountAddress::random(); + let mut peer = DiscoveredPeer::new(PeerRole::PreferredUpstream); + dialed_peers.insert(peer_id); + + // Set the last dial time to be recent + peer.update_last_dial_time(); + + // Add the peer to the eligible peers + eligible_peers.push((peer_id, peer)); + } + dialed_peers + } + + /// Creates and inserts a set of non-dialed peers into the eligible peers + /// set, and returns the set of non-dialed peer IDs. 
+ fn insert_non_dialed_peers( + num_non_dialed_peers: usize, + eligible_peers: &mut Vec<(PeerId, DiscoveredPeer)>, + ) -> HashSet { + let mut non_dialed_peers = hashset![]; + for _ in 0..num_non_dialed_peers { + // Create a non-dialed peer + let peer_id = AccountAddress::random(); + non_dialed_peers.insert(peer_id); + + // Add the peer to the eligible peers + eligible_peers.push((peer_id, DiscoveredPeer::new(PeerRole::ValidatorFullNode))); + } + non_dialed_peers + } + + /// Verifies the top 10% of selected peers are the lowest latency peers + fn verify_highest_peer_selection_latencies( + discovered_peers: Arc>, + peers_and_selection_counts: &HashMap, + ) { + // Build a max-heap of all peers by their selection counts + let mut max_heap_selection_counts = BinaryHeap::new(); + for (peer, selection_count) in peers_and_selection_counts.clone() { + max_heap_selection_counts.push((selection_count, peer)); + } + + // Verify the top 10% of polled peers are the lowest latency peers + let peers_to_verify = peers_and_selection_counts.len() / 10; + let mut highest_seen_latency = 0.0; + for _ in 0..peers_to_verify { + // Get the peer + let (_, peer) = max_heap_selection_counts.pop().unwrap(); + + // Get the peer's ping latency + let discovered_peers = discovered_peers.read(); + let discovered_peer = discovered_peers.peer_set.get(&peer).unwrap(); + let ping_latency = discovered_peer.ping_latency_secs.unwrap(); + + // Verify that the ping latencies are increasing + if ping_latency <= highest_seen_latency { + // The ping latencies did not increase. This should only be + // possible if the latencies are very close (i.e., within 10%). + if (highest_seen_latency - ping_latency) > 0.1 { + panic!("The ping latencies are not increasing! 
Are peers weighted by latency?"); + } + } + + // Update the highest seen latency + highest_seen_latency = ping_latency; + } + } +} diff --git a/network/framework/src/connectivity_manager/test.rs b/network/framework/src/connectivity_manager/test.rs index 70af39cacc1be..7165e8e3e6e2f 100644 --- a/network/framework/src/connectivity_manager/test.rs +++ b/network/framework/src/connectivity_manager/test.rs @@ -19,7 +19,7 @@ use aptos_time_service::{MockTimeService, TimeService}; use aptos_types::{account_address::AccountAddress, network_address::NetworkAddress}; use futures::{executor::block_on, future, SinkExt}; use maplit::{hashmap, hashset}; -use rand::rngs::StdRng; +use rand::{rngs::StdRng, SeedableRng}; use std::{io, str::FromStr}; use tokio_retry::strategy::FixedInterval; @@ -106,6 +106,7 @@ impl TestHarness { MAX_CONNECTION_DELAY, Some(MAX_TEST_CONNECTIONS), true, /* mutual_authentication */ + true, /* enable_latency_aware_dialing */ ); let mock = Self { network_context, diff --git a/network/framework/src/counters.rs b/network/framework/src/counters.rs index 600bdd003839e..e5b5b68c8aacd 100644 --- a/network/framework/src/counters.rs +++ b/network/framework/src/counters.rs @@ -31,6 +31,10 @@ pub const FAILED_LABEL: &str = "failed"; pub const INBOUND_LABEL: &str = "inbound"; pub const OUTBOUND_LABEL: &str = "outbound"; +// Peer ping labels +const CONNECTED_LABEL: &str = "connected"; +const PRE_DIAL_LABEL: &str = "pre_dial"; + // Serialization labels pub const SERIALIZATION_LABEL: &str = "serialization"; pub const DESERIALIZATION_LABEL: &str = "deserialization"; @@ -597,3 +601,30 @@ pub fn start_serialization_timer(protocol_id: ProtocolId, operation: &str) -> Hi .with_label_values(&[protocol_id.as_str(), operation]) .start_timer() } + +/// Counters related to peer ping times (before and after dialing) +pub static NETWORK_PEER_PING_TIMES: Lazy = Lazy::new(|| { + register_histogram_vec!( + "aptos_network_peer_ping_times", + "Counters related to peer ping times (before 
and after dialing)", + &["network_id", "label"], + ) + .unwrap() +}); + +/// Observes the ping time for a connected peer +pub fn observe_connected_ping_time(network_context: &NetworkContext, ping_latency_secs: f64) { + observe_ping_time(network_context, ping_latency_secs, CONNECTED_LABEL); +} + +/// Observes the ping time for a peer before dialing +pub fn observe_pre_dial_ping_time(network_context: &NetworkContext, ping_latency_secs: f64) { + observe_ping_time(network_context, ping_latency_secs, PRE_DIAL_LABEL); +} + +/// Observes the ping time for the given label +fn observe_ping_time(network_context: &NetworkContext, ping_latency_secs: f64, label: &str) { + NETWORK_PEER_PING_TIMES + .with_label_values(&[network_context.network_id().as_str(), label]) + .observe(ping_latency_secs); +} diff --git a/network/framework/src/noise/error.rs b/network/framework/src/noise/error.rs index 7395dd3b65ef3..67f2d657a6397 100644 --- a/network/framework/src/noise/error.rs +++ b/network/framework/src/noise/error.rs @@ -44,9 +44,9 @@ pub enum NoiseHandshakeError { #[error( "noise server: client {0}: client is expecting us to have a different \ - public key: {1}" + public key. 
Expected: {1}, Actual: {2}" )] - ClientExpectingDifferentPubkey(ShortHexStr, String), + ClientExpectingDifferentPubkey(ShortHexStr, String, String), #[error("noise server: client {0}: error parsing handshake init message: {1}")] ServerParseClient(ShortHexStr, NoiseError), diff --git a/network/framework/src/noise/handshake.rs b/network/framework/src/noise/handshake.rs index 0c61cc0490aab..1e7c2c51d358a 100644 --- a/network/framework/src/noise/handshake.rs +++ b/network/framework/src/noise/handshake.rs @@ -348,10 +348,12 @@ impl NoiseUpgrader { } // verify that this is indeed our public key - if self_expected_public_key != self.noise_config.public_key().as_slice() { + let actual_public_key = self.noise_config.public_key(); + if self_expected_public_key != actual_public_key.as_slice() { return Err(NoiseHandshakeError::ClientExpectingDifferentPubkey( remote_peer_short, hex::encode(self_expected_public_key), + hex::encode(actual_public_key.as_slice()), )); } diff --git a/terraform/.tflint.hcl b/terraform/.tflint.hcl deleted file mode 100644 index e469dfc05aa65..0000000000000 --- a/terraform/.tflint.hcl +++ /dev/null @@ -1,17 +0,0 @@ -plugin "aws" { - enabled = true - version = "0.16.1" - source = "github.com/terraform-linters/tflint-ruleset-aws" -} - -plugin "azurerm" { - enabled = true - version = "0.17.1" - source = "github.com/terraform-linters/tflint-ruleset-azurerm" -} - -plugin "google" { - enabled = true - version = "0.19.0" - source = "github.com/terraform-linters/tflint-ruleset-google" -} diff --git a/terraform/aptos-node-testnet/aws/.terraform.lock.hcl b/terraform/aptos-node-testnet/aws/.terraform.lock.hcl new file mode 100644 index 0000000000000..f00d4ecc67ee2 --- /dev/null +++ b/terraform/aptos-node-testnet/aws/.terraform.lock.hcl @@ -0,0 +1,157 @@ +# This file is maintained automatically by "terraform init". +# Manual edits may be lost in future updates. 
+ +provider "registry.terraform.io/hashicorp/aws" { + version = "5.14.0" + hashes = [ + "h1:MkK5wbWd3g9MAR+LqyWhqlGXolkrryf6LWkOAtOdG9k=", + "zh:03b80869b97dfca4ce6ee94a005e15ccec4d98af0876084a963963b05c9ab743", + "zh:11d148800fe028fcd10590f0473c5df306e220776e359aa838c2f07e5a89187e", + "zh:15d696cf583dc2917b257891e4a33afe7c3e8f20b63183f510267d709baaaf3d", + "zh:34c41e44534fbbf95a5f89b38404ee52b41c6c70af68f7e63a423b276fbcf797", + "zh:4211d0fd4753f7ba202f3e4a8afb2e03d12112dd4db4f9267c472bd597dc71ca", + "zh:47b6017d0cdd2f62b9e46137de38cd618441f658f8570a8e2687cce7643bf953", + "zh:51785b942d6f588825f4bfa86e05502be8721194b289c474121072e49acff6c3", + "zh:565f76885d41ecfea192b8a2e2f3d4b3dd278790d1d82b204706ae3582d51cf6", + "zh:703d670e1d73360d2533b02dbe9e2e055bf6f36a478cd4d66f2349861575c2ed", + "zh:7e4701f38590c22066da90b75dd92d81a685225d2d222d22425b7ccb26e92b4a", + "zh:9b12af85486a96aedd8d7984b0ff811a4b42e3d88dad1a3fb4c0b580d04fa425", + "zh:ca3449252d70df14ad713d5b95fa0610da8087f12c9deb87beffe788f518d06d", + "zh:e2ed3d6d8c12d3fe56fb03fe272779270a92f6157ade8c3db1c987b83b62e68c", + "zh:f0b07b84a43d1afc3a9790ca699771970525c132fa8551e7b326d1f263414dd1", + "zh:f1d83b3e5a29bae471f9841a4e0153eac5bccedbdece369e2f6186e9044db64e", + ] +} + +provider "registry.terraform.io/hashicorp/helm" { + version = "2.11.0" + hashes = [ + "h1:AOp9vXIM4uT1c/PVwsWTPiLVGlO2SSYrfiirV5rjCMQ=", + "zh:013857c88f3e19a4b162344e21dc51891c4ac8b600da8391f7fb2b6d234961e1", + "zh:044fffa233a93cdcf8384afbe9e1ab6c9d0b5b176cbae56ff465eb9611302975", + "zh:208b7cdd4fa3a1b25ae817dc00a9198ef98be0ddc3a577b5b72bc0f006afb997", + "zh:3e8b33f56cfe387277572a92037a1ca1cbe4e3aa6b5c19a8c2431193b07f7865", + "zh:7dd663d5619bd71676899b05b19d36f585189fdabc6b0b03c23579524a8fd9bf", + "zh:ae5329cb3e5bf0b86b02e823aac3ef3bd0d4b1618ff013cd0076dca0be8322e4", + "zh:ba6201695b55d51bedacdb017cb8d03d7a8ada51d0168ac44fef3fa791a85ab4", + "zh:c61285c8b1ba10f50cf94c9dcf98f2f3b720f14906a18be71b9b422279b5d806", + 
"zh:d522d388246f38b9f329c511ec579b516d212670b954f9dab64efb27e51862af", + "zh:f569b65999264a9416862bca5cd2a6177d94ccb0424f3a4ef424428912b9cb3c", + "zh:f92546e26b670da61437ae2cbd038427c9374ce5f7a78df52193397da90bd997", + "zh:f9ad1407e5c0d5e3474094491025bf100828e8c1a01acdf9591d7dd1eb59f961", + ] +} + +provider "registry.terraform.io/hashicorp/kubernetes" { + version = "2.23.0" + hashes = [ + "h1:arTzD0XG/DswGCAx9JEttkSKe9RyyFW9W7UWcXF13dU=", + "zh:10488a12525ed674359585f83e3ee5e74818b5c98e033798351678b21b2f7d89", + "zh:1102ba5ca1a595f880e67102bbf999cc8b60203272a078a5b1e896d173f3f34b", + "zh:1347cf958ed3f3f80b3c7b3e23ddda3d6c6573a81847a8ee92b7df231c238bf6", + "zh:2cb18e9f5156bc1b1ee6bc580a709f7c2737d142722948f4a6c3c8efe757fa8d", + "zh:5506aa6f28dcca2a265ccf8e34478b5ec2cb43b867fe6d93b0158f01590fdadd", + "zh:6217a20686b631b1dcb448ee4bc795747ebc61b56fbe97a1ad51f375ebb0d996", + "zh:8accf916c00579c22806cb771e8909b349ffb7eb29d9c5468d0a3f3166c7a84a", + "zh:9379b0b54a0fa030b19c7b9356708ec8489e194c3b5e978df2d31368563308e5", + "zh:aa99c580890691036c2931841e88e7ee80d59ae52289c8c2c28ea0ac23e31520", + "zh:c57376d169875990ac68664d227fb69cd0037b92d0eba6921d757c3fd1879080", + "zh:e6068e3f94f6943b5586557b73f109debe19d1a75ca9273a681d22d1ce066579", + "zh:f569b65999264a9416862bca5cd2a6177d94ccb0424f3a4ef424428912b9cb3c", + ] +} + +provider "registry.terraform.io/hashicorp/local" { + version = "2.4.0" + hashes = [ + "h1:ZUEYUmm2t4vxwzxy1BvN1wL6SDWrDxfH7pxtzX8c6d0=", + "zh:53604cd29cb92538668fe09565c739358dc53ca56f9f11312b9d7de81e48fab9", + "zh:66a46e9c508716a1c98efbf793092f03d50049fa4a83cd6b2251e9a06aca2acf", + "zh:70a6f6a852dd83768d0778ce9817d81d4b3f073fab8fa570bff92dcb0824f732", + "zh:78d5eefdd9e494defcb3c68d282b8f96630502cac21d1ea161f53cfe9bb483b3", + "zh:82a803f2f484c8b766e2e9c32343e9c89b91997b9f8d2697f9f3837f62926b35", + "zh:9708a4e40d6cc4b8afd1352e5186e6e1502f6ae599867c120967aebe9d90ed04", + "zh:973f65ce0d67c585f4ec250c1e634c9b22d9c4288b484ee2a871d7fa1e317406", + 
"zh:c8fa0f98f9316e4cfef082aa9b785ba16e36ff754d6aba8b456dab9500e671c6", + "zh:cfa5342a5f5188b20db246c73ac823918c189468e1382cb3c48a9c0c08fc5bf7", + "zh:e0e2b477c7e899c63b06b38cd8684a893d834d6d0b5e9b033cedc06dd7ffe9e2", + "zh:f62d7d05ea1ee566f732505200ab38d94315a4add27947a60afa29860822d3fc", + "zh:fa7ce69dde358e172bd719014ad637634bbdabc49363104f4fca759b4b73f2ce", + ] +} + +provider "registry.terraform.io/hashicorp/null" { + version = "3.2.1" + hashes = [ + "h1:ydA0/SNRVB1o95btfshvYsmxA+jZFRZcvKzZSB+4S1M=", + "zh:58ed64389620cc7b82f01332e27723856422820cfd302e304b5f6c3436fb9840", + "zh:62a5cc82c3b2ddef7ef3a6f2fedb7b9b3deff4ab7b414938b08e51d6e8be87cb", + "zh:63cff4de03af983175a7e37e52d4bd89d990be256b16b5c7f919aff5ad485aa5", + "zh:74cb22c6700e48486b7cabefa10b33b801dfcab56f1a6ac9b6624531f3d36ea3", + "zh:78d5eefdd9e494defcb3c68d282b8f96630502cac21d1ea161f53cfe9bb483b3", + "zh:79e553aff77f1cfa9012a2218b8238dd672ea5e1b2924775ac9ac24d2a75c238", + "zh:a1e06ddda0b5ac48f7e7c7d59e1ab5a4073bbcf876c73c0299e4610ed53859dc", + "zh:c37a97090f1a82222925d45d84483b2aa702ef7ab66532af6cbcfb567818b970", + "zh:e4453fbebf90c53ca3323a92e7ca0f9961427d2f0ce0d2b65523cc04d5d999c2", + "zh:e80a746921946d8b6761e77305b752ad188da60688cfd2059322875d363be5f5", + "zh:fbdb892d9822ed0e4cb60f2fedbdbb556e4da0d88d3b942ae963ed6ff091e48f", + "zh:fca01a623d90d0cad0843102f9b8b9fe0d3ff8244593bd817f126582b52dd694", + ] +} + +provider "registry.terraform.io/hashicorp/random" { + version = "3.5.1" + hashes = [ + "h1:IL9mSatmwov+e0+++YX2V6uel+dV6bn+fC/cnGDK3Ck=", + "zh:04e3fbd610cb52c1017d282531364b9c53ef72b6bc533acb2a90671957324a64", + "zh:119197103301ebaf7efb91df8f0b6e0dd31e6ff943d231af35ee1831c599188d", + "zh:4d2b219d09abf3b1bb4df93d399ed156cadd61f44ad3baf5cf2954df2fba0831", + "zh:6130bdde527587bbe2dcaa7150363e96dbc5250ea20154176d82bc69df5d4ce3", + "zh:6cc326cd4000f724d3086ee05587e7710f032f94fc9af35e96a386a1c6f2214f", + "zh:78d5eefdd9e494defcb3c68d282b8f96630502cac21d1ea161f53cfe9bb483b3", + 
"zh:b6d88e1d28cf2dfa24e9fdcc3efc77adcdc1c3c3b5c7ce503a423efbdd6de57b", + "zh:ba74c592622ecbcef9dc2a4d81ed321c4e44cddf7da799faa324da9bf52a22b2", + "zh:c7c5cde98fe4ef1143bd1b3ec5dc04baf0d4cc3ca2c5c7d40d17c0e9b2076865", + "zh:dac4bad52c940cd0dfc27893507c1e92393846b024c5a9db159a93c534a3da03", + "zh:de8febe2a2acd9ac454b844a4106ed295ae9520ef54dc8ed2faf29f12716b602", + "zh:eab0d0495e7e711cca367f7d4df6e322e6c562fc52151ec931176115b83ed014", + ] +} + +provider "registry.terraform.io/hashicorp/time" { + version = "0.9.1" + hashes = [ + "h1:VxyoYYOCaJGDmLz4TruZQTSfQhvwEcMxvcKclWdnpbs=", + "zh:00a1476ecf18c735cc08e27bfa835c33f8ac8fa6fa746b01cd3bcbad8ca84f7f", + "zh:3007f8fc4a4f8614c43e8ef1d4b0c773a5de1dcac50e701d8abc9fdc8fcb6bf5", + "zh:5f79d0730fdec8cb148b277de3f00485eff3e9cf1ff47fb715b1c969e5bbd9d4", + "zh:78d5eefdd9e494defcb3c68d282b8f96630502cac21d1ea161f53cfe9bb483b3", + "zh:8c8094689a2bed4bb597d24a418bbbf846e15507f08be447d0a5acea67c2265a", + "zh:a6d9206e95d5681229429b406bc7a9ba4b2d9b67470bda7df88fa161508ace57", + "zh:aa299ec058f23ebe68976c7581017de50da6204883950de228ed9246f309e7f1", + "zh:b129f00f45fba1991db0aa954a6ba48d90f64a738629119bfb8e9a844b66e80b", + "zh:ef6cecf5f50cda971c1b215847938ced4cb4a30a18095509c068643b14030b00", + "zh:f1f46a4f6c65886d2dd27b66d92632232adc64f92145bf8403fe64d5ffa5caea", + "zh:f79d6155cda7d559c60d74883a24879a01c4d5f6fd7e8d1e3250f3cd215fb904", + "zh:fd59fa73074805c3575f08cd627eef7acda14ab6dac2c135a66e7a38d262201c", + ] +} + +provider "registry.terraform.io/hashicorp/tls" { + version = "4.0.4" + hashes = [ + "h1:GZcFizg5ZT2VrpwvxGBHQ/hO9r6g0vYdQqx3bFD3anY=", + "zh:23671ed83e1fcf79745534841e10291bbf34046b27d6e68a5d0aab77206f4a55", + "zh:45292421211ffd9e8e3eb3655677700e3c5047f71d8f7650d2ce30242335f848", + "zh:59fedb519f4433c0fdb1d58b27c210b27415fddd0cd73c5312530b4309c088be", + "zh:5a8eec2409a9ff7cd0758a9d818c74bcba92a240e6c5e54b99df68fff312bbd5", + "zh:5e6a4b39f3171f53292ab88058a59e64825f2b842760a4869e64dc1dc093d1fe", + 
"zh:810547d0bf9311d21c81cc306126d3547e7bd3f194fc295836acf164b9f8424e", + "zh:824a5f3617624243bed0259d7dd37d76017097dc3193dac669be342b90b2ab48", + "zh:9361ccc7048be5dcbc2fafe2d8216939765b3160bd52734f7a9fd917a39ecbd8", + "zh:aa02ea625aaf672e649296bce7580f62d724268189fe9ad7c1b36bb0fa12fa60", + "zh:c71b4cd40d6ec7815dfeefd57d88bc592c0c42f5e5858dcc88245d371b4b8b1e", + "zh:dabcd52f36b43d250a3d71ad7abfa07b5622c69068d989e60b79b2bb4f220316", + "zh:f569b65999264a9416862bca5cd2a6177d94ccb0424f3a4ef424428912b9cb3c", + ] +} diff --git a/terraform/aptos-node-testnet/aws/addons.tf b/terraform/aptos-node-testnet/aws/addons.tf index 29e61afe391dc..2437969df2642 100644 --- a/terraform/aptos-node-testnet/aws/addons.tf +++ b/terraform/aptos-node-testnet/aws/addons.tf @@ -1,8 +1,7 @@ locals { - autoscaling_helm_chart_path = "${path.module}/../../helm/autoscaling" - chaos_mesh_helm_chart_path = "${path.module}/../../helm/chaos" - testnet_addons_helm_chart_path = "${path.module}/../../helm/testnet-addons" - node_health_checker_helm_chart_path = "${path.module}/../../helm/node-health-checker" + autoscaling_helm_chart_path = "${path.module}/../../helm/autoscaling" + chaos_mesh_helm_chart_path = "${path.module}/../../helm/chaos" + testnet_addons_helm_chart_path = "${path.module}/../../helm/testnet-addons" } resource "helm_release" "autoscaling" { @@ -31,10 +30,6 @@ resource "helm_release" "autoscaling" { autoscaler = { enabled = true clusterName = module.validator.aws_eks_cluster.name - image = { - # EKS does not report patch version - tag = "v${module.validator.aws_eks_cluster.version}.0" - } serviceAccount = { annotations = { "eks.amazonaws.com/role-arn" = aws_iam_role.cluster-autoscaler.arn @@ -127,18 +122,19 @@ resource "aws_iam_role_policy" "cluster-autoscaler" { } resource "kubernetes_namespace" "chaos-mesh" { + count = var.enable_forge ? 1 : 0 metadata { annotations = { name = "chaos-mesh" } - name = "chaos-mesh" } } resource "helm_release" "chaos-mesh" { + count = var.enable_forge ? 
1 : 0 name = "chaos-mesh" - namespace = kubernetes_namespace.chaos-mesh.metadata[0].name + namespace = kubernetes_namespace.chaos-mesh[0].metadata[0].name chart = local.chaos_mesh_helm_chart_path max_history = 5 @@ -255,12 +251,6 @@ resource "helm_release" "testnet-addons" { acm_certificate = length(aws_acm_certificate.ingress) > 0 ? aws_acm_certificate.ingress[0].arn : null loadBalancerSourceRanges = var.client_sources_ipv4 } - load_test = { - fullnodeGroups = try(var.aptos_node_helm_values.fullnode.groups, []) - config = { - numFullnodeGroups = var.num_fullnode_groups - } - } }), jsonencode(var.testnet_addons_helm_values) ] @@ -273,30 +263,3 @@ resource "helm_release" "testnet-addons" { } } } - -resource "helm_release" "node-health-checker" { - count = var.enable_node_health_checker ? 1 : 0 - name = "node-health-checker" - chart = local.node_health_checker_helm_chart_path - max_history = 5 - wait = false - - values = [ - jsonencode({ - imageTag = var.image_tag - # borrow the serviceaccount for the rest of the testnet addon components - # TODO: just create a service account for the node-health-checker - serviceAccount = { - create = false - name = "testnet-addons" - } - }), - jsonencode(var.node_health_checker_helm_values) - ] - - # inspired by https://stackoverflow.com/a/66501021 to trigger redeployment whenever any of the charts file contents change. - set { - name = "chart_sha1" - value = sha1(join("", [for f in fileset(local.node_health_checker_helm_chart_path, "**") : filesha1("${local.node_health_checker_helm_chart_path}/${f}")])) - } -} diff --git a/terraform/aptos-node-testnet/aws/dns.tf b/terraform/aptos-node-testnet/aws/dns.tf index 028cd3dbe7577..688f9af7f79d7 100644 --- a/terraform/aptos-node-testnet/aws/dns.tf +++ b/terraform/aptos-node-testnet/aws/dns.tf @@ -26,7 +26,7 @@ resource "aws_acm_certificate" "ingress" { } resource "aws_route53_record" "ingress-acm-validation" { - for_each = var.zone_id == "" ? 
{} : { for dvo in aws_acm_certificate.ingress[0].domain_validation_options : dvo.domain_name => dvo } + for_each = length(aws_acm_certificate.ingress) == 0 ? {} : { for dvo in aws_acm_certificate.ingress[0].domain_validation_options : dvo.domain_name => dvo } zone_id = var.zone_id allow_overwrite = true @@ -37,7 +37,7 @@ resource "aws_route53_record" "ingress-acm-validation" { } resource "aws_acm_certificate_validation" "ingress" { - count = var.zone_id != "" ? 1 : 0 + count = length(aws_acm_certificate.ingress) > 0 ? 1 : 0 certificate_arn = aws_acm_certificate.ingress[0].arn validation_record_fqdns = [for dvo in aws_acm_certificate.ingress[0].domain_validation_options : dvo.resource_record_name] diff --git a/terraform/aptos-node-testnet/aws/forge.tf b/terraform/aptos-node-testnet/aws/forge.tf index db7f537a45d87..d566c833f9f7c 100644 --- a/terraform/aptos-node-testnet/aws/forge.tf +++ b/terraform/aptos-node-testnet/aws/forge.tf @@ -17,7 +17,7 @@ resource "helm_release" "forge" { } serviceAccount = { annotations = { - "eks.amazonaws.com/role-arn" = aws_iam_role.forge.arn + "eks.amazonaws.com/role-arn" = aws_iam_role.forge[0].arn } } }), @@ -32,6 +32,7 @@ resource "helm_release" "forge" { } data "aws_iam_policy_document" "forge-assume-role" { + count = var.enable_forge ? 1 : 0 statement { actions = ["sts:AssumeRoleWithWebIdentity"] @@ -58,6 +59,7 @@ data "aws_iam_policy_document" "forge-assume-role" { } data "aws_iam_policy_document" "forge" { + count = var.enable_forge ? 1 : 0 statement { sid = "AllowS3" actions = [ @@ -71,15 +73,16 @@ data "aws_iam_policy_document" "forge" { } resource "aws_iam_role" "forge" { + count = var.enable_forge ? 
1 : 0 name = "aptos-node-testnet-${local.workspace_name}-forge" path = var.iam_path permissions_boundary = var.permissions_boundary_policy - assume_role_policy = data.aws_iam_policy_document.forge-assume-role.json + assume_role_policy = data.aws_iam_policy_document.forge-assume-role[0].json } resource "aws_iam_role_policy" "forge" { + count = var.enable_forge ? 1 : 0 name = "Helm" - role = aws_iam_role.forge.name - policy = data.aws_iam_policy_document.forge.json + role = aws_iam_role.forge[0].name + policy = data.aws_iam_policy_document.forge[0].json } - diff --git a/terraform/aptos-node-testnet/aws/main.tf b/terraform/aptos-node-testnet/aws/main.tf index fe322e7800af6..2cb77e43cc6b5 100644 --- a/terraform/aptos-node-testnet/aws/main.tf +++ b/terraform/aptos-node-testnet/aws/main.tf @@ -36,8 +36,6 @@ module "validator" { # if forge enabled, standardize the helm release name for ease of operations helm_release_name_override = var.enable_forge ? "aptos-node" : "" - # Forge testing does not require calico for validator NetworkPolicies - enable_calico = !var.enable_forge k8s_api_sources = var.admin_sources_ipv4 k8s_admin_roles = var.k8s_admin_roles @@ -57,21 +55,15 @@ module "validator" { helm_values = var.aptos_node_helm_values # allow all nodegroups to surge to 2x their size by default, in case of total nodes replacement - validator_instance_num = var.num_validator_instance > 0 ? 2 * var.num_validator_instance : var.num_validators - validator_instance_max_num = var.validator_instance_max_num + validator_instance_num = var.num_validator_instance > 0 ? 2 * var.num_validator_instance : var.num_validators + validator_instance_max_num = var.validator_instance_max_num + validator_instance_enable_taint = true # create one utility instance per validator, since HAProxy requires resources 1.5 CPU, 2Gi memory for now utility_instance_num = var.num_utility_instance > 0 ? 
var.num_utility_instance : var.num_validators utility_instance_max_num = var.utility_instance_max_num utility_instance_type = var.utility_instance_type validator_instance_type = var.validator_instance_type - - # addons - enable_monitoring = var.enable_monitoring - enable_prometheus_node_exporter = var.enable_prometheus_node_exporter - enable_kube_state_metrics = var.enable_kube_state_metrics - monitoring_helm_values = var.monitoring_helm_values - logger_helm_values = var.logger_helm_values } locals { @@ -81,14 +73,14 @@ locals { provider "helm" { kubernetes { host = module.validator.aws_eks_cluster.endpoint - cluster_ca_certificate = base64decode(module.validator.aws_eks_cluster.certificate_authority.0.data) + cluster_ca_certificate = base64decode(module.validator.aws_eks_cluster.certificate_authority[0].data) token = module.validator.aws_eks_cluster_auth_token } } provider "kubernetes" { host = module.validator.aws_eks_cluster.endpoint - cluster_ca_certificate = base64decode(module.validator.aws_eks_cluster.certificate_authority.0.data) + cluster_ca_certificate = base64decode(module.validator.aws_eks_cluster.certificate_authority[0].data) token = module.validator.aws_eks_cluster_auth_token } @@ -96,8 +88,8 @@ locals { genesis_helm_chart_path = "${path.module}/../../helm/genesis" } - resource "helm_release" "genesis" { + count = var.enable_genesis ? 
1 : 0 name = "genesis" chart = local.genesis_helm_chart_path max_history = 5 diff --git a/terraform/aptos-node-testnet/aws/variables.tf b/terraform/aptos-node-testnet/aws/variables.tf index d1257333dca9a..9ce02d893d623 100644 --- a/terraform/aptos-node-testnet/aws/variables.tf +++ b/terraform/aptos-node-testnet/aws/variables.tf @@ -2,20 +2,24 @@ variable "region" { description = "AWS region" + type = string } variable "maximize_single_az_capacity" { description = "TEST ONLY: Whether to maximize the capacity of the cluster by allocating a large CIDR block to the first AZ" + type = bool default = false } variable "zone_id" { description = "Route53 Zone ID to create records in" + type = string default = "" } variable "workspace_name_override" { description = "If specified, overrides the usage of Terraform workspace for naming purposes" + type = string default = "" } @@ -27,17 +31,19 @@ variable "tls_sans" { variable "workspace_dns" { description = "Include Terraform workspace name in DNS records" + type = bool default = true } variable "iam_path" { - default = "/" description = "Path to use when naming IAM objects" + type = string + default = "/" } variable "permissions_boundary_policy" { - default = "" description = "ARN of IAM policy to set as permissions boundary on created roles" + type = string } variable "admin_sources_ipv4" { @@ -68,26 +74,31 @@ variable "k8s_admins" { variable "chain_id" { description = "Aptos chain ID. If var.enable_forge set, defaults to 4" + type = number default = 4 } variable "era" { description = "Chain era, used to start a clean chain" + type = number default = 15 } variable "chain_name" { description = "Aptos chain name. 
If unset, defaults to using the workspace name" + type = string default = "" } variable "image_tag" { description = "Docker image tag for all Aptos workloads, including validators, fullnodes, backup, restore, genesis, and other tooling" + type = string default = "devnet" } variable "validator_image_tag" { description = "Docker image tag for validators and fullnodes. If set, overrides var.image_tag for those nodes" + type = string default = "" } @@ -105,31 +116,10 @@ variable "genesis_helm_values" { default = {} } -variable "logger_helm_values" { - description = "Map of values to pass to logger helm chart" - type = any - default = {} -} - -variable "enable_monitoring" { - description = "Enable monitoring helm chart" - default = false -} - -variable "monitoring_helm_values" { - description = "Map of values to pass to monitoring helm chart" - type = any - default = {} -} - -variable "enable_prometheus_node_exporter" { - description = "Enable prometheus-node-exporter within monitoring helm chart" - default = false -} - -variable "enable_kube_state_metrics" { - description = "Enable kube-state-metrics within monitoring helm chart" - default = false +variable "enable_genesis" { + description = "Perform genesis automatically" + type = bool + default = true } variable "testnet_addons_helm_values" { @@ -138,68 +128,67 @@ variable "testnet_addons_helm_values" { default = {} } -variable "enable_node_health_checker" { - description = "Enable node-health-checker" - default = false -} - -variable "node_health_checker_helm_values" { - description = "Map of values to pass to node-health-checker helm chart" - type = any - default = {} -} - ### EKS nodegroups variable "num_validators" { description = "The number of validator nodes to create" + type = number default = 4 } variable "num_fullnode_groups" { description = "The number of fullnode groups to create" + type = number default = 1 } variable "num_utility_instance" { description = "Number of instances for utilities node pool, 
when it's 0, it will be set to var.num_validators" + type = number default = 0 } variable "num_validator_instance" { description = "Number of instances for validator node pool, when it's 0, it will be set to 2 * var.num_validators" + type = number default = 0 } variable "utility_instance_max_num" { description = "Maximum number of instances for utilities. If left 0, defaults to 2 * var.num_validators" + type = number default = 0 } variable "validator_instance_max_num" { description = "Maximum number of instances for utilities. If left 0, defaults to 2 * var.num_validators" + type = number default = 0 } variable "utility_instance_type" { description = "Instance type used for utilities" + type = string default = "t3.2xlarge" } variable "validator_instance_type" { description = "Instance type used for validator and fullnodes" - default = "c6i.4xlarge" + type = string + default = "c6i.8xlarge" } ### Forge variable "enable_forge" { description = "Enable Forge test framework, also creating an internal helm repo" + type = bool default = false } variable "forge_config_s3_bucket" { description = "S3 bucket in which Forge config is stored" + type = string default = "forge-wrapper-config" } @@ -211,6 +200,7 @@ variable "forge_helm_values" { variable "validator_storage_class" { description = "Which storage class to use for the validator and fullnode" + type = string default = "io1" validation { condition = contains(["gp3", "io1", "io2"], var.validator_storage_class) @@ -220,6 +210,7 @@ variable "validator_storage_class" { variable "fullnode_storage_class" { description = "Which storage class to use for the validator and fullnode" + type = string default = "io1" validation { condition = contains(["gp3", "io1", "io2"], var.fullnode_storage_class) @@ -229,5 +220,6 @@ variable "fullnode_storage_class" { variable "manage_via_tf" { description = "Whether to manage the aptos-node k8s workload via Terraform. 
If set to false, the helm_release resource will still be created and updated when values change, but it may not be updated on every apply" + type = bool default = true } diff --git a/terraform/aptos-node-testnet/gcp/.terraform.lock.hcl b/terraform/aptos-node-testnet/gcp/.terraform.lock.hcl new file mode 100644 index 0000000000000..c3e2a86fa13e3 --- /dev/null +++ b/terraform/aptos-node-testnet/gcp/.terraform.lock.hcl @@ -0,0 +1,156 @@ +# This file is maintained automatically by "terraform init". +# Manual edits may be lost in future updates. + +provider "registry.terraform.io/hashicorp/google" { + version = "4.54.0" + constraints = "~> 4.54.0" + hashes = [ + "h1:1cWQdF2IRqCUMG3wGygzqDIPq6SYtaubbQ+vXGRw25k=", + "zh:31f8f881703ba166b3ce4579dcba4b4c3192360642bd34ff633815d635c39003", + "zh:40ac861876276001c6483c52232072a4640491c36ebfba0e1e4baa5873e8183f", + "zh:4a1dfb601e7426e2aee1cd9cbab721638a330186398b587400a275981adf9e43", + "zh:71ef5b767fe25f4f03535fe006986cd9829a68788185914098dfe9d7cdb8f0de", + "zh:92ce2d5b8cbf2b0641f9c954959cfd8e2975f3912642b14a89dc7376c8edc8b9", + "zh:9c817bbe912e986f62f036fac18c25de8b225c065343f8c39362feffb25f9b37", + "zh:a21b8cfa15a56a7c332769d14a9fd1b321393cba1547f3155ff70aa7cb0bf0b2", + "zh:b42e883e3272c3aeba2cdc53d07a2058321e8e68d68238d08a73a804274e29d0", + "zh:bc25f7f9a1b8fee60a853c87f3762c5860598daf0a0a3c3e67563632f67b1c45", + "zh:bfd60ab7cf42380dc7dab87e50c62f6ad5c1af8d55d854a890a3f6dfb778aba5", + "zh:c79ad29ebff06da641c8d67b2104b72749df56f919d48bd1ca6ce31057d86b9b", + "zh:f569b65999264a9416862bca5cd2a6177d94ccb0424f3a4ef424428912b9cb3c", + ] +} + +provider "registry.terraform.io/hashicorp/google-beta" { + version = "4.54.0" + constraints = "~> 4.54.0" + hashes = [ + "h1:W5uwH+CNWJijGTCQtMfo5wqQyiMMmcxDPaeptIdzUsI=", + "zh:0ae198224a7d116f4f9e7a38b2b6cb2e281050929e22d38a98060eecf2eeccf8", + "zh:2962d48ac3789c32ef2b8e61f9c14335398c64603af98a308d21512008728dfc", + "zh:5f40b3655c71b081a67f06fb9ecc08174946d80e6411a90d4cce6265802515f8", + 
"zh:72c111e0c49b3d7091155f719c224209bbb7a011b215382eeaf9506c8067b60e", + "zh:73d86b00181803e9c30bfd92611a8475900acb31b5abcd02e7d36bdcb0de9a35", + "zh:7c4af6ca624e7454db679e3adf32b1f8288babc95e05a1220388e3cb53e16ab8", + "zh:7fe723ce93803fe5596d706f52be46094273ee07e858f3808f1bafdab65bb6ed", + "zh:bbec2859a5dc9f736e2d75a486d9a0200a6e5a6f712bd4595e24b8d8e9a4f19b", + "zh:cc90a6021a55df0d836f5b22014a76c8dd7b55dfc5bdf60c7ce3264c1a07e3ff", + "zh:eaa842ab7c8aba3fe104c975d0bc0dc4412098814cd1a374136379b6d107eafe", + "zh:f0457715d8f12d42d4ed128ea3e5b74e2a1f7a60fa5ff8a9971dfff31ebf2f27", + "zh:f569b65999264a9416862bca5cd2a6177d94ccb0424f3a4ef424428912b9cb3c", + ] +} + +provider "registry.terraform.io/hashicorp/helm" { + version = "2.11.0" + hashes = [ + "h1:AOp9vXIM4uT1c/PVwsWTPiLVGlO2SSYrfiirV5rjCMQ=", + "zh:013857c88f3e19a4b162344e21dc51891c4ac8b600da8391f7fb2b6d234961e1", + "zh:044fffa233a93cdcf8384afbe9e1ab6c9d0b5b176cbae56ff465eb9611302975", + "zh:208b7cdd4fa3a1b25ae817dc00a9198ef98be0ddc3a577b5b72bc0f006afb997", + "zh:3e8b33f56cfe387277572a92037a1ca1cbe4e3aa6b5c19a8c2431193b07f7865", + "zh:7dd663d5619bd71676899b05b19d36f585189fdabc6b0b03c23579524a8fd9bf", + "zh:ae5329cb3e5bf0b86b02e823aac3ef3bd0d4b1618ff013cd0076dca0be8322e4", + "zh:ba6201695b55d51bedacdb017cb8d03d7a8ada51d0168ac44fef3fa791a85ab4", + "zh:c61285c8b1ba10f50cf94c9dcf98f2f3b720f14906a18be71b9b422279b5d806", + "zh:d522d388246f38b9f329c511ec579b516d212670b954f9dab64efb27e51862af", + "zh:f569b65999264a9416862bca5cd2a6177d94ccb0424f3a4ef424428912b9cb3c", + "zh:f92546e26b670da61437ae2cbd038427c9374ce5f7a78df52193397da90bd997", + "zh:f9ad1407e5c0d5e3474094491025bf100828e8c1a01acdf9591d7dd1eb59f961", + ] +} + +provider "registry.terraform.io/hashicorp/kubernetes" { + version = "2.23.0" + hashes = [ + "h1:arTzD0XG/DswGCAx9JEttkSKe9RyyFW9W7UWcXF13dU=", + "zh:10488a12525ed674359585f83e3ee5e74818b5c98e033798351678b21b2f7d89", + "zh:1102ba5ca1a595f880e67102bbf999cc8b60203272a078a5b1e896d173f3f34b", + 
"zh:1347cf958ed3f3f80b3c7b3e23ddda3d6c6573a81847a8ee92b7df231c238bf6", + "zh:2cb18e9f5156bc1b1ee6bc580a709f7c2737d142722948f4a6c3c8efe757fa8d", + "zh:5506aa6f28dcca2a265ccf8e34478b5ec2cb43b867fe6d93b0158f01590fdadd", + "zh:6217a20686b631b1dcb448ee4bc795747ebc61b56fbe97a1ad51f375ebb0d996", + "zh:8accf916c00579c22806cb771e8909b349ffb7eb29d9c5468d0a3f3166c7a84a", + "zh:9379b0b54a0fa030b19c7b9356708ec8489e194c3b5e978df2d31368563308e5", + "zh:aa99c580890691036c2931841e88e7ee80d59ae52289c8c2c28ea0ac23e31520", + "zh:c57376d169875990ac68664d227fb69cd0037b92d0eba6921d757c3fd1879080", + "zh:e6068e3f94f6943b5586557b73f109debe19d1a75ca9273a681d22d1ce066579", + "zh:f569b65999264a9416862bca5cd2a6177d94ccb0424f3a4ef424428912b9cb3c", + ] +} + +provider "registry.terraform.io/hashicorp/local" { + version = "2.4.0" + hashes = [ + "h1:ZUEYUmm2t4vxwzxy1BvN1wL6SDWrDxfH7pxtzX8c6d0=", + "zh:53604cd29cb92538668fe09565c739358dc53ca56f9f11312b9d7de81e48fab9", + "zh:66a46e9c508716a1c98efbf793092f03d50049fa4a83cd6b2251e9a06aca2acf", + "zh:70a6f6a852dd83768d0778ce9817d81d4b3f073fab8fa570bff92dcb0824f732", + "zh:78d5eefdd9e494defcb3c68d282b8f96630502cac21d1ea161f53cfe9bb483b3", + "zh:82a803f2f484c8b766e2e9c32343e9c89b91997b9f8d2697f9f3837f62926b35", + "zh:9708a4e40d6cc4b8afd1352e5186e6e1502f6ae599867c120967aebe9d90ed04", + "zh:973f65ce0d67c585f4ec250c1e634c9b22d9c4288b484ee2a871d7fa1e317406", + "zh:c8fa0f98f9316e4cfef082aa9b785ba16e36ff754d6aba8b456dab9500e671c6", + "zh:cfa5342a5f5188b20db246c73ac823918c189468e1382cb3c48a9c0c08fc5bf7", + "zh:e0e2b477c7e899c63b06b38cd8684a893d834d6d0b5e9b033cedc06dd7ffe9e2", + "zh:f62d7d05ea1ee566f732505200ab38d94315a4add27947a60afa29860822d3fc", + "zh:fa7ce69dde358e172bd719014ad637634bbdabc49363104f4fca759b4b73f2ce", + ] +} + +provider "registry.terraform.io/hashicorp/random" { + version = "3.5.1" + hashes = [ + "h1:IL9mSatmwov+e0+++YX2V6uel+dV6bn+fC/cnGDK3Ck=", + "zh:04e3fbd610cb52c1017d282531364b9c53ef72b6bc533acb2a90671957324a64", + 
"zh:119197103301ebaf7efb91df8f0b6e0dd31e6ff943d231af35ee1831c599188d", + "zh:4d2b219d09abf3b1bb4df93d399ed156cadd61f44ad3baf5cf2954df2fba0831", + "zh:6130bdde527587bbe2dcaa7150363e96dbc5250ea20154176d82bc69df5d4ce3", + "zh:6cc326cd4000f724d3086ee05587e7710f032f94fc9af35e96a386a1c6f2214f", + "zh:78d5eefdd9e494defcb3c68d282b8f96630502cac21d1ea161f53cfe9bb483b3", + "zh:b6d88e1d28cf2dfa24e9fdcc3efc77adcdc1c3c3b5c7ce503a423efbdd6de57b", + "zh:ba74c592622ecbcef9dc2a4d81ed321c4e44cddf7da799faa324da9bf52a22b2", + "zh:c7c5cde98fe4ef1143bd1b3ec5dc04baf0d4cc3ca2c5c7d40d17c0e9b2076865", + "zh:dac4bad52c940cd0dfc27893507c1e92393846b024c5a9db159a93c534a3da03", + "zh:de8febe2a2acd9ac454b844a4106ed295ae9520ef54dc8ed2faf29f12716b602", + "zh:eab0d0495e7e711cca367f7d4df6e322e6c562fc52151ec931176115b83ed014", + ] +} + +provider "registry.terraform.io/hashicorp/time" { + version = "0.9.1" + hashes = [ + "h1:VxyoYYOCaJGDmLz4TruZQTSfQhvwEcMxvcKclWdnpbs=", + "zh:00a1476ecf18c735cc08e27bfa835c33f8ac8fa6fa746b01cd3bcbad8ca84f7f", + "zh:3007f8fc4a4f8614c43e8ef1d4b0c773a5de1dcac50e701d8abc9fdc8fcb6bf5", + "zh:5f79d0730fdec8cb148b277de3f00485eff3e9cf1ff47fb715b1c969e5bbd9d4", + "zh:78d5eefdd9e494defcb3c68d282b8f96630502cac21d1ea161f53cfe9bb483b3", + "zh:8c8094689a2bed4bb597d24a418bbbf846e15507f08be447d0a5acea67c2265a", + "zh:a6d9206e95d5681229429b406bc7a9ba4b2d9b67470bda7df88fa161508ace57", + "zh:aa299ec058f23ebe68976c7581017de50da6204883950de228ed9246f309e7f1", + "zh:b129f00f45fba1991db0aa954a6ba48d90f64a738629119bfb8e9a844b66e80b", + "zh:ef6cecf5f50cda971c1b215847938ced4cb4a30a18095509c068643b14030b00", + "zh:f1f46a4f6c65886d2dd27b66d92632232adc64f92145bf8403fe64d5ffa5caea", + "zh:f79d6155cda7d559c60d74883a24879a01c4d5f6fd7e8d1e3250f3cd215fb904", + "zh:fd59fa73074805c3575f08cd627eef7acda14ab6dac2c135a66e7a38d262201c", + ] +} + +provider "registry.terraform.io/hashicorp/tls" { + version = "4.0.4" + hashes = [ + "h1:GZcFizg5ZT2VrpwvxGBHQ/hO9r6g0vYdQqx3bFD3anY=", + 
"zh:23671ed83e1fcf79745534841e10291bbf34046b27d6e68a5d0aab77206f4a55", + "zh:45292421211ffd9e8e3eb3655677700e3c5047f71d8f7650d2ce30242335f848", + "zh:59fedb519f4433c0fdb1d58b27c210b27415fddd0cd73c5312530b4309c088be", + "zh:5a8eec2409a9ff7cd0758a9d818c74bcba92a240e6c5e54b99df68fff312bbd5", + "zh:5e6a4b39f3171f53292ab88058a59e64825f2b842760a4869e64dc1dc093d1fe", + "zh:810547d0bf9311d21c81cc306126d3547e7bd3f194fc295836acf164b9f8424e", + "zh:824a5f3617624243bed0259d7dd37d76017097dc3193dac669be342b90b2ab48", + "zh:9361ccc7048be5dcbc2fafe2d8216939765b3160bd52734f7a9fd917a39ecbd8", + "zh:aa02ea625aaf672e649296bce7580f62d724268189fe9ad7c1b36bb0fa12fa60", + "zh:c71b4cd40d6ec7815dfeefd57d88bc592c0c42f5e5858dcc88245d371b4b8b1e", + "zh:dabcd52f36b43d250a3d71ad7abfa07b5622c69068d989e60b79b2bb4f220316", + "zh:f569b65999264a9416862bca5cd2a6177d94ccb0424f3a4ef424428912b9cb3c", + ] +} diff --git a/terraform/aptos-node-testnet/gcp/addons.tf b/terraform/aptos-node-testnet/gcp/addons.tf index 01dee100f4b5e..d09ca63336cf0 100644 --- a/terraform/aptos-node-testnet/gcp/addons.tf +++ b/terraform/aptos-node-testnet/gcp/addons.tf @@ -4,6 +4,7 @@ locals { } resource "kubernetes_namespace" "chaos-mesh" { + count = var.enable_forge ? 1 : 0 metadata { annotations = { name = "chaos-mesh" @@ -14,8 +15,9 @@ resource "kubernetes_namespace" "chaos-mesh" { } resource "helm_release" "chaos-mesh" { + count = var.enable_forge ? 
1 : 0 name = "chaos-mesh" - namespace = kubernetes_namespace.chaos-mesh.metadata[0].name + namespace = kubernetes_namespace.chaos-mesh[0].metadata[0].name chart = local.chaos_mesh_helm_chart_path max_history = 5 @@ -24,29 +26,14 @@ resource "helm_release" "chaos-mesh" { values = [ jsonencode({ chaos-mesh = { + images = { + registry = "us-docker.pkg.dev/aptos-registry/docker/ghcr.io" + tag = "aptos-patch" // Same as the patched chart in helm/chaos + }, chaosDaemon = { runtime = "containerd" socketPath = "/run/containerd/containerd.sock" - image = { - repository = "aptos-internal/chaos-daemon" - tag = "latest" - } - }, - controllerManager = { - image = { - repository = "aptos-internal/chaos-mesh" - tag = "latest" - } }, - dashboard = { - image = { - repository = "aptos-internal/chaos-dashboard" - tag = "latest" - } - } - images = { - registry = "us-west1-docker.pkg.dev/aptos-global" - } } }) ] @@ -153,13 +140,7 @@ resource "helm_release" "testnet-addons" { } ingress = { gce_static_ip = "aptos-${local.workspace_name}-testnet-addons-ingress" - gce_managed_certificate = "aptos-${local.workspace_name}-${var.zone_name}-testnet-addons" - } - load_test = { - fullnodeGroups = try(var.aptos_node_helm_values.fullnode.groups, []) - config = { - numFullnodeGroups = var.num_fullnode_groups - } + gce_managed_certificate = var.create_google_managed_ssl_certificate ? 
"aptos-${local.workspace_name}-${var.zone_name}-testnet-addons" : null } }), jsonencode(var.testnet_addons_helm_values) diff --git a/terraform/aptos-node-testnet/gcp/main.tf b/terraform/aptos-node-testnet/gcp/main.tf index d9b7193b0457c..4386d7d4c1756 100644 --- a/terraform/aptos-node-testnet/gcp/main.tf +++ b/terraform/aptos-node-testnet/gcp/main.tf @@ -17,13 +17,13 @@ provider "helm" { module "validator" { source = "../../aptos-node/gcp" - cluster_bootstrap = var.cluster_bootstrap - manage_via_tf = var.manage_via_tf + manage_via_tf = var.manage_via_tf # Project config - project = var.project - zone = var.zone - region = var.region + project = var.project + zone = var.zone + region = var.region + node_locations = var.node_locations # DNS zone_name = var.zone_name # keep empty if you don't want a DNS name @@ -51,7 +51,7 @@ module "validator" { gke_enable_node_autoprovisioning = var.gke_enable_node_autoprovisioning gke_node_autoprovisioning_max_cpu = var.gke_node_autoprovisioning_max_cpu gke_node_autoprovisioning_max_memory = var.gke_node_autoprovisioning_max_memory - gke_enable_autoscaling = var.gke_enable_autoscaling + gke_autoscaling_profile = var.gke_autoscaling_profile gke_autoscaling_max_node_count = var.gke_autoscaling_max_node_count # Testnet config @@ -63,14 +63,18 @@ module "validator" { num_fullnode_groups = var.num_fullnode_groups # Instance config - utility_instance_type = var.utility_instance_type - validator_instance_type = var.validator_instance_type - - # addons - enable_monitoring = var.enable_monitoring - enable_node_exporter = var.enable_prometheus_node_exporter - monitoring_helm_values = var.monitoring_helm_values - + default_disk_size_gb = var.default_disk_size_gb + default_disk_type = var.default_disk_type + create_nodepools = var.create_nodepools + nodepool_sysctls = var.nodepool_sysctls + core_instance_type = var.core_instance_type + utility_instance_type = var.utility_instance_type + validator_instance_type = var.validator_instance_type + 
utility_instance_enable_taint = var.utility_instance_enable_taint + validator_instance_enable_taint = var.validator_instance_enable_taint + + enable_clouddns = var.enable_clouddns + enable_image_streaming = var.enable_image_streaming gke_maintenance_policy = var.gke_maintenance_policy } @@ -86,8 +90,8 @@ locals { aptos_node_helm_prefix = var.enable_forge ? "aptos-node" : "${module.validator.helm_release_name}-aptos-node" } - resource "helm_release" "genesis" { + count = var.enable_genesis ? 1 : 0 name = "genesis" chart = local.genesis_helm_chart_path max_history = 5 diff --git a/terraform/aptos-node-testnet/gcp/security.tf b/terraform/aptos-node-testnet/gcp/security.tf index d492bdc963f57..42734bb6771b6 100644 --- a/terraform/aptos-node-testnet/gcp/security.tf +++ b/terraform/aptos-node-testnet/gcp/security.tf @@ -17,6 +17,7 @@ locals { } resource "kubernetes_labels" "pss-chaos-mesh" { + count = var.enable_forge ? 1 : 0 api_version = "v1" kind = "Namespace" metadata { diff --git a/terraform/aptos-node-testnet/gcp/variables.tf b/terraform/aptos-node-testnet/gcp/variables.tf index f76ea1231f4c5..b56355fae99b7 100644 --- a/terraform/aptos-node-testnet/gcp/variables.tf +++ b/terraform/aptos-node-testnet/gcp/variables.tf @@ -1,11 +1,5 @@ ### Project config -variable "cluster_bootstrap" { - description = "Set when bootstrapping a new cluster" - type = bool - default = false -} - variable "project" { description = "GCP project" type = string @@ -19,10 +13,18 @@ variable "region" { variable "zone" { description = "GCP zone suffix" type = string + default = "" +} + +variable "node_locations" { + description = "List of node locations" + type = list(string) + default = [] # if empty, let GCP choose } variable "manage_via_tf" { description = "Whether to manage the aptos-node k8s workload via Terraform. 
If set to false, the helm_release resource will still be created and updated when values change, but it may not be updated on every apply" + type = bool default = true } @@ -30,21 +32,25 @@ variable "manage_via_tf" { variable "era" { description = "Chain era, used to start a clean chain" + type = number default = 1 } variable "chain_id" { description = "Aptos chain ID" + type = string default = "TESTING" } variable "chain_name" { description = "Aptos chain name" + type = string default = "testnet" } variable "image_tag" { description = "Docker image tag for Aptos node" + type = string default = "devnet" } @@ -52,36 +58,49 @@ variable "image_tag" { variable "workspace_dns" { description = "Include Terraform workspace name in DNS records" + type = bool default = true } variable "dns_prefix_name" { description = "DNS prefix for fullnode url" + type = string default = "fullnode" } variable "zone_name" { description = "Zone name of GCP Cloud DNS zone to create records in" + type = string default = "" } variable "zone_project" { description = "GCP project which the DNS zone is in (if different)" + type = string default = "" } +variable "create_google_managed_ssl_certificate" { + description = "Whether to create a Google Managed SSL Certificate for the GCE Ingress" + type = bool + default = false +} + variable "record_name" { description = "DNS record name to use ( is replaced with the TF workspace name)" + type = string default = ".aptos" } variable "create_dns_records" { description = "Creates DNS records in var.zone_name that point to k8s service, as opposed to using external-dns or other means" + type = bool default = true } variable "dns_ttl" { description = "Time-to-Live for the Validator and Fullnode DNS records" + type = number default = 300 } @@ -89,11 +108,13 @@ variable "dns_ttl" { variable "workspace_name_override" { description = "If specified, overrides the usage of Terraform workspace for naming purposes" + type = string default = "" } variable 
"helm_release_name_override" { description = "If set, overrides the name of the aptos-node helm chart" + type = string default = "" } @@ -117,11 +138,13 @@ variable "forge_helm_values" { variable "num_validators" { description = "The number of validator nodes to create" + type = number default = 1 } variable "num_fullnode_groups" { description = "The number of fullnode groups to create" + type = number default = 1 } @@ -130,84 +153,136 @@ variable "num_fullnode_groups" { variable "k8s_api_sources" { description = "List of CIDR subnets which can access the Kubernetes API endpoint" + type = list(string) default = ["0.0.0.0/0"] } -### Instance config - -variable "utility_instance_type" { - description = "Instance type used for utilities" - default = "n2-standard-8" -} - -variable "validator_instance_type" { - description = "Instance type used for validator and fullnodes" - default = "n2-standard-32" -} - ### Addons variable "enable_forge" { description = "Enable Forge" + type = bool default = false } -variable "enable_monitoring" { - description = "Enable monitoring helm chart" - default = false +variable "enable_genesis" { + description = "Perform genesis automatically" + type = bool + default = true } -variable "monitoring_helm_values" { - description = "Map of values to pass to monitoring Helm" +variable "testnet_addons_helm_values" { + description = "Map of values to pass to testnet-addons helm chart" type = any default = {} } -variable "enable_prometheus_node_exporter" { - description = "Enable prometheus-node-exporter within monitoring helm chart" - default = false +### Node pools and Autoscaling + +variable "default_disk_size_gb" { + description = "Default disk size for nodes" + type = number + default = 200 } -variable "testnet_addons_helm_values" { - description = "Map of values to pass to testnet-addons helm chart" - type = any +variable "default_disk_type" { + description = "Default disk type for nodes" + type = string + default = "pd-standard" +} + 
+variable "create_nodepools" { + description = "Create managed nodepools" + type = bool + default = true +} + +variable "nodepool_sysctls" { + description = "Sysctls to set on nodepools" + type = map(string) default = {} } -### Autoscaling +variable "core_instance_type" { + description = "Instance type used for core pods" + type = string + default = "e2-medium" +} + +variable "utility_instance_type" { + description = "Instance type used for utility pods" + type = string + default = "e2-standard-8" +} + +variable "validator_instance_type" { + description = "Instance type used for validator and fullnodes" + type = string + default = "t2d-standard-16" +} + +variable "utility_instance_enable_taint" { + description = "Whether to taint instances in the utilities nodegroup" + type = bool + default = true +} + +variable "validator_instance_enable_taint" { + description = "Whether to taint instances in the validator nodegroup" + type = bool + default = true +} variable "gke_enable_node_autoprovisioning" { - description = "Enable node autoprovisioning for GKE cluster. See https://cloud.google.com/kubernetes-engine/docs/how-to/node-auto-provisioning" - default = false + description = "Enable GKE node autoprovisioning" + type = bool + default = true } variable "gke_node_autoprovisioning_max_cpu" { - description = "Maximum CPU utilization for GKE node_autoprovisioning" - default = 10 + description = "Maximum CPU allocation for GKE node_autoprovisioning" + type = number + default = 500 } variable "gke_node_autoprovisioning_max_memory" { - description = "Maximum memory utilization for GKE node_autoprovisioning" - default = 100 + description = "Maximum memory allocation for GKE node_autoprovisioning" + type = number + default = 2000 } -variable "gke_enable_autoscaling" { - description = "Enable autoscaling for the nodepools in the GKE cluster. 
See https://cloud.google.com/kubernetes-engine/docs/concepts/cluster-autoscaler" - default = true +variable "gke_autoscaling_profile" { + description = "Autoscaling profile for GKE cluster. See https://cloud.google.com/kubernetes-engine/docs/concepts/cluster-autoscaler#autoscaling_profiles" + type = string + default = "OPTIMIZE_UTILIZATION" } variable "gke_autoscaling_max_node_count" { description = "Maximum number of nodes for GKE nodepool autoscaling" - default = 10 + type = number + default = 250 } ### GKE cluster config variable "cluster_ipv4_cidr_block" { description = "The IP address range of the container pods in this cluster, in CIDR notation. See https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/container_cluster#cluster_ipv4_cidr_block" + type = string default = "" } +variable "enable_clouddns" { + description = "Enable CloudDNS (Google-managed cluster DNS)" + type = bool + default = false +} + +variable "enable_image_streaming" { + description = "Enable image streaming (GCFS)" + type = bool + default = false +} + variable "gke_maintenance_policy" { description = "The maintenance policy to use for the cluster. 
See https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/container_cluster#maintenance_policy" type = object({ diff --git a/terraform/aptos-node/aws/cluster.tf b/terraform/aptos-node/aws/cluster.tf index b0103425c0afe..b5a99a185bb5a 100644 --- a/terraform/aptos-node/aws/cluster.tf +++ b/terraform/aptos-node/aws/cluster.tf @@ -12,7 +12,7 @@ resource "aws_eks_cluster" "aptos" { tags = local.default_tags vpc_config { - subnet_ids = concat(aws_subnet.public.*.id, aws_subnet.private.*.id) + subnet_ids = concat(aws_subnet.public[*].id, aws_subnet.private[*].id) public_access_cidrs = var.k8s_api_sources endpoint_private_access = true security_group_ids = [aws_security_group.cluster.id] diff --git a/terraform/aptos-node/aws/kubernetes.tf b/terraform/aptos-node/aws/kubernetes.tf index 92b8219c48c7c..358b4e04de2e4 100644 --- a/terraform/aptos-node/aws/kubernetes.tf +++ b/terraform/aptos-node/aws/kubernetes.tf @@ -1,13 +1,13 @@ provider "kubernetes" { host = aws_eks_cluster.aptos.endpoint - cluster_ca_certificate = base64decode(aws_eks_cluster.aptos.certificate_authority.0.data) + cluster_ca_certificate = base64decode(aws_eks_cluster.aptos.certificate_authority[0].data) token = data.aws_eks_cluster_auth.aptos.token } provider "helm" { kubernetes { host = aws_eks_cluster.aptos.endpoint - cluster_ca_certificate = base64decode(aws_eks_cluster.aptos.certificate_authority.0.data) + cluster_ca_certificate = base64decode(aws_eks_cluster.aptos.certificate_authority[0].data) token = data.aws_eks_cluster_auth.aptos.token } } @@ -16,8 +16,6 @@ locals { kubeconfig = "/tmp/kube.config.${md5(timestamp())}" # helm chart paths - monitoring_helm_chart_path = "${path.module}/../../helm/monitoring" - logger_helm_chart_path = "${path.module}/../../helm/logger" aptos_node_helm_chart_path = var.helm_chart != "" ? 
var.helm_chart : "${path.module}/../../helm/aptos-node" } @@ -43,7 +41,7 @@ resource "kubernetes_storage_class" "gp3" { type = "gp3" } - depends_on = [null_resource.delete-gp2, aws_eks_addon.aws-ebs-csi-driver] + depends_on = [null_resource.delete-gp2] } resource "kubernetes_storage_class" "io1" { @@ -70,25 +68,6 @@ resource "kubernetes_storage_class" "io2" { } } -resource "kubernetes_namespace" "tigera-operator" { - metadata { - annotations = { - name = "tigera-operator" - } - - name = "tigera-operator" - } -} - -resource "helm_release" "calico" { - count = var.enable_calico ? 1 : 0 - name = "calico" - repository = "https://docs.tigera.io/calico/charts" - chart = "tigera-operator" - version = "3.26.0" - namespace = "tigera-operator" -} - locals { helm_values = jsonencode({ numValidators = var.num_validators @@ -113,7 +92,6 @@ locals { value = "validators" effect = "NoExecute" }] - remoteLogAddress = var.enable_logger ? "${helm_release.logger[0].name}-aptos-logger.${helm_release.logger[0].namespace}.svc:5044" : null } fullnode = { storage = { @@ -171,80 +149,6 @@ resource "helm_release" "validator" { } } -resource "helm_release" "logger" { - count = var.enable_logger ? 1 : 0 - name = "${local.helm_release_name}-log" - chart = local.logger_helm_chart_path - max_history = 5 - wait = false - - values = [ - jsonencode({ - logger = { - name = "aptos-logger" - } - chain = { - name = var.chain_name - } - serviceAccount = { - create = false - # this name must match the serviceaccount created by the aptos-node helm chart - name = local.helm_release_name == "aptos-node" ? "aptos-node-validator" : "${local.helm_release_name}-aptos-node-validator" - } - }), - jsonencode(var.logger_helm_values), - ] - - # inspired by https://stackoverflow.com/a/66501021 to trigger redeployment whenever any of the charts file contents change. 
- set { - name = "chart_sha1" - value = sha1(join("", [for f in fileset(local.logger_helm_chart_path, "**") : filesha1("${local.logger_helm_chart_path}/${f}")])) - } -} - - -resource "helm_release" "monitoring" { - count = var.enable_monitoring ? 1 : 0 - name = "${local.helm_release_name}-mon" - chart = local.monitoring_helm_chart_path - max_history = 5 - wait = false - - values = [ - jsonencode({ - chain = { - name = var.chain_name - } - validator = { - name = var.validator_name - } - service = { - domain = local.domain - } - monitoring = { - prometheus = { - storage = { - class = kubernetes_storage_class.gp3.metadata[0].name - } - } - } - kube-state-metrics = { - enabled = var.enable_kube_state_metrics - } - prometheus-node-exporter = { - enabled = var.enable_prometheus_node_exporter - } - }), - jsonencode(var.monitoring_helm_values), - ] - - # inspired by https://stackoverflow.com/a/66501021 to trigger redeployment whenever any of the charts file contents change. - set { - name = "chart_sha1" - value = sha1(join("", [for f in fileset(local.monitoring_helm_chart_path, "**") : filesha1("${local.monitoring_helm_chart_path}/${f}")])) - } -} - resource "kubernetes_cluster_role" "debug" { metadata { name = "debug" diff --git a/terraform/aptos-node/aws/network.tf b/terraform/aptos-node/aws/network.tf index e1f28d501c683..d690356631c74 100644 --- a/terraform/aptos-node/aws/network.tf +++ b/terraform/aptos-node/aws/network.tf @@ -68,7 +68,7 @@ resource "aws_route_table" "public" { resource "aws_route_table_association" "public" { count = local.num_azs - subnet_id = element(aws_subnet.public.*.id, count.index) + subnet_id = element(aws_subnet.public[*].id, count.index) route_table_id = aws_route_table.public.id } @@ -114,7 +114,7 @@ resource "aws_route_table" "private" { resource "aws_route_table_association" "private" { count = local.num_azs - subnet_id = element(aws_subnet.private.*.id, count.index) + subnet_id = element(aws_subnet.private[*].id, count.index) 
route_table_id = aws_route_table.private.id } diff --git a/terraform/aptos-node/aws/security.tf b/terraform/aptos-node/aws/security.tf index f037dec881ad3..4addace8ff219 100644 --- a/terraform/aptos-node/aws/security.tf +++ b/terraform/aptos-node/aws/security.tf @@ -1,9 +1,6 @@ # Security-related resources -data "kubernetes_all_namespaces" "all" {} - locals { - kubernetes_master_version = substr(aws_eks_cluster.aptos.version, 0, 4) baseline_pss_labels = { "pod-security.kubernetes.io/audit" = "baseline" "pod-security.kubernetes.io/warn" = "baseline" @@ -11,40 +8,6 @@ locals { } } -# FIXME: Remove when migrating to K8s 1.25 -resource "kubernetes_role_binding" "disable-psp" { - for_each = toset(local.kubernetes_master_version <= "1.24" ? data.kubernetes_all_namespaces.all.namespaces : []) - metadata { - name = "privileged-psp" - namespace = each.value - } - - role_ref { - api_group = "rbac.authorization.k8s.io" - kind = "ClusterRole" - name = "eks:podsecuritypolicy:privileged" - } - - subject { - api_group = "rbac.authorization.k8s.io" - kind = "Group" - name = "system:serviceaccounts:${each.value}" - } -} - -# FIXME: Remove when migrating to K8s 1.25 -resource "null_resource" "delete-psp-authenticated" { - count = local.kubernetes_master_version <= "1.24" ? 
1 : 0 - provisioner "local-exec" { - command = <<-EOT - aws --region ${var.region} eks update-kubeconfig --name ${aws_eks_cluster.aptos.name} --kubeconfig ${local.kubeconfig} && - kubectl --kubeconfig ${local.kubeconfig} delete --ignore-not-found clusterrolebinding eks:podsecuritypolicy:authenticated - EOT - } - - depends_on = [kubernetes_role_binding.disable-psp] -} - resource "kubernetes_labels" "pss-default" { api_version = "v1" kind = "Namespace" diff --git a/terraform/aptos-node/aws/variables.tf b/terraform/aptos-node/aws/variables.tf index 117f4df19404a..38b48580f1657 100644 --- a/terraform/aptos-node/aws/variables.tf +++ b/terraform/aptos-node/aws/variables.tf @@ -5,41 +5,49 @@ variable "region" { variable "num_azs" { description = "Number of availability zones" + type = number default = 3 } variable "kubernetes_version" { description = "Version of Kubernetes to use for EKS cluster" - default = "1.24" + type = string + default = "1.26" } variable "k8s_api_sources" { description = "List of CIDR subnets which can access the Kubernetes API endpoint" + type = list(string) default = ["0.0.0.0/0"] } variable "num_validators" { description = "The number of validator nodes to create" + type = number default = 1 } variable "num_fullnode_groups" { description = "The number of fullnode groups to create" + type = number default = 1 } variable "era" { description = "Chain era, used to start a clean chain" + type = number default = 1 } variable "chain_id" { description = "Aptos chain ID" + type = string default = "TESTING" } variable "chain_name" { description = "Aptos chain name" + type = string default = "testnet" } @@ -50,31 +58,37 @@ variable "validator_name" { variable "image_tag" { description = "Docker image tag for Aptos node" + type = string default = "devnet" } variable "zone_id" { description = "Zone ID of Route 53 domain to create records in" + type = string default = "" } variable "workspace_dns" { description = "Include Terraform workspace name in DNS 
records" + type = bool default = true } variable "record_name" { description = "DNS record name to use ( is replaced with the TF workspace name)" + type = string default = ".aptos" } variable "create_records" { description = "Creates DNS records in var.zone_id that point to k8s service, as opposed to using external-dns or other means" + type = bool default = true } variable "helm_chart" { description = "Path to aptos-validator Helm chart file" + type = string default = "" } @@ -86,6 +100,7 @@ variable "helm_values" { variable "helm_values_file" { description = "Path to file containing values for Helm chart" + type = string default = "" } @@ -126,130 +141,109 @@ variable "k8s_debugger_roles" { } variable "iam_path" { - default = "/" description = "Path to use when naming IAM objects" + type = string + default = "/" } variable "permissions_boundary_policy" { - default = "" description = "ARN of IAM policy to set as permissions boundary on created roles" + type = string } variable "vpc_cidr_block" { - default = "192.168.0.0/16" description = "VPC CIDR Block" + type = string + default = "192.168.0.0/16" } variable "maximize_single_az_capacity" { description = "Whether to maximize the capacity of the cluster by allocating more IPs to the first AZ" + type = bool default = false } variable "helm_enable_validator" { description = "Enable deployment of the validator Helm chart" + type = bool default = true } variable "utility_instance_type" { description = "Instance type used for utilities" + type = string default = "t3.2xlarge" } variable "utility_instance_num" { description = "Number of instances for utilities" + type = number default = 1 } variable "utility_instance_min_num" { description = "Minimum number of instances for utilities" + type = number default = 1 } variable "utility_instance_max_num" { description = "Maximum number of instances for utilities. 
If left 0, defaults to 2 * var.utility_instance_num" + type = number default = 0 } variable "utility_instance_enable_taint" { description = "Whether to taint the instances in the utility nodegroup" + type = bool default = false } variable "validator_instance_type" { description = "Instance type used for validator and fullnodes" + type = string default = "c6i.8xlarge" } variable "validator_instance_num" { description = "Number of instances used for validator and fullnodes" + type = number default = 2 } variable "validator_instance_min_num" { description = "Minimum number of instances for validators" + type = number default = 1 } variable "validator_instance_max_num" { description = "Maximum number of instances for utilities. If left 0, defaults to 2 * var.validator_instance_num" + type = number default = 0 } variable "validator_instance_enable_taint" { description = "Whether to taint instances in the validator nodegroup" + type = bool default = false } variable "workspace_name_override" { description = "If specified, overrides the usage of Terraform workspace for naming purposes" + type = string default = "" } -variable "enable_calico" { - description = "Enable Calico networking for NetworkPolicy" - default = true -} - -variable "enable_logger" { - description = "Enable logger helm chart" - default = false -} - -variable "logger_helm_values" { - description = "Map of values to pass to logger Helm" - type = any - default = {} -} - - -variable "enable_monitoring" { - description = "Enable monitoring helm chart" - default = false -} - -variable "monitoring_helm_values" { - description = "Map of values to pass to monitoring Helm" - type = any - default = {} -} - -variable "enable_prometheus_node_exporter" { - description = "Enable prometheus-node-exporter within monitoring helm chart" - default = false -} - -variable "enable_kube_state_metrics" { - description = "Enable kube-state-metrics within monitoring helm chart" - default = false -} - variable 
"helm_release_name_override" { description = "If set, overrides the name of the aptos-node helm chart" + type = string default = "" } variable "validator_storage_class" { description = "Which storage class to use for the validator and fullnode" + type = string default = "io1" validation { condition = contains(["gp3", "io1", "io2"], var.validator_storage_class) @@ -259,6 +253,7 @@ variable "validator_storage_class" { variable "fullnode_storage_class" { description = "Which storage class to use for the validator and fullnode" + type = string default = "io1" validation { condition = contains(["gp3", "io1", "io2"], var.fullnode_storage_class) @@ -268,5 +263,6 @@ variable "fullnode_storage_class" { variable "manage_via_tf" { description = "Whether to manage the aptos-node k8s workload via Terraform. If set to false, the helm_release resource will still be created and updated when values change, but it may not be updated on every apply" + type = bool default = true } diff --git a/terraform/aptos-node/aws/versions.tf b/terraform/aptos-node/aws/versions.tf index a2b7631af994b..1403cae46c48a 100644 --- a/terraform/aptos-node/aws/versions.tf +++ b/terraform/aptos-node/aws/versions.tf @@ -1,8 +1,9 @@ terraform { - required_version = "~> 1.3.6" + required_version = "~> 1.5.6" required_providers { aws = { - source = "hashicorp/aws" + source = "hashicorp/aws" + version = "~> 4.35.0" } helm = { source = "hashicorp/helm" diff --git a/terraform/aptos-node/azure/cluster.tf b/terraform/aptos-node/azure/cluster.tf index 48d156b68d8b0..69f23ecbc5d85 100644 --- a/terraform/aptos-node/azure/cluster.tf +++ b/terraform/aptos-node/azure/cluster.tf @@ -12,7 +12,6 @@ resource "azurerm_kubernetes_cluster" "aptos" { network_profile { network_plugin = "kubenet" - network_policy = "calico" load_balancer_sku = "standard" } diff --git a/terraform/aptos-node/azure/kubernetes.tf b/terraform/aptos-node/azure/kubernetes.tf index cb704f20dbbaa..14864dc62d6c9 100644 --- 
a/terraform/aptos-node/azure/kubernetes.tf +++ b/terraform/aptos-node/azure/kubernetes.tf @@ -16,8 +16,6 @@ provider "helm" { locals { # helm chart paths - monitoring_helm_chart_path = "${path.module}/../../helm/monitoring" - logger_helm_chart_path = "${path.module}/../../helm/logger" aptos_node_helm_chart_path = var.helm_chart != "" ? var.helm_chart : "${path.module}/../../helm/aptos-node" } @@ -73,66 +71,3 @@ resource "helm_release" "validator" { value = sha1(join("", [for f in fileset(local.aptos_node_helm_chart_path, "**") : filesha1("${local.aptos_node_helm_chart_path}/${f}")])) } } - -resource "helm_release" "logger" { - count = var.enable_logger ? 1 : 0 - name = "${terraform.workspace}-log" - chart = local.logger_helm_chart_path - max_history = 10 - wait = false - - values = [ - jsonencode({ - logger = { - name = "aptos-logger" - } - chain = { - name = var.chain_name - } - serviceAccount = { - create = false - name = "${terraform.workspace}-aptos-node-validator" - } - }), - jsonencode(var.logger_helm_values), - ] - - # inspired by https://stackoverflow.com/a/66501021 to trigger redeployment whenever any of the charts file contents change. - set { - name = "chart_sha1" - value = sha1(join("", [for f in fileset(local.logger_helm_chart_path, "**") : filesha1("${local.logger_helm_chart_path}/${f}")])) - } -} - -resource "helm_release" "monitoring" { - count = var.enable_monitoring ? 1 : 0 - name = "${terraform.workspace}-mon" - chart = local.monitoring_helm_chart_path - max_history = 10 - wait = false - - values = [ - jsonencode({ - chain = { - name = var.chain_name - } - validator = { - name = var.validator_name - } - monitoring = { - prometheus = { - storage = { - class = "default" - } - } - } - }), - jsonencode(var.monitoring_helm_values), - ] - - # inspired by https://stackoverflow.com/a/66501021 to trigger redeployment whenever any of the charts file contents change. 
- set { - name = "chart_sha1" - value = sha1(join("", [for f in fileset(local.monitoring_helm_chart_path, "**") : filesha1("${local.monitoring_helm_chart_path}/${f}")])) - } -} diff --git a/terraform/aptos-node/azure/variables.tf b/terraform/aptos-node/azure/variables.tf index e8e73e5e1cb12..f8ad067280f62 100644 --- a/terraform/aptos-node/azure/variables.tf +++ b/terraform/aptos-node/azure/variables.tf @@ -5,16 +5,19 @@ variable "region" { variable "era" { description = "Chain era, used to start a clean chain" + type = number default = 1 } variable "chain_id" { description = "Aptos chain ID" + type = string default = "TESTING" } variable "chain_name" { description = "Aptos chain name" + type = string default = "testnet" } @@ -25,26 +28,31 @@ variable "validator_name" { variable "image_tag" { description = "Docker image tag for Aptos node" + type = string default = "devnet" } variable "zone_name" { description = "Zone name of Azure DNS domain to create records in" + type = string default = "" } variable "zone_resource_group" { description = "Azure resource group name of the DNS zone" + type = string default = "" } variable "record_name" { description = "DNS record name to use ( is replaced with the TF workspace name)" + type = string default = ".aptos" } variable "helm_chart" { description = "Path to aptos-validator Helm chart file" + type = string default = "" } @@ -56,11 +64,13 @@ variable "helm_values" { variable "helm_values_file" { description = "Path to file containing values for Helm chart" + type = string default = "" } variable "k8s_api_sources" { description = "List of CIDR subnets which can access the Kubernetes API endpoint" + type = list(string) default = ["0.0.0.0/0"] } @@ -84,47 +94,30 @@ variable "k8s_debugger_groups" { variable "utility_instance_type" { description = "Instance type used for utilities" + type = string default = "Standard_B8ms" } variable "utility_instance_num" { description = "Number of instances for utilities" + type = number 
default = 1 } variable "validator_instance_type" { description = "Instance type used for validator and fullnodes" + type = string default = "Standard_F4s_v2" } variable "validator_instance_num" { description = "Number of instances used for validator and fullnodes" + type = number default = 2 } variable "validator_instance_enable_taint" { description = "Whether to taint the instances in the validator nodegroup" + type = bool default = false } - -variable "enable_logger" { - description = "Enable logger helm chart" - default = false -} - -variable "logger_helm_values" { - description = "Map of values to pass to logger Helm" - type = any - default = {} -} - -variable "enable_monitoring" { - description = "Enable monitoring helm chart" - default = false -} - -variable "monitoring_helm_values" { - description = "Map of values to pass to monitoring Helm" - type = any - default = {} -} diff --git a/terraform/aptos-node/azure/versions.tf b/terraform/aptos-node/azure/versions.tf index 487dac291ae73..c18e160e7df9a 100644 --- a/terraform/aptos-node/azure/versions.tf +++ b/terraform/aptos-node/azure/versions.tf @@ -1,7 +1,7 @@ provider "azuread" {} terraform { - required_version = "~> 1.3.6" + required_version = "~> 1.5.6" required_providers { azuread = { source = "hashicorp/azuread" diff --git a/terraform/aptos-node/gcp/cluster.tf b/terraform/aptos-node/gcp/cluster.tf index dae2708215e8a..2508bef0643b7 100644 --- a/terraform/aptos-node/gcp/cluster.tf +++ b/terraform/aptos-node/gcp/cluster.tf @@ -1,20 +1,23 @@ +locals { + location = var.zone == "" ? 
var.region : "${var.region}-${var.zone}" +} + resource "google_container_cluster" "aptos" { - provider = google-beta - name = "aptos-${local.workspace_name}" - location = local.zone - network = google_compute_network.aptos.id + provider = google-beta + name = "aptos-${local.workspace_name}" + location = local.location + node_locations = var.node_locations + network = google_compute_network.aptos.id remove_default_node_pool = true initial_node_count = 1 - logging_service = "logging.googleapis.com/kubernetes" - monitoring_service = "monitoring.googleapis.com/kubernetes" - release_channel { - channel = "REGULAR" + cost_management_config { + enabled = true } - pod_security_policy_config { - enabled = false + release_channel { + channel = "STABLE" } master_auth { @@ -48,7 +51,7 @@ resource "google_container_cluster" "aptos" { addons_config { network_policy_config { - disabled = false + disabled = true } } @@ -56,18 +59,76 @@ resource "google_container_cluster" "aptos" { enabled = false } - cluster_autoscaling { - enabled = var.gke_enable_node_autoprovisioning + pod_security_policy_config { + enabled = false + } + + dynamic "dns_config" { + for_each = var.enable_clouddns ? ["clouddns"] : [] + content { + cluster_dns = "CLOUD_DNS" + cluster_dns_scope = "CLUSTER_SCOPE" + } + } - dynamic "resource_limits" { - for_each = var.gke_enable_node_autoprovisioning ? { - "cpu" = var.gke_node_autoprovisioning_max_cpu - "memory" = var.gke_node_autoprovisioning_max_memory - } : {} - content { - resource_type = resource_limits.key - minimum = 1 - maximum = resource_limits.value + monitoring_config { + managed_prometheus { + enabled = true + } + # Enable all components. + enable_components = [ + "APISERVER", + "CONTROLLER_MANAGER", + "DAEMONSET", + "DEPLOYMENT", + "HPA", + "POD", + "SCHEDULER", + "STATEFULSET", + "STORAGE", + "SYSTEM_COMPONENTS", + ] + } + + dynamic "cluster_autoscaling" { + for_each = var.gke_enable_node_autoprovisioning ? 
[1] : [] + content { + enabled = var.gke_enable_node_autoprovisioning + autoscaling_profile = var.gke_autoscaling_profile + + dynamic "resource_limits" { + for_each = { + "cpu" = var.gke_node_autoprovisioning_max_cpu + "memory" = var.gke_node_autoprovisioning_max_memory + } + content { + resource_type = resource_limits.key + minimum = 1 + maximum = resource_limits.value + } + } + + auto_provisioning_defaults { + service_account = google_service_account.gke.email + oauth_scopes = ["https://www.googleapis.com/auth/cloud-platform"] + disk_size = var.default_disk_size_gb + disk_type = var.default_disk_type + management { + auto_upgrade = true + auto_repair = true + } + shielded_instance_config { + enable_integrity_monitoring = true + enable_secure_boot = true + } + } + } + } + + node_pool_defaults { + node_config_defaults { + gcfs_config { + enabled = var.enable_image_streaming } } } @@ -82,36 +143,99 @@ resource "google_container_cluster" "aptos" { } } } + + lifecycle { + ignore_changes = [ + private_cluster_config, + ] + } + deletion_protection = false +} + +resource "google_container_node_pool" "core" { + count = var.create_nodepools ? 1 : 0 + provider = google-beta + name = "core" + location = local.location + cluster = google_container_cluster.aptos.name + node_count = lookup(var.node_pool_sizes, "core", null) + + node_config { + machine_type = var.core_instance_type + image_type = "COS_CONTAINERD" + disk_size_gb = lookup(var.instance_disk_sizes, "core", var.default_disk_size_gb) + service_account = google_service_account.gke.email + tags = ["core"] + oauth_scopes = ["https://www.googleapis.com/auth/cloud-platform"] + + workload_metadata_config { + mode = "GKE_METADATA" + } + + shielded_instance_config { + enable_integrity_monitoring = true + enable_secure_boot = true + } + + # The core machine type is too small (<16G) to support image streaming. 
+ gcfs_config { + enabled = false + } + + gvnic { + enabled = true + } + + kubelet_config { + cpu_manager_policy = "none" + } + } + + autoscaling { + min_node_count = 0 + max_node_count = var.gke_autoscaling_max_node_count + } } resource "google_container_node_pool" "utilities" { - provider = google-beta - name = "utilities" - location = local.zone - cluster = google_container_cluster.aptos.name - # If cluster autoscaling is enabled, node_count should not be set - # If node auto-provisioning is enabled, node_count should be set to 0 as this nodepool is most likely ignored - node_count = var.gke_enable_autoscaling ? null : (var.gke_enable_node_autoprovisioning ? 0 : lookup(var.node_pool_sizes, "utilities", var.utility_instance_num)) + count = var.create_nodepools ? 1 : 0 + provider = google-beta + name = "utilities" + location = local.location + cluster = google_container_cluster.aptos.name + node_count = lookup(var.node_pool_sizes, "utilities", null) node_config { machine_type = var.utility_instance_type image_type = "COS_CONTAINERD" - disk_size_gb = var.utility_instance_disk_size_gb + disk_size_gb = lookup(var.instance_disk_sizes, "utilities", var.default_disk_size_gb) service_account = google_service_account.gke.email tags = ["utilities"] oauth_scopes = ["https://www.googleapis.com/auth/cloud-platform"] + workload_metadata_config { + mode = "GKE_METADATA" + } + shielded_instance_config { - enable_secure_boot = true + enable_integrity_monitoring = true + enable_secure_boot = true } - workload_metadata_config { - mode = "GKE_METADATA" + gvnic { + enabled = true + } + + kubelet_config { + cpu_manager_policy = "none" + } + linux_node_config { + sysctls = var.nodepool_sysctls } # if the NodeGroup should be tainted, then create the below dynamic block dynamic "taint" { - for_each = var.utility_instance_enable_taint ? ["utilities"] : [] + for_each = var.utility_instance_enable_taint ? 
["utilities"] : [] content { key = "aptos.org/nodepool" value = taint.value @@ -120,38 +244,46 @@ resource "google_container_node_pool" "utilities" { } } - dynamic "autoscaling" { - for_each = var.gke_enable_autoscaling ? [1] : [] - content { - min_node_count = 1 - max_node_count = var.gke_autoscaling_max_node_count - } + autoscaling { + min_node_count = 0 + max_node_count = var.gke_autoscaling_max_node_count } } resource "google_container_node_pool" "validators" { - provider = google-beta - name = "validators" - location = local.zone - cluster = google_container_cluster.aptos.name - # If cluster autoscaling is enabled, node_count should not be set - # If node auto-provisioning is enabled, node_count should be set to 0 as this nodepool is most likely ignored - node_count = var.gke_enable_autoscaling ? null : (var.gke_enable_node_autoprovisioning ? 0 : lookup(var.node_pool_sizes, "validators", var.validator_instance_num)) + count = var.create_nodepools ? 1 : 0 + provider = google-beta + name = "validators" + location = local.location + cluster = google_container_cluster.aptos.name + node_count = lookup(var.node_pool_sizes, "validators", null) node_config { machine_type = var.validator_instance_type image_type = "COS_CONTAINERD" - disk_size_gb = var.validator_instance_disk_size_gb + disk_size_gb = lookup(var.instance_disk_sizes, "validators", var.default_disk_size_gb) service_account = google_service_account.gke.email tags = ["validators"] oauth_scopes = ["https://www.googleapis.com/auth/cloud-platform"] + workload_metadata_config { + mode = "GKE_METADATA" + } + shielded_instance_config { - enable_secure_boot = true + enable_integrity_monitoring = true + enable_secure_boot = true } - workload_metadata_config { - mode = "GKE_METADATA" + gvnic { + enabled = true + } + + kubelet_config { + cpu_manager_policy = "static" + } + linux_node_config { + sysctls = var.nodepool_sysctls } # if the NodeGroup should be tainted, then create the below dynamic block @@ -165,11 +297,8 
@@ resource "google_container_node_pool" "validators" { } } - dynamic "autoscaling" { - for_each = var.gke_enable_autoscaling ? [1] : [] - content { - min_node_count = 1 - max_node_count = var.gke_autoscaling_max_node_count - } + autoscaling { + min_node_count = 0 + max_node_count = var.gke_autoscaling_max_node_count } } diff --git a/terraform/aptos-node/gcp/kubernetes.tf b/terraform/aptos-node/gcp/kubernetes.tf index 970b4487b94c8..1ff60429cd2e0 100644 --- a/terraform/aptos-node/gcp/kubernetes.tf +++ b/terraform/aptos-node/gcp/kubernetes.tf @@ -25,8 +25,6 @@ provider "helm" { locals { # helm chart paths - monitoring_helm_chart_path = "${path.module}/../../helm/monitoring" - logger_helm_chart_path = "${path.module}/../../helm/logger" aptos_node_helm_chart_path = var.helm_chart != "" ? var.helm_chart : "${path.module}/../../helm/aptos-node" # override the helm release name if an override exists, otherwise adopt the workspace name @@ -55,9 +53,9 @@ resource "helm_release" "validator" { storage = { class = kubernetes_storage_class.ssd.metadata[0].name } - nodeSelector = var.gke_enable_node_autoprovisioning ? {} : { - "cloud.google.com/gke-nodepool" = google_container_node_pool.validators.name - } + nodeSelector = var.validator_instance_enable_taint ? { + "cloud.google.com/gke-nodepool" = "validators" + } : {} tolerations = [{ key = "aptos.org/nodepool" value = "validators" @@ -68,9 +66,9 @@ resource "helm_release" "validator" { storage = { class = kubernetes_storage_class.ssd.metadata[0].name } - nodeSelector = var.gke_enable_node_autoprovisioning ? {} : { - "cloud.google.com/gke-nodepool" = google_container_node_pool.validators.name - } + nodeSelector = var.validator_instance_enable_taint ? { + "cloud.google.com/gke-nodepool" = "validators" + } : {} tolerations = [{ key = "aptos.org/nodepool" value = "validators" @@ -78,9 +76,14 @@ resource "helm_release" "validator" { }] } haproxy = { - nodeSelector = var.gke_enable_node_autoprovisioning ? 
{} : { - "cloud.google.com/gke-nodepool" = google_container_node_pool.utilities.name - } + nodeSelector = var.utility_instance_enable_taint ? { + "cloud.google.com/gke-nodepool" = "utilities" + } : {} + tolerations = [{ + key = "aptos.org/nodepool" + value = "utilities" + effect = "NoExecute" + }] } service = { domain = local.domain @@ -99,82 +102,3 @@ resource "helm_release" "validator" { } } } - -resource "helm_release" "logger" { - count = var.enable_logger ? 1 : 0 - name = "${local.helm_release_name}-log" - chart = local.logger_helm_chart_path - max_history = 10 - wait = false - - values = [ - jsonencode({ - logger = { - name = "aptos-logger" - } - chain = { - name = var.chain_name - } - serviceAccount = { - create = false - # this name must match the serviceaccount created by the aptos-node helm chart - name = local.helm_release_name == "aptos-node" ? "aptos-node-validator" : "${local.helm_release_name}-aptos-node-validator" } - }), - jsonencode(var.logger_helm_values), - ] - - # inspired by https://stackoverflow.com/a/66501021 to trigger redeployment whenever any of the charts file contents change. - set { - name = "chart_sha1" - value = sha1(join("", [for f in fileset(local.logger_helm_chart_path, "**") : filesha1("${local.logger_helm_chart_path}/${f}")])) - } -} - -resource "helm_release" "monitoring" { - count = var.enable_monitoring ? 1 : 0 - name = "${local.helm_release_name}-mon" - chart = local.monitoring_helm_chart_path - max_history = 10 - wait = false - - values = [ - jsonencode({ - chain = { - name = var.chain_name - } - validator = { - name = var.validator_name - } - monitoring = { - prometheus = { - storage = { - class = kubernetes_storage_class.ssd.metadata[0].name - } - } - } - }), - jsonencode(var.monitoring_helm_values), - ] - - # inspired by https://stackoverflow.com/a/66501021 to trigger redeployment whenever any of the charts file contents change. 
- set { - name = "chart_sha1" - value = sha1(join("", [for f in fileset(local.monitoring_helm_chart_path, "**") : filesha1("${local.monitoring_helm_chart_path}/${f}")])) - } -} - -resource "helm_release" "node_exporter" { - count = var.enable_node_exporter ? 1 : 0 - name = "prometheus-node-exporter" - repository = "https://prometheus-community.github.io/helm-charts" - chart = "prometheus-node-exporter" - version = "4.0.0" - namespace = "kube-system" - max_history = 5 - wait = false - - values = [ - jsonencode({}), - jsonencode(var.node_exporter_helm_values), - ] -} diff --git a/terraform/aptos-node/gcp/main.tf b/terraform/aptos-node/gcp/main.tf index 7229840f6c49b..5a56adef7c8a9 100644 --- a/terraform/aptos-node/gcp/main.tf +++ b/terraform/aptos-node/gcp/main.tf @@ -11,7 +11,6 @@ provider "google-beta" { data "google_client_config" "provider" {} locals { - zone = "${var.region}-${var.zone}" workspace_name = var.workspace_name_override == "" ? terraform.workspace : var.workspace_name_override } diff --git a/terraform/aptos-node/gcp/variables.tf b/terraform/aptos-node/gcp/variables.tf index c5dd42daf4f58..04a25dd872d99 100644 --- a/terraform/aptos-node/gcp/variables.tf +++ b/terraform/aptos-node/gcp/variables.tf @@ -1,11 +1,5 @@ ### Project config -variable "cluster_bootstrap" { - description = "Set when bootstrapping a new cluster" - type = bool - default = false -} - variable "project" { description = "GCP project" type = string @@ -19,20 +13,30 @@ variable "region" { variable "zone" { description = "GCP zone suffix" type = string + default = "" # if empty, it's a regional cluster +} + +variable "node_locations" { + description = "List of node locations" + type = list(string) + default = [] # if empty, let GCP choose } variable "era" { description = "Chain era, used to start a clean chain" + type = number default = 1 } variable "chain_id" { description = "Aptos chain ID" + type = string default = "TESTING" } variable "chain_name" { description = "Aptos chain name" 
+ type = string default = "testnet" } @@ -43,11 +47,13 @@ variable "validator_name" { variable "image_tag" { description = "Docker image tag for Aptos node" + type = string default = "devnet" } variable "helm_chart" { description = "Path to aptos-validator Helm chart file" + type = string default = "" } @@ -59,166 +65,169 @@ variable "helm_values" { variable "helm_values_file" { description = "Path to file containing values for Helm chart" + type = string default = "" } variable "k8s_api_sources" { description = "List of CIDR subnets which can access the Kubernetes API endpoint" + type = list(string) default = ["0.0.0.0/0"] } -variable "node_pool_sizes" { - type = map(number) - default = {} - description = "Override the number of nodes in the specified pool" -} - -variable "utility_instance_type" { - description = "Instance type used for utilities" - default = "n2-standard-8" +variable "manage_via_tf" { + description = "Whether to manage the aptos-node k8s workload via Terraform. If set to false, the helm_release resource will still be created and updated when values change, but it may not be updated on every apply" + type = bool + default = true } -variable "utility_instance_num" { - description = "Number of instances for utilities" - default = 1 -} +### DNS -variable "utility_instance_enable_taint" { - description = "Whether to taint the instances in the utility nodegroup" - default = false +variable "zone_name" { + description = "Zone name of GCP Cloud DNS zone to create records in" + type = string + default = "" } -variable "utility_instance_disk_size_gb" { - description = "Disk size for utility instances" - default = 20 +variable "zone_project" { + description = "GCP project which the DNS zone is in (if different)" + type = string + default = "" } -variable "validator_instance_type" { - description = "Instance type used for validator and fullnodes" - default = "n2-standard-32" +variable "workspace_dns" { + description = "Include Terraform workspace name in DNS 
records" + type = bool + default = true } -variable "validator_instance_num" { - description = "Number of instances used for validator and fullnodes" - default = 2 +variable "record_name" { + description = "DNS record name to use ( is replaced with the TF workspace name)" + type = string + default = ".aptos" } -variable "validator_instance_enable_taint" { - description = "Whether to taint instances in the validator nodegroup" - default = false +variable "create_dns_records" { + description = "Creates DNS records in var.zone_name that point to k8s service, as opposed to using external-dns or other means" + type = bool + default = true } -variable "validator_instance_disk_size_gb" { - description = "Disk size for validator instances" - default = 20 +variable "dns_ttl" { + description = "Time-to-Live for the Validator and Fullnode DNS records" + type = number + default = 300 } -variable "enable_logger" { - description = "Enable logger helm chart" - default = false -} +### Node pools and Autoscaling -variable "logger_helm_values" { - description = "Map of values to pass to logger Helm" - type = any +variable "node_pool_sizes" { + type = map(number) default = {} + description = "Override the number of nodes in the specified pool" } -variable "enable_monitoring" { - description = "Enable monitoring helm chart" - default = false -} - -variable "monitoring_helm_values" { - description = "Map of values to pass to monitoring Helm" - type = any +variable "instance_disk_sizes" { + type = map(number) default = {} + description = "Override the disk size in the specified pool" } -variable "enable_node_exporter" { - description = "Enable Prometheus node exporter helm chart" - default = false +variable "default_disk_size_gb" { + description = "Default disk size for nodes" + type = number + default = 100 } -variable "node_exporter_helm_values" { - description = "Map of values to pass to node exporter Helm" - type = any - default = {} +variable "default_disk_type" { + description = 
"Default disk type for nodes" + type = string + default = "pd-standard" } -variable "manage_via_tf" { - description = "Whether to manage the aptos-node k8s workload via Terraform. If set to false, the helm_release resource will still be created and updated when values change, but it may not be updated on every apply" +variable "create_nodepools" { + description = "Create managed nodepools" + type = bool default = true } -### DNS - -variable "zone_name" { - description = "Zone name of GCP Cloud DNS zone to create records in" - default = "" +variable "nodepool_sysctls" { + description = "Sysctls to set on nodepools" + type = map(string) + default = {} } -variable "zone_project" { - description = "GCP project which the DNS zone is in (if different)" - default = "" +variable "core_instance_type" { + description = "Instance type used for core pods" + type = string + default = "e2-medium" } -variable "workspace_dns" { - description = "Include Terraform workspace name in DNS records" - default = true +variable "utility_instance_type" { + description = "Instance type used for utility pods" + type = string + default = "e2-standard-8" } -variable "record_name" { - description = "DNS record name to use ( is replaced with the TF workspace name)" - default = ".aptos" +variable "validator_instance_type" { + description = "Instance type used for validator and fullnodes" + type = string + default = "t2d-standard-16" } -variable "create_dns_records" { - description = "Creates DNS records in var.zone_name that point to k8s service, as opposed to using external-dns or other means" - default = true +variable "utility_instance_enable_taint" { + description = "Whether to taint instances in the utilities nodegroup" + type = bool + default = false } -variable "dns_ttl" { - description = "Time-to-Live for the Validator and Fullnode DNS records" - default = 300 +variable "validator_instance_enable_taint" { + description = "Whether to taint instances in the validator nodegroup" + type = bool 
+ default = false } -### Autoscaling - variable "gke_enable_node_autoprovisioning" { - description = "Enable node autoprovisioning for GKE cluster. See https://cloud.google.com/kubernetes-engine/docs/how-to/node-auto-provisioning" - default = false + description = "Enable GKE node autoprovisioning" + type = bool + default = true } variable "gke_node_autoprovisioning_max_cpu" { - description = "Maximum CPU utilization for GKE node_autoprovisioning" - default = 10 + description = "Maximum CPU allocation for GKE node autoprovisioning" + type = number + default = 500 } variable "gke_node_autoprovisioning_max_memory" { - description = "Maximum memory utilization for GKE node_autoprovisioning" - default = 100 + description = "Maximum memory allocation for GKE node autoprovisioning" + type = number + default = 2000 } -variable "gke_enable_autoscaling" { - description = "Enable autoscaling for the nodepools in the GKE cluster. See https://cloud.google.com/kubernetes-engine/docs/concepts/cluster-autoscaler" - default = true +variable "gke_autoscaling_profile" { + description = "Autoscaling profile for GKE cluster. See https://cloud.google.com/kubernetes-engine/docs/concepts/cluster-autoscaler#autoscaling_profiles" + type = string + default = "OPTIMIZE_UTILIZATION" } variable "gke_autoscaling_max_node_count" { description = "Maximum number of nodes for GKE nodepool autoscaling" - default = 10 + type = number + default = 250 } ### Naming overrides variable "helm_release_name_override" { description = "If set, overrides the name of the aptos-node helm chart" + type = string default = "" } variable "workspace_name_override" { description = "If specified, overrides the usage of Terraform workspace for naming purposes" + type = string default = "" } @@ -226,18 +235,33 @@ variable "workspace_name_override" { variable "cluster_ipv4_cidr_block" { description = "The IP address range of the container pods in this cluster, in CIDR notation. 
See https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/container_cluster#cluster_ipv4_cidr_block" + type = string default = "" } +variable "enable_clouddns" { + description = "Enable CloudDNS (Google-managed cluster DNS)" + type = bool + default = false +} + +variable "enable_image_streaming" { + description = "Enable image streaming (GCFS)" + type = bool + default = false +} + ### Helm variable "num_validators" { description = "The number of validator nodes to create" + type = number default = 1 } variable "num_fullnode_groups" { description = "The number of fullnode groups to create" + type = number default = 1 } diff --git a/terraform/aptos-node/gcp/versions.tf b/terraform/aptos-node/gcp/versions.tf index 2b8786efb55aa..b74536e8f4cc8 100644 --- a/terraform/aptos-node/gcp/versions.tf +++ b/terraform/aptos-node/gcp/versions.tf @@ -1,13 +1,13 @@ terraform { - required_version = "~> 1.3.6" + required_version = "~> 1.5.6" required_providers { google = { source = "hashicorp/google" - version = "~> 4.54.0" + version = "~> 5.0.0" } google-beta = { source = "hashicorp/google-beta" - version = "~> 4.54.0" + version = "~> 5.0.0" } helm = { source = "hashicorp/helm" diff --git a/terraform/fullnode/aws/addons.tf b/terraform/fullnode/aws/addons.tf index 0f301173bd899..52bb764c77565 100644 --- a/terraform/fullnode/aws/addons.tf +++ b/terraform/fullnode/aws/addons.tf @@ -54,4 +54,42 @@ resource "helm_release" "external-dns" { ] } +resource "helm_release" "pfn-addons" { + depends_on = [ + helm_release.fullnode + ] + name = "pfn-addons" + chart = local.pfn_addons_helm_chart_path + max_history = 10 + wait = false + values = [ + jsonencode({ + service = { + domain = local.domain + aws_tags = local.aws_tags + fullnode = { + numFullnodes = var.num_fullnodes + loadBalancerSourceRanges = var.client_sources_ipv4 + } + } + ingress = { + class = "alb" + acm_certificate = var.zone_id != "" ? 
aws_acm_certificate.ingress[0].arn : null + loadBalancerSourceRanges = var.client_sources_ipv4 + } + load_test = { + config = { + numFullnodeGroups = var.num_fullnodes + } + } + }), + jsonencode(var.pfn_helm_values), + ] + + # inspired by https://stackoverflow.com/a/66501021 to trigger redeployment whenever any of the charts file contents change. + set { + name = "chart_sha1" + value = sha1(join("", [for f in fileset(local.pfn_addons_helm_chart_path, "**") : filesha1("${local.pfn_addons_helm_chart_path}/${f}")])) + } +} diff --git a/terraform/fullnode/aws/kubernetes.tf b/terraform/fullnode/aws/kubernetes.tf index 3be8bea180f11..263106320e6f4 100644 --- a/terraform/fullnode/aws/kubernetes.tf +++ b/terraform/fullnode/aws/kubernetes.tf @@ -1,43 +1,6 @@ locals { pfn_addons_helm_chart_path = "${path.module}/../../helm/pfn-addons" - pfn_logger_helm_chart_path = "${path.module}/../../helm/logger" fullnode_helm_chart_path = "${path.module}/../../helm/fullnode" - monitoring_helm_chart_path = "${path.module}/../../helm/monitoring" -} - -resource "helm_release" "pfn-addons" { - depends_on = [ - helm_release.fullnode - ] - name = "pfn-addons" - chart = local.pfn_addons_helm_chart_path - max_history = 10 - wait = false - - values = [ - jsonencode({ - service = { - domain = local.domain - aws_tags = local.aws_tags - fullnode = { - numFullnodes = var.num_fullnodes - loadBalancerSourceRanges = var.client_sources_ipv4 - } - } - ingress = { - class = "alb" - acm_certificate = var.zone_id != "" ? aws_acm_certificate.ingress[0].arn : null - loadBalancerSourceRanges = var.client_sources_ipv4 - } - }), - jsonencode(var.pfn_helm_values), - ] - - # inspired by https://stackoverflow.com/a/66501021 to trigger redeployment whenever any of the charts file contents change. 
- set { - name = "chart_sha1" - value = sha1(join("", [for f in fileset(local.pfn_addons_helm_chart_path, "**") : filesha1("${local.pfn_addons_helm_chart_path}/${f}")])) - } } resource "helm_release" "fullnode" { @@ -60,9 +23,6 @@ resource "helm_release" "fullnode" { image = { tag = local.image_tag } - logging = { - address = var.enable_pfn_logger ? "fullnode-pfn-aptos-logger:5044" : "" - } nodeSelector = { "eks.amazonaws.com/nodegroup" = "fullnode" } @@ -113,72 +73,3 @@ resource "helm_release" "fullnode" { } } } - - -resource "helm_release" "pfn-logger" { - count = var.enable_pfn_logger ? 1 : 0 - name = "pfn-logger" - chart = local.pfn_logger_helm_chart_path - max_history = 10 - wait = false - - values = [ - jsonencode({ - logger = { - name = "pfn" - } - chain = { - name = "aptos-${local.workspace_name}" - } - }), - jsonencode(var.pfn_logger_helm_values), - ] - - # inspired by https://stackoverflow.com/a/66501021 to trigger redeployment whenever any of the charts file contents change. - set { - name = "chart_sha1" - value = sha1(join("", [for f in fileset(local.pfn_logger_helm_chart_path, "**") : filesha1("${local.pfn_logger_helm_chart_path}/${f}")])) - } -} - -resource "helm_release" "monitoring" { - count = var.enable_monitoring ? 1 : 0 - name = "aptos-monitoring" - chart = local.monitoring_helm_chart_path - max_history = 5 - wait = false - - values = [ - jsonencode({ - chain = { - name = var.chain_name - } - fullnode = { - name = var.fullnode_name - } - service = { - domain = local.domain - } - kube-state-metrics = { - enabled = var.enable_kube_state_metrics - } - prometheus-node-exporter = { - enabled = var.enable_prometheus_node_exporter - } - monitoring = { - prometheus = { - storage = { - class = "gp3" - } - } - } - }), - jsonencode(var.monitoring_helm_values), - ] - - # inspired by https://stackoverflow.com/a/66501021 to trigger redeployment whenever any of the charts file contents change. 
- set { - name = "chart_sha1" - value = sha1(join("", [for f in fileset(local.monitoring_helm_chart_path, "**") : filesha1("${local.monitoring_helm_chart_path}/${f}")])) - } -} diff --git a/terraform/fullnode/aws/network.tf b/terraform/fullnode/aws/network.tf index 9ac9bad604596..9f0e636f8d221 100644 --- a/terraform/fullnode/aws/network.tf +++ b/terraform/fullnode/aws/network.tf @@ -5,14 +5,14 @@ data "aws_route53_zone" "pfn" { locals { dns_prefix = var.workspace_dns ? "${local.workspace_name}.${var.dns_prefix_name}." : "${var.dns_prefix_name}." - domain = var.zone_id != "" ? "${local.dns_prefix}${data.aws_route53_zone.pfn[0].name}" : null + domain = var.zone_id != "" ? "${local.dns_prefix}${data.aws_route53_zone.pfn[0].name}" : terraform.workspace } resource "aws_acm_certificate" "ingress" { count = var.zone_id != "" ? 1 : 0 domain_name = local.domain - subject_alternative_names = concat(["*.${local.domain}"], var.tls_sans) + subject_alternative_names = distinct(concat(["*.${local.domain}"], var.tls_sans)) validation_method = "DNS" lifecycle { diff --git a/terraform/fullnode/aws/security.tf b/terraform/fullnode/aws/security.tf index 6dbf2c6a4933c..4addace8ff219 100644 --- a/terraform/fullnode/aws/security.tf +++ b/terraform/fullnode/aws/security.tf @@ -1,9 +1,6 @@ # Security-related resources -data "kubernetes_all_namespaces" "all" {} - locals { - kubernetes_master_version = substr(data.aws_eks_cluster.aptos.version, 0, 4) baseline_pss_labels = { "pod-security.kubernetes.io/audit" = "baseline" "pod-security.kubernetes.io/warn" = "baseline" @@ -11,27 +8,6 @@ locals { } } -# FIXME: Remove when migrating to K8s 1.25 -resource "kubernetes_role_binding" "disable-psp" { - for_each = toset(local.kubernetes_master_version <= "1.24" ? 
data.kubernetes_all_namespaces.all.namespaces : []) - metadata { - name = "privileged-psp" - namespace = each.value - } - - role_ref { - api_group = "rbac.authorization.k8s.io" - kind = "ClusterRole" - name = "eks:podsecuritypolicy:privileged" - } - - subject { - api_group = "rbac.authorization.k8s.io" - kind = "Group" - name = "system:serviceaccounts:${each.value}" - } -} - resource "kubernetes_labels" "pss-default" { api_version = "v1" kind = "Namespace" diff --git a/terraform/fullnode/aws/variables.tf b/terraform/fullnode/aws/variables.tf index af70ce8a0402b..98fc577581b82 100644 --- a/terraform/fullnode/aws/variables.tf +++ b/terraform/fullnode/aws/variables.tf @@ -1,20 +1,23 @@ variable "region" { description = "AWS region" + type = string } variable "workspace_name_override" { description = "If specified, overrides the usage of Terraform workspace for naming purposes" + type = string default = "" } variable "iam_path" { - default = "/" description = "Path to use when naming IAM objects" + type = string + default = "/" } variable "permissions_boundary_policy" { - default = "" description = "ARN of IAM policy to set as permissions boundary on created roles" + type = string } variable "admin_sources_ipv4" { @@ -40,31 +43,38 @@ variable "k8s_admins" { } variable "num_fullnodes" { - default = 1 + description = "Number of fullnodes." + type = number + default = 1 } variable "image_tag" { description = "Docker image tag for aptos components. Overrides ecr_repo method." 
+ type = string default = "" } variable "ecr_repo" { description = "Name of an ECR repo to resolve 'stable' tag to a specific revision" + type = string default = "" } variable "era" { description = "Chain era, used to start a clean chain" + type = number default = 15 } variable "chain_id" { - description = "aptos chain ID" + description = "Aptos chain ID" + type = string default = "DEVNET" } variable "chain_name" { description = "Aptos chain name" + type = string default = "devnet" } @@ -93,6 +103,7 @@ variable "fullnode_helm_values_list" { variable "zone_id" { description = "Route53 Zone ID to create records in" + type = string default = "" } @@ -104,86 +115,64 @@ variable "tls_sans" { variable "workspace_dns" { description = "Include Terraform workspace name in DNS records" + type = bool default = true } variable "dns_prefix_name" { description = "DNS prefix for fullnode url" + type = string default = "fullnode" } -variable "enable_pfn_logger" { - description = "Enable separate public fullnode logger pod" - default = false -} - -variable "pfn_logger_helm_values" { - description = "Map of values to pass to public fullnode logger Helm" - type = any - default = {} -} - variable "utility_instance_type" { description = "Instance type used for utilities" + type = string default = "t3.medium" } variable "fullnode_instance_type" { description = "Instance type used for validator and fullnodes" + type = string default = "c6i.8xlarge" } variable "num_extra_instance" { - default = 0 description = "Number of extra instances to add into node pool" + type = number + default = 0 } variable "enable_backup" { - description = "enable data backup from fullnode" + description = "Enable data backup from fullnode" + type = bool default = false } variable "enable_public_backup" { - description = "provide data backups to the public" + description = "Provide data backups to the public" + type = bool default = false } variable "backup_fullnode_index" { - description = "index of fullnode to 
backup data from" + description = "Index of fullnode to backup data from" + type = number default = 0 } variable "fullnode_storage_class" { description = "Which storage class to use for the validator and fullnode" + type = string default = "io1" validation { - condition = contains(["gp3", "gp2", "io1", "io2"], var.fullnode_storage_class) - error_message = "Supported storage classes are gp3, io1, io2" + condition = contains(["gp2", "gp3", "io1", "io2"], var.fullnode_storage_class) + error_message = "Supported storage classes are gp2, gp3, io1, io2" } } -variable "enable_monitoring" { - description = "Enable monitoring helm chart" - default = false -} - -variable "monitoring_helm_values" { - description = "Map of values to pass to monitoring Helm" - type = any - default = {} -} - -variable "enable_prometheus_node_exporter" { - description = "Enable prometheus-node-exporter within monitoring helm chart" - default = false -} - -variable "enable_kube_state_metrics" { - description = "Enable kube-state-metrics within monitoring helm chart" - default = false -} - variable "manage_via_tf" { description = "Whether to manage the aptos-node k8s workload via Terraform. 
If set to false, the helm_release resource will still be created and updated when values change, but it may not be updated on every apply" + type = bool default = true } diff --git a/terraform/fullnode/aws/versions.tf b/terraform/fullnode/aws/versions.tf index 9e00b537a668e..134ae20d5c34d 100644 --- a/terraform/fullnode/aws/versions.tf +++ b/terraform/fullnode/aws/versions.tf @@ -1,8 +1,9 @@ terraform { - required_version = "~> 1.3.6" + required_version = "~> 1.5.6" required_providers { aws = { - source = "hashicorp/aws" + source = "hashicorp/aws" + version = "~> 4.35.0" } helm = { source = "hashicorp/helm" diff --git a/terraform/fullnode/digital_ocean/variables.tf b/terraform/fullnode/digital_ocean/variables.tf index 89a0b5eca62bb..a6ed5dd0ad6fa 100644 --- a/terraform/fullnode/digital_ocean/variables.tf +++ b/terraform/fullnode/digital_ocean/variables.tf @@ -11,8 +11,8 @@ variable "fullnode_helm_values" { } variable "do_token" { - type = string description = "Digital Notion API token" + type = string } variable "region" { @@ -27,41 +27,49 @@ variable "fullnode_helm_values_list" { } variable "k8s_namespace" { - default = "aptos" description = "Kubernetes namespace that the fullnode will be deployed into" + type = string + default = "aptos" } variable "k8s_api_sources" { description = "List of CIDR subnets which can access the Kubernetes API endpoint" + type = list(string) default = ["0.0.0.0/0"] } variable "num_fullnodes" { - default = 1 description = "Number of fullnodes" + type = number + default = 1 } variable "image_tag" { - default = "devnet" description = "Docker image tag to use for the fullnode" + type = string + default = "devnet" } variable "era" { description = "Chain era, used to start a clean chain" + type = number default = 1 } variable "chain_id" { - description = "aptos chain ID" + description = "Aptos chain ID" + type = string default = "DEVNET" } variable "chain_name" { description = "Aptos chain name" + type = string default = "devnet" } variable 
"machine_type" { description = "Machine type for running fullnode" + type = string default = "s-16vcpu-32gb" } diff --git a/terraform/fullnode/gcp/addons.tf b/terraform/fullnode/gcp/addons.tf index 8985415254817..de354a72a76be 100644 --- a/terraform/fullnode/gcp/addons.tf +++ b/terraform/fullnode/gcp/addons.tf @@ -31,8 +31,9 @@ data "google_dns_managed_zone" "pfn" { } locals { - dns_prefix = var.workspace_dns ? "${local.workspace_name}.${var.dns_prefix_name}." : "${var.dns_prefix_name}." - domain = var.zone_name != "" ? trimsuffix("${local.dns_prefix}${data.google_dns_managed_zone.pfn[0].dns_name}", ".") : null + zone_project = var.zone_project != "" ? var.zone_project : var.project + dns_prefix = var.workspace_dns ? "${local.workspace_name}.${var.dns_prefix_name}." : "${var.dns_prefix_name}." + domain = var.zone_name != "" ? trimsuffix("${local.dns_prefix}${data.google_dns_managed_zone.pfn[0].dns_name}", ".") : null } resource "helm_release" "external-dns" { @@ -61,3 +62,41 @@ resource "helm_release" "external-dns" { }) ] } + +resource "helm_release" "pfn-addons" { + depends_on = [ + helm_release.fullnode + ] + name = "pfn-addons" + chart = local.pfn_addons_helm_chart_path + max_history = 10 + wait = false + namespace = var.k8s_namespace + + values = [ + jsonencode({ + service = { + domain = local.domain + } + ingress = { + class = "gce" + backend_http2 = var.backend_http2 + gce_managed_certificate = var.create_google_managed_ssl_certificate ? "aptos-${local.workspace_name}-ingress" : null + gce_managed_certificate_domains = var.create_google_managed_ssl_certificate ? join(",", distinct(concat([local.domain], var.tls_sans))) : "" + # loadBalancerSourceRanges = var.client_sources_ipv4 # not supported yet + } + load_test = { + config = { + numFullnodeGroups = var.num_fullnodes + } + } + }), + jsonencode(var.pfn_helm_values), + ] + + # inspired by https://stackoverflow.com/a/66501021 to trigger redeployment whenever any of the charts file contents change. 
+ set { + name = "chart_sha1" + value = sha1(join("", [for f in fileset(local.pfn_addons_helm_chart_path, "**") : filesha1("${local.pfn_addons_helm_chart_path}/${f}")])) + } +} diff --git a/terraform/fullnode/gcp/auth.tf b/terraform/fullnode/gcp/auth.tf index c098b1b962382..4af1fe37bfd83 100644 --- a/terraform/fullnode/gcp/auth.tf +++ b/terraform/fullnode/gcp/auth.tf @@ -1,7 +1,3 @@ -locals { - zone_project = var.zone_project != "" ? var.zone_project : var.project -} - resource "google_service_account" "gke" { account_id = "aptos-${terraform.workspace}-gke" } diff --git a/terraform/fullnode/gcp/cluster.tf b/terraform/fullnode/gcp/cluster.tf index 2b74252fbb336..ae573e650dd20 100644 --- a/terraform/fullnode/gcp/cluster.tf +++ b/terraform/fullnode/gcp/cluster.tf @@ -1,24 +1,23 @@ -resource "google_container_cluster" "aptos" { - provider = google-beta - name = "aptos-${terraform.workspace}" - location = local.zone - network = google_compute_network.aptos.id +locals { + location = var.zone == "" ? 
var.region : "${var.region}-${var.zone}" +} - lifecycle { - ignore_changes = [ - private_cluster_config, - cluster_autoscaling[0].auto_provisioning_defaults[0].shielded_instance_config - ] - prevent_destroy = true - } +resource "google_container_cluster" "aptos" { + provider = google-beta + name = "aptos-${terraform.workspace}" + location = local.location + node_locations = var.node_locations + network = google_compute_network.aptos.id remove_default_node_pool = true initial_node_count = 1 - logging_service = "logging.googleapis.com/kubernetes" - monitoring_service = "monitoring.googleapis.com/kubernetes" + + cost_management_config { + enabled = true + } release_channel { - channel = "REGULAR" + channel = "STABLE" } master_auth { @@ -37,7 +36,7 @@ resource "google_container_cluster" "aptos" { } private_cluster_config { - enable_private_nodes = var.gke_enable_private_nodes + enable_private_nodes = true enable_private_endpoint = false master_ipv4_cidr_block = "172.16.0.0/28" } @@ -52,7 +51,7 @@ resource "google_container_cluster" "aptos" { addons_config { network_policy_config { - disabled = false + disabled = true } } @@ -60,55 +59,246 @@ resource "google_container_cluster" "aptos" { enabled = false } - cluster_autoscaling { - enabled = var.gke_enable_node_autoprovisioning + pod_security_policy_config { + enabled = false + } + + dynamic "dns_config" { + for_each = var.enable_clouddns ? ["clouddns"] : [] + content { + cluster_dns = "CLOUD_DNS" + cluster_dns_scope = "CLUSTER_SCOPE" + } + } + + monitoring_config { + managed_prometheus { + enabled = true + } + # Enable all components. + enable_components = [ + "APISERVER", + "CONTROLLER_MANAGER", + "DAEMONSET", + "DEPLOYMENT", + "HPA", + "POD", + "SCHEDULER", + "STATEFULSET", + "STORAGE", + "SYSTEM_COMPONENTS", + ] + } + + dynamic "cluster_autoscaling" { + for_each = var.gke_enable_node_autoprovisioning ? 
[1] : [] + content { + enabled = var.gke_enable_node_autoprovisioning + autoscaling_profile = var.gke_autoscaling_profile + + dynamic "resource_limits" { + for_each = { + "cpu" = var.gke_node_autoprovisioning_max_cpu + "memory" = var.gke_node_autoprovisioning_max_memory + } + content { + resource_type = resource_limits.key + minimum = 1 + maximum = resource_limits.value + } + } + + auto_provisioning_defaults { + service_account = google_service_account.gke.email + oauth_scopes = ["https://www.googleapis.com/auth/cloud-platform"] + disk_size = var.default_disk_size_gb + disk_type = var.default_disk_type + management { + auto_upgrade = true + auto_repair = true + } + shielded_instance_config { + enable_integrity_monitoring = true + enable_secure_boot = true + } + } + } + } + + node_pool_defaults { + node_config_defaults { + gcfs_config { + enabled = var.enable_image_streaming + } + } + } - dynamic "resource_limits" { - for_each = var.gke_enable_node_autoprovisioning ? { - "cpu" = var.gke_node_autoprovisioning_max_cpu - "memory" = var.gke_node_autoprovisioning_max_memory - } : {} + maintenance_policy { + dynamic "recurring_window" { + for_each = var.gke_maintenance_policy.recurring_window != null ? [1] : [] content { - resource_type = resource_limits.key - minimum = 1 - maximum = resource_limits.value + start_time = var.gke_maintenance_policy.recurring_window.start_time + end_time = var.gke_maintenance_policy.recurring_window.end_time + recurrence = var.gke_maintenance_policy.recurring_window.recurrence } } - auto_provisioning_defaults { - oauth_scopes = ["https://www.googleapis.com/auth/cloud-platform"] - service_account = google_service_account.gke.email + } + + lifecycle { + ignore_changes = [ + private_cluster_config, + ] + } + deletion_protection = false +} + +resource "google_container_node_pool" "core" { + count = var.create_nodepools ? 
1 : 0 + provider = google-beta + name = "core" + location = local.location + cluster = google_container_cluster.aptos.name + node_count = lookup(var.node_pool_sizes, "core", null) + + node_config { + machine_type = var.core_instance_type + image_type = "COS_CONTAINERD" + disk_size_gb = lookup(var.instance_disk_sizes, "core", var.default_disk_size_gb) + service_account = google_service_account.gke.email + tags = ["core"] + oauth_scopes = ["https://www.googleapis.com/auth/cloud-platform"] + + workload_metadata_config { + mode = "GKE_METADATA" + } + + shielded_instance_config { + enable_integrity_monitoring = true + enable_secure_boot = true + } + + # The core machine type is too small (<16G) to support image streaming. + gcfs_config { + enabled = false + } + + gvnic { + enabled = true + } + + kubelet_config { + cpu_manager_policy = "none" } } + + autoscaling { + min_node_count = 0 + max_node_count = var.gke_autoscaling_max_node_count + } +} + +resource "google_container_node_pool" "utilities" { + count = var.create_nodepools ? 
1 : 0 + provider = google-beta + name = "utilities" + location = local.location + cluster = google_container_cluster.aptos.name + node_count = lookup(var.node_pool_sizes, "utilities", null) + + node_config { + machine_type = var.utility_instance_type + image_type = "COS_CONTAINERD" + disk_size_gb = lookup(var.instance_disk_sizes, "utilities", var.default_disk_size_gb) + service_account = google_service_account.gke.email + tags = ["utilities"] + oauth_scopes = ["https://www.googleapis.com/auth/cloud-platform"] + + workload_metadata_config { + mode = "GKE_METADATA" + } + + shielded_instance_config { + enable_integrity_monitoring = true + enable_secure_boot = true + } + + gvnic { + enabled = true + } + + kubelet_config { + cpu_manager_policy = "none" + } + linux_node_config { + sysctls = var.nodepool_sysctls + } + + # if the NodeGroup should be tainted, then create the below dynamic block + dynamic "taint" { + for_each = var.utility_instance_enable_taint ? ["utilities"] : [] + content { + key = "aptos.org/nodepool" + value = taint.value + effect = "NO_EXECUTE" + } + } + } + + autoscaling { + min_node_count = 0 + max_node_count = var.gke_autoscaling_max_node_count + } } resource "google_container_node_pool" "fullnodes" { + count = var.create_nodepools ? 1 : 0 provider = google-beta name = "fullnodes" - location = local.zone + location = local.location cluster = google_container_cluster.aptos.name - node_count = var.gke_enable_autoscaling ? 
null : var.num_fullnodes + var.num_extra_instance + node_count = lookup(var.node_pool_sizes, "fullnodes", null) node_config { - machine_type = var.machine_type + machine_type = var.fullnode_instance_type image_type = "COS_CONTAINERD" - disk_size_gb = var.instance_disk_size_gb + disk_size_gb = lookup(var.instance_disk_sizes, "fullnodes", var.default_disk_size_gb) service_account = google_service_account.gke.email tags = ["fullnodes"] + oauth_scopes = ["https://www.googleapis.com/auth/cloud-platform"] + + workload_metadata_config { + mode = "GKE_METADATA" + } shielded_instance_config { - enable_secure_boot = true + enable_integrity_monitoring = true + enable_secure_boot = true } - workload_metadata_config { - mode = "GKE_METADATA" + gvnic { + enabled = true } - } - dynamic "autoscaling" { - for_each = var.gke_enable_autoscaling ? [1] : [] - content { - min_node_count = 1 - max_node_count = var.gke_autoscaling_max_node_count + kubelet_config { + cpu_manager_policy = "static" + } + linux_node_config { + sysctls = var.nodepool_sysctls } + + # if the NodeGroup should be tainted, then create the below dynamic block + dynamic "taint" { + for_each = var.fullnode_instance_enable_taint ? 
["fullnodes"] : [] + content { + key = "aptos.org/nodepool" + value = taint.value + effect = "NO_EXECUTE" + } + } + } + + autoscaling { + min_node_count = 0 + max_node_count = var.gke_autoscaling_max_node_count } } diff --git a/terraform/fullnode/gcp/kubernetes.tf b/terraform/fullnode/gcp/kubernetes.tf index 0acd8c7338473..b9975f7e90b3f 100644 --- a/terraform/fullnode/gcp/kubernetes.tf +++ b/terraform/fullnode/gcp/kubernetes.tf @@ -32,40 +32,15 @@ provider "helm" { locals { fullnode_helm_chart_path = "${path.module}/../../helm/fullnode" pfn_addons_helm_chart_path = "${path.module}/../../helm/pfn-addons" - monitoring_helm_chart_path = "${path.module}/../../helm/monitoring" -} - - -resource "helm_release" "pfn-addons" { - depends_on = [ - helm_release.fullnode - ] - name = "pfn-addons" - chart = local.pfn_addons_helm_chart_path - max_history = 10 - wait = false - namespace = var.k8s_namespace - values = [ - jsonencode({ - service = { - domain = local.domain - } - ingress = { - class = "gce" - gce_managed_certificate = var.create_google_managed_ssl_certificate ? "aptos-${local.workspace_name}-ingress" : null - gce_managed_certificate_domains = var.create_google_managed_ssl_certificate ? join(",", concat([for x in range(var.num_fullnodes) : "pfn${x}.${local.domain}"], [local.domain], var.tls_sans)) : "" - # loadBalancerSourceRanges = var.client_sources_ipv4 # not supported yet - } - }), - jsonencode(var.pfn_helm_values), - ] - - # inspired by https://stackoverflow.com/a/66501021 to trigger redeployment whenever any of the charts file contents change. - set { - name = "chart_sha1" - value = sha1(join("", [for f in fileset(local.pfn_addons_helm_chart_path, "**") : filesha1("${local.pfn_addons_helm_chart_path}/${f}")])) - } + utility_nodeSelector = var.utility_instance_enable_taint ? 
{ + "cloud.google.com/gke-nodepool" = "utilities" + } : {} + utility_tolerations = [{ + key = "aptos.org/nodepool" + value = "utilities" + effect = "NoExecute" + }] } resource "helm_release" "fullnode" { @@ -88,10 +63,14 @@ resource "helm_release" "fullnode" { image = { tag = var.image_tag } - nodeSelector = var.gke_enable_node_autoprovisioning ? {} : { - "cloud.google.com/gke-nodepool" = "fullnodes" - "iam.gke.io/gke-metadata-server-enabled" = "true" - } + nodeSelector = var.fullnode_instance_enable_taint ? { + "cloud.google.com/gke-nodepool" = "fullnodes" + } : {} + tolerations = [{ + key = "aptos.org/nodepool" + value = "fullnodes" + effect = "NoExecute" + }] storage = { class = kubernetes_storage_class.ssd.metadata[0].name } @@ -103,7 +82,9 @@ resource "helm_release" "fullnode" { } backup = { # only enable backup for fullnode 0 - enable = count.index == var.backup_fullnode_index ? var.enable_backup : false + enable = count.index == var.backup_fullnode_index ? var.enable_backup : false + nodeSelector = local.utility_nodeSelector + tolerations = local.utility_tolerations config = { location = "gcs" gcs = { @@ -111,6 +92,14 @@ resource "helm_release" "fullnode" { } } } + backup_verify = { + nodeSelector = local.utility_nodeSelector + tolerations = local.utility_tolerations + } + backup_compaction = { + nodeSelector = local.utility_nodeSelector + tolerations = local.utility_tolerations + } restore = { config = { location = "gcs" @@ -139,49 +128,3 @@ resource "helm_release" "fullnode" { } } } - - - -resource "helm_release" "monitoring" { - count = var.enable_monitoring ? 1 : 0 - name = "aptos-monitoring" - chart = local.monitoring_helm_chart_path - max_history = 5 - wait = false - namespace = var.k8s_namespace - - - values = [ - jsonencode({ - chain = { - name = var.chain_name - } - fullnode = { - name = var.fullnode_name - } - service = { - domain = var.zone_name != "" ? 
trimsuffix(local.domain, ".") : "" - } - kube-state-metrics = { - enabled = var.enable_kube_state_metrics - } - prometheus-node-exporter = { - enabled = var.enable_prometheus_node_exporter - } - monitoring = { - prometheus = { - storage = { - class = "standard" - } - } - } - }), - jsonencode(var.monitoring_helm_values), - ] - - # inspired by https://stackoverflow.com/a/66501021 to trigger redeployment whenever any of the charts file contents change. - set { - name = "chart_sha1" - value = sha1(join("", [for f in fileset(local.monitoring_helm_chart_path, "**") : filesha1("${local.monitoring_helm_chart_path}/${f}")])) - } -} diff --git a/terraform/fullnode/gcp/main.tf b/terraform/fullnode/gcp/main.tf index 478577495fd7b..5a56adef7c8a9 100644 --- a/terraform/fullnode/gcp/main.tf +++ b/terraform/fullnode/gcp/main.tf @@ -11,19 +11,21 @@ provider "google-beta" { data "google_client_config" "provider" {} locals { - zone = "${var.region}-${var.zone}" workspace_name = var.workspace_name_override == "" ? terraform.workspace : var.workspace_name_override } resource "google_project_service" "services" { for_each = { "clouderrorreporting.googleapis.com" = true + "cloudkms.googleapis.com" = true "cloudresourcemanager.googleapis.com" = true "compute.googleapis.com" = true "container.googleapis.com" = true "iam.googleapis.com" = true "logging.googleapis.com" = true "monitoring.googleapis.com" = true + "secretmanager.googleapis.com" = true + "spanner.googleapis.com" = true } service = each.key disable_on_destroy = false diff --git a/terraform/fullnode/gcp/network.tf b/terraform/fullnode/gcp/network.tf index 41f24d0f4fafa..839e4a01474e0 100644 --- a/terraform/fullnode/gcp/network.tf +++ b/terraform/fullnode/gcp/network.tf @@ -22,15 +22,13 @@ resource "google_compute_router" "nat" { } resource "google_compute_address" "nat" { - count = var.gke_enable_private_nodes ? 
1 : 0 - name = "aptos-${terraform.workspace}-nat" + name = "aptos-${terraform.workspace}-nat" } resource "google_compute_router_nat" "nat" { - count = var.gke_enable_private_nodes ? 1 : 0 name = "aptos-${terraform.workspace}-nat" router = google_compute_router.nat.name nat_ip_allocate_option = "MANUAL_ONLY" - nat_ips = [google_compute_address.nat[0].self_link] + nat_ips = [google_compute_address.nat.self_link] source_subnetwork_ip_ranges_to_nat = "ALL_SUBNETWORKS_ALL_PRIMARY_IP_RANGES" } diff --git a/terraform/fullnode/gcp/security.tf b/terraform/fullnode/gcp/security.tf index 206c41b54e3d3..4addace8ff219 100644 --- a/terraform/fullnode/gcp/security.tf +++ b/terraform/fullnode/gcp/security.tf @@ -1,9 +1,6 @@ # Security-related resources -data "kubernetes_all_namespaces" "all" {} - locals { - kubernetes_master_version = substr(google_container_cluster.aptos.master_version, 0, 4) baseline_pss_labels = { "pod-security.kubernetes.io/audit" = "baseline" "pod-security.kubernetes.io/warn" = "baseline" @@ -11,27 +8,6 @@ locals { } } -# FIXME: Remove when migrating to K8s 1.25 -resource "kubernetes_role_binding" "disable-psp" { - for_each = toset(local.kubernetes_master_version <= "1.24" ? 
data.kubernetes_all_namespaces.all.namespaces : []) - metadata { - name = "privileged-psp" - namespace = each.value - } - - role_ref { - api_group = "rbac.authorization.k8s.io" - kind = "ClusterRole" - name = "gce:podsecuritypolicy:privileged" - } - - subject { - api_group = "rbac.authorization.k8s.io" - kind = "Group" - name = "system:serviceaccounts:${each.value}" - } -} - resource "kubernetes_labels" "pss-default" { api_version = "v1" kind = "Namespace" diff --git a/terraform/fullnode/gcp/variables.tf b/terraform/fullnode/gcp/variables.tf index 8dfc0195ef1cc..8025e9571c4f0 100644 --- a/terraform/fullnode/gcp/variables.tf +++ b/terraform/fullnode/gcp/variables.tf @@ -1,3 +1,5 @@ +### Project config + variable "project" { description = "GCP project" type = string @@ -11,44 +13,205 @@ variable "region" { variable "zone" { description = "GCP zone suffix" type = string + default = "" # if empty, it's a regional cluster } -variable "workspace_name_override" { - description = "If specified, overrides the usage of Terraform workspace for naming purposes" - default = "" +variable "node_locations" { + description = "List of node locations" + type = list(string) + default = [] # if empty, let GCP choose } -variable "tls_sans" { - description = "List of Subject Alternate Names to include in TLS certificate" - type = list(string) - default = [] +variable "manage_via_tf" { + description = "Whether to manage the aptos-node k8s workload via Terraform. 
If set to false, the helm_release resource will still be created and updated when values change, but it may not be updated on every apply" + type = bool + default = true } +### DNS + variable "workspace_dns" { description = "Include Terraform workspace name in DNS records" + type = bool default = true } variable "dns_prefix_name" { description = "DNS prefix for fullnode url" + type = string default = "fullnode" } variable "zone_name" { description = "Zone name of GCP Cloud DNS zone to create records in" + type = string default = "" } variable "zone_project" { description = "GCP project which the DNS zone is in (if different)" + type = string default = "" } variable "create_google_managed_ssl_certificate" { description = "Whether to create a Google Managed SSL Certificate for the GCE Ingress" + type = bool + default = false +} + +variable "backend_http2" { + description = "Whether to enable HTTP/2 between Ingress and backends" + type = bool + default = false +} + +### Node pools and Autoscaling + +variable "node_pool_sizes" { + type = map(number) + default = {} + description = "Override the number of nodes in the specified pool" +} + +variable "instance_disk_sizes" { + type = map(number) + default = {} + description = "Override the disk size in the specified pool" +} + +variable "default_disk_size_gb" { + description = "Default disk size for nodes" + type = number + default = 100 +} + +variable "default_disk_type" { + description = "Default disk type for nodes" + type = string + default = "pd-standard" +} + +variable "create_nodepools" { + description = "Create managed nodepools" + type = bool + default = true +} + +variable "nodepool_sysctls" { + description = "Sysctls to set on nodepools" + type = map(string) + default = {} +} + +variable "core_instance_type" { + description = "Instance type used for core pods" + type = string + default = "e2-medium" +} + +variable "utility_instance_type" { + description = "Instance type used for utility pods" + type = string + 
default = "e2-standard-8" +} + +variable "fullnode_instance_type" { + description = "Instance type used for validator and fullnodes" + type = string + default = "t2d-standard-16" +} + +variable "utility_instance_enable_taint" { + description = "Whether to taint instances in the utilities nodegroup" + type = bool default = false } +variable "fullnode_instance_enable_taint" { + description = "Whether to taint instances in the validator nodegroup" + type = bool + default = true +} + +variable "gke_enable_node_autoprovisioning" { + description = "Enable GKE node autoprovisioning" + type = bool + default = true +} + +variable "gke_node_autoprovisioning_max_cpu" { + description = "Maximum CPU allocation for GKE node autoprovisioning" + type = number + default = 500 +} + +variable "gke_node_autoprovisioning_max_memory" { + description = "Maximum memory allocation for GKE node autoprovisioning" + type = number + default = 2000 +} + +variable "gke_autoscaling_profile" { + description = "Autoscaling profile for GKE cluster. 
See https://cloud.google.com/kubernetes-engine/docs/concepts/cluster-autoscaler#autoscaling_profiles" + type = string + default = "OPTIMIZE_UTILIZATION" +} + +variable "gke_autoscaling_max_node_count" { + description = "Maximum number of nodes for GKE nodepool autoscaling" + type = number + default = 250 +} + +### Naming overrides + +variable "helm_release_name_override" { + description = "If set, overrides the name of the aptos-node helm chart" + type = string + default = "" +} + +variable "workspace_name_override" { + description = "If specified, overrides the usage of Terraform workspace for naming purposes" + type = string + default = "" +} + +### GKE cluster config + +variable "enable_clouddns" { + description = "Enable CloudDNS (Google-managed cluster DNS)" + type = bool + default = false +} + +variable "enable_image_streaming" { + description = "Enable image streaming (GCFS)" + type = bool + default = false +} + +variable "gke_maintenance_policy" { + description = "The maintenance policy to use for the cluster. 
See https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/container_cluster#maintenance_policy" + type = object({ + recurring_window = object({ + start_time = string + end_time = string + recurrence = string + }) + }) + default = { + recurring_window = { + start_time = "2023-06-15T00:00:00Z" + end_time = "2023-06-15T23:59:00Z" + recurrence = "FREQ=DAILY" + } + } +} + +### Helm + variable "helm_values" { description = "Map of values to pass to Helm" type = any @@ -74,47 +237,44 @@ variable "fullnode_helm_values_list" { } variable "k8s_namespace" { - default = "aptos" description = "Kubernetes namespace that the fullnode will be deployed into" + type = string + default = "aptos" } variable "k8s_api_sources" { description = "List of CIDR subnets which can access the Kubernetes API endpoint" + type = list(string) default = ["0.0.0.0/0"] } variable "num_fullnodes" { - default = 1 description = "Number of fullnodes" -} - -variable "num_extra_instance" { - default = 0 - description = "Number of extra instances to add into node pool" -} - -variable "instance_disk_size_gb" { - default = 100 - description = "Disk size for fullnode instance" + type = number + default = 1 } variable "image_tag" { - default = "devnet" description = "Docker image tag to use for the fullnode" + type = string + default = "devnet" } variable "era" { description = "Chain era, used to start a clean chain" + type = number default = 1 } variable "chain_id" { - description = "aptos chain ID" + description = "Aptos chain ID" + type = string default = "DEVNET" } variable "chain_name" { description = "Aptos chain name" + type = string default = "devnet" } @@ -123,79 +283,28 @@ variable "fullnode_name" { type = string } -variable "machine_type" { - description = "Machine type for running fullnode" - default = "n2-standard-32" -} +### Addons variable "enable_backup" { - description = "enable data backup from fullnode" + description = "Enable data backup from fullnode" + type = bool 
default = false } variable "enable_public_backup" { - description = "provide data backups to the public" + description = "Provide data backups to the public" + type = bool default = false } - variable "backup_fullnode_index" { - description = "index of fullnode to backup data from" + description = "Index of fullnode to backup data from" + type = number default = 0 } -variable "enable_monitoring" { - description = "Enable monitoring helm chart" - default = false -} - -variable "monitoring_helm_values" { - description = "Map of values to pass to monitoring Helm" - type = any - default = {} -} - -variable "enable_prometheus_node_exporter" { - description = "Enable prometheus-node-exporter within monitoring helm chart" - default = false -} - -variable "enable_kube_state_metrics" { - description = "Enable kube-state-metrics within monitoring helm chart" - default = false -} - -variable "gke_enable_private_nodes" { - description = "Enable private nodes for GKE cluster" - default = true -} - -variable "gke_enable_node_autoprovisioning" { - description = "Enable node autoprovisioning for GKE cluster. See https://cloud.google.com/kubernetes-engine/docs/how-to/node-auto-provisioning" - default = false -} - -variable "gke_node_autoprovisioning_max_cpu" { - description = "Maximum CPU utilization for GKE node_autoprovisioning" - default = 10 -} - -variable "gke_node_autoprovisioning_max_memory" { - description = "Maximum memory utilization for GKE node_autoprovisioning" - default = 100 -} - -variable "gke_enable_autoscaling" { - description = "Enable autoscaling for the nodepools in the GKE cluster. See https://cloud.google.com/kubernetes-engine/docs/concepts/cluster-autoscaler" - default = true -} - -variable "gke_autoscaling_max_node_count" { - description = "Maximum number of nodes for GKE nodepool autoscaling" - default = 10 -} - -variable "manage_via_tf" { - description = "Whether to manage the aptos-node k8s workload via Terraform. 
If set to false, the helm_release resource will still be created and updated when values change, but it may not be updated on every apply" - default = true +variable "tls_sans" { + description = "List of Subject Alternate Names to include in TLS certificate" + type = list(string) + default = [] } diff --git a/terraform/fullnode/gcp/versions.tf b/terraform/fullnode/gcp/versions.tf index f88de6cbc2a7e..b74536e8f4cc8 100644 --- a/terraform/fullnode/gcp/versions.tf +++ b/terraform/fullnode/gcp/versions.tf @@ -1,11 +1,13 @@ terraform { - required_version = "~> 1.3.6" + required_version = "~> 1.5.6" required_providers { google = { - source = "hashicorp/google" + source = "hashicorp/google" + version = "~> 5.0.0" } google-beta = { - source = "hashicorp/google-beta" + source = "hashicorp/google-beta" + version = "~> 5.0.0" } helm = { source = "hashicorp/helm" diff --git a/terraform/fullnode/vultr/variables.tf b/terraform/fullnode/vultr/variables.tf index 7bc3d6e8bb3ad..cf98dda3900bb 100644 --- a/terraform/fullnode/vultr/variables.tf +++ b/terraform/fullnode/vultr/variables.tf @@ -17,57 +17,68 @@ variable "fullnode_helm_values_list" { } variable "k8s_namespace" { - default = "aptos" description = "Kubernetes namespace that the fullnode will be deployed into" + type = string + default = "aptos" } variable "k8s_api_sources" { description = "List of CIDR subnets which can access the Kubernetes API endpoint" + type = list(string) default = ["0.0.0.0/0"] } variable "num_fullnodes" { - default = 1 description = "Number of fullnodes" + type = number + default = 1 } variable "image_tag" { - default = "devnet" description = "Docker image tag to use for the fullnode" + type = string + default = "devnet" } variable "era" { description = "Chain era, used to start a clean chain" + type = number default = 1 } variable "chain_id" { - description = "aptos chain ID" + description = "Aptos chain ID" + type = string default = "DEVNET" } variable "chain_name" { description = "Aptos chain name" 
+ type = string default = "devnet" } variable "machine_type" { description = "Machine type for running fullnode. All configurations can be obtained at https://www.vultr.com/api/#tag/plans" + type = string default = "vc2-16c-32gb" } variable "api_key" { description = "API Key, can be obtained at https://my.vultr.com/settings/#settingsapi" + type = string default = "" } variable "fullnode_region" { description = "Geographical region for the node location. All 25 regions can be obtained at https://api.vultr.com/v2/regions" + type = string default = "fra" } variable "block_storage_class" { description = "Either vultr-block-storage for high_perf/ssd, vultr-block-storage-hdd for storage_opt/hdd. high_perf is not available in all regions!" + type = string default = "vultr-block-storage" } diff --git a/terraform/helm/aptos-node/README.md b/terraform/helm/aptos-node/README.md index 94f45f063f02c..c60abcae4cea9 100644 --- a/terraform/helm/aptos-node/README.md +++ b/terraform/helm/aptos-node/README.md @@ -53,19 +53,19 @@ Aptos blockchain node deployment | labels | string | `nil` | | | loadTestGenesis | bool | `false` | Load test-data for starting a test network | | manageImages | bool | `true` | If true, helm will always override the deployed image with what is configured in the helm values. If not, helm will take the latest image from the currently running workloads, which is useful if you have a separate procedure to update images (e.g. rollout) | -| multicluster | object | `{"enabled":false,"targetClusters":["cluster1","cluster2","cluster3"]}` | Options for multicluster mode. This is *experimental only*. | +| multicluster | object | `{"enabled":false,"targetClusters":["forge-multiregion-1","forge-multiregion-2","forge-multiregion-3"]}` | Options for multicluster mode. This is *experimental only*. 
| | numFullnodeGroups | int | `1` | Total number of fullnode groups to deploy | | numValidators | int | `1` | Number of validators to deploy | | overrideNodeConfig | bool | `false` | Specify validator and fullnode NodeConfigs via named ConfigMaps, rather than the generated ones from this chart. | | service.domain | string | `nil` | If set, the base domain name to use for External DNS | -| service.fullnode.enableMetricsPort | bool | `true` | Enable the metrics port on fullnodes | +| service.fullnode.enableMetricsPort | bool | `false` | Enable the metrics port on fullnodes | | service.fullnode.enableRestApi | bool | `true` | Enable the REST API on fullnodes | | service.fullnode.external.type | string | `"LoadBalancer"` | The Kubernetes ServiceType to use for fullnodes' HAProxy | | service.fullnode.externalTrafficPolicy | string | `"Local"` | The externalTrafficPolicy for the fullnode service | | service.fullnode.internal.headless | bool | `false` | | | service.fullnode.internal.type | string | `"ClusterIP"` | The Kubernetes ServiceType to use for fullnodes | | service.fullnode.loadBalancerSourceRanges | string | `nil` | If set and if the ServiceType is LoadBalancer, allow traffic to fullnodes from these CIDRs | -| service.validator.enableMetricsPort | bool | `true` | Enable the metrics port on the validator | +| service.validator.enableMetricsPort | bool | `false` | Enable the metrics port on the validator | | service.validator.enableRestApi | bool | `true` | Enable the REST API on the validator | | service.validator.external.type | string | `"LoadBalancer"` | The Kubernetes ServiceType to use for validator's HAProxy | | service.validator.externalTrafficPolicy | string | `"Local"` | The externalTrafficPolicy for the validator service | @@ -83,7 +83,6 @@ Aptos blockchain node deployment | validator.image.tag | string | `nil` | Image tag to use for validator images. 
If set, overrides `imageTag` | | validator.name | string | `nil` | Internal: name of your validator for use in labels | | validator.nodeSelector | object | `{}` | | -| validator.remoteLogAddress | string | `nil` | Address for remote logging. See `logger` helm chart | | validator.resources.limits.cpu | float | `15.5` | | | validator.resources.limits.memory | string | `"26Gi"` | | | validator.resources.requests.cpu | int | `15` | | diff --git a/terraform/helm/aptos-node/files/haproxy.cfg b/terraform/helm/aptos-node/files/haproxy.cfg index b44601844c7af..268cc783df810 100644 --- a/terraform/helm/aptos-node/files/haproxy.cfg +++ b/terraform/helm/aptos-node/files/haproxy.cfg @@ -144,6 +144,21 @@ backend validator-metrics default-server maxconn 16 server {{ include "aptos-validator.fullname" $ }}-{{ $.Values.i }}-validator {{ include "aptos-validator.fullname" $ }}-{{ $.Values.i }}-validator:9101 +frontend validator-admin + mode http + option httplog + bind :9202 + default_backend validator-admin + + # Deny requests from blocked IPs + tcp-request connection reject if { src -n -f /usr/local/etc/haproxy/blocked.ips } + http-request add-header Forwarded "for=%ci" + +backend validator-admin + mode http + default-server maxconn 16 + server {{ include "aptos-validator.fullname" $ }}-{{ $.Values.i }}-validator {{ include "aptos-validator.fullname" $ }}-{{ $.Values.i }}-validator:9102 + # Exposes the validator's own REST API {{- if $.Values.service.validator.enableRestApi }} frontend validator-api @@ -235,6 +250,21 @@ backend {{ $config.name }}-metrics default-server maxconn 16 server {{ include "aptos-validator.fullname" $ }}-{{ $.Values.i }}-{{ $config.name }} {{ include "aptos-validator.fullname" $ }}-{{ $.Values.i }}-{{ $config.name }}:9101 +frontend {{ $config.name }}-admin + mode http + option httplog + bind :{{ add 9203 $index }} + default_backend {{ $config.name }}-admin + + # Deny requests from blocked IPs + tcp-request connection reject if { src -n -f 
/usr/local/etc/haproxy/blocked.ips } + http-request add-header Forwarded "for=%ci" + +backend {{ $config.name }}-admin + mode http + default-server maxconn 16 + server {{ include "aptos-validator.fullname" $ }}-{{ $.Values.i }}-{{ $config.name }} {{ include "aptos-validator.fullname" $ }}-{{ $.Values.i }}-{{ $config.name }}:9102 + {{- end }} {{- end }} diff --git a/terraform/helm/aptos-node/templates/fullnode.yaml b/terraform/helm/aptos-node/templates/fullnode.yaml index 717ae39d561d1..3e917059169e1 100644 --- a/terraform/helm/aptos-node/templates/fullnode.yaml +++ b/terraform/helm/aptos-node/templates/fullnode.yaml @@ -24,9 +24,33 @@ spec: port: 6182 - name: metrics port: 9101 + - name: admin + port: 9102 - name: api port: 8080 +{{- if $.Values.migrations.enable_vfn_explicit_pvc }} +--- + +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: {{ include "aptos-validator.fullname" $ }}-{{$i}}-{{ .name }}-e{{ $.Values.chain.era }} + labels: + {{- include "aptos-validator.labels" $ | nindent 4 }} +spec: + accessModes: + - ReadWriteOnce + storageClassName: {{ $.Values.fullnode.storage.class }} + resources: + requests: + storage: {{ $.Values.fullnode.storage.size }} + {{- if $.Values.fullnode.storage.labels }} + selector: + matchLabels: + {{- toYaml $.Values.fullnode.storage.labels | nindent 6}} + {{- end }} +{{- end }} --- {{ $fullnode_statefulset := lookup "apps/v1" "StatefulSet" $.Release.Namespace (printf "%s-%d-%s-e%s" (include "aptos-validator.fullname" $) $i .name (toYaml $.Values.chain.era)) }} apiVersion: apps/v1 @@ -49,6 +73,7 @@ spec: app.kubernetes.io/name: fullnode app.kubernetes.io/instance: fullnode-{{$i}} group: {{ .name }} + {{- if not $.Values.migrations.enable_vfn_explicit_pvc }} volumeClaimTemplates: - metadata: name: fn @@ -64,6 +89,7 @@ spec: matchLabels: {{- toYaml $.Values.fullnode.storage.labels | nindent 10}} {{- end }} + {{- end }} template: metadata: labels: @@ -86,7 +112,18 @@ spec: image: {{ $.Values.validator.image.repo }}:{{ 
$.Values.validator.image.tag | default $.Values.imageTag }} {{- end }} imagePullPolicy: {{ $.Values.validator.image.pullPolicy }} - command: ["/usr/local/bin/aptos-node", "-f", "/opt/aptos/etc/fullnode.yaml"] + command: + - /bin/bash + - -c + - |- + set -euxo pipefail + if [[ -f /opt/aptos/data/wipe-db ]]; then + # Wipe DB + rm -rf /opt/aptos/data/db + # Delete the command file so we only wipe the DB once + rm -vf /opt/aptos/data/wipe-db + fi + /usr/local/bin/aptos-node -f /opt/aptos/etc/fullnode.yaml {{- with $.Values.fullnode }} resources: {{- toYaml .resources | nindent 10 }} @@ -113,7 +150,11 @@ spec: mountPath: /opt/aptos/etc - name: genesis-config mountPath: /opt/aptos/genesis + {{- if $.Values.migrations.enable_vfn_explicit_pvc }} + - name: aptos-data + {{- else }} - name: fn + {{- end }} mountPath: /opt/aptos/data ports: - containerPort: 6181 @@ -122,6 +163,8 @@ spec: name: api - containerPort: 9101 name: metrics + - containerPort: 9102 + name: admin securityContext: {{- if $.Values.enablePrivilegedMode }} runAsUser: 0 @@ -165,6 +208,11 @@ spec: - name: genesis-config secret: secretName: {{ include "aptos-validator.fullname" $ }}-{{$i}}-genesis-e{{ $.Values.chain.era }} + {{- if $.Values.migrations.enable_vfn_explicit_pvc }} + - name: aptos-data + persistentVolumeClaim: + claimName: {{ include "aptos-validator.fullname" $ }}-{{$i}}-{{ .name }}-e{{ $.Values.chain.era }} + {{- end }} serviceAccountName: {{ include "aptos-validator.fullname" $ }}-fullnode {{- if $.Values.imagePullSecret }} imagePullSecrets: diff --git a/terraform/helm/aptos-node/templates/haproxy.yaml b/terraform/helm/aptos-node/templates/haproxy.yaml index ff93bee17a356..ef2ce118cb7e2 100644 --- a/terraform/helm/aptos-node/templates/haproxy.yaml +++ b/terraform/helm/aptos-node/templates/haproxy.yaml @@ -26,7 +26,7 @@ metadata: service.beta.kubernetes.io/aws-load-balancer-type: nlb service.beta.kubernetes.io/oci-load-balancer-security-list-management-mode: All {{- if $.Values.service.domain }} 
- external-dns.alpha.kubernetes.io/hostname: val{{$i}}.{{ $.Values.service.domain }} + external-dns.alpha.kubernetes.io/hostname: vn{{$i}}.{{ $.Values.service.domain }},val{{$i}}.{{ $.Values.service.domain }} {{- end }} spec: selector: @@ -41,6 +41,11 @@ spec: port: 9101 targetPort: 9102 {{- end }} + {{- if $.Values.service.validator.enableAdminPort }} + - name: admin + port: 9102 + targetPort: 9202 + {{- end }} {{- if $.Values.service.validator.enableRestApi }} - name: api port: 80 @@ -69,7 +74,7 @@ metadata: service.beta.kubernetes.io/aws-load-balancer-type: nlb service.beta.kubernetes.io/oci-load-balancer-security-list-management-mode: All {{- if $.Values.service.domain }} - external-dns.alpha.kubernetes.io/hostname: {{ $config.name }}{{$i}}.{{ $.Values.service.domain }} + external-dns.alpha.kubernetes.io/hostname: {{ $config.dns_name }}{{$i}}.{{ $.Values.service.domain }},{{ $config.name }}{{$i}}.{{ $.Values.service.domain }} {{- end }} spec: selector: @@ -85,6 +90,11 @@ spec: port: 9101 targetPort: {{ add 9103 $index }} {{- end }} + {{- if $.Values.service.fullnode.enableAdminPort }} + - name: admin + port: 9102 + targetPort: {{ add 9203 $index }} + {{- end }} {{- if $.Values.service.fullnode.enableRestApi }} - name: api port: 80 @@ -146,14 +156,24 @@ spec: imagePullPolicy: {{ .image.pullPolicy }} resources: {{- toYaml .resources | nindent 10 }} + # These ports are exposed by HAProxy. 
See haproxy.cfg for more details + # Fullnode ports are dynamically assigned based on the number of fullnode groups ports: - - containerPort: 6180 - - containerPort: 6182 + # Aptosnet + - containerPort: 6180 # validator + - containerPort: 6182 # fullnode + # Fullnode API - containerPort: 8080 + # Validator API - containerPort: 8180 + # HAProxy metrics port - containerPort: 9101 + # Node ports - containerPort: 9102 - containerPort: 9103 + # AdminService ports + - containerPort: 9202 # validator admin + - containerPort: 9203 # fullnode admin volumeMounts: - name: haproxy-config mountPath: /usr/local/etc/haproxy diff --git a/terraform/helm/aptos-node/templates/validator.yaml b/terraform/helm/aptos-node/templates/validator.yaml index 9b6514e81c46f..7d42270632819 100644 --- a/terraform/helm/aptos-node/templates/validator.yaml +++ b/terraform/helm/aptos-node/templates/validator.yaml @@ -23,6 +23,8 @@ spec: port: 6181 - name: metrics port: 9101 + - name: admin + port: 9102 {{- if $.Values.service.validator.enableRestApi }} - name: api port: 8080 @@ -91,7 +93,18 @@ spec: {{- end }} {{- with $.Values.validator }} imagePullPolicy: {{ .image.pullPolicy }} - command: ["/usr/local/bin/aptos-node", "-f", "/opt/aptos/etc/validator.yaml"] + command: + - /bin/bash + - -c + - |- + set -euxo pipefail + if [[ -f /opt/aptos/data/wipe-db ]]; then + # Wipe DB + rm -rf /opt/aptos/data/db + # Delete the command file so we only wipe the DB once + rm -vf /opt/aptos/data/wipe-db + fi + /usr/local/bin/aptos-node -f /opt/aptos/etc/validator.yaml resources: {{- toYaml .resources | nindent 10 }} env: @@ -123,6 +136,8 @@ spec: name: api - containerPort: 9101 name: metrics + - containerPort: 9102 + name: admin securityContext: {{- if $.Values.enablePrivilegedMode }} runAsUser: 0 diff --git a/terraform/helm/aptos-node/values.yaml b/terraform/helm/aptos-node/values.yaml index 11e92886690d5..5ae8433a9e977 100644 --- a/terraform/helm/aptos-node/values.yaml +++ b/terraform/helm/aptos-node/values.yaml @@ 
-39,11 +39,11 @@ haproxy: pullPolicy: IfNotPresent resources: limits: - cpu: 4 - memory: 8Gi + cpu: 3 + memory: 6Gi requests: - cpu: 4 - memory: 8Gi + cpu: 3 + memory: 6Gi nodeSelector: {} tolerations: [] affinity: {} @@ -74,11 +74,11 @@ validator: pullPolicy: IfNotPresent resources: limits: - cpu: 15.5 - memory: 26Gi + cpu: 14 + memory: 56Gi requests: - cpu: 15 - memory: 26Gi + cpu: 14 + memory: 56Gi storage: # -- Kubernetes storage class to use for validator persistent storage class: @@ -86,8 +86,6 @@ validator: size: 2048Gi # -- Log level for the validator rust_log: info - # -- Address for remote logging. See `logger` helm chart - remoteLogAddress: # -- Flag to force enable telemetry service (useful for forge tests) force_enable_telemetry: false nodeSelector: {} @@ -103,14 +101,15 @@ fullnode: # -- Specify fullnode groups by `name` and number of `replicas` groups: - name: fullnode + dns_name: vfn replicas: 1 resources: limits: - cpu: 15.5 - memory: 26Gi + cpu: 14 + memory: 56Gi requests: - cpu: 15 - memory: 26Gi + cpu: 14 + memory: 56Gi storage: # -- Kubernetes storage class to use for fullnode persistent storage class: @@ -131,6 +130,7 @@ fullnode: full_node_networks: # The first item in the array `full_node_networks` must always refer to the public fullnode network - network_id: "public" + seeds: {} service: # -- If set, the base domain name to use for External DNS @@ -151,6 +151,8 @@ service: enableRestApi: true # -- Enable the metrics port on the validator enableMetricsPort: false + # -- Enable the admin port on the validator + enableAdminPort: false fullnode: external: # -- The Kubernetes ServiceType to use for fullnodes' HAProxy @@ -167,6 +169,8 @@ service: enableRestApi: true # -- Enable the metrics port on fullnodes enableMetricsPort: false + # -- Enable the admin port on fullnodes + enableAdminPort: false serviceAccount: # -- Specifies whether a service account should be created @@ -182,3 +186,9 @@ enablePrivilegedMode: false # Additional labels labels: 
+ +# Infra migrations +migrations: + # -- Explicitly define a PVC for VFNs. + # -- See templates/fullnode.yaml + enable_vfn_explicit_pvc: false diff --git a/terraform/helm/autoscaling/templates/dns.yaml b/terraform/helm/autoscaling/templates/dns.yaml index 13da61e39c714..eb4a0fb5a8b1c 100644 --- a/terraform/helm/autoscaling/templates/dns.yaml +++ b/terraform/helm/autoscaling/templates/dns.yaml @@ -1,4 +1,4 @@ -apiVersion: autoscaling/v2beta2 +apiVersion: autoscaling/v2 kind: HorizontalPodAutoscaler metadata: name: hpa-coredns diff --git a/terraform/helm/autoscaling/values.yaml b/terraform/helm/autoscaling/values.yaml index 89c2175891dd3..bc2b6cd32c667 100644 --- a/terraform/helm/autoscaling/values.yaml +++ b/terraform/helm/autoscaling/values.yaml @@ -16,8 +16,8 @@ autoscaler: # How long after scale up that scale down evaluation resumes scaleDownDelayAfterAdd: 5m image: - repo: k8s.gcr.io/autoscaling/cluster-autoscaler - tag: v1.25.2 + repo: registry.k8s.io/autoscaling/cluster-autoscaler + tag: v1.25.1 resources: requests: cpu: 1 diff --git a/terraform/helm/fullnode/files/backup/gcs.yaml b/terraform/helm/fullnode/files/backup/gcs.yaml index b4a41de012e3f..561a00384c64a 100644 --- a/terraform/helm/fullnode/files/backup/gcs.yaml +++ b/terraform/helm/fullnode/files/backup/gcs.yaml @@ -5,12 +5,12 @@ commands: FILE_HANDLE="$BACKUP_HANDLE/$FILE_NAME" echo "$FILE_HANDLE" exec 1>&- # close stdout - gzip -c | gcloud storage cp - "gs://$BUCKET/$SUB_DIR/$FILE_HANDLE" > /dev/null - open_for_read: 'gcloud storage cp "gs://$BUCKET/$SUB_DIR/$FILE_HANDLE" - | gzip -cd' + gzip -c | gsutil -q cp - "gs://$BUCKET/$SUB_DIR/$FILE_HANDLE" > /dev/null + open_for_read: 'gsutil -q cp "gs://$BUCKET/$SUB_DIR/$FILE_HANDLE" - | gzip -cd' save_metadata_line: | FILE_HANDLE="metadata/$FILE_NAME" echo "$FILE_HANDLE" exec 1>&- - gzip -c | gcloud storage cp - "gs://$BUCKET/$SUB_DIR/$FILE_HANDLE" > /dev/null - list_metadata_files: '(gcloud storage ls gs://$BUCKET/$SUB_DIR/metadata/ ||:) | sed -ne 
"s#gs://.*/metadata/#metadata/#p"' - backup_metadata_file: 'gcloud storage mv gs://$BUCKET/$SUB_DIR/metadata/$FILE_NAME gs://$BUCKET/$SUB_DIR/metadata_backup/$FILE_NAME' + gzip -c | gsutil -q cp - "gs://$BUCKET/$SUB_DIR/$FILE_HANDLE" > /dev/null + list_metadata_files: '(gsutil -q ls gs://$BUCKET/$SUB_DIR/metadata/ ||:) | sed -ne "s#gs://.*/metadata/#metadata/#p"' + backup_metadata_file: 'gsutil mv gs://$BUCKET/$SUB_DIR/metadata/$FILE_NAME gs://$BUCKET/$SUB_DIR/metadata_backup/$FILE_NAME' diff --git a/terraform/helm/fullnode/templates/fullnode.yaml b/terraform/helm/fullnode/templates/fullnode.yaml index 3f0d08f22f90c..93e4abeb71d9a 100644 --- a/terraform/helm/fullnode/templates/fullnode.yaml +++ b/terraform/helm/fullnode/templates/fullnode.yaml @@ -25,6 +25,80 @@ spec: prometheus.io/port: "9101" spec: terminationGracePeriodSeconds: 0 + initContainers: + {{- with .Values.restore }} + {{- if .enabled }} + - name: restore + image: {{ .image.repo }}:{{ .image.tag | default $.Values.imageTag }} + imagePullPolicy: {{ .image.pullPolicy }} + resources: + {{- toYaml .resources | nindent 10 }} + command: + - /bin/bash + - -c + - |- + set -euxo pipefail + # cleanup aptosdb + if [ -f /opt/aptos/data/restore-failed ] || \ + [ ! -f /opt/aptos/data/restore-uid ] || \ + [ "$(cat /opt/aptos/data/restore-uid)" != "{{ .config.restore_epoch }}" ]; then + rm -rf /opt/aptos/data/db /opt/aptos/data/restore-{complete,failed} + echo "{{ .config.restore_epoch }}" > /opt/aptos/data/restore-uid + fi + + [ -f /opt/aptos/data/restore-complete ] && exit 0 + # start restore process + /usr/local/bin/aptos-debugger aptos-db restore bootstrap-db \ + --concurrent-downloads {{ .config.concurrent_downloads }} \ + {{ range .config.trusted_waypoints }} --trust-waypoint {{ . 
}}{{ end }} \ + --target-db-dir /opt/aptos/data/db \ + --metadata-cache-dir /opt/aptos/data/aptos-restore-metadata \ + --ledger-history-start-version {{ .config.start_version }} \ + {{- if .config.target_version }} --target-version {{- .config.target_version }}{{- end }} + --command-adapter-config /opt/aptos/etc/{{ .config.location }}.yaml + + if [ $? -gt 0 ]; then + # mark restore as failed + touch /opt/aptos/data/restore-failed + exit 1 + else + # success, remove the marker + rm -f /opt/aptos/data/restore-failed + touch /opt/aptos/data/restore-complete + fi + env: + - name: RUST_LOG + value: "debug" + - name: RUST_BACKTRACE + value: "full" + {{- if (include "backup.pushMetricsEndpoint" $) }} + - name: KUBERNETES_POD_NAME + valueFrom: + fieldRef: + fieldPath: metadata.name + - name: PUSH_METRICS_ENDPOINT + value: "{{- include "backup.pushMetricsEndpoint" $ }}/api/v1/import/prometheus?extra_label=role={{- .jobName | default "db_restore" }}&extra_label=kubernetes_pod_name=$(KUBERNETES_POD_NAME)" + {{- end }} + - name: CONTROLLER_UID + valueFrom: + fieldRef: + fieldPath: "metadata.labels['controller-uid']" + {{- include "backup.backupEnvironment" (dict "config" .config "era" (default $.Values.chain.era .config.restore_era)) | nindent 8 }} + volumeMounts: + - name: backup-config + mountPath: /opt/aptos/etc + - name: aptos-data + mountPath: /opt/aptos/data + - name: tmp + mountPath: /tmp + securityContext: + readOnlyRootFilesystem: true + allowPrivilegeEscalation: false + capabilities: + drop: + - ALL + {{- end }} # if .enabled + {{- end }} # with .Values.restore containers: - name: fullnode {{- if and $fullnode_statefulset (not $.Values.manageImages) }} # if the statefulset already exists and we do not want helm to simply overwrite the image, use the existing image @@ -34,10 +108,16 @@ spec: {{- end }} imagePullPolicy: {{ .Values.image.pullPolicy }} command: - - /bin/sh + - /bin/bash - -c - |- - set -e + set -euxo pipefail + if [[ -f /opt/aptos/data/wipe-db ]]; then + 
# Wipe DB + rm -rf /opt/aptos/data/db + # Delete the command file so we only wipe the DB once + rm -vf /opt/aptos/data/wipe-db + fi {{- if and (not .Values.chain.genesisConfigmap) (not .Values.chain.genesisSecret) }} # Download genesis and waypoint if necessary curl -o /opt/aptos/genesis/waypoint.txt {{ (get .Values.aptos_chains .Values.chain.name).waypoint_txt_url }} @@ -73,6 +153,8 @@ spec: name: api - containerPort: 9101 name: metrics + - containerPort: 9102 + name: admin # NOTE: these require the API to be enabled, which is not always the case livenessProbe: # restart the pod if the REST API is ever unresponsive httpGet: @@ -123,6 +205,9 @@ spec: - name: aptos-data persistentVolumeClaim: claimName: {{ include "aptos-fullnode.fullname" . }}-e{{ .Values.chain.era }} + - name: backup-config + configMap: + name: {{ include "backup.fullname" . }}-backup - name: tmp emptyDir: {} serviceAccountName: {{ include "aptos-fullnode.serviceAccountName" . }} diff --git a/terraform/helm/fullnode/templates/restore.yaml b/terraform/helm/fullnode/templates/restore.yaml deleted file mode 100644 index 575167ac3530c..0000000000000 --- a/terraform/helm/fullnode/templates/restore.yaml +++ /dev/null @@ -1,107 +0,0 @@ -{{ $restore_job_suffix := randAlpha 4 | lower }} -{{ $backup_restore_job := lookup "batch/v1" "Job" $.Release.Namespace (print (include "backup.fullname" .) "-restore-" $restore_job_suffix) }} -apiVersion: batch/v1 -kind: Job -metadata: - name: {{ include "backup.fullname" . }}-restore-{{ $restore_job_suffix }} - labels: - {{- include "backup.labels" . | nindent 4 }} - app.kubernetes.io/name: restore -spec: - completions: 0 - template: - metadata: - labels: - {{- include "backup.selectorLabels" . 
| nindent 8 }} - app.kubernetes.io/name: restore - annotations: - seccomp.security.alpha.kubernetes.io/pod: runtime/default - spec: - restartPolicy: Never - terminationGracePeriodSeconds: 0 - {{- with .Values.restore }} - containers: - - name: restore - {{- if and $backup_restore_job (not $.Values.manageImages) }} # if the statefulset already exists and we do not want helm to simply overwrite the image, use the existing image - image: {{ (first $backup_restore_job.spec.template.spec.containers).image }} - {{- else }} - image: {{ .image.repo }}:{{ .image.tag | default $.Values.imageTag }} - {{- end }} - imagePullPolicy: {{ .image.pullPolicy }} - resources: - {{- toYaml .resources | nindent 10 }} - command: - - sh - - -c - - |- - set -ex - # cleanup aptosdb - if [ ! -f /opt/aptos/data/restore-uid ] || [ "$(cat /opt/aptos/data/restore-uid)" != "$CONTROLLER_UID" ]; then - rm -rf /opt/aptos/data/db - echo "$CONTROLLER_UID" > /opt/aptos/data/restore-uid - fi - # start restore process - /usr/local/bin/aptos-debugger aptos-db restore bootstrap-db --concurrent-downloads {{ .config.concurrent_downloads }}{{ range .config.trusted_waypoints }} --trust-waypoint {{ . 
}}{{ end }} --target-db-dir /opt/aptos/data/db --metadata-cache-dir /tmp/aptos-restore-metadata --command-adapter-config /opt/aptos/etc/{{ .config.location }}.yaml - env: - - name: RUST_LOG - value: "debug" - - name: RUST_BACKTRACE - value: "1" - {{- if (include "backup.pushMetricsEndpoint" $) }} - - name: KUBERNETES_POD_NAME - valueFrom: - fieldRef: - fieldPath: metadata.name - - name: PUSH_METRICS_ENDPOINT - value: "{{- include "backup.pushMetricsEndpoint" $ }}/api/v1/import/prometheus?extra_label=role={{- .jobName | default "db_restore" }}&extra_label=kubernetes_pod_name=$(KUBERNETES_POD_NAME)" - {{- end }} - - name: CONTROLLER_UID - valueFrom: - fieldRef: - fieldPath: "metadata.labels['controller-uid']" - {{- include "backup.backupEnvironment" (dict "config" .config "era" (default $.Values.chain.era .config.restore_era)) | nindent 8 }} - volumeMounts: - - name: backup-config - mountPath: /opt/aptos/etc - - name: aptos-data - mountPath: /opt/aptos/data - - name: tmp - mountPath: /tmp - securityContext: - readOnlyRootFilesystem: true - allowPrivilegeEscalation: false - capabilities: - drop: - - ALL - securityContext: - runAsNonRoot: true - runAsUser: 6180 - runAsGroup: 6180 - fsGroup: 6180 - {{- with .nodeSelector }} - nodeSelector: - {{- toYaml . | nindent 8 }} - {{- end }} - {{- with .affinity }} - affinity: - {{- toYaml . | nindent 8 }} - {{- end }} - {{- with .tolerations }} - tolerations: - {{- toYaml . | nindent 8 }} - {{- end }} - {{- end }} - volumes: - - name: backup-config - configMap: - name: {{ include "backup.fullname" . }}-backup - - name: tmp - emptyDir: {} - - name: aptos-data - persistentVolumeClaim: - claimName: {{ include "backup.persistentVolumeClaim" . }} - serviceAccountName: {{ include "backup.serviceAccount" . 
}} - {{- if .Values.imagePullSecret }} - imagePullSecrets: - - name: {{.Values.imagePullSecret}} - {{- end }} diff --git a/terraform/helm/fullnode/templates/service.yaml b/terraform/helm/fullnode/templates/service.yaml index 24ac09886a734..42cbba29fa79a 100644 --- a/terraform/helm/fullnode/templates/service.yaml +++ b/terraform/helm/fullnode/templates/service.yaml @@ -18,6 +18,16 @@ spec: port: 80 targetPort: 8080 {{- end }} + {{- if .Values.service.exposeMetrics }} + - name: metrics + port: 9101 + targetPort: 9101 + {{- end }} + {{- if .Values.service.exposeAdmin }} + - name: admin + port: 9102 + targetPort: 9102 + {{- end }} - name: aptosnet port: 6182 {{- with .Values.service }} diff --git a/terraform/helm/fullnode/values.yaml b/terraform/helm/fullnode/values.yaml index 59619c62b85d9..86f0f487f1d7e 100644 --- a/terraform/helm/fullnode/values.yaml +++ b/terraform/helm/fullnode/values.yaml @@ -32,6 +32,7 @@ fullnode: full_node_networks: # The first item in the array `full_node_networks` must always refer to the public fullnode network - network_id: "public" + seeds: {} identity: {} inbound_rate_limit_config: outbound_rate_limit_config: @@ -50,10 +51,10 @@ image: resources: limits: cpu: 14 - memory: 26Gi + memory: 56Gi requests: cpu: 14 - memory: 26Gi + memory: 56Gi nodeSelector: {} tolerations: [] @@ -70,6 +71,10 @@ service: type: ClusterIP # -- Whether to expose the node REST API exposeApi: true + # -- Whether to expose the metrics port on fullnodes + exposeMetrics: false + # -- Whether to expose the admin port on fullnodes + exposeAdmin: false # -- The externalTrafficPolicy for the fullnode service externalTrafficPolicy: # -- If set and if the ServiceType is LoadBalancer, allow traffic to fullnode from these CIDRs @@ -106,11 +111,11 @@ backup: pullPolicy: IfNotPresent resources: limits: - cpu: 1 - memory: 1Gi + cpu: 4 + memory: 8Gi requests: - cpu: 1 - memory: 1Gi + cpu: 4 + memory: 8Gi nodeSelector: {} tolerations: [] affinity: {} @@ -137,11 +142,11 @@ 
backup_verify: schedule: "@daily" resources: limits: - cpu: 4 - memory: 8Gi + cpu: 8 + memory: 32Gi requests: cpu: 4 - memory: 8Gi + memory: 16Gi nodeSelector: {} tolerations: [] affinity: {} @@ -151,11 +156,11 @@ backup_compaction: schedule: "@daily" resources: limits: - cpu: 1 - memory: 1Gi + cpu: 8 + memory: 32Gi requests: - cpu: 1 - memory: 1Gi + cpu: 4 + memory: 16Gi nodeSelector: {} tolerations: [] affinity: {} @@ -170,14 +175,15 @@ restore: pullPolicy: IfNotPresent resources: limits: - cpu: 6 - memory: 15Gi + cpu: 16 + memory: 120Gi requests: - cpu: 6 - memory: 15Gi + cpu: 16 + memory: 120Gi nodeSelector: {} tolerations: [] affinity: {} + enabled: false config: # -- Which of the below backup configurations to use location: @@ -192,6 +198,12 @@ restore: # -- List of trusted waypoints for restore trusted_waypoints: [] # -- Number of concurrent downloads for restore - concurrent_downloads: 2 + concurrent_downloads: 16 # -- If set, specifies a different era to restore other than the default era set in chain.era restore_era: + # -- Increase this value to trigger a restore from scratch, wiping the DB. + restore_epoch: 0 + # -- Start from genesis. + start_version: 0 + # -- Restore to the latest version. + target_version: diff --git a/terraform/helm/genesis/files/genesis.sh b/terraform/helm/genesis/files/genesis.sh index 7fb045f02bd3a..8e7d150f3b71f 100644 --- a/terraform/helm/genesis/files/genesis.sh +++ b/terraform/helm/genesis/files/genesis.sh @@ -32,26 +32,26 @@ echo $MULTICLUSTER_DOMAIN_SUFFIXES_STRING IFS=',' read -r -a MULTICLUSTER_DOMAIN_SUFFIXES <<< "${MULTICLUSTER_DOMAIN_SUFFIXES_STRING}" if ! 
[[ $(declare -p MULTICLUSTER_DOMAIN_SUFFIXES) =~ "declare -a" ]]; then - echo "MULTICLUSTER_DOMAIN_SUFFIXES must be an array" - exit 1 + echo "MULTICLUSTER_DOMAIN_SUFFIXES must be an array" + exit 1 fi if [[ "${ENABLE_MULTICLUSTER_DOMAIN_SUFFIX}" == "true" ]]; then - if [ -z ${NAMESPACE} ]; then - echo "NAMESPACE must be set" - exit 1 - fi + if [ -z ${NAMESPACE} ]; then + echo "NAMESPACE must be set" + exit 1 + fi fi if [ -z ${ERA} ] || [ -z ${NUM_VALIDATORS} ]; then - echo "ERA (${ERA:-null}) and NUM_VALIDATORS (${NUM_VALIDATORS:-null}) must be set" - exit 1 + echo "ERA (${ERA:-null}) and NUM_VALIDATORS (${NUM_VALIDATORS:-null}) must be set" + exit 1 fi -if [ "${FULLNODE_ENABLE_ONCHAIN_DISCOVERY}" = "true" ] && [ -z ${DOMAIN} ] || - [ "${VALIDATOR_ENABLE_ONCHAIN_DISCOVERY}" = "true" ] && [ -z ${DOMAIN} ]; then - echo "If FULLNODE_ENABLE_ONCHAIN_DISCOVERY or VALIDATOR_ENABLE_ONCHAIN_DISCOVERY is set, DOMAIN must be set" - exit 1 +if [ "${FULLNODE_ENABLE_ONCHAIN_DISCOVERY}" = "true" ] && [ -z ${DOMAIN} ] \ + || [ "${VALIDATOR_ENABLE_ONCHAIN_DISCOVERY}" = "true" ] && [ -z ${DOMAIN} ]; then + echo "If FULLNODE_ENABLE_ONCHAIN_DISCOVERY or VALIDATOR_ENABLE_ONCHAIN_DISCOVERY is set, DOMAIN must be set" + exit 1 fi echo "NUM_VALIDATORS=${NUM_VALIDATORS}" @@ -68,53 +68,53 @@ echo "RANDOM_SEED=${RANDOM_SEED}" RANDOM_SEED_IN_DECIMAL=$(printf "%d" 0x${RANDOM_SEED}) # generate all validator configurations -for i in $(seq 0 $(($NUM_VALIDATORS-1))); do - username="${USERNAME_PREFIX}-${i}" - user_dir="${WORKSPACE}/${username}" - - mkdir $user_dir - - if [[ "${FULLNODE_ENABLE_ONCHAIN_DISCOVERY}" = "true" ]]; then - fullnode_host="fullnode${i}.${DOMAIN}:6182" - elif [[ "${ENABLE_MULTICLUSTER_DOMAIN_SUFFIX}" = "true" ]]; then - index=$(($i % ${#MULTICLUSTER_DOMAIN_SUFFIXES[@]})) - cluster=${MULTICLUSTER_DOMAIN_SUFFIXES[${index}]} - fullnode_host="${username}-${FULLNODE_INTERNAL_HOST_SUFFIX}.${NAMESPACE}.svc.${cluster}:6182" - else - 
fullnode_host="${username}-${FULLNODE_INTERNAL_HOST_SUFFIX}:6182" - fi - - if [[ "${VALIDATOR_ENABLE_ONCHAIN_DISCOVERY}" = "true" ]]; then - validator_host="val${i}.${DOMAIN}:6180" - elif [[ "${ENABLE_MULTICLUSTER_DOMAIN_SUFFIX}" = "true" ]]; then - index=$(($i % ${#MULTICLUSTER_DOMAIN_SUFFIXES[@]})) - cluster=${MULTICLUSTER_DOMAIN_SUFFIXES[${index}]} - validator_host="${username}-${VALIDATOR_INTERNAL_HOST_SUFFIX}.${NAMESPACE}.svc.${cluster}:6180" - else - validator_host="${username}-${VALIDATOR_INTERNAL_HOST_SUFFIX}:6180" - fi - - if [ $i -lt $NUM_VALIDATORS_WITH_LARGER_STAKE ]; then - CUR_STAKE_AMOUNT=$LARGER_STAKE_AMOUNT - else - CUR_STAKE_AMOUNT=$STAKE_AMOUNT - fi - - echo "CUR_STAKE_AMOUNT=${CUR_STAKE_AMOUNT} for ${i} validator" - - if [[ -z "${RANDOM_SEED}" ]]; then - aptos genesis generate-keys --output-dir $user_dir - else - seed=$(printf "%064x" "$((${RANDOM_SEED_IN_DECIMAL}+i))") - echo "seed=$seed for ${i}th validator" - aptos genesis generate-keys --random-seed $seed --output-dir $user_dir - fi - - aptos genesis set-validator-configuration --owner-public-identity-file $user_dir/public-keys.yaml --local-repository-dir $WORKSPACE \ - --username $username \ - --validator-host $validator_host \ - --full-node-host $fullnode_host \ - --stake-amount $CUR_STAKE_AMOUNT +for i in $(seq 0 $(($NUM_VALIDATORS - 1))); do + username="${USERNAME_PREFIX}-${i}" + user_dir="${WORKSPACE}/${username}" + + mkdir $user_dir + + if [[ "${FULLNODE_ENABLE_ONCHAIN_DISCOVERY}" = "true" ]]; then + fullnode_host="fullnode${i}.${DOMAIN}:6182" + elif [[ "${ENABLE_MULTICLUSTER_DOMAIN_SUFFIX}" = "true" ]]; then + index=$(($i % ${#MULTICLUSTER_DOMAIN_SUFFIXES[@]})) + cluster=${MULTICLUSTER_DOMAIN_SUFFIXES[${index}]} + fullnode_host="${username}-${FULLNODE_INTERNAL_HOST_SUFFIX}.${NAMESPACE}.svc.${cluster}:6182" + else + fullnode_host="${username}-${FULLNODE_INTERNAL_HOST_SUFFIX}:6182" + fi + + if [[ "${VALIDATOR_ENABLE_ONCHAIN_DISCOVERY}" = "true" ]]; then + 
validator_host="val${i}.${DOMAIN}:6180" + elif [[ "${ENABLE_MULTICLUSTER_DOMAIN_SUFFIX}" = "true" ]]; then + index=$(($i % ${#MULTICLUSTER_DOMAIN_SUFFIXES[@]})) + cluster=${MULTICLUSTER_DOMAIN_SUFFIXES[${index}]} + validator_host="${username}-${VALIDATOR_INTERNAL_HOST_SUFFIX}.${NAMESPACE}.svc.${cluster}:6180" + else + validator_host="${username}-${VALIDATOR_INTERNAL_HOST_SUFFIX}:6180" + fi + + if [ $i -lt $NUM_VALIDATORS_WITH_LARGER_STAKE ]; then + CUR_STAKE_AMOUNT=$LARGER_STAKE_AMOUNT + else + CUR_STAKE_AMOUNT=$STAKE_AMOUNT + fi + + echo "CUR_STAKE_AMOUNT=${CUR_STAKE_AMOUNT} for ${i} validator" + + if [[ -z "${RANDOM_SEED}" ]]; then + aptos genesis generate-keys --output-dir $user_dir + else + seed=$(printf "%064x" "$((${RANDOM_SEED_IN_DECIMAL} + i))") + echo "seed=$seed for ${i}th validator" + aptos genesis generate-keys --random-seed $seed --output-dir $user_dir + fi + + aptos genesis set-validator-configuration --owner-public-identity-file $user_dir/public-keys.yaml --local-repository-dir $WORKSPACE \ + --username $username \ + --validator-host $validator_host \ + --full-node-host $fullnode_host \ + --stake-amount $CUR_STAKE_AMOUNT done # get the framework @@ -130,10 +130,10 @@ kubectl get pvc -o name | grep /fn- | grep -v "e${ERA}-" | xargs -r kubectl dele kubectl get secret -o name | grep "genesis-e" | grep -v "e${ERA}-" | xargs -r kubectl delete # create genesis secrets for validators to startup -for i in $(seq 0 $(($NUM_VALIDATORS-1))); do -username="${USERNAME_PREFIX}-${i}" -user_dir="${WORKSPACE}/${username}" -kubectl create secret generic "${username}-genesis-e${ERA}" \ +for i in $(seq 0 $(($NUM_VALIDATORS - 1))); do + username="${USERNAME_PREFIX}-${i}" + user_dir="${WORKSPACE}/${username}" + kubectl create secret generic "${username}-genesis-e${ERA}" \ --from-file=genesis.blob=${WORKSPACE}/genesis.blob \ --from-file=waypoint.txt=${WORKSPACE}/waypoint.txt \ --from-file=validator-identity.yaml=${user_dir}/validator-identity.yaml \ diff --git 
a/terraform/helm/genesis/templates/genesis.yaml b/terraform/helm/genesis/templates/genesis.yaml index 1298d39676d2c..6d5716a25e6fe 100644 --- a/terraform/helm/genesis/templates/genesis.yaml +++ b/terraform/helm/genesis/templates/genesis.yaml @@ -25,12 +25,6 @@ data: rewards_apy_percentage: {{ .Values.chain.rewards_apy_percentage | int }} voting_duration_secs: {{ .Values.chain.voting_duration_secs | int }} voting_power_increase_limit: {{ .Values.chain.voting_power_increase_limit | int }} - {{- with .Values.chain.on_chain_consensus_config}} - on_chain_consensus_config: {{ . | toJson }} - {{- end}} - {{- with .Values.chain.on_chain_execution_config}} - on_chain_execution_config: {{ . | toJson }} - {{- end}} --- diff --git a/terraform/helm/genesis/values.yaml b/terraform/helm/genesis/values.yaml index 50bb124cda2d9..4a8c859e61859 100644 --- a/terraform/helm/genesis/values.yaml +++ b/terraform/helm/genesis/values.yaml @@ -31,10 +31,6 @@ chain: rewards_apy_percentage: 10 # -- Minimum price per gas unit min_price_per_gas_unit: 1 - # -- Onchain Consensus Config - on_chain_consensus_config: - # -- Onchain Execution Config - on_chain_execution_config: # -- Default image tag to use for all tools images imageTag: testnet diff --git a/terraform/helm/logger/.helmignore b/terraform/helm/logger/.helmignore deleted file mode 100644 index 0e8a0eb36f4ca..0000000000000 --- a/terraform/helm/logger/.helmignore +++ /dev/null @@ -1,23 +0,0 @@ -# Patterns to ignore when building packages. -# This supports shell glob matching, relative path matching, and -# negation (prefixed with !). Only one pattern per line. 
-.DS_Store -# Common VCS dirs -.git/ -.gitignore -.bzr/ -.bzrignore -.hg/ -.hgignore -.svn/ -# Common backup files -*.swp -*.bak -*.tmp -*.orig -*~ -# Various IDEs -.project -.idea/ -*.tmproj -.vscode/ diff --git a/terraform/helm/logger/Chart.yaml b/terraform/helm/logger/Chart.yaml deleted file mode 100644 index e2bdba437686c..0000000000000 --- a/terraform/helm/logger/Chart.yaml +++ /dev/null @@ -1,3 +0,0 @@ -apiVersion: v2 -name: aptos-logger -version: 0.2.0 diff --git a/terraform/helm/logger/README.md b/terraform/helm/logger/README.md deleted file mode 100644 index 09b96eeeffdbe..0000000000000 --- a/terraform/helm/logger/README.md +++ /dev/null @@ -1,37 +0,0 @@ -Aptos Logger Deployment -================================ - -This Helm chart deploys a central logger that aggregates logs from aptos nodes -using [Vector][]. The logger can be used to output logs to our central logging -system using mutual TLS, to file for debugging purposes, and any other outputs -possible with Vector output configuration. - -Note to partners: please don't point this logger towards our premainnet or mainnet -central logging stack. We'd like to keep that for validators and key Association-run -public fullnodes. - -Configuration -------------- - -See [values.yaml][] for the full list of options you can configure. - -* `logging.vector.logToFile`: logs to /tmp/logs for debugging purposes -* `logging.vector.outputs`: your own custom vector outputs -* `loggingClientCert`, `loggingClientKey`, `loggingCA`, `loggingCentralHost`: for mutual TLS with a central loging system - -There exist template helm values files in the `values` directory, for premainnet and mainnet. - -Deployment ----------- - -1. Install Helm v3: https://helm.sh/docs/intro/install/ -2. Configure `kubectl` with the Kubernetes cluster you wish to use. -3. Set the value `logger.name` to `-`, e.g. `novi-pfn` -4. Set the value `serviceAccount.name` to an existing fullnode or validator service account, or do a role binding, e.g. 
with `aptos-validator-psp`. -5. Configure any of the other helm values if applicable. An example to connect to `mainnet` is included in the `values` directory. If unset, the fullnode will connect to premainnet by default. -6. Install the release, setting any options: - - $ helm install fullnode-logger --set logging.vector.logToFile=true . - -[Vector]: https://vector.dev/ -[values.yaml]: values.yaml diff --git a/terraform/helm/logger/files/vector.toml b/terraform/helm/logger/files/vector.toml deleted file mode 100644 index 3da31d3c469c5..0000000000000 --- a/terraform/helm/logger/files/vector.toml +++ /dev/null @@ -1,70 +0,0 @@ -[sources.tcp_input] - type = "socket" - address = "0.0.0.0:5044" - max_length = 1024000 - mode = "tcp" - -[sources.syslog] - type = "syslog" - mode = "udp" - address = "0.0.0.0:1514" - max_length = 10240 - -[transforms.parse_json] - type = "json_parser" - inputs = ["tcp_input"] - drop_invalid = false - field = "message" - -[transforms.add_fields] - type = "add_fields" - inputs = ["parse_json", "syslog"] - overwrite = true - - # Fields - fields.owner = "{{ required "logger.name must be set" .Values.logger.name }}" - fields.chain_name = "{{ required "chain.name must be set" .Values.chain.name }}" - -{{- if .Values.loggingCentralHost }} -[sinks.http_output] - # General - type = "http" # required - inputs = ["add_fields"] # required - compression = "none" # optional, default - healthcheck = true # optional, default - uri = "https://{{.Values.loggingCentralHost}}:9000" # required - - # Batch - batch.max_events = 1000 # optional, no default, events - - # Buffer - buffer.max_events = 50000 # optional - buffer.type = "memory" # optional, default - buffer.when_full = "drop_newest" # optional - - # Encoding - encoding.codec = "ndjson" # required - - # TLS - tls.enabled = true - tls.ca_file = "/etc/vector/cert/ca.crt" - tls.crt_file = "/etc/vector/cert/tls.crt" - tls.key_file = "/etc/vector/cert/tls.key" - tls.verify_certificate = {{ 
.Values.logging.vector.verifyServer | default true }} - tls.verify_hostname = {{ .Values.logging.vector.verifyServer | default true }} -{{- end }} - -{{- if .Values.logging.vector.logToFile }} -[sinks.file] - inputs = ["add_fields"] - type = "file" - path = "/tmp/logs/vector-%Y-%m-%d.log" - encoding.codec = "ndjson" -{{- end }} - -{{- range .Values.logging.vector.outputs }} -[sinks.{{ .output_id }}] - {{- range $k, $v := .config }} - {{ $k }} = {{ toJson $v }} - {{- end }} -{{- end }} diff --git a/terraform/helm/logger/templates/NOTES.txt b/terraform/helm/logger/templates/NOTES.txt deleted file mode 100644 index 902fb066b17ff..0000000000000 --- a/terraform/helm/logger/templates/NOTES.txt +++ /dev/null @@ -1 +0,0 @@ -Your {{ .Chart.Name }} deployment named {{ .Release.Name }} is now deployed. diff --git a/terraform/helm/logger/templates/_helpers.tpl b/terraform/helm/logger/templates/_helpers.tpl deleted file mode 100644 index 759d584dc4899..0000000000000 --- a/terraform/helm/logger/templates/_helpers.tpl +++ /dev/null @@ -1,63 +0,0 @@ -{{/* vim: set filetype=mustache: */}} -{{/* -Expand the name of the chart. -*/}} -{{- define "aptos-logger.name" -}} -{{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" }} -{{- end }} - -{{/* -Create a default fully qualified app name. -We truncate at 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec). -If release name contains chart name it will be used as a full name. -*/}} -{{- define "aptos-logger.fullname" -}} -{{- if .Values.fullnameOverride }} -{{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" }} -{{- else }} -{{- $name := default .Chart.Name .Values.nameOverride }} -{{- if contains $name .Release.Name }} -{{- .Release.Name | trunc 63 | trimSuffix "-" }} -{{- else }} -{{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" }} -{{- end }} -{{- end }} -{{- end }} - -{{/* -Create chart name and version as used by the chart label. 
-*/}} -{{- define "aptos-logger.chart" -}} -{{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" }} -{{- end }} - -{{/* -Common labels -*/}} -{{- define "aptos-logger.labels" -}} -helm.sh/chart: {{ include "aptos-logger.chart" . }} -{{ include "aptos-logger.selectorLabels" . }} -{{- if .Chart.AppVersion }} -app.kubernetes.io/version: {{ .Chart.AppVersion | quote }} -{{- end }} -app.kubernetes.io/managed-by: {{ .Release.Service }} -{{- end }} - -{{/* -Selector labels -*/}} -{{- define "aptos-logger.selectorLabels" -}} -app.kubernetes.io/name: {{ include "aptos-logger.name" . }} -app.kubernetes.io/instance: {{ .Release.Name }} -{{- end }} - -{{/* -Create the name of the service account to use -*/}} -{{- define "aptos-logger.serviceAccountName" -}} -{{- if .Values.serviceAccount.create }} -{{- default (include "aptos-logger.fullname" .) .Values.serviceAccount.name }} -{{- else }} -{{- default "default" .Values.serviceAccount.name }} -{{- end }} -{{- end }} diff --git a/terraform/helm/logger/templates/logging.yaml b/terraform/helm/logger/templates/logging.yaml deleted file mode 100644 index c138983431c1f..0000000000000 --- a/terraform/helm/logger/templates/logging.yaml +++ /dev/null @@ -1,139 +0,0 @@ -apiVersion: v1 -kind: ConfigMap -metadata: - name: {{ include "aptos-logger.fullname" . }}-vector - labels: - {{- include "aptos-logger.labels" . | nindent 4 }} -data: - vector.toml: |- -{{ (tpl (.Files.Get "files/vector.toml") .) | indent 4 }} - ---- -{{- if .Values.loggingCentralHost }} -apiVersion: v1 -kind: Secret -metadata: - name: {{ include "aptos-logger.fullname" . }}-vector - labels: - {{- include "aptos-logger.labels" . | nindent 4 }} -type: kubernetes.io/tls -data: - tls.crt: {{.Values.loggingClientCert}} - tls.key: {{.Values.loggingClientKey}} - ca.crt: {{.Values.loggingCA}} ---- -{{- end }} - -apiVersion: v1 -kind: Service -metadata: - name: {{ include "aptos-logger.fullname" . 
}} - labels: - {{- include "aptos-logger.labels" . | nindent 4 }} -spec: - selector: - {{- include "aptos-logger.selectorLabels" . | nindent 4 }} - app.kubernetes.io/name: logging - ports: - - name: json - port: 5044 - - name: syslog - protocol: UDP - port: 1514 - ---- - -apiVersion: apps/v1 -kind: StatefulSet -metadata: - name: {{ include "aptos-logger.fullname" . }} - labels: - {{- include "aptos-logger.labels" . | nindent 4 }} - app.kubernetes.io/name: logging -spec: - serviceName: {{ include "aptos-logger.fullname" . }} - replicas: 1 - podManagementPolicy: Parallel - selector: - matchLabels: - {{- include "aptos-logger.selectorLabels" . | nindent 6 }} - app.kubernetes.io/name: logging - template: - metadata: - labels: - {{- include "aptos-logger.selectorLabels" . | nindent 8 }} - app.kubernetes.io/name: logging - annotations: - seccomp.security.alpha.kubernetes.io/pod: runtime/default - checksum/vector.toml: {{ tpl (.Files.Get "files/vector.toml") . | sha256sum }} - spec: - volumes: - {{- if .Values.logging.vector.logToFile }} - - name: vector-logs - emptyDir: {} - {{- end }} - - name: vector-config - configMap: - name: {{ include "aptos-logger.fullname" . }}-vector - {{- if .Values.loggingCentralHost }} - - name: vector-secret - secret: - secretName: {{ include "aptos-logger.fullname" . 
}}-vector - {{- end }} - {{- with .Values.logging }} - containers: - - name: vector - image: {{ .vector.image.repo }}:{{ .vector.image.tag }} - args: - - "--watch-config=true" - resources: - {{- toYaml .vector.resources | nindent 10 }} - {{- end }} - securityContext: - readOnlyRootFilesystem: true - allowPrivilegeEscalation: false - runAsUser: 65534 - runAsGroup: 65534 - capabilities: - drop: - - ALL - ports: - - containerPort: 5044 - - containerPort: 1514 - livenessProbe: - tcpSocket: - port: 5044 - initialDelaySeconds: 10 - readinessProbe: - tcpSocket: - port: 5044 - volumeMounts: - - name: vector-config - mountPath: /etc/vector - readOnly: true - {{- if .Values.loggingCentralHost }} - - name: vector-secret - mountPath: /etc/vector/cert - {{- end }} - {{- with .Values.logging }} - {{- if .vector.logToFile }} - - name: vector-logs - mountPath: /tmp/logs - {{- end }} - {{- with .nodeSelector }} - nodeSelector: - {{- toYaml . | nindent 8 }} - {{- end }} - {{- with .affinity }} - affinity: - {{- toYaml . | nindent 8 }} - {{- end }} - {{- with .tolerations }} - tolerations: - {{- toYaml . | nindent 8 }} - {{- end }} - securityContext: - runAsNonRoot: true - fsGroup: 65534 - {{- end }} - serviceAccountName: {{ include "aptos-logger.serviceAccountName" . }} diff --git a/terraform/helm/logger/templates/serviceaccount.yaml b/terraform/helm/logger/templates/serviceaccount.yaml deleted file mode 100644 index 0f9ad0635574d..0000000000000 --- a/terraform/helm/logger/templates/serviceaccount.yaml +++ /dev/null @@ -1,8 +0,0 @@ -{{- if .Values.serviceAccount.create -}} -apiVersion: v1 -kind: ServiceAccount -metadata: - name: {{ include "aptos-logger.serviceAccountName" . }} - labels: - {{ include "aptos-logger.labels" . 
| nindent 4 }} -{{- end -}} diff --git a/terraform/helm/logger/values.yaml b/terraform/helm/logger/values.yaml deleted file mode 100644 index 552fcd3c5e8c3..0000000000000 --- a/terraform/helm/logger/values.yaml +++ /dev/null @@ -1,38 +0,0 @@ -# These are used for aggregation purposes in central logging -chain: - name: -logger: - name: - -logging: - vector: - verifyServer: # default is true - logToFile: false - image: - repo: timberio/vector - tag: 0.20.0-alpine@sha256:1b6a76585ccb0a764b6374fe448825f1f46d40c3a05473337dad7c2e1f7322b5 - pullPolicy: IfNotPresent - resources: - limits: - cpu: 1.5 - memory: 2Gi - requests: - cpu: 1 - memory: 1.5Gi - outputs: [] - nodeSelector: {} - tolerations: [] - affinity: {} - -serviceAccount: - # Specifies whether a service account should be created - create: true - # The name of the service account to use. - # If not set and create is true, a name is generated using the fullname template - name: - -# please do not send to pre/mainnet central logging -loggingClientCert: "" -loggingClientKey: "" -loggingCA: "" -loggingCentralHost: "" diff --git a/terraform/helm/logger/values/mainnet.yaml b/terraform/helm/logger/values/mainnet.yaml deleted file mode 100644 index d76a3d3c0530a..0000000000000 --- a/terraform/helm/logger/values/mainnet.yaml +++ /dev/null @@ -1,9 +0,0 @@ -# These are used for aggregation purposes in central logging -chain: - name: mainnet - -# sync from validator helm chart -loggingClientCert: -loggingClientKey: -loggingCA: -loggingCentralHost: diff --git a/terraform/helm/logger/values/premainnet.yaml b/terraform/helm/logger/values/premainnet.yaml deleted file mode 100644 index da7bacc1570d8..0000000000000 --- a/terraform/helm/logger/values/premainnet.yaml +++ /dev/null @@ -1,9 +0,0 @@ -# These are used for aggregation purposes in central logging -chain: - name: premainnet - -# sync from validator helm chart -loggingClientCert: -loggingClientKey: -loggingCA: -loggingCentralHost: diff --git 
a/terraform/helm/monitoring/Chart.lock b/terraform/helm/monitoring/Chart.lock deleted file mode 100644 index 335f818158950..0000000000000 --- a/terraform/helm/monitoring/Chart.lock +++ /dev/null @@ -1,9 +0,0 @@ -dependencies: -- name: prometheus-node-exporter - repository: https://prometheus-community.github.io/helm-charts - version: 4.0.0 -- name: kube-state-metrics - repository: https://prometheus-community.github.io/helm-charts - version: 4.16.0 -digest: sha256:a5f034385599a788bf58d04acc029c014317d5df0efbebdc5ae034a731d4aaa7 -generated: "2022-09-07T17:04:07.275506-07:00" diff --git a/terraform/helm/monitoring/Chart.yaml b/terraform/helm/monitoring/Chart.yaml deleted file mode 100644 index 732cdd97823a4..0000000000000 --- a/terraform/helm/monitoring/Chart.yaml +++ /dev/null @@ -1,13 +0,0 @@ -apiVersion: v2 -name: aptos-monitoring -version: 0.2.0 - -dependencies: - - name: prometheus-node-exporter - condition: prometheus-node-exporter.enabled - version: 4.0.0 - repository: "https://prometheus-community.github.io/helm-charts" - - name: kube-state-metrics - condition: kube-state-metrics.enabled - version: 4.16.0 - repository: "https://prometheus-community.github.io/helm-charts" diff --git a/terraform/helm/monitoring/charts/kube-state-metrics-4.16.0.tgz b/terraform/helm/monitoring/charts/kube-state-metrics-4.16.0.tgz deleted file mode 100644 index 4ef32f978317c..0000000000000 Binary files a/terraform/helm/monitoring/charts/kube-state-metrics-4.16.0.tgz and /dev/null differ diff --git a/terraform/helm/monitoring/charts/prometheus-node-exporter-4.0.0.tgz b/terraform/helm/monitoring/charts/prometheus-node-exporter-4.0.0.tgz deleted file mode 100644 index d81ac648e175e..0000000000000 Binary files a/terraform/helm/monitoring/charts/prometheus-node-exporter-4.0.0.tgz and /dev/null differ diff --git a/terraform/helm/monitoring/files/alertmanager.yml b/terraform/helm/monitoring/files/alertmanager.yml deleted file mode 100644 index 13e9d3f81f645..0000000000000 --- 
a/terraform/helm/monitoring/files/alertmanager.yml +++ /dev/null @@ -1,33 +0,0 @@ -# Severeties: info, warning, [error, critical] -# Last 2 items are high urgency - -global: - -route: - group_by: ["instance", "kubernetes_pod_name", "role"] - - # When a new group of alerts is created by an incoming alert, wait at - # least 'group_wait' to send the initial notification. - # This way ensures that you get multiple alerts for the same group that start - # firing shortly after another are batched together on the first - # notification. - group_wait: 30s - - # When the first notification was sent, wait 'group_interval' to send a batch - # of new alerts that started firing for that group. - group_interval: 5m - - # If an alert has successfully been sent, wait 'repeat_interval' to - # resend them. - repeat_interval: 10m - - # A default receiver - receiver: "default" - - # The child route trees. - # https://prometheus.io/docs/alerting/latest/configuration/#route - routes: {{ .Values.monitoring.alertmanager.alertRouteTrees | toJson }} - -# A list of notification receivers -# https://prometheus.io/docs/alerting/latest/configuration/#receiver -receivers: {{ .Values.monitoring.alertmanager.alertReceivers | toJson }} diff --git a/terraform/helm/monitoring/files/dashboards b/terraform/helm/monitoring/files/dashboards deleted file mode 120000 index 9791cdc4da5a6..0000000000000 --- a/terraform/helm/monitoring/files/dashboards +++ /dev/null @@ -1 +0,0 @@ -../../../../dashboards \ No newline at end of file diff --git a/terraform/helm/monitoring/files/grafana.ini b/terraform/helm/monitoring/files/grafana.ini deleted file mode 100644 index c5f998f8b3b54..0000000000000 --- a/terraform/helm/monitoring/files/grafana.ini +++ /dev/null @@ -1,34 +0,0 @@ -{{- if .Values.monitoring.grafana.googleAuth }} - -[auth] -# Set to true to disable (hide) the login form, useful if you use OAuth -disable_login_form = true - -{{- with .Values.monitoring.grafana.config }} -[auth.google] -enabled = true 
-client_id = {{ .client_id }} -client_secret = {{ .client_secret }} -scopes = https://www.googleapis.com/auth/userinfo.profile https://www.googleapis.com/auth/userinfo.email -auth_url = https://accounts.google.com/o/oauth2/auth -token_url = https://accounts.google.com/o/oauth2/token -allowed_domains = {{ .allowed_domains }} -allow_sign_up = true -{{- end }} - -[users] -auto_assign_org_role = Editor - -[server] -protocol = http -root_url = http://mon.{{ .Values.service.domain }}/grafana -serve_from_sub_path = true - -{{- else }} -[auth.anonymous] -enabled = true - -# Role for unauthenticated users, other valid values are `Editor` and `Admin` -org_role = Editor - -{{- end }} \ No newline at end of file diff --git a/terraform/helm/monitoring/files/prometheus.yml b/terraform/helm/monitoring/files/prometheus.yml deleted file mode 100644 index 274eebaccbe93..0000000000000 --- a/terraform/helm/monitoring/files/prometheus.yml +++ /dev/null @@ -1,225 +0,0 @@ -global: - scrape_interval: 15s - evaluation_interval: 15s - external_labels: - chain_name: {{ .Values.chain.name }} - {{- if .Values.validator.name }} - owner: {{ .Values.validator.name }} - {{- else if .Values.fullnode.name }} - owner: {{ .Values.fullnode.name }} - {{- else }} - owner: release:{{ .Release.Name }} - {{- end }} - -# Alertmanager configuration -alerting: - alertmanagers: - - static_configs: - - targets: - - localhost:9093 - -# Load rules once and periodically evaluate them according to the global 'evaluation_interval'. 
-rule_files: -{{- range $path, $_ := .Files.Glob "files/rules/*.yml" }} - - {{ base $path }} -{{- end }} - -scrape_configs: -{{ if .Values.monitoring.prometheus.fullKubernetesScrape }} -- job_name: 'kubernetes-apiservers' - scheme: https - tls_config: - ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt - bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token - - kubernetes_sd_configs: - - role: endpoints - - # Keep only the default/kubernetes service endpoints for the https port. This - # will add targets for each API server which Kubernetes adds an endpoint to - # the default/kubernetes service. - metric_relabel_configs: - - source_labels: [__meta_kubernetes_namespace, __meta_kubernetes_service_name, __meta_kubernetes_endpoint_port_name] - action: keep - regex: default;kubernetes;https - - source_labels: [__name__] - action: drop - regex: '(.+)_request_duration_seconds_bucket' - - target_label: owner - {{- if .Values.validator.name }} - replacement: {{ .Values.validator.name }} - {{- else if .Values.fullnode.name }} - replacement: {{ .Values.fullnode.name }} - {{- else }} - replacement: {{ .Release.Name }} - {{- end }} -{{ end }} - -- job_name: 'kubernetes-nodes' - scheme: https - tls_config: - ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt - bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token - - kubernetes_sd_configs: - - role: node - - {{ if not .Values.monitoring.prometheus.fullKubernetesScrape }} - metric_relabel_configs: - - source_labels: [namespace] - action: keep - regex: "{{ .Release.Namespace }}" - # Explicitly drop spammy metrics - - source_labels: [__name__] - regex: 'storage_operation_duration_seconds_bucket' - action: drop - {{ end }} - - relabel_configs: - - action: labelmap - regex: __meta_kubernetes_node_label_(.+) - - target_label: __address__ - replacement: kubernetes.default.svc:443 - - source_labels: [__meta_kubernetes_node_name] - regex: (.+) - target_label: __metrics_path__ - 
replacement: /api/v1/nodes/${1}/proxy/metrics - -- job_name: 'kubernetes-cadvisor' - scheme: https - tls_config: - ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt - bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token - - kubernetes_sd_configs: - - role: node - - relabel_configs: - - target_label: __address__ - replacement: kubernetes.default.svc:443 - - source_labels: [__meta_kubernetes_node_name] - regex: (.+) - target_label: __metrics_path__ - replacement: /api/v1/nodes/${1}/proxy/metrics/cadvisor - - {{ if not .Values.monitoring.prometheus.fullKubernetesScrape }} - # Only keep container task state for key containers - metric_relabel_configs: - - source_labels: [__name__, container] - action: drop - regex: container_tasks_state;!validator|!fullnode - - source_labels: [container] - action: drop - regex: calico.*|csi.*|ebs.*|chaos.*|aws-node|node-driver-registrar - {{ end }} - -# Scrape config for service endpoints. -# -# The relabeling allows the actual service scrape endpoint to be configured -# via the following annotations: -# -# * `prometheus.io/scrape`: Only scrape services that have a value of `true` -# * `prometheus.io/scheme`: If the metrics endpoint is secured then you will need -# to set this to `https` & most likely set the `tls_config` of the scrape config. -# * `prometheus.io/path`: If the metrics path is not `/metrics` override this. -# * `prometheus.io/port`: If the metrics are exposed on a different port to the -# service then set this appropriately. -- job_name: 'kubernetes-service-endpoints' - - kubernetes_sd_configs: - - role: endpoints - - relabel_configs: - - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scrape] - action: keep - regex: true - - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scheme] - action: replace - target_label: __scheme__ - regex: (https?) 
- - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_path] - action: replace - target_label: __metrics_path__ - regex: (.+) - - source_labels: [__address__, __meta_kubernetes_service_annotation_prometheus_io_port] - action: replace - target_label: __address__ - regex: ([^:]+)(?::\d+)?;(\d+) - replacement: $1:$2 - - action: labelmap - regex: __meta_kubernetes_service_label_(.+) - - source_labels: [__meta_kubernetes_namespace] - action: replace - target_label: kubernetes_namespace - - source_labels: [__meta_kubernetes_service_name] - action: replace - target_label: kubernetes_name - - source_labels: [__meta_kubernetes_pod_node_name] - action: replace - target_label: kubernetes_node - - # Drop some redundant labels from kube-state-metrics - metric_relabel_configs: - - action: labeldrop - regex: uid|container_id - # Drop tmpfs metrics from node-exporter - - source_labels: [fstype] - regex: tmpfs - action: drop - - # Scrape config for pods - # - # The relabeling allows the actual pod scrape endpoint to be configured via the - # following annotations: - # - # * `prometheus.io/scrape`: Only scrape pods that have a value of `true` - # * `prometheus.io/path`: If the metrics path is not `/metrics` override this. - # * `prometheus.io/port`: Scrape the pod on the indicated port instead of the default of `9102`. 
-- job_name: "kubernetes-pods" - - kubernetes_sd_configs: - - role: pod - - relabel_configs: - - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape] - action: keep - regex: true - - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_path] - action: replace - target_label: __metrics_path__ - regex: (.+) - - source_labels: [__address__, __meta_kubernetes_pod_annotation_prometheus_io_port] - action: replace - regex: ([^:]+)(?::\d+)?;(\d+) - replacement: ${1}:${2} - target_label: __address__ - - source_labels: [__meta_kubernetes_namespace] - action: replace - target_label: namespace - - source_labels: [__meta_kubernetes_pod_label_app_kubernetes_io_name] - action: replace - target_label: role - - source_labels: [__meta_kubernetes_pod_label_app_kubernetes_io_instance] - action: replace - target_label: instance - - source_labels: [__meta_kubernetes_pod_name] - action: replace - target_label: kubernetes_pod_name - # Explicitly drop all vector metrics - - source_labels: [namespace] - regex: 'vector' - action: drop - -{{ if .Values.monitoring.prometheus.remote_write.enabled }} -{{ with .Values.monitoring.prometheus.remote_write }} -remote_write: - - url: {{ .url }} - sigv4: - region: {{ .region }} - queue_config: - max_samples_per_send: 1000 - max_shards: 200 - capacity: 2500 -{{ end }} -{{ end }} - diff --git a/terraform/helm/monitoring/files/rules/alerts.yml b/terraform/helm/monitoring/files/rules/alerts.yml deleted file mode 100644 index 658692ac5b4cd..0000000000000 --- a/terraform/helm/monitoring/files/rules/alerts.yml +++ /dev/null @@ -1,166 +0,0 @@ -groups: -- name: "Aptos alerts" - rules: -{{- if .Values.validator.name }} - # consensus - - alert: Zero Block Commit Rate - expr: rate(aptos_consensus_last_committed_round{role="validator"}[1m]) == 0 OR absent(aptos_consensus_last_committed_round{role="validator"}) - for: 20m - labels: - severity: error - summary: "The block commit rate is low" - annotations: - - alert: High local timeout 
rate - expr: rate(aptos_consensus_timeout_count{role="validator"}[1m]) > 0.5 - for: 20m - labels: - severity: warning - summary: "Consensus timeout rate is high" - annotations: - - alert: High consensus error rate - expr: rate(aptos_consensus_error_count{role="validator"}[1m]) / on (role) rate(consensus_duration_count{op='main_loop', role="validator"}[1m]) > 0.25 - for: 20m - labels: - severity: warning - summary: "Consensus error rate is high" - annotations: -{{- end }} - # State sync alerts - - alert: State sync is not making progress - expr: rate(aptos_state_sync_version{type="synced"}[5m]) == 0 OR absent(aptos_state_sync_version{type="synced"}) - for: 5m - labels: - severity: error - summary: "State sync is not making progress (i.e., the synced version is not increasing!)" - annotations: - - alert: State sync is lagging significantly - expr: (aptos_data_client_highest_advertised_data{data_type="transactions"} - on(kubernetes_pod_name, role) aptos_state_sync_version{type="synced"}) > 1000000 - for: 5m - labels: - severity: error - summary: "State sync is lagging significantly (i.e., the lag is greater than 1 million versions)" - annotations: - - # Mempool alerts - - alert: Mempool has no active upstream peers - expr: (sum by (kubernetes_pod_name) (aptos_mempool_active_upstream_peers_count)) == 0 - for: 3m - labels: - severity: error - summary: "Mempool has no active upstream peers (unable to forward transactions to anyone!)" - annotations: - - alert: Mempool is at >80% capacity (count) - expr: aptos_core_mempool_index_size{index="system_ttl"} > 1600000 # assumes default mempool size 2_000_000 - for: 5m - labels: - severity: warning - summary: "Mempool count is at >80% capacity (it may soon become full!)" - annotations: - - alert: Mempool is at >80% capacity (bytes) - expr: aptos_core_mempool_index_size{index="size_bytes"} > 1717986918 # assumes default mempool size 2 * 1024 * 1024 * 1024 - for: 5m - labels: - severity: warning - summary: "Mempool bytes is at 
>80% capacity (it may soon become full!)" - annotations: - - alert: Mempool is growing at a significant rate (count) - expr: rate(aptos_core_mempool_index_size{index="system_ttl"}[1m]) > 60000 # 3% growth per minute - assumes default mempool size 2_000_000 - for: 10m - labels: - severity: warning - summary: "Mempool count is growing at a significant rate (it may soon become full!)" - annotations: - - alert: Mempool is growing at a significant rate (bytes) - expr: rate(aptos_core_mempool_index_size{index="size_bytes"}[1m]) > 64424509 # 3% growth per minute - assumes default mempool size 2 * 1024 * 1024 * 1024 - for: 10m - labels: - severity: warning - summary: "Mempool bytes is growing at a significant rate (it may soon become full!)" - annotations: - - # Networking alerts - - alert: Validator Connected Peers - expr: 0 == min(aptos_network_peers{state="connected", role_type="validator", role="validator"}) - for: 15m - labels: - severity: error - summary: "Validator node has zero connected peers" - annotations: - - # Storage core metrics - - alert: Validator Low Disk Space (warning) - expr: (kubelet_volume_stats_capacity_bytes{persistentvolumeclaim=~".*(validator|fullnode)-e.*"} - kubelet_volume_stats_used_bytes) / 1024 / 1024 / 1024 < 200 - for: 1h - labels: - severity: warning - summary: "Less than 200 GB of free space on Aptos Node." - annotations: - description: "(This is a warning, deal with it in working hours.) A validator or fullnode pod has less than 200 GB of disk space. Take these steps: - 1. If only a few nodes have this issue, it might be that they are not typically spec'd or customized differently, \ - it's most likely a expansion of the volume is needed soon. Talk to the PE team. Otherwise, it's a bigger issue. - 2. Pass this issue on to the storage team. If you are the storage team, read on. - 3. Go to the dashboard and look for the stacked up column family sizes. 
\ - If the total size on that chart can't justify low free disk space, we need to log in to a node to see if something other than the AptosDB is eating up disk. \ - Start from things under /opt/aptos/data. - 3 Otherwise, if the total size on that chart is the majority of the disk consumption, zoom out and look for anomalies -- sudden increases overall or on a few \ - specific Column Families, etc. Also check average size of each type of data. Reason about the anomaly with changes in recent releases in mind. - 4 If everything made sense, it's a bigger issue, somehow our gas schedule didn't stop state explosion before an alert is triggered. Our recommended disk \ - spec and/or default pruning configuration, as well as storage gas schedule need updates. Discuss with the ecosystem team and send out a PR on the docs site, \ - form a plan to inform the node operator community and prepare for a on-chain proposal to update the gas schedule." - - alert: Validator Very Low Disk Space (critical) - expr: (kubelet_volume_stats_capacity_bytes{persistentvolumeclaim=~".*(validator|fullnode)-e.*"} - kubelet_volume_stats_used_bytes) / 1024 / 1024 / 1024 < 50 - for: 5m - labels: - severity: critical - summary: "Less than 50 GB of free space on Aptos Node." - annotations: - description: "A validator or fullnode pod has less than 50 GB of disk space -- that's dangerously low. \ - 1. A warning level alert of disk space less than 200GB should've fired a few days ago at least, search on slack and understand why it's not dealt with. - 2. Search in the code for the runbook of the warning alert, quickly go through that too determine if it's a bug. Involve the storage team and other team accordingly. - If no useful information is found, evaluate the trend of disk usage increasing, how long can we run further? If it can't last the night, you have these options to mitigate this: - 1. Expand the disk if it's a cloud volume. - 2. Shorten the pruner windows. 
Before that, find the latest version of these https://github.com/aptos-labs/aptos-core/blob/48cc64df8a64f2d13012c10d8bd5bf25d94f19dc/config/src/config/storage_config.rs#L166-L218 \ - and read carefully the comments on the prune window config entries -- set safe values. - 3. If you believe this is happening on nodes that are not run by us, involve the PE / Community / Ecosystem teams to coordinate efforts needed on those nodes. - " - - alert: AptosDB API Success Rate - expr: sum by(kubernetes_pod_name) (rate(aptos_storage_api_latency_seconds_count{result="Ok"}[1m])) / sum by(kubernetes_pod_name) (rate(aptos_storage_api_latency_seconds_count[1m])) < 0.99 # 99% - for: 5m - labels: - severity: error - summary: "AptosDB API success rate dropped." - annotations: - description: "AptosDB APIs started to return Error. - This must be looked at together with alerts / dashboards of upper level components -- it unfortunately can be either the cause or victim of issues over there. Things you can do: - 1. Go to the storage dashboard and see if the errors are on specific APIs. - 2. Look at logs and see storage related errors, understand if it's hardware / dependency errors or logical errors in our code. - 3. Previous steps should narrow down the possibilities of the issue, at this point if it's still not clear, read the code to understand if the error is caused by a bug or a change of input pattern. - 4. See if changes in recent releases can cause this issue. - " - - alert: RocksDB Read Latency - expr: sum by (kubernetes_pod_name) (rate(aptos_schemadb_get_latency_seconds_sum[1m])) / sum by (kubernetes_pod_name) (rate(aptos_schemadb_get_latency_seconds_count[1m])) > 0.001 # 1 millisecond - for: 5m - labels: - severity: warning - summary: "RocksDB read latency raised." - annotations: - description: "RocksDB read latency raised, which indicates bad performance. - If alerts on other components are not fired, this is probably not urgent. But things you can do: - 1. 
On the system dashboard, see if we get a flat line on the IOPs panel -- it can be disk being throttled. It's either the node is not spec'd as expected, or we are using more IOPs than expected. - 2. Check out the traffic pattern on various dashboards, is there a sudden increase in traffic? Verify that on the storage dashboard by looking at the number of API calls, per API if needed. - 3. Check the system dashboard to see if we are bottle necked by the memory (we rely heavily on the filesystem cache) or the CPU. It might be helpful to restart one of the nodes that's having this issue. - - 9. After all those, our threshold was set strictly initially, so if everything looks fine, we can change the alarm threshold. - " - # Logging alerts - - alert: Logs Being Dropped - expr: 1 < (rate(aptos_struct_log_queue_error[1m]) + rate(aptos_struct_log_send_error[1m])) - for: 5m - labels: - severity: warning - summary: "Logs being dropped" - annotations: - description: "Logging Transmit Error rate is high \ - check the logging dashboard and \ - there may be network issues, downstream throughput issues, or something wrong with Vector \ - TODO: Runbook" diff --git a/terraform/helm/monitoring/templates/_helpers.tpl b/terraform/helm/monitoring/templates/_helpers.tpl deleted file mode 100644 index f121a8d524ff9..0000000000000 --- a/terraform/helm/monitoring/templates/_helpers.tpl +++ /dev/null @@ -1,63 +0,0 @@ -{{/* vim: set filetype=mustache: */}} -{{/* -Expand the name of the chart. -*/}} -{{- define "aptos-monitoring.name" -}} -{{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" }} -{{- end }} - -{{/* -Create a default fully qualified app name. -We truncate at 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec). -If release name contains chart name it will be used as a full name. 
-*/}} -{{- define "aptos-monitoring.fullname" -}} -{{- if .Values.fullnameOverride }} -{{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" }} -{{- else }} -{{- $name := default .Chart.Name .Values.nameOverride }} -{{- if contains $name .Release.Name }} -{{- .Release.Name | trunc 63 | trimSuffix "-" }} -{{- else }} -{{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" }} -{{- end }} -{{- end }} -{{- end }} - -{{/* -Create chart name and version as used by the chart label. -*/}} -{{- define "aptos-monitoring.chart" -}} -{{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" }} -{{- end }} - -{{/* -Common labels -*/}} -{{- define "aptos-monitoring.labels" -}} -helm.sh/chart: {{ include "aptos-monitoring.chart" . }} -{{ include "aptos-monitoring.selectorLabels" . }} -{{- if .Chart.AppVersion }} -app.kubernetes.io/version: {{ .Chart.AppVersion | quote }} -{{- end }} -app.kubernetes.io/managed-by: {{ .Release.Service }} -{{- end }} - -{{/* -Selector labels -*/}} -{{- define "aptos-monitoring.selectorLabels" -}} -app.kubernetes.io/name: {{ include "aptos-monitoring.name" . }} -app.kubernetes.io/instance: {{ .Release.Name }} -{{- end }} - -{{/* -Create the name of the service account to use -*/}} -{{- define "aptos-monitoring.serviceAccountName" -}} -{{- if .Values.serviceAccount.create }} -{{- default (include "aptos-monitoring.fullname" .) .Values.serviceAccount.name }} -{{- else }} -{{- default "default" .Values.serviceAccount.name }} -{{- end }} -{{- end }} diff --git a/terraform/helm/monitoring/templates/monitoring.yaml b/terraform/helm/monitoring/templates/monitoring.yaml deleted file mode 100644 index affba0e54a04b..0000000000000 --- a/terraform/helm/monitoring/templates/monitoring.yaml +++ /dev/null @@ -1,368 +0,0 @@ -apiVersion: v1 -kind: ConfigMap -metadata: - name: {{ include "aptos-monitoring.fullname" . }}-grafana - labels: - {{- include "aptos-monitoring.labels" . 
| nindent 4 }} -data: - prometheus.yml: |- - apiVersion: 1 - datasources: - - name: Prometheus - type: prometheus - isDefault: true - access: proxy - url: http://localhost:9090 - - dashboards.yml: |- - apiVersion: 1 - providers: - - name: 'default' - folder: 'aptos' - type: file - options: - path: /etc/grafana/dashboards/aptos - grafana.ini: |- -{{ (tpl (.Files.Get "files/grafana.ini") .) | indent 4 }} - ---- - -apiVersion: v1 -kind: PersistentVolumeClaim -metadata: - name: {{ include "aptos-monitoring.fullname" . }}-prometheus - labels: - {{- include "aptos-monitoring.labels" . | nindent 4 }} -spec: - accessModes: - - ReadWriteOnce - storageClassName: {{ .Values.monitoring.prometheus.storage.class }} - resources: - requests: - storage: {{ .Values.monitoring.prometheus.storage.size }} - ---- - -apiVersion: v1 -kind: ConfigMap -metadata: - name: {{ include "aptos-monitoring.fullname" . }} - labels: - {{- include "aptos-monitoring.labels" . | nindent 4 }} -data: -{{ (tpl (.Files.Glob "files/rules/*.yml").AsConfig .) | indent 2 }} - prometheus.yml: |- -{{ (tpl (.Files.Get "files/prometheus.yml") .) | indent 4 }} - alertmanager.yml: |- -{{ (tpl (.Files.Get "files/alertmanager.yml") .) | indent 4 }} - haproxy.cfg: |- -{{ (tpl (.Files.Get "files/haproxy-mon.cfg") .) | indent 4 }} - ---- -apiVersion: v1 -kind: ConfigMap -metadata: - name: {{ include "aptos-monitoring.fullname" . }}-dashboards - labels: - {{- include "aptos-monitoring.labels" . | nindent 4 }} -binaryData: -{{ (.Files.Glob "files/dashboards/*.json.gz").AsSecrets | indent 2 }} - ---- - -apiVersion: v1 -kind: ServiceAccount -metadata: - name: {{ include "aptos-monitoring.fullname" . }}-prometheus - annotations: -{{- toYaml .Values.monitoring.serviceAccount.annotations | nindent 4 }} - labels: - {{- include "aptos-monitoring.labels" . | nindent 4 }} - ---- - -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRole -metadata: - name: {{ include "aptos-monitoring.fullname" . 
}}-prometheus - labels: - {{- include "aptos-monitoring.labels" . | nindent 4 }} -rules: -- apiGroups: [""] - resources: - - nodes - - nodes/proxy - - services - - endpoints - - pods - verbs: ["get", "list", "watch"] -- apiGroups: - - extensions - resources: - - ingresses - verbs: ["get", "list", "watch"] -- nonResourceURLs: ["/metrics"] - verbs: ["get"] - ---- - -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRoleBinding -metadata: - name: {{ include "aptos-monitoring.fullname" . }}-prometheus - labels: - {{- include "aptos-monitoring.labels" . | nindent 4 }} -roleRef: - apiGroup: rbac.authorization.k8s.io - kind: ClusterRole - name: {{ include "aptos-monitoring.fullname" . }}-prometheus -subjects: -- kind: ServiceAccount - name: {{ include "aptos-monitoring.fullname" . }}-prometheus - namespace: {{ .Release.Namespace }} - ---- - -apiVersion: v1 -kind: Service -metadata: - name: {{ include "aptos-monitoring.fullname" . }} - labels: - {{- include "aptos-monitoring.labels" . | nindent 4 }} - annotations: - service.beta.kubernetes.io/aws-load-balancer-type: nlb - {{- if .Values.service.domain }} - external-dns.alpha.kubernetes.io/hostname: mon.{{ .Values.service.domain }} - {{- end }} -spec: - selector: - {{- include "aptos-monitoring.selectorLabels" . | nindent 4 }} - app.kubernetes.io/name: monitoring - ports: - - name: grafana-http - port: 80 - targetPort: 3000 - type: LoadBalancer - {{- with .Values.service.monitoring.loadBalancerSourceRanges }} - loadBalancerSourceRanges: - {{- toYaml . | nindent 4 }} - {{- end }} - ---- - -apiVersion: v1 -kind: Service -metadata: - name: {{ include "aptos-monitoring.fullname" . }}-prometheus - labels: - {{- include "aptos-monitoring.labels" . | nindent 4 }} -spec: - selector: - {{- include "aptos-monitoring.selectorLabels" . 
| nindent 4 }} - app.kubernetes.io/name: monitoring - ports: - - name: prometheus-http - port: 9090 - type: ClusterIP - ---- - -apiVersion: apps/v1 -kind: StatefulSet -metadata: - name: {{ include "aptos-monitoring.fullname" . }} - labels: - {{- include "aptos-monitoring.labels" . | nindent 4 }} - app.kubernetes.io/name: monitoring -spec: - serviceName: {{ include "aptos-monitoring.fullname" . }} - replicas: 1 - podManagementPolicy: Parallel - selector: - matchLabels: - {{- include "aptos-monitoring.selectorLabels" . | nindent 6 }} - app.kubernetes.io/name: monitoring - template: - metadata: - labels: - {{- include "aptos-monitoring.selectorLabels" . | nindent 8 }} - app.kubernetes.io/name: monitoring - annotations: - seccomp.security.alpha.kubernetes.io/pod: runtime/default - checksum/prometheus.yml: {{ tpl (.Files.Get "files/prometheus.yml") . | sha256sum }} - checksum/alertmanager.yml: {{ tpl (.Files.Get "files/alertmanager.yml") . | sha256sum }} - checksum/rules.yml: {{ (tpl (.Files.Glob "files/rules/*.yml").AsConfig .) 
| sha256sum }} - checksum/dashboards.json: {{ (.Files.Glob "files/dashboards/*.json.gz").AsSecrets | sha256sum }} - spec: - {{- with .Values.monitoring }} - containers: - - name: prometheus - image: {{ .prometheus.image.repo }}:{{ .prometheus.image.tag }} - imagePullPolicy: {{ .prometheus.image.pullPolicy }} - command: - - sh - - -c - - | - {{- if .prometheus.deleteWal }} - rm -r /prometheus/data/wal/* - {{- end }} - prometheus \ - --web.enable-lifecycle \ - --web.external-url=http://mon.{{ $.Values.service.domain }} \ - --config.file=/etc/prometheus/prometheus.yml \ - --storage.tsdb.retention.time={{ .prometheus.tsdb_retention_time }} \ - --storage.tsdb.min-block-duration={{ .prometheus.tsdb_min_block_duration }} \ - --storage.tsdb.max-block-duration={{ .prometheus.tsdb_max_block_duration }} - resources: - {{- toYaml .prometheus.resources | nindent 10 }} - ports: - - containerPort: 9090 - livenessProbe: - httpGet: - path: /-/healthy - port: 9090 - initialDelaySeconds: 10 - readinessProbe: - httpGet: - path: /-/ready - port: 9090 - volumeMounts: - - name: monitoring-config - mountPath: /etc/prometheus - - name: prometheus-data - mountPath: /prometheus - securityContext: - readOnlyRootFilesystem: true - runAsUser: 65534 - runAsGroup: 65534 - allowPrivilegeEscalation: false - capabilities: - drop: - - ALL - - name: alertmanager - image: {{ .alertmanager.image.repo }}:{{ .alertmanager.image.tag }} - imagePullPolicy: {{ .alertmanager.image.pullPolicy }} - args: - - "--config.file=/etc/alertmanager/alertmanager.yml" - - "--storage.path=/alertmanager" - resources: - {{- toYaml .alertmanager.resources | nindent 10 }} - ports: - - containerPort: 9093 - livenessProbe: - httpGet: - path: /-/healthy - port: 9093 - initialDelaySeconds: 10 - readinessProbe: - httpGet: - path: /-/ready - port: 9093 - volumeMounts: - - name: monitoring-config - mountPath: /etc/alertmanager - - name: alertmanager-data - mountPath: /alertmanager - securityContext: - readOnlyRootFilesystem: true - 
runAsUser: 65534 - runAsGroup: 65534 - allowPrivilegeEscalation: false - capabilities: - drop: - - ALL - - name: grafana - image: {{ .grafana.image.repo }}:{{ .grafana.image.tag }} - imagePullPolicy: {{ .grafana.image.pullPolicy }} - env: - {{- range $k, $v := .grafana.env }} - - name: {{ quote $k }} - value: {{ quote $v }} - {{- end }} - command: ["/bin/sh", "-c"] - args: ["cp /dashboards/* /etc/grafana/dashboards/aptos && gunzip -f /etc/grafana/dashboards/aptos/*.json.gz && exec /run.sh"] - resources: - {{- toYaml .grafana.resources | nindent 10 }} - ports: - - containerPort: 3000 - livenessProbe: - httpGet: - path: /api/health - port: 3000 - initialDelaySeconds: 10 - readinessProbe: - httpGet: - path: /api/health - port: 3000 - volumeMounts: - - name: grafana-config - mountPath: /etc/grafana - - name: grafana-provisioning - mountPath: /etc/grafana/provisioning - - name: grafana-dashboards-archive - mountPath: /dashboards - - name: grafana-dashboards - mountPath: /etc/grafana/dashboards/aptos - - name: grafana-data - mountPath: /var/lib/grafana - securityContext: - readOnlyRootFilesystem: true - runAsUser: 472 - runAsGroup: 472 - allowPrivilegeEscalation: false - capabilities: - drop: - - ALL - {{- with .nodeSelector }} - nodeSelector: - {{- toYaml . | nindent 8 }} - {{- end }} - {{- with .affinity }} - affinity: - {{- toYaml . | nindent 8 }} - {{- end }} - {{- with .tolerations }} - tolerations: - {{- toYaml . | nindent 8 }} - {{- end }} - securityContext: - runAsNonRoot: true - fsGroup: 65534 - {{- end }} - volumes: - - name: grafana-config - configMap: - name: {{ include "aptos-monitoring.fullname" . }}-grafana - items: - - key: grafana.ini - path: grafana.ini - - name: grafana-provisioning - configMap: - name: {{ include "aptos-monitoring.fullname" . 
}}-grafana - items: - - key: prometheus.yml - path: datasources/prometheus.yml - - key: dashboards.yml - path: dashboards/dashboards.yml - - name: grafana-dashboards-archive - configMap: - name: {{ include "aptos-monitoring.fullname" . }}-dashboards - - name: grafana-dashboards - emptyDir: {} - - name: monitoring-config - configMap: - name: {{ include "aptos-monitoring.fullname" . }} - - name: prometheus-data - persistentVolumeClaim: - claimName: {{ include "aptos-monitoring.fullname" . }}-prometheus - - name: pushgateway-data - emptyDir: {} - - name: alertmanager-data - emptyDir: {} - - name: grafana-data - emptyDir: {} - serviceAccountName: {{ include "aptos-monitoring.fullname" . }}-prometheus - {{- if .Values.imagePullSecret }} - imagePullSecrets: - - name: {{.Values.imagePullSecret}} - {{- end }} diff --git a/terraform/helm/monitoring/templates/serviceaccount.yaml b/terraform/helm/monitoring/templates/serviceaccount.yaml deleted file mode 100644 index 10457a743947f..0000000000000 --- a/terraform/helm/monitoring/templates/serviceaccount.yaml +++ /dev/null @@ -1,8 +0,0 @@ -{{- if .Values.serviceAccount.create -}} -apiVersion: v1 -kind: ServiceAccount -metadata: - name: {{ include "aptos-monitoring.serviceAccountName" . }} - labels: - {{ include "aptos-monitoring.labels" . 
| nindent 4 }} -{{- end -}} diff --git a/terraform/helm/monitoring/values.yaml b/terraform/helm/monitoring/values.yaml deleted file mode 100644 index 2b577afdd5e51..0000000000000 --- a/terraform/helm/monitoring/values.yaml +++ /dev/null @@ -1,118 +0,0 @@ -chain: - name: -validator: - name: -fullnode: - name: - -monitoring: - prometheus: - fullKubernetesScrape: false - deleteWal: false - tsdb_retention_time: 15d - tsdb_min_block_duration: 30m - tsdb_max_block_duration: 1h - remote_write: - enabled: false - url: - region: - image: - repo: prom/prometheus - tag: v2.34.0@sha256:cb42332b66ac51a05c52f255e48a4496c0a172676093123bf28b37762009e78a - pullPolicy: IfNotPresent - resources: - limits: - cpu: 1 - memory: 1.5Gi - requests: - cpu: 1 - memory: 1.5Gi - storage: - class: - size: 100Gi - pushgateway: - image: - repo: prom/pushgateway - tag: v1.4.1@sha256:b561435cb17ee816c5d90c2408bcc1ffe25304f1608e18db16a3969f6cc44626 - pullPolicy: IfNotPresent - resources: - limits: - cpu: 0.1 - memory: 128Mi - requests: - cpu: 0.1 - memory: 128Mi - alertmanager: - alertRouteTrees: - - match: - severity: critical - receiver: 'critical' - - match: - severity: error - receiver: 'error' - alertReceivers: - - name: 'critical' - - name: 'error' - - name: 'default' - image: - repo: prom/alertmanager - tag: v0.24.0@sha256:b1ba90841a82ea24d79d4e6255b96025a9e89275bec0fae87d75a5959461971e - pullPolicy: IfNotPresent - resources: - limits: - cpu: 0.1 - memory: 128Mi - requests: - cpu: 0.1 - memory: 128Mi - grafana: - image: - repo: grafana/grafana - tag: 9.0.9@sha256:4a6b9d8d88522d2851f947f8f84cca10b6a43ca26d5e93102daf3a87935f10a5 - pullPolicy: IfNotPresent - resources: - limits: - cpu: 1 - memory: 256Mi - requests: - cpu: 1 - memory: 256Mi - googleAuth: - config: - env: - GF_AUTH_ANONYMOUS_ENABLED: true - GF_AUTH_ANONYMOUS_ORG_ROLE: Editor - nodeSelector: {} - tolerations: [] - affinity: {} - serviceAccount: - annotations: {} - -service: - domain: - external: - type: LoadBalancer - monitoring: - 
loadBalancerSourceRanges: - -serviceAccount: - # Specifies whether a service account should be created - create: true - # The name of the service account to use. - # If not set and create is true, a name is generated using the fullname template - name: - annotations: - -kube-state-metrics: - enabled: false - namespaceOverride: kube-system - podAnnotations: - prometheus.io/scrape: "true" - prometheus.io/port: "8080" - -prometheus-node-exporter: - enabled: false - namespaceOverride: kube-system - podAnnotations: - prometheus.io/scrape: "true" - prometheus.io/port: "9100" diff --git a/terraform/helm/node-health-checker/.helmignore b/terraform/helm/node-health-checker/.helmignore deleted file mode 100644 index 50af031725419..0000000000000 --- a/terraform/helm/node-health-checker/.helmignore +++ /dev/null @@ -1,22 +0,0 @@ -# Patterns to ignore when building packages. -# This supports shell glob matching, relative path matching, and -# negation (prefixed with !). Only one pattern per line. -.DS_Store -# Common VCS dirs -.git/ -.gitignore -.bzr/ -.bzrignore -.hg/ -.hgignore -.svn/ -# Common backup files -*.swp -*.bak -*.tmp -*~ -# Various IDEs -.project -.idea/ -*.tmproj -.vscode/ diff --git a/terraform/helm/node-health-checker/Chart.yaml b/terraform/helm/node-health-checker/Chart.yaml deleted file mode 100644 index 04bfc6661914e..0000000000000 --- a/terraform/helm/node-health-checker/Chart.yaml +++ /dev/null @@ -1,8 +0,0 @@ -apiVersion: v2 -name: node-health-checker -version: 0.1 -appVersion: 0.1.0 -description: Node health checker -home: https://aptoslabs.com/ -sources: -- https://github.com/aptos-labs/aptos-core diff --git a/terraform/helm/node-health-checker/README.md b/terraform/helm/node-health-checker/README.md deleted file mode 100644 index ce159edc32631..0000000000000 --- a/terraform/helm/node-health-checker/README.md +++ /dev/null @@ -1,34 +0,0 @@ -# node-health-checker - -![Version: 0.1](https://img.shields.io/badge/Version-0.1-informational?style=flat-square) 
![AppVersion: 0.1.0](https://img.shields.io/badge/AppVersion-0.1.0-informational?style=flat-square) - -Node health checker - -**Homepage:** - -## Source Code - -* - -## Values - -| Key | Type | Default | Description | -|-----|------|---------|-------------| -| imageTag | string | `"devnet"` | Default image tag to use for all aptos images | -| node_health_checker.affinity | object | `{}` | | -| node_health_checker.baseline_node_url | string | `"http://aptos-node-0-validator:8080"` | The baseline node URL for the health checker. Defaults to the validator in your deployment | -| node_health_checker.image.pullPolicy | string | `"IfNotPresent"` | Image pull policy to use for node-checker image | -| node_health_checker.image.repo | string | `"aptoslabs/node-checker"` | Image repo to use for node-checker image for running load tests | -| node_health_checker.image.tag | string | `nil` | Image tag to use for node-checker image | -| node_health_checker.mint_key | string | `nil` | The mint key for the validator used by node health checker | -| node_health_checker.nodeSelector | object | `{}` | | -| node_health_checker.resources.limits.cpu | int | `1` | | -| node_health_checker.resources.limits.memory | string | `"512Mi"` | | -| node_health_checker.resources.requests.cpu | int | `1` | | -| node_health_checker.resources.requests.memory | string | `"512Mi"` | | -| node_health_checker.tolerations | list | `[]` | | -| serviceAccount.create | bool | `true` | Specifies whether a service account should be created | -| serviceAccount.name | string | `nil` | The name of the service account to use. 
If not set and create is true, a name is generated using the fullname template | - ----------------------------------------------- -Autogenerated from chart metadata using [helm-docs v1.11.0](https://github.com/norwoodj/helm-docs/releases/v1.11.0) diff --git a/terraform/helm/node-health-checker/files/nhc_baseline_fullnode.yaml b/terraform/helm/node-health-checker/files/nhc_baseline_fullnode.yaml deleted file mode 100644 index 377469473a8c6..0000000000000 --- a/terraform/helm/node-health-checker/files/nhc_baseline_fullnode.yaml +++ /dev/null @@ -1,59 +0,0 @@ -# Based on config from https://github.com/aptos-labs/aptos-core/blob/main/ecosystem/node-checker/configuration_examples/single_node_validator.yaml ---- -node_address: - url: {{ .Values.node_health_checker.baseline_node_url }} - metrics_port: 9101 - api_port: 8080 - noise_port: 6180 -configuration_name: ait3_registration -configuration_name_pretty: AIT3 Registration -chain_id: ~ -role_type: ~ -evaluators: - - consensus_proposals - - performance_tps - - api_latency - - consensus_round - - consensus_timeouts - - state_sync_version - - api_transaction_availability -evaluator_args: - build_version_args: {} - consensus_proposals_args: {} - consensus_round_args: {} - consensus_timeouts_args: - allowed_consensus_timeouts: 0 - latency_args: - num_samples: 5 - delay_between_samples_ms: 20 - num_allowed_errors: 1 - max_api_latency_ms: 1000 - network_minimum_peers_args: - minimum_peers_inbound: 0 - minimum_peers_outbound: 1 - network_peers_tolerance_args: - inbound_peers_tolerance: 10 - outbound_peers_tolerance: 10 - node_identity_args: {} - state_sync_version_args: - version_delta_tolerance: 5000 - tps_args: - emit_args: - mempool_backlog: 5000 - target_tps: 0 - txn_expiration_time_secs: 30 - duration: 10 - invalid_tx: 0 - transaction_type: coin_transfer - mint_args: - mint_key: - key: {{ .Values.node_health_checker.mint_key }} - mint_file: ~ - minimum_tps: 1000 - repeat_target_count: 1 - transaction_availability_args: - 
transaction_fetch_delay_secs: 5 -runner_args: - blocking_runner_args: - metrics_fetch_delay_secs: 5 - api_client_timeout_secs: 4 diff --git a/terraform/helm/node-health-checker/templates/_helpers.tpl b/terraform/helm/node-health-checker/templates/_helpers.tpl deleted file mode 100644 index 33733f9caac78..0000000000000 --- a/terraform/helm/node-health-checker/templates/_helpers.tpl +++ /dev/null @@ -1,63 +0,0 @@ -{{/* vim: set filetype=mustache: */}} -{{/* -Expand the name of the chart. -*/}} -{{- define "node-health-checker.name" -}} -{{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" -}} -{{- end -}} - -{{/* -Create a default fully qualified app name. -We truncate at 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec). -If release name contains chart name it will be used as a full name. -*/}} -{{- define "node-health-checker.fullname" -}} -{{- if .Values.fullnameOverride -}} -{{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" -}} -{{- else -}} -{{- $name := default .Chart.Name .Values.nameOverride -}} -{{- if contains $name .Release.Name -}} -{{- .Release.Name | trunc 63 | trimSuffix "-" -}} -{{- else -}} -{{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" -}} -{{- end -}} -{{- end -}} -{{- end -}} - -{{/* -Create chart name and version as used by the chart label. -*/}} -{{- define "node-health-checker.chart" -}} -{{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" -}} -{{- end -}} - -{{/* -Common labels -*/}} -{{- define "node-health-checker.labels" -}} -helm.sh/chart: {{ include "node-health-checker.chart" . }} -{{ include "node-health-checker.selectorLabels" . 
}} -{{- if .Chart.AppVersion }} -app.kubernetes.io/version: {{ .Chart.AppVersion | quote }} -{{- end }} -app.kubernetes.io/managed-by: {{ .Release.Service }} -{{- end -}} - -{{/* -Selector labels -*/}} -{{- define "node-health-checker.selectorLabels" -}} -app.kubernetes.io/part-of: {{ include "node-health-checker.name" . }} -app.kubernetes.io/instance: {{ .Release.Name }} -{{- end -}} - -{{/* -Create the name of the service account to use -*/}} -{{- define "node-health-checker.serviceAccountName" -}} -{{- if .Values.serviceAccount.create -}} - {{ default (include "node-health-checker.fullname" .) .Values.serviceAccount.name }} -{{- else -}} - {{ default "default" .Values.serviceAccount.name }} -{{- end -}} -{{- end -}} diff --git a/terraform/helm/node-health-checker/templates/configmap.yaml b/terraform/helm/node-health-checker/templates/configmap.yaml deleted file mode 100644 index 8370c6c7654cd..0000000000000 --- a/terraform/helm/node-health-checker/templates/configmap.yaml +++ /dev/null @@ -1,9 +0,0 @@ -apiVersion: v1 -kind: ConfigMap -metadata: - name: {{ include "node-health-checker.fullname" . }} - labels: - {{- include "node-health-checker.labels" . | nindent 4 }} -data: - baseline_fullnode.yaml: |- -{{ (tpl (.Files.Get "files/nhc_baseline_fullnode.yaml") .) | indent 4 }} diff --git a/terraform/helm/node-health-checker/templates/deployment.yaml b/terraform/helm/node-health-checker/templates/deployment.yaml deleted file mode 100644 index 61874ce1c8c2a..0000000000000 --- a/terraform/helm/node-health-checker/templates/deployment.yaml +++ /dev/null @@ -1,70 +0,0 @@ -apiVersion: apps/v1 -kind: Deployment -metadata: - name: {{ include "node-health-checker.fullname" . }} - labels: - {{- include "node-health-checker.labels" . | nindent 4 }} -spec: - replicas: 1 - selector: - matchLabels: - {{- include "node-health-checker.selectorLabels" . 
| nindent 6 }} - app.kubernetes.io/name: node-health-checker - template: - metadata: - labels: - {{- include "node-health-checker.selectorLabels" . | nindent 8 }} - app.kubernetes.io/name: node-health-checker - annotations: - seccomp.security.alpha.kubernetes.io/pod: runtime/default - spec: - {{- with .Values.imagePullSecrets }} - imagePullSecrets: - {{- toYaml . | nindent 8 }} - {{- end }} - serviceAccountName: {{ include "node-health-checker.serviceAccountName" . }} - securityContext: - {{- toYaml .Values.podSecurityContext | nindent 8 }} - containers: - - name: node-health-checker - image: "{{ .Values.node_health_checker.image.repo }}:{{ .Values.node_health_checker.image.tag | default .Values.imageTag }}" - imagePullPolicy: {{ .Values.node_health_checker.image.pullPolicy }} - command: ["aptos-node-checker"] - args: - - server - - run - - --baseline-node-config-paths - - /nhc/baseline_fullnode.yaml - ports: - - containerPort: 20121 - volumeMounts: - - name: node-health-checker-config - mountPath: /nhc - resources: - {{- toYaml .Values.node_health_checkerresources | nindent 12 }} - securityContext: - allowPrivilegeEscalation: false - capabilities: - drop: - - ALL - {{- with .Values.node_health_checker.nodeSelector }} - nodeSelector: - {{- toYaml . | nindent 8 }} - {{- end }} - {{- with .Values.node_health_checker.affinity }} - affinity: - {{- toYaml . | nindent 8 }} - {{- end }} - {{- with .Values.node_health_checker.tolerations }} - tolerations: - {{- toYaml . | nindent 8 }} - {{- end }} - securityContext: - runAsNonRoot: true - runAsUser: 6180 - runAsGroup: 6180 - fsGroup: 6180 - volumes: - - name: node-health-checker-config - configMap: - name: {{ include "node-health-checker.fullname" . 
}} diff --git a/terraform/helm/node-health-checker/templates/service.yaml b/terraform/helm/node-health-checker/templates/service.yaml deleted file mode 100644 index a59908042a8d6..0000000000000 --- a/terraform/helm/node-health-checker/templates/service.yaml +++ /dev/null @@ -1,15 +0,0 @@ -apiVersion: v1 -kind: Service -metadata: - name: {{ include "node-health-checker.fullname" . }} - labels: - {{- include "node-health-checker.labels" . | nindent 4 }} -spec: - selector: - {{- include "node-health-checker.selectorLabels" . | nindent 4 }} - app.kubernetes.io/name: node-health-checker - type: ClusterIP - ports: - - port: 20121 - protocol: TCP - name: http diff --git a/terraform/helm/node-health-checker/values.yaml b/terraform/helm/node-health-checker/values.yaml deleted file mode 100644 index 3f23ca843bdb9..0000000000000 --- a/terraform/helm/node-health-checker/values.yaml +++ /dev/null @@ -1,31 +0,0 @@ -# -- Default image tag to use for all aptos images -imageTag: devnet - -node_health_checker: - image: - # -- Image repo to use for node-checker image for running load tests - repo: aptoslabs/node-checker - # -- Image tag to use for node-checker image - tag: - # -- Image pull policy to use for node-checker image - pullPolicy: IfNotPresent - resources: - limits: - cpu: 1 - memory: 512Mi - requests: - cpu: 1 - memory: 512Mi - nodeSelector: {} - tolerations: [] - affinity: {} - # -- The baseline node URL for the health checker. Defaults to the validator in your deployment - baseline_node_url: http://aptos-node-0-validator:8080 - # -- The mint key for the validator used by node health checker - mint_key: - -serviceAccount: - # -- Specifies whether a service account should be created - create: true - # -- The name of the service account to use. 
If not set and create is true, a name is generated using the fullname template - name: diff --git a/terraform/helm/pfn-addons/README.md b/terraform/helm/pfn-addons/README.md new file mode 100644 index 0000000000000..621a161f4c86a --- /dev/null +++ b/terraform/helm/pfn-addons/README.md @@ -0,0 +1,54 @@ +# pfn-addons + +![Version: 0.1](https://img.shields.io/badge/Version-0.1-informational?style=flat-square) + +Additional components for a public fullnode fleet deployment + +## Values + +| Key | Type | Default | Description | +|-----|------|---------|-------------| +| ingress.acm_certificate | string | `nil` | | +| ingress.class | string | `"alb"` | | +| ingress.cookieDurationSeconds | int | `86400` | | +| ingress.enableStickyness | bool | `true` | | +| ingress.gce_managed_certificate | string | `nil` | | +| ingress.gce_managed_certificate_domains | string | `nil` | | +| ingress.gce_security_policy | string | `nil` | Security policy to apply to the backend services behind the ingress | +| ingress.loadBalancerSourceRanges | string | `nil` | | +| ingress.wafAclArn | string | `nil` | | +| load_test.affinity | object | `{}` | | +| load_test.config.duration | int | `300` | How long to emit transactions for | +| load_test.config.expected_max_txns | int | `6000000` | Default 20k * $duration | +| load_test.config.max_transactions_per_account | int | `5` | | +| load_test.config.mempool_backlog | int | `5000` | Number of transactions outstanding in mempool | +| load_test.config.mint_key | string | `nil` | The private key used to mint to fund load test | +| load_test.config.numFullnodeGroups | string | `nil` | The number of fullnode groups to run traffic against | +| load_test.config.target_tps | int | `0` | Whether to target a constant TPS, or 0 if not used. Cannot be used with mempool_backlog. 
| +| load_test.config.transaction_type | string | `"coin-transfer"` | | +| load_test.config.txn_expiration_time_secs | int | `30` | How long to wait for transactions to be expired | +| load_test.config.use_pfns | bool | `true` | If true, run $numFullnodeGroups parallel load tests | +| load_test.config.use_validators | bool | `false` | Whether to submit transactions through validator REST API | +| load_test.enabled | bool | `false` | Whether to enable the load test CronJob | +| load_test.fullnode | object | `{"groups":[{"name":"fullnode"}]}` | The fullnode groups to target | +| load_test.image.pullPolicy | string | `"IfNotPresent"` | Image pull policy to use for tools image | +| load_test.image.repo | string | `"aptoslabs/tools"` | Image repo to use for tools image for running load tests | +| load_test.image.tag | string | `nil` | Image tag to use for tools image | +| load_test.intervalMins | int | `15` | How many minutes between load test runs | +| load_test.nodeSelector | object | `{}` | | +| load_test.resources.limits.cpu | int | `1` | | +| load_test.resources.limits.memory | string | `"512Mi"` | | +| load_test.resources.requests.cpu | int | `1` | | +| load_test.resources.requests.memory | string | `"512Mi"` | | +| load_test.tolerations | list | `[]` | | +| service.aws_tags | string | `nil` | | +| service.domain | string | `nil` | | +| service.enableOnchainDiscovery | bool | `false` | | +| service.loadBalancerSourceRanges | string | `nil` | | +| service.sessionAffinity | string | `nil` | | +| serviceAccount.annotations | string | `nil` | | +| serviceAccount.create | bool | `true` | Specifies whether a service account should be created | +| serviceAccount.name | string | `nil` | The name of the service account to use. 
If not set and create is true, a name is generated using the fullname template | + +---------------------------------------------- +Autogenerated from chart metadata using [helm-docs v1.11.0](https://github.com/norwoodj/helm-docs/releases/v1.11.0) diff --git a/terraform/helm/pfn-addons/templates/ingress.yaml b/terraform/helm/pfn-addons/templates/ingress.yaml index d45300600d23b..2beceb6a22626 100644 --- a/terraform/helm/pfn-addons/templates/ingress.yaml +++ b/terraform/helm/pfn-addons/templates/ingress.yaml @@ -34,7 +34,7 @@ metadata: {{ if eq .Values.ingress.class "gce" }} # kubernetes.io/ingress.global-static-ip-name: config.ingressConfig.staticIpName # may not be necessary {{- if .Values.ingress.gce_managed_certificate }} - kubernetes.io/ingress.allow-http: "false" + kubernetes.io/ingress.allow-http: "true" networking.gke.io/managed-certificates: {{ .Values.ingress.gce_managed_certificate }} networking.gke.io/v1beta1.FrontendConfig: ssl-redirect {{- end }} diff --git a/terraform/helm/testnet-addons/templates/loadtest.yaml b/terraform/helm/pfn-addons/templates/loadtest.yaml similarity index 91% rename from terraform/helm/testnet-addons/templates/loadtest.yaml rename to terraform/helm/pfn-addons/templates/loadtest.yaml index c1a61feca8f44..e5a6048aa197a 100644 --- a/terraform/helm/testnet-addons/templates/loadtest.yaml +++ b/terraform/helm/pfn-addons/templates/loadtest.yaml @@ -2,9 +2,9 @@ apiVersion: batch/v1 kind: CronJob metadata: - name: {{ include "testnet-addons.fullname" . }}-load-test + name: {{ include "pfn-addons.fullname" . }}-load-test labels: - {{- include "testnet-addons.labels" . | nindent 4 }} + {{- include "pfn-addons.labels" . | nindent 4 }} app.kubernetes.io/name: load-test spec: concurrencyPolicy: Replace @@ -14,11 +14,11 @@ spec: template: metadata: labels: - {{- include "testnet-addons.selectorLabels" . | nindent 12 }} + {{- include "pfn-addons.selectorLabels" . 
| nindent 12 }} app.kubernetes.io/name: load-test spec: restartPolicy: Never - priorityClassName: {{ include "testnet-addons.fullname" . }}-high + priorityClassName: {{ include "pfn-addons.fullname" . }}-high containers: - name: load-test image: {{ .Values.load_test.image.repo }}:{{ .Values.load_test.image.tag | default .Values.imageTag }} @@ -27,7 +27,7 @@ spec: - aptos-transaction-emitter - emit-tx - --mint-key={{ .Values.load_test.config.mint_key }} - - --chain-id={{ .Values.genesis.chain_id }} + - --chain-id={{ .Values.load_test.config.chain_id }} # Build targets args for internal cluster targets {{- $numTargets := 0 }} {{- $targetSuffix := "" }} @@ -115,7 +115,7 @@ spec: # - name: net.ipv4.tcp_tw_reuse # value: "1" {{- end }} - serviceAccountName: {{ include "testnet-addons.serviceAccountName" . }} + serviceAccountName: {{ include "pfn-addons.serviceAccountName" . }} {{- if .Values.imagePullSecret }} imagePullSecrets: - name: {{.Values.imagePullSecret}} diff --git a/terraform/helm/pfn-addons/templates/service.yaml b/terraform/helm/pfn-addons/templates/service.yaml index eee24370f4cb4..4f555ad9f4706 100644 --- a/terraform/helm/pfn-addons/templates/service.yaml +++ b/terraform/helm/pfn-addons/templates/service.yaml @@ -5,7 +5,16 @@ metadata: labels: {{- include "pfn-addons.labels" . | nindent 4 }} annotations: + {{- if eq .Values.ingress.class "alb" }} alb.ingress.kubernetes.io/healthcheck-path: /v1/-/healthy + {{- end }} + {{- if eq .Values.ingress.class "gce" }} + {{- if .Values.ingress.backend_http2 }} + cloud.google.com/app-protocols: '{"default": "HTTP2"}' + {{- end }} + cloud.google.com/backend-config: '{"default":"{{ include "pfn-addons.fullname" . 
}}"}' + cloud.google.com/neg: '{"ingress": true}' + {{- end }} spec: selector: app.kubernetes.io/part-of: aptos-fullnode @@ -15,3 +24,30 @@ spec: targetPort: 8080 type: NodePort externalTrafficPolicy: Local +--- +{{- if eq .Values.ingress.class "gce" }} +apiVersion: cloud.google.com/v1 +kind: BackendConfig +metadata: + name: {{ include "pfn-addons.fullname" . }} +spec: + {{- if .Values.ingress.gce_security_policy }} + securityPolicy: + name: {{ .Values.ingress.gce_security_policy }} + {{- end }} + healthCheck: + checkIntervalSec: 30 + timeoutSec: 5 + healthyThreshold: 1 + unhealthyThreshold: 2 + type: HTTP + requestPath: /v1/-/healthy + # container targetPort + port: 8080 + {{- if .Values.ingress.enableStickyness }} + sessionAffinity: + affinityType: "GENERATED_COOKIE" + affinityCookieTtlSec: {{ .Values.ingress.cookieDurationSeconds }} + {{- end }} +--- +{{- end }} diff --git a/terraform/helm/node-health-checker/templates/serviceaccount.yaml b/terraform/helm/pfn-addons/templates/serviceaccount.yaml similarity index 60% rename from terraform/helm/node-health-checker/templates/serviceaccount.yaml rename to terraform/helm/pfn-addons/templates/serviceaccount.yaml index debfd370f2811..dae86c014e77a 100644 --- a/terraform/helm/node-health-checker/templates/serviceaccount.yaml +++ b/terraform/helm/pfn-addons/templates/serviceaccount.yaml @@ -2,9 +2,9 @@ apiVersion: v1 kind: ServiceAccount metadata: - name: {{ include "node-health-checker.serviceAccountName" . }} + name: {{ include "pfn-addons.serviceAccountName" . }} labels: -{{ include "node-health-checker.labels" . | nindent 4 }} +{{ include "pfn-addons.labels" . 
| nindent 4 }} annotations: {{- toYaml .Values.serviceAccount.annotations | nindent 4 }} {{- end -}} diff --git a/terraform/helm/pfn-addons/values.yaml b/terraform/helm/pfn-addons/values.yaml index 14a6d0d6e4d03..7aa23fe82899d 100644 --- a/terraform/helm/pfn-addons/values.yaml +++ b/terraform/helm/pfn-addons/values.yaml @@ -7,6 +7,13 @@ service: domain: aws_tags: +serviceAccount: + # -- Specifies whether a service account should be created + create: true + # -- The name of the service account to use. If not set and create is true, a name is generated using the fullname template + name: + annotations: + ingress: class: alb # the below only work for alb ingress @@ -18,3 +25,55 @@ ingress: # the below only work for gce ingress gce_managed_certificate: gce_managed_certificate_domains: + # -- Security policy to apply to the backend services behind the ingress + gce_security_policy: + # -- Enable HTTP/2 on the backends shards + backend_http2: false + +load_test: + # -- Whether to enable the load test CronJob + enabled: false + image: + # -- Image repo to use for tools image for running load tests + repo: aptoslabs/tools + # -- Image tag to use for tools image + tag: + # -- Image pull policy to use for tools image + pullPolicy: IfNotPresent + resources: + limits: + cpu: 4 + memory: 4Gi + requests: + cpu: 4 + memory: 4Gi + nodeSelector: {} + tolerations: [] + affinity: {} + # -- How many minutes between load test runs + intervalMins: 15 + # -- The fullnode groups to target + fullnode: + groups: + - name: fullnode + config: + # -- The number of fullnode groups to run traffic against + numFullnodeGroups: + # -- The private key used to mint to fund load test + mint_key: + # -- Number of transactions outstanding in mempool + mempool_backlog: 5000 + # -- Whether to target a constant TPS, or 0 if not used. Cannot be used with mempool_backlog. 
+ target_tps: 0 + # -- How long to emit transactions for + duration: 300 + # -- How long to wait for transactions to be expired + txn_expiration_time_secs: 30 + # -- Whether to submit transactions through validator REST API + use_validators: false + # -- If true, run $numFullnodeGroups parallel load tests + use_pfns: true + # -- Default 20k * $duration + expected_max_txns: 6000000 + max_transactions_per_account: 5 + transaction_type: coin-transfer diff --git a/terraform/helm/testnet-addons/Chart.yaml b/terraform/helm/testnet-addons/Chart.yaml index 4aa559546d831..ee84f7ce7decd 100644 --- a/terraform/helm/testnet-addons/Chart.yaml +++ b/terraform/helm/testnet-addons/Chart.yaml @@ -1,8 +1,8 @@ apiVersion: v2 name: testnet-addons -version: 0.1 +version: "0.1" appVersion: 0.1.0 description: Additional components for aptos-nodes testnet home: https://aptoslabs.com/ sources: -- https://github.com/aptos-labs/aptos-core + - https://github.com/aptos-labs/aptos-core diff --git a/terraform/helm/testnet-addons/README.md b/terraform/helm/testnet-addons/README.md index 2e898549d11a2..242b66fbc2702 100644 --- a/terraform/helm/testnet-addons/README.md +++ b/terraform/helm/testnet-addons/README.md @@ -23,29 +23,10 @@ Additional components for aptos-nodes testnet | ingress.cookieDurationSeconds | int | `86400` | If stickiness is enabled, how long the session cookie should last | | ingress.enableStickyness | bool | `true` | Whether to enable session stickiness on the underlying load balancer | | ingress.gce_managed_certificate | string | `nil` | The GCE certificate to install on the ingress | +| ingress.gce_security_policy | string | `nil` | Security policy to apply to the backend services behind the ingress | | ingress.gce_static_ip | string | `nil` | The GCE static IP to install on the ingress | | ingress.loadBalancerSourceRanges | string | `nil` | List of CIDRs to accept traffic from | | ingress.wafAclArn | string | `nil` | The ARN of the WAF ACL to install on the ingress | -| 
load_test.affinity | object | `{}` | | -| load_test.config.duration | int | `300` | How long to emit transactions for | -| load_test.config.mempool_backlog | int | `5000` | Number of transactions outstanding in mempool | -| load_test.config.mint_key | string | `nil` | The private key used to mint to fund load test | -| load_test.config.numFullnodeGroups | string | `nil` | The number of fullnode groups to run traffic against | -| load_test.config.target_tps | int | `0` | Whether to target a constant TPS, or 0 if not used. Cannot be used with mempool_backlog. | -| load_test.config.txn_expiration_time_secs | int | `30` | How long to wait for transactions to be expired | -| load_test.config.use_validators | bool | `false` | Whether to submit transactions through validator REST API | -| load_test.enabled | bool | `false` | Whether to enable the load test CronJob | -| load_test.fullnode | object | `{"groups":[{"name":"fullnode"}]}` | The fullnode groups to target | -| load_test.image.pullPolicy | string | `"IfNotPresent"` | Image pull policy to use for tools image | -| load_test.image.repo | string | `"aptoslabs/tools"` | Image repo to use for tools image for running load tests | -| load_test.image.tag | string | `nil` | Image tag to use for tools image | -| load_test.intervalMins | int | `15` | How many minutes between load test runs | -| load_test.nodeSelector | object | `{}` | | -| load_test.resources.limits.cpu | int | `1` | | -| load_test.resources.limits.memory | string | `"512Mi"` | | -| load_test.resources.requests.cpu | int | `1` | | -| load_test.resources.requests.memory | string | `"512Mi"` | | -| load_test.tolerations | list | `[]` | | | service.domain | string | `nil` | If set, the base domain name to use for External DNS | | serviceAccount.create | bool | `true` | Specifies whether a service account should be created | | serviceAccount.name | string | `nil` | The name of the service account to use. 
If not set and create is true, a name is generated using the fullname template | diff --git a/terraform/helm/testnet-addons/templates/ingress.yaml b/terraform/helm/testnet-addons/templates/ingress.yaml index 632a9fc2fe545..46e6113d407af 100644 --- a/terraform/helm/testnet-addons/templates/ingress.yaml +++ b/terraform/helm/testnet-addons/templates/ingress.yaml @@ -36,23 +36,13 @@ metadata: # Allow HTTP but always return 301 because we have redirectToHttps enabled kubernetes.io/ingress.allow-http: "true" kubernetes.io/ingress.global-static-ip-name: {{ .Values.ingress.gce_static_ip }} + {{- if .Values.ingress.gce_managed_certificate }} networking.gke.io/managed-certificates: {{ .Values.ingress.gce_managed_certificate }} networking.gke.io/v1beta1.FrontendConfig: {{ include "testnet-addons.fullname" . }} + {{- end }} # ingress.gce_managed_certificate {{- end }} # "GKE" spec: rules: - {{- if .Values.service.domain }} - - host: api.{{ .Values.service.domain }} - http: - paths: - - path: / - pathType: Prefix - backend: - service: - name: {{ include "testnet-addons.fullname" . }}-api - port: - number: 80 - {{- end }} - host: {{ .Values.service.domain }} http: paths: @@ -78,7 +68,7 @@ spec: port: number: 80 --- -{{- if eq .Values.cloud "GKE" }} +{{- if .Values.ingress.gce_managed_certificate }} apiVersion: networking.gke.io/v1beta1 kind: FrontendConfig metadata: @@ -94,6 +84,5 @@ metadata: spec: domains: - {{ .Values.service.domain }} - - api.{{ .Values.service.domain }} --- {{- end }} diff --git a/terraform/helm/testnet-addons/templates/service.yaml b/terraform/helm/testnet-addons/templates/service.yaml index 74416c999ce53..a300e915f736f 100644 --- a/terraform/helm/testnet-addons/templates/service.yaml +++ b/terraform/helm/testnet-addons/templates/service.yaml @@ -30,6 +30,10 @@ metadata: name: {{ include "testnet-addons.fullname" . 
}}-api namespace: default spec: + {{- if .Values.ingress.gce_security_policy }} + securityPolicy: + name: {{ .Values.ingress.gce_security_policy }} + {{- end }} healthCheck: checkIntervalSec: 30 timeoutSec: 5 diff --git a/terraform/helm/testnet-addons/templates/waypoint.yaml b/terraform/helm/testnet-addons/templates/waypoint.yaml index 154fa3f986ed5..792f0ad1a3b9b 100644 --- a/terraform/helm/testnet-addons/templates/waypoint.yaml +++ b/terraform/helm/testnet-addons/templates/waypoint.yaml @@ -29,6 +29,10 @@ metadata: name: {{ include "testnet-addons.fullname" . }}-waypoint namespace: default spec: + {{- if .Values.ingress.gce_security_policy }} + securityPolicy: + name: {{ .Values.ingress.gce_security_policy }} + {{- end }} healthCheck: checkIntervalSec: 30 timeoutSec: 5 diff --git a/terraform/helm/testnet-addons/values.yaml b/terraform/helm/testnet-addons/values.yaml index bd5d5b9900935..93800823784bb 100644 --- a/terraform/helm/testnet-addons/values.yaml +++ b/terraform/helm/testnet-addons/values.yaml @@ -28,54 +28,6 @@ waypoint: cpu: 200m memory: 512Mi -load_test: - # -- Whether to enable the load test CronJob - enabled: false - image: - # -- Image repo to use for tools image for running load tests - repo: aptoslabs/tools - # -- Image tag to use for tools image - tag: - # -- Image pull policy to use for tools image - pullPolicy: IfNotPresent - resources: - limits: - cpu: 1 - memory: 512Mi - requests: - cpu: 1 - memory: 512Mi - nodeSelector: {} - tolerations: [] - affinity: {} - # -- How many minutes between load test runs - intervalMins: 15 - # -- The fullnode groups to target - fullnode: - groups: - - name: fullnode - config: - # -- The number of fullnode groups to run traffic against - numFullnodeGroups: - # -- The private key used to mint to fund load test - mint_key: - # -- Number of transactions outstanding in mempool - mempool_backlog: 5000 - # -- Whether to target a constant TPS, or 0 if not used. Cannot be used with mempool_backlog. 
- target_tps: 0 - # -- How long to emit transactions for - duration: 300 - # -- How long to wait for transactions to be expired - txn_expiration_time_secs: 30 - # -- Whether to submit transactions through validator REST API - use_validators: false - # -- If true, run $numFullnodeGroups parallel load tests - use_pfns: true - # -- Default 20k * $duration - expected_max_txns: 6000000 - max_transactions_per_account: 5 - transaction_type: coin-transfer - serviceAccount: # -- Specifies whether a service account should be created create: true @@ -93,6 +45,8 @@ ingress: gce_static_ip: # -- The GCE certificate to install on the ingress gce_managed_certificate: + # -- Security policy to apply to the backend services behind the ingress + gce_security_policy: # -- The ARN of the WAF ACL to install on the ingress wafAclArn: # -- List of CIDRs to accept traffic from diff --git a/terraform/helm/vector-log-agent/Chart.yaml b/terraform/helm/vector-log-agent/Chart.yaml index d06993717e7e5..2fae2ad2f9393 100644 --- a/terraform/helm/vector-log-agent/Chart.yaml +++ b/terraform/helm/vector-log-agent/Chart.yaml @@ -1,3 +1,3 @@ apiVersion: v2 name: aptos-vector-log-agent -version: 0.1.0 +version: 0.2.0 diff --git a/terraform/helm/vector-log-agent/files/vector-config.yaml b/terraform/helm/vector-log-agent/files/vector-config.yaml index 4f1e580bc8b49..a7374a4ed994d 100644 --- a/terraform/helm/vector-log-agent/files/vector-config.yaml +++ b/terraform/helm/vector-log-agent/files/vector-config.yaml @@ -1,6 +1,5 @@ data_dir: /vector-data-dir -# TODO: change this to expire_metrics_sec after vector 0.25 has been released. 
-expire_metrics: { secs: 1800, nanos: 0 } # expire metrics when no sample has been received after 30 minutes +expire_metrics_secs: 1800 # expire metrics when no sample has been received after 30 minutes api: enabled: true address: "127.0.0.1:8686" diff --git a/terraform/helm/vector-log-agent/files/vector-transforms.yaml b/terraform/helm/vector-log-agent/files/vector-transforms.yaml index 3f4b5185eea39..69b42e84934f9 100644 --- a/terraform/helm/vector-log-agent/files/vector-transforms.yaml +++ b/terraform/helm/vector-log-agent/files/vector-transforms.yaml @@ -36,7 +36,6 @@ transforms: del(.k8s.annotations."kubectl.kubernetes.io/last-applied-configuration") del(.k8s.annotations."seccomp.security.alpha.kubernetes.io/pod") del(.k8s.annotations."checksum/validator.yaml") - del(.k8s.annotations."kubernetes.io/psp") del(.k8s.labels."app.kubernetes.io/managed-by") del(.k8s.labels."app.kubernetes.io/part-of") @@ -75,7 +74,7 @@ transforms: if !exists(.message) && exists(.msg) { .message = del(.msg) } - parsed_timestamp, err = to_timestamp(.timestamp) + parsed_timestamp, err = parse_timestamp(.timestamp, "%+") # parse as ISO 8601 / RFC 3339 according to https://github.com/vectordotdev/vrl/blob/650547870a16c66dcfab01ec382cfdc23415d85b/lib/core/src/conversion.rs#L249C6-L249C8 if err == null { .timestamp = parsed_timestamp } diff --git a/terraform/helm/vector-log-agent/testing/test1.json b/terraform/helm/vector-log-agent/testing/test1.json index 5732b04282174..4aa13039a63bc 100644 --- a/terraform/helm/vector-log-agent/testing/test1.json +++ b/terraform/helm/vector-log-agent/testing/test1.json @@ -26,7 +26,6 @@ "annotations": { "kubectl.kubernetes.io/last-applied-configuration": "{\"some_very_long_json\":\"foo_bar\"}", "checksum/validator.yaml": "8430318f1be488c63b67a5041443bd9b70be34179068a89b42f52a1118f850e2", - "kubernetes.io/psp": "aptos-node", "seccomp.security.alpha.kubernetes.io/pod": "runtime/default" }, "pod_ip": "192.168.130.132", @@ -48,4 +47,4 @@ "message": 
"2022-07-24T03:39:54.744745Z [consensus] INFO consensus/src/round_manager.rs:314 Local state SyncInfo[certified_round: 966, ordered_round: 965, timeout round: 0, commit_info: BlockInfo: [epoch: 2, round: 963, id: 6d4833ab, executed_state_id: 81472687, version: 1380763, timestamp (us): 1658633992630447, next_epoch_state: None]], remote state SyncInfo[certified_round: 967, ordered_round: 966, timeout round: 0, commit_info: BlockInfo: [epoch: 2, round: 963, id: 6d4833ab, executed_state_id: 81472687, version: 1380763, timestamp (us): 1658633992630447, next_epoch_state: None]] {\"epoch\":2,\"event\":\"ReceiveNewCertificate\",\"remote_peer\":\"065f70c398566ebd3b806cbd11f7e86dd8e39d9616f2bb45a1bda1a0748c7c88\",\"round\":967}", "source_type": "kubernetes_logs", "stream": "stderr" -} \ No newline at end of file +} diff --git a/terraform/helm/vector-log-agent/values.yaml b/terraform/helm/vector-log-agent/values.yaml index 9e330245c7ef8..9b5db1308a616 100644 --- a/terraform/helm/vector-log-agent/values.yaml +++ b/terraform/helm/vector-log-agent/values.yaml @@ -1,7 +1,7 @@ image: repository: timberio/vector pullPolicy: IfNotPresent - tag: "0.25.X-distroless-libc" + tag: "0.34.X-distroless-libc" # -- Choose any (you can choose multiple) logging sinks supported by vector as found here https://vector.dev/docs/reference/configuration/sinks/ diff --git a/terraform/modules/eks/cluster.tf b/terraform/modules/eks/cluster.tf index 0657f3120abca..ea6d2f864fa00 100644 --- a/terraform/modules/eks/cluster.tf +++ b/terraform/modules/eks/cluster.tf @@ -12,7 +12,7 @@ resource "aws_eks_cluster" "aptos" { tags = local.default_tags vpc_config { - subnet_ids = concat(aws_subnet.public.*.id, aws_subnet.private.*.id) + subnet_ids = concat(aws_subnet.public[*].id, aws_subnet.private[*].id) public_access_cidrs = var.k8s_api_sources endpoint_private_access = true security_group_ids = [aws_security_group.cluster.id] @@ -218,11 +218,17 @@ data "aws_iam_policy_document" "cluster-autoscaler" { statement { 
sid = "DescribeAutoscaling" actions = [ - "autoscaling:DescribeAutoScalingInstances", + "autoscaling:DescribeLaunchConfigurations", "autoscaling:DescribeAutoScalingGroups", - "ec2:DescribeLaunchTemplateVersions", + "autoscaling:DescribeAutoScalingInstances", + "autoscaling:DescribeLaunchConfigurations", + "autoscaling:DescribeScalingActivities", "autoscaling:DescribeTags", - "autoscaling:DescribeLaunchConfigurations" + "ec2:DescribeInstanceTypes", + "ec2:DescribeLaunchTemplateVersions", + "ec2:DescribeImages", + "ec2:GetInstanceTypesFromInstanceRequirements", + "eks:DescribeNodegroup" ] resources = ["*"] } @@ -275,4 +281,4 @@ resource "helm_release" "autoscaling" { name = "chart_sha1" value = sha1(join("", [for f in fileset(local.autoscaling_helm_chart_path, "**") : filesha1("${local.autoscaling_helm_chart_path}/${f}")])) } -} \ No newline at end of file +} diff --git a/terraform/modules/eks/kubernetes.tf b/terraform/modules/eks/kubernetes.tf index 8a7d7f257c012..5de01b8e2875f 100644 --- a/terraform/modules/eks/kubernetes.tf +++ b/terraform/modules/eks/kubernetes.tf @@ -1,6 +1,6 @@ provider "kubernetes" { host = aws_eks_cluster.aptos.endpoint - cluster_ca_certificate = base64decode(aws_eks_cluster.aptos.certificate_authority.0.data) + cluster_ca_certificate = base64decode(aws_eks_cluster.aptos.certificate_authority[0].data) token = data.aws_eks_cluster_auth.aptos.token } @@ -72,71 +72,18 @@ resource "kubernetes_storage_class" "gp2" { depends_on = [null_resource.delete-gp2] } -# FIXME: Remove when migrating to K8s 1.25 -resource "kubernetes_role_binding" "psp-kube-system" { - metadata { - name = "eks:podsecuritypolicy:privileged" - namespace = "kube-system" - } - - role_ref { - api_group = "rbac.authorization.k8s.io" - kind = "ClusterRole" - name = "eks:podsecuritypolicy:privileged" - } - - subject { - api_group = "rbac.authorization.k8s.io" - kind = "Group" - name = "system:serviceaccounts:kube-system" - } -} - locals { kubeconfig = 
"/tmp/kube.config.${md5(timestamp())}" } -# FIXME: Remove when migrating to K8s 1.25 -resource "null_resource" "delete-psp-authenticated" { - provisioner "local-exec" { - command = <<-EOT - aws --region ${var.region} eks update-kubeconfig --name ${aws_eks_cluster.aptos.name} --kubeconfig ${local.kubeconfig} && - kubectl --kubeconfig ${local.kubeconfig} delete --ignore-not-found clusterrolebinding eks:podsecuritypolicy:authenticated - EOT - } - - depends_on = [kubernetes_role_binding.psp-kube-system] -} - provider "helm" { kubernetes { host = aws_eks_cluster.aptos.endpoint - cluster_ca_certificate = base64decode(aws_eks_cluster.aptos.certificate_authority.0.data) + cluster_ca_certificate = base64decode(aws_eks_cluster.aptos.certificate_authority[0].data) token = data.aws_eks_cluster_auth.aptos.token } } -resource "kubernetes_namespace" "tigera-operator" { - metadata { - annotations = { - name = "tigera-operator" - } - - name = "tigera-operator" - } -} - -resource "helm_release" "calico" { - name = "calico" - repository = "https://docs.projectcalico.org/charts" - chart = "tigera-operator" - version = "3.23.3" - namespace = "tigera-operator" - depends_on = [ - kubernetes_namespace.tigera-operator - ] -} - resource "kubernetes_cluster_role" "debug" { metadata { name = "debug" @@ -247,7 +194,7 @@ resource "local_file" "kubernetes" { filename = "${local.workspace_name}-kubernetes.json" content = jsonencode({ kubernetes_host = aws_eks_cluster.aptos.endpoint - kubernetes_ca_cert = base64decode(aws_eks_cluster.aptos.certificate_authority.0.data) + kubernetes_ca_cert = base64decode(aws_eks_cluster.aptos.certificate_authority[0].data) issuer = aws_eks_cluster.aptos.identity[0].oidc[0].issuer service_account_prefix = "aptos-pfn" pod_cidrs = aws_subnet.private[*].cidr_block diff --git a/terraform/modules/eks/network.tf b/terraform/modules/eks/network.tf index 89983b601f27a..e42f1218aa8da 100644 --- a/terraform/modules/eks/network.tf +++ b/terraform/modules/eks/network.tf @@ 
-45,7 +45,7 @@ resource "aws_route_table" "public" { resource "aws_route_table_association" "public" { count = length(local.aws_availability_zones) - subnet_id = element(aws_subnet.public.*.id, count.index) + subnet_id = element(aws_subnet.public[*].id, count.index) route_table_id = aws_route_table.public.id } @@ -91,7 +91,7 @@ resource "aws_route_table" "private" { resource "aws_route_table_association" "private" { count = length(local.aws_availability_zones) - subnet_id = element(aws_subnet.private.*.id, count.index) + subnet_id = element(aws_subnet.private[*].id, count.index) route_table_id = aws_route_table.private.id } diff --git a/terraform/modules/eks/variables.tf b/terraform/modules/eks/variables.tf index 32572bf2b95bd..cd6ee985b67c0 100644 --- a/terraform/modules/eks/variables.tf +++ b/terraform/modules/eks/variables.tf @@ -5,7 +5,8 @@ variable "region" { variable "kubernetes_version" { description = "Version of Kubernetes to use for EKS cluster" - default = "1.22" + type = string + default = "1.27" } variable "eks_cluster_name" { @@ -15,6 +16,7 @@ variable "eks_cluster_name" { variable "k8s_api_sources" { description = "List of CIDR subnets which can access the Kubernetes API endpoint" + type = list(string) default = ["0.0.0.0/0"] } @@ -55,32 +57,37 @@ variable "k8s_debugger_roles" { } variable "iam_path" { - default = "/" description = "Path to use when naming IAM objects" + type = string + default = "/" } variable "permissions_boundary_policy" { - default = "" description = "ARN of IAM policy to set as permissions boundary on created roles" + type = string } variable "vpc_cidr_block" { - default = "192.168.0.0/16" description = "VPC CIDR Block" + type = string + default = "192.168.0.0/16" } variable "utility_instance_type" { description = "Instance type used for utilities" + type = string default = "t3.medium" } variable "fullnode_instance_type" { description = "Instance type used for validator and fullnodes" + type = string default = "c6i.8xlarge" } 
variable "num_fullnodes" { description = "Number of fullnodes to deploy" + type = number default = 1 } @@ -92,10 +99,12 @@ variable "node_pool_sizes" { variable "workspace_name_override" { description = "If specified, overrides the usage of Terraform workspace for naming purposes" + type = string default = "" } variable "num_extra_instance" { - default = 0 description = "Number of extra instances to add into node pool" + type = number + default = 0 } diff --git a/terraform/modules/eks/versions.tf b/terraform/modules/eks/versions.tf index a2b7631af994b..db37affd89991 100644 --- a/terraform/modules/eks/versions.tf +++ b/terraform/modules/eks/versions.tf @@ -1,5 +1,5 @@ terraform { - required_version = "~> 1.3.6" + required_version = "~> 1.5.6" required_providers { aws = { source = "hashicorp/aws" diff --git a/terraform/modules/resources/instance.tf b/terraform/modules/resources/instance.tf new file mode 100644 index 0000000000000..120dc71dae119 --- /dev/null +++ b/terraform/modules/resources/instance.tf @@ -0,0 +1,159 @@ +### Inputs + +variable "instance_type" { + description = "The instance type" + type = string + default = "" + + validation { + condition = can(regex("^(e2|n2d|t2d)-standard-(4|8|16|32|48|60)$", var.instance_type)) + error_message = "Unknown machine type" + } +} + +variable "utility_instance_type" { + description = "The utilities instance type" + type = string + default = "e2-standard-8" + + validation { + condition = can(regex("^(e2|n2d|t2d)-standard-(4|8|16|32|48|60)$", var.utility_instance_type)) + error_message = "Unknown machine type" + } +} + +variable "max_instances" { + description = "The maximum number of instances" + type = number + default = 100 +} + +variable "app_service" { + description = "Application service labeled using app.kubernetes.io/part-of" + type = string + default = "" +} + +### Computation + +locals { + machine_family = split("-", var.instance_type)[0] + utility_machine_family = split("-", var.utility_instance_type)[0] + 
machine_shapes = { + "t2d-standard-8" = { cores = 8, memory = 32 } + "t2d-standard-16" = { cores = 16, memory = 64 } + "t2d-standard-32" = { cores = 32, memory = 128 } + "t2d-standard-48" = { cores = 48, memory = 192 } + "t2d-standard-60" = { cores = 60, memory = 240 } + } + # leave 1 core for the system + available_cores = local.machine_shapes[var.instance_type].cores - 1 + # leave 4 GB for the system + available_memory = local.machine_shapes[var.instance_type].memory - 4 + + node_affinity = { + podAntiAffinity = { # don't schedule nodes on the same host + requiredDuringSchedulingIgnoredDuringExecution = [ + { + labelSelector = { + matchExpressions = [ + { + key = "app.kubernetes.io/part-of", + operator = "In", + values = [var.app_service] + } + ] + } + topologyKey = "kubernetes.io/hostname" + } + ] + } + nodeAffinity = { # affinity for the right instance types + requiredDuringSchedulingIgnoredDuringExecution = { + nodeSelectorTerms = [ + { + matchExpressions = [ + { + key = "cloud.google.com/machine-family", + operator = "In", + values = [local.machine_family], + } + ] + } + ] + } + } + } + + utility_affinity = { + podAntiAffinity = { # don't schedule nodes on the same host + requiredDuringSchedulingIgnoredDuringExecution = [ + { + labelSelector = { + matchExpressions = [ + { + key = "app.kubernetes.io/part-of", + operator = "In", + values = [var.app_service] + } + ] + } + topologyKey = "kubernetes.io/hostname" + } + ] + } + nodeAffinity = { # affinity for the right instance types + requiredDuringSchedulingIgnoredDuringExecution = { + nodeSelectorTerms = [ + { + matchExpressions = [ + { + key = "cloud.google.com/machine-family", + operator = "In", + values = [local.utility_machine_family], + } + ] + } + ] + } + } + } +} + +### Outputs + +output "resources" { + description = "Resources for the instance" + value = { + limits = { + cpu = local.available_cores + memory = "${local.available_memory}G" + ephemeral-storage = "5Gi" + } + requests = { + cpu = 
local.available_cores + memory = "${local.available_memory}G" + ephemeral-storage = "5Gi" + } + } +} + +output "max_cpu" { + description = "Maximum CPU for the Node autoprovisioning" + value = local.machine_shapes[var.instance_type].cores * var.max_instances +} + +output "max_memory" { + description = "Maximum RAM for the Node autoprovisioning" + value = local.machine_shapes[var.instance_type].memory * var.max_instances +} + +output "node_affinity" { + description = "Node affinity for the validator instances" + value = local.node_affinity +} + +output "utility_affinity" { + description = "Node affinity for the utility instances" + value = local.utility_affinity +} diff --git a/terraform/scripts/migrate_cluster_psp_to_pss.sh b/terraform/scripts/migrate_cluster_psp_to_pss.sh index a73b48e99bdd8..40c89a67ed666 100755 --- a/terraform/scripts/migrate_cluster_psp_to_pss.sh +++ b/terraform/scripts/migrate_cluster_psp_to_pss.sh @@ -1,66 +1,66 @@ #!/usr/bin/env bash function msg() { - if [[ ${VERBOSE} == true ]]; then - echo ${@} 2>&1 - fi + if [[ ${VERBOSE} == true ]]; then + echo ${@} 2>&1 + fi } -function disable_psp_ns () { - local _ns=${1} - msg "Disabling PodSecurityPolicy on namespace ${_ns}" - kubectl delete -n "${_ns}" rolebinding disable-psp 2>/dev/null - kubectl create -n "${_ns}" rolebinding disable-psp \ - --clusterrole privileged-psp --group "system:serviceaccounts:${_ns}" +function disable_psp_ns() { + local _ns=${1} + msg "Disabling PodSecurityPolicy on namespace ${_ns}" + kubectl delete -n "${_ns}" rolebinding disable-psp 2> /dev/null + kubectl create -n "${_ns}" rolebinding disable-psp \ + --clusterrole privileged-psp --group "system:serviceaccounts:${_ns}" } function set_pss_label() { - local _ns=${1} - local _policy=${2} - msg "Namespace ${_ns}: setting policy ${_policy}" - kubectl label --overwrite ns "${_ns}" "${_policy}" + local _ns=${1} + local _policy=${2} + msg "Namespace ${_ns}: setting policy ${_policy}" + kubectl label --overwrite ns "${_ns}" 
"${_policy}" } function set_pss_labels_ns() { - local _ns=${1} - set_pss_label "${_ns}" "pod-security.kubernetes.io/enforce=privileged" - set_pss_label "${_ns}" "pod-security.kubernetes.io/enforce-version=${POLICY_VERSION}" - set_pss_label "${_ns}" "pod-security.kubernetes.io/warn=baseline" - set_pss_label "${_ns}" "pod-security.kubernetes.io/warn-version=${POLICY_VERSION}" - set_pss_label "${_ns}" "pod-security.kubernetes.io/audit=baseline" - set_pss_label "${_ns}" "pod-security.kubernetes.io/audit-version=${POLICY_VERSION}" + local _ns=${1} + set_pss_label "${_ns}" "pod-security.kubernetes.io/enforce=privileged" + set_pss_label "${_ns}" "pod-security.kubernetes.io/enforce-version=${POLICY_VERSION}" + set_pss_label "${_ns}" "pod-security.kubernetes.io/warn=baseline" + set_pss_label "${_ns}" "pod-security.kubernetes.io/warn-version=${POLICY_VERSION}" + set_pss_label "${_ns}" "pod-security.kubernetes.io/audit=baseline" + set_pss_label "${_ns}" "pod-security.kubernetes.io/audit-version=${POLICY_VERSION}" } function list_ns() { - kubectl get ns | grep Active | awk '{ print $1 }' + kubectl get ns | grep Active | awk '{ print $1 }' } function migrate() { - msg "Creating resource PodSecurityPolicy/privileged-psp" - local scriptdir=$(dirname $(readlink -f ${0})) - kubectl apply -f "${scriptdir}"/privileged-psp.yaml + msg "Creating resource PodSecurityPolicy/privileged-psp" + local scriptdir=$(dirname $(readlink -f ${0})) + kubectl apply -f "${scriptdir}"/privileged-psp.yaml - msg "Creating role 'privileged-psp'" - kubectl delete clusterrole privileged-psp 2>/dev/null - kubectl create clusterrole privileged-psp \ - --verb use --resource podsecuritypolicies --resource-name privileged-psp + msg "Creating role 'privileged-psp'" + kubectl delete clusterrole privileged-psp 2> /dev/null + kubectl create clusterrole privileged-psp \ + --verb use --resource podsecuritypolicies --resource-name privileged-psp - local _ns - for _ns in $(list_ns); do - disable_psp_ns "${_ns}" - # 
set_pss_labels_ns "${_ns}" "${POLICY_VERSION}" - done - set_pss_labels_ns default "${POLICY_VERSION}" + local _ns + for _ns in $(list_ns); do + disable_psp_ns "${_ns}" + # set_pss_labels_ns "${_ns}" "${POLICY_VERSION}" + done + set_pss_labels_ns default "${POLICY_VERSION}" } function clean() { - msg "Cleaning up PSP resources" - kubectl delete clusterrole privileged-psp 2>/dev/null + msg "Cleaning up PSP resources" + kubectl delete clusterrole privileged-psp 2> /dev/null - local _ns - for _ns in $(list_ns); do - kubectl delete -n "${_ns}" rolebinding disable-psp 2>/dev/null - done + local _ns + for _ns in $(list_ns); do + kubectl delete -n "${_ns}" rolebinding disable-psp 2> /dev/null + done } POLICY_VERSION=v1.24 @@ -70,68 +70,69 @@ cmd="" optspec="h-:" while getopts "$optspec" optchar; do - case "${optchar}" in - -) - case "${OPTARG}" in - debug) - DEBUG=true - set +x - ;; - verbose) - VERBOSE=true - ;; - policy-version=*) - val=${OPTARG#*=} - opt=${OPTARG%=$val} - POLICY_VERSION=${val} - ;; - *) - if [ "$OPTERR" = 1 ] && [ "${optspec:0:1}" != ":" ]; then - echo "Unknown option --${OPTARG}" >&2 - fi - ;; - esac;; + case "${optchar}" in + -) + case "${OPTARG}" in + debug) + DEBUG=true + set +x + ;; + verbose) + VERBOSE=true + ;; + policy-version=*) + val=${OPTARG#*=} + opt=${OPTARG%=$val} + POLICY_VERSION=${val} + ;; *) - echo "Unknown argument: '-${OPTARG}'" >&2 - exit 2 - ;; - esac + if [ "$OPTERR" = 1 ] && [ "${optspec:0:1}" != ":" ]; then + echo "Unknown option --${OPTARG}" >&2 + fi + ;; + esac + ;; + *) + echo "Unknown argument: '-${OPTARG}'" >&2 + exit 2 + ;; + esac done -shift $((OPTIND -1)) +shift $((OPTIND - 1)) case $# in - 0) - cmd="usage" - ;; - 1) - cmd=${1} - ;; - *) - echo "Too many parameters on the command line" >&2 - exit 2 - ;; + 0) + cmd="usage" + ;; + 1) + cmd=${1} + ;; + *) + echo "Too many parameters on the command line" >&2 + exit 2 + ;; esac case ${cmd} in - usage) - echo "Usage: $(basename ${0}) [--verbose] [--debug] [--policy-version=] 
check | migrate | clean" >&2 - echo "Default PSS policy version: ${POLICY_VERSION}" >&2 - exit 1 - ;; - check) - echo "Hint: you can get the list of labels with kubectl get ns --show-labels" - kubectl label --dry-run=server \ - --overwrite ns --all \ - pod-security.kubernetes.io/enforce=baseline - ;; - clean) - clean - ;; - migrate) - migrate - ;; - *) - echo "Unknown command:" ${cmd} - exit 2 - ;; + usage) + echo "Usage: $(basename ${0}) [--verbose] [--debug] [--policy-version=] check | migrate | clean" >&2 + echo "Default PSS policy version: ${POLICY_VERSION}" >&2 + exit 1 + ;; + check) + echo "Hint: you can get the list of labels with kubectl get ns --show-labels" + kubectl label --dry-run=server \ + --overwrite ns --all \ + pod-security.kubernetes.io/enforce=baseline + ;; + clean) + clean + ;; + migrate) + migrate + ;; + *) + echo "Unknown command:" ${cmd} + exit 2 + ;; esac diff --git a/testsuite/fuzzer/fuzz/fuzz_targets/move/aptosvm_publish_and_run.rs b/testsuite/fuzzer/fuzz/fuzz_targets/move/aptosvm_publish_and_run.rs index fa2d25630e362..c808559ce3dfb 100644 --- a/testsuite/fuzzer/fuzz/fuzz_targets/move/aptosvm_publish_and_run.rs +++ b/testsuite/fuzzer/fuzz/fuzz_targets/move/aptosvm_publish_and_run.rs @@ -2,7 +2,9 @@ // Copyright © Aptos Foundation -use aptos_language_e2e_tests::{data_store::GENESIS_CHANGE_SET_HEAD, executor::FakeExecutor}; +use aptos_language_e2e_tests::{ + account::Account, data_store::GENESIS_CHANGE_SET_HEAD, executor::FakeExecutor, +}; use aptos_types::{ chain_id::ChainId, transaction::{ @@ -19,10 +21,9 @@ use move_binary_format::{ file_format::{CompiledModule, CompiledScript, FunctionDefinitionIndex}, }; use move_core_types::{ - account_address::AccountAddress, language_storage::{ModuleId, TypeTag}, value::MoveValue, - vm_status::{StatusType, VMStatus}, + vm_status::{StatusCode, StatusType, VMStatus}, }; use once_cell::sync::Lazy; use std::{ @@ -30,6 +31,70 @@ use std::{ convert::TryInto, }; +#[derive(Debug, Arbitrary, Eq, PartialEq, 
Clone, Copy)] +pub enum FundAmount { + Zero, + Poor, + Rich, +} + +#[derive(Debug, Arbitrary, Eq, PartialEq, Clone, Copy)] +pub struct UserAccount { + is_inited_and_funded: bool, + fund: FundAmount, +} + +#[derive(Debug, Arbitrary, Eq, PartialEq, Clone)] +pub enum Authenticator { + Ed25519 { + sender: UserAccount, + }, + MultiAgent { + sender: UserAccount, + secondary_signers: Vec, + }, + FeePayer { + sender: UserAccount, + secondary_signers: Vec, + fee_payer: UserAccount, + }, +} + +impl UserAccount { + fn fund_amount(&self) -> u64 { + match self.fund { + FundAmount::Zero => 0, + FundAmount::Poor => 1_000, + FundAmount::Rich => 1_000_000_000_000_000, + } + } + + fn convert_account(&self, vm: &mut FakeExecutor) -> Account { + if self.is_inited_and_funded { + vm.create_accounts(1, self.fund_amount(), 0).remove(0) + } else { + Account::new() + } + } +} + +impl Authenticator { + fn sender(&self) -> UserAccount { + match self { + Authenticator::Ed25519 { sender } => *sender, + Authenticator::MultiAgent { + sender, + secondary_signers: _, + } => *sender, + Authenticator::FeePayer { + sender, + secondary_signers: _, + fee_payer: _, + } => *sender, + } + } +} + #[derive(Debug, Arbitrary, Eq, PartialEq, Clone)] pub enum ExecVariant { Script { @@ -49,6 +114,7 @@ pub enum ExecVariant { pub struct RunnableState { pub dep_modules: Vec, pub exec_variant: ExecVariant, + pub tx_auth_type: Authenticator, } // genesis write set generated once for each fuzzing session @@ -217,7 +283,14 @@ fn run_case(mut input: RunnableState) -> Result<(), Corpus> { tdbg!("published"); } - let acc = vm.new_account_at(AccountAddress::from_hex_literal("0xcafe").unwrap()); + let sender_acc = if true { + // create sender pub/priv key. initialize and fund account + vm.create_accounts(1, input.tx_auth_type.sender().fund_amount(), 0) + .remove(0) + } else { + // only create sender pub/priv key. 
do not initialize + Account::new() + }; // build tx let tx = match input.exec_variant.clone() { ExecVariant::Script { @@ -229,7 +302,8 @@ fn run_case(mut input: RunnableState) -> Result<(), Corpus> { script .serialize(&mut script_bytes) .map_err(|_| Corpus::Keep)?; - acc.transaction() + sender_acc + .transaction() .gas_unit_price(100) .max_gas_amount(1000) .sequence_number(0) @@ -270,7 +344,8 @@ fn run_case(mut input: RunnableState) -> Result<(), Corpus> { .ok_or(Corpus::Keep)? .clone(); // } - acc.transaction() + sender_acc + .transaction() .gas_unit_price(100) .max_gas_amount(1000) .sequence_number(0) @@ -282,11 +357,65 @@ fn run_case(mut input: RunnableState) -> Result<(), Corpus> { ))) }, }; - let tx = tx - .raw() - .sign(&acc.privkey, acc.pubkey) - .map_err(|_| Corpus::Keep)? - .into_inner(); + + let raw_tx = tx.raw(); + let tx = match input.tx_auth_type { + Authenticator::Ed25519 { sender: _ } => raw_tx + .sign(&sender_acc.privkey, sender_acc.pubkey) + .map_err(|_| Corpus::Keep)? + .into_inner(), + Authenticator::MultiAgent { + sender: _, + secondary_signers, + } => { + // higher number here slows down fuzzer significantly due to slow signing process. + if secondary_signers.len() > 10 { + return Err(Corpus::Keep); + } + let secondary_accs: Vec<_> = secondary_signers + .iter() + .map(|acc| acc.convert_account(&mut vm)) + .collect(); + let secondary_signers = secondary_accs.iter().map(|acc| *acc.address()).collect(); + let secondary_private_keys = secondary_accs.iter().map(|acc| &acc.privkey).collect(); + raw_tx + .sign_multi_agent( + &sender_acc.privkey, + secondary_signers, + secondary_private_keys, + ) + .map_err(|_| Corpus::Keep)? + .into_inner() + }, + Authenticator::FeePayer { + sender: _, + secondary_signers, + fee_payer, + } => { + // higher number here slows down fuzzer significantly due to slow signing process.
+ if secondary_signers.len() > 10 { + return Err(Corpus::Keep); + } + let secondary_accs: Vec<_> = secondary_signers + .iter() + .map(|acc| acc.convert_account(&mut vm)) + .collect(); + + let secondary_signers = secondary_accs.iter().map(|acc| *acc.address()).collect(); + let secondary_private_keys = secondary_accs.iter().map(|acc| &acc.privkey).collect(); + let fee_payer_acc = fee_payer.convert_account(&mut vm); + raw_tx + .sign_fee_payer( + &sender_acc.privkey, + secondary_signers, + secondary_private_keys, + *fee_payer_acc.address(), + &fee_payer_acc.privkey, + ) + .map_err(|_| Corpus::Keep)? + .into_inner() + }, + }; // exec tx tdbg!("exec start"); @@ -316,13 +445,22 @@ fn run_case(mut input: RunnableState) -> Result<(), Corpus> { // if error exit gracefully let status = match tdbg!(res.status()) { TransactionStatus::Keep(status) => status, + TransactionStatus::Discard(e) => { + if e.status_type() == StatusType::InvariantViolation { + panic!("invariant violation {:?}", e); + } + return Err(Corpus::Keep); + }, _ => return Err(Corpus::Keep), }; match tdbg!(status) { ExecutionStatus::Success => (), ExecutionStatus::MiscellaneousError(e) => { if let Some(e) = e { - if e.status_type() == StatusType::InvariantViolation { + if e.status_type() == StatusType::InvariantViolation + && *e != StatusCode::TYPE_RESOLUTION_FAILURE + && *e != StatusCode::STORAGE_ERROR + { panic!("invariant violation {:?}", e); } } diff --git a/testsuite/generate-format/src/api.rs b/testsuite/generate-format/src/api.rs index 3a6e09a0f3e46..de5a573d417dd 100644 --- a/testsuite/generate-format/src/api.rs +++ b/testsuite/generate-format/src/api.rs @@ -2,6 +2,7 @@ // SPDX-License-Identifier: Apache-2.0 use aptos_crypto::{ + bls12381, ed25519::{Ed25519PrivateKey, Ed25519PublicKey}, hash::{CryptoHasher as _, TestOnlyHasher}, multi_ed25519::{MultiEd25519PublicKey, MultiEd25519Signature}, @@ -72,6 +73,13 @@ fn trace_crypto_values(tracer: &mut Tracer, samples: &mut Samples) -> Result<()> 
tracer.trace_value(samples, &secp256r1_ecdsa_public_key)?; tracer.trace_value(samples, &secp256r1_ecdsa_signature)?; + let bls12381_private_key = bls12381::PrivateKey::generate(&mut rng); + let bls12381_public_key = bls12381::PublicKey::from(&bls12381_private_key); + let bls12381_signature = bls12381_private_key.sign(&message).unwrap(); + tracer.trace_value(samples, &bls12381_private_key)?; + tracer.trace_value(samples, &bls12381_public_key)?; + tracer.trace_value(samples, &bls12381_signature)?; + Ok(()) } diff --git a/testsuite/generate-format/src/aptos.rs b/testsuite/generate-format/src/aptos.rs index 74c5573c8bc31..9619c96ca9a90 100644 --- a/testsuite/generate-format/src/aptos.rs +++ b/testsuite/generate-format/src/aptos.rs @@ -3,6 +3,7 @@ // SPDX-License-Identifier: Apache-2.0 use aptos_crypto::{ + bls12381, ed25519::{Ed25519PrivateKey, Ed25519PublicKey}, hash::{CryptoHasher as _, TestOnlyHasher}, multi_ed25519::{MultiEd25519PublicKey, MultiEd25519Signature}, @@ -69,6 +70,13 @@ fn trace_crypto_values(tracer: &mut Tracer, samples: &mut Samples) -> Result<()> tracer.trace_value(samples, &secp256r1_ecdsa_public_key)?; tracer.trace_value(samples, &secp256r1_ecdsa_signature)?; + let bls12381_private_key = bls12381::PrivateKey::generate(&mut rng); + let bls12381_public_key = bls12381::PublicKey::from(&bls12381_private_key); + let bls12381_signature = bls12381_private_key.sign(&message).unwrap(); + tracer.trace_value(samples, &bls12381_private_key)?; + tracer.trace_value(samples, &bls12381_public_key)?; + tracer.trace_value(samples, &bls12381_signature)?; + Ok(()) } diff --git a/testsuite/generate-format/tests/staged/api.yaml b/testsuite/generate-format/tests/staged/api.yaml index 2ac72adbaa8ba..063d1bdaafc7b 100644 --- a/testsuite/generate-format/tests/staged/api.yaml +++ b/testsuite/generate-format/tests/staged/api.yaml @@ -47,6 +47,17 @@ AccountAuthenticator: STRUCT: - authenticator: TYPENAME: MultiKeyAuthenticator +AggregateSignature: + STRUCT: + - 
validator_bitmask: + TYPENAME: BitVec + - sig: + OPTION: + TYPENAME: Signature +Any: + STRUCT: + - type_name: STR + - data: BYTES AnyPublicKey: ENUM: 0: @@ -191,10 +202,6 @@ DKGTranscriptMetadata: DepositEvent: STRUCT: - amount: U64 -DummyValidatorTransaction: - STRUCT: - - valid: BOOL - - payload: BYTES Ed25519PublicKey: NEWTYPESTRUCT: BYTES Ed25519Signature: @@ -291,6 +298,10 @@ IdCommitment: SIZE: 32 Identifier: NEWTYPESTRUCT: STR +JWKMoveStruct: + STRUCT: + - variant: + TYPENAME: Any Module: STRUCT: - code: BYTES @@ -373,6 +384,23 @@ Pepper: TUPLEARRAY: CONTENT: U8 SIZE: 31 +PrivateKey: + NEWTYPESTRUCT: BYTES +ProviderJWKs: + STRUCT: + - issuer: BYTES + - version: U64 + - jwks: + SEQ: + TYPENAME: JWKMoveStruct +PublicKey: + NEWTYPESTRUCT: BYTES +QuorumCertifiedUpdate: + STRUCT: + - update: + TYPENAME: ProviderJWKs + - multi_sig: + TYPENAME: AggregateSignature RandMetadata: STRUCT: - metadata_to_sign: @@ -422,6 +450,8 @@ Secp256r1EcdsaPublicKey: NEWTYPESTRUCT: BYTES Secp256r1EcdsaSignature: NEWTYPESTRUCT: BYTES +Signature: + NEWTYPESTRUCT: BYTES SignedGroth16Zkp: STRUCT: - proof: @@ -684,17 +714,13 @@ TypeTag: ValidatorTransaction: ENUM: 0: - DummyTopic1: - NEWTYPE: - TYPENAME: DummyValidatorTransaction - 1: DKGResult: NEWTYPE: TYPENAME: DKGTranscript - 2: - DummyTopic2: + 1: + ObservedJWKUpdate: NEWTYPE: - TYPENAME: DummyValidatorTransaction + TYPENAME: QuorumCertifiedUpdate WithdrawEvent: STRUCT: - amount: U64 diff --git a/testsuite/generate-format/tests/staged/aptos.yaml b/testsuite/generate-format/tests/staged/aptos.yaml index 7a34c8684e073..65ae950be420f 100644 --- a/testsuite/generate-format/tests/staged/aptos.yaml +++ b/testsuite/generate-format/tests/staged/aptos.yaml @@ -35,6 +35,17 @@ AccountAuthenticator: STRUCT: - authenticator: TYPENAME: MultiKeyAuthenticator +AggregateSignature: + STRUCT: + - validator_bitmask: + TYPENAME: BitVec + - sig: + OPTION: + TYPENAME: Signature +Any: + STRUCT: + - type_name: STR + - data: BYTES AnyPublicKey: ENUM: 0: @@ 
-168,10 +179,6 @@ DKGTranscriptMetadata: - epoch: U64 - author: TYPENAME: AccountAddress -DummyValidatorTransaction: - STRUCT: - - valid: BOOL - - payload: BYTES Ed25519PublicKey: NEWTYPESTRUCT: BYTES Ed25519Signature: @@ -237,6 +244,10 @@ IdCommitment: SIZE: 32 Identifier: NEWTYPESTRUCT: STR +JWKMoveStruct: + STRUCT: + - variant: + TYPENAME: Any Module: STRUCT: - code: BYTES @@ -305,6 +316,23 @@ Pepper: TUPLEARRAY: CONTENT: U8 SIZE: 31 +PrivateKey: + NEWTYPESTRUCT: BYTES +ProviderJWKs: + STRUCT: + - issuer: BYTES + - version: U64 + - jwks: + SEQ: + TYPENAME: JWKMoveStruct +PublicKey: + NEWTYPESTRUCT: BYTES +QuorumCertifiedUpdate: + STRUCT: + - update: + TYPENAME: ProviderJWKs + - multi_sig: + TYPENAME: AggregateSignature RandMetadata: STRUCT: - metadata_to_sign: @@ -354,6 +382,8 @@ Secp256r1EcdsaPublicKey: NEWTYPESTRUCT: BYTES Secp256r1EcdsaSignature: NEWTYPESTRUCT: BYTES +Signature: + NEWTYPESTRUCT: BYTES SignedGroth16Zkp: STRUCT: - proof: @@ -569,17 +599,13 @@ TypeTag: ValidatorTransaction: ENUM: 0: - DummyTopic1: - NEWTYPE: - TYPENAME: DummyValidatorTransaction - 1: DKGResult: NEWTYPE: TYPENAME: DKGTranscript - 2: - DummyTopic2: + 1: + ObservedJWKUpdate: NEWTYPE: - TYPENAME: DummyValidatorTransaction + TYPENAME: QuorumCertifiedUpdate WriteOp: ENUM: 0: diff --git a/testsuite/generate-format/tests/staged/consensus.yaml b/testsuite/generate-format/tests/staged/consensus.yaml index 6ff285ca8e0bf..3d69260096c87 100644 --- a/testsuite/generate-format/tests/staged/consensus.yaml +++ b/testsuite/generate-format/tests/staged/consensus.yaml @@ -48,6 +48,10 @@ AggregateSignatureWithRounds: TYPENAME: AggregateSignature - rounds: SEQ: U64 +Any: + STRUCT: + - type_name: STR + - data: BYTES AnyPublicKey: ENUM: 0: @@ -420,10 +424,6 @@ DKGTranscriptMetadata: - epoch: U64 - author: TYPENAME: AccountAddress -DummyValidatorTransaction: - STRUCT: - - valid: BOOL - - payload: BYTES Ed25519PublicKey: NEWTYPESTRUCT: BYTES Ed25519Signature: @@ -504,6 +504,10 @@ IdCommitment: SIZE: 32 
Identifier: NEWTYPESTRUCT: STR +JWKMoveStruct: + STRUCT: + - variant: + TYPENAME: Any LedgerInfo: STRUCT: - commit_info: @@ -640,6 +644,13 @@ ProposalMsg: TYPENAME: Block - sync_info: TYPENAME: SyncInfo +ProviderJWKs: + STRUCT: + - issuer: BYTES + - version: U64 + - jwks: + SEQ: + TYPENAME: JWKMoveStruct PublicKey: NEWTYPESTRUCT: BYTES QuorumCert: @@ -648,6 +659,12 @@ QuorumCert: TYPENAME: VoteData - signed_ledger_info: TYPENAME: LedgerInfoWithSignatures +QuorumCertifiedUpdate: + STRUCT: + - update: + TYPENAME: ProviderJWKs + - multi_sig: + TYPENAME: AggregateSignature RandGenMessage: STRUCT: - epoch: U64 @@ -962,17 +979,13 @@ ValidatorConsensusInfo: ValidatorTransaction: ENUM: 0: - DummyTopic1: - NEWTYPE: - TYPENAME: DummyValidatorTransaction - 1: DKGResult: NEWTYPE: TYPENAME: DKGTranscript - 2: - DummyTopic2: + 1: + ObservedJWKUpdate: NEWTYPE: - TYPENAME: DummyValidatorTransaction + TYPENAME: QuorumCertifiedUpdate ValidatorVerifier: STRUCT: - validator_infos: diff --git a/third_party/move/move-compiler-v2/Cargo.toml b/third_party/move/move-compiler-v2/Cargo.toml index c9e04b143c012..f34ed25bb930d 100644 --- a/third_party/move/move-compiler-v2/Cargo.toml +++ b/third_party/move/move-compiler-v2/Cargo.toml @@ -28,9 +28,10 @@ clap = { version = "4.3.9", features = ["derive", "env"] } codespan = "0.11.1" codespan-reporting = { version = "0.11.1", features = ["serde", "serialization"] } ethnum = "1.0.4" +flexi_logger = "0.27.4" im = "15.0.0" itertools = "0.10.0" -#log = "0.4.14" +log = { version = "0.4.14", features = ["serde"] } num = "0.4.0" once_cell = "1.7.2" #paste = "1.0.5" @@ -46,9 +47,6 @@ move-ir-types = { path = "../move-ir/types" } move-prover-test-utils = { path = "../move-prover/test-utils" } move-stdlib = { path = "../move-stdlib" } -[features] -verbose-debug-print = ["move-stackless-bytecode/verbose-debug-print"] - [lib] doctest = false diff --git a/third_party/move/move-compiler-v2/src/inliner.rs b/third_party/move/move-compiler-v2/src/inliner.rs index 
276c52336e843..c3bdfde66546c 100644 --- a/third_party/move/move-compiler-v2/src/inliner.rs +++ b/third_party/move/move-compiler-v2/src/inliner.rs @@ -37,9 +37,9 @@ /// - TODO(10858): add an anchor AST node so we can implement `Return` for inline functions and /// `Lambda`. /// - TODO(10850): add a simplifier that simplifies certain code constructs. -use crate::options::Options; use codespan_reporting::diagnostic::Severity; use itertools::chain; +use log::{info, trace}; use move_model::{ ast::{Exp, ExpData, Operation, Pattern, TempIndex}, exp_rewriter::ExpRewriterFunctions, @@ -64,6 +64,7 @@ type CallSiteLocations = BTreeMap<(QualifiedFunId, QualifiedFunId), BTreeSet( struct Inliner<'env> { env: &'env GlobalEnv, - debug: bool, /// Functions already processed all get an entry here, with a new function body after inline /// calls are substituted here. Functions which are unchanged (no calls to inline functions) /// bind to None. @@ -358,13 +358,8 @@ struct Inliner<'env> { impl<'env> Inliner<'env> { fn new(env: &'env GlobalEnv) -> Self { let funexprs_after_inlining = BTreeMap::new(); - let debug = env - .get_extension::() - .expect("Options is available") - .debug; Self { env, - debug, funexprs_after_inlining, } } @@ -438,22 +433,18 @@ impl<'env, 'inliner> ExpRewriterFunctions for OuterInlinerRewriter<'env, 'inline }; // inline here if let Some(expr) = body_expr { - if self.inliner.debug { - eprintln!( - "inlining function `{}` with args `{}`", - self.env.dump_fun(&func_env), - args.iter() - .map(|exp| format!("{}", exp.as_ref().display(self.env))) - .collect::>() - .join(","), - ); - } + trace!( + "inlining function `{}` with args `{}`", + self.env.dump_fun(&func_env), + args.iter() + .map(|exp| format!("{}", exp.as_ref().display(self.env))) + .collect::>() + .join(","), + ); let rewritten = InlinedRewriter::inline_call( self.env, call_id, &func_loc, &expr, type_args, parameters, args, ); - if self.inliner.debug { - eprintln!("After inlining, expr is `{}`", 
rewritten.display(self.env)); - } + trace!("After inlining, expr is `{}`", rewritten.display(self.env)); Some(rewritten) } else { None diff --git a/third_party/move/move-compiler-v2/src/lib.rs b/third_party/move/move-compiler-v2/src/lib.rs index de47c3314299a..0714725bff9a6 100644 --- a/third_party/move/move-compiler-v2/src/lib.rs +++ b/third_party/move/move-compiler-v2/src/lib.rs @@ -8,6 +8,7 @@ mod file_format_generator; pub mod flow_insensitive_checkers; pub mod function_checker; pub mod inliner; +pub mod logging; mod options; pub mod pipeline; @@ -23,6 +24,7 @@ use crate::pipeline::{ use anyhow::bail; use codespan_reporting::term::termcolor::{ColorChoice, StandardStream, WriteColor}; pub use experiments::*; +use log::{debug, info, log_enabled, trace, Level}; use move_compiler::{ compiled_unit::{ AnnotatedCompiledModule, AnnotatedCompiledScript, AnnotatedCompiledUnit, CompiledUnit, @@ -52,13 +54,13 @@ pub fn run_move_compiler( error_writer: &mut impl WriteColor, options: Options, ) -> anyhow::Result<(GlobalEnv, Vec)> { + logging::setup_logging(); + info!("Move Compiler v2"); // Run context check. let mut env = run_checker(options.clone())?; check_errors(&env, error_writer, "checking errors")?; - if options.debug { - eprintln!("After error check, GlobalEnv={}", env.dump_env()); - } + trace!("After context check, GlobalEnv={}", env.dump_env()); // Flow-insensitive checks on AST flow_insensitive_checkers::check_for_unused_vars_and_params(&mut env); @@ -66,29 +68,25 @@ pub fn run_move_compiler( function_checker::check_access_and_use(&mut env); check_errors(&env, error_writer, "checking errors")?; - if options.debug { - eprintln!( - "After flow-insensitive checks, GlobalEnv={}", - env.dump_env() - ); - } + trace!( + "After flow-insensitive checks, GlobalEnv={}", + env.dump_env() + ); // Run inlining. 
inliner::run_inlining(&mut env); check_errors(&env, error_writer, "inlining")?; - if options.debug { - eprintln!("After inlining, GlobalEnv={}", env.dump_env()); - } + debug!("After inlining, GlobalEnv={}", env.dump_env()); // Run code generator let mut targets = run_bytecode_gen(&env); check_errors(&env, error_writer, "code generation errors")?; + // Run transformation pipeline let pipeline = bytecode_pipeline(&env); - if options.debug || options.dump_bytecode { - // Dump bytecode to files, using a basename for the individual sources derived - // from the first input file. + if log_enabled!(Level::Debug) { + // Dump bytecode, providing a name for the target derived from the first input file. let dump_base_name = options .sources .first() @@ -102,13 +100,14 @@ pub fn run_move_compiler( &env, &mut targets, &dump_base_name, - options.debug && options.dump_bytecode, + false, &pipeline::register_formatters, ) } else { pipeline.run(&env, &mut targets) } check_errors(&env, error_writer, "stackless-bytecode analysis errors")?; + let modules_and_scripts = run_file_format_gen(&env, &targets); check_errors(&env, error_writer, "assembling errors")?; let annotated = annotate_units(modules_and_scripts); @@ -118,6 +117,7 @@ pub fn run_move_compiler( /// Run the type checker and return the global env (with errors if encountered). The result /// fails not on context checking errors, but possibly on i/o errors. pub fn run_checker(options: Options) -> anyhow::Result { + info!("Type Checking"); // Run the model builder, which performs context checking. let addrs = move_model::parse_addresses_from_options(options.named_address_mapping.clone())?; let mut env = move_model::run_model_builder_in_compiler_mode( @@ -151,6 +151,7 @@ pub fn run_checker(options: Options) -> anyhow::Result { // compilation, create an entry in the functions target holder which encapsulate info // like the generated bytecode. 
pub fn run_bytecode_gen(env: &GlobalEnv) -> FunctionTargetsHolder { + info!("Bytecode Generation"); let mut targets = FunctionTargetsHolder::default(); let mut todo = BTreeSet::new(); let mut done = BTreeSet::new(); @@ -180,6 +181,7 @@ pub fn run_bytecode_gen(env: &GlobalEnv) -> FunctionTargetsHolder { } pub fn run_file_format_gen(env: &GlobalEnv, targets: &FunctionTargetsHolder) -> Vec { + info!("File Format Generation"); file_format_generator::generate_file_format(env, targets) } diff --git a/third_party/move/move-compiler-v2/src/logging.rs b/third_party/move/move-compiler-v2/src/logging.rs new file mode 100644 index 0000000000000..4284ae7a5b192 --- /dev/null +++ b/third_party/move/move-compiler-v2/src/logging.rs @@ -0,0 +1,98 @@ +// Copyright © Aptos Foundation +// SPDX-License-Identifier: Apache-2.0 + +//! Configures the logging backend for the compiler and related code which uses +//! the `log` crate (macros `info!`, `error!`, etc.). +//! +//! # How to Run the Logger +//! +//! By default, logging is turned off. The environment variable `MVC_LOG` is +//! used to control logging (MVC stands for "Move Compiler"). Usages: +//! +//! ```ignore +//! MVC_LOG = "" # logs everything to console (stderr) +//! MVC_LOG = "info" # logs only info or higher to console +//! MVC_LOG = "<module_path_prefix>=info" # logs only info+ in matching modules +//! MVC_LOG = "@<file>" # logs everything to given file +//! MVC_LOG = "info@<file>" # as above +//! ``` +//! +//! A module path prefix must consist of crate name and module name, as in +//! `move_stackless_bytecode::function_target`, which matches any full module name +//! with this prefix. The general format is `<spec>[@<file>]`, where `spec` is defined +//! as described for the `LogSpecification` type. +//! +//! # How to Write Logs +//! +//! One can import and use the following macros in decreasing severity: `error!`, `warn!`, `info!`, +//! `debug!`, and `trace!`. Also, any other code in external crates already using `log` will +//! use those macros.
Invocations of those macros will be redirected to the logger configured +//! via the `MVC_LOG` environment variable. +//! +//! Those macros expand to a conditional, and it is a quick check and branch to skip +//! logging if the level is not enabled (which is the default). In addition, one can +//! use the boolean macro `log_enabled!(log::Level::Debug)` to check whether a given +//! level is enabled. +//! +//! In general it is good to keep INFO and higher severity log messages in the code. They +//! may help to debug customer issues in production. However, for keeping the code base clean, +//! uses of the `debug!` and lower logging level should be kept minimal and possibly removed +//! eliminated. Its a judgement call where and for how long to leave them in code. + +use flexi_logger::{DeferredNow, FileSpec, LogSpecification, Logger}; +use log::Record; +use std::env; + +const MVC_LOG_VAR: &str = "MVC_LOG"; + +/// Configures logging for applications. +pub fn setup_logging() { + // Currently no different to testing + setup_logging_for_testing() +} + +/// Configures logging for testing. Can be called multiple times. +pub fn setup_logging_for_testing() { + if let Some(logger) = configure_logger() { + // Will produce error if a logger is already installed, which we ignore. Its either this same logger + // already installed, or some outer code overriding the logger. + let _ = logger.start(); + } +} + +fn configure_logger() -> Option { + let var = env::var(MVC_LOG_VAR).ok()?; + let mut parts = var.rsplitn(2, '@').collect::>(); + parts.reverse(); + let spec = if parts[0].trim().is_empty() { + // Show everything + LogSpecification::trace() + } else { + LogSpecification::parse(parts[0]).expect("log spec") + }; + let mut logger = Logger::with(spec).format(format_record); + if parts.len() > 1 { + let fname = if !parts[1].contains('/') { + // Flex logger somehow does not like relative file names, help them. 
+ format!("./{}", parts[1]) + } else { + parts[1].to_string() + }; + logger = logger.log_to_file(FileSpec::try_from(fname).expect("file name")) + } + Some(logger) +} + +fn format_record( + w: &mut dyn std::io::Write, + _now: &mut DeferredNow, + record: &Record, +) -> Result<(), std::io::Error> { + write!( + w, + "[{} {}] {}", + record.level(), + record.module_path().unwrap_or_default(), + &record.args() + ) +} diff --git a/third_party/move/move-compiler-v2/src/options.rs b/third_party/move/move-compiler-v2/src/options.rs index 3d0b429c64114..2ded754a32318 100644 --- a/third_party/move/move-compiler-v2/src/options.rs +++ b/third_party/move/move-compiler-v2/src/options.rs @@ -4,7 +4,7 @@ use clap::Parser; use codespan_reporting::diagnostic::Severity; -use move_command_line_common::env::{read_bool_env_var, read_env_var}; +use move_command_line_common::env::read_env_var; use move_compiler::command_line as cli; use once_cell::sync::Lazy; use std::{ @@ -31,12 +31,6 @@ pub struct Options { /// Output directory. #[clap(short, long, default_value = "")] pub output_dir: String, - /// Debug compiler by printing out internal information - #[clap(long = cli::DEBUG_FLAG, default_value=debug_compiler_env_var_str())] - pub debug: bool, - /// Whether to dump intermediate bytecode for debugging. - #[clap(long = "dump-bytecode", default_value=debug_compiler_dump_env_var_str())] - pub dump_bytecode: bool, /// Do not complain about unknown attributes in Move code. 
#[clap(long, default_value = "false")] pub skip_attribute_checks: bool, @@ -96,18 +90,6 @@ impl Options { } } -fn debug_compiler_env_var() -> bool { - static DEBUG_COMPILER: Lazy = - Lazy::new(|| read_bool_env_var(cli::MOVE_COMPILER_DEBUG_ENV_VAR)); - *DEBUG_COMPILER -} - -fn debug_compiler_dump_env_var() -> bool { - static DEBUG_COMPILER_DUMP: Lazy = - Lazy::new(|| read_bool_env_var(cli::MOVE_COMPILER_DUMP_ENV_VAR)); - *DEBUG_COMPILER_DUMP -} - fn compiler_exp_var() -> Vec { static EXP_VAR: Lazy> = Lazy::new(|| { let s = read_env_var("MOVE_COMPILER_EXP"); @@ -115,19 +97,3 @@ fn compiler_exp_var() -> Vec { }); (*EXP_VAR).clone() } - -fn debug_compiler_env_var_str() -> &'static str { - if debug_compiler_env_var() { - "true" - } else { - "false" - } -} - -fn debug_compiler_dump_env_var_str() -> &'static str { - if debug_compiler_dump_env_var() { - "true" - } else { - "false" - } -} diff --git a/third_party/move/move-compiler-v2/tests/checking/specs/len_ok.exp b/third_party/move/move-compiler-v2/tests/checking/specs/len_ok.exp deleted file mode 100644 index b6f5e04220537..0000000000000 --- a/third_party/move/move-compiler-v2/tests/checking/specs/len_ok.exp +++ /dev/null @@ -1,27 +0,0 @@ - -Diagnostics: -warning: Unused local variable `len`. 
Consider removing or prefixing with an underscore: `_len` - ┌─ tests/checking/specs/len_ok.move:4:13 - │ -4 │ let len = 5; - │ ^^^ - -// ---- Model Dump -module 0x42::m { - private fun f(gallery: &vector) { - { - let len: u64 = 5; - spec { - assert Ge(Len($t0), 0); - } - ; - Tuple() - } - } - spec fun $f(gallery: vector) { - { - let len: u256 = 5; - Tuple() - } - } -} // end 0x42::m diff --git a/third_party/move/move-compiler-v2/tests/checking/specs/len_ok.move b/third_party/move/move-compiler-v2/tests/checking/specs/len_ok.move deleted file mode 100644 index a082f4b344719..0000000000000 --- a/third_party/move/move-compiler-v2/tests/checking/specs/len_ok.move +++ /dev/null @@ -1,9 +0,0 @@ -module 0x42::m { - - fun f(gallery: &vector) { - let len = 5; - spec { - assert len(gallery) >= 0; - }; - } -} diff --git a/third_party/move/move-compiler-v2/tests/checking/specs/len_same_fun_name_err.exp b/third_party/move/move-compiler-v2/tests/checking/specs/len_same_fun_name_err.exp deleted file mode 100644 index 9cf694dbbac09..0000000000000 --- a/third_party/move/move-compiler-v2/tests/checking/specs/len_same_fun_name_err.exp +++ /dev/null @@ -1,7 +0,0 @@ - -Diagnostics: -error: invalid call of `m::len`: argument count mismatch (expected 0 but found 1) - ┌─ tests/checking/specs/len_same_fun_name_err.move:10:20 - │ -10 │ assert len(gallery) >= 0; // err is raised here because the built-in one is shadowed. - │ ^^^^^^^^^^^^ diff --git a/third_party/move/move-compiler-v2/tests/checking/specs/len_same_fun_name_err.move b/third_party/move/move-compiler-v2/tests/checking/specs/len_same_fun_name_err.move deleted file mode 100644 index 48c56deb9d945..0000000000000 --- a/third_party/move/move-compiler-v2/tests/checking/specs/len_same_fun_name_err.move +++ /dev/null @@ -1,14 +0,0 @@ -module 0x42::m { - - fun len(): bool { - true - } - - fun f(gallery: &vector) { - let len = 5; - spec { - assert len(gallery) >= 0; // err is raised here because the built-in one is shadowed. 
- assert len(); - }; - } -} diff --git a/third_party/move/move-compiler-v2/tests/testsuite.rs b/third_party/move/move-compiler-v2/tests/testsuite.rs index 4b2217bbf5d55..85278f36c1f11 100644 --- a/third_party/move/move-compiler-v2/tests/testsuite.rs +++ b/third_party/move/move-compiler-v2/tests/testsuite.rs @@ -3,11 +3,12 @@ // SPDX-License-Identifier: Apache-2.0 use codespan_reporting::{diagnostic::Severity, term::termcolor::Buffer}; +use log::{debug, trace}; use move_binary_format::binary_views::BinaryIndexedView; use move_command_line_common::files::FileHash; use move_compiler::compiled_unit::CompiledUnit; use move_compiler_v2::{ - flow_insensitive_checkers, function_checker, inliner, pipeline, + flow_insensitive_checkers, function_checker, inliner, logging, pipeline, pipeline::{ ability_checker::AbilityChecker, avail_copies_analysis::AvailCopiesAnalysisProcessor, copy_propagation::CopyPropagation, dead_store_elimination::DeadStoreElimination, @@ -60,6 +61,7 @@ fn path_from_crate_root(path: &str) -> String { } fn test_runner(path: &Path) -> datatest_stable::Result<()> { + logging::setup_logging_for_testing(); let mut experiments = extract_test_directives(path, "// experiment:")?; if experiments.is_empty() { // If there is no experiment, use "" as the 'default' experiment. 
@@ -310,9 +312,7 @@ impl TestConfig { let mut ok = Self::check_diags(&mut test_output.borrow_mut(), &env); if ok { - if options.debug { - eprint!("After error check, GlobalEnv={}", env.dump_env()); - } + trace!("After error check, GlobalEnv={}", env.dump_env()); // Flow-insensitive checks on AST flow_insensitive_checkers::check_for_unused_vars_and_params(&mut env); function_checker::check_for_function_typed_parameters(&mut env); @@ -320,18 +320,16 @@ impl TestConfig { ok = Self::check_diags(&mut test_output.borrow_mut(), &env); } if ok { - if options.debug { - eprint!( - "After flow-insensitive checks, GlobalEnv={}", - env.dump_env() - ); - } + trace!( + "After flow-insensitive checks, GlobalEnv={}", + env.dump_env() + ); // Run inlining. inliner::run_inlining(&mut env); ok = Self::check_diags(&mut test_output.borrow_mut(), &env); } - if ok && options.debug { - eprint!("After inlining, GlobalEnv={}", env.dump_env()); + if ok { + trace!("After inlining, GlobalEnv={}", env.dump_env()); } if ok && self.dump_ast { @@ -361,9 +359,20 @@ impl TestConfig { "initial bytecode", targets_before, &pipeline::register_formatters, + false, ), ); } + debug!( + "{}", + &move_stackless_bytecode::print_targets_with_annotations_for_test( + &env, + "initial bytecode", + targets_before, + &pipeline::register_formatters, + true, + ), + ) }, // Hook which is run after every step in the pipeline. Prints out // bytecode after the processor, if requested. @@ -371,22 +380,35 @@ impl TestConfig { let out = &mut test_output.borrow_mut(); Self::check_diags(out, &env); // Note that `i` starts at 1. 
- if self.dump_annotated_targets - && (self.dump_for_only_some_stages.is_none() // dump all stages - || self - .dump_for_only_some_stages - .as_ref() - .is_some_and(|list| list.contains(&(i - 1)))) - { + let title = format!("after {}:", processor.name()); + let stage_dump_enabled = self.dump_for_only_some_stages.is_none() + || self + .dump_for_only_some_stages + .as_ref() + .is_some_and(|list| list.contains(&(i - 1))); + if self.dump_annotated_targets && stage_dump_enabled { out.push_str( &move_stackless_bytecode::print_targets_with_annotations_for_test( &env, - &format!("after {}:", processor.name()), + &title, targets_after, &pipeline::register_formatters, + false, ), ); } + if stage_dump_enabled { + debug!( + "{}", + &move_stackless_bytecode::print_targets_with_annotations_for_test( + &env, + &title, + targets_after, + &pipeline::register_formatters, + true, + ) + ) + } }, ); let ok = Self::check_diags(&mut test_output.borrow_mut(), &env); diff --git a/third_party/move/move-compiler-v2/transactional-tests/Cargo.toml b/third_party/move/move-compiler-v2/transactional-tests/Cargo.toml index 3f2aa8c269e1b..b5138e9886819 100644 --- a/third_party/move/move-compiler-v2/transactional-tests/Cargo.toml +++ b/third_party/move/move-compiler-v2/transactional-tests/Cargo.toml @@ -12,6 +12,7 @@ once_cell = "1.7.2" [dev-dependencies] datatest-stable = "0.1.1" +move-compiler-v2 = { path = ".." 
} move-transactional-test-runner = { path = "../../testing-infra/transactional-test-runner" } [[test]] diff --git a/third_party/move/move-compiler-v2/transactional-tests/tests/tests.rs b/third_party/move/move-compiler-v2/transactional-tests/tests/tests.rs index aff43801f8002..7926fe2c88904 100644 --- a/third_party/move/move-compiler-v2/transactional-tests/tests/tests.rs +++ b/third_party/move/move-compiler-v2/transactional-tests/tests/tests.rs @@ -5,6 +5,7 @@ pub const TEST_DIR: &str = "tests"; use move_command_line_common::env::read_bool_env_var; +use move_compiler_v2::logging; use move_transactional_test_runner::{vm_test_harness, vm_test_harness::TestRunConfig}; use once_cell::sync::Lazy; use std::path::Path; @@ -26,6 +27,7 @@ fn move_test_debug() -> bool { } fn run(path: &Path) -> Result<(), Box> { + logging::setup_logging_for_testing(); let p = path.to_str().unwrap_or_default(); if p.contains(NO_SAFETY_PATH) { std::env::set_var(MOVE_COMPILER_EXP, "no-safety") diff --git a/third_party/move/move-compiler/tests/move_check/typing/len_err.exp b/third_party/move/move-compiler/tests/move_check/typing/len_err.exp deleted file mode 100644 index 514580232fb50..0000000000000 --- a/third_party/move/move-compiler/tests/move_check/typing/len_err.exp +++ /dev/null @@ -1,8 +0,0 @@ -error[E02010]: invalid name - ┌─ tests/move_check/typing/len_err.move:5:9 - │ -5 │ ╭ spec { -6 │ │ assert len(gallery) >= len; -7 │ │ }; - │ ╰─────────^ Conflicting name 'len' is used as both a variable and a function pointer (including built-in functions) in spec - diff --git a/third_party/move/move-compiler/tests/move_check/typing/len_err.move b/third_party/move/move-compiler/tests/move_check/typing/len_err.move deleted file mode 100644 index 73485764cbc63..0000000000000 --- a/third_party/move/move-compiler/tests/move_check/typing/len_err.move +++ /dev/null @@ -1,10 +0,0 @@ -module 0x42::m { - - fun f_err(gallery: &vector) { - let len = 5; - spec { - assert len(gallery) >= len; - }; - } - -} diff 
--git a/third_party/move/move-core/types/src/value.rs b/third_party/move/move-core/types/src/value.rs index ca2dc5b5545db..1171fd730ca7a 100644 --- a/third_party/move/move-core/types/src/value.rs +++ b/third_party/move/move-core/types/src/value.rs @@ -93,6 +93,7 @@ pub enum MoveStructLayout { pub enum IdentifierMappingKind { Aggregator, Snapshot, + DerivedString, } #[derive(Debug, Clone, Hash, Serialize, Deserialize, PartialEq, Eq)] diff --git a/third_party/move/move-model/bytecode-test-utils/src/lib.rs b/third_party/move/move-model/bytecode-test-utils/src/lib.rs index b6e0836a7c4c2..05d53a785f46d 100644 --- a/third_party/move/move-model/bytecode-test-utils/src/lib.rs +++ b/third_party/move/move-model/bytecode-test-utils/src/lib.rs @@ -58,7 +58,7 @@ pub fn test_runner( targets.add_target(&func_env); } } - text += &print_targets_for_test(&env, "initial translation from Move", &targets); + text += &print_targets_for_test(&env, "initial translation from Move", &targets, false); // Run pipeline if any if let Some(pipeline) = pipeline_opt { @@ -69,6 +69,7 @@ pub fn test_runner( &env, &format!("after pipeline `{}`", dir_name), &targets, + false, ); } text += &ProcessorResultDisplay { diff --git a/third_party/move/move-model/bytecode/Cargo.toml b/third_party/move/move-model/bytecode/Cargo.toml index 2d4d6114a3e2a..143c5ee4bd8b7 100644 --- a/third_party/move/move-model/bytecode/Cargo.toml +++ b/third_party/move/move-model/bytecode/Cargo.toml @@ -39,8 +39,6 @@ move-stackless-bytecode-test-utils = { path = "../bytecode-test-utils" } [features] default = [] -# If set, more information is printed when debug printing, e.g. 
for baseline files -verbose-debug-print = [] [[test]] name = "testsuite" diff --git a/third_party/move/move-model/bytecode/src/function_target.rs b/third_party/move/move-model/bytecode/src/function_target.rs index b437b839f9f23..f597d5a1fe8aa 100644 --- a/third_party/move/move-model/bytecode/src/function_target.rs +++ b/third_party/move/move-model/bytecode/src/function_target.rs @@ -379,6 +379,7 @@ impl<'env> FunctionTarget<'env> { label_offsets: &BTreeMap, offset: usize, code: &Bytecode, + verbose: bool, ) -> String { let mut texts = vec![]; @@ -389,7 +390,7 @@ impl<'env> FunctionTarget<'env> { } // add location - if cfg!(feature = "verbose-debug-print") { + if verbose { texts.push(format!( " # {}", self.get_bytecode_loc(attr_id).display(self.global_env()) @@ -683,10 +684,12 @@ impl<'env> fmt::Display for FunctionTarget<'env> { } let label_offsets = Bytecode::label_offsets(self.get_bytecode()); for (offset, code) in self.get_bytecode().iter().enumerate() { + // use `f.alternate()` to determine verbose print; its activated by `{:#}` instead of `{}` + // in the format string writeln!( f, "{}", - self.pretty_print_bytecode(&label_offsets, offset, code) + self.pretty_print_bytecode(&label_offsets, offset, code, f.alternate()) )?; } writeln!(f, "}}")?; diff --git a/third_party/move/move-model/bytecode/src/function_target_pipeline.rs b/third_party/move/move-model/bytecode/src/function_target_pipeline.rs index 3858b4576e2b6..e18571847b335 100644 --- a/third_party/move/move-model/bytecode/src/function_target_pipeline.rs +++ b/third_party/move/move-model/bytecode/src/function_target_pipeline.rs @@ -432,9 +432,9 @@ impl FunctionTargetPipeline { self.run_with_hook(env, targets, |_| {}, |_, _, _| {}) } - /// Runs the pipeline on all functions in the targets holder, dump the bytecode before the - /// pipeline as well as after each processor pass. If `dump_cfg` is set, dump the per-function - /// control-flow graph (in dot format) too. 
+ /// Runs the pipeline on all functions in the targets holder, and dump the bytecode via `log` before the + /// pipeline as well as after each processor pass, identifying it by `dump_base_name`. If `dump_cfg` is set, + /// dump the per-function control-flow graph (in dot format) to a file, using the given base name. pub fn run_with_dump( &self, env: &GlobalEnv, @@ -447,20 +447,26 @@ impl FunctionTargetPipeline { env, targets, |holders| { - Self::dump_to_file( + Self::debug_dump( dump_base_name, 0, "stackless", - &Self::get_pre_pipeline_dump(env, holders), + &Self::get_pre_pipeline_dump(env, holders, /*verbose*/ true), ) }, |step_count, processor, holders| { let suffix = processor.name(); - Self::dump_to_file( + Self::debug_dump( dump_base_name, step_count, &suffix, - &Self::get_per_processor_dump(env, holders, processor, register_annotations), + &Self::get_per_processor_dump( + env, + holders, + processor, + register_annotations, + /*verbose*/ true, + ), ); if dump_cfg { Self::dump_cfg(env, holders, dump_base_name, step_count, &suffix); @@ -474,17 +480,23 @@ impl FunctionTargetPipeline { name: &str, targets: &FunctionTargetsHolder, register_annotations: &impl Fn(&FunctionTarget), + verbose: bool, ) -> String { print_targets_with_annotations_for_test( env, &format!("after processor `{}`", name), targets, register_annotations, + verbose, ) } - fn get_pre_pipeline_dump(env: &GlobalEnv, targets: &FunctionTargetsHolder) -> String { - Self::print_targets(env, "stackless", targets, &|_| {}) + fn get_pre_pipeline_dump( + env: &GlobalEnv, + targets: &FunctionTargetsHolder, + verbose: bool, + ) -> String { + Self::print_targets(env, "stackless", targets, &|_| {}, verbose) } fn get_per_processor_dump( @@ -492,6 +504,7 @@ impl FunctionTargetPipeline { targets: &FunctionTargetsHolder, processor: &dyn FunctionTargetProcessor, register_annotations: &impl Fn(&FunctionTarget), + verbose: bool, ) -> String { let mut dump = format!("{}", ProcessorResultDisplay { env, @@ -507,16 
+520,15 @@ impl FunctionTargetPipeline { &processor.name(), targets, register_annotations, + verbose, )); } dump } - fn dump_to_file(base_name: &str, step_count: usize, suffix: &str, content: &str) { - let dump = format!("{}\n", content.trim()); - let file_name = format!("{}_{}_{}.bytecode", base_name, step_count, suffix); - debug!("dumping bytecode to `{}`", file_name); - fs::write(&file_name, dump).expect("dumping bytecode"); + fn debug_dump(base_name: &str, step_count: usize, suffix: &str, content: &str) { + let name = format!("bytecode of {}_{}_{}", base_name, step_count, suffix); + debug!("{}:\n{}\n", name, content.trim()) } /// Generate dot files for control-flow graphs. diff --git a/third_party/move/move-model/bytecode/src/lib.rs b/third_party/move/move-model/bytecode/src/lib.rs index aebd685663535..db0d324e58b17 100644 --- a/third_party/move/move-model/bytecode/src/lib.rs +++ b/third_party/move/move-model/bytecode/src/lib.rs @@ -33,10 +33,15 @@ pub fn print_targets_for_test( env: &GlobalEnv, header: &str, targets: &FunctionTargetsHolder, + verbose: bool, ) -> String { - print_targets_with_annotations_for_test(env, header, targets, &|target| { - target.register_annotation_formatters_for_test() - }) + print_targets_with_annotations_for_test( + env, + header, + targets, + &|target| target.register_annotation_formatters_for_test(), + verbose, + ) } /// Print function targets for testing and debugging. 
@@ -45,6 +50,7 @@ pub fn print_targets_with_annotations_for_test( header: &str, targets: &FunctionTargetsHolder, register_annotations: &impl Fn(&FunctionTarget), + verbose: bool, ) -> String { let mut text = String::new(); writeln!(&mut text, "============ {} ================", header).unwrap(); @@ -59,7 +65,11 @@ pub fn print_targets_with_annotations_for_test( for (variant, target) in targets.get_targets(&func_env) { if !target.data.code.is_empty() || target.func_env.is_native_or_intrinsic() { register_annotations(&target); - writeln!(&mut text, "\n[variant {}]\n{}", variant, target).unwrap(); + if verbose { + writeln!(&mut text, "\n[variant {}]\n{:#}", variant, target).unwrap(); + } else { + writeln!(&mut text, "\n[variant {}]\n{}", variant, target).unwrap(); + } } } } diff --git a/third_party/move/move-model/bytecode/src/stackless_control_flow_graph.rs b/third_party/move/move-model/bytecode/src/stackless_control_flow_graph.rs index b25c20081dff0..c1ecef460265e 100644 --- a/third_party/move/move-model/bytecode/src/stackless_control_flow_graph.rs +++ b/third_party/move/move-model/bytecode/src/stackless_control_flow_graph.rs @@ -276,6 +276,7 @@ impl<'env> std::fmt::Display for DotCFGBlock<'env> { &self.label_offsets, offset as usize, instruction, + false, ); writeln!(f, "{}", text)?; } diff --git a/third_party/move/move-model/src/builder/exp_builder.rs b/third_party/move/move-model/src/builder/exp_builder.rs index 37c68250ab6ef..111afca6ea52a 100644 --- a/third_party/move/move-model/src/builder/exp_builder.rs +++ b/third_party/move/move-model/src/builder/exp_builder.rs @@ -1957,15 +1957,6 @@ impl<'env, 'translator, 'module_translator> ExpTranslator<'env, 'translator, 'mo if n.value.as_str() == "update_field" { return Some(self.translate_update_field(expected_type, loc, generics, args)); } - let builtin_module = self.parent.parent.builtin_module(); - let full_name = QualifiedSymbol { - module_name: builtin_module, - symbol: self.symbol_pool().make(&n.value), - }; - 
// For other built-in functions, type check is performed in translate_call - if self.parent.parent.spec_fun_table.get(&full_name).is_some() { - return None; - } } } if let EA::ModuleAccess_::Name(n) = &maccess.value { diff --git a/third_party/move/move-vm/runtime/src/loader/mod.rs b/third_party/move/move-vm/runtime/src/loader/mod.rs index 041b0c76997c5..44c8842e71780 100644 --- a/third_party/move/move-vm/runtime/src/loader/mod.rs +++ b/third_party/move/move-vm/runtime/src/loader/mod.rs @@ -1525,7 +1525,7 @@ impl<'a> Resolver<'a> { #[derive(Clone)] struct StructLayoutInfoCacheItem { - struct_layout: MoveStructLayout, + struct_layout: MoveTypeLayout, node_count: u64, has_identifier_mappings: bool, } @@ -1537,7 +1537,7 @@ struct StructLayoutInfoCacheItem { struct StructInfoCache { struct_tag: Option<(StructTag, u64)>, struct_layout_info: Option, - annotated_struct_layout: Option, + annotated_struct_layout: Option, annotated_node_count: Option, } @@ -1708,7 +1708,7 @@ impl Loader { ty_args: &[Type], count: &mut u64, depth: u64, - ) -> PartialVMResult<(MoveStructLayout, bool)> { + ) -> PartialVMResult<(MoveTypeLayout, bool)> { let name = &*self.name_cache.idx_to_identifier(struct_idx); if let Some(struct_map) = self.type_cache.read().structs.get(name) { if let Some(struct_info) = struct_map.get(ty_args) { @@ -1742,21 +1742,28 @@ impl Loader { .into_iter() .unzip(); - // For aggregators / snapshots, the first field should be lifted. 
- if let Some(kind) = &maybe_mapping { - if let Some(l) = field_layouts.first_mut() { - *l = MoveTypeLayout::Tagged( - LayoutTag::IdentifierMapping(kind.clone()), - Box::new(l.clone()), - ); - } - } - let has_identifier_mappings = maybe_mapping.is_some() || field_has_identifier_mappings.into_iter().any(|b| b); let field_node_count = *count - count_before; - let struct_layout = MoveStructLayout::new(field_layouts); + let layout = if Some(IdentifierMappingKind::DerivedString) == maybe_mapping { + // For DerivedString, the whole object should be lifted. + MoveTypeLayout::Tagged( + LayoutTag::IdentifierMapping(IdentifierMappingKind::DerivedString), + Box::new(MoveTypeLayout::Struct(MoveStructLayout::new(field_layouts))), + ) + } else { + // For aggregators / snapshots, the first field should be lifted. + if let Some(kind) = &maybe_mapping { + if let Some(l) = field_layouts.first_mut() { + *l = MoveTypeLayout::Tagged( + LayoutTag::IdentifierMapping(kind.clone()), + Box::new(l.clone()), + ); + } + } + MoveTypeLayout::Struct(MoveStructLayout::new(field_layouts)) + }; let mut cache = self.type_cache.write(); let info = cache @@ -1766,12 +1773,12 @@ impl Loader { .entry(ty_args.to_vec()) .or_insert_with(StructInfoCache::new); info.struct_layout_info = Some(StructLayoutInfoCacheItem { - struct_layout: struct_layout.clone(), + struct_layout: layout.clone(), node_count: field_node_count, has_identifier_mappings, }); - Ok((struct_layout, has_identifier_mappings)) + Ok((layout, has_identifier_mappings)) } // TODO[agg_v2](cleanup): @@ -1792,6 +1799,8 @@ impl Loader { Some(IdentifierMappingKind::Aggregator) } else if ident_str.eq(ident_str!("AggregatorSnapshot")) { Some(IdentifierMappingKind::Snapshot) + } else if ident_str.eq(ident_str!("DerivedStringSnapshot")) { + Some(IdentifierMappingKind::DerivedString) } else { None } @@ -1867,14 +1876,14 @@ impl Loader { // Note depth is incread inside struct_name_to_type_layout instead. 
let (layout, has_identifier_mappings) = self.struct_name_to_type_layout(module_store, *idx, &[], count, depth)?; - (MoveTypeLayout::Struct(layout), has_identifier_mappings) + (layout, has_identifier_mappings) }, Type::StructInstantiation { idx, ty_args, .. } => { *count += 1; // Note depth is incread inside struct_name_to_type_layout instead. let (layout, has_identifier_mappings) = self.struct_name_to_type_layout(module_store, *idx, ty_args, count, depth)?; - (MoveTypeLayout::Struct(layout), has_identifier_mappings) + (layout, has_identifier_mappings) }, Type::Reference(_) | Type::MutableReference(_) | Type::TyParam(_) => { return Err( @@ -1892,7 +1901,7 @@ impl Loader { ty_args: &[Type], count: &mut u64, depth: u64, - ) -> PartialVMResult { + ) -> PartialVMResult { let name = &*self.name_cache.idx_to_identifier(struct_idx); if let Some(struct_map) = self.type_cache.read().structs.get(name) { if let Some(struct_info) = struct_map.get(ty_args) { @@ -1934,7 +1943,8 @@ impl Loader { Ok(MoveFieldLayout::new(n.clone(), l)) }) .collect::>>()?; - let struct_layout = MoveStructLayout::with_types(struct_tag, field_layouts); + let struct_layout = + MoveTypeLayout::Struct(MoveStructLayout::with_types(struct_tag, field_layouts)); let field_node_count = *count - count_before; let mut cache = self.type_cache.write(); @@ -1976,18 +1986,18 @@ impl Loader { Type::Vector(ty) => MoveTypeLayout::Vector(Box::new( self.type_to_fully_annotated_layout_impl(ty, module_store, count, depth + 1)?, )), - Type::Struct { idx, .. } => MoveTypeLayout::Struct( - self.struct_name_to_fully_annotated_layout(*idx, module_store, &[], count, depth)?, - ), + Type::Struct { idx, .. } => { + self.struct_name_to_fully_annotated_layout(*idx, module_store, &[], count, depth)? + }, Type::StructInstantiation { idx: name, ty_args, .. 
- } => MoveTypeLayout::Struct(self.struct_name_to_fully_annotated_layout( + } => self.struct_name_to_fully_annotated_layout( *name, module_store, ty_args, count, depth, - )?), + )?, Type::Reference(_) | Type::MutableReference(_) | Type::TyParam(_) => { return Err( PartialVMError::new(StatusCode::UNKNOWN_INVARIANT_VIOLATION_ERROR) diff --git a/third_party/move/tools/move-package/src/compilation/compiled_package.rs b/third_party/move/tools/move-package/src/compilation/compiled_package.rs index 7ff6fa5f072dd..c6682c3c80d1b 100644 --- a/third_party/move/tools/move-package/src/compilation/compiled_package.rs +++ b/third_party/move/tools/move-package/src/compilation/compiled_package.rs @@ -677,7 +677,6 @@ impl CompiledPackage { .collect(), skip_attribute_checks, known_attributes: known_attributes.clone(), - debug: flags.debug(), ..Default::default() }; compiler_driver_v2(options)? diff --git a/types/src/aggregate_signature.rs b/types/src/aggregate_signature.rs index 0abe6f3dd9f38..3202583b6252d 100644 --- a/types/src/aggregate_signature.rs +++ b/types/src/aggregate_signature.rs @@ -97,4 +97,8 @@ impl PartialSignatures { pub fn signatures(&self) -> &BTreeMap { &self.signatures } + + pub fn contains_voter(&self, voter: &AccountAddress) -> bool { + self.signatures.contains_key(voter) + } } diff --git a/types/src/aggregator.rs b/types/src/aggregator.rs deleted file mode 100644 index 78a3091f5583b..0000000000000 --- a/types/src/aggregator.rs +++ /dev/null @@ -1,233 +0,0 @@ -// Copyright © Aptos Foundation -// Parts of the project are originally copyright © Meta Platforms, Inc. -// SPDX-License-Identifier: Apache-2.0 - -use move_binary_format::errors::PartialVMError; -use move_core_types::{value::MoveTypeLayout, vm_status::StatusCode}; -use move_vm_types::values::{Struct, Value}; -use std::str::FromStr; - -/// Ephemeral identifier type used by delayed fields (aggregators, snapshots) -/// during execution. 
-#[derive(Clone, Copy, Debug, Hash, PartialEq, Eq, PartialOrd, Ord)] -pub struct DelayedFieldID(u64); - -impl DelayedFieldID { - pub fn new(value: u64) -> Self { - Self(value) - } - - pub fn as_u64(&self) -> u64 { - self.0 - } -} - -// Used for ID generation from u32/u64 counters. -impl From for DelayedFieldID { - fn from(value: u64) -> Self { - Self::new(value) - } -} - -// Represents something that should never happen - i.e. a code invariant error, -// which we would generally just panic, but since we are inside of the VM, -// we cannot do that. -#[derive(Clone, Debug, PartialEq, Eq)] -pub enum PanicError { - CodeInvariantError(String), -} - -impl ToString for PanicError { - fn to_string(&self) -> String { - match self { - PanicError::CodeInvariantError(e) => e.clone(), - } - } -} - -impl From for PartialVMError { - fn from(err: PanicError) -> Self { - match err { - PanicError::CodeInvariantError(msg) => { - PartialVMError::new(StatusCode::DELAYED_FIELDS_CODE_INVARIANT_ERROR) - .with_message(msg) - }, - } - } -} - -/// Types which implement this trait can be converted to a Move value. -pub trait TryIntoMoveValue: Sized { - type Error: std::fmt::Debug; - - fn try_into_move_value(self, layout: &MoveTypeLayout) -> Result; -} - -/// Types which implement this trait can be constructed from a Move value. -pub trait TryFromMoveValue: Sized { - // Allows to pass extra information from the caller. - type Hint; - type Error: std::fmt::Debug; - - fn try_from_move_value( - layout: &MoveTypeLayout, - value: Value, - hint: &Self::Hint, - ) -> Result; -} - -impl TryIntoMoveValue for DelayedFieldID { - type Error = PanicError; - - fn try_into_move_value(self, layout: &MoveTypeLayout) -> Result { - Ok(match layout { - MoveTypeLayout::U64 => Value::u64(self.as_u64()), - MoveTypeLayout::U128 => Value::u128(self.as_u64() as u128), - layout if is_string_layout(layout) => { - // Here, we make sure we convert identifiers to fixed-size Move - // values. 
This is needed because we charge gas based on the resource - // size with identifiers inside, and so it has to be deterministic. - bytes_to_string(u64_to_fixed_size_utf8_bytes(self.as_u64())) - }, - _ => { - return Err(code_invariant_error(format!( - "Failed to convert {:?} into a Move value with {} layout", - self, layout - ))) - }, - }) - } -} - -impl TryFromMoveValue for DelayedFieldID { - type Error = PanicError; - type Hint = (); - - fn try_from_move_value( - layout: &MoveTypeLayout, - value: Value, - _hint: &Self::Hint, - ) -> Result { - // Since we put the value there, we should be able to read it back, - // unless there is a bug in the code - so we expect_ok() throughout. - match layout { - MoveTypeLayout::U64 => expect_ok(value.value_as::()), - MoveTypeLayout::U128 => expect_ok(value.value_as::()).and_then(u128_to_u64), - layout if is_string_layout(layout) => expect_ok(value.value_as::()) - .and_then(string_to_bytes) - .and_then(from_utf8_bytes), - // We use value to ID conversion in serialization. - _ => Err(code_invariant_error(format!( - "Failed to convert a Move value with {} layout into an identifier", - layout - ))), - } - .map(Self::new) - } -} - -fn code_invariant_error(message: M) -> PanicError { - let msg = format!( - "Delayed logic code invariant broken (there is a bug in the code), {:?}", - message - ); - println!("ERROR: {}", msg); - // cannot link aptos_logger in aptos-types crate - // error!("{}", msg); - PanicError::CodeInvariantError(msg) -} - -fn expect_ok(value: Result) -> Result { - value.map_err(code_invariant_error) -} - -/// Returns true if the type layout corresponds to a String, which should be a -/// struct with a single byte vector field. 
-fn is_string_layout(layout: &MoveTypeLayout) -> bool { - use MoveTypeLayout::*; - if let Struct(move_struct) = layout { - if let [Vector(elem)] = move_struct.fields().iter().as_slice() { - if let U8 = elem.as_ref() { - return true; - } - } - } - false -} - -fn bytes_to_string(bytes: Vec) -> Value { - Value::struct_(Struct::pack(vec![Value::vector_u8(bytes)])) -} - -fn string_to_bytes(value: Struct) -> Result, PanicError> { - expect_ok(value.unpack())? - .collect::>() - .pop() - .map_or_else( - || Err(code_invariant_error("Unable to extract bytes from String")), - |v| expect_ok(v.value_as::>()), - ) -} - -fn u64_to_fixed_size_utf8_bytes(value: u64) -> Vec { - // Maximum u64 identifier size is 20 characters. We need a fixed size to - // ensure identifiers have the same size all the time for all validators, - // to ensure consistent and deterministic gas charging. - format!("{:0>20}", value).to_string().into_bytes() -} - -fn from_utf8_bytes(bytes: Vec) -> Result { - String::from_utf8(bytes) - .map_err(|e| code_invariant_error(format!("Unable to convert bytes to string: {}", e)))? 
- .parse::() - .map_err(|_| code_invariant_error("Unable to parse string".to_string())) -} - -fn u128_to_u64(value: u128) -> Result { - u64::try_from(value).map_err(|_| code_invariant_error("Cannot cast u128 into u64".to_string())) -} - -#[cfg(test)] -mod tests { - use super::*; - use claims::{assert_ok, assert_ok_eq}; - - #[test] - fn test_fixed_string_id_1() { - let encoded = u64_to_fixed_size_utf8_bytes(7); - assert_eq!(encoded.len(), 20); - - let decoded_string = assert_ok!(String::from_utf8(encoded.clone())); - assert_eq!(decoded_string, "00000000000000000007"); - - let decoded = assert_ok!(decoded_string.parse::()); - assert_eq!(decoded, 7); - assert_ok_eq!(from_utf8_bytes::(encoded), 7); - } - - #[test] - fn test_fixed_string_id_2() { - let encoded = u64_to_fixed_size_utf8_bytes(u64::MAX); - assert_eq!(encoded.len(), 20); - - let decoded_string = assert_ok!(String::from_utf8(encoded.clone())); - assert_eq!(decoded_string, "18446744073709551615"); - - let decoded = assert_ok!(decoded_string.parse::()); - assert_eq!(decoded, u64::MAX); - assert_ok_eq!(from_utf8_bytes::(encoded), u64::MAX); - } - - #[test] - fn test_fixed_string_id_3() { - let encoded = u64_to_fixed_size_utf8_bytes(0); - assert_eq!(encoded.len(), 20); - - let decoded_string = assert_ok!(String::from_utf8(encoded.clone())); - assert_eq!(decoded_string, "00000000000000000000"); - - let decoded = assert_ok!(decoded_string.parse::()); - assert_eq!(decoded, 0); - assert_ok_eq!(from_utf8_bytes::(encoded), 0); - } -} diff --git a/types/src/delayed_fields.rs b/types/src/delayed_fields.rs new file mode 100644 index 0000000000000..086ea3947bc6b --- /dev/null +++ b/types/src/delayed_fields.rs @@ -0,0 +1,514 @@ +// Copyright © Aptos Foundation +// Parts of the project are originally copyright © Meta Platforms, Inc. 
+// SPDX-License-Identifier: Apache-2.0 + +use crate::serde_helper::bcs_utils::{bcs_size_of_byte_array, size_u32_as_uleb128}; +use move_binary_format::errors::{PartialVMError, PartialVMResult}; +use move_core_types::{value::MoveTypeLayout, vm_status::StatusCode}; +use move_vm_types::values::{Struct, Value}; +use once_cell::sync::Lazy; +use std::str::FromStr; + +const BITS_FOR_SIZE: usize = 32; + +/// Ephemeral identifier type used by delayed fields (aggregators, snapshots) +/// during execution. +#[derive(Clone, Copy, Debug, Hash, PartialEq, Eq, PartialOrd, Ord)] +pub struct DelayedFieldID { + unique_index: u32, + // Exact number of bytes serialized delayed field will take. + width: u32, +} + +impl DelayedFieldID { + pub fn new_with_width(unique_index: u32, width: u32) -> Self { + Self { + unique_index, + width, + } + } + + pub fn new_for_test_for_u64(unique_index: u32) -> Self { + Self::new_with_width(unique_index, 8) + } + + pub fn as_u64(&self) -> u64 { + ((self.unique_index as u64) << BITS_FOR_SIZE) | self.width as u64 + } + + pub fn extract_width(&self) -> u32 { + self.width + } + + pub fn into_derived_string_struct(self) -> Result { + let width = self.extract_width() as usize; + + // we need to create DerivedString struct that serializes to exactly match given `width`. + // I.e: size_u32_as_uleb128(value.len()) + value.len() + size_u32_as_uleb128(padding.len()) + padding.len() == width + // As padding has a fixed allowed max width, it is easiest to expand value to have the padding be minimal. + // We cannot always make padding to be 0 byte vector (serialized into 1 byte) - as not all sizes are possible + // for string due to variable encoding of string length. + + // So we will over-estimate the serialized length of the value a bit. + let value_len_width_upper_bound = size_u32_as_uleb128(width - 2); // we subtract 2 because uleb sizes (for both value and padding fields) are at least 1 byte. 
+ + // If we don't even have enough space to store the length of the value, we cannot proceed + if width <= value_len_width_upper_bound + 1 { + return Err(code_invariant_error(format!( + "DerivedStringSnapshot size issue for id {self:?}: width: {width}, value_width_upper_bound: {value_len_width_upper_bound}" + ))); + } + + let id_as_string = u64_to_fixed_size_utf8_bytes( + self.as_u64(), + // fill the string representation to leave 1 byte for padding and upper bound for it's own length serialization. + width - value_len_width_upper_bound - 1, + )?; + + bytes_and_width_to_derived_string_struct(id_as_string, width) + } +} + +// Used for ID generation from exchanged value/exchanges serialized value. +impl From for DelayedFieldID { + fn from(value: u64) -> Self { + Self { + unique_index: u32::try_from(value >> BITS_FOR_SIZE).unwrap(), + width: u32::try_from(value & ((1u64 << BITS_FOR_SIZE) - 1)).unwrap(), + } + } +} + +// Used for ID generation from u32 counter with width. +impl From<(u32, u32)> for DelayedFieldID { + fn from(value: (u32, u32)) -> Self { + let (index, width) = value; + Self::new_with_width(index, width) + } +} + +// Represents something that should never happen - i.e. a code invariant error, +// which we would generally just panic, but since we are inside of the VM, +// we cannot do that. +#[derive(Clone, Debug, PartialEq, Eq)] +pub enum PanicError { + CodeInvariantError(String), +} + +impl ToString for PanicError { + fn to_string(&self) -> String { + match self { + PanicError::CodeInvariantError(e) => e.clone(), + } + } +} + +impl From for PartialVMError { + fn from(err: PanicError) -> Self { + match err { + PanicError::CodeInvariantError(msg) => { + PartialVMError::new(StatusCode::DELAYED_FIELDS_CODE_INVARIANT_ERROR) + .with_message(msg) + }, + } + } +} + +pub trait ExtractUniqueIndex: Sized { + fn extract_unique_index(&self) -> u32; +} + +/// Types which implement this trait can be converted to a Move value. 
+pub trait TryIntoMoveValue: Sized { + type Error: std::fmt::Debug; + + fn try_into_move_value(self, layout: &MoveTypeLayout) -> Result; +} + +/// Types which implement this trait can be constructed from a Move value. +pub trait TryFromMoveValue: Sized { + // Allows to pass extra information from the caller. + type Hint; + type Error: std::fmt::Debug; + + fn try_from_move_value( + layout: &MoveTypeLayout, + value: Value, + hint: &Self::Hint, + ) -> Result<(Self, u32), Self::Error>; +} + +impl ExtractUniqueIndex for DelayedFieldID { + fn extract_unique_index(&self) -> u32 { + self.unique_index + } +} + +impl TryIntoMoveValue for DelayedFieldID { + type Error = PanicError; + + fn try_into_move_value(self, layout: &MoveTypeLayout) -> Result { + Ok(match layout { + MoveTypeLayout::U64 => Value::u64(self.as_u64()), + MoveTypeLayout::U128 => Value::u128(self.as_u64() as u128), + layout if is_derived_string_struct_layout(layout) => { + // Here, we make sure we convert identifiers to fixed-size Move + // values. This is needed because we charge gas based on the resource + // size with identifiers inside, and so it has to be deterministic. + + self.into_derived_string_struct()? + }, + _ => { + return Err(code_invariant_error(format!( + "Failed to convert {:?} into a Move value with {} layout", + self, layout + ))) + }, + }) + } +} + +impl TryFromMoveValue for DelayedFieldID { + type Error = PanicError; + type Hint = (); + + fn try_from_move_value( + layout: &MoveTypeLayout, + value: Value, + hint: &Self::Hint, + ) -> Result<(Self, u32), Self::Error> { + // Since we put the value there, we should be able to read it back, + // unless there is a bug in the code - so we expect_ok() throughout. 
+ let (id, width) = match layout { + MoveTypeLayout::U64 => (expect_ok(value.value_as::()).map(Self::from)?, 8), + MoveTypeLayout::U128 => ( + expect_ok(value.value_as::()).and_then(u128_to_u64).map(Self::from)?, + 16, + ), + layout if is_derived_string_struct_layout(layout) => { + let (bytes, width) = value + .value_as::() + .and_then(derived_string_struct_to_bytes_and_length) + .map_err(|e| { + code_invariant_error(format!( + "couldn't extract derived string struct: {:?}", + e + )) + })?; + let id = from_utf8_bytes::(bytes).map(Self::from)?; + (id, width) + }, + // We use value to ID conversion in serialization. + _ => { + return Err(code_invariant_error(format!( + "Failed to convert a Move value with {layout} layout into an identifier, tagged with {hint:?}, with value {value:?}", + ))) + }, + }; + if id.extract_width() != width { + return Err(code_invariant_error(format!( + "Extracted identifier has a wrong width: id={id:?}, width={width}, expected={}", + id.extract_width(), + ))); + } + + Ok((id, width)) + } +} + +fn code_invariant_error(message: M) -> PanicError { + let msg = format!( + "Delayed logic code invariant broken (there is a bug in the code), {:?}", + message + ); + println!("ERROR: {}", msg); + // cannot link aptos_logger in aptos-types crate + // error!("{}", msg); + PanicError::CodeInvariantError(msg) +} + +fn expect_ok(value: Result) -> Result { + value.map_err(code_invariant_error) +} + +/// Returns true if the type layout corresponds to a String, which should be a +/// struct with a single byte vector field. 
+fn is_string_layout(layout: &MoveTypeLayout) -> bool { + use MoveTypeLayout::*; + if let Struct(move_struct) = layout { + if let [Vector(elem)] = move_struct.fields().iter().as_slice() { + if let U8 = elem.as_ref() { + return true; + } + } + } + false +} + +pub fn is_derived_string_struct_layout(layout: &MoveTypeLayout) -> bool { + use MoveTypeLayout::*; + if let Struct(move_struct) = layout { + if let [value_field, Vector(padding_elem)] = move_struct.fields().iter().as_slice() { + if is_string_layout(value_field) { + if let U8 = padding_elem.as_ref() { + return true; + } + } + } + } + false +} + +pub fn bytes_to_string(bytes: Vec) -> Value { + Value::struct_(Struct::pack(vec![Value::vector_u8(bytes)])) +} + +pub fn string_to_bytes(value: Struct) -> Result, PanicError> { + expect_ok(value.unpack())? + .collect::>() + .pop() + .map_or_else( + || Err(code_invariant_error("Unable to extract bytes from String")), + |v| expect_ok(v.value_as::>()), + ) +} + +pub fn bytes_and_width_to_derived_string_struct( + bytes: Vec, + width: usize, +) -> Result { + // We need to create DerivedStringSnapshot struct that serializes to exactly match given `width`. 
+ + let value_width = bcs_size_of_byte_array(bytes.len()); + // padding field takes at list 1 byte (empty vector) + if value_width + 1 > width { + return Err(code_invariant_error(format!( + "DerivedStringSnapshot size issue: no space left for padding: value_width: {value_width}, width: {width}" + ))); + } + + // We assume/assert that padding never exceeds length that requires more than 1 byte for size: + // (otherwise it complicates the logic to fill until the exact width, as padding can never be serialized into 129 bytes + // (vec[0; 127] serializes into 128 bytes, and vec[0; 128] serializes into 130 bytes)) + let padding_len = width - value_width - 1; + if size_u32_as_uleb128(padding_len) > 1 { + return Err(code_invariant_error(format!( + "DerivedStringSnapshot size issue: padding expected to be too large: value_width: {value_width}, width: {width}, padding_len: {padding_len}" + ))); + } + + Ok(Value::struct_(Struct::pack(vec![ + bytes_to_string(bytes), + Value::vector_u8(vec![0; padding_len]), + ]))) +} + +pub fn u64_to_fixed_size_utf8_bytes(value: u64, length: usize) -> Result, PanicError> { + let result = format!("{:0>width$}", value, width = length) + .to_string() + .into_bytes(); + if result.len() != length { + return Err(code_invariant_error(format!( + "u64_to_fixed_size_utf8_bytes: width mismatch: value: {value}, length: {length}, result: {result:?}" + ))); + } + Ok(result) +} + +static U64_MAX_DIGITS: Lazy = Lazy::new(|| u64::MAX.to_string().len()); +static U128_MAX_DIGITS: Lazy = Lazy::new(|| u128::MAX.to_string().len()); + +pub fn to_utf8_bytes(value: impl ToString) -> Vec { + value.to_string().into_bytes() +} + +pub fn from_utf8_bytes(bytes: Vec) -> Result { + String::from_utf8(bytes) + .map_err(|e| code_invariant_error(format!("Unable to convert bytes to string: {}", e)))? 
+ .parse::() + .map_err(|_| code_invariant_error("Unable to parse string".to_string())) +} + +pub fn derived_string_struct_to_bytes_and_length(value: Struct) -> PartialVMResult<(Vec, u32)> { + let mut fields = value.unpack()?.collect::>(); + if fields.len() != 2 { + return Err( + PartialVMError::new(StatusCode::DELAYED_FIELDS_CODE_INVARIANT_ERROR).with_message( + format!( + "DerivedStringSnapshot has wrong number of fields: {:?}", + fields.len() + ), + ), + ); + } + let padding = fields.pop().unwrap().value_as::>()?; + let value = fields.pop().unwrap(); + let string_bytes = string_to_bytes(value.value_as::()?)?; + let string_len = string_bytes.len(); + Ok(( + string_bytes, + u32::try_from(bcs_size_of_byte_array(string_len) + bcs_size_of_byte_array(padding.len())) + .map_err(|_| { + PartialVMError::new(StatusCode::DELAYED_FIELDS_CODE_INVARIANT_ERROR).with_message( + format!( + "DerivedStringSnapshot size exceeds u32: string_len: {string_len}, padding_len: {}", + padding.len() + ), + ) + })?, + )) +} + +pub fn u128_to_u64(value: u128) -> Result { + u64::try_from(value).map_err(|_| code_invariant_error("Cannot cast u128 into u64".to_string())) +} + +pub fn calculate_width_for_constant_string(byte_len: usize) -> usize { + // we need to be able to store it both raw, as well as when it is exchanged with u64 DelayedFieldID. + // so the width needs to be larger of the two options + (bcs_size_of_byte_array(byte_len) + 1) // 1 is for empty padding serialized length + .max(*U64_MAX_DIGITS + 2) // largest exchanged u64 DelayedFieldID is u64 max digits, plus 1 for each of the value and padding serialized length +} + +pub fn calculate_width_for_integer_embeded_string( + rest_byte_len: usize, + snapshot_id: DelayedFieldID, +) -> Result { + // we need to translate byte width into string character width. 
+ let max_snapshot_string_width = match snapshot_id.extract_width() { + 8 => *U64_MAX_DIGITS, + 16 => *U128_MAX_DIGITS, + x => { + return Err(code_invariant_error(format!( + "unexpected width ({x}) for integer snapshot id: {snapshot_id:?}" + ))) + }, + }; + + Ok(bcs_size_of_byte_array(rest_byte_len + max_snapshot_string_width) + 1) // 1 for padding length +} + +#[derive(Clone, Debug, Eq, PartialEq)] +pub enum SnapshotToStringFormula { + Concat { prefix: Vec, suffix: Vec }, +} + +impl SnapshotToStringFormula { + pub fn apply_to(&self, base: u128) -> Vec { + match self { + SnapshotToStringFormula::Concat { prefix, suffix } => { + let middle_string = base.to_string(); + let middle = middle_string.as_bytes(); + let mut result = Vec::with_capacity(prefix.len() + middle.len() + suffix.len()); + result.extend(prefix); + result.extend(middle); + result.extend(suffix); + result + }, + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + use claims::{assert_err, assert_ok, assert_ok_eq}; + + #[test] + fn test_int_to_string_fails_on_small_width() { + assert_err!(u64_to_fixed_size_utf8_bytes(1000, 1)); + } + + #[test] + fn test_width_calculation() { + for data in [ + vec![], + vec![60; 1], + vec![60; 5], + vec![60; 127], + vec![60; 128], + vec![60; 129], + ] { + { + let width = calculate_width_for_constant_string(data.len()); + assert_ok!(bytes_and_width_to_derived_string_struct( + data.clone(), + width + )); + assert_ok!(DelayedFieldID::new_with_width(u32::MAX, width as u32) + .into_derived_string_struct()); + } + { + let width = assert_ok!(calculate_width_for_integer_embeded_string( + data.len(), + DelayedFieldID::new_with_width(u32::MAX, 8) + )); + assert_ok!(bytes_and_width_to_derived_string_struct( + SnapshotToStringFormula::Concat { + prefix: data.clone(), + suffix: vec![] + } + .apply_to(u64::MAX as u128), + width + )); + assert_ok!(DelayedFieldID::new_with_width(u32::MAX, width as u32) + .into_derived_string_struct()); + } + { + let width = 
assert_ok!(calculate_width_for_integer_embeded_string( + data.len(), + DelayedFieldID::new_with_width(u32::MAX, 16) + )); + assert_ok!(bytes_and_width_to_derived_string_struct( + SnapshotToStringFormula::Concat { + prefix: data.clone(), + suffix: vec![] + } + .apply_to(u128::MAX), + width + )); + assert_ok!(DelayedFieldID::new_with_width(u32::MAX, width as u32) + .into_derived_string_struct()); + } + } + } + + #[test] + fn test_fixed_string_id_1() { + let encoded = assert_ok!(u64_to_fixed_size_utf8_bytes(7, 30)); + assert_eq!(encoded.len(), 30); + + let decoded_string = assert_ok!(String::from_utf8(encoded.clone())); + assert_eq!(decoded_string, "000000000000000000000000000007"); + + let decoded = assert_ok!(decoded_string.parse::()); + assert_eq!(decoded, 7); + assert_ok_eq!(from_utf8_bytes::(encoded), 7); + } + + #[test] + fn test_fixed_string_id_2() { + let encoded = assert_ok!(u64_to_fixed_size_utf8_bytes(u64::MAX, 20)); + assert_eq!(encoded.len(), 20); + + let decoded_string = assert_ok!(String::from_utf8(encoded.clone())); + assert_eq!(decoded_string, "18446744073709551615"); + + let decoded = assert_ok!(decoded_string.parse::()); + assert_eq!(decoded, u64::MAX); + assert_ok_eq!(from_utf8_bytes::(encoded), u64::MAX); + } + + #[test] + fn test_fixed_string_id_3() { + let encoded = assert_ok!(u64_to_fixed_size_utf8_bytes(0, 20)); + assert_eq!(encoded.len(), 20); + + let decoded_string = assert_ok!(String::from_utf8(encoded.clone())); + assert_eq!(decoded_string, "00000000000000000000"); + + let decoded = assert_ok!(decoded_string.parse::()); + assert_eq!(decoded, 0); + assert_ok_eq!(from_utf8_bytes::(encoded), 0); + } +} diff --git a/types/src/dkg/mod.rs b/types/src/dkg/mod.rs index 9df95eebd8a9c..4ad45711131f2 100644 --- a/types/src/dkg/mod.rs +++ b/types/src/dkg/mod.rs @@ -20,7 +20,6 @@ pub struct DKGTranscriptMetadata { pub author: AccountAddress, } -/// Reflection of Move type `0x1::dkg::DKGStartEvent`. 
#[derive(Clone, Debug, Serialize, Deserialize)] pub struct DKGStartEvent { pub session_metadata: DKGSessionMetadata, @@ -47,6 +46,16 @@ impl DKGTranscript { transcript_bytes, } } + + pub fn dummy() -> Self { + Self { + metadata: DKGTranscriptMetadata { + epoch: 0, + author: AccountAddress::ZERO, + }, + transcript_bytes: vec![], + } + } } // The input of DKG. diff --git a/types/src/jwks/mod.rs b/types/src/jwks/mod.rs index 307502b398fe4..bae7c46633964 100644 --- a/types/src/jwks/mod.rs +++ b/types/src/jwks/mod.rs @@ -1,13 +1,14 @@ // Copyright © Aptos Foundation use self::jwk::JWK; -use crate::{move_utils::as_move_value::AsMoveValue, on_chain_config::OnChainConfig}; +use crate::{ + aggregate_signature::AggregateSignature, move_utils::as_move_value::AsMoveValue, + on_chain_config::OnChainConfig, +}; use anyhow::{bail, Context}; -use aptos_crypto::bls12381; use aptos_crypto_derive::{BCSCryptoHash, CryptoHasher}; use jwk::JWKMoveStruct; use move_core_types::{ - account_address::AccountAddress, ident_str, identifier::IdentStr, move_resource::MoveStructType, @@ -15,7 +16,7 @@ use move_core_types::{ }; use serde::{Deserialize, Serialize}; use std::{ - collections::{BTreeSet, HashMap}, + collections::HashMap, fmt::{Debug, Formatter}, }; @@ -29,6 +30,11 @@ pub fn issuer_from_str(s: &str) -> Issuer { s.as_bytes().to_vec() } +#[cfg(any(test, feature = "fuzzing"))] +pub fn dummy_issuer() -> Issuer { + issuer_from_str("https:://dummy.issuer") +} + /// Move type `0x1::jwks::OIDCProvider` in rust. /// See its doc in Move for more details. #[derive(Default, Serialize, Deserialize)] @@ -59,6 +65,7 @@ impl OnChainConfig for SupportedOIDCProviders { /// See its doc in Move for more details. 
#[derive(Clone, Default, Eq, PartialEq, Serialize, Deserialize, CryptoHasher, BCSCryptoHash)] pub struct ProviderJWKs { + #[serde(with = "serde_bytes")] pub issuer: Issuer, pub version: u64, pub jwks: Vec, @@ -185,18 +192,16 @@ impl MoveStructType for PatchedJWKs { /// A JWK update in format of `ProviderJWKs` and a multi-signature of it as a quorum certificate. #[derive(Clone, Debug, Eq, PartialEq, Serialize, Deserialize, CryptoHasher, BCSCryptoHash)] pub struct QuorumCertifiedUpdate { - pub authors: BTreeSet, pub update: ProviderJWKs, - pub multi_sig: bls12381::Signature, + pub multi_sig: AggregateSignature, } impl QuorumCertifiedUpdate { #[cfg(any(test, feature = "fuzzing"))] pub fn dummy() -> Self { Self { - authors: Default::default(), - update: Default::default(), - multi_sig: bls12381::Signature::dummy_signature(), + update: ProviderJWKs::new(dummy_issuer()), + multi_sig: AggregateSignature::empty(), } } } diff --git a/types/src/lib.rs b/types/src/lib.rs index b443cdf32903e..4b622a3e50e65 100644 --- a/types/src/lib.rs +++ b/types/src/lib.rs @@ -59,10 +59,10 @@ pub use utility_coin::*; pub mod account_view; pub mod aggregate_signature; -pub mod aggregator; pub mod block_executor; pub mod bn254_circom; pub mod bytes; +pub mod delayed_fields; pub mod state_store; #[cfg(test)] mod unit_tests; diff --git a/types/src/proptest_types.rs b/types/src/proptest_types.rs index bdf9101492ec8..2ebde083bfb66 100644 --- a/types/src/proptest_types.rs +++ b/types/src/proptest_types.rs @@ -15,6 +15,7 @@ use crate::{ block_metadata_ext::BlockMetadataExt, chain_id::ChainId, contract_event::ContractEvent, + dkg::{DKGTranscript, DKGTranscriptMetadata}, epoch_state::EpochState, event::{EventHandle, EventKey}, ledger_info::{generate_ledger_info_with_sig, LedgerInfo, LedgerInfoWithSignatures}, @@ -29,7 +30,7 @@ use crate::{ }, validator_info::ValidatorInfo, validator_signer::ValidatorSigner, - validator_txn::{DummyValidatorTransaction, ValidatorTransaction}, + 
validator_txn::ValidatorTransaction, validator_verifier::{ValidatorConsensusInfo, ValidatorVerifier}, vm_status::VMStatus, write_set::{WriteOp, WriteSet, WriteSetMut}, @@ -1262,9 +1263,15 @@ impl Arbitrary for ValidatorTransaction { type Strategy = BoxedStrategy; fn arbitrary_with(_args: Self::Parameters) -> Self::Strategy { - (any::(), any::>()) - .prop_map(|(valid, payload)| { - ValidatorTransaction::DummyTopic1(DummyValidatorTransaction { valid, payload }) + (any::>()) + .prop_map(|payload| { + ValidatorTransaction::DKGResult(DKGTranscript { + metadata: DKGTranscriptMetadata { + epoch: 0, + author: AccountAddress::ZERO, + }, + transcript_bytes: payload, + }) }) .boxed() } diff --git a/types/src/serde_helper/bcs_utils.rs b/types/src/serde_helper/bcs_utils.rs new file mode 100644 index 0000000000000..763e24d4f17a6 --- /dev/null +++ b/types/src/serde_helper/bcs_utils.rs @@ -0,0 +1,38 @@ +// Copyright © Aptos Foundation + +pub fn size_u32_as_uleb128(mut value: usize) -> usize { + let mut len = 1; + while value >= 0x80 { + // 7 (lowest) bits of data get written in a single byte. 
+ len += 1; + value >>= 7; + } + len +} + +pub fn bcs_size_of_byte_array(length: usize) -> usize { + size_u32_as_uleb128(length) + length +} + +#[test] +fn test_size_u32_as_uleb128() { + assert_eq!(size_u32_as_uleb128(0), 1); + assert_eq!(size_u32_as_uleb128(127), 1); + assert_eq!(size_u32_as_uleb128(128), 2); + assert_eq!(size_u32_as_uleb128(128 * 128 - 1), 2); + assert_eq!(size_u32_as_uleb128(128 * 128), 3); +} + +#[test] +fn test_group_size_same_as_bcs() { + use bytes::Bytes; + + let reused_vec = Bytes::from(vec![5; 20000]); + + for i in [1, 2, 3, 5, 15, 100, 1000, 10000, 20000] { + assert_eq!( + bcs::serialized_size(&reused_vec.slice(0..i)).unwrap(), + bcs_size_of_byte_array(i) + ); + } +} diff --git a/types/src/serde_helper/mod.rs b/types/src/serde_helper/mod.rs index 876f018225ea6..9e2870b54f409 100644 --- a/types/src/serde_helper/mod.rs +++ b/types/src/serde_helper/mod.rs @@ -2,4 +2,5 @@ // Parts of the project are originally copyright © Meta Platforms, Inc. // SPDX-License-Identifier: Apache-2.0 +pub mod bcs_utils; pub mod vec_bytes; diff --git a/types/src/transaction/mod.rs b/types/src/transaction/mod.rs index 33c7105110703..86d066fb5ccee 100644 --- a/types/src/transaction/mod.rs +++ b/types/src/transaction/mod.rs @@ -6,10 +6,10 @@ use crate::{ account_address::AccountAddress, - aggregator::{TryFromMoveValue, TryIntoMoveValue}, block_metadata::BlockMetadata, chain_id::ChainId, contract_event::{ContractEvent, FEE_STATEMENT_EVENT_TYPE}, + delayed_fields::{ExtractUniqueIndex, TryFromMoveValue, TryIntoMoveValue}, ledger_info::LedgerInfo, proof::{TransactionInfoListWithProof, TransactionInfoWithProof}, state_store::ShardedStateUpdates, @@ -1996,6 +1996,8 @@ pub trait BlockExecutableTransaction: Sync + Send + Clone + 'static { + Debug + Copy + From + + From<(u32, u32)> + + ExtractUniqueIndex + TryIntoMoveValue + TryFromMoveValue; type Value: Send + Sync + Debug + Clone + TransactionWrite; diff --git a/types/src/transaction/signature_verified_transaction.rs 
b/types/src/transaction/signature_verified_transaction.rs index a3cf35feec5fe..9b846fd564383 100644 --- a/types/src/transaction/signature_verified_transaction.rs +++ b/types/src/transaction/signature_verified_transaction.rs @@ -2,8 +2,8 @@ // SPDX-License-Identifier: Apache-2.0 use crate::{ - aggregator::DelayedFieldID, contract_event::ContractEvent, + delayed_fields::DelayedFieldID, state_store::state_key::StateKey, transaction::{BlockExecutableTransaction, Transaction}, write_set::WriteOp, diff --git a/types/src/validator_txn.rs b/types/src/validator_txn.rs index a037930b7df25..d99b43a96a1d2 100644 --- a/types/src/validator_txn.rs +++ b/types/src/validator_txn.rs @@ -1,40 +1,30 @@ // Copyright © Aptos Foundation // SPDX-License-Identifier: Apache-2.0 +#[cfg(any(test, feature = "fuzzing"))] +use crate::dkg::DKGTranscriptMetadata; use crate::{dkg::DKGTranscript, jwks}; use aptos_crypto_derive::{BCSCryptoHash, CryptoHasher}; +#[cfg(any(test, feature = "fuzzing"))] +use move_core_types::account_address::AccountAddress; use serde::{Deserialize, Serialize}; use std::fmt::Debug; #[derive(Clone, Debug, Eq, PartialEq, Serialize, Deserialize, CryptoHasher, BCSCryptoHash)] pub enum ValidatorTransaction { - DummyTopic1(DummyValidatorTransaction), DKGResult(DKGTranscript), - DummyTopic2(DummyValidatorTransaction), ObservedJWKUpdate(jwks::QuorumCertifiedUpdate), } -#[derive(Clone, Debug, Eq, PartialEq, Serialize, Deserialize, CryptoHasher, BCSCryptoHash)] -pub struct DummyValidatorTransaction { - pub valid: bool, - #[serde(with = "serde_bytes")] - pub payload: Vec, -} - impl ValidatorTransaction { #[cfg(any(test, feature = "fuzzing"))] - pub fn dummy1(payload: Vec) -> Self { - Self::DummyTopic1(DummyValidatorTransaction { - valid: true, - payload, - }) - } - - #[cfg(any(test, feature = "fuzzing"))] - pub fn dummy2(payload: Vec) -> Self { - Self::DummyTopic2(DummyValidatorTransaction { - valid: true, - payload, + pub fn dummy(payload: Vec) -> Self { + 
Self::DKGResult(DKGTranscript { + metadata: DKGTranscriptMetadata { + epoch: 999, + author: AccountAddress::ZERO, + }, + transcript_bytes: payload, }) } @@ -44,9 +34,7 @@ impl ValidatorTransaction { pub fn topic(&self) -> Topic { match self { - ValidatorTransaction::DummyTopic1(_) => Topic::DUMMY1, ValidatorTransaction::DKGResult(_) => Topic::DKG, - ValidatorTransaction::DummyTopic2(_) => Topic::DUMMY2, ValidatorTransaction::ObservedJWKUpdate(update) => { Topic::JWK_CONSENSUS(update.update.issuer.clone()) }, @@ -59,6 +47,4 @@ impl ValidatorTransaction { pub enum Topic { DKG, JWK_CONSENSUS(jwks::Issuer), - DUMMY1, - DUMMY2, }