Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: Add initial version prover_autoscaler #2993

Merged
merged 33 commits into from
Oct 9, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
33 commits
Select commit Hold shift + click to select a range
80d8645
Initial version of Prover Autoscaler Agent
yorik Sep 5, 2024
e534339
Add scaler API and implementation
yorik Sep 9, 2024
8432072
Add drafts of global watcher/queuer/scaler.
yorik Sep 11, 2024
9873b13
fmt
yorik Sep 11, 2024
ff0bfb4
fmt
yorik Sep 11, 2024
ad1e170
More code
yorik Sep 24, 2024
d909fa1
Merge remote-tracking branch 'origin/main' into ya-zkd-1855-implement…
yorik Sep 24, 2024
ef7e1ab
Add config.
yorik Oct 1, 2024
bfac6a0
Merge remote-tracking branch 'origin/main' into ya-zkd-1855-implement…
yorik Oct 1, 2024
aa68e77
Lint fixes
yorik Oct 1, 2024
a194ba3
Add serde-human-readable feature
yorik Oct 1, 2024
4b31298
Better Duration conversion.
yorik Oct 1, 2024
76c516e
Submodule sync
yorik Oct 1, 2024
f9fcc14
Add monitoring for Scaler
yorik Oct 1, 2024
b36750e
Lint fix
yorik Oct 1, 2024
c70e457
Lint fixes
yorik Oct 1, 2024
fcc3b41
Remove unneeded dependencies
yorik Oct 2, 2024
e8a4e00
Remove unneeded use.
yorik Oct 2, 2024
23331e7
Cleanup
yorik Oct 2, 2024
84ee139
Improved style
yorik Oct 3, 2024
c0fb3aa
Merge remote-tracking branch 'origin/main' into ya-zkd-1855-implement…
yorik Oct 3, 2024
72eb1bd
Convert from let-else to different style
yorik Oct 7, 2024
562edf4
Added more values to config
yorik Oct 7, 2024
b09b8f2
Merge remote-tracking branch 'origin/main' into ya-zkd-1855-implement…
yorik Oct 7, 2024
e1af797
Fix lint
yorik Oct 7, 2024
7ca0ca2
Fix test and lint
yorik Oct 7, 2024
843525a
Merge remote-tracking branch 'origin/main' into ya-zkd-1855-implement…
yorik Oct 7, 2024
b2426e9
Fix lint and PR comments.
yorik Oct 7, 2024
ebc0695
Merge remote-tracking branch 'origin/main' into ya-zkd-1855-implement…
yorik Oct 7, 2024
e3520d9
Add observability.
yorik Oct 7, 2024
965f5c4
Move strum_macros to correct place
yorik Oct 8, 2024
60b05e1
Merge remote-tracking branch 'origin/main' into ya-zkd-1855-implement…
yorik Oct 9, 2024
c23a9af
Merge branch 'main' into ya-zkd-1855-implement-poc-of-quick-prover-au…
yorik Oct 9, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

163 changes: 82 additions & 81 deletions Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,85 +1,85 @@
[workspace]
members = [
# Binaries
"core/bin/block_reverter",
"core/bin/contract-verifier",
"core/bin/external_node",
"core/bin/merkle_tree_consistency_checker",
"core/bin/snapshots_creator",
"core/bin/selector_generator",
"core/bin/system-constants-generator",
"core/bin/verified_sources_fetcher",
"core/bin/zksync_server",
"core/bin/genesis_generator",
"core/bin/zksync_tee_prover",
# Node services
"core/node/node_framework",
"core/node/proof_data_handler",
"core/node/block_reverter",
"core/node/commitment_generator",
"core/node/house_keeper",
"core/node/genesis",
"core/node/shared_metrics",
"core/node/db_pruner",
"core/node/fee_model",
"core/node/da_dispatcher",
"core/node/eth_sender",
"core/node/vm_runner",
"core/node/test_utils",
"core/node/state_keeper",
"core/node/reorg_detector",
"core/node/consistency_checker",
"core/node/metadata_calculator",
"core/node/node_sync",
"core/node/node_storage_init",
"core/node/consensus",
"core/node/contract_verification_server",
"core/node/api_server",
"core/node/tee_verifier_input_producer",
"core/node/base_token_adjuster",
"core/node/external_proof_integration_api",
"core/node/logs_bloom_backfill",
"core/node/da_clients",
# Libraries
"core/lib/db_connection",
"core/lib/zksync_core_leftovers",
"core/lib/basic_types",
"core/lib/config",
"core/lib/constants",
"core/lib/contract_verifier",
"core/lib/contracts",
"core/lib/circuit_breaker",
"core/lib/dal",
"core/lib/env_config",
"core/lib/da_client",
"core/lib/eth_client",
"core/lib/eth_signer",
"core/lib/l1_contract_interface",
"core/lib/mempool",
"core/lib/merkle_tree",
"core/lib/mini_merkle_tree",
"core/lib/node_framework_derive",
"core/lib/object_store",
"core/lib/prover_interface",
"core/lib/queued_job_processor",
"core/lib/state",
"core/lib/storage",
"core/lib/tee_verifier",
"core/lib/types",
"core/lib/protobuf_config",
"core/lib/utils",
"core/lib/vlog",
"core/lib/multivm",
"core/lib/vm_interface",
"core/lib/vm_executor",
"core/lib/web3_decl",
"core/lib/snapshots_applier",
"core/lib/crypto_primitives",
"core/lib/external_price_api",
# Test infrastructure
"core/tests/test_account",
"core/tests/loadnext",
"core/tests/vm-benchmark",
# Binaries
"core/bin/block_reverter",
"core/bin/contract-verifier",
"core/bin/external_node",
"core/bin/merkle_tree_consistency_checker",
"core/bin/snapshots_creator",
"core/bin/selector_generator",
"core/bin/system-constants-generator",
"core/bin/verified_sources_fetcher",
"core/bin/zksync_server",
"core/bin/genesis_generator",
"core/bin/zksync_tee_prover",
# Node services
"core/node/node_framework",
"core/node/proof_data_handler",
"core/node/block_reverter",
"core/node/commitment_generator",
"core/node/house_keeper",
"core/node/genesis",
"core/node/shared_metrics",
"core/node/db_pruner",
"core/node/fee_model",
"core/node/da_dispatcher",
"core/node/eth_sender",
"core/node/vm_runner",
"core/node/test_utils",
"core/node/state_keeper",
"core/node/reorg_detector",
"core/node/consistency_checker",
"core/node/metadata_calculator",
"core/node/node_sync",
"core/node/node_storage_init",
"core/node/consensus",
"core/node/contract_verification_server",
"core/node/api_server",
"core/node/tee_verifier_input_producer",
"core/node/base_token_adjuster",
"core/node/external_proof_integration_api",
"core/node/logs_bloom_backfill",
"core/node/da_clients",
# Libraries
"core/lib/db_connection",
"core/lib/zksync_core_leftovers",
"core/lib/basic_types",
"core/lib/config",
"core/lib/constants",
"core/lib/contract_verifier",
"core/lib/contracts",
"core/lib/circuit_breaker",
"core/lib/dal",
"core/lib/env_config",
"core/lib/da_client",
"core/lib/eth_client",
"core/lib/eth_signer",
"core/lib/l1_contract_interface",
"core/lib/mempool",
"core/lib/merkle_tree",
"core/lib/mini_merkle_tree",
"core/lib/node_framework_derive",
"core/lib/object_store",
"core/lib/prover_interface",
"core/lib/queued_job_processor",
"core/lib/state",
"core/lib/storage",
"core/lib/tee_verifier",
"core/lib/types",
"core/lib/protobuf_config",
"core/lib/utils",
"core/lib/vlog",
"core/lib/multivm",
"core/lib/vm_interface",
"core/lib/vm_executor",
"core/lib/web3_decl",
"core/lib/snapshots_applier",
"core/lib/crypto_primitives",
"core/lib/external_price_api",
# Test infrastructure
"core/tests/test_account",
"core/tests/loadnext",
"core/tests/vm-benchmark",
]
resolver = "2"

Expand Down Expand Up @@ -172,6 +172,7 @@ sqlx = "0.8.1"
static_assertions = "1.1"
structopt = "0.3.20"
strum = "0.26"
strum_macros = "0.26.4"
tempfile = "3.0.2"
test-casing = "0.1.2"
test-log = "0.2.15"
Expand All @@ -185,7 +186,7 @@ tower-http = "0.5.2"
tracing = "0.1"
tracing-subscriber = "0.3"
tracing-opentelemetry = "0.25.0"
time = "0.3.36" # Has to be same as used by `tracing-subscriber`
time = "0.3.36" # Has to be same as used by `tracing-subscriber`
url = "2"
web3 = "0.19.0"
fraction = "0.15.3"
Expand Down
4 changes: 4 additions & 0 deletions core/lib/config/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,10 @@ anyhow.workspace = true
rand.workspace = true
secrecy.workspace = true
serde = { workspace = true, features = ["derive"] }
time = { workspace = true, features = ["serde-human-readable"] }
yorik marked this conversation as resolved.
Show resolved Hide resolved
strum.workspace = true
strum_macros.workspace = true
vise.workspace = true

[dev-dependencies]
serde_json.workspace = true
Expand Down
1 change: 1 addition & 0 deletions core/lib/config/src/configs/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,7 @@ pub mod house_keeper;
pub mod object_store;
pub mod observability;
pub mod proof_data_handler;
pub mod prover_autoscaler;
pub mod prover_job_monitor;
pub mod pruning;
pub mod secrets;
Expand Down
117 changes: 117 additions & 0 deletions core/lib/config/src/configs/prover_autoscaler.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,117 @@
use std::collections::HashMap;

use serde::Deserialize;
use strum::Display;
use strum_macros::EnumString;
use time::Duration;
use vise::EncodeLabelValue;

use crate::configs::ObservabilityConfig;

/// Config used for running ProverAutoscaler (both Scaler and Agent).
#[derive(Debug, Clone, PartialEq)]
pub struct ProverAutoscalerConfig {
/// Amount of time ProverJobMonitor will wait all it's tasks to finish.
yorik marked this conversation as resolved.
Show resolved Hide resolved
// TODO: find a way to use #[serde(with = "humantime_serde")] with time::Duration.
pub graceful_shutdown_timeout: Duration,
pub agent_config: Option<ProverAutoscalerAgentConfig>,
pub scaler_config: Option<ProverAutoscalerScalerConfig>,
pub observability: Option<ObservabilityConfig>,
}

/// Config for the ProverAutoscaler Agent, which runs in a cluster and is
/// queried by the global Scaler over HTTP.
#[derive(Debug, Clone, PartialEq, Deserialize)]
pub struct ProverAutoscalerAgentConfig {
    /// Port for prometheus metrics connection.
    pub prometheus_port: u16,
    /// HTTP port for global Scaler to connect to the Agent running in a cluster.
    pub http_port: u16,
    /// List of namespaces to watch.
    ///
    /// Falls back to [`ProverAutoscalerAgentConfig::default_namespaces`] when
    /// the field is omitted from the deserialized input.
    #[serde(default = "ProverAutoscalerAgentConfig::default_namespaces")]
    pub namespaces: Vec<String>,
    /// Watched cluster name. Also can be set via flag.
    pub cluster_name: Option<String>,
}

// NOTE(review): `Default` is derived, so `Self::default()` does not go through the
// `serde(default = "...")` functions referenced below; it presumably yields a zero
// interval/duration and an empty URL rather than the documented defaults -- confirm
// that this is intended.
/// Config for the global Scaler part of ProverAutoscaler.
#[derive(Debug, Clone, PartialEq, Deserialize, Default)]
pub struct ProverAutoscalerScalerConfig {
/// Port for prometheus metrics connection.
pub prometheus_port: u16,
/// The interval between runs for global Scaler.
/// When omitted, deserialization uses `default_scaler_run_interval`.
#[serde(default = "ProverAutoscalerScalerConfig::default_scaler_run_interval")]
pub scaler_run_interval: Duration,
/// URL to get queue reports from.
/// In production should be "http://prover-job-monitor.stage2.svc.cluster.local:3074/queue_report".
/// When omitted, deserialization uses `default_prover_job_monitor_url`.
#[serde(default = "ProverAutoscalerScalerConfig::default_prover_job_monitor_url")]
pub prover_job_monitor_url: String,
/// List of ProverAutoscaler Agents to get cluster data from.
pub agents: Vec<String>,
/// Mapping of namespaces to protocol versions.
pub protocol_versions: HashMap<String, String>,
/// Default priorities, which cluster to prefer when there is no other information.
pub cluster_priorities: HashMap<String, u32>,
/// Prover speed per GPU. Used to calculate desired number of provers for queue size.
pub prover_speed: HashMap<Gpu, u32>,
/// Duration after which a pending pod is considered long pending.
/// When omitted, deserialization uses `default_long_pending_duration`.
#[serde(default = "ProverAutoscalerScalerConfig::default_long_pending_duration")]
pub long_pending_duration: Duration,
}

/// GPU model identifier; used as the key of
/// `ProverAutoscalerScalerConfig::prover_speed`.
///
/// `Unknown` is the `Default` value. Every other variant is marked
/// `ascii_case_insensitive`, which makes string parsing via the derived
/// `EnumString` ignore ASCII case for that variant; `Unknown` carries no such
/// attribute.
// NOTE(review): the strum attribute presumably does not affect the derived serde
// `Deserialize`, which would then still match variant names exactly -- confirm.
#[derive(
Default,
Debug,
Display,
Hash,
PartialEq,
Eq,
Clone,
Copy,
Ord,
PartialOrd,
EnumString,
EncodeLabelValue,
Deserialize,
)]
pub enum Gpu {
/// Fallback value when the GPU model is not known.
#[default]
Unknown,
#[strum(ascii_case_insensitive)]
L4,
#[strum(ascii_case_insensitive)]
T4,
#[strum(ascii_case_insensitive)]
V100,
#[strum(ascii_case_insensitive)]
P100,
#[strum(ascii_case_insensitive)]
A100,
}

impl ProverAutoscalerConfig {
    /// Returns the default graceful shutdown timeout: 5 seconds.
    pub fn default_graceful_shutdown_timeout() -> Duration {
        const DEFAULT_TIMEOUT_SECS: i64 = 5;
        Duration::seconds(DEFAULT_TIMEOUT_SECS)
    }
}

impl ProverAutoscalerAgentConfig {
    /// Returns the namespaces watched when none are configured:
    /// `prover-blue` and `prover-red`.
    pub fn default_namespaces() -> Vec<String> {
        ["prover-blue", "prover-red"]
            .iter()
            .map(|namespace| String::from(*namespace))
            .collect()
    }
}

impl ProverAutoscalerScalerConfig {
    /// Default `scaler_run_interval`: 10 seconds.
    pub fn default_scaler_run_interval() -> Duration {
        Duration::seconds(10)
    }

    /// Default `prover_job_monitor_url`: the queue report endpoint on localhost.
    ///
    /// This is not the in-cluster production address; production deployments
    /// are expected to set the URL explicitly.
    pub fn default_prover_job_monitor_url() -> String {
        "http://localhost:3074/queue_report".to_string()
    }

    /// Default `long_pending_duration`: 10 minutes.
    pub fn default_long_pending_duration() -> Duration {
        Duration::minutes(10)
    }
}
1 change: 1 addition & 0 deletions core/lib/protobuf_config/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ rand.workspace = true
hex.workspace = true
secrecy.workspace = true
tracing.workspace = true
time.workspace = true
yorik marked this conversation as resolved.
Show resolved Hide resolved

[build-dependencies]
zksync_protobuf_build.workspace = true
1 change: 1 addition & 0 deletions core/lib/protobuf_config/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ mod observability;
mod proof_data_handler;
pub mod proto;
mod prover;
mod prover_autoscaler;
mod prover_job_monitor;
mod pruning;
mod secrets;
Expand Down
Loading
Loading