-
Notifications
You must be signed in to change notification settings - Fork 2.1k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
feat: Add initial version prover_autoscaler (#2993)
## What ❔ Add zksync_prover_autoscaler, which collects data, but only reports metrics instead of actual scaling. <!-- What are the changes this PR brings about? --> <!-- Example: This PR adds a PR template to the repo. --> <!-- (For bigger PRs adding more context is appreciated) --> ## Why ❔ First step in creating fast global prover autoscaler. <!-- Why are these changes done? What goal do they contribute to? What are the principles behind them? --> <!-- Example: PR templates ensure PR reviewers, observers, and future iterators are in context about the evolution of repos. --> ## Checklist <!-- Check your PR fulfills the following items. --> <!-- For draft PRs check the boxes as you complete them. --> - [x] PR title corresponds to the body of PR (we generate changelog entries from PRs). - [x] Tests for the changes have been added / updated. - [ ] Documentation comments have been added / updated. - [x] Code has been formatted via `zk fmt` and `zk lint`.
- Loading branch information
Showing
25 changed files
with
2,060 additions
and
103 deletions.
There are no files selected for viewing
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,117 @@ | ||
use std::collections::HashMap; | ||
|
||
use serde::Deserialize; | ||
use strum::Display; | ||
use strum_macros::EnumString; | ||
use time::Duration; | ||
use vise::EncodeLabelValue; | ||
|
||
use crate::configs::ObservabilityConfig; | ||
|
||
/// Config used for running ProverAutoscaler (both Scaler and Agent). | ||
#[derive(Debug, Clone, PartialEq)] | ||
pub struct ProverAutoscalerConfig { | ||
/// Amount of time ProverJobMonitor will wait all it's tasks to finish. | ||
// TODO: find a way to use #[serde(with = "humantime_serde")] with time::Duration. | ||
pub graceful_shutdown_timeout: Duration, | ||
pub agent_config: Option<ProverAutoscalerAgentConfig>, | ||
pub scaler_config: Option<ProverAutoscalerScalerConfig>, | ||
pub observability: Option<ObservabilityConfig>, | ||
} | ||
|
||
#[derive(Debug, Clone, PartialEq, Deserialize)] | ||
pub struct ProverAutoscalerAgentConfig { | ||
/// Port for prometheus metrics connection. | ||
pub prometheus_port: u16, | ||
/// HTTP port for global Scaler to connect to the Agent running in a cluster. | ||
pub http_port: u16, | ||
/// List of namespaces to watch. | ||
#[serde(default = "ProverAutoscalerAgentConfig::default_namespaces")] | ||
pub namespaces: Vec<String>, | ||
/// Watched cluster name. Also can be set via flag. | ||
pub cluster_name: Option<String>, | ||
} | ||
|
||
#[derive(Debug, Clone, PartialEq, Deserialize, Default)] | ||
pub struct ProverAutoscalerScalerConfig { | ||
/// Port for prometheus metrics connection. | ||
pub prometheus_port: u16, | ||
/// The interval between runs for global Scaler. | ||
#[serde(default = "ProverAutoscalerScalerConfig::default_scaler_run_interval")] | ||
pub scaler_run_interval: Duration, | ||
/// URL to get queue reports from. | ||
/// In production should be "http://prover-job-monitor.stage2.svc.cluster.local:3074/queue_report". | ||
#[serde(default = "ProverAutoscalerScalerConfig::default_prover_job_monitor_url")] | ||
pub prover_job_monitor_url: String, | ||
/// List of ProverAutoscaler Agents to get cluster data from. | ||
pub agents: Vec<String>, | ||
/// Mapping of namespaces to protocol versions. | ||
pub protocol_versions: HashMap<String, String>, | ||
/// Default priorities, which cluster to prefer when there is no other information. | ||
pub cluster_priorities: HashMap<String, u32>, | ||
/// Prover speed per GPU. Used to calculate desired number of provers for queue size. | ||
pub prover_speed: HashMap<Gpu, u32>, | ||
/// Duration after which pending pod considered long pending. | ||
#[serde(default = "ProverAutoscalerScalerConfig::default_long_pending_duration")] | ||
pub long_pending_duration: Duration, | ||
} | ||
|
||
#[derive( | ||
Default, | ||
Debug, | ||
Display, | ||
Hash, | ||
PartialEq, | ||
Eq, | ||
Clone, | ||
Copy, | ||
Ord, | ||
PartialOrd, | ||
EnumString, | ||
EncodeLabelValue, | ||
Deserialize, | ||
)] | ||
pub enum Gpu { | ||
#[default] | ||
Unknown, | ||
#[strum(ascii_case_insensitive)] | ||
L4, | ||
#[strum(ascii_case_insensitive)] | ||
T4, | ||
#[strum(ascii_case_insensitive)] | ||
V100, | ||
#[strum(ascii_case_insensitive)] | ||
P100, | ||
#[strum(ascii_case_insensitive)] | ||
A100, | ||
} | ||
|
||
impl ProverAutoscalerConfig { | ||
/// Default graceful shutdown timeout -- 5 seconds | ||
pub fn default_graceful_shutdown_timeout() -> Duration { | ||
Duration::seconds(5) | ||
} | ||
} | ||
|
||
impl ProverAutoscalerAgentConfig { | ||
pub fn default_namespaces() -> Vec<String> { | ||
vec!["prover-blue".to_string(), "prover-red".to_string()] | ||
} | ||
} | ||
|
||
impl ProverAutoscalerScalerConfig { | ||
/// Default scaler_run_interval -- 10s | ||
pub fn default_scaler_run_interval() -> Duration { | ||
Duration::seconds(10) | ||
} | ||
|
||
/// Default prover_job_monitor_url -- cluster local URL | ||
pub fn default_prover_job_monitor_url() -> String { | ||
"http://localhost:3074/queue_report".to_string() | ||
} | ||
|
||
/// Default long_pending_duration -- 10m | ||
pub fn default_long_pending_duration() -> Duration { | ||
Duration::minutes(10) | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.