Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: Add initial version prover_autoscaler #2993

Merged
merged 33 commits into from
Oct 9, 2024
Merged
Show file tree
Hide file tree
Changes from 19 commits
Commits
Show all changes
33 commits
Select commit Hold shift + click to select a range
80d8645
Initial version of Prover Autoscaler Agent
yorik Sep 5, 2024
e534339
Add scaler API and implementation
yorik Sep 9, 2024
8432072
Add drafts of global watcher/queuer/scaler.
yorik Sep 11, 2024
9873b13
fmt
yorik Sep 11, 2024
ff0bfb4
fmt
yorik Sep 11, 2024
ad1e170
More code
yorik Sep 24, 2024
d909fa1
Merge remote-tracking branch 'origin/main' into ya-zkd-1855-implement…
yorik Sep 24, 2024
ef7e1ab
Add config.
yorik Oct 1, 2024
bfac6a0
Merge remote-tracking branch 'origin/main' into ya-zkd-1855-implement…
yorik Oct 1, 2024
aa68e77
Lint fixes
yorik Oct 1, 2024
a194ba3
Add serde-human-readable feature
yorik Oct 1, 2024
4b31298
Better Duration conversion.
yorik Oct 1, 2024
76c516e
Submodule sync
yorik Oct 1, 2024
f9fcc14
Add monitoring for Scaler
yorik Oct 1, 2024
b36750e
Lint fix
yorik Oct 1, 2024
c70e457
Lint fixes
yorik Oct 1, 2024
fcc3b41
Remove unneeded dependencies
yorik Oct 2, 2024
e8a4e00
Remove unneeded use.
yorik Oct 2, 2024
23331e7
Cleanup
yorik Oct 2, 2024
84ee139
Improved style
yorik Oct 3, 2024
c0fb3aa
Merge remote-tracking branch 'origin/main' into ya-zkd-1855-implement…
yorik Oct 3, 2024
72eb1bd
Convert from let-else to different style
yorik Oct 7, 2024
562edf4
Added more values to config
yorik Oct 7, 2024
b09b8f2
Merge remote-tracking branch 'origin/main' into ya-zkd-1855-implement…
yorik Oct 7, 2024
e1af797
Fix lint
yorik Oct 7, 2024
7ca0ca2
Fix test and lint
yorik Oct 7, 2024
843525a
Merge remote-tracking branch 'origin/main' into ya-zkd-1855-implement…
yorik Oct 7, 2024
b2426e9
Fix lint and PR comments.
yorik Oct 7, 2024
ebc0695
Merge remote-tracking branch 'origin/main' into ya-zkd-1855-implement…
yorik Oct 7, 2024
e3520d9
Add observability.
yorik Oct 7, 2024
965f5c4
Move strum_macros to correct place
yorik Oct 8, 2024
60b05e1
Merge remote-tracking branch 'origin/main' into ya-zkd-1855-implement…
yorik Oct 9, 2024
c23a9af
Merge branch 'main' into ya-zkd-1855-implement-poc-of-quick-prover-au…
yorik Oct 9, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions core/lib/config/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ anyhow.workspace = true
rand.workspace = true
secrecy.workspace = true
serde = { workspace = true, features = ["derive"] }
time = { workspace = true, features = ["serde-human-readable"] }
yorik marked this conversation as resolved.
Show resolved Hide resolved

[dev-dependencies]
serde_json.workspace = true
Expand Down
1 change: 1 addition & 0 deletions core/lib/config/src/configs/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,7 @@ pub mod house_keeper;
pub mod object_store;
pub mod observability;
pub mod proof_data_handler;
pub mod prover_autoscaler;
pub mod prover_job_monitor;
pub mod pruning;
pub mod secrets;
Expand Down
65 changes: 65 additions & 0 deletions core/lib/config/src/configs/prover_autoscaler.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
use serde::Deserialize;
use time::Duration;

/// Config used for running ProverAutoscaler (both Scaler and Agent).
///
/// `agent_config` and `scaler_config` are both optional, so a single config
/// can describe either component or both.
#[derive(Debug, Clone, PartialEq, Deserialize)]
pub struct GeneralConfig {
/// Graceful shutdown timeout: amount of time ProverAutoscaler will wait for all its tasks
/// to finish. Defaults to 5 seconds (see `default_graceful_shutdown_timeout`).
yorik marked this conversation as resolved.
Show resolved Hide resolved
// TODO: find a way to use #[serde(with = "humantime_serde")] with time::Duration.
#[serde(default = "GeneralConfig::default_graceful_shutdown_timeout")]
pub graceful_shutdown_timeout: Duration,
/// Agent settings; `None` when the config has no Agent section.
pub agent_config: Option<ProverAutoscalerAgentConfig>,
/// Scaler settings; `None` when the config has no Scaler section.
pub scaler_config: Option<ProverAutoscalerScalerConfig>,
}

/// Settings for the ProverAutoscaler Agent.
#[derive(Debug, Clone, PartialEq, Deserialize)]
pub struct ProverAutoscalerAgentConfig {
/// Port for the Prometheus metrics connection.
pub prometheus_port: u16,
/// HTTP port for the Scaler to connect to.
yorik marked this conversation as resolved.
Show resolved Hide resolved
pub http_port: u16,
/// List of namespaces to watch. When omitted during deserialization, falls back to
/// `default_namespaces()` ("prover-blue" and "prover-red").
#[serde(default = "ProverAutoscalerAgentConfig::default_namespaces")]
pub namespaces: Vec<String>,
/// Name of the watched cluster. Can also be set via a flag.
pub cluster_name: Option<String>,
yorik marked this conversation as resolved.
Show resolved Hide resolved
}

/// Settings for the ProverAutoscaler Scaler.
#[derive(Debug, Clone, PartialEq, Deserialize)]
pub struct ProverAutoscalerScalerConfig {
/// Port for the Prometheus metrics connection.
pub prometheus_port: u16,
/// The interval between Scaler runs. Defaults to 10 seconds
/// (see `default_scaler_run_interval`).
#[serde(default = "ProverAutoscalerScalerConfig::default_scaler_run_interval")]
pub scaler_run_interval: Duration,
/// URL to get queue reports from. Defaults to the value of
/// `default_prover_job_monitor_url` when omitted during deserialization.
#[serde(default = "ProverAutoscalerScalerConfig::default_prover_job_monitor_url")]
pub prover_job_monitor_url: String,
/// List of ProverAutoscaler Agents to get data from.
pub agents: Vec<String>,
}

impl GeneralConfig {
    /// Fallback for `graceful_shutdown_timeout` when the config omits it: 5 seconds.
    pub fn default_graceful_shutdown_timeout() -> Duration {
        const DEFAULT_TIMEOUT_SECS: i64 = 5;
        Duration::seconds(DEFAULT_TIMEOUT_SECS)
    }
}

impl ProverAutoscalerAgentConfig {
    /// Namespaces used when the config does not list any: "prover-blue" and "prover-red".
    pub fn default_namespaces() -> Vec<String> {
        ["prover-blue", "prover-red"]
            .iter()
            .copied()
            .map(String::from)
            .collect()
    }
}

// Default-value providers referenced by the `#[serde(default = "...")]` attributes
// on `ProverAutoscalerScalerConfig`.
impl ProverAutoscalerScalerConfig {
/// Default `scaler_run_interval`: 10 seconds.
pub fn default_scaler_run_interval() -> Duration {
Duration::seconds(10)
}

/// Default `prover_job_monitor_url`: a cluster-local URL pointing at the
/// `/queue_report` endpoint of the prover job monitor service.
pub fn default_prover_job_monitor_url() -> String {
yorik marked this conversation as resolved.
Show resolved Hide resolved
"http://prover-job-monitor.stage2.svc.cluster.local:3074/queue_report".to_string()
}
}
1 change: 1 addition & 0 deletions core/lib/protobuf_config/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ rand.workspace = true
hex.workspace = true
secrecy.workspace = true
tracing.workspace = true
time.workspace = true
yorik marked this conversation as resolved.
Show resolved Hide resolved

[build-dependencies]
zksync_protobuf_build.workspace = true
1 change: 1 addition & 0 deletions core/lib/protobuf_config/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ mod snapshots_creator;
mod da_client;
mod external_price_api_client;
mod external_proof_integration_api;
mod prover_autoscaler;
mod prover_job_monitor;
mod snapshot_recovery;
#[cfg(test)]
Expand Down
25 changes: 25 additions & 0 deletions core/lib/protobuf_config/src/proto/config/prover_autoscaler.proto
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
syntax = "proto3";

package zksync.config.prover_autoscaler;

import "zksync/std.proto";

// Top-level ProverAutoscaler config; read into configs::prover_autoscaler::GeneralConfig.
message GeneralConfig {
optional std.Duration graceful_shutdown_timeout = 1; // optional; the reader falls back to a 5 second default when unset
optional ProverAutoscalerAgentConfig agent_config = 2; // optional
optional ProverAutoscalerScalerConfig scaler_config = 3; // optional
}

// Agent settings; read into configs::prover_autoscaler::ProverAutoscalerAgentConfig.
message ProverAutoscalerAgentConfig {
optional uint32 prometheus_port = 1; // required; must fit into a 16-bit port number
optional uint32 http_port = 2; // required; must fit into a 16-bit port number
repeated string namespaces = 3; // optional
optional string cluster_name = 4; // optional
}

// Scaler settings; read into configs::prover_autoscaler::ProverAutoscalerScalerConfig.
message ProverAutoscalerScalerConfig {
optional uint32 prometheus_port = 1; // required; must fit into a 16-bit port number
optional std.Duration scaler_run_interval = 2; // optional; the reader falls back to a 10 second default when unset
optional string prover_job_monitor_url = 3; // required
// NOTE(review): the Rust reader copies this list as-is and does not reject an empty
// list, so "at least one" is not enforced at parse time -- confirm where it is checked.
repeated string agents = 4; // required at least one
}
78 changes: 78 additions & 0 deletions core/lib/protobuf_config/src/prover_autoscaler.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
use anyhow::Context as _;
use zksync_config::configs;
use zksync_protobuf::{read_optional, repr::ProtoRepr, required, ProtoFmt};

use crate::{proto::prover_autoscaler as proto, read_optional_repr};

impl ProtoRepr for proto::GeneralConfig {
    type Type = configs::prover_autoscaler::GeneralConfig;

    /// Converts the protobuf message into the typed config.
    ///
    /// A missing `graceful_shutdown_timeout` is replaced by
    /// `default_graceful_shutdown_timeout()`; the nested Agent and Scaler sections
    /// are read through `read_optional_repr`.
    fn read(&self) -> anyhow::Result<Self::Type> {
        let graceful_shutdown_timeout = match read_optional(&self.graceful_shutdown_timeout)
            .context("graceful_shutdown_timeout")?
        {
            Some(timeout) => timeout,
            None => Self::Type::default_graceful_shutdown_timeout(),
        };
        let agent_config = read_optional_repr(&self.agent_config);
        let scaler_config = read_optional_repr(&self.scaler_config);

        Ok(Self::Type {
            graceful_shutdown_timeout,
            agent_config,
            scaler_config,
        })
    }

    /// Builds the protobuf message from the typed config; the timeout is always emitted.
    fn build(this: &Self::Type) -> Self {
        let agent_config = this
            .agent_config
            .as_ref()
            .map(proto::ProverAutoscalerAgentConfig::build);
        let scaler_config = this
            .scaler_config
            .as_ref()
            .map(proto::ProverAutoscalerScalerConfig::build);

        Self {
            graceful_shutdown_timeout: Some(ProtoFmt::build(&this.graceful_shutdown_timeout)),
            agent_config,
            scaler_config,
        }
    }
}

impl ProtoRepr for proto::ProverAutoscalerAgentConfig {
    type Type = configs::prover_autoscaler::ProverAutoscalerAgentConfig;

    /// Converts the protobuf message into the typed Agent config.
    ///
    /// # Errors
    /// Fails when `prometheus_port` or `http_port` is missing or does not fit into `u16`.
    fn read(&self) -> anyhow::Result<Self::Type> {
        Ok(Self::Type {
            prometheus_port: required(&self.prometheus_port)
                .and_then(|x| Ok((*x).try_into()?))
                .context("prometheus_port")?,
            http_port: required(&self.http_port)
                .and_then(|x| Ok((*x).try_into()?))
                .context("http_port")?,
            // Copied verbatim: an empty list stays empty here, the serde default
            // (`default_namespaces`) is not applied on this path.
            namespaces: self.namespaces.to_vec(),
            // Propagate the configured value instead of a hard-coded empty string, so
            // that a cluster name set in the config is honored and `build` -> `read`
            // round-trips. An unset field stays `None`.
            cluster_name: self.cluster_name.clone(),
        })
    }

    /// Builds the protobuf message from the typed Agent config.
    fn build(this: &Self::Type) -> Self {
        Self {
            prometheus_port: Some(this.prometheus_port.into()),
            http_port: Some(this.http_port.into()),
            namespaces: this.namespaces.clone(),
            cluster_name: this.cluster_name.clone(),
        }
    }
}

impl ProtoRepr for proto::ProverAutoscalerScalerConfig {
    type Type = configs::prover_autoscaler::ProverAutoscalerScalerConfig;

    /// Converts the protobuf message into the typed Scaler config.
    ///
    /// `prometheus_port` and `prover_job_monitor_url` must be present; a missing
    /// `scaler_run_interval` is replaced by `default_scaler_run_interval()`.
    fn read(&self) -> anyhow::Result<Self::Type> {
        let prometheus_port = required(&self.prometheus_port)
            .and_then(|port| Ok(u16::try_from(*port)?))
            .context("prometheus_port")?;
        let scaler_run_interval = match read_optional(&self.scaler_run_interval)
            .context("scaler_run_interval")?
        {
            Some(interval) => interval,
            None => Self::Type::default_scaler_run_interval(),
        };
        let prover_job_monitor_url = required(&self.prover_job_monitor_url)
            .context("prover_job_monitor_url")?
            .to_owned();

        Ok(Self::Type {
            prometheus_port,
            scaler_run_interval,
            prover_job_monitor_url,
            agents: self.agents.clone(),
        })
    }

    /// Builds the protobuf message from the typed Scaler config; every field is emitted.
    fn build(this: &Self::Type) -> Self {
        let prometheus_port = Some(u32::from(this.prometheus_port));
        let scaler_run_interval = Some(ProtoFmt::build(&this.scaler_run_interval));
        let prover_job_monitor_url = Some(this.prover_job_monitor_url.clone());

        Self {
            prometheus_port,
            scaler_run_interval,
            prover_job_monitor_url,
            agents: this.agents.to_vec(),
        }
    }
}
Loading
Loading