From bd32aecdf982c51202c1a69d12fcf1d878fe6d05 Mon Sep 17 00:00:00 2001 From: Yury Akudovich Date: Tue, 5 Nov 2024 12:22:53 +0100 Subject: [PATCH] feat(prover): Add cluster name autodetection (#3227) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## What ❔ If cluster_name isn't provided via flag, get it from GCP. Removes unused `cluster_name` config option. ## Why ❔ To reduce possibility of errors in configs. ## Checklist - [x] PR title corresponds to the body of PR (we generate changelog entries from PRs). - [ ] Tests for the changes have been added / updated. - [ ] Documentation comments have been added / updated. - [x] Code has been formatted via `zkstack dev fmt` and `zkstack dev lint`. ref ZKD-1855 --- .../bin/prover_autoscaler/src/config.rs | 2 -- .../bin/prover_autoscaler/src/k8s/watcher.rs | 32 ++++++++++++++++++- .../crates/bin/prover_autoscaler/src/main.rs | 10 ++---- 3 files changed, 34 insertions(+), 10 deletions(-) diff --git a/prover/crates/bin/prover_autoscaler/src/config.rs b/prover/crates/bin/prover_autoscaler/src/config.rs index c34cff15951a..6729a5372d56 100644 --- a/prover/crates/bin/prover_autoscaler/src/config.rs +++ b/prover/crates/bin/prover_autoscaler/src/config.rs @@ -27,8 +27,6 @@ pub struct ProverAutoscalerAgentConfig { /// List of namespaces to watch. #[serde(default = "ProverAutoscalerAgentConfig::default_namespaces")] pub namespaces: Vec, - /// Watched cluster name. Also can be set via flag. - pub cluster_name: Option, /// If dry-run enabled don't do any k8s updates, just report success. #[serde(default = "ProverAutoscalerAgentConfig::default_dry_run")] pub dry_run: bool, diff --git a/prover/crates/bin/prover_autoscaler/src/k8s/watcher.rs b/prover/crates/bin/prover_autoscaler/src/k8s/watcher.rs index 707ff04f1836..b8476ab475ab 100644 --- a/prover/crates/bin/prover_autoscaler/src/k8s/watcher.rs +++ b/prover/crates/bin/prover_autoscaler/src/k8s/watcher.rs @@ -1,5 +1,6 @@ use std::{collections::HashMap, sync::Arc}; +use anyhow::Context; use chrono::{DateTime, Utc}; use futures::{stream, StreamExt, TryStreamExt}; use k8s_openapi::api; @@ -7,7 +8,12 @@ use kube::{ api::{Api, ResourceExt}, runtime::{watcher, WatchStreamExt}, }; +use reqwest::{ + header::{HeaderMap, HeaderValue}, + Method, +}; use tokio::sync::Mutex; +use zksync_utils::http_with_retries::send_request_with_retries; use crate::cluster_types::{Cluster, Deployment, Namespace, Pod, ScaleEvent}; @@ -17,13 +23,37 @@ pub struct Watcher { pub cluster: Arc>, } +async fn get_cluster_name() -> anyhow::Result { + let mut headers = HeaderMap::new(); + headers.insert("Metadata-Flavor", HeaderValue::from_static("Google")); + let url = "http://metadata.google.internal/computeMetadata/v1/instance/attributes/cluster-name"; + let response = send_request_with_retries(url, 5, Method::GET, Some(headers), None).await; + response + .map_err(|err| anyhow::anyhow!("Failed fetching response from url: {url}: {err:?}"))? + .text() + .await + .context("Failed to read response as text") +} + impl Watcher { - pub fn new(client: kube::Client, cluster_name: String, namespaces: Vec) -> Self { + pub async fn new( + client: kube::Client, + cluster_name: Option, + namespaces: Vec, + ) -> Self { let mut ns = HashMap::new(); namespaces.into_iter().for_each(|n| { ns.insert(n, Namespace::default()); }); + let cluster_name = match cluster_name { + Some(c) => c, + None => get_cluster_name() + .await + .expect("Load cluster_name from GCP"), + }; + tracing::info!("Agent cluster name is {cluster_name}"); + Self { client, cluster: Arc::new(Mutex::new(Cluster { diff --git a/prover/crates/bin/prover_autoscaler/src/main.rs b/prover/crates/bin/prover_autoscaler/src/main.rs index b153cb9c4d59..98ffdb49d824 100644 --- a/prover/crates/bin/prover_autoscaler/src/main.rs +++ b/prover/crates/bin/prover_autoscaler/src/main.rs @@ -76,10 +76,7 @@ async fn main() -> anyhow::Result<()> { match opt.job { AutoscalerType::Agent => { - let cluster = opt - .cluster_name - .context("cluster_name is required for Agent")?; - tracing::info!("Starting ProverAutoscaler Agent for cluster {}", cluster); + tracing::info!("Starting ProverAutoscaler Agent"); let agent_config = general_config.agent_config.context("agent_config")?; let exporter_config = PrometheusExporterConfig::pull(agent_config.prometheus_port); tasks.push(tokio::spawn(exporter_config.run(stop_receiver.clone()))); @@ -87,9 +84,8 @@ async fn main() -> anyhow::Result<()> { let _ = rustls::crypto::ring::default_provider().install_default(); let client = kube::Client::try_default().await?; - // TODO: maybe get cluster name from curl -H "Metadata-Flavor: Google" - // http://metadata.google.internal/computeMetadata/v1/instance/attributes/cluster-name - let watcher = Watcher::new(client.clone(), cluster, agent_config.namespaces); + let watcher = + Watcher::new(client.clone(), opt.cluster_name, agent_config.namespaces).await; let scaler = Scaler::new(client, agent_config.dry_run); tasks.push(tokio::spawn(watcher.clone().run())); tasks.push(tokio::spawn(agent::run_server(