Skip to content

Commit

Permalink
Add health checks for each service (#802)
Browse files Browse the repository at this point in the history
* Add health checks for each service

* Clarify function naming

* Add documentation

* Update admin.md

Co-authored-by: Mark Mandel <[email protected]>

---------

Co-authored-by: Mark Mandel <[email protected]>
  • Loading branch information
XAMPPRocky and markmandel authored Oct 11, 2023
1 parent 5f94d28 commit b7a6c51
Show file tree
Hide file tree
Showing 20 changed files with 590 additions and 276 deletions.
3 changes: 2 additions & 1 deletion benches/throughput.rs
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,8 @@ fn run_quilkin(port: u16, endpoint: SocketAddr) {

runtime.block_on(async move {
let (_shutdown_tx, shutdown_rx) = tokio::sync::watch::channel::<()>(());
proxy.run(config, shutdown_rx).await.unwrap();
let admin = quilkin::cli::Admin::Proxy(<_>::default());
proxy.run(config, admin, shutdown_rx).await.unwrap();
});
});
}
Expand Down
29 changes: 25 additions & 4 deletions docs/src/deployment/admin.md
Original file line number Diff line number Diff line change
Expand Up @@ -34,15 +34,36 @@ The admin interface provides the following endpoints:
This provides a liveness probe endpoint, most commonly used in
[Kubernetes based systems](https://kubernetes.io/docs/tasks/configure-pod-container/configure-liveness-readiness-startup-probes/#define-a-liveness-command).

Will return an HTTP status of 200 when all health checks pass.
Liveness is defined as "hasn't panicked"; as long as the process has not
panicked, Quilkin is considered live.

### /ready

This provides a readiness probe endpoint, most commonly used in
[Kubernetes based systems](https://kubernetes.io/docs/tasks/configure-pod-container/configure-liveness-readiness-startup-probes/#define-readiness-probes).

Depending on whether Quilkin is run in Proxy mode i.e. `quilkin proxy`, vs an xDS provider mode, such as `quilkin
manage agones`, will dictate how readiness is calculated:
Readiness is service and provider specific, so based on what you're running
there will be different criteria for a service to be considered ready. Here's
a list of the criteria for each service and provider.

| Service | Readiness |
|---------|---------------------------------------------------------------------|
| Proxy | Management server is connected (or always true if config is static) AND if there is more than one endpoint configured|
| Manage | Provider is ready |
| Relay | Provider is ready |
| Agent | Provider is ready AND connected to relay |

| Provider | Readiness |
|----------|--------------------------------------------|
| Agones | The service is connected to kube-api |
| File | The service has found and watches the file |

When setting thresholds for your `proxy` probes, you generally want to set a low
check period (e.g. `periodSeconds=1`) and a low success threshold
(e.g. `successThreshold=1`), but a high `failureThreshold`
(e.g. `failureThreshold=60`) and `terminationGracePeriodSeconds` to allow for
backoff attempts and existing player sessions to continue without disruption.


#### Proxy Mode

Expand All @@ -67,4 +88,4 @@ See the [xDS Metrics](../services/xds/metrics.md) documentation for what xDS met
Returns a JSON representation of the cluster and filterchain configuration that the instance is running
with at the time of invocation.

[log-docs]: https://docs.rs/env_logger/latest/env_logger/#enabling-logging
[log-docs]: https://docs.rs/env_logger/latest/env_logger/#enabling-logging
4 changes: 3 additions & 1 deletion examples/quilkin-filter-example/src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -113,6 +113,8 @@ async fn main() -> quilkin::Result<()> {
)
});

proxy.run(config.into(), shutdown_rx).await
let admin = quilkin::cli::Admin::Proxy(<_>::default());

proxy.run(config.into(), admin, shutdown_rx).await
}
// ANCHOR_END: run
160 changes: 0 additions & 160 deletions src/admin.rs

This file was deleted.

84 changes: 53 additions & 31 deletions src/cli.rs
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@
* limitations under the License.
*/

pub(crate) mod admin;

use std::{
path::{Path, PathBuf},
sync::Arc,
Expand All @@ -23,12 +25,12 @@ use clap::builder::TypedValueParser;
use clap::crate_version;
use tokio::{signal, sync::watch};

use crate::{admin::Mode, Config};
use crate::Config;
use strum_macros::{Display, EnumString};

pub use self::{
agent::Agent, generate_config_schema::GenerateConfigSchema, manage::Manage, proxy::Proxy,
qcmp::Qcmp, relay::Relay,
admin::Admin, agent::Agent, generate_config_schema::GenerateConfigSchema, manage::Manage,
proxy::Proxy, qcmp::Qcmp, relay::Relay,
};

macro_rules! define_port {
Expand Down Expand Up @@ -106,10 +108,21 @@ pub enum Commands {
}

impl Commands {
pub fn admin_mode(&self) -> Option<Mode> {
pub fn admin_mode(&self) -> Option<Admin> {
match self {
Self::Proxy(_) => Some(Mode::Proxy),
Self::Relay(_) | Self::Manage(_) | Self::Agent(_) => Some(Mode::Xds),
Self::Proxy(proxy) => Some(Admin::Proxy(proxy::RuntimeConfig {
idle_request_interval_secs: proxy.idle_request_interval_secs,
..<_>::default()
})),
Self::Agent(agent) => Some(Admin::Agent(agent::RuntimeConfig {
idle_request_interval_secs: agent.idle_request_interval_secs,
..<_>::default()
})),
Self::Relay(relay) => Some(Admin::Relay(relay::RuntimeConfig {
idle_request_interval_secs: relay.idle_request_interval_secs,
..<_>::default()
})),
Self::Manage(_) => Some(Admin::Manage(<_>::default())),
Self::GenerateConfigSchema(_) | Self::Qcmp(_) => None,
}
}
Expand Down Expand Up @@ -148,24 +161,24 @@ impl Cli {
"Starting Quilkin"
);

if let Commands::Qcmp(Qcmp::Ping(ping)) = self.command {
return ping.run().await;
// Non-long running commands (e.g. ones with no administration server)
// are executed here.
match self.command {
Commands::Qcmp(Qcmp::Ping(ping)) => return ping.run().await,
Commands::GenerateConfigSchema(generator) => {
return generator.generate_config_schema();
}
_ => {}
}

tracing::debug!(cli = ?self, "config parameters");

let config = Arc::new(Self::read_config(self.config)?);
let _admin_task = self
.command
.admin_mode()
.filter(|_| !self.no_admin)
.map(|mode| {
tokio::spawn(crate::admin::server(
mode,
config.clone(),
self.admin_address,
))
});
let mode = self.command.admin_mode().unwrap();

if !self.no_admin {
mode.server(config.clone(), self.admin_address);
}

let (shutdown_tx, shutdown_rx) = watch::channel::<()>(());

Expand All @@ -191,37 +204,45 @@ impl Cli {

let fut = tryhard::retry_fn({
let shutdown_rx = shutdown_rx.clone();
let mode = mode.clone();
move || match self.command.clone() {
Commands::Agent(agent) => {
let config = config.clone();
let shutdown_rx = shutdown_rx.clone();
tokio::spawn(
async move { agent.run(config.clone(), shutdown_rx.clone()).await },
)
let mode = mode.clone();
tokio::spawn(async move {
agent.run(config.clone(), mode, shutdown_rx.clone()).await
})
}
Commands::Proxy(runner) => {
let config = config.clone();
let shutdown_rx = shutdown_rx.clone();
tokio::spawn(
async move { runner.run(config.clone(), shutdown_rx.clone()).await },
)
let mode = mode.clone();
tokio::spawn(async move {
runner
.run(config.clone(), mode.clone(), shutdown_rx.clone())
.await
})
}
Commands::Manage(manager) => {
let config = config.clone();
let shutdown_rx = shutdown_rx.clone();
let mode = mode.clone();
tokio::spawn(async move {
manager.manage(config.clone(), shutdown_rx.clone()).await
manager
.manage(config.clone(), mode, shutdown_rx.clone())
.await
})
}
Commands::GenerateConfigSchema(generator) => {
tokio::spawn(std::future::ready(generator.generate_config_schema()))
}
Commands::Relay(relay) => {
let config = config.clone();
let shutdown_rx = shutdown_rx.clone();
tokio::spawn(async move { relay.relay(config, shutdown_rx.clone()).await })
let mode = mode.clone();
tokio::spawn(
async move { relay.relay(config, mode, shutdown_rx.clone()).await },
)
}
Commands::Qcmp(_) => unreachable!(),
Commands::GenerateConfigSchema(_) | Commands::Qcmp(_) => unreachable!(),
}
})
.retries(3)
Expand Down Expand Up @@ -354,6 +375,7 @@ mod tests {
region: None,
sub_zone: None,
zone: None,
idle_request_interval_secs: admin::IDLE_REQUEST_INTERVAL_SECS,
qcmp_port: crate::test_utils::available_addr(&AddressType::Random)
.await
.port(),
Expand Down
Loading

0 comments on commit b7a6c51

Please sign in to comment.