From 10ce1f242923be318226dca312f6547d7e6caab2 Mon Sep 17 00:00:00 2001 From: Benjamin Naecker Date: Tue, 14 Nov 2023 19:52:14 +0000 Subject: [PATCH] Add a producer kind to oximeter metric producers - Adds the `kind` enum to metric producer information, including DB schema, model, and various client parameter types. This records the supported types of metric producers, and is intended to aid debugging and future work around updates and instance lifecycle management. Note that this is currently a nullable / optional value, to aid schema upgrades with other clients outside the repo. A follow-up commit will make this required in the API call and `NOT NULL` in the database. - Add schema update files which create the DB enum type and add it as a column to the `metric_producer` table. This currently _drops_ the existing table and recreates it with the new column, rather than adding the column using `ALTER TABLE`. That is intended to remove old entries in bulk, since nothing previously removed the records for Propolis servers when their instance was stopped. --- clients/nexus-client/src/lib.rs | 14 ++++++ clients/oximeter-client/src/lib.rs | 14 ++++++ common/src/api/internal/nexus.rs | 21 +++++++++ nexus/db-model/src/producer_endpoint.rs | 37 +++++++++++++++ nexus/db-model/src/schema.rs | 3 +- nexus/db-queries/src/db/datastore/oximeter.rs | 1 + nexus/src/app/oximeter.rs | 4 ++ nexus/test-utils/src/lib.rs | 2 + openapi/nexus-internal.json | 45 ++++++++++++++++++- openapi/oximeter.json | 45 ++++++++++++++++++- oximeter/collector/src/agent.rs | 4 ++ oximeter/producer/examples/producer.rs | 2 + schema/crdb/11.0.0/up01.sql | 23 ++++++++++ schema/crdb/11.0.0/up02.sql | 11 +++++ schema/crdb/11.0.0/up03.sql | 17 +++++++ schema/crdb/11.0.0/up04.sql | 8 ++++ schema/crdb/dbinit.sql | 15 ++++++- sled-agent/src/sim/disk.rs | 2 + sled-agent/src/sled_agent.rs | 2 + 19 files changed, 266 insertions(+), 4 deletions(-) create mode 100644 schema/crdb/11.0.0/up01.sql create mode 100644 schema/crdb/11.0.0/up02.sql create mode 100644 schema/crdb/11.0.0/up03.sql create mode 100644 schema/crdb/11.0.0/up04.sql diff --git a/clients/nexus-client/src/lib.rs b/clients/nexus-client/src/lib.rs index 9f81492d106..f9f4739556d 100644 --- a/clients/nexus-client/src/lib.rs +++ b/clients/nexus-client/src/lib.rs @@ -202,6 +202,19 @@ impl From<&types::InstanceState> } } +impl From + for types::ProducerKind +{ + fn from(kind: omicron_common::api::internal::nexus::ProducerKind) -> Self { + use omicron_common::api::internal::nexus::ProducerKind; + match kind { + ProducerKind::SledAgent => Self::SledAgent, + ProducerKind::Service => Self::Service, + ProducerKind::Instance => Self::Instance, + } + } +} + impl From<&omicron_common::api::internal::nexus::ProducerEndpoint> for types::ProducerEndpoint { @@ -212,6 +225,7 @@ impl From<&omicron_common::api::internal::nexus::ProducerEndpoint> address: s.address.to_string(), base_route: s.base_route.clone(), id: s.id, + kind: s.kind.map(Into::into), interval: s.interval.into(), } } diff --git a/clients/oximeter-client/src/lib.rs b/clients/oximeter-client/src/lib.rs index 7bd17d7e767..8a03304e060 100644 --- a/clients/oximeter-client/src/lib.rs +++ b/clients/oximeter-client/src/lib.rs @@ -20,6 +20,19 @@ impl From for types::Duration { } } +impl From + for types::ProducerKind +{ + fn from(kind: omicron_common::api::internal::nexus::ProducerKind) -> Self { + use omicron_common::api::internal::nexus; + match kind { + nexus::ProducerKind::Service => Self::Service, + nexus::ProducerKind::SledAgent => Self::SledAgent, + nexus::ProducerKind::Instance => Self::Instance, + } + } +} + impl From<&omicron_common::api::internal::nexus::ProducerEndpoint> for types::ProducerEndpoint { @@ -30,6 +43,7 @@ impl From<&omicron_common::api::internal::nexus::ProducerEndpoint> address: s.address.to_string(), base_route: s.base_route.clone(), id: s.id, + kind: s.kind.map(Into::into), interval: s.interval.into(), } } diff --git a/common/src/api/internal/nexus.rs b/common/src/api/internal/nexus.rs index a4a539ad9b1..88e082d2cc8 100644 --- a/common/src/api/internal/nexus.rs +++ b/common/src/api/internal/nexus.rs @@ -84,13 +84,34 @@ pub struct SledInstanceState { // Oximeter producer/collector objects. +/// The kind of metric producer this is. +#[derive(Clone, Copy, Debug, Deserialize, JsonSchema, PartialEq, Serialize)] +#[serde(rename_all = "snake_case")] +pub enum ProducerKind { + /// The producer is a sled-agent. + SledAgent, + /// The producer is an Oxide-managed service. + Service, + /// The producer is a Propolis VMM managing a guest instance. + Instance, +} + /// Information announced by a metric server, used so that clients can contact it and collect /// available metric data from it. #[derive(Clone, Debug, Deserialize, JsonSchema, Serialize, PartialEq)] pub struct ProducerEndpoint { + /// A unique ID for this producer. pub id: Uuid, + /// The kind of producer. + pub kind: Option, + /// The IP address and port at which `oximeter` can collect metrics from the + /// producer. pub address: SocketAddr, + /// The API base route from which `oximeter` can collect metrics. + /// + /// The full route is `{base_route}/{id}`. pub base_route: String, + /// The interval on which `oximeter` should collect metrics. pub interval: Duration, } diff --git a/nexus/db-model/src/producer_endpoint.rs b/nexus/db-model/src/producer_endpoint.rs index 29e57b0877e..52a69e05085 100644 --- a/nexus/db-model/src/producer_endpoint.rs +++ b/nexus/db-model/src/producer_endpoint.rs @@ -3,12 +3,47 @@ // file, You can obtain one at https://mozilla.org/MPL/2.0/. use super::SqlU16; +use crate::impl_enum_type; use crate::schema::metric_producer; use db_macros::Asset; use nexus_types::identity::Asset; use omicron_common::api::internal; use uuid::Uuid; +impl_enum_type!( + #[derive(SqlType, Copy, Clone, Debug, QueryId)] + #[diesel(postgres_type(name = "producer_kind"))] + pub struct ProducerKindEnum; + + #[derive(AsExpression, Copy, Clone, Debug, FromSqlRow, PartialEq)] + #[diesel(sql_type = ProducerKindEnum)] + pub enum ProducerKind; + + SledAgent => b"sled_agent" + Service => b"service" + Instance => b"instance" +); + +impl From for ProducerKind { + fn from(kind: internal::nexus::ProducerKind) -> Self { + match kind { + internal::nexus::ProducerKind::SledAgent => ProducerKind::SledAgent, + internal::nexus::ProducerKind::Service => ProducerKind::Service, + internal::nexus::ProducerKind::Instance => ProducerKind::Instance, + } + } +} + +impl From for internal::nexus::ProducerKind { + fn from(kind: ProducerKind) -> Self { + match kind { + ProducerKind::SledAgent => internal::nexus::ProducerKind::SledAgent, + ProducerKind::Service => internal::nexus::ProducerKind::Service, + ProducerKind::Instance => internal::nexus::ProducerKind::Instance, + } + } +} + /// Information announced by a metric server, used so that clients can contact it and collect /// available metric data from it. #[derive(Queryable, Insertable, Debug, Clone, Selectable, Asset)] @@ -17,6 +52,7 @@ pub struct ProducerEndpoint { #[diesel(embed)] identity: ProducerEndpointIdentity, + pub kind: Option, pub ip: ipnetwork::IpNetwork, pub port: SqlU16, pub interval: f64, @@ -33,6 +69,7 @@ impl ProducerEndpoint { ) -> Self { Self { identity: ProducerEndpointIdentity::new(endpoint.id), + kind: endpoint.kind.map(Into::into), ip: endpoint.address.ip().into(), port: endpoint.address.port().into(), base_route: endpoint.base_route.clone(), diff --git a/nexus/db-model/src/schema.rs b/nexus/db-model/src/schema.rs index 7c6b8bbd0a2..b99ee2082e1 100644 --- a/nexus/db-model/src/schema.rs +++ b/nexus/db-model/src/schema.rs @@ -399,6 +399,7 @@ table! { id -> Uuid, time_created -> Timestamptz, time_modified -> Timestamptz, + kind -> Nullable, ip -> Inet, port -> Int4, interval -> Float8, @@ -1243,7 +1244,7 @@ table! { /// /// This should be updated whenever the schema is changed. For more details, /// refer to: schema/crdb/README.adoc -pub const SCHEMA_VERSION: SemverVersion = SemverVersion::new(10, 0, 0); +pub const SCHEMA_VERSION: SemverVersion = SemverVersion::new(11, 0, 0); allow_tables_to_appear_in_same_query!( system_update, diff --git a/nexus/db-queries/src/db/datastore/oximeter.rs b/nexus/db-queries/src/db/datastore/oximeter.rs index 55b650ea538..116e8586b07 100644 --- a/nexus/db-queries/src/db/datastore/oximeter.rs +++ b/nexus/db-queries/src/db/datastore/oximeter.rs @@ -96,6 +96,7 @@ impl DataStore { .do_update() .set(( dsl::time_modified.eq(Utc::now()), + dsl::kind.eq(producer.kind), dsl::ip.eq(producer.ip), dsl::port.eq(producer.port), dsl::interval.eq(producer.interval), diff --git a/nexus/src/app/oximeter.rs b/nexus/src/app/oximeter.rs index 03f833b087e..9dc5be0fc3a 100644 --- a/nexus/src/app/oximeter.rs +++ b/nexus/src/app/oximeter.rs @@ -116,6 +116,9 @@ impl super::Nexus { for producer in producers.into_iter() { let producer_info = oximeter_client::types::ProducerEndpoint { id: producer.id(), + kind: producer + .kind + .map(|kind| nexus::ProducerKind::from(kind).into()), address: SocketAddr::new( producer.ip.ip(), producer.port.try_into().unwrap(), @@ -139,6 +142,7 @@ impl super::Nexus { pub(crate) async fn register_as_producer(&self, address: SocketAddr) { let producer_endpoint = nexus::ProducerEndpoint { id: self.id, + kind: Some(nexus::ProducerKind::Service), address, base_route: String::from("/metrics/collect"), interval: Duration::from_secs(10), diff --git a/nexus/test-utils/src/lib.rs b/nexus/test-utils/src/lib.rs index 647232031d8..1e7de6132b5 100644 --- a/nexus/test-utils/src/lib.rs +++ b/nexus/test-utils/src/lib.rs @@ -30,6 +30,7 @@ use omicron_common::address::NEXUS_OPTE_IPV4_SUBNET; use omicron_common::api::external::MacAddr; use omicron_common::api::external::{IdentityMetadata, Name}; use omicron_common::api::internal::nexus::ProducerEndpoint; +use omicron_common::api::internal::nexus::ProducerKind; use omicron_common::api::internal::shared::SwitchLocation; use omicron_common::nexus_config; use omicron_common::nexus_config::NUM_INITIAL_RESERVED_IP_ADDRESSES; @@ -1092,6 +1093,7 @@ pub async fn start_producer_server( let producer_address = SocketAddr::new(Ipv6Addr::LOCALHOST.into(), 0); let server_info = ProducerEndpoint { id, + kind: Some(ProducerKind::Service), address: producer_address, base_route: "/collect".to_string(), interval: Duration::from_secs(1), diff --git a/openapi/nexus-internal.json b/openapi/nexus-internal.json index f83cf68a8a0..a319779de93 100644 --- a/openapi/nexus-internal.json +++ b/openapi/nexus-internal.json @@ -4224,17 +4224,34 @@ "type": "object", "properties": { "address": { + "description": "The IP address and port at which `oximeter` can collect metrics from the producer.", "type": "string" }, "base_route": { + "description": "The API base route from which `oximeter` can collect metrics.\n\nThe full route is `{base_route}/{id}`.", "type": "string" }, "id": { + "description": "A unique ID for this producer.", "type": "string", "format": "uuid" }, "interval": { - "$ref": "#/components/schemas/Duration" + "description": "The interval on which `oximeter` should collect metrics.", + "allOf": [ + { + "$ref": "#/components/schemas/Duration" + } + ] + }, + "kind": { + "nullable": true, + "description": "The kind of producer.", + "allOf": [ + { + "$ref": "#/components/schemas/ProducerKind" + } + ] } }, "required": [ @@ -4244,6 +4261,32 @@ "interval" ] }, + "ProducerKind": { + "description": "The kind of metric producer this is.", + "oneOf": [ + { + "description": "The producer is a sled-agent.", + "type": "string", + "enum": [ + "sled_agent" + ] + }, + { + "description": "The producer is an Oxide-managed service.", + "type": "string", + "enum": [ + "service" + ] + }, + { + "description": "The producer is a Propolis VMM managing a guest instance.", + "type": "string", + "enum": [ + "instance" + ] + } + ] + }, "ProducerResultsItem": { "oneOf": [ { diff --git a/openapi/oximeter.json b/openapi/oximeter.json index 529d20e921c..9dc9bc07074 100644 --- a/openapi/oximeter.json +++ b/openapi/oximeter.json @@ -191,17 +191,34 @@ "type": "object", "properties": { "address": { + "description": "The IP address and port at which `oximeter` can collect metrics from the producer.", "type": "string" }, "base_route": { + "description": "The API base route from which `oximeter` can collect metrics.\n\nThe full route is `{base_route}/{id}`.", "type": "string" }, "id": { + "description": "A unique ID for this producer.", "type": "string", "format": "uuid" }, "interval": { - "$ref": "#/components/schemas/Duration" + "description": "The interval on which `oximeter` should collect metrics.", + "allOf": [ + { + "$ref": "#/components/schemas/Duration" + } + ] + }, + "kind": { + "nullable": true, + "description": "The kind of producer.", + "allOf": [ + { + "$ref": "#/components/schemas/ProducerKind" + } + ] } }, "required": [ @@ -231,6 +248,32 @@ "required": [ "items" ] + }, + "ProducerKind": { + "description": "The kind of metric producer this is.", + "oneOf": [ + { + "description": "The producer is a sled-agent.", + "type": "string", + "enum": [ + "sled_agent" + ] + }, + { + "description": "The producer is an Oxide-managed service.", + "type": "string", + "enum": [ + "service" + ] + }, + { + "description": "The producer is a Propolis VMM managing a guest instance.", + "type": "string", + "enum": [ + "instance" + ] + } + ] } }, "responses": { diff --git a/oximeter/collector/src/agent.rs b/oximeter/collector/src/agent.rs index 23ff32ed668..f6da1729094 100644 --- a/oximeter/collector/src/agent.rs +++ b/oximeter/collector/src/agent.rs @@ -648,6 +648,7 @@ mod tests { use hyper::Response; use hyper::Server; use hyper::StatusCode; + use omicron_common::api::internal::nexus::ProducerKind; use omicron_test_utils::dev::test_setup_log; use std::convert::Infallible; use std::net::Ipv6Addr; @@ -694,6 +695,7 @@ mod tests { let interval = Duration::from_secs(1); let endpoint = ProducerEndpoint { id: Uuid::new_v4(), + kind: Some(ProducerKind::Service), address, base_route: String::from("/"), interval, @@ -752,6 +754,7 @@ mod tests { let interval = Duration::from_secs(1); let endpoint = ProducerEndpoint { id: Uuid::new_v4(), + kind: Some(ProducerKind::Service), address: SocketAddr::V6(SocketAddrV6::new( Ipv6Addr::LOCALHOST, 0, @@ -840,6 +843,7 @@ mod tests { let interval = Duration::from_secs(1); let endpoint = ProducerEndpoint { id: Uuid::new_v4(), + kind: Some(ProducerKind::Service), address, base_route: String::from("/"), interval, diff --git a/oximeter/producer/examples/producer.rs b/oximeter/producer/examples/producer.rs index dd9722c80a9..baa4f57bf70 100644 --- a/oximeter/producer/examples/producer.rs +++ b/oximeter/producer/examples/producer.rs @@ -15,6 +15,7 @@ use dropshot::ConfigLogging; use dropshot::ConfigLoggingLevel; use dropshot::HandlerTaskMode; use omicron_common::api::internal::nexus::ProducerEndpoint; +use omicron_common::api::internal::nexus::ProducerKind; use oximeter::types::Cumulative; use oximeter::types::ProducerRegistry; use oximeter::types::Sample; @@ -124,6 +125,7 @@ async fn main() -> anyhow::Result<()> { registry.register_producer(producer).unwrap(); let server_info = ProducerEndpoint { id: registry.producer_id(), + kind: Some(ProducerKind::Service), address: args.address, base_route: "/collect".to_string(), interval: Duration::from_secs(10), diff --git a/schema/crdb/11.0.0/up01.sql b/schema/crdb/11.0.0/up01.sql new file mode 100644 index 00000000000..acc666a98fc --- /dev/null +++ b/schema/crdb/11.0.0/up01.sql @@ -0,0 +1,23 @@ +/* + * Drop the entire metric producer assignment table. + * + * Programs wishing to produce metrics need to register with Nexus. That creates + * an assignment of the producer to a collector, which is recorded in this + * table. That registration is idempotent, and every _current_ producer will + * register when it restarts. For example, `dpd` includes a task that registers + * with Nexus, so each time it (re)starts, that registration will happen. + * + * With that in mind, dropping this table is safe, _because all updates are + * currently offline_. The current metric producers are: + * + * - `dpd` + * - Each `nexus` instance + * - Each `sled-agent` instance + * - The Propolis server for each guest Instance + * + * Each of this either does not exist at the time of an update, or will be + * restarted afterwards. Each will re-register, and so dropping this table one + * time is safe. It will be recreated in later schema upgrade files in this same + * update. + */ +DROP TABLE IF EXISTS omicron.public.metric_producer; diff --git a/schema/crdb/11.0.0/up02.sql b/schema/crdb/11.0.0/up02.sql new file mode 100644 index 00000000000..96c4c5d6b47 --- /dev/null +++ b/schema/crdb/11.0.0/up02.sql @@ -0,0 +1,11 @@ +/* + * The kind of metric producer each record corresponds to. + */ +CREATE TYPE IF NOT EXISTS omicron.public.producer_kind AS ENUM ( + -- A sled agent for an entry in the sled table. + 'sled_agent', + -- A service in the omicron.public.service table + 'service', + -- A Propolis VMM for an instance in the omicron.public.instance table + 'instance' +); diff --git a/schema/crdb/11.0.0/up03.sql b/schema/crdb/11.0.0/up03.sql new file mode 100644 index 00000000000..fc576675418 --- /dev/null +++ b/schema/crdb/11.0.0/up03.sql @@ -0,0 +1,17 @@ +/* + * Recreate the metric producer assignment table. + * + * Note that we're adding the `kind` column here, using the new enum in the + * previous update SQL file. + */ +CREATE TABLE IF NOT EXISTS omicron.public.metric_producer ( + id UUID PRIMARY KEY, + time_created TIMESTAMPTZ NOT NULL, + time_modified TIMESTAMPTZ NOT NULL, + kind omicron.public.producer_kind, + ip INET NOT NULL, + port INT4 CHECK (port BETWEEN 0 AND 65535) NOT NULL, + interval FLOAT NOT NULL, + base_route STRING(512) NOT NULL, + oximeter_id UUID NOT NULL +); diff --git a/schema/crdb/11.0.0/up04.sql b/schema/crdb/11.0.0/up04.sql new file mode 100644 index 00000000000..cad33ddcf2d --- /dev/null +++ b/schema/crdb/11.0.0/up04.sql @@ -0,0 +1,8 @@ +/* + * Recreate index to support looking up a producer by its assigned oximeter + * collector ID. + */ +CREATE UNIQUE INDEX IF NOT EXISTS lookup_producer_by_oximeter ON omicron.public.metric_producer ( + oximeter_id, + id +); diff --git a/schema/crdb/dbinit.sql b/schema/crdb/dbinit.sql index 875877ee967..b12141b2951 100644 --- a/schema/crdb/dbinit.sql +++ b/schema/crdb/dbinit.sql @@ -1108,6 +1108,18 @@ CREATE TABLE IF NOT EXISTS omicron.public.oximeter ( port INT4 CHECK (port BETWEEN 0 AND 65535) NOT NULL ); +/* + * The kind of metric producer each record corresponds to. + */ +CREATE TYPE IF NOT EXISTS omicron.public.producer_kind AS ENUM ( + -- A sled agent for an entry in the sled table. + 'sled_agent', + -- A service in the omicron.public.service table + 'service', + -- A Propolis VMM for an instance in the omicron.public.instance table + 'instance' +); + /* * Information about registered metric producers. */ @@ -1115,6 +1127,7 @@ CREATE TABLE IF NOT EXISTS omicron.public.metric_producer ( id UUID PRIMARY KEY, time_created TIMESTAMPTZ NOT NULL, time_modified TIMESTAMPTZ NOT NULL, + kind omicron.public.producer_kind, ip INET NOT NULL, port INT4 CHECK (port BETWEEN 0 AND 65535) NOT NULL, interval FLOAT NOT NULL, @@ -2838,7 +2851,7 @@ INSERT INTO omicron.public.db_metadata ( version, target_version ) VALUES - ( TRUE, NOW(), NOW(), '10.0.0', NULL) + ( TRUE, NOW(), NOW(), '11.0.0', NULL) ON CONFLICT DO NOTHING; COMMIT; diff --git a/sled-agent/src/sim/disk.rs b/sled-agent/src/sim/disk.rs index 2d2c18be258..e95b5c3c737 100644 --- a/sled-agent/src/sim/disk.rs +++ b/sled-agent/src/sim/disk.rs @@ -17,6 +17,7 @@ use omicron_common::api::external::Generation; use omicron_common::api::external::ResourceType; use omicron_common::api::internal::nexus::DiskRuntimeState; use omicron_common::api::internal::nexus::ProducerEndpoint; +use omicron_common::api::internal::nexus::ProducerKind; use oximeter_producer::LogConfig; use oximeter_producer::Server as ProducerServer; use propolis_client::api::DiskAttachmentState as PropolisDiskState; @@ -168,6 +169,7 @@ impl SimDisk { let producer_address = SocketAddr::new(Ipv6Addr::LOCALHOST.into(), 0); let server_info = ProducerEndpoint { id, + kind: Some(ProducerKind::SledAgent), address: producer_address, base_route: "/collect".to_string(), interval: Duration::from_millis(200), diff --git a/sled-agent/src/sled_agent.rs b/sled-agent/src/sled_agent.rs index aec64a1349f..861753a3ba4 100644 --- a/sled-agent/src/sled_agent.rs +++ b/sled-agent/src/sled_agent.rs @@ -43,6 +43,7 @@ use omicron_common::address::{ }; use omicron_common::api::external::Vni; use omicron_common::api::internal::nexus::ProducerEndpoint; +use omicron_common::api::internal::nexus::ProducerKind; use omicron_common::api::internal::nexus::{ SledInstanceState, VmmRuntimeState, }; @@ -504,6 +505,7 @@ impl SledAgent { // Nexus. This should not block progress here. let endpoint = ProducerEndpoint { id: request.body.id, + kind: Some(ProducerKind::SledAgent), address: sled_address.into(), base_route: String::from("/metrics/collect"), interval: crate::metrics::METRIC_COLLECTION_INTERVAL,