Skip to content

Commit

Permalink
[sled-agent] Separate CockroachDB "start" from CockroachDB "init" (#2954
Browse files Browse the repository at this point in the history
)

Co-authored-by: David Pacheco <[email protected]>
  • Loading branch information
smklein and davepacheco authored Jun 29, 2023
1 parent cdf91f2 commit b884c76
Show file tree
Hide file tree
Showing 17 changed files with 633 additions and 133 deletions.
14 changes: 14 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 2 additions & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ members = [
"installinator-artifactd",
"installinator-common",
"internal-dns",
"internal-dns-cli",
"ipcc-key-value",
"key-manager",
"nexus",
Expand Down Expand Up @@ -83,6 +84,7 @@ default-members = [
"installinator-artifactd",
"installinator-common",
"internal-dns",
"internal-dns-cli",
"ipcc-key-value",
"key-manager",
"nexus",
Expand Down
11 changes: 8 additions & 3 deletions dns-server/src/dns_server.rs
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,7 @@ use anyhow::anyhow;
use anyhow::Context;
use pretty_hex::*;
use serde::Deserialize;
use slog::info;
use slog::{debug, error, o, Logger};
use slog::{debug, error, info, o, trace, Logger};
use std::net::SocketAddr;
use std::str::FromStr;
use std::sync::Arc;
Expand Down Expand Up @@ -150,7 +149,7 @@ async fn handle_dns_packet(request: Request) {
let log = &request.log;
let buf = &request.packet;

debug!(&log, "buffer"; "buffer" => ?buf.hex_dump());
trace!(&log, "buffer"; "buffer" => ?buf.hex_dump());

// Decode the message.
let mut dec = BinDecoder::new(&buf);
Expand Down Expand Up @@ -270,6 +269,12 @@ async fn handle_dns_message(
}
})
.collect::<Result<Vec<_>, RequestError>>()?;
debug!(
&log,
"dns response";
"query" => ?query,
"records" => ?&response_records
);
respond_records(request, rb, header, &response_records).await
}

Expand Down
4 changes: 4 additions & 0 deletions illumos-utils/src/running_zone.rs
Original file line number Diff line number Diff line change
Expand Up @@ -260,6 +260,10 @@ impl RunningZone {
self.inner.zonepath.join("root")
}

pub fn control_interface(&self) -> AddrObject {
AddrObject::new(self.inner.get_control_vnic_name(), "omicron6").unwrap()
}

/// Runs a command within the Zone, return the output.
//
// NOTE: It's important that this function is synchronous.
Expand Down
15 changes: 15 additions & 0 deletions internal-dns-cli/Cargo.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
[package]
name = "internal-dns-cli"
version = "0.1.0"
edition = "2021"
license = "MPL-2.0"

[dependencies]
anyhow.workspace = true
clap.workspace = true
dropshot.workspace = true
internal-dns.workspace = true
omicron-common.workspace = true
slog.workspace = true
tokio.workspace = true
trust-dns-resolver.workspace = true
98 changes: 98 additions & 0 deletions internal-dns-cli/src/bin/dnswait.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,98 @@
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

//! Resolves DNS names within the Oxide control plane
use anyhow::Context;
use anyhow::Result;
use clap::Parser;
use clap::ValueEnum;
use internal_dns::resolver::ResolveError;
use internal_dns::resolver::Resolver;
use slog::{info, warn};
use std::net::SocketAddr;

#[derive(Debug, Parser)]
#[clap(name = "dnswait", about = "Resolves DNS names in the control plane")]
struct Opt {
/// Nameserver(s) to query
///
/// If unspecified, uses the system configuration (usually the nameservers
/// configured in /etc/resolv.conf).
#[clap(long, action)]
nameserver_addresses: Vec<SocketAddr>,

/// service name to be resolved (should be the target of a DNS name)
#[arg(value_enum)]
srv_name: ServiceName,
}

#[derive(Debug, Clone, Copy, ValueEnum)]
#[value(rename_all = "kebab-case")]
enum ServiceName {
Cockroach,
}

impl From<ServiceName> for internal_dns::ServiceName {
fn from(value: ServiceName) -> Self {
match value {
ServiceName::Cockroach => internal_dns::ServiceName::Cockroach,
}
}
}

#[tokio::main]
async fn main() -> Result<()> {
let opt = Opt::parse();
let log = dropshot::ConfigLogging::File {
path: "/dev/stderr".into(),
level: dropshot::ConfigLoggingLevel::Info,
if_exists: dropshot::ConfigLoggingIfExists::Append,
}
.to_logger("dnswait")
.context("creating log")?;

let resolver = if opt.nameserver_addresses.is_empty() {
info!(&log, "using system configuration");
let async_resolver =
trust_dns_resolver::AsyncResolver::tokio_from_system_conf()
.context("initializing resolver from system configuration")?;
Resolver::new_with_resolver(log.clone(), async_resolver)
} else {
let addrs = opt.nameserver_addresses;
info!(&log, "using explicit nameservers"; "nameservers" => ?addrs);
Resolver::new_from_addrs(log.clone(), addrs)
.context("creating resolver with explicit nameserver addresses")?
};

let result = omicron_common::backoff::retry_notify(
omicron_common::backoff::retry_policy_internal_service(),
|| async {
let dns_name = internal_dns::ServiceName::from(opt.srv_name);
resolver.lookup_srv(dns_name).await.map_err(|error| match error {
ResolveError::Resolve(_)
| ResolveError::NotFound(_)
| ResolveError::NotFoundByString(_) => {
omicron_common::backoff::BackoffError::transient(error)
}
})
},
|error, delay| {
warn!(
&log,
"DNS query failed; will try again";
"error" => format!("{:#}", error),
"delay" => ?delay,
);
},
)
.await
.context("unexpectedly gave up")?;

for ip in result {
println!("{}", ip)
}

Ok(())
}
137 changes: 122 additions & 15 deletions internal-dns/src/resolver.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ use crate::DNS_ZONE;
use omicron_common::address::{
Ipv6Subnet, ReservedRackSubnet, AZ_PREFIX, DNS_PORT,
};
use slog::{debug, info};
use slog::{debug, info, trace};
use std::net::{IpAddr, Ipv6Addr, SocketAddr, SocketAddrV6};
use trust_dns_proto::rr::record_type::RecordType;
use trust_dns_resolver::config::{
Expand Down Expand Up @@ -70,6 +70,15 @@ impl Resolver {
Self::new_from_subnet(log, subnet)
}

/// Return a resolver that uses the system configuration (usually
/// /etc/resolv.conf) for the underlying nameservers.
pub fn new_with_resolver(
log: slog::Logger,
tokio_resolver: TokioAsyncResolver,
) -> Self {
Resolver { log, inner: Box::new(tokio_resolver) }
}

// TODO-correctness This function and its callers make assumptions about how
// many internal DNS servers there are on the subnet and where they are. Is
// that okay? It would seem more flexible not to assume this. Instead, we
Expand Down Expand Up @@ -121,6 +130,107 @@ impl Resolver {
Ok(*address)
}

/// Returns the targets of the SRV records for a DNS name
///
/// The returned values are generally other DNS names that themselves would
/// need to be looked up to find A/AAAA records.
pub async fn lookup_srv(
&self,
srv: crate::ServiceName,
) -> Result<Vec<String>, ResolveError> {
let name = format!("{}.{}", srv.dns_name(), DNS_ZONE);
trace!(self.log, "lookup_srv"; "dns_name" => &name);
let response = self.inner.srv_lookup(&name).await?;
debug!(
self.log,
"lookup_srv";
"dns_name" => &name,
"response" => ?response
);

Ok(response.into_iter().map(|srv| srv.target().to_string()).collect())
}

pub async fn lookup_all_ipv6(
&self,
srv: crate::ServiceName,
) -> Result<Vec<Ipv6Addr>, ResolveError> {
let name = format!("{}.{}", srv.dns_name(), DNS_ZONE);
trace!(self.log, "lookup_all_ipv6 srv"; "dns_name" => &name);
let response = self.inner.srv_lookup(&name).await?;
debug!(
self.log,
"lookup_ipv6 srv";
"dns_name" => &name,
"response" => ?response
);

// SRV records have a target, which is itself another DNS name that
// needs to be looked up in order to get to the actual IP addresses.
// Many DNS servers return these IP addresses directly in the response
// to the SRV query as Additional records. Ours does not. See
// omicron#3434. So we need to do another round of lookups separately.
//
// According to the docs` for
// `trust_dns_resolver::lookup::SrvLookup::ip_iter()`, it sounds like
// trust-dns would have done this for us. It doesn't. See
// bluejekyll/trust-dns#1980.
//
// So if we have gotten any IPs, then we assume that one of the above
// issues has been addressed and so we have all the IPs and we're done.
// Otherwise, explicitly do the extra lookups.
let addresses: Vec<Ipv6Addr> = response
.ip_iter()
.filter_map(|addr| match addr {
IpAddr::V4(_) => None,
IpAddr::V6(addr) => Some(addr),
})
.collect();
if !addresses.is_empty() {
return Ok(addresses);
}

// What do we do if some of these queries succeed while others fail? We
// may have some addresses, but the list might be incomplete. That
// might be okay for some use cases but not others. For now, we do the
// simple thing. In the future, we'll want a more cueball-like resolver
// interface that better deals with these cases.
let log = &self.log;
let futures = response.iter().map(|srv| async {
let target = srv.target();
trace!(
log,
"lookup_all_ipv6: looking up SRV target";
"name" => ?target,
);
self.inner.ipv6_lookup(target.clone()).await
});
let results = futures::future::try_join_all(futures).await?;
let results = results
.into_iter()
.flat_map(|ipv6| ipv6.into_iter())
.collect::<Vec<_>>();
if results.is_empty() {
Err(ResolveError::NotFound(srv))
} else {
Ok(results)
}
}

pub async fn lookup_ip(
&self,
srv: crate::ServiceName,
) -> Result<IpAddr, ResolveError> {
let name = format!("{}.{}", srv.dns_name(), DNS_ZONE);
debug!(self.log, "lookup srv"; "dns_name" => &name);
let response = self.inner.lookup_ip(&name).await?;
let address = response
.iter()
.next()
.ok_or_else(|| ResolveError::NotFound(srv))?;
Ok(address)
}

/// Looks up a single [`SocketAddrV6`] based on the SRV name
/// Returns an error if the record does not exist.
pub async fn lookup_socket_v6(
Expand Down Expand Up @@ -156,20 +266,6 @@ impl Resolver {
}
})
}

pub async fn lookup_ip(
&self,
srv: crate::ServiceName,
) -> Result<IpAddr, ResolveError> {
let name = format!("{}.{}", srv.dns_name(), DNS_ZONE);
debug!(self.log, "lookup srv"; "dns_name" => &name);
let response = self.inner.lookup_ip(&name).await?;
let address = response
.iter()
.next()
.ok_or_else(|| ResolveError::NotFound(srv))?;
Ok(address)
}
}

#[cfg(test)]
Expand Down Expand Up @@ -427,6 +523,17 @@ mod test {
.expect("Should have been able to look up IP address");
assert!(cockroach_addrs.iter().any(|addr| addr.ip() == &ip));

// Look up all the Cockroach addresses.
let mut ips =
resolver.lookup_all_ipv6(ServiceName::Cockroach).await.expect(
"Should have been able to look up all CockroachDB addresses",
);
ips.sort();
assert_eq!(
ips,
cockroach_addrs.iter().map(|s| *s.ip()).collect::<Vec<_>>()
);

// Look up Clickhouse
let ip = resolver
.lookup_ipv6(ServiceName::Clickhouse)
Expand Down
17 changes: 17 additions & 0 deletions openapi/sled-agent.json
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,23 @@
"version": "0.0.1"
},
"paths": {
"/cockroachdb": {
"post": {
"summary": "Initializes a CockroachDB cluster",
"operationId": "cockroachdb_init",
"responses": {
"204": {
"description": "resource updated"
},
"4XX": {
"$ref": "#/components/responses/Error"
},
"5XX": {
"$ref": "#/components/responses/Error"
}
}
}
},
"/disks/{disk_id}": {
"put": {
"operationId": "disk_put",
Expand Down
Loading

0 comments on commit b884c76

Please sign in to comment.