Skip to content

Commit

Permalink
Add next-item query based on a join (#6551)
Browse files Browse the repository at this point in the history
- Adds the `NextItemSelfJoined` query, which implements the next-item
query as a self-join. The current implementation joins the target table
with the series of all possible values for the target item. Because CRDB
eagerly evaluates subqueries, this entire list must be buffered in
memory. The self-join version instead joins the existing table with the
literal next item (item + 1), and finds the lowest one that's not
allocated yet. Some initial testing suggests this can consume much less
memory and run much faster than the previous query implementations. As a
tradeoff, this new query cannot be used to randomly select _any_ item in
a range -- only the next, lowest value can be selected. Note that this
changes the way we allocate MAC addresses, which were previously random.
- Fixes #5904
  • Loading branch information
bnaecker authored Sep 13, 2024
1 parent a7dbc36 commit 8be99b0
Show file tree
Hide file tree
Showing 2 changed files with 629 additions and 144 deletions.
202 changes: 58 additions & 144 deletions nexus/db-queries/src/db/queries/network_interface.rs
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ use crate::db::error::{public_error_from_diesel, retryable, ErrorHandler};
use crate::db::model::IncompleteNetworkInterface;
use crate::db::pool::DbConnection;
use crate::db::queries::next_item::DefaultShiftGenerator;
use crate::db::queries::next_item::NextItem;
use crate::db::queries::next_item::{NextItem, NextItemSelfJoined};
use crate::db::schema::network_interface::dsl;
use async_bb8_diesel::AsyncRunQueryDsl;
use chrono::DateTime;
Expand All @@ -33,7 +33,7 @@ use omicron_common::api::external;
use omicron_common::api::external::MacAddr;
use once_cell::sync::Lazy;
use slog_error_chain::SlogInlineError;
use std::net::IpAddr;
use std::net::{IpAddr, Ipv6Addr};
use uuid::Uuid;

// These are sentinel values and other constants used to verify the state of the
Expand Down Expand Up @@ -448,36 +448,6 @@ fn decode_database_error(
}
}

// Helper to return the offset of the last valid/allocatable IP in a subnet.
// Note that this is the offset from the _first available address_, not the
// network address.
fn last_address_offset(subnet: &IpNetwork) -> u32 {
// Generate last address in the range.
//
// NOTE: First subtraction is to convert from the subnet size to an
// offset, since `generate_series` is inclusive of the last value.
// Example: 256 -> 255.
let last_address_offset = match subnet {
IpNetwork::V4(network) => network.size() - 1,
IpNetwork::V6(network) => {
// TODO-robustness: IPv6 subnets are always /64s, so in theory we
// could require searching all ~2^64 items for the next address.
// That won't happen in practice, because there will be other limits
// on the number of IPs (such as MAC addresses, or just project
// accounting limits). However, we should update this to be the
// actual maximum size we expect or want to support, once we get a
// better sense of what that is.
u32::try_from(network.size() - 1).unwrap_or(u32::MAX - 1)
}
};

// This subtraction is because the last address in a subnet is
// explicitly reserved for Oxide use.
last_address_offset
.checked_sub(1 + NUM_INITIAL_RESERVED_IP_ADDRESSES as u32)
.unwrap_or_else(|| panic!("Unexpectedly small IP subnet: '{}'", subnet))
}

// Return the first available address in a subnet. This is not the network
// address, since Oxide reserves the first few addresses.
fn first_available_address(subnet: &IpNetwork) -> IpAddr {
Expand All @@ -489,12 +459,9 @@ fn first_available_address(subnet: &IpNetwork) -> IpAddr {
})
.into(),
IpNetwork::V6(network) => {
// TODO-performance: This is unfortunate. `ipnetwork` implements a
// direct addition-based approach for IPv4 but not IPv6. This will
// loop, which, while it may not matter much, can be nearly
// trivially avoided by converting to u128, adding, and converting
// back. Given that these spaces can be _really_ big, that is
// probably worth doing.
// NOTE: This call to `nth()` will loop and call the `next()`
// implementation. That's inefficient, but the number of reserved
// addresses is very small, so it should not matter.
network
.iter()
.nth(NUM_INITIAL_RESERVED_IP_ADDRESSES as _)
Expand All @@ -506,11 +473,41 @@ fn first_available_address(subnet: &IpNetwork) -> IpAddr {
}
}

// Return the last available address in a subnet. This is not the broadcast
// address, since that is reserved.
fn last_available_address(subnet: &IpNetwork) -> IpAddr {
// NOTE: In both cases below, we subtract 2 from the network size. That's
// because we first subtract 1 to go from a size to an index, and then
// another 1 because the broadcast address isn't valid for an interface.
match subnet {
IpNetwork::V4(network) => network
.size()
.checked_sub(2)
.and_then(|n| network.nth(n))
.map(IpAddr::V4)
.unwrap_or_else(|| {
panic!("Unexpectedly small IPv4 subnetwork: '{}'", network);
}),
IpNetwork::V6(network) => {
// NOTE: The iterator implementation for `Ipv6Network` only
// implements the required `Iterator::next()` method. That means we
// get the default implementation of the `nth()` method, which will
// loop and call `next()`. That is ridiculously inefficient, so we
// manually compute the nth address through addition instead.
let base = u128::from(network.network());
let n = network.size().checked_sub(2).unwrap_or_else(|| {
panic!("Unexpectedly small IPv6 subnetwork: '{}'", network);
});
IpAddr::V6(Ipv6Addr::from(base + n))
}
}
}

/// The `NextIpv4Address` query is a `NextItem` query for choosing the next
/// available IPv4 address for an interface.
#[derive(Debug, Clone, Copy)]
pub struct NextIpv4Address {
inner: NextItem<
inner: NextItemSelfJoined<
db::schema::network_interface::table,
IpNetwork,
db::schema::network_interface::dsl::ip,
Expand All @@ -522,11 +519,9 @@ pub struct NextIpv4Address {
impl NextIpv4Address {
pub fn new(subnet: Ipv4Network, subnet_id: Uuid) -> Self {
let subnet = IpNetwork::from(subnet);
let net = IpNetwork::from(first_available_address(&subnet));
let max_shift = i64::from(last_address_offset(&subnet));
let generator = DefaultShiftGenerator::new(net, max_shift, 0)
.expect("invalid min/max shift");
Self { inner: NextItem::new_scoped(generator, subnet_id) }
let min = IpNetwork::from(first_available_address(&subnet));
let max = IpNetwork::from(last_available_address(&subnet));
Self { inner: NextItemSelfJoined::new_scoped(subnet_id, min, max) }
}
}

Expand Down Expand Up @@ -607,7 +602,7 @@ impl QueryFragment<Pg> for NextNicSlot {
/// a network interface.
#[derive(Debug, Clone, Copy)]
pub struct NextMacAddress {
inner: NextItem<
inner: NextItemSelfJoined<
db::schema::network_interface::table,
db::model::MacAddr,
db::schema::network_interface::dsl::mac,
Expand All @@ -616,63 +611,19 @@ pub struct NextMacAddress {
>,
}

// Helper to ensure we correctly compute the min/max shifts for a next MAC
// query.
#[derive(Copy, Clone, Debug)]
struct NextMacShifts {
base: MacAddr,
min_shift: i64,
max_shift: i64,
}

impl NextMacShifts {
fn for_guest() -> Self {
let base = MacAddr::random_guest();
Self::shifts_for(base, MacAddr::MIN_GUEST_ADDR, MacAddr::MAX_GUEST_ADDR)
}

fn for_system() -> NextMacShifts {
let base = MacAddr::random_system();
Self::shifts_for(
base,
MacAddr::MIN_SYSTEM_ADDR,
MacAddr::MAX_SYSTEM_ADDR,
)
}

fn shifts_for(base: MacAddr, min: i64, max: i64) -> NextMacShifts {
let x = base.to_i64();

// The max shift is the distance to the last value. This min shift is
// always expressed as a negative number, giving the largest leftward
// shift, i.e., the distance to the first value.
let max_shift = max - x;
let min_shift = min - x;
Self { base, min_shift, max_shift }
}
}

impl NextMacAddress {
pub fn new(vpc_id: Uuid, kind: NetworkInterfaceKind) -> Self {
let (base, max_shift, min_shift) = match kind {
let (min, max) = match kind {
NetworkInterfaceKind::Instance | NetworkInterfaceKind::Probe => {
let NextMacShifts { base, min_shift, max_shift } =
NextMacShifts::for_guest();
(base.into(), max_shift, min_shift)
(MacAddr::MIN_GUEST_ADDR, MacAddr::MAX_GUEST_ADDR)
}
NetworkInterfaceKind::Service => {
let NextMacShifts { base, min_shift, max_shift } =
NextMacShifts::for_system();
(base.into(), max_shift, min_shift)
(MacAddr::MIN_SYSTEM_ADDR, MacAddr::MAX_SYSTEM_ADDR)
}
};
let generator = DefaultShiftGenerator::new(base, max_shift, min_shift)
.unwrap_or_else(|| {
panic!(
"invalid min shift ({min_shift}) or max_shift ({max_shift})"
)
});
Self { inner: NextItem::new_scoped(generator, vpc_id) }
let min = db::model::MacAddr(MacAddr::from_i64(min));
let max = db::model::MacAddr(MacAddr::from_i64(max));
Self { inner: NextItemSelfJoined::new_scoped(vpc_id, min, max) }
}
}

Expand Down Expand Up @@ -1840,7 +1791,6 @@ fn decode_delete_network_interface_database_error(
#[cfg(test)]
mod tests {
use super::first_available_address;
use super::last_address_offset;
use super::DeleteError;
use super::InsertError;
use super::MAX_NICS_PER_INSTANCE;
Expand All @@ -1857,7 +1807,7 @@ mod tests {
use crate::db::model::NetworkInterface;
use crate::db::model::Project;
use crate::db::model::VpcSubnet;
use crate::db::queries::network_interface::NextMacShifts;
use crate::db::queries::network_interface::last_available_address;
use async_bb8_diesel::AsyncRunQueryDsl;
use dropshot::test_util::LogContext;
use model::NetworkInterfaceKind;
Expand Down Expand Up @@ -2821,7 +2771,8 @@ mod tests {
.await;
assert!(
matches!(result, Err(InsertError::NoAvailableIpAddresses)),
"Address exhaustion should be detected and handled"
"Address exhaustion should be detected and handled, found {:?}",
result,
);
context.success().await;
}
Expand Down Expand Up @@ -2963,24 +2914,6 @@ mod tests {
context.success().await;
}

#[test]
fn test_last_address_offset() {
let subnet = "172.30.0.0/28".parse().unwrap();
assert_eq!(
last_address_offset(&subnet),
// /28 = 2 ** 4 = 16 total addresses
// ... - 1 for converting from size to index = 15
// ... - 1 for reserved broadcast address = 14
// ... - 5 for reserved initial addresses = 9
9,
);
let subnet = "fd00::/64".parse().unwrap();
assert_eq!(
last_address_offset(&subnet),
u32::MAX - 1 - 1 - super::NUM_INITIAL_RESERVED_IP_ADDRESSES as u32,
);
}

#[test]
fn test_first_available_address() {
let subnet = "172.30.0.0/28".parse().unwrap();
Expand All @@ -2996,35 +2929,16 @@ mod tests {
}

#[test]
fn test_next_mac_shifts_for_system() {
let NextMacShifts { base, min_shift, max_shift } =
NextMacShifts::for_system();
assert!(base.is_system());
assert!(
min_shift <= 0,
"expected min shift to be negative, found {min_shift}"
);
assert!(max_shift >= 0, "found {max_shift}");
let x = base.to_i64();
assert_eq!(x + min_shift, MacAddr::MIN_SYSTEM_ADDR);
assert_eq!(x + max_shift, MacAddr::MAX_SYSTEM_ADDR);
}

#[test]
fn test_next_mac_shifts_for_guest() {
let NextMacShifts { base, min_shift, max_shift } =
NextMacShifts::for_guest();
assert!(base.is_guest());
assert!(
min_shift <= 0,
"expected min shift to be negative, found {min_shift}"
fn test_last_available_address() {
let subnet = "172.30.0.0/28".parse().unwrap();
assert_eq!(
last_available_address(&subnet),
"172.30.0.14".parse::<IpAddr>().unwrap(),
);
assert!(
max_shift >= 0,
"expected max shift to be positive, found {max_shift}"
let subnet = "fd00::/64".parse().unwrap();
assert_eq!(
last_available_address(&subnet),
"fd00::ffff:ffff:ffff:fffe".parse::<IpAddr>().unwrap(),
);
let x = base.to_i64();
assert_eq!(x + min_shift, MacAddr::MIN_GUEST_ADDR);
assert_eq!(x + max_shift, MacAddr::MAX_GUEST_ADDR);
}
}
Loading

0 comments on commit 8be99b0

Please sign in to comment.