Skip to content

Commit

Permalink
Auto merge of rust-lang#129587 - Voultapher:opt-for-size-variants-of-…
Browse files Browse the repository at this point in the history
…sort-impls, r=cuviper

Add `optimize_for_size` variants for stable and unstable sort as well as select_nth_unstable

- Stable sort uses a simple merge-sort that re-uses the existing - rather gnarly - merge function.
- Unstable sort jumps directly to the branchless heapsort fallback.
- select_nth_unstable jumps directly to the median_of_medians fallback, which is augmented with a custom tiny smallsort and partition impl.

Some code is duplicated, but de-duplication would bring its own problems. For example, `swap_if_less` is critical for performance: if the sorting networks don't inline it, perf drops drastically. However, `#[inline(always)]` is also a poor fit: if the provided comparison function is huge, leaving the attribute off gives the compiler an out to only instantiate `swap_if_less` once and call it. Another aspect that would suffer when making `swap_if_less` pub is having to cfg out dozens of functions in the smallsort module.

Part of rust-lang#125612

r​? `@Kobzol`
  • Loading branch information
bors committed Sep 24, 2024
2 parents 194bbc7 + 5446229 commit 92d5cef
Show file tree
Hide file tree
Showing 8 changed files with 202 additions and 59 deletions.
11 changes: 10 additions & 1 deletion core/src/slice/sort/select.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
//! better performance than one would get using heapsort as fallback.
use crate::mem::{self, SizedTypeProperties};
#[cfg(not(feature = "optimize_for_size"))]
use crate::slice::sort::shared::pivot::choose_pivot;
use crate::slice::sort::shared::smallsort::insertion_sort_shift_left;
use crate::slice::sort::unstable::quicksort::partition;
Expand Down Expand Up @@ -40,7 +41,13 @@ where
let min_idx = min_index(v, &mut is_less).unwrap();
v.swap(min_idx, index);
} else {
partition_at_index_loop(v, index, None, &mut is_less);
cfg_if! {
if #[cfg(feature = "optimize_for_size")] {
median_of_medians(v, &mut is_less, index);
} else {
partition_at_index_loop(v, index, None, &mut is_less);
}
}
}

let (left, right) = v.split_at_mut(index);
Expand All @@ -53,6 +60,7 @@ where
// most once, it doesn't make sense to use something more sophisticated than insertion-sort.
const INSERTION_SORT_THRESHOLD: usize = 16;

#[cfg(not(feature = "optimize_for_size"))]
fn partition_at_index_loop<'a, T, F>(
mut v: &'a mut [T],
mut index: usize,
Expand Down Expand Up @@ -169,6 +177,7 @@ fn median_of_medians<T, F: FnMut(&T, &T) -> bool>(mut v: &mut [T], is_less: &mut
if v.len() >= 2 {
insertion_sort_shift_left(v, 1, is_less);
}

return;
}

Expand Down
2 changes: 2 additions & 0 deletions core/src/slice/sort/shared/mod.rs
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
#![cfg_attr(feature = "optimize_for_size", allow(dead_code))]

use crate::marker::Freeze;

pub(crate) mod pivot;
Expand Down
7 changes: 6 additions & 1 deletion core/src/slice/sort/shared/smallsort.rs
Original file line number Diff line number Diff line change
Expand Up @@ -378,7 +378,12 @@ where

/// Swap two values in the slice pointed to by `v_base` at the position `a_pos` and `b_pos` if the
/// value at position `b_pos` is less than the one at position `a_pos`.
pub unsafe fn swap_if_less<T, F>(v_base: *mut T, a_pos: usize, b_pos: usize, is_less: &mut F)
///
/// Purposefully not marked `#[inline]`, despite us wanting it to be inlined for integer-like
/// types. `is_less` could be a huge function and we want to give the compiler an option to
/// not inline this function. Because this function is very perf critical, it should live in
/// the same module as the functions that use it.
unsafe fn swap_if_less<T, F>(v_base: *mut T, a_pos: usize, b_pos: usize, is_less: &mut F)
where
F: FnMut(&T, &T) -> bool,
{
Expand Down
65 changes: 51 additions & 14 deletions core/src/slice/sort/stable/mod.rs
Original file line number Diff line number Diff line change
@@ -1,15 +1,24 @@
//! This module contains the entry points for `slice::sort`.
#[cfg(not(feature = "optimize_for_size"))]
use crate::cmp;
use crate::intrinsics;
use crate::mem::{self, MaybeUninit, SizedTypeProperties};
#[cfg(not(feature = "optimize_for_size"))]
use crate::slice::sort::shared::smallsort::{
SMALL_SORT_GENERAL_SCRATCH_LEN, StableSmallSortTypeImpl, insertion_sort_shift_left,
};
use crate::{cmp, intrinsics};

pub(crate) mod drift;
pub(crate) mod merge;

#[cfg(not(feature = "optimize_for_size"))]
pub(crate) mod drift;
#[cfg(not(feature = "optimize_for_size"))]
pub(crate) mod quicksort;

#[cfg(feature = "optimize_for_size")]
pub(crate) mod tiny;

/// Stable sort called driftsort by Orson Peters and Lukas Bergdoll.
/// Design document:
/// <https://github.com/Voultapher/sort-research-rs/blob/main/writeup/driftsort_introduction/text.md>
Expand All @@ -30,25 +39,53 @@ pub fn sort<T, F: FnMut(&T, &T) -> bool, BufT: BufGuard<T>>(v: &mut [T], is_less
return;
}

// More advanced sorting methods than insertion sort are faster if called in
// a hot loop for small inputs, but for general-purpose code the small
// binary size of insertion sort is more important. The instruction cache in
// modern processors is very valuable, and for a single sort call in general
// purpose code any gains from an advanced method are cancelled by i-cache
// misses during the sort, and thrashing the i-cache for surrounding code.
const MAX_LEN_ALWAYS_INSERTION_SORT: usize = 20;
if intrinsics::likely(len <= MAX_LEN_ALWAYS_INSERTION_SORT) {
insertion_sort_shift_left(v, 1, is_less);
return;
}
cfg_if! {
if #[cfg(any(feature = "optimize_for_size", target_pointer_width = "16"))] {
let alloc_len = len / 2;

cfg_if! {
if #[cfg(target_pointer_width = "16")] {
let heap_buf = BufT::with_capacity(alloc_len);
let scratch = heap_buf.as_uninit_slice_mut();
} else {
// For small inputs 4KiB of stack storage suffices, which allows us to avoid
// calling the (de-)allocator. Benchmarks showed this was quite beneficial.
let mut stack_buf = AlignedStorage::<T, 4096>::new();
let stack_scratch = stack_buf.as_uninit_slice_mut();
let mut heap_buf;
let scratch = if stack_scratch.len() >= alloc_len {
stack_scratch
} else {
heap_buf = BufT::with_capacity(alloc_len);
heap_buf.as_uninit_slice_mut()
};
}
}

driftsort_main::<T, F, BufT>(v, is_less);
tiny::mergesort(v, scratch, is_less);
} else {
// More advanced sorting methods than insertion sort are faster if called in
// a hot loop for small inputs, but for general-purpose code the small
// binary size of insertion sort is more important. The instruction cache in
// modern processors is very valuable, and for a single sort call in general
// purpose code any gains from an advanced method are cancelled by i-cache
// misses during the sort, and thrashing the i-cache for surrounding code.
const MAX_LEN_ALWAYS_INSERTION_SORT: usize = 20;
if intrinsics::likely(len <= MAX_LEN_ALWAYS_INSERTION_SORT) {
insertion_sort_shift_left(v, 1, is_less);
return;
}

driftsort_main::<T, F, BufT>(v, is_less);
}
}
}

/// See [`sort`]
///
/// Deliberately don't inline the main sorting routine entrypoint to ensure the
/// inlined insertion sort i-cache footprint remains minimal.
#[cfg(not(feature = "optimize_for_size"))]
#[inline(never)]
fn driftsort_main<T, F: FnMut(&T, &T) -> bool, BufT: BufGuard<T>>(v: &mut [T], is_less: &mut F) {
// By allocating n elements of memory we can ensure the entire input can
Expand Down
41 changes: 41 additions & 0 deletions core/src/slice/sort/stable/tiny.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
//! Binary-size optimized mergesort inspired by <https://github.com/voultapher/tiny-sort-rs>.
use crate::mem::MaybeUninit;
use crate::ptr;
use crate::slice::sort::stable::merge;

/// Minimal recursive top-down merge sort whose goal is a small binary footprint.
///
/// There is no run detection and no adaptiveness of any kind: slices longer than two elements
/// are split at `len / 2`, each half is sorted recursively, and the halves are combined with
/// [`merge::merge`]. `scratch` is forwarded unchanged to every recursive call and to the merge
/// step. Slices of length zero or one are left untouched.
#[inline(always)]
pub fn mergesort<T, F: FnMut(&T, &T) -> bool>(
    v: &mut [T],
    scratch: &mut [MaybeUninit<T>],
    is_less: &mut F,
) {
    match v.len() {
        // Nothing to order.
        0 | 1 => {}
        2 => {
            // SAFETY: The slice holds exactly two elements, so offsets 0 and 1 are in-bounds and
            // name two distinct, non-overlapping elements.
            unsafe {
                let first = v.as_mut_ptr();
                let second = first.add(1);

                // Only swap on strictly-less so equal elements keep their relative order.
                if is_less(&*second, &*first) {
                    ptr::swap_nonoverlapping(first, second, 1);
                }
            }
        }
        len => {
            let mid = len / 2;

            // SAFETY: `mid` is `len / 2` and therefore never exceeds `len`, which keeps both
            // sub-ranges in-bounds.
            unsafe {
                // Left half first, then right half.
                mergesort(v.get_unchecked_mut(..mid), scratch, is_less);
                mergesort(v.get_unchecked_mut(mid..), scratch, is_less);
            }

            merge::merge(v, scratch, mid, is_less);
        }
    }
}
40 changes: 19 additions & 21 deletions core/src/slice/sort/unstable/heapsort.rs
Original file line number Diff line number Diff line change
@@ -1,46 +1,46 @@
//! This module contains a branchless heapsort as fallback for unstable quicksort.
use crate::{intrinsics, ptr};
use crate::{cmp, intrinsics, ptr};

/// Sorts `v` using heapsort, which guarantees *O*(*n* \* log(*n*)) worst-case.
///
/// Never inline this, it sits in the main hot-loop in `recurse` and is meant as an unlikely
/// algorithmic fallback.
///
/// SAFETY: The caller has to guarantee that `v.len()` >= 2.
#[inline(never)]
pub(crate) unsafe fn heapsort<T, F>(v: &mut [T], is_less: &mut F)
pub(crate) fn heapsort<T, F>(v: &mut [T], is_less: &mut F)
where
F: FnMut(&T, &T) -> bool,
{
// SAFETY: See function safety.
unsafe {
intrinsics::assume(v.len() >= 2);

// Build the heap in linear time.
for i in (0..v.len() / 2).rev() {
sift_down(v, i, is_less);
}
let len = v.len();

// Pop maximal elements from the heap.
for i in (1..v.len()).rev() {
for i in (0..len + len / 2).rev() {
let sift_idx = if i >= len {
i - len
} else {
v.swap(0, i);
sift_down(&mut v[..i], 0, is_less);
0
};

// SAFETY: The above calculation ensures that `sift_idx` is either 0 or
// `(len..(len + (len / 2))) - len`, which simplifies to `0..(len / 2)`.
// This guarantees the required `sift_idx <= len`.
unsafe {
sift_down(&mut v[..cmp::min(i, len)], sift_idx, is_less);
}
}
}

// This binary heap respects the invariant `parent >= child`.
//
// SAFETY: The caller has to guarantee that node < `v.len()`.
#[inline(never)]
// SAFETY: The caller has to guarantee that `node <= v.len()`.
#[inline(always)]
unsafe fn sift_down<T, F>(v: &mut [T], mut node: usize, is_less: &mut F)
where
F: FnMut(&T, &T) -> bool,
{
// SAFETY: See function safety.
unsafe {
intrinsics::assume(node < v.len());
intrinsics::assume(node <= v.len());
}

let len = v.len();
Expand Down Expand Up @@ -69,9 +69,7 @@ where
break;
}

// Swap `node` with the greater child, move one step down, and continue sifting. This
// could be ptr::swap_nonoverlapping but that adds a significant amount of binary-size.
ptr::swap(v_base.add(node), v_base.add(child));
ptr::swap_nonoverlapping(v_base.add(node), v_base.add(child), 1);
}

node = child;
Expand Down
33 changes: 21 additions & 12 deletions core/src/slice/sort/unstable/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,9 @@
use crate::intrinsics;
use crate::mem::SizedTypeProperties;
#[cfg(not(feature = "optimize_for_size"))]
use crate::slice::sort::shared::find_existing_run;
#[cfg(not(feature = "optimize_for_size"))]
use crate::slice::sort::shared::smallsort::insertion_sort_shift_left;

pub(crate) mod heapsort;
Expand All @@ -28,25 +30,32 @@ pub fn sort<T, F: FnMut(&T, &T) -> bool>(v: &mut [T], is_less: &mut F) {
return;
}

// More advanced sorting methods than insertion sort are faster if called in
// a hot loop for small inputs, but for general-purpose code the small
// binary size of insertion sort is more important. The instruction cache in
// modern processors is very valuable, and for a single sort call in general
// purpose code any gains from an advanced method are cancelled by i-cache
// misses during the sort, and thrashing the i-cache for surrounding code.
const MAX_LEN_ALWAYS_INSERTION_SORT: usize = 20;
if intrinsics::likely(len <= MAX_LEN_ALWAYS_INSERTION_SORT) {
insertion_sort_shift_left(v, 1, is_less);
return;
}
cfg_if! {
if #[cfg(any(feature = "optimize_for_size", target_pointer_width = "16"))] {
heapsort::heapsort(v, is_less);
} else {
// More advanced sorting methods than insertion sort are faster if called in
// a hot loop for small inputs, but for general-purpose code the small
// binary size of insertion sort is more important. The instruction cache in
// modern processors is very valuable, and for a single sort call in general
// purpose code any gains from an advanced method are cancelled by i-cache
// misses during the sort, and thrashing the i-cache for surrounding code.
const MAX_LEN_ALWAYS_INSERTION_SORT: usize = 20;
if intrinsics::likely(len <= MAX_LEN_ALWAYS_INSERTION_SORT) {
insertion_sort_shift_left(v, 1, is_less);
return;
}

ipnsort(v, is_less);
ipnsort(v, is_less);
}
}
}

/// See [`sort`]
///
/// Deliberately don't inline the main sorting routine entrypoint to ensure the
/// inlined insertion sort i-cache footprint remains minimal.
#[cfg(not(feature = "optimize_for_size"))]
#[inline(never)]
fn ipnsort<T, F>(v: &mut [T], is_less: &mut F)
where
Expand Down
Loading

0 comments on commit 92d5cef

Please sign in to comment.