diff --git a/compiler/rustc_codegen_llvm/src/debuginfo/metadata.rs b/compiler/rustc_codegen_llvm/src/debuginfo/metadata.rs
index 166454d3ae74c..fa67a1b331011 100644
--- a/compiler/rustc_codegen_llvm/src/debuginfo/metadata.rs
+++ b/compiler/rustc_codegen_llvm/src/debuginfo/metadata.rs
@@ -1385,7 +1385,7 @@ fn vcall_visibility_metadata<'ll, 'tcx>(
     let trait_def_id = trait_ref_self.def_id();
     let trait_vis = cx.tcx.visibility(trait_def_id);
 
-    let cgus = cx.sess().codegen_units();
+    let cgus = cx.sess().codegen_units().as_usize();
     let single_cgu = cgus == 1;
 
     let lto = cx.sess().lto();
diff --git a/compiler/rustc_codegen_ssa/src/back/write.rs b/compiler/rustc_codegen_ssa/src/back/write.rs
index 9be69e560e888..109e9959aeac8 100644
--- a/compiler/rustc_codegen_ssa/src/back/write.rs
+++ b/compiler/rustc_codegen_ssa/src/back/write.rs
@@ -646,10 +646,10 @@ fn produce_final_output_artifacts(
         // rlib.
         let needs_crate_object = crate_output.outputs.contains_key(&OutputType::Exe);
 
-        let keep_numbered_bitcode = user_wants_bitcode && sess.codegen_units() > 1;
+        let keep_numbered_bitcode = user_wants_bitcode && sess.codegen_units().as_usize() > 1;
 
         let keep_numbered_objects =
-            needs_crate_object || (user_wants_objects && sess.codegen_units() > 1);
+            needs_crate_object || (user_wants_objects && sess.codegen_units().as_usize() > 1);
 
         for module in compiled_modules.modules.iter() {
             if let Some(ref path) = module.object {
@@ -1923,7 +1923,7 @@ impl<B: ExtraBackendMethods> OngoingCodegen<B> {
 
         // FIXME: time_llvm_passes support - does this use a global context or
         // something?
-        if sess.codegen_units() == 1 && sess.opts.unstable_opts.time_llvm_passes {
+        if sess.codegen_units().as_usize() == 1 && sess.opts.unstable_opts.time_llvm_passes {
             self.backend.print_pass_timings()
         }
 
diff --git a/compiler/rustc_monomorphize/src/partitioning.rs b/compiler/rustc_monomorphize/src/partitioning.rs
index 1d9c8ded349c0..ebcc3b0399973 100644
--- a/compiler/rustc_monomorphize/src/partitioning.rs
+++ b/compiler/rustc_monomorphize/src/partitioning.rs
@@ -113,6 +113,7 @@ use rustc_middle::query::Providers;
 use rustc_middle::ty::print::{characteristic_def_id_of_type, with_no_trimmed_paths};
 use rustc_middle::ty::{self, visit::TypeVisitableExt, InstanceDef, TyCtxt};
 use rustc_session::config::{DumpMonoStatsFormat, SwitchWithOptPath};
+use rustc_session::CodegenUnits;
 use rustc_span::symbol::Symbol;
 
 use crate::collector::UsageMap;
@@ -121,7 +122,6 @@ use crate::errors::{CouldntDumpMonoStats, SymbolAlreadyDefined, UnknownCguCollec
 
 struct PartitioningCx<'a, 'tcx> {
     tcx: TyCtxt<'tcx>,
-    target_cgu_count: usize,
     usage_map: &'a UsageMap<'tcx>,
 }
 
@@ -130,13 +130,17 @@ struct PlacedRootMonoItems<'tcx> {
     codegen_units: Vec<CodegenUnit<'tcx>>,
 
     internalization_candidates: FxHashSet<MonoItem<'tcx>>,
+
+    /// `(count, total size)` of the unique inlined (`LocalCopy`) mono items.
+    /// These must be obtained when the iterator in `partition` runs. They can't
+    /// be obtained later because some inlined functions might not be reachable.
+    unique_inlined_stats: (usize, usize),
 }
 
 // The output CGUs are sorted by name.
 fn partition<'tcx, I>(
     tcx: TyCtxt<'tcx>,
     mono_items: I,
-    max_cgu_count: usize,
     usage_map: &UsageMap<'tcx>,
 ) -> Vec<CodegenUnit<'tcx>>
 where
@@ -144,12 +148,12 @@ where
 {
     let _prof_timer = tcx.prof.generic_activity("cgu_partitioning");
 
-    let cx = &PartitioningCx { tcx, target_cgu_count: max_cgu_count, usage_map };
+    let cx = &PartitioningCx { tcx, usage_map };
 
     // In the first step, we place all regular monomorphizations into their
     // respective 'home' codegen unit. Regular monomorphizations are all
     // functions and statics defined in the local crate.
-    let PlacedRootMonoItems { mut codegen_units, internalization_candidates } = {
+    let PlacedRootMonoItems { mut codegen_units, internalization_candidates, unique_inlined_stats } = {
         let _prof_timer = tcx.prof.generic_activity("cgu_partitioning_place_roots");
         place_root_mono_items(cx, mono_items)
     };
@@ -158,7 +162,7 @@ where
         cgu.create_size_estimate(tcx);
     }
 
-    debug_dump(tcx, "INITIAL PARTITIONING", &codegen_units);
+    debug_dump(tcx, "ROOTS", &codegen_units, unique_inlined_stats);
 
-    // Merge until we have at most `max_cgu_count` codegen units.
+    // Merge until we are within the CGU limit from `tcx.sess.codegen_units()`.
     // `merge_codegen_units` is responsible for updating the CGU size
@@ -166,7 +170,7 @@ where
     {
         let _prof_timer = tcx.prof.generic_activity("cgu_partitioning_merge_cgus");
         merge_codegen_units(cx, &mut codegen_units);
-        debug_dump(tcx, "POST MERGING", &codegen_units);
+        debug_dump(tcx, "MERGE", &codegen_units, unique_inlined_stats);
     }
 
     // In the next step, we use the inlining map to determine which additional
@@ -182,7 +186,7 @@ where
         cgu.create_size_estimate(tcx);
     }
 
-    debug_dump(tcx, "POST INLINING", &codegen_units);
+    debug_dump(tcx, "INLINE", &codegen_units, unique_inlined_stats);
 
     // Next we try to make as many symbols "internal" as possible, so LLVM has
     // more freedom to optimize.
@@ -226,7 +230,7 @@ where
     // Ensure CGUs are sorted by name, so that we get deterministic results.
     assert!(codegen_units.is_sorted_by(|a, b| Some(a.name().as_str().cmp(b.name().as_str()))));
 
-    debug_dump(tcx, "FINAL", &codegen_units);
+    debug_dump(tcx, "FINAL", &codegen_units, unique_inlined_stats);
 
     codegen_units
 }
@@ -252,10 +256,16 @@ where
     let cgu_name_builder = &mut CodegenUnitNameBuilder::new(cx.tcx);
     let cgu_name_cache = &mut FxHashMap::default();
 
+    let mut num_unique_inlined_items = 0;
+    let mut unique_inlined_items_size = 0;
     for mono_item in mono_items {
         match mono_item.instantiation_mode(cx.tcx) {
             InstantiationMode::GloballyShared { .. } => {}
-            InstantiationMode::LocalCopy => continue,
+            InstantiationMode::LocalCopy => {
+                num_unique_inlined_items += 1;
+                unique_inlined_items_size += mono_item.size_estimate(cx.tcx);
+                continue;
+            }
         }
 
         let characteristic_def_id = characteristic_def_id_of_mono_item(cx.tcx, mono_item);
@@ -300,7 +310,11 @@ where
     let mut codegen_units: Vec<_> = codegen_units.into_values().collect();
     codegen_units.sort_by(|a, b| a.name().as_str().cmp(b.name().as_str()));
 
-    PlacedRootMonoItems { codegen_units, internalization_candidates }
+    PlacedRootMonoItems {
+        codegen_units,
+        internalization_candidates,
+        unique_inlined_stats: (num_unique_inlined_items, unique_inlined_items_size),
+    }
 }
 
 // This function requires the CGUs to be sorted by name on input, and ensures
@@ -309,7 +323,7 @@ fn merge_codegen_units<'tcx>(
     cx: &PartitioningCx<'_, 'tcx>,
     codegen_units: &mut Vec<CodegenUnit<'tcx>>,
 ) {
-    assert!(cx.target_cgu_count >= 1);
+    assert!(cx.tcx.sess.codegen_units().as_usize() >= 1);
 
     // A sorted order here ensures merging is deterministic.
     assert!(codegen_units.is_sorted_by(|a, b| Some(a.name().as_str().cmp(b.name().as_str()))));
@@ -318,11 +332,32 @@ fn merge_codegen_units<'tcx>(
     let mut cgu_contents: FxHashMap<Symbol, Vec<Symbol>> =
         codegen_units.iter().map(|cgu| (cgu.name(), vec![cgu.name()])).collect();
 
-    // Merge the two smallest codegen units until the target size is
-    // reached.
-    while codegen_units.len() > cx.target_cgu_count {
-        // Sort small cgus to the back
+    // Having multiple CGUs can drastically speed up compilation. But for
+    // non-incremental builds, tiny CGUs slow down compilation *and* result in
+    // worse generated code. So we don't allow CGUs smaller than this (unless
+    // there is just one CGU, of course). Note that CGU sizes of 100,000+ are
+    // common in larger programs, so this isn't all that large.
+    const NON_INCR_MIN_CGU_SIZE: usize = 1000;
+
+    // Repeatedly merge the two smallest codegen units as long as:
+    // - we have more CGUs than the upper limit, or
+    // - (Non-incremental builds only) the user didn't specify a CGU count, and
+    //   there are multiple CGUs, and some are below the minimum size.
+    //
+    // The "didn't specify a CGU count" condition is because when an explicit
+    // count is requested we observe it as closely as possible. For example,
+    // the `compiler_builtins` crate sets `codegen-units = 10000` and it's
+    // critical they aren't merged. Also, some tests use explicit small values
+    // and likewise won't work if small CGUs are merged.
+    while codegen_units.len() > cx.tcx.sess.codegen_units().as_usize()
+        || (cx.tcx.sess.opts.incremental.is_none()
+            && matches!(cx.tcx.sess.codegen_units(), CodegenUnits::Default(_))
+            && codegen_units.len() > 1
+            && codegen_units.iter().any(|cgu| cgu.size_estimate() < NON_INCR_MIN_CGU_SIZE))
+    {
+        // Sort small cgus to the back.
         codegen_units.sort_by_cached_key(|cgu| cmp::Reverse(cgu.size_estimate()));
+
         let mut smallest = codegen_units.pop().unwrap();
         let second_smallest = codegen_units.last_mut().unwrap();
 
@@ -814,47 +849,147 @@ fn default_visibility(tcx: TyCtxt<'_>, id: DefId, is_generic: bool) -> Visibilit
     }
 }
 
-fn debug_dump<'a, 'tcx: 'a>(tcx: TyCtxt<'tcx>, label: &str, cgus: &[CodegenUnit<'tcx>]) {
+fn debug_dump<'a, 'tcx: 'a>(
+    tcx: TyCtxt<'tcx>,
+    label: &str,
+    cgus: &[CodegenUnit<'tcx>],
+    (unique_inlined_items, unique_inlined_size): (usize, usize),
+) {
     let dump = move || {
         use std::fmt::Write;
 
-        let num_cgus = cgus.len();
-        let num_items: usize = cgus.iter().map(|cgu| cgu.items().len()).sum();
-        let total_size: usize = cgus.iter().map(|cgu| cgu.size_estimate()).sum();
-        let max_size = cgus.iter().map(|cgu| cgu.size_estimate()).max().unwrap();
-        let min_size = cgus.iter().map(|cgu| cgu.size_estimate()).min().unwrap();
-        let max_min_size_ratio = max_size as f64 / min_size as f64;
+        let mut num_cgus = 0;
+        let mut all_cgu_sizes = Vec::new();
+
+        // Note: every unique root item is placed exactly once, so the number
+        // of unique root items always equals the number of placed root items.
+
+        let mut root_items = 0;
+        // unique_inlined_items is passed in above.
+        let mut placed_inlined_items = 0;
+
+        let mut root_size = 0;
+        // unique_inlined_size is passed in above.
+        let mut placed_inlined_size = 0;
+
+        for cgu in cgus.iter() {
+            num_cgus += 1;
+            all_cgu_sizes.push(cgu.size_estimate());
+
+            for (item, _) in cgu.items() {
+                match item.instantiation_mode(tcx) {
+                    InstantiationMode::GloballyShared { .. } => {
+                        root_items += 1;
+                        root_size += item.size_estimate(tcx);
+                    }
+                    InstantiationMode::LocalCopy => {
+                        placed_inlined_items += 1;
+                        placed_inlined_size += item.size_estimate(tcx);
+                    }
+                }
+            }
+        }
+
+        all_cgu_sizes.sort_unstable_by_key(|&n| cmp::Reverse(n));
+
+        let unique_items = root_items + unique_inlined_items;
+        let placed_items = root_items + placed_inlined_items;
+        let items_ratio = placed_items as f64 / unique_items as f64;
+
+        let unique_size = root_size + unique_inlined_size;
+        let placed_size = root_size + placed_inlined_size;
+        let size_ratio = placed_size as f64 / unique_size as f64;
+
+        let mean_cgu_size = placed_size as f64 / num_cgus as f64;
+
+        assert_eq!(placed_size, all_cgu_sizes.iter().sum::<usize>());
 
         let s = &mut String::new();
+        let _ = writeln!(s, "{label}");
         let _ = writeln!(
             s,
-            "{label} ({num_items} items, total_size={total_size}; {num_cgus} CGUs, \
-             max_size={max_size}, min_size={min_size}, max_size/min_size={max_min_size_ratio:.1}):"
+            "- unique items: {unique_items} ({root_items} root + {unique_inlined_items} inlined), \
+               unique size: {unique_size} ({root_size} root + {unique_inlined_size} inlined)\n\
+             - placed items: {placed_items} ({root_items} root + {placed_inlined_items} inlined), \
+               placed size: {placed_size} ({root_size} root + {placed_inlined_size} inlined)\n\
+             - placed/unique items ratio: {items_ratio:.2}, \
+               placed/unique size ratio: {size_ratio:.2}\n\
+             - CGUs: {num_cgus}, mean size: {mean_cgu_size:.1}, sizes: {}",
+            list(&all_cgu_sizes),
         );
+        let _ = writeln!(s);
+
         for (i, cgu) in cgus.iter().enumerate() {
+            let name = cgu.name();
+            let size = cgu.size_estimate();
             let num_items = cgu.items().len();
-            let _ = writeln!(
-                s,
-                "- CGU[{i}] {} ({num_items} items, size={}):",
-                cgu.name(),
-                cgu.size_estimate()
-            );
+            let mean_size = size as f64 / num_items as f64;
+
+            let mut placed_item_sizes: Vec<_> =
+                cgu.items().iter().map(|(item, _)| item.size_estimate(tcx)).collect();
+            placed_item_sizes.sort_unstable_by_key(|&n| cmp::Reverse(n));
+            let sizes = list(&placed_item_sizes);
+
+            let _ = writeln!(s, "- CGU[{i}]");
+            let _ = writeln!(s, "  - {name}, size: {size}");
+            let _ =
+                writeln!(s, "  - items: {num_items}, mean size: {mean_size:.1}, sizes: {sizes}",);
 
             for (item, linkage) in cgu.items_in_deterministic_order(tcx) {
                 let symbol_name = item.symbol_name(tcx).name;
                 let symbol_hash_start = symbol_name.rfind('h');
                 let symbol_hash = symbol_hash_start.map_or("<no hash>", |i| &symbol_name[i..]);
                 let size = item.size_estimate(tcx);
+                let kind = match item.instantiation_mode(tcx) {
+                    InstantiationMode::GloballyShared { .. } => "root",
+                    InstantiationMode::LocalCopy => "inlined",
+                };
                 let _ = with_no_trimmed_paths!(writeln!(
                     s,
-                    "  - {item} [{linkage:?}] [{symbol_hash}] (size={size})"
+                    "  - {item} [{linkage:?}] [{symbol_hash}] ({kind}, size: {size})"
                 ));
             }
 
             let _ = writeln!(s);
         }
 
-        std::mem::take(s)
+        return std::mem::take(s);
+
+        // Renders `ns` as a bracketed, comma-separated list, collapsing each run
+        // of equal adjacent values into `value (xN)` to save space.
+        // E.g. `[4, 4, 4, 3, 2, 1, 1, 1, 1, 1]` -> "[4 (x3), 3, 2, 1 (x5)]".
+        fn list(ns: &[usize]) -> String {
+            // Formats one run: the bare value, or `value (xN)` if repeated.
+            fn run(value: usize, count: usize) -> String {
+                if count == 1 {
+                    value.to_string()
+                } else {
+                    format!("{value} (x{count})")
+                }
+            }
+
+            // An empty slice has no first run to seed the scan with.
+            let Some((&first, rest)) = ns.split_first() else {
+                return "[]".to_string();
+            };
+
+            let mut runs = Vec::new();
+            let mut curr = first;
+            let mut curr_count = 1;
+            for &n in rest {
+                if n == curr {
+                    curr_count += 1;
+                } else {
+                    runs.push(run(curr, curr_count));
+                    curr = n;
+                    curr_count = 1;
+                }
+            }
+            // Flush the final run, which the loop never closes.
+            runs.push(run(curr, curr_count));
+
+            format!("[{}]", runs.join(", "))
+        }
     };
 
     debug!("{}", dump());
@@ -922,8 +1057,7 @@ fn collect_and_partition_mono_items(tcx: TyCtxt<'_>, (): ()) -> (&DefIdSet, &[Co
     let (codegen_units, _) = tcx.sess.time("partition_and_assert_distinct_symbols", || {
         sync::join(
             || {
-                let mut codegen_units =
-                    partition(tcx, items.iter().copied(), tcx.sess.codegen_units(), &usage_map);
+                let mut codegen_units = partition(tcx, items.iter().copied(), &usage_map);
                 codegen_units[0].make_primary();
                 &*tcx.arena.alloc_from_iter(codegen_units)
             },
diff --git a/compiler/rustc_session/src/session.rs b/compiler/rustc_session/src/session.rs
index 2cc02003218ee..5feea83edb6a3 100644
--- a/compiler/rustc_session/src/session.rs
+++ b/compiler/rustc_session/src/session.rs
@@ -234,6 +234,27 @@ pub enum MetadataKind {
     Compressed,
 }
 
+/// A codegen unit count, tagged with whether the user specified it.
+#[derive(Clone, Copy)]
+pub enum CodegenUnits {
+    /// Specified by the user. Here we try fairly hard to produce the requested
+    /// number of CGUs.
+    User(usize),
+
+    /// A default, i.e. not specified by the user. Here we take more liberties
+    /// about CGU formation, e.g. avoiding very small CGUs.
+    Default(usize),
+}
+
+impl CodegenUnits {
+    /// Returns the CGU count, discarding whether it was user-specified.
+    pub fn as_usize(self) -> usize {
+        match self {
+            CodegenUnits::User(n) | CodegenUnits::Default(n) => n,
+        }
+    }
+}
+
 impl Session {
     pub fn miri_unleashed_feature(&self, span: Span, feature_gate: Option<Symbol>) {
         self.miri_unleashed_features.lock().push((span, feature_gate));
@@ -1104,7 +1125,7 @@ impl Session {
 
         // If there's only one codegen unit and LTO isn't enabled then there's
         // no need for ThinLTO so just return false.
-        if self.codegen_units() == 1 {
+        if self.codegen_units().as_usize() == 1 {
             return config::Lto::No;
         }
 
@@ -1206,19 +1227,19 @@ impl Session {
 
     /// Returns the number of codegen units that should be used for this
     /// compilation
-    pub fn codegen_units(&self) -> usize {
+    pub fn codegen_units(&self) -> CodegenUnits {
         if let Some(n) = self.opts.cli_forced_codegen_units {
-            return n;
+            return CodegenUnits::User(n);
         }
         if let Some(n) = self.target.default_codegen_units {
-            return n as usize;
+            return CodegenUnits::Default(n as usize);
         }
 
         // If incremental compilation is turned on, we default to a high number
         // codegen units in order to reduce the "collateral damage" small
         // changes cause.
         if self.opts.incremental.is_some() {
-            return 256;
+            return CodegenUnits::Default(256);
         }
 
         // Why is 16 codegen units the default all the time?
@@ -1271,7 +1292,7 @@ impl Session {
         // As a result 16 was chosen here! Mostly because it was a power of 2
         // and most benchmarks agreed it was roughly a local optimum. Not very
         // scientific.
-        16
+        CodegenUnits::Default(16)
     }
 
     pub fn teach(&self, code: &DiagnosticId) -> bool {
diff --git a/src/doc/rustc/src/codegen-options/index.md b/src/doc/rustc/src/codegen-options/index.md
index 1041d5026690f..8de638dde4fbf 100644
--- a/src/doc/rustc/src/codegen-options/index.md
+++ b/src/doc/rustc/src/codegen-options/index.md
@@ -31,8 +31,8 @@ Supported values can also be discovered by running `rustc --print code-models`.
 
 ## codegen-units
 
-This flag controls how many code generation units the crate is split into. It
-takes an integer greater than 0.
+This flag controls the maximum number of code generation units the crate is
+split into. It takes an integer greater than 0.
 
 When a crate is split into multiple codegen units, LLVM is able to process
 them in parallel. Increasing parallelism may speed up compile times, but may