diff --git a/compiler/rustc_codegen_llvm/src/debuginfo/metadata.rs b/compiler/rustc_codegen_llvm/src/debuginfo/metadata.rs index 166454d3ae74c..fa67a1b331011 100644 --- a/compiler/rustc_codegen_llvm/src/debuginfo/metadata.rs +++ b/compiler/rustc_codegen_llvm/src/debuginfo/metadata.rs @@ -1385,7 +1385,7 @@ fn vcall_visibility_metadata<'ll, 'tcx>( let trait_def_id = trait_ref_self.def_id(); let trait_vis = cx.tcx.visibility(trait_def_id); - let cgus = cx.sess().codegen_units(); + let cgus = cx.sess().codegen_units().as_usize(); let single_cgu = cgus == 1; let lto = cx.sess().lto(); diff --git a/compiler/rustc_codegen_ssa/src/back/write.rs b/compiler/rustc_codegen_ssa/src/back/write.rs index 9be69e560e888..109e9959aeac8 100644 --- a/compiler/rustc_codegen_ssa/src/back/write.rs +++ b/compiler/rustc_codegen_ssa/src/back/write.rs @@ -646,10 +646,10 @@ fn produce_final_output_artifacts( // rlib. let needs_crate_object = crate_output.outputs.contains_key(&OutputType::Exe); - let keep_numbered_bitcode = user_wants_bitcode && sess.codegen_units() > 1; + let keep_numbered_bitcode = user_wants_bitcode && sess.codegen_units().as_usize() > 1; let keep_numbered_objects = - needs_crate_object || (user_wants_objects && sess.codegen_units() > 1); + needs_crate_object || (user_wants_objects && sess.codegen_units().as_usize() > 1); for module in compiled_modules.modules.iter() { if let Some(ref path) = module.object { @@ -1923,7 +1923,7 @@ impl OngoingCodegen { // FIXME: time_llvm_passes support - does this use a global context or // something? 
- if sess.codegen_units() == 1 && sess.opts.unstable_opts.time_llvm_passes { + if sess.codegen_units().as_usize() == 1 && sess.opts.unstable_opts.time_llvm_passes { self.backend.print_pass_timings() } diff --git a/compiler/rustc_monomorphize/src/partitioning.rs b/compiler/rustc_monomorphize/src/partitioning.rs index 1d9c8ded349c0..ebcc3b0399973 100644 --- a/compiler/rustc_monomorphize/src/partitioning.rs +++ b/compiler/rustc_monomorphize/src/partitioning.rs @@ -113,6 +113,7 @@ use rustc_middle::query::Providers; use rustc_middle::ty::print::{characteristic_def_id_of_type, with_no_trimmed_paths}; use rustc_middle::ty::{self, visit::TypeVisitableExt, InstanceDef, TyCtxt}; use rustc_session::config::{DumpMonoStatsFormat, SwitchWithOptPath}; +use rustc_session::CodegenUnits; use rustc_span::symbol::Symbol; use crate::collector::UsageMap; @@ -121,7 +122,6 @@ use crate::errors::{CouldntDumpMonoStats, SymbolAlreadyDefined, UnknownCguCollec struct PartitioningCx<'a, 'tcx> { tcx: TyCtxt<'tcx>, - target_cgu_count: usize, usage_map: &'a UsageMap<'tcx>, } @@ -130,13 +130,17 @@ struct PlacedRootMonoItems<'tcx> { codegen_units: Vec<CodegenUnit<'tcx>>, internalization_candidates: FxHashSet<MonoItem<'tcx>>, + + /// These must be obtained when the iterator in `partition` runs. They + /// can't be obtained later because some inlined functions might not be + /// reachable. + unique_inlined_stats: (usize, usize), } // The output CGUs are sorted by name. fn partition<'tcx, I>( tcx: TyCtxt<'tcx>, mono_items: I, - max_cgu_count: usize, usage_map: &UsageMap<'tcx>, ) -> Vec<CodegenUnit<'tcx>> where @@ -144,12 +148,12 @@ where { let _prof_timer = tcx.prof.generic_activity("cgu_partitioning"); - let cx = &PartitioningCx { tcx, target_cgu_count: max_cgu_count, usage_map }; + let cx = &PartitioningCx { tcx, usage_map }; // In the first step, we place all regular monomorphizations into their // respective 'home' codegen unit. Regular monomorphizations are all // functions and statics defined in the local crate. 
- let PlacedRootMonoItems { mut codegen_units, internalization_candidates } = { + let PlacedRootMonoItems { mut codegen_units, internalization_candidates, unique_inlined_stats } = { let _prof_timer = tcx.prof.generic_activity("cgu_partitioning_place_roots"); place_root_mono_items(cx, mono_items) }; @@ -158,7 +162,7 @@ where cgu.create_size_estimate(tcx); } - debug_dump(tcx, "INITIAL PARTITIONING", &codegen_units); + debug_dump(tcx, "ROOTS", &codegen_units, unique_inlined_stats); // Merge until we have at most `max_cgu_count` codegen units. // `merge_codegen_units` is responsible for updating the CGU size @@ -166,7 +170,7 @@ where { let _prof_timer = tcx.prof.generic_activity("cgu_partitioning_merge_cgus"); merge_codegen_units(cx, &mut codegen_units); - debug_dump(tcx, "POST MERGING", &codegen_units); + debug_dump(tcx, "MERGE", &codegen_units, unique_inlined_stats); } // In the next step, we use the inlining map to determine which additional @@ -182,7 +186,7 @@ where cgu.create_size_estimate(tcx); } - debug_dump(tcx, "POST INLINING", &codegen_units); + debug_dump(tcx, "INLINE", &codegen_units, unique_inlined_stats); // Next we try to make as many symbols "internal" as possible, so LLVM has // more freedom to optimize. @@ -226,7 +230,7 @@ where // Ensure CGUs are sorted by name, so that we get deterministic results. assert!(codegen_units.is_sorted_by(|a, b| Some(a.name().as_str().cmp(b.name().as_str())))); - debug_dump(tcx, "FINAL", &codegen_units); + debug_dump(tcx, "FINAL", &codegen_units, unique_inlined_stats); codegen_units } @@ -252,10 +256,16 @@ where let cgu_name_builder = &mut CodegenUnitNameBuilder::new(cx.tcx); let cgu_name_cache = &mut FxHashMap::default(); + let mut num_unique_inlined_items = 0; + let mut unique_inlined_items_size = 0; for mono_item in mono_items { match mono_item.instantiation_mode(cx.tcx) { InstantiationMode::GloballyShared { .. 
} => {} - InstantiationMode::LocalCopy => continue, + InstantiationMode::LocalCopy => { + num_unique_inlined_items += 1; + unique_inlined_items_size += mono_item.size_estimate(cx.tcx); + continue; + } } let characteristic_def_id = characteristic_def_id_of_mono_item(cx.tcx, mono_item); @@ -300,7 +310,11 @@ where let mut codegen_units: Vec<_> = codegen_units.into_values().collect(); codegen_units.sort_by(|a, b| a.name().as_str().cmp(b.name().as_str())); - PlacedRootMonoItems { codegen_units, internalization_candidates } + PlacedRootMonoItems { + codegen_units, + internalization_candidates, + unique_inlined_stats: (num_unique_inlined_items, unique_inlined_items_size), + } } // This function requires the CGUs to be sorted by name on input, and ensures @@ -309,7 +323,7 @@ fn merge_codegen_units<'tcx>( cx: &PartitioningCx<'_, 'tcx>, codegen_units: &mut Vec<CodegenUnit<'tcx>>, ) { - assert!(cx.target_cgu_count >= 1); + assert!(cx.tcx.sess.codegen_units().as_usize() >= 1); // A sorted order here ensures merging is deterministic. assert!(codegen_units.is_sorted_by(|a, b| Some(a.name().as_str().cmp(b.name().as_str())))); @@ -318,11 +332,32 @@ fn merge_codegen_units<'tcx>( let mut cgu_contents: FxHashMap<Symbol, Vec<Symbol>> = codegen_units.iter().map(|cgu| (cgu.name(), vec![cgu.name()])).collect(); - // Merge the two smallest codegen units until the target size is - // reached. - while codegen_units.len() > cx.target_cgu_count { - // Sort small cgus to the back + // Having multiple CGUs can drastically speed up compilation. But for + // non-incremental builds, tiny CGUs slow down compilation *and* result in + // worse generated code. So we don't allow CGUs smaller than this (unless + // there is just one CGU, of course). Note that CGU sizes of 100,000+ are + // common in larger programs, so this isn't all that large. 
+ const NON_INCR_MIN_CGU_SIZE: usize = 1000; + + // Repeatedly merge the two smallest codegen units as long as: + // - we have more CGUs than the upper limit, or + // - (Non-incremental builds only) the user didn't specify a CGU count, and + // there are multiple CGUs, and some are below the minimum size. + // + // The "didn't specify a CGU count" condition is because when an explicit + // count is requested we observe it as closely as possible. For example, + // the `compiler_builtins` crate sets `codegen-units = 10000` and it's + // critical they aren't merged. Also, some tests use explicit small values + // and likewise won't work if small CGUs are merged. + while codegen_units.len() > cx.tcx.sess.codegen_units().as_usize() + || (cx.tcx.sess.opts.incremental.is_none() + && matches!(cx.tcx.sess.codegen_units(), CodegenUnits::Default(_)) + && codegen_units.len() > 1 + && codegen_units.iter().any(|cgu| cgu.size_estimate() < NON_INCR_MIN_CGU_SIZE)) + { + // Sort small cgus to the back. codegen_units.sort_by_cached_key(|cgu| cmp::Reverse(cgu.size_estimate())); + let mut smallest = codegen_units.pop().unwrap(); let second_smallest = codegen_units.last_mut().unwrap(); @@ -814,47 +849,147 @@ fn default_visibility(tcx: TyCtxt<'_>, id: DefId, is_generic: bool) -> Visibilit } } -fn debug_dump<'a, 'tcx: 'a>(tcx: TyCtxt<'tcx>, label: &str, cgus: &[CodegenUnit<'tcx>]) { +fn debug_dump<'a, 'tcx: 'a>( + tcx: TyCtxt<'tcx>, + label: &str, + cgus: &[CodegenUnit<'tcx>], + (unique_inlined_items, unique_inlined_size): (usize, usize), +) { let dump = move || { use std::fmt::Write; - let num_cgus = cgus.len(); - let num_items: usize = cgus.iter().map(|cgu| cgu.items().len()).sum(); - let total_size: usize = cgus.iter().map(|cgu| cgu.size_estimate()).sum(); - let max_size = cgus.iter().map(|cgu| cgu.size_estimate()).max().unwrap(); - let min_size = cgus.iter().map(|cgu| cgu.size_estimate()).min().unwrap(); - let max_min_size_ratio = max_size as f64 / min_size as f64; + let mut num_cgus 
= 0; + let mut all_cgu_sizes = Vec::new(); + + // Note: every unique root item is placed exactly once, so the number + // of unique root items always equals the number of placed root items. + + let mut root_items = 0; + // unique_inlined_items is passed in above. + let mut placed_inlined_items = 0; + + let mut root_size = 0; + // unique_inlined_size is passed in above. + let mut placed_inlined_size = 0; + + for cgu in cgus.iter() { + num_cgus += 1; + all_cgu_sizes.push(cgu.size_estimate()); + + for (item, _) in cgu.items() { + match item.instantiation_mode(tcx) { + InstantiationMode::GloballyShared { .. } => { + root_items += 1; + root_size += item.size_estimate(tcx); + } + InstantiationMode::LocalCopy => { + placed_inlined_items += 1; + placed_inlined_size += item.size_estimate(tcx); + } + } + } + } + + all_cgu_sizes.sort_unstable_by_key(|&n| cmp::Reverse(n)); + + let unique_items = root_items + unique_inlined_items; + let placed_items = root_items + placed_inlined_items; + let items_ratio = placed_items as f64 / unique_items as f64; + + let unique_size = root_size + unique_inlined_size; + let placed_size = root_size + placed_inlined_size; + let size_ratio = placed_size as f64 / unique_size as f64; + + let mean_cgu_size = placed_size as f64 / num_cgus as f64; + + assert_eq!(placed_size, all_cgu_sizes.iter().sum::<usize>()); let s = &mut String::new(); + let _ = writeln!(s, "{label}"); let _ = writeln!( s, - "{label} ({num_items} items, total_size={total_size}; {num_cgus} CGUs, \ - max_size={max_size}, min_size={min_size}, max_size/min_size={max_min_size_ratio:.1}):" + "- unique items: {unique_items} ({root_items} root + {unique_inlined_items} inlined), \ + unique size: {unique_size} ({root_size} root + {unique_inlined_size} inlined)\n\ + - placed items: {placed_items} ({root_items} root + {placed_inlined_items} inlined), \ + placed size: {placed_size} ({root_size} root + {placed_inlined_size} inlined)\n\ + - placed/unique items ratio: {items_ratio:.2}, \ + placed/unique 
size ratio: {size_ratio:.2}\n\ + - CGUs: {num_cgus}, mean size: {mean_cgu_size:.1}, sizes: {}", + list(&all_cgu_sizes), ); + let _ = writeln!(s); + for (i, cgu) in cgus.iter().enumerate() { + let name = cgu.name(); + let size = cgu.size_estimate(); let num_items = cgu.items().len(); - let _ = writeln!( - s, - "- CGU[{i}] {} ({num_items} items, size={}):", - cgu.name(), - cgu.size_estimate() - ); + let mean_size = size as f64 / num_items as f64; + + let mut placed_item_sizes: Vec<_> = + cgu.items().iter().map(|(item, _)| item.size_estimate(tcx)).collect(); + placed_item_sizes.sort_unstable_by_key(|&n| cmp::Reverse(n)); + let sizes = list(&placed_item_sizes); + + let _ = writeln!(s, "- CGU[{i}]"); + let _ = writeln!(s, " - {name}, size: {size}"); + let _ = + writeln!(s, " - items: {num_items}, mean size: {mean_size:.1}, sizes: {sizes}",); for (item, linkage) in cgu.items_in_deterministic_order(tcx) { let symbol_name = item.symbol_name(tcx).name; let symbol_hash_start = symbol_name.rfind('h'); let symbol_hash = symbol_hash_start.map_or("", |i| &symbol_name[i..]); let size = item.size_estimate(tcx); + let kind = match item.instantiation_mode(tcx) { + InstantiationMode::GloballyShared { .. } => "root", + InstantiationMode::LocalCopy => "inlined", + }; let _ = with_no_trimmed_paths!(writeln!( s, - " - {item} [{linkage:?}] [{symbol_hash}] (size={size})" + " - {item} [{linkage:?}] [{symbol_hash}] ({kind}, size: {size})" )); } let _ = writeln!(s); } - std::mem::take(s) + return std::mem::take(s); + + // Converts a slice to a string, capturing repetitions to save space. + // E.g. `[4, 4, 4, 3, 2, 1, 1, 1, 1, 1]` -> "[4 (x3), 3, 2, 1 (x5)]". 
+ fn list(ns: &[usize]) -> String { + let mut v = Vec::new(); + if ns.is_empty() { + return "[]".to_string(); + } + + let mut elem = |curr, curr_count| { + if curr_count == 1 { + v.push(format!("{curr}")); + } else { + v.push(format!("{curr} (x{curr_count})")); + } + }; + + let mut curr = ns[0]; + let mut curr_count = 1; + + for &n in &ns[1..] { + if n != curr { + elem(curr, curr_count); + curr = n; + curr_count = 1; + } else { + curr_count += 1; + } + } + elem(curr, curr_count); + + let mut s = "[".to_string(); + s.push_str(&v.join(", ")); + s.push_str("]"); + s + } }; debug!("{}", dump()); @@ -922,8 +1057,7 @@ fn collect_and_partition_mono_items(tcx: TyCtxt<'_>, (): ()) -> (&DefIdSet, &[Co let (codegen_units, _) = tcx.sess.time("partition_and_assert_distinct_symbols", || { sync::join( || { - let mut codegen_units = - partition(tcx, items.iter().copied(), tcx.sess.codegen_units(), &usage_map); + let mut codegen_units = partition(tcx, items.iter().copied(), &usage_map); codegen_units[0].make_primary(); &*tcx.arena.alloc_from_iter(codegen_units) }, diff --git a/compiler/rustc_session/src/session.rs b/compiler/rustc_session/src/session.rs index 2cc02003218ee..5feea83edb6a3 100644 --- a/compiler/rustc_session/src/session.rs +++ b/compiler/rustc_session/src/session.rs @@ -234,6 +234,27 @@ pub enum MetadataKind { Compressed, } +#[derive(Clone, Copy)] +pub enum CodegenUnits { + /// Specified by the user. In this case we try fairly hard to produce the + /// number of CGUs requested. + User(usize), + + /// A default value, i.e. not specified by the user. In this case we take + /// more liberties about CGU formation, e.g. avoid producing very small + /// CGUs. 
+ Default(usize), +} + +impl CodegenUnits { + pub fn as_usize(self) -> usize { + match self { + CodegenUnits::User(n) => n, + CodegenUnits::Default(n) => n, + } + } +} + impl Session { pub fn miri_unleashed_feature(&self, span: Span, feature_gate: Option<Symbol>) { self.miri_unleashed_features.lock().push((span, feature_gate)); @@ -1104,7 +1125,7 @@ impl Session { // If there's only one codegen unit and LTO isn't enabled then there's // no need for ThinLTO so just return false. - if self.codegen_units() == 1 { + if self.codegen_units().as_usize() == 1 { return config::Lto::No; } @@ -1206,19 +1227,19 @@ impl Session { /// Returns the number of codegen units that should be used for this /// compilation - pub fn codegen_units(&self) -> usize { + pub fn codegen_units(&self) -> CodegenUnits { if let Some(n) = self.opts.cli_forced_codegen_units { - return n; + return CodegenUnits::User(n); } if let Some(n) = self.target.default_codegen_units { - return n as usize; + return CodegenUnits::Default(n as usize); } // If incremental compilation is turned on, we default to a high number // codegen units in order to reduce the "collateral damage" small // changes cause. if self.opts.incremental.is_some() { - return 256; + return CodegenUnits::Default(256); } // Why is 16 codegen units the default all the time? @@ -1271,7 +1292,7 @@ impl Session { // As a result 16 was chosen here! Mostly because it was a power of 2 // and most benchmarks agreed it was roughly a local optimum. Not very // scientific. - 16 + CodegenUnits::Default(16) } pub fn teach(&self, code: &DiagnosticId) -> bool { diff --git a/src/doc/rustc/src/codegen-options/index.md b/src/doc/rustc/src/codegen-options/index.md index 1041d5026690f..8de638dde4fbf 100644 --- a/src/doc/rustc/src/codegen-options/index.md +++ b/src/doc/rustc/src/codegen-options/index.md @@ -31,8 +31,8 @@ Supported values can also be discovered by running `rustc --print code-models`. 
## codegen-units -This flag controls how many code generation units the crate is split into. It -takes an integer greater than 0. +This flag controls the maximum number of code generation units the crate is +split into. It takes an integer greater than 0. When a crate is split into multiple codegen units, LLVM is able to process them in parallel. Increasing parallelism may speed up compile times, but may