From 0a80c56eb92f7abb92a985dc0688a443a0c0f62e Mon Sep 17 00:00:00 2001 From: Tomasz Andrzejak Date: Wed, 14 Jun 2023 15:26:38 +0200 Subject: [PATCH 1/4] Optimize frame stats collection Optimize UI performance by calculating frame collection statistics dynamically. Rather than repeatedly traversing each frame, we now update the stats upon adding or removing frames. This change mitigates the significant overhead caused by constantly accessing frame data protected by RwLock. --- puffin/src/lib.rs | 2 +- puffin/src/profile_view.rs | 150 +++++++++++++++++++++++++++++++++---- puffin_egui/src/lib.rs | 53 +++++++------ 3 files changed, 170 insertions(+), 35 deletions(-) diff --git a/puffin/src/lib.rs b/puffin/src/lib.rs index dafd0dfd..ef6638c8 100644 --- a/puffin/src/lib.rs +++ b/puffin/src/lib.rs @@ -109,7 +109,7 @@ mod profile_view; pub use data::*; pub use frame_data::{FrameData, FrameMeta, UnpackedFrameData}; pub use merge::*; -pub use profile_view::{select_slowest, FrameView, GlobalFrameView}; +pub use profile_view::{select_slowest, FrameStats, FrameView, GlobalFrameView}; use std::collections::BTreeMap; use std::sync::{ diff --git a/puffin/src/profile_view.rs b/puffin/src/profile_view.rs index 16f84bb9..db32e4db 100644 --- a/puffin/src/profile_view.rs +++ b/puffin/src/profile_view.rs @@ -1,6 +1,7 @@ +use std::collections::HashSet; use std::sync::{Arc, Mutex}; -use crate::{FrameData, FrameSinkId}; +use crate::{FrameData, FrameIndex, FrameSinkId}; /// A view of recent and slowest frames, used by GUIs. #[derive(Clone)] @@ -16,12 +17,16 @@ pub struct FrameView { /// /// Only recommended if you set a large max_recent size. pack_frames: bool, + + /// Maintain stats as we add/remove frames + stats: FrameStats, } impl Default for FrameView { fn default() -> Self { let max_recent = 60 * 60 * 5; let max_slow = 256; + let stats = Default::default(); Self { recent: std::collections::VecDeque::with_capacity(max_recent), @@ -29,6 +34,7 @@ impl Default for FrameView { slowest: std::collections::BinaryHeap::with_capacity(max_slow), max_slow, pack_frames: true, + stats, } } } @@ -46,6 +52,7 @@ impl FrameView { // The safe choice is to clear everything: self.recent.clear(); self.slowest.clear(); + self.stats.clear(); } } @@ -57,24 +64,54 @@ impl FrameView { false }; - if add_to_slowest { - self.slowest.push(OrderedByDuration(new_frame.clone())); - while self.slowest.len() > self.max_slow { - self.slowest.pop(); - } - } - if let Some(last) = self.recent.back() { // Assume there is a viewer viewing the newest frame, // and compress the previously newest frame to save RAM: if self.pack_frames { last.pack(); } + + self.stats.add(last); + } + + if add_to_slowest { + self.add_slow_frame(&new_frame); + } + + self.add_recent_frame(&new_frame); + } + + pub fn add_slow_frame(&mut self, new_frame: &Arc) { + self.slowest.push(OrderedByDuration(new_frame.clone())); + + while self.slowest.len() > self.max_slow { + if let Some(removed_frame) = self.slowest.pop() { + // Only remove from stats if the frame is not present in recent + if self + .recent + .binary_search_by_key(&removed_frame.0.frame_index(), |f| f.frame_index()) + .is_err() + { + self.stats.remove(&removed_frame.0); + } + } } + } + + pub fn add_recent_frame(&mut self, new_frame: &Arc) { + self.recent.push_back(new_frame.clone()); - self.recent.push_back(new_frame); while self.recent.len() > self.max_recent { - self.recent.pop_front(); + if let Some(removed_frame) = self.recent.pop_front() { + // Only remove from stats if the frame is not present in slowest + if !self + .slowest + .iter() + .any(|f| removed_frame.frame_index() == f.0.frame_index()) + { + self.stats.remove(&removed_frame); + } + } } } @@ -107,8 +144,13 @@ impl FrameView { /// All frames sorted chronologically. pub fn all_uniq(&self) -> Vec> { - let mut all: Vec<_> = self.slowest.iter().map(|f| f.0.clone()).collect(); - all.extend(self.recent.iter().cloned()); + let mut all: Vec<_> = self + .slowest + .iter() + .map(|f| f.0.clone()) + .chain(self.recent.iter().cloned()) + .collect(); + all.sort_by_key(|frame| frame.frame_index()); all.dedup_by_key(|frame| frame.frame_index()); all @@ -116,7 +158,9 @@ impl FrameView { /// Clean history of the slowest frames. pub fn clear_slowest(&mut self) { - self.slowest.clear(); + for frame in self.slowest.drain() { + self.stats.remove(&frame.0); + } } /// How many frames of recent history to store. @@ -147,6 +191,18 @@ impl FrameView { self.pack_frames = pack_frames; } + /// Retrieve statistics for added frames. This operation is efficient and suitable when + /// frames have not been manipulated outside of `ProfileView`, such as being unpacked. For + /// comprehensive statistics, refer to [`Self::stats_full()`] + pub fn stats(&self) -> &FrameStats { + &self.stats + } + + /// Retrieve detailed statistics by performing a full computation on all the added frames. + pub fn stats_full(&self) -> FrameStats { + FrameStats::from_frames(self.all_uniq().map(Arc::as_ref)) + } + /// Export profile data as a `.puffin` file. #[cfg(feature = "serialization")] #[cfg(not(target_arch = "wasm32"))] // compression not supported on wasm @@ -271,3 +327,71 @@ impl GlobalFrameView { self.view.lock().unwrap() } } + +// ---------------------------------------------------------------------------- + +/// Collect statistics for maintained frames +#[derive(Clone, Debug, Default)] +pub struct FrameStats { + unique_frames: HashSet, + total_ram_used: usize, + unpacked_frames: usize, +} + +impl FrameStats { + pub fn from_frames<'a>(frames: impl Iterator) -> Self { + let mut stats = FrameStats::default(); + + for frame in frames { + stats.add(frame); + } + + stats + } + + fn add(&mut self, frame: &FrameData) { + if self.unique_frames.insert(frame.frame_index()) { + self.total_ram_used = self + .total_ram_used + .saturating_add(frame.bytes_of_ram_used()); + + self.unpacked_frames = self + .unpacked_frames + .saturating_add(frame.has_unpacked() as usize); + } + } + + fn remove(&mut self, frame: &FrameData) { + if self.unique_frames.remove(&frame.frame_index()) { + self.total_ram_used = self + .total_ram_used + .saturating_sub(frame.bytes_of_ram_used()); + + self.unpacked_frames = self + .unpacked_frames + .saturating_sub(frame.has_unpacked() as usize); + } + } + + pub fn frames(&self) -> usize { + assert!(self.unique_frames.len() >= self.unpacked_frames); + self.unique_frames.len() + } + + pub fn unpacked_frames(&self) -> usize { + assert!(self.unique_frames.len() >= self.unpacked_frames); + self.unpacked_frames + } + + pub fn bytes_of_ram_used(&self) -> usize { + assert!(self.unique_frames.len() >= self.unpacked_frames); + self.total_ram_used + } + + pub fn clear(&mut self) { + self.unique_frames.clear(); + + self.unpacked_frames = 0; + self.total_ram_used = 0; + } +} diff --git a/puffin_egui/src/lib.rs b/puffin_egui/src/lib.rs index ee9b13fd..a48aa93d 100644 --- a/puffin_egui/src/lib.rs +++ b/puffin_egui/src/lib.rs @@ -185,6 +185,7 @@ impl GlobalProfilerUi { pub struct AvailableFrames { pub recent: Vec>, pub slowest: Vec>, + pub stats: FrameStats, } impl AvailableFrames { @@ -192,16 +193,26 @@ impl AvailableFrames { Self { recent: frame_view.recent_frames().cloned().collect(), slowest: frame_view.slowest_frames_chronological(), + stats: Default::default(), } } fn all_uniq(&self) -> Vec> { - let mut all = self.slowest.clone(); - all.extend(self.recent.iter().cloned()); + let mut all = self + .slowest + .iter() + .cloned() + .chain(self.recent.iter().cloned()) + .collect::>(); + all.sort_by_key(|frame| frame.frame_index()); all.dedup_by_key(|frame| frame.frame_index()); all } + + fn stats(&self) -> &FrameStats { + &self.stats + } } /// Multiple streams for one thread. @@ -390,8 +401,16 @@ impl ProfilerUi { /// The frames we can select between fn frames(&self, frame_view: &FrameView) -> AvailableFrames { self.paused.as_ref().map_or_else( - || AvailableFrames::latest(frame_view), - |paused| paused.frames.clone(), + || { + let mut frames = AvailableFrames::latest(frame_view); + frames.stats = frame_view.stats().clone(); + frames + }, + |paused| { + let mut frames = paused.frames.clone(); + frames.stats = frame_view.stats_full(); + frames + }, ) } @@ -631,21 +650,17 @@ impl ProfilerUi { { let uniq = frames.all_uniq(); - let mut bytes = 0; - let mut unpacked = 0; - for frame in &uniq { - bytes += frame.bytes_of_ram_used(); - unpacked += frame.has_unpacked() as usize; - } + let stats = frames.stats(); + ui.label(format!( "{} frames ({} unpacked) using approximately {:.1} MB.", - uniq.len(), - unpacked, - bytes as f64 * 1e-6 + stats.frames(), + stats.unpacked_frames(), + stats.bytes_of_ram_used() as f64 * 1e-6 )); if let Some(frame_view) = frame_view.as_mut() { - max_frames_ui(ui, frame_view); + max_frames_ui(ui, frame_view, &uniq); if self.paused.is_none() { max_num_latest_ui(ui, &mut self.max_num_latest); } @@ -835,13 +850,9 @@ fn format_time(nanos: NanoSecond) -> Option { } } -fn max_frames_ui(ui: &mut egui::Ui, frame_view: &mut FrameView) { - let uniq = frame_view.all_uniq(); - - let mut bytes = 0; - for frame in &uniq { - bytes += frame.bytes_of_ram_used(); - } +fn max_frames_ui(ui: &mut egui::Ui, frame_view: &mut FrameView, uniq: &Vec>) { + let stats = frame_view.stats(); + let bytes = stats.bytes_of_ram_used(); let frames_per_second = if let (Some(first), Some(last)) = (uniq.first(), uniq.last()) { let nanos = last.range_ns().1 - first.range_ns().0; From 2f8f16f45d0066775817fdd87a6b19bc111c9673 Mon Sep 17 00:00:00 2001 From: Tomasz Andrzejak Date: Mon, 19 Jun 2023 20:11:46 +0200 Subject: [PATCH 2/4] Optimize cloning/sorting/deduplication of frames This update modifies the data containers used for storing frames to adopt binary tree structures. As a result, all frames are sorted at the time of insertion, eliminating the need for subsequent sorting during vector collection. Moreover, instead of returning vectors, the binary tree structure enables iteration over elements in a sorted order, providing iterators rather than vectors. This functionality gives the API caller the flexibility to either clone frames or simply inspect them in an efficient manner. --- Cargo.lock | 1 + puffin-imgui/src/ui.rs | 11 +- puffin/Cargo.toml | 1 + puffin/benches/benchmark.rs | 9 ++ puffin/src/profile_view.rs | 212 ++++++++++++++++++++---------------- puffin_egui/src/lib.rs | 41 +++---- 6 files changed, 151 insertions(+), 124 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index b5d3195e..c9202a95 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2138,6 +2138,7 @@ dependencies = [ "cfg-if", "criterion", "instant", + "itertools", "js-sys", "lz4_flex", "once_cell", diff --git a/puffin-imgui/src/ui.rs b/puffin-imgui/src/ui.rs index 28fbf36f..99f2cc51 100644 --- a/puffin-imgui/src/ui.rs +++ b/puffin-imgui/src/ui.rs @@ -357,7 +357,7 @@ impl ProfilerUi { let view = self.frame_view.lock(); Frames { recent: view.recent_frames().cloned().collect(), - slowest: view.slowest_frames_chronological(), + slowest: view.slowest_frames_chronological().cloned().collect(), } } @@ -392,10 +392,17 @@ impl ProfilerUi { } fn all_known_frames(&self) -> Vec> { - let mut all = self.frame_view.lock().all_uniq(); + let mut all = self + .frame_view + .lock() + .all_uniq() + .cloned() + .collect::>(); + if let Some(paused) = &self.paused { all.append(&mut paused.frames.all_uniq()); } + all.sort_by_key(|frame| frame.frame_index()); all.dedup_by_key(|frame| frame.frame_index()); all diff --git a/puffin/Cargo.toml b/puffin/Cargo.toml index 9e1b0848..3376ccd9 100644 --- a/puffin/Cargo.toml +++ b/puffin/Cargo.toml @@ -38,6 +38,7 @@ web = ["instant/wasm-bindgen", "dep:js-sys"] byteorder = { version = "1.0" } cfg-if = "1.0" instant = { version = "0.1" } +itertools = "0.10" once_cell = "1.0" # Optional: diff --git a/puffin/benches/benchmark.rs b/puffin/benches/benchmark.rs index 33f20317..b5d8eeb7 100644 --- a/puffin/benches/benchmark.rs +++ b/puffin/benches/benchmark.rs @@ -31,6 +31,15 @@ pub fn criterion_benchmark(c: &mut Criterion) { puffin::profile_scope!("my longish scope name", "my_mesh.obj"); }) }); + c.bench_function("flush_frames", |b| { + puffin::GlobalProfiler::lock().new_frame(); + let _fv = puffin::GlobalFrameView::default(); + + b.iter(|| { + puffin::profile_function!(); + puffin::GlobalProfiler::lock().new_frame(); + }) + }); puffin::set_scopes_on(false); c.bench_function("profile_function_off", |b| { diff --git a/puffin/src/profile_view.rs b/puffin/src/profile_view.rs index db32e4db..211f6818 100644 --- a/puffin/src/profile_view.rs +++ b/puffin/src/profile_view.rs @@ -1,16 +1,20 @@ -use std::collections::HashSet; +use std::cmp::Ordering; + use std::sync::{Arc, Mutex}; -use crate::{FrameData, FrameIndex, FrameSinkId}; +use itertools::Itertools; + +use crate::{FrameData, FrameSinkId}; /// A view of recent and slowest frames, used by GUIs. #[derive(Clone)] pub struct FrameView { /// newest first - recent: std::collections::VecDeque>, + recent: std::collections::VecDeque, max_recent: usize, - slowest: std::collections::BinaryHeap, + slowest_by_index: std::collections::BTreeSet, + slowest_by_duration: std::collections::BTreeSet, max_slow: usize, /// Minimizes memory usage at the expense of CPU time. @@ -31,7 +35,8 @@ impl Default for FrameView { Self { recent: std::collections::VecDeque::with_capacity(max_recent), max_recent, - slowest: std::collections::BinaryHeap::with_capacity(max_slow), + slowest_by_index: std::collections::BTreeSet::new(), + slowest_by_duration: std::collections::BTreeSet::new(), max_slow, pack_frames: true, stats, @@ -41,39 +46,40 @@ impl Default for FrameView { impl FrameView { pub fn is_empty(&self) -> bool { - self.recent.is_empty() && self.slowest.is_empty() + self.recent.is_empty() && self.slowest_by_duration.is_empty() } pub fn add_frame(&mut self, new_frame: Arc) { - if let Some(last) = self.recent.back() { - if new_frame.frame_index() <= last.frame_index() { + if let Some(last) = self.recent.iter().last() { + if new_frame.frame_index() <= last.0.frame_index() { // A frame from the past!? // Likely we are `puffin_viewer`, and the server restarted. // The safe choice is to clear everything: - self.recent.clear(); - self.slowest.clear(); self.stats.clear(); + self.recent.clear(); + self.slowest_by_index.clear(); + self.slowest_by_duration.clear(); } } - let add_to_slowest = if self.slowest.len() < self.max_slow { - true - } else if let Some(fastest_of_the_slow) = self.slowest.peek() { - new_frame.duration_ns() > fastest_of_the_slow.0.duration_ns() - } else { - false - }; - - if let Some(last) = self.recent.back() { + if let Some(last) = self.recent.iter().last() { // Assume there is a viewer viewing the newest frame, // and compress the previously newest frame to save RAM: if self.pack_frames { - last.pack(); + last.0.pack(); } - self.stats.add(last); + self.stats.add(&last.0); } + let add_to_slowest = if self.slowest_by_duration.len() < self.max_slow { + true + } else if let Some(fastest_of_the_slow) = self.slowest_by_duration.iter().last() { + new_frame.duration_ns() > fastest_of_the_slow.0.duration_ns() + } else { + false + }; + if add_to_slowest { self.add_slow_frame(&new_frame); } @@ -82,16 +88,20 @@ impl FrameView { } pub fn add_slow_frame(&mut self, new_frame: &Arc) { - self.slowest.push(OrderedByDuration(new_frame.clone())); + assert_eq!(self.slowest_by_duration.len(), self.slowest_by_index.len()); + + self.slowest_by_duration + .insert(OrderedByDuration(new_frame.clone())); + self.slowest_by_index + .insert(OrderedByIndex(new_frame.clone())); + + while self.slowest_by_duration.len() > self.max_slow { + if let Some(removed_frame) = self.slowest_by_duration.pop_last() { + let removed_by_index = OrderedByIndex(removed_frame.0.clone()); + self.slowest_by_index.remove(&removed_by_index); - while self.slowest.len() > self.max_slow { - if let Some(removed_frame) = self.slowest.pop() { // Only remove from stats if the frame is not present in recent - if self - .recent - .binary_search_by_key(&removed_frame.0.frame_index(), |f| f.frame_index()) - .is_err() - { + if self.recent.binary_search(&removed_by_index).is_err() { self.stats.remove(&removed_frame.0); } } @@ -99,17 +109,13 @@ impl FrameView { } pub fn add_recent_frame(&mut self, new_frame: &Arc) { - self.recent.push_back(new_frame.clone()); + self.recent.push_back(OrderedByIndex(new_frame.clone())); while self.recent.len() > self.max_recent { if let Some(removed_frame) = self.recent.pop_front() { // Only remove from stats if the frame is not present in slowest - if !self - .slowest - .iter() - .any(|f| removed_frame.frame_index() == f.0.frame_index()) - { - self.stats.remove(&removed_frame); + if !self.slowest_by_index.contains(&removed_frame) { + self.stats.remove(&removed_frame.0); } } } @@ -117,50 +123,44 @@ impl FrameView { /// The latest fully captured frame of data. pub fn latest_frame(&self) -> Option> { - self.recent.back().cloned() + self.recent.back().map(|f| f.0.clone()) } /// Returns up to `n` latest fully captured frames of data. - pub fn latest_frames(&self, n: usize) -> Vec> { + pub fn latest_frames(&self, n: usize) -> impl Iterator> { // Probably not the best way to do this, but since // [`self.recent`] is immutable in this context and // working with deque slices is complicated, we'll do // it this way for now. - self.recent.iter().rev().take(n).rev().cloned().collect() + self.recent.iter().rev().take(n).rev().map(|f| &f.0) } /// Oldest first pub fn recent_frames(&self) -> impl Iterator> { - self.recent.iter() + self.recent.iter().map(|f| &f.0) } /// The slowest frames so far (or since last call to [`Self::clear_slowest()`]) /// in chronological order. - pub fn slowest_frames_chronological(&self) -> Vec> { - let mut frames: Vec<_> = self.slowest.iter().map(|f| f.0.clone()).collect(); - frames.sort_by_key(|frame| frame.frame_index()); - frames + pub fn slowest_frames_chronological(&self) -> impl Iterator> { + self.slowest_by_index.iter().map(|f| &f.0) } /// All frames sorted chronologically. - pub fn all_uniq(&self) -> Vec> { - let mut all: Vec<_> = self - .slowest - .iter() - .map(|f| f.0.clone()) - .chain(self.recent.iter().cloned()) - .collect(); - - all.sort_by_key(|frame| frame.frame_index()); - all.dedup_by_key(|frame| frame.frame_index()); - all + pub fn all_uniq(&self) -> impl Iterator> { + Itertools::merge(self.recent.iter(), self.slowest_by_index.iter()) + .dedup() + .map(|f| &f.0) } /// Clean history of the slowest frames. pub fn clear_slowest(&mut self) { - for frame in self.slowest.drain() { + for frame in self.slowest_by_index.iter() { self.stats.remove(&frame.0); } + + self.slowest_by_duration.clear(); + self.slowest_by_index.clear(); } /// How many frames of recent history to store. @@ -217,12 +217,7 @@ impl FrameView { pub fn save_to_writer(&self, write: &mut impl std::io::Write) -> anyhow::Result<()> { write.write_all(b"PUF0")?; - let slowest_frames = self.slowest.iter().map(|f| &f.0); - let mut frames: Vec<_> = slowest_frames.chain(self.recent.iter()).collect(); - frames.sort_by_key(|frame| frame.frame_index()); - frames.dedup_by_key(|frame| frame.frame_index()); - - for frame in frames { + for frame in self.all_uniq() { frame.write_into(write)?; } Ok(()) @@ -277,22 +272,55 @@ pub fn select_slowest(frames: &[Arc], max: usize) -> Vec); +impl Ord for OrderedByDuration { + fn cmp(&self, other: &Self) -> Ordering { + match self.0.duration_ns().cmp(&other.0.duration_ns()).reverse() { + Ordering::Equal => self.0.frame_index().cmp(&other.0.frame_index()), + res => res, + } + } +} + +impl PartialOrd for OrderedByDuration { + fn partial_cmp(&self, other: &Self) -> Option { + Some(self.cmp(other)) + } +} + +impl Eq for OrderedByDuration {} + impl PartialEq for OrderedByDuration { fn eq(&self, other: &Self) -> bool { - self.0.duration_ns().eq(&other.0.duration_ns()) + self.0.duration_ns() == other.0.duration_ns() + && self.0.frame_index() == other.0.frame_index() } } -impl Eq for OrderedByDuration {} -impl PartialOrd for OrderedByDuration { - fn partial_cmp(&self, other: &Self) -> Option { +// ---------------------------------------------------------------------------- +#[derive(Clone)] +struct OrderedByIndex(Arc); + +impl Ord for OrderedByIndex { + fn cmp(&self, other: &Self) -> Ordering { + match self.0.frame_index().cmp(&other.0.frame_index()) { + Ordering::Equal => self.0.duration_ns().cmp(&other.0.duration_ns()), + res => res, + } + } +} + +impl PartialOrd for OrderedByIndex { + fn partial_cmp(&self, other: &Self) -> Option { Some(self.cmp(other)) } } -impl Ord for OrderedByDuration { - fn cmp(&self, other: &Self) -> std::cmp::Ordering { - self.0.duration_ns().cmp(&other.0.duration_ns()).reverse() +impl Eq for OrderedByIndex {} + +impl PartialEq for OrderedByIndex { + fn eq(&self, other: &Self) -> bool { + self.0.frame_index() == other.0.frame_index() + && self.0.duration_ns() == other.0.duration_ns() } } @@ -333,7 +361,7 @@ impl GlobalFrameView { /// Collect statistics for maintained frames #[derive(Clone, Debug, Default)] pub struct FrameStats { - unique_frames: HashSet, + unique_frames: usize, total_ram_used: usize, unpacked_frames: usize, } @@ -350,47 +378,43 @@ impl FrameStats { } fn add(&mut self, frame: &FrameData) { - if self.unique_frames.insert(frame.frame_index()) { - self.total_ram_used = self - .total_ram_used - .saturating_add(frame.bytes_of_ram_used()); - - self.unpacked_frames = self - .unpacked_frames - .saturating_add(frame.has_unpacked() as usize); - } + self.total_ram_used = self + .total_ram_used + .saturating_add(frame.bytes_of_ram_used()); + + self.unpacked_frames = self + .unpacked_frames + .saturating_add(frame.has_unpacked() as usize); + + self.unique_frames = self.unique_frames.saturating_add(1); } fn remove(&mut self, frame: &FrameData) { - if self.unique_frames.remove(&frame.frame_index()) { - self.total_ram_used = self - .total_ram_used - .saturating_sub(frame.bytes_of_ram_used()); - - self.unpacked_frames = self - .unpacked_frames - .saturating_sub(frame.has_unpacked() as usize); - } + self.total_ram_used = self + .total_ram_used + .saturating_sub(frame.bytes_of_ram_used()); + + self.unpacked_frames = self + .unpacked_frames + .saturating_sub(frame.has_unpacked() as usize); + + self.unique_frames = self.unique_frames.saturating_sub(1); } pub fn frames(&self) -> usize { - assert!(self.unique_frames.len() >= self.unpacked_frames); - self.unique_frames.len() + self.unique_frames } pub fn unpacked_frames(&self) -> usize { - assert!(self.unique_frames.len() >= self.unpacked_frames); self.unpacked_frames } pub fn bytes_of_ram_used(&self) -> usize { - assert!(self.unique_frames.len() >= self.unpacked_frames); self.total_ram_used } pub fn clear(&mut self) { - self.unique_frames.clear(); - + self.unique_frames = 0; self.unpacked_frames = 0; self.total_ram_used = 0; } diff --git a/puffin_egui/src/lib.rs b/puffin_egui/src/lib.rs index a48aa93d..81f53300 100644 --- a/puffin_egui/src/lib.rs +++ b/puffin_egui/src/lib.rs @@ -185,6 +185,7 @@ impl GlobalProfilerUi { pub struct AvailableFrames { pub recent: Vec>, pub slowest: Vec>, + pub uniq: Vec>, pub stats: FrameStats, } @@ -192,27 +193,11 @@ impl AvailableFrames { fn latest(frame_view: &FrameView) -> Self { Self { recent: frame_view.recent_frames().cloned().collect(), - slowest: frame_view.slowest_frames_chronological(), + slowest: frame_view.slowest_frames_chronological().cloned().collect(), + uniq: frame_view.all_uniq().cloned().collect(), stats: Default::default(), } } - - fn all_uniq(&self) -> Vec> { - let mut all = self - .slowest - .iter() - .cloned() - .chain(self.recent.iter().cloned()) - .collect::>(); - - all.sort_by_key(|frame| frame.frame_index()); - all.dedup_by_key(|frame| frame.frame_index()); - all - } - - fn stats(&self) -> &FrameStats { - &self.stats - } } /// Multiple streams for one thread. @@ -436,14 +421,14 @@ impl ProfilerUi { } } - fn all_known_frames(&self, frame_view: &FrameView) -> Vec> { - let mut all = frame_view.all_uniq(); - if let Some(paused) = &self.paused { - all.append(&mut paused.frames.all_uniq()); + fn all_known_frames<'a>( + &'a self, + frame_view: &'a FrameView, + ) -> Box> + '_> { + match &self.paused { + Some(paused) => Box::new(frame_view.all_uniq().chain(paused.frames.uniq.iter())), + None => Box::new(frame_view.all_uniq()), } - all.sort_by_key(|frame| frame.frame_index()); - all.dedup_by_key(|frame| frame.frame_index()); - all } fn run_pack_pass_if_needed(&mut self, frame_view: &FrameView) { @@ -649,8 +634,8 @@ impl ProfilerUi { }); { - let uniq = frames.all_uniq(); - let stats = frames.stats(); + let uniq = &frames.uniq; + let stats = &frames.stats; ui.label(format!( "{} frames ({} unpacked) using approximately {:.1} MB.", @@ -660,7 +645,7 @@ impl ProfilerUi { )); if let Some(frame_view) = frame_view.as_mut() { - max_frames_ui(ui, frame_view, &uniq); + max_frames_ui(ui, frame_view, uniq); if self.paused.is_none() { max_num_latest_ui(ui, &mut self.max_num_latest); } From c497844cbdc4f9028f73f92fc4c98d3166fb7221 Mon Sep 17 00:00:00 2001 From: Tomasz Andrzejak Date: Fri, 23 Jun 2023 12:07:05 +0200 Subject: [PATCH 3/4] Merge packed and unpacked into single type This commit consolidates the packed and unpacked data from frame_data into a single enum. The goal of this change is to minimize the number of locks required when retrieving information about a frame. Previously, the process of reading the packing information for stats required four separate locks: 1. One to check 'has_unpacked' 2. Another to verify 'has_unpacked' within 'unpacked_size' 3. One to retrieve 'unpacked_size' 4. And finally, one to retrieve 'packed_size' With this optimization, the lock count is reduced to a single one, achieved through invoking the 'packing_info' function. --- puffin/src/frame_data.rs | 210 +++++++++++++++++++++++++------------ puffin/src/profile_view.rs | 38 +++---- puffin_egui/src/lib.rs | 4 +- 3 files changed, 164 insertions(+), 88 deletions(-) diff --git a/puffin/src/frame_data.rs b/puffin/src/frame_data.rs index 71781c63..6eed46ed 100644 --- a/puffin/src/frame_data.rs +++ b/puffin/src/frame_data.rs @@ -12,7 +12,7 @@ pub type ThreadStreams = BTreeMap>; /// Meta-information about a frame. #[cfg_attr(feature = "serde", derive(serde::Deserialize, serde::Serialize))] -#[derive(Clone, Debug)] +#[derive(Clone, Copy, Debug)] pub struct FrameMeta { /// What frame this is (counting from 0 at application startup). pub frame_index: FrameIndex, @@ -130,6 +130,13 @@ impl FrameData { pub fn bytes_of_ram_used(&self) -> usize { self.unpacked_frame.meta.num_bytes } + + pub fn packing_info(&self) -> PackingInfo { + PackingInfo { + unpacked_size: Some(self.unpacked_frame.meta.num_bytes), + packed_size: None, + } + } pub fn has_packed(&self) -> bool { false } @@ -278,14 +285,106 @@ impl PackedStreams { #[cfg(feature = "packing")] pub struct FrameData { meta: FrameMeta, - /// * [`None`] if still compressed. - /// * `Some(Err(…))` if there was a problem during unpacking. - /// * `Some(Ok(…))` if unpacked. - unpacked_frame: RwLock>>>, + inner: RwLock, +} + +#[derive(Clone, Copy, Debug)] +pub struct PackingInfo { + /// Number of bytes used when unpacked, if has unpacked. + pub unpacked_size: Option, + /// Number of bytes used by the packed data, if has packed. + pub packed_size: Option, +} + +#[cfg(feature = "packing")] +enum Inner { + /// Unpacked data. + Unpacked(Arc), /// [`UnpackedFrameData::thread_streams`], compressed. - /// [`None`] if not yet compressed. - packed_streams: RwLock>, + Packed(PackedStreams), + + /// Both compressed and uncompressed. + Both(Arc, PackedStreams), +} + +#[cfg(feature = "packing")] +impl Inner { + fn unpacked_size(&self) -> Option { + match self { + Inner::Packed(_) => None, + Inner::Unpacked(unpacked) | Inner::Both(unpacked, _) => Some(unpacked.meta.num_bytes), + } + } + + fn unpacked(&self) -> Option> { + match self { + Inner::Packed(_) => None, + Inner::Unpacked(unpacked) | Inner::Both(unpacked, _) => Some(unpacked.clone()), + } + } + + fn unpack(&mut self, unpacked: Arc) { + let temp = std::mem::replace( + self, + Inner::Packed(PackedStreams::new(CompressionKind::Uncompressed, vec![])), + ); + + if let Inner::Packed(packed) = temp { + // Transform only if we don't have unpacked already + *self = Inner::Both(unpacked, packed); + } else { + // Restore the original value if it was not Inner::Packed + *self = temp; + } + } + + fn packed_size(&self) -> Option { + match self { + Inner::Unpacked(_) => None, + Inner::Packed(packed) | Inner::Both(_, packed) => Some(packed.num_bytes()), + } + } + + fn packed(&self) -> Option<&PackedStreams> { + match self { + Inner::Unpacked(_) => None, + Inner::Packed(packed) | Inner::Both(_, packed) => Some(packed), + } + } + + fn pack_and_remove(&mut self) { + if let Inner::Unpacked(ref unpacked) | Inner::Both(ref unpacked, _) = *self { + let packed = PackedStreams::pack(&unpacked.thread_streams); + *self = Self::Packed(packed); + } + } + + fn pack_and_keep(&mut self) { + if let Inner::Unpacked(ref unpacked) = *self { + let packed = PackedStreams::pack(&unpacked.thread_streams); + *self = Self::Packed(packed); + } + } + + fn bytes_of_ram_used(&self) -> usize { + self.unpacked_size().unwrap_or(0) + self.packed_size().unwrap_or(0) + } + + fn has_packed(&self) -> bool { + matches!(self, Inner::Packed(_) | Inner::Both(..)) + } + + fn has_unpacked(&self) -> bool { + matches!(self, Inner::Unpacked(_) | Inner::Both(..)) + } + + fn packing_info(&self) -> PackingInfo { + PackingInfo { + unpacked_size: self.unpacked_size(), + packed_size: self.packed_size(), + } + } } #[cfg(feature = "packing")] @@ -302,9 +401,8 @@ impl FrameData { fn from_unpacked(unpacked_frame: Arc) -> Self { Self { - meta: unpacked_frame.meta.clone(), - unpacked_frame: RwLock::new(Some(Ok(unpacked_frame))), - packed_streams: RwLock::new(None), + meta: unpacked_frame.meta, + inner: RwLock::new(Inner::Unpacked(unpacked_frame)), } } @@ -315,31 +413,37 @@ impl FrameData { /// Number of bytes used by the packed data, if packed. pub fn packed_size(&self) -> Option { - self.packed_streams.read().as_ref().map(|c| c.num_bytes()) + self.inner.read().packed_size() } /// Number of bytes used when unpacked, if known. pub fn unpacked_size(&self) -> Option { - if self.has_unpacked() { - Some(self.meta.num_bytes) - } else { - None - } + self.inner.read().unpacked_size() } /// bytes currently used by the unpacked and packed data. pub fn bytes_of_ram_used(&self) -> usize { - self.unpacked_size().unwrap_or(0) + self.packed_size().unwrap_or(0) + self.inner.read().bytes_of_ram_used() } /// Do we have a packed version stored internally? pub fn has_packed(&self) -> bool { - self.packed_streams.read().is_some() + self.inner.read().has_packed() } /// Do we have a unpacked version stored internally? pub fn has_unpacked(&self) -> bool { - self.unpacked_frame.read().is_some() + self.inner.read().has_unpacked() + } + + /// Provides an overview of the frame's packing status. + /// + /// The function retrieves both the sizes of the unpacked and packed frames, as well as whether + /// packed/unpacked versions of the frame exist internally. The goal of this function is to + /// minimize the number of lock accesses by consolidating information retrieval into a single + /// operation. + pub fn packing_info(&self) -> PackingInfo { + self.inner.read().packing_info() } /// Return the unpacked data. @@ -348,60 +452,34 @@ impl FrameData { /// /// Returns `Err` if failing to decode the packed data. pub fn unpacked(&self) -> anyhow::Result> { - fn unpack_frame_data( - meta: FrameMeta, - packed: &PackedStreams, - ) -> anyhow::Result { - Ok(UnpackedFrameData { - meta, - thread_streams: packed.unpack()?, - }) - } + let unpacked = { + let inner_guard = self.inner.read(); + let Inner::Packed(ref packed) = *inner_guard else { + // Safe to unwrap, variant has to contain unpacked if NOT `Packed` + return Ok(self.inner.read().unpacked().unwrap()); + }; - let has_unpacked = self.unpacked_frame.read().is_some(); - if !has_unpacked { crate::profile_scope!("unpack_puffin_frame"); - let packed_lock = self.packed_streams.read(); - let packed = packed_lock - .as_ref() - .expect("FrameData is neither packed or unpacked"); - - let frame_data_result = unpack_frame_data(self.meta.clone(), packed); - let frame_data_result = frame_data_result.map(Arc::new); - *self.unpacked_frame.write() = Some(frame_data_result); - } - match self.unpacked_frame.read().as_ref().unwrap() { - Ok(frame) => Ok(frame.clone()), - Err(err) => Err(anyhow::format_err!("{}", err)), // can't clone `anyhow::Error` - } + Arc::new(UnpackedFrameData { + meta: self.meta, + thread_streams: packed.unpack()?, + }) + }; + + self.inner.write().unpack(unpacked.clone()); + Ok(unpacked) } /// Make the [`FrameData`] use up less memory. /// Idempotent. pub fn pack(&self) { - self.create_packed(); - *self.unpacked_frame.write() = None; + self.inner.write().pack_and_remove(); } /// Create a packed storage without freeing the unpacked storage. fn create_packed(&self) { - let has_packed = self.packed_streams.read().is_some(); - if !has_packed { - // crate::profile_scope!("pack_puffin_frame"); // we get called from `GlobalProfiler::new_frame`, so avoid recursiveness! - let unpacked_frame = self - .unpacked_frame - .read() - .as_ref() - .expect("We should have an unpacked frame if we don't have a packed one") - .as_ref() - .expect("The unpacked frame should be error free, since it doesn't come from packed source") - .clone(); - - let packed = PackedStreams::pack(&unpacked_frame.thread_streams); - - *self.packed_streams.write() = Some(packed); - } + self.inner.write().pack_and_keep(); } /// Writes one [`FrameData`] into a stream, prefixed by its length ([`u32`] le). @@ -417,8 +495,8 @@ impl FrameData { write.write_all(&meta_serialized)?; self.create_packed(); - let packed_streams_lock = self.packed_streams.read(); - let packed_streams = packed_streams_lock.as_ref().unwrap(); // We just called create_packed + let packed_streams_lock = self.inner.read(); + let packed_streams = packed_streams_lock.packed().unwrap(); // We just called create_packed write.write_all(&(packed_streams.num_bytes() as u32).to_le_bytes())?; write.write_u8(packed_streams.compression_kind as u8)?; @@ -534,8 +612,7 @@ impl FrameData { Ok(Some(Self { meta, - unpacked_frame: RwLock::new(None), - packed_streams: RwLock::new(Some(packed_streams)), + inner: RwLock::new(Inner::Packed(packed_streams)), })) } else if &header == b"PFD3" { // Added 2023-05-13: CompressionKind field @@ -564,8 +641,7 @@ impl FrameData { Ok(Some(Self { meta, - unpacked_frame: RwLock::new(None), - packed_streams: RwLock::new(Some(packed_streams)), + inner: RwLock::new(Inner::Packed(packed_streams)), })) } else { anyhow::bail!("Failed to decode: this data is newer than this reader. Please update your puffin version!"); diff --git a/puffin/src/profile_view.rs b/puffin/src/profile_view.rs index 211f6818..2fc9d1dd 100644 --- a/puffin/src/profile_view.rs +++ b/puffin/src/profile_view.rs @@ -87,7 +87,7 @@ impl FrameView { self.add_recent_frame(&new_frame); } - pub fn add_slow_frame(&mut self, new_frame: &Arc) { + fn add_slow_frame(&mut self, new_frame: &Arc) { assert_eq!(self.slowest_by_duration.len(), self.slowest_by_index.len()); self.slowest_by_duration @@ -108,7 +108,7 @@ impl FrameView { } } - pub fn add_recent_frame(&mut self, new_frame: &Arc) { + fn add_recent_frame(&mut self, new_frame: &Arc) { self.recent.push_back(OrderedByIndex(new_frame.clone())); while self.recent.len() > self.max_recent { @@ -194,8 +194,8 @@ impl FrameView { /// Retrieve statistics for added frames. This operation is efficient and suitable when /// frames have not been manipulated outside of `ProfileView`, such as being unpacked. For /// comprehensive statistics, refer to [`Self::stats_full()`] - pub fn stats(&self) -> &FrameStats { - &self.stats + pub fn stats(&self) -> FrameStats { + self.stats } /// Retrieve detailed statistics by performing a full computation on all the added frames. @@ -358,8 +358,16 @@ impl GlobalFrameView { // ---------------------------------------------------------------------------- +fn stats_entry(frame: &FrameData) -> (usize, usize) { + let info = frame.packing_info(); + ( + info.packed_size.unwrap_or(0) + info.unpacked_size.unwrap_or(0), + info.unpacked_size.is_some() as usize, + ) +} + /// Collect statistics for maintained frames -#[derive(Clone, Debug, Default)] +#[derive(Clone, Copy, Debug, Default)] pub struct FrameStats { unique_frames: usize, total_ram_used: usize, @@ -378,26 +386,18 @@ impl FrameStats { } fn add(&mut self, frame: &FrameData) { - self.total_ram_used = self - .total_ram_used - .saturating_add(frame.bytes_of_ram_used()); - - self.unpacked_frames = self - .unpacked_frames - .saturating_add(frame.has_unpacked() as usize); + let (total, unpacked) = stats_entry(frame); + self.total_ram_used = self.total_ram_used.saturating_add(total); + self.unpacked_frames = self.unpacked_frames.saturating_add(unpacked); self.unique_frames = self.unique_frames.saturating_add(1); } fn remove(&mut self, frame: &FrameData) { - self.total_ram_used = self - .total_ram_used - .saturating_sub(frame.bytes_of_ram_used()); - - self.unpacked_frames = self - .unpacked_frames - .saturating_sub(frame.has_unpacked() as usize); + let (total, unpacked) = stats_entry(frame); + self.total_ram_used = self.total_ram_used.saturating_sub(total); + self.unpacked_frames = self.unpacked_frames.saturating_sub(unpacked); self.unique_frames = self.unique_frames.saturating_sub(1); } diff --git a/puffin_egui/src/lib.rs b/puffin_egui/src/lib.rs index 81f53300..cbb5e2a2 100644 --- a/puffin_egui/src/lib.rs +++ b/puffin_egui/src/lib.rs @@ -388,12 +388,12 @@ impl ProfilerUi { self.paused.as_ref().map_or_else( || { let mut frames = AvailableFrames::latest(frame_view); - frames.stats = frame_view.stats().clone(); + frames.stats = frame_view.stats(); frames }, |paused| { let mut frames = paused.frames.clone(); - frames.stats = frame_view.stats_full(); + frames.stats = FrameStats::from_frames(paused.frames.uniq.iter().map(Arc::as_ref)); frames }, ) From c38b8ae8a028206f781b6a8dbafb9ba1143f6b74 Mon Sep 17 00:00:00 2001 From: Tomasz Andrzejak Date: Fri, 28 Jun 2024 15:20:33 +0200 Subject: [PATCH 4/4] Build SelectedFrames from iterator --- puffin/CHANGELOG.md | 2 ++ puffin/src/profile_view.rs | 18 +++++++++++------- puffin_egui/CHANGELOG.md | 1 + puffin_egui/src/lib.rs | 26 +++++++++++++++----------- 4 files changed, 29 insertions(+), 18 deletions(-) diff --git a/puffin/CHANGELOG.md b/puffin/CHANGELOG.md index 9249fa27..3ff86708 100644 --- a/puffin/CHANGELOG.md +++ b/puffin/CHANGELOG.md @@ -9,6 +9,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] - ReleaseDate +- [PR#151](https://github.com/EmbarkStudios/puffin/pull/151) Optimize frame statistics collection. + ## [0.19.0] - 2024-01-17 - [PR#169](https://github.com/EmbarkStudios/puffin/pull/169) Stream scope information only once, drastically reduce bandwidth and increased performance. Allow better usage of static strings in profile scopes. Breaking change! See PR for migration guide. diff --git a/puffin/src/profile_view.rs b/puffin/src/profile_view.rs index e01d6b87..605c610d 100644 --- a/puffin/src/profile_view.rs +++ b/puffin/src/profile_view.rs @@ -1,5 +1,9 @@ use itertools::Itertools; -use std::{cmp::Ordering, sync::Arc}; +use std::{ + cmp::Ordering, + collections::{BTreeSet, VecDeque}, + sync::Arc, +}; use crate::{FrameData, FrameSinkId, ScopeCollection}; @@ -7,11 +11,11 @@ use crate::{FrameData, FrameSinkId, ScopeCollection}; #[derive(Clone)] pub struct FrameView { /// newest first - recent: std::collections::VecDeque, + recent: VecDeque, max_recent: usize, - slowest_by_index: std::collections::BTreeSet, - slowest_by_duration: std::collections::BTreeSet, + slowest_by_index: BTreeSet, + slowest_by_duration: BTreeSet, max_slow: usize, /// Minimizes memory usage at the expense of CPU time. @@ -31,10 +35,10 @@ impl Default for FrameView { let max_slow = 256; Self { - recent: std::collections::VecDeque::with_capacity(max_recent), + recent: VecDeque::with_capacity(max_recent), max_recent, - slowest_by_index: std::collections::BTreeSet::new(), - slowest_by_duration: std::collections::BTreeSet::new(), + slowest_by_index: BTreeSet::new(), + slowest_by_duration: BTreeSet::new(), max_slow, pack_frames: true, stats: Default::default(), diff --git a/puffin_egui/CHANGELOG.md b/puffin_egui/CHANGELOG.md index e9d178df..29e2079c 100644 --- a/puffin_egui/CHANGELOG.md +++ b/puffin_egui/CHANGELOG.md @@ -8,6 +8,7 @@ All notable changes to the egui crate will be documented in this file. ## [Unreleased] - ReleaseDate - [PR#218](https://github.com/EmbarkStudios/puffin/pull/218) Fix flamegraph click intersection +- [PR#151](https://github.com/EmbarkStudios/puffin/pull/151) Optimize frame statistics collection. ## [0.27.1] - 2024-06-16 diff --git a/puffin_egui/src/lib.rs b/puffin_egui/src/lib.rs index 4f8b7e5d..2587692c 100644 --- a/puffin_egui/src/lib.rs +++ b/puffin_egui/src/lib.rs @@ -23,6 +23,7 @@ use puffin::*; use std::{ collections::{BTreeMap, BTreeSet}, fmt::Write as _, + iter, sync::Arc, }; use time::OffsetDateTime; @@ -211,11 +212,15 @@ pub struct SelectedFrames { } impl SelectedFrames { - fn try_from_vec( + fn try_from_iter( scope_collection: &ScopeCollection, - frames: Vec>, + frames: impl Iterator>, ) -> Option { - let frames = vec1::Vec1::try_from_vec(frames).ok()?; + let mut it = frames; + let first = it.next()?; + let mut frames = vec1::Vec1::new(first); + frames.extend(it); + Some(Self::from_vec1(scope_collection, frames)) } @@ -450,7 +455,7 @@ impl ProfilerUi { let frames = if let Some(frame) = hovered_frame { match frame.unpacked() { Ok(frame) => { - SelectedFrames::try_from_vec(frame_view.scope_collection(), vec![frame]) + SelectedFrames::try_from_iter(frame_view.scope_collection(), iter::once(frame)) } Err(err) => { ui.colored_label(ERROR_COLOR, format!("Failed to load hovered frame: {err}")); @@ -461,13 +466,12 @@ impl ProfilerUi { Some(paused.selected.clone()) } else { puffin::profile_scope!("select_latest_frames"); - let latest = frame_view.latest_frames(self.max_num_latest); - let unpacked: Vec> = latest - .into_iter() + let latest = frame_view + .latest_frames(self.max_num_latest) .map(|frame| frame.unpacked()) - .filter_map(|unpacked| unpacked.ok()) - .collect(); - SelectedFrames::try_from_vec(frame_view.scope_collection(), unpacked) + .filter_map(|unpacked| unpacked.ok()); + + SelectedFrames::try_from_iter(frame_view.scope_collection(), latest) }; let frames = if let Some(frames) = frames { @@ -760,7 +764,7 @@ impl ProfilerUi { } if let Some(new_selection) = - SelectedFrames::try_from_vec(frame_view.scope_collection(), new_selection) + SelectedFrames::try_from_iter(frame_view.scope_collection(), new_selection.into_iter()) { self.pause_and_select(frame_view, new_selection); }