From 4cc058a78903252cdf1eec3c11e735f158b176f7 Mon Sep 17 00:00:00 2001
From: Patrick Jackson <patrick@jackson.dev>
Date: Tue, 7 Jun 2022 10:45:19 -0700
Subject: [PATCH 1/3] dd: reuse buffer for the most common cases

---
 src/uu/dd/src/dd.rs | 24 +++++++++++++++++-------
 1 file changed, 17 insertions(+), 7 deletions(-)
diff --git a/src/uu/dd/src/dd.rs b/src/uu/dd/src/dd.rs
index 115da8bccfd..46c35d0ecec 100644
--- a/src/uu/dd/src/dd.rs
+++ b/src/uu/dd/src/dd.rs
@@ -414,6 +414,10 @@ where
         let (prog_tx, rx) = mpsc::channel();
         thread::spawn(gen_prog_updater(rx, i.print_level));
 
+        // Create a common buffer with a capacity of the block size.
+        // This is the max size needed.
+        let mut buf = vec![BUF_INIT_BYTE; bsize];
+
         // The main read/write loop.
         //
         // Each iteration reads blocks from the input and writes
@@ -427,7 +431,7 @@ where
             // best buffer size for reading based on the number of
             // blocks already read and the number of blocks remaining.
             let loop_bsize = calc_loop_bsize(&i.count, &rstat, &wstat, i.ibs, bsize);
-            let (rstat_update, buf) = read_helper(&mut i, loop_bsize)?;
+            let rstat_update = read_helper(&mut i, &mut buf, loop_bsize)?;
             if rstat_update.is_empty() {
                 break;
             }
@@ -600,7 +604,11 @@ impl Write for Output<io::Stdout> {
 }
 
 /// Read helper performs read operations common to all dd reads, and dispatches the buffer to relevant helper functions as dictated by the operations requested by the user.
-fn read_helper<R: Read>(i: &mut Input<R>, bsize: usize) -> std::io::Result<(ReadStat, Vec<u8>)> {
+fn read_helper<R: Read>(
+    i: &mut Input<R>,
+    mut buf: &mut Vec<u8>,
+    bsize: usize,
+) -> std::io::Result<ReadStat> {
     // Local Helper Fns -------------------------------------------------
     fn perform_swab(buf: &mut [u8]) {
         for base in (1..buf.len()).step_by(2) {
@@ -609,14 +617,16 @@ fn read_helper<R: Read>(i: &mut Input<R>, bsize: usize) -> std::io::Result<(Read
     }
     // ------------------------------------------------------------------
     // Read
-    let mut buf = vec![BUF_INIT_BYTE; bsize];
+    // Resize the buffer to the bsize. Any garbage data in the buffer is overwritten or truncated, so there is no need to fill with BUF_INIT_BYTE first.
+    buf.resize(bsize, BUF_INIT_BYTE);
+
     let mut rstat = match i.cflags.sync {
         Some(ch) => i.fill_blocks(&mut buf, ch)?,
         _ => i.fill_consecutive(&mut buf)?,
     };
     // Return early if no data
     if rstat.reads_complete == 0 && rstat.reads_partial == 0 {
-        return Ok((rstat, buf));
+        return Ok(rstat);
     }
 
     // Perform any conv=x[,x...] options
@@ -626,10 +636,10 @@ fn read_helper<R: Read>(i: &mut Input<R>, bsize: usize) -> std::io::Result<(Read
 
     match i.cflags.mode {
         Some(ref mode) => {
-            let buf = conv_block_unblock_helper(buf, mode, &mut rstat);
-            Ok((rstat, buf))
+            *buf = conv_block_unblock_helper(buf.clone(), mode, &mut rstat);
+            Ok(rstat)
         }
-        None => Ok((rstat, buf)),
+        None => Ok(rstat),
     }
 }
 

From a186adbff1119adfeba54e6ab1aa8751e0635fb9 Mon Sep 17 00:00:00 2001
From: Patrick Jackson <patrick@jackson.dev>
Date: Thu, 9 Jun 2022 09:27:42 -0700
Subject: [PATCH 2/3] dd: fixing clippy warnings.

---
 src/uu/dd/src/dd.rs | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/uu/dd/src/dd.rs b/src/uu/dd/src/dd.rs
index 46c35d0ecec..e47f923eec6 100644
--- a/src/uu/dd/src/dd.rs
+++ b/src/uu/dd/src/dd.rs
@@ -606,7 +606,7 @@ impl Write for Output<io::Stdout> {
 /// Read helper performs read operations common to all dd reads, and dispatches the buffer to relevant helper functions as dictated by the operations requested by the user.
 fn read_helper<R: Read>(
     i: &mut Input<R>,
-    mut buf: &mut Vec<u8>,
+    buf: &mut Vec<u8>,
     bsize: usize,
 ) -> std::io::Result<ReadStat> {
     // Local Helper Fns -------------------------------------------------
@@ -621,8 +621,8 @@ fn read_helper<R: Read>(
     buf.resize(bsize, BUF_INIT_BYTE);
 
     let mut rstat = match i.cflags.sync {
-        Some(ch) => i.fill_blocks(&mut buf, ch)?,
-        _ => i.fill_consecutive(&mut buf)?,
+        Some(ch) => i.fill_blocks(buf, ch)?,
+        _ => i.fill_consecutive(buf)?,
     };
     // Return early if no data
     if rstat.reads_complete == 0 && rstat.reads_partial == 0 {
@@ -631,7 +631,7 @@ fn read_helper<R: Read>(
 
     // Perform any conv=x[,x...] options
     if i.cflags.swab {
-        perform_swab(&mut buf);
+        perform_swab(buf);
     }
 
     match i.cflags.mode {

From 881f0c3d0604e1c2098e7c4e2d05677046b0d8d3 Mon Sep 17 00:00:00 2001
From: Patrick Jackson <patrick@jackson.dev>
Date: Thu, 9 Jun 2022 11:01:30 -0700
Subject: [PATCH 3/3] dd: add BENCHMARKING instructions

---
 src/uu/dd/BENCHMARKING.md | 62 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 62 insertions(+)
 create mode 100644 src/uu/dd/BENCHMARKING.md

diff --git a/src/uu/dd/BENCHMARKING.md b/src/uu/dd/BENCHMARKING.md
new file mode 100644
index 00000000000..57a19bf5c23
--- /dev/null
+++ b/src/uu/dd/BENCHMARKING.md
@@ -0,0 +1,62 @@
+# Benchmarking dd
+
+`dd` is a utility used for copying and converting files. It is often used for
+writing directly to devices, such as when writing an `.iso` file directly to a
+drive.
+
+## Understanding dd
+
+At the core, `dd` has a simple loop of operation. It reads in `blocksize` bytes
+from an input, optionally performs a conversion on the bytes, and then writes
+`blocksize` bytes to an output file.
+
+In typical usage, the performance of `dd` is dominated by the speed at which it
+can read or write to the filesystem. For those scenarios it is best to optimize
+the blocksize for the performance of the devices being read/written to. Devices
+typically have an optimal block size that they work best at, so for maximum
+performance `dd` should be using a block size, or multiple of the block size,
+that the underlying devices prefer.
+
+For benchmarking `dd` itself we will use fast special files provided by the
+operating system that work out of RAM, `/dev/zero` and `/dev/null`. This reduces
+the time taken reading/writing files to a minimum and maximises the percentage
+time we spend in the `dd` tool itself, but care still needs to be taken to
+understand where we are benchmarking the `dd` tool and where we are just
+benchmarking memory performance.
+
+The main parameter to vary for a `dd` benchmark is the blocksize, but benchmarks
+testing the conversions that are supported by `dd` could also be interesting.
+
+`dd` has a convenient `count` argument, that will copy `count` blocks of data
+from the input to the output, which is useful for benchmarking.
+
+## Blocksize Benchmarks
+
+When measuring the impact of blocksize on the throughput, we want to avoid
+testing the startup time of `dd`. `dd` itself will give a report on the
+throughput speed once complete, but it's probably better to use an external
+tool, such as `hyperfine` to measure the performance.
+
+Benchmarks should be sized so that they run for a handful of seconds at a
+minimum to avoid measuring the startup time unnecessarily. The total time will
+be roughly equivalent to the total bytes copied (`blocksize` x `count`).
+
+Some useful invocations for testing would be the following:
+
+```
+hyperfine "./target/release/dd bs=4k count=1000000 < /dev/zero > /dev/null"
+hyperfine "./target/release/dd bs=1M count=20000 < /dev/zero > /dev/null"
+hyperfine "./target/release/dd bs=1G count=10 < /dev/zero > /dev/null"
+```
+
+Choosing what to benchmark depends greatly on what you want to measure.
+Typically you would choose a small blocksize for measuring the performance of
+`dd`, as that would maximize the overhead introduced by the `dd` tool. `dd`
+typically does some set amount of work per block which only depends on the size
+of the block if conversions are used.
+
+As an example, https://github.com/uutils/coreutils/pull/3600 made a change to
+reuse the same buffer between block copies, avoiding the need to reallocate a
+new block of memory for each copy. The impact of that change mostly had an
+impact on large block size copies because those are the circumstances where the
+memory performance dominated the total performance.