From 96e401420888471adaae9420a6dc3230f6e20355 Mon Sep 17 00:00:00 2001 From: AnthonyMikh Date: Thu, 12 Dec 2019 20:19:11 +0300 Subject: [PATCH 01/12] Get rid of allocation in lcs_substr --- src/preproc.rs | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/src/preproc.rs b/src/preproc.rs index c8c5827..4d169d2 100644 --- a/src/preproc.rs +++ b/src/preproc.rs @@ -127,19 +127,20 @@ fn trim(input: Cow<str>) -> Cow<str> { // Aggressive preprocessors -fn lcs_substr(f_line: &str, s_line: &str) -> String { +fn lcs_substr<'a>(f_line: &'a str, s_line: &'a str) -> &'a str { // grab character iterators from both strings - let f_line_chars = f_line.chars(); + let f_line_chars = f_line.char_indices(); let s_line_chars = s_line.chars(); // zip them together and find the common substring from the start - f_line_chars + let common_len = f_line_chars .zip(s_line_chars) - .take_while(|&(f, s)| f == s) - .map(|(f, _s)| f) - .collect::<String>() - .trim() - .into() //TODO: big optimization needed, this is a wasteful conversion + .take_while(|&((_i, f), s)| f == s) + .map(|((i, _f), _s)| i) + .last() + .unwrap_or(0); + + f_line[..common_len].trim() } fn remove_common_tokens(input: Cow<str>) -> Cow<str> { From 8dddb39c825e039b31dc8c7fa3460d877169877d Mon Sep 17 00:00:00 2001 From: AnthonyMikh Date: Thu, 12 Dec 2019 20:37:29 +0300 Subject: [PATCH 02/12] Allocate less in remove_common_tokens This makes the code use a HashMap with string slices as keys. It also eliminates the `Peekable` iterator. --- src/preproc.rs | 33 +++++++++++++++++++++++++++------ 1 file changed, 27 insertions(+), 6 deletions(-) diff --git a/src/preproc.rs b/src/preproc.rs index 4d169d2..7968f19 100644 --- a/src/preproc.rs +++ b/src/preproc.rs @@ -145,22 +145,28 @@ fn lcs_substr<'a>(f_line: &'a str, s_line: &'a str) -> &'a str { fn remove_common_tokens(input: Cow<str>) -> Cow<str> { let lines: Vec<&str> = input.split('\n').collect(); - let mut l_iter = lines.iter().peekable(); + let mut l_iter = lines.iter(); // TODO: consider whether this can all be done in one pass https://github.com/amzn/askalono/issues/36 - let mut prefix_counts: HashMap<String, u32> = HashMap::new(); + let mut prefix_counts = HashMap::<_, u32>::new(); // pass 1: iterate through the text to record common prefixes - while let Some(line) = l_iter.next() { - if let Some(next) = l_iter.peek() { - let common = lcs_substr(line, next); + if let (Some(first), Some(second)) = (l_iter.next(), l_iter.next()) { + let mut pair = (first, second); + let line_pairs = std::iter::from_fn(|| { + let ret = pair; + pair = (pair.1, l_iter.next()?); + Some(ret) + }); + for (a, b) in line_pairs { + let common = lcs_substr(a, b); // why start at 1, then immediately add 1? // lcs_substr compares two lines! // this doesn't need to be exact, just consistent.
if common.len() > 3 { - *prefix_counts.entry(common.to_owned()).or_insert(1) += 1; + *prefix_counts.entry(common).or_insert(1) += 1; } } } @@ -180,6 +186,21 @@ fn remove_common_tokens(input: Cow<str>) -> Cow<str> { } } + // look at the most common observed prefix + let max_prefix = prefix_counts.iter().max_by_key(|&(_k, v)| v); + if max_prefix.is_none() { + return input; + } + let (most_common, _) = max_prefix.unwrap(); + + // reconcile the count with other longer prefixes that may be stored + let mut final_common_count = 0; + for (k, v) in prefix_counts.iter() { + if k.starts_with(most_common) { + final_common_count += v; + } + } + // the common string must be at least 80% of the text let prefix_threshold: u32 = (0.8f32 * lines.len() as f32) as u32; if final_common_count < prefix_threshold { From a62ab2b930fd48d0fde150a6abdf34c745bd0237 Mon Sep 17 00:00:00 2001 From: AnthonyMikh Date: Thu, 12 Dec 2019 20:41:50 +0300 Subject: [PATCH 03/12] De-`unwrap` extracting most common prefix --- src/preproc.rs | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/src/preproc.rs b/src/preproc.rs index 7968f19..595c6d6 100644 --- a/src/preproc.rs +++ b/src/preproc.rs @@ -172,11 +172,10 @@ fn remove_common_tokens(input: Cow<str>) -> Cow<str> { } // look at the most common observed prefix - let max_prefix = prefix_counts.iter().max_by_key(|&(_k, v)| v); - if max_prefix.is_none() { - return input; - } - let (most_common, _) = max_prefix.unwrap(); + let most_common = match prefix_counts.iter().max_by_key(|&(_k, v)| v) { + Some((prefix, _count)) => prefix, + None => return input, + }; From b18a378e9dd822dd83ead7c9cbfa172e8856d5c4 Mon Sep 17 00:00:00 2001 From: AnthonyMikh Date: Thu, 12 Dec 2019 20:49:41 +0300 Subject: [PATCH 04/12] Simplify recounting most common prefix --- src/preproc.rs | 13 +++++--------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/src/preproc.rs b/src/preproc.rs index 595c6d6..93edcb1 100644 --- a/src/preproc.rs +++ b/src/preproc.rs @@ -193,16 +193,13 @@ fn remove_common_tokens(input: Cow<str>) -> Cow<str> { let (most_common, _) = max_prefix.unwrap(); // reconcile the count with other longer prefixes that may be stored - let mut final_common_count = 0; - for (k, v) in prefix_counts.iter() { - if k.starts_with(most_common) { - final_common_count += v; - } - } + let common_count = prefix_counts.iter() + .filter_map(|(s, count)| Some(count).filter(|_| s.starts_with(most_common))) + .sum::<u32>(); // the common string must be at least 80% of the text - let prefix_threshold: u32 = (0.8f32 * lines.len() as f32) as u32; - if final_common_count < prefix_threshold { + let prefix_threshold = (0.8f32 * lines.len() as f32) as _; + if common_count < prefix_threshold { return input; } From 4e8c8d577aead4ff5214d0ffac1ae24e2aaec67d Mon Sep 17 00:00:00 2001 From: AnthonyMikh Date: Thu, 12 Dec 2019 20:53:45 +0300 Subject: [PATCH 05/12] Remove accidentally left old code --- src/preproc.rs | 15 --------------- 1 file changed, 15 deletions(-) diff --git a/src/preproc.rs b/src/preproc.rs index 93edcb1..ea24bab 100644 --- a/src/preproc.rs +++ b/src/preproc.rs @@ -177,21 +177,6 @@ fn remove_common_tokens(input: Cow<str>) -> Cow<str> { None => return input, }; - // reconcile the count with other longer prefixes that may be stored - let mut final_common_count = 0; - for (k, v) in prefix_counts.iter() { - if k.starts_with(most_common) { - final_common_count += v; - } - } - - // look at the most common observed prefix -
let max_prefix = prefix_counts.iter().max_by_key(|&(_k, v)| v); - if max_prefix.is_none() { - return input; - } - let (most_common, _) = max_prefix.unwrap(); - // reconcile the count with other longer prefixes that may be stored let common_count = prefix_counts.iter() .filter_map(|(s, count)| Some(count).filter(|_| s.starts_with(most_common))) .sum::<u32>(); From a358539baa5f29c38a9c0860f836e78d96de6bfa Mon Sep 17 00:00:00 2001 From: AnthonyMikh Date: Thu, 12 Dec 2019 20:55:22 +0300 Subject: [PATCH 06/12] Simplify removing most common prefix --- src/preproc.rs | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/src/preproc.rs b/src/preproc.rs index ea24bab..a0d5599 100644 --- a/src/preproc.rs +++ b/src/preproc.rs @@ -189,17 +189,9 @@ fn remove_common_tokens(input: Cow<str>) -> Cow<str> { } // pass 2: remove that substring - let prefix_len = most_common.len(); lines .iter() - .map(|line| { - if line.starts_with(most_common) { - &line[prefix_len..] - } else { - &line - } - }) - .map(|line| line.trim()) + .map(|s| s.trim_start_matches(most_common).trim()) .collect::<Vec<_>>() .join("\n") .into() From a2f695073b135ba8beb74df2b5e94102d3a2d10d Mon Sep 17 00:00:00 2001 From: AnthonyMikh Date: Thu, 12 Dec 2019 21:25:11 +0300 Subject: [PATCH 07/12] Include last character of common prefix Before this change, the code effectively discarded the last common character. --- src/preproc.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/preproc.rs b/src/preproc.rs index a0d5599..ca75c45 100644 --- a/src/preproc.rs +++ b/src/preproc.rs @@ -136,7 +136,7 @@ fn lcs_substr<'a>(f_line: &'a str, s_line: &'a str) -> &'a str { let common_len = f_line_chars .zip(s_line_chars) .take_while(|&((_i, f), s)| f == s) - .map(|((i, _f), _s)| i) + .map(|((i, f), _s)| i + f.len_utf8()) .last() .unwrap_or(0); f_line[..common_len].trim() } From b6745710249ef1adeaa64b1e09c477c83b476ee9 Mon Sep 17 00:00:00 2001 From: AnthonyMikh Date: Fri, 13 Dec 2019 23:53:32 +0300 Subject: [PATCH 08/12] Calculate len of only last character --- src/preproc.rs | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/preproc.rs b/src/preproc.rs index ca75c45..940ffa1 100644 --- a/src/preproc.rs +++ b/src/preproc.rs @@ -136,9 +136,8 @@ fn lcs_substr<'a>(f_line: &'a str, s_line: &'a str) -> &'a str { let common_len = f_line_chars .zip(s_line_chars) .take_while(|&((_i, f), s)| f == s) - .map(|((i, f), _s)| i + f.len_utf8()) .last() - .unwrap_or(0); + .map_or(0, |((i, f), _s)| i + f.len_utf8()); f_line[..common_len].trim() } From e98e13031a7d8c6f1e28a6a2a3944811f6ffcf1f Mon Sep 17 00:00:00 2001 From: AnthonyMikh Date: Fri, 13 Dec 2019 23:58:24 +0300 Subject: [PATCH 09/12] Include the last pair of strings into lookup --- src/preproc.rs | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/src/preproc.rs b/src/preproc.rs index 940ffa1..a597d51 100644 --- a/src/preproc.rs +++ b/src/preproc.rs @@ -151,12 +151,11 @@ fn remove_common_tokens(input: Cow<str>) -> Cow<str> { let mut prefix_counts = HashMap::<_, u32>::new(); // pass 1: iterate through the text to record common prefixes - if let (Some(first), Some(second)) = (l_iter.next(), l_iter.next()) { - let mut pair = (first, second); + if let Some(first) = l_iter.next() { + let mut pair = ("", first); let line_pairs = std::iter::from_fn(|| { - let ret = pair; pair = (pair.1, l_iter.next()?); - Some(ret) + Some(pair) }); for (a, b) in line_pairs { let common = lcs_substr(a, b); From edbe536f571fdf57786db3b48230bf695a4005d5 Mon Sep 17 00:00:00 2001 From: AnthonyMikh Date: Wed, 18 Dec 2019 16:36:37
+0300 Subject: [PATCH 10/12] Fix prefix trimming Previously the code would strip the prefix repeatedly, making "prefixprefixret" into "ret" instead of "prefixret" --- src/preproc.rs | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/src/preproc.rs b/src/preproc.rs index a597d51..d4296c1 100644 --- a/src/preproc.rs +++ b/src/preproc.rs @@ -189,7 +189,13 @@ fn remove_common_tokens(input: Cow<str>) -> Cow<str> { // pass 2: remove that substring lines .iter() - .map(|s| s.trim_start_matches(most_common).trim()) + .map(|line| { + if line.starts_with(most_common) { + &line[most_common.len()..] + } else { + line + }.trim() + }) .collect::<Vec<_>>() .join("\n") .into() From 8e566b0b001a7cafa153a22e64f0ad1be6ef45c5 Mon Sep 17 00:00:00 2001 From: AnthonyMikh Date: Wed, 18 Dec 2019 17:20:04 +0300 Subject: [PATCH 11/12] Speed up extracting the common prefix `std::str::Chars` is slow since it has to construct codepoints and thus should be avoided. With small adjustments, finding a common byte prefix is sufficient and compiles into more efficient code (the compiler vectorizes the comparison loop) --- src/preproc.rs | 47 ++++++++++++++++++++++++++++++++++++----------- 1 file changed, 36 insertions(+), 11 deletions(-) diff --git a/src/preproc.rs b/src/preproc.rs index d4296c1..c91806f 100644 --- a/src/preproc.rs +++ b/src/preproc.rs @@ -127,19 +127,44 @@ fn trim(input: Cow<str>) -> Cow<str> { // Aggressive preprocessors +// Cut prefix of string near given byte index. +// If given index doesn't lie at char boundary, +// returns the biggest prefix with length not exceeding idx. +// If index is bigger than length of string, returns the whole string. +fn trim_byte_adjusted(s: &str, idx: usize) -> &str { + if idx >= s.len() { + return s + } + + if let Some(sub) = s.get(..idx) { + sub + } else { + // Inspect bytes before index + let trailing_continuation = s.as_bytes()[..idx] + .iter() + .rev() + // Multibyte characters are encoded in UTF-8 in the following manner: + // first byte | rest of bytes + // 1..10xxxxx 10xxxxxx + // ^^^^ number of ones is equal to number of bytes in codepoint + // Number of 10xxxxxx bytes in codepoint is at most 3 in valid UTF-8-encoded string, + // so this loop actually runs only a few iterations + .take_while(|&byte| byte & 0b1100_0000 == 0b1000_0000) + .count(); + // Subtract 1 to take the first byte in codepoint into account + &s[..idx - trailing_continuation - 1] + } +} + fn lcs_substr<'a>(f_line: &'a str, s_line: &'a str) -> &'a str { - // grab character iterators from both strings - let f_line_chars = f_line.char_indices(); - let s_line_chars = s_line.chars(); - - // zip them together and find the common substring from the start - let common_len = f_line_chars - .zip(s_line_chars) - .take_while(|&((_i, f), s)| f == s) - .last() - .map_or(0, |((i, f), _s)| i + f.len_utf8()); + // find the length of common prefix in byte representations of strings + let prefix_len = f_line.as_bytes() + .iter() + .zip(s_line.as_bytes()) + .take_while(|(&f, &s)| f == s) + .count(); - f_line[..common_len].trim() + trim_byte_adjusted(f_line, prefix_len).trim() } fn remove_common_tokens(input: Cow<str>) -> Cow<str> { From ab584f7f19cdfa0de5eeb56b3bf120b09f6af55f Mon Sep 17 00:00:00 2001 From: AnthonyMikh Date: Fri, 20 Dec 2019 10:54:04 +0300 Subject: [PATCH 12/12] Add test for trim_byte_adjusted --- src/preproc.rs | 34 ++++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) diff --git a/src/preproc.rs b/src/preproc.rs index c91806f..94e0f28 100644 --- a/src/preproc.rs +++ b/src/preproc.rs @@ -296,6 +296,40 @@ fn collapse_whitespace(input: Cow<str>) -> Cow<str> { mod
tests { use super::*; + #[test] + fn trim_byte_adjusted_respects_multibyte_characters() { + let input = "RustКраб橙蟹🦀"; + let expected = [ + "", + "R", + "Ru", + "Rus", + "Rust", + "Rust", + "RustК", + "RustК", + "RustКр", + "RustКр", + "RustКра", + "RustКра", + "RustКраб", + "RustКраб", + "RustКраб", + "RustКраб橙", + "RustКраб橙", + "RustКраб橙", + "RustКраб橙蟹", + "RustКраб橙蟹", + "RustКраб橙蟹", + "RustКраб橙蟹", + "RustКраб橙蟹🦀", + ]; + + for (i, &outcome) in expected.iter().enumerate() { + assert_eq!(outcome, trim_byte_adjusted(input, i)) + } + } + #[test] fn greatest_substring_removal() { // the funky string syntax \n\ is to add a newline but skip the