From 96e401420888471adaae9420a6dc3230f6e20355 Mon Sep 17 00:00:00 2001 From: AnthonyMikh Date: Thu, 12 Dec 2019 20:19:11 +0300 Subject: [PATCH 01/12] Get rid of allocation in lcs_substr --- src/preproc.rs | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/src/preproc.rs b/src/preproc.rs index c8c5827..4d169d2 100644 --- a/src/preproc.rs +++ b/src/preproc.rs @@ -127,19 +127,20 @@ fn trim(input: Cow<str>) -> Cow<str> { // Aggressive preprocessors -fn lcs_substr(f_line: &str, s_line: &str) -> String { +fn lcs_substr<'a>(f_line: &'a str, s_line: &'a str) -> &'a str { // grab character iterators from both strings - let f_line_chars = f_line.chars(); + let f_line_chars = f_line.char_indices(); let s_line_chars = s_line.chars(); // zip them together and find the common substring from the start - f_line_chars + let common_len = f_line_chars .zip(s_line_chars) - .take_while(|&(f, s)| f == s) - .map(|(f, _s)| f) - .collect::<String>() - .trim() - .into() //TODO: big optimization needed, this is a wasteful conversion + .take_while(|&((_i, f), s)| f == s) + .map(|((i, _f), _s)| i) + .last() + .unwrap_or(0); + + f_line[..common_len].trim() } fn remove_common_tokens(input: Cow<str>) -> Cow<str> { From 8dddb39c825e039b31dc8c7fa3460d877169877d Mon Sep 17 00:00:00 2001 From: AnthonyMikh Date: Thu, 12 Dec 2019 20:37:29 +0300 Subject: [PATCH 02/12] Allocate less in remove_common_tokens This makes the code use a HashMap with string slices as keys. It also eliminates the `Peekable` iterator. --- src/preproc.rs | 33 +++++++++++++++++++++++++++------ 1 file changed, 27 insertions(+), 6 deletions(-) diff --git a/src/preproc.rs b/src/preproc.rs index 4d169d2..7968f19 100644 --- a/src/preproc.rs +++ b/src/preproc.rs @@ -145,22 +145,28 @@ fn lcs_substr<'a>(f_line: &'a str, s_line: &'a str) -> &'a str { fn remove_common_tokens(input: Cow<str>) -> Cow<str> { let lines: Vec<&str> = input.split('\n').collect(); - let mut l_iter = lines.iter().peekable(); + let mut l_iter = lines.iter(); // TODO: consider whether this can all be done in one pass https://github.com/amzn/askalono/issues/36 - let mut prefix_counts: HashMap<String, u32> = HashMap::new(); + let mut prefix_counts = HashMap::<_, u32>::new(); // pass 1: iterate through the text to record common prefixes - while let Some(line) = l_iter.next() { - if let Some(next) = l_iter.peek() { - let common = lcs_substr(line, next); + if let (Some(first), Some(second)) = (l_iter.next(), l_iter.next()) { + let mut pair = (first, second); + let line_pairs = std::iter::from_fn(|| { + let ret = pair; + pair = (pair.1, l_iter.next()?); + Some(ret) + }); + for (a, b) in line_pairs { + let common = lcs_substr(a, b); // why start at 1, then immediately add 1? // lcs_substr compares two lines! // this doesn't need to be exact, just consistent.
if common.len() > 3 { - *prefix_counts.entry(common.to_owned()).or_insert(1) += 1; + *prefix_counts.entry(common).or_insert(1) += 1; } } } @@ -180,6 +186,21 @@ fn remove_common_tokens(input: Cow<str>) -> Cow<str> { } } + // look at the most common observed prefix + let max_prefix = prefix_counts.iter().max_by_key(|&(_k, v)| v); + if max_prefix.is_none() { + return input; + } + let (most_common, _) = max_prefix.unwrap(); + + // reconcile the count with other longer prefixes that may be stored + let mut final_common_count = 0; + for (k, v) in prefix_counts.iter() { + if k.starts_with(most_common) { + final_common_count += v; + } + } + // the common string must be at least 80% of the text let prefix_threshold: u32 = (0.8f32 * lines.len() as f32) as u32; if final_common_count < prefix_threshold { From a62ab2b930fd48d0fde150a6abdf34c745bd0237 Mon Sep 17 00:00:00 2001 From: AnthonyMikh Date: Thu, 12 Dec 2019 20:41:50 +0300 Subject: [PATCH 03/12] De-`unwrap` extracting most common prefix --- src/preproc.rs | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/src/preproc.rs b/src/preproc.rs index 7968f19..595c6d6 100644 --- a/src/preproc.rs +++ b/src/preproc.rs @@ -172,11 +172,10 @@ fn remove_common_tokens(input: Cow<str>) -> Cow<str> { } // look at the most common observed prefix - let max_prefix = prefix_counts.iter().max_by_key(|&(_k, v)| v); - if max_prefix.is_none() { - return input; - } - let (most_common, _) = max_prefix.unwrap(); + let most_common = match prefix_counts.iter().max_by_key(|&(_k, v)| v) { + Some((prefix, _count)) => prefix, + None => return input, + }; From b18a378e9dd822dd83ead7c9cbfa172e8856d5c4 Mon Sep 17 00:00:00 2001 From: AnthonyMikh Date: Thu, 12 Dec 2019 20:49:41 +0300 Subject: [PATCH 04/12] Simplify recounting most common prefix --- src/preproc.rs | 13 +++++--------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/src/preproc.rs b/src/preproc.rs index 595c6d6..93edcb1 100644 --- a/src/preproc.rs +++ b/src/preproc.rs @@ -193,16 +193,13 @@ fn remove_common_tokens(input: Cow<str>) -> Cow<str> { let (most_common, _) = max_prefix.unwrap(); // reconcile the count with other longer prefixes that may be stored - let mut final_common_count = 0; - for (k, v) in prefix_counts.iter() { - if k.starts_with(most_common) { - final_common_count += v; - } - } + let common_count = prefix_counts.iter() + .filter_map(|(s, count)| Some(count).filter(|_| s.starts_with(most_common))) + .sum::<u32>(); // the common string must be at least 80% of the text - let prefix_threshold: u32 = (0.8f32 * lines.len() as f32) as u32; - if final_common_count < prefix_threshold { + let prefix_threshold = (0.8f32 * lines.len() as f32) as _; + if common_count < prefix_threshold { return input; } From 4e8c8d577aead4ff5214d0ffac1ae24e2aaec67d Mon Sep 17 00:00:00 2001 From: AnthonyMikh Date: Thu, 12 Dec 2019 20:53:45 +0300 Subject: [PATCH 05/12] Remove accidentally left old code --- src/preproc.rs | 15 --------------- 1 file changed, 15 deletions(-) diff --git a/src/preproc.rs b/src/preproc.rs index 93edcb1..ea24bab 100644 --- a/src/preproc.rs +++ b/src/preproc.rs @@ -177,21 +177,6 @@ fn remove_common_tokens(input: Cow<str>) -> Cow<str> { None => return input, }; - // reconcile the count with other longer prefixes that may be stored - let mut final_common_count = 0; - for (k, v) in prefix_counts.iter() { - if k.starts_with(most_common) { - final_common_count += v; - } - } - - // look at the most common observed prefix -
let max_prefix = prefix_counts.iter().max_by_key(|&(_k, v)| v); - if max_prefix.is_none() { - return input; - } - let (most_common, _) = max_prefix.unwrap(); - // reconcile the count with other longer prefixes that may be stored let common_count = prefix_counts.iter() .filter_map(|(s, count)| Some(count).filter(|_| s.starts_with(most_common))) .sum::<u32>(); From a358539baa5f29c38a9c0860f836e78d96de6bfa Mon Sep 17 00:00:00 2001 From: AnthonyMikh Date: Thu, 12 Dec 2019 20:55:22 +0300 Subject: [PATCH 06/12] Simplify removing most common prefix --- src/preproc.rs | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/src/preproc.rs b/src/preproc.rs index ea24bab..a0d5599 100644 --- a/src/preproc.rs +++ b/src/preproc.rs @@ -189,17 +189,9 @@ fn remove_common_tokens(input: Cow<str>) -> Cow<str> { } // pass 2: remove that substring - let prefix_len = most_common.len(); lines .iter() - .map(|line| { - if line.starts_with(most_common) { - &line[prefix_len..] - } else { - &line - } - }) - .map(|line| line.trim()) + .map(|s| s.trim_start_matches(most_common).trim()) .collect::<Vec<_>>() .join("\n") .into() From a2f695073b135ba8beb74df2b5e94102d3a2d10d Mon Sep 17 00:00:00 2001 From: AnthonyMikh Date: Thu, 12 Dec 2019 21:25:11 +0300 Subject: [PATCH 07/12] Include last character of common prefix Before this change, the code effectively discarded the last common character. --- src/preproc.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/preproc.rs b/src/preproc.rs index a0d5599..ca75c45 100644 --- a/src/preproc.rs +++ b/src/preproc.rs @@ -136,7 +136,7 @@ fn lcs_substr<'a>(f_line: &'a str, s_line: &'a str) -> &'a str { let common_len = f_line_chars .zip(s_line_chars) .take_while(|&((_i, f), s)| f == s) - .map(|((i, _f), _s)| i) + .map(|((i, f), _s)| i + f.len_utf8()) .last() .unwrap_or(0); f_line[..common_len].trim() } From b6745710249ef1adeaa64b1e09c477c83b476ee9 Mon Sep 17 00:00:00 2001 From: AnthonyMikh Date: Fri, 13 Dec 2019 23:53:32 +0300 Subject: [PATCH 08/12] Calculate len of only last character --- src/preproc.rs | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/preproc.rs b/src/preproc.rs index ca75c45..940ffa1 100644 --- a/src/preproc.rs +++ b/src/preproc.rs @@ -136,9 +136,8 @@ fn lcs_substr<'a>(f_line: &'a str, s_line: &'a str) -> &'a str { let common_len = f_line_chars .zip(s_line_chars) .take_while(|&((_i, f), s)| f == s) - .map(|((i, f), _s)| i + f.len_utf8()) .last() - .unwrap_or(0); + .map_or(0, |((i, f), _s)| i + f.len_utf8()); f_line[..common_len].trim() } From e98e13031a7d8c6f1e28a6a2a3944811f6ffcf1f Mon Sep 17 00:00:00 2001 From: AnthonyMikh Date: Fri, 13 Dec 2019 23:58:24 +0300 Subject: [PATCH 09/12] Include the last pair of strings into lookup --- src/preproc.rs | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/src/preproc.rs b/src/preproc.rs index 940ffa1..a597d51 100644 --- a/src/preproc.rs +++ b/src/preproc.rs @@ -151,12 +151,11 @@ fn remove_common_tokens(input: Cow<str>) -> Cow<str> { let mut prefix_counts = HashMap::<_, u32>::new(); // pass 1: iterate through the text to record common prefixes - if let (Some(first), Some(second)) = (l_iter.next(), l_iter.next()) { - let mut pair = (first, second); + if let Some(first) = l_iter.next() { + let mut pair = ("", first); let line_pairs = std::iter::from_fn(|| { - let ret = pair; pair = (pair.1, l_iter.next()?); - Some(ret) + Some(pair) }); for (a, b) in line_pairs { let common = lcs_substr(a, b); From edbe536f571fdf57786db3b48230bf695a4005d5 Mon Sep 17 00:00:00 2001 From: AnthonyMikh Date: Wed, 18 Dec 2019 16:36:37
+0300 Subject: [PATCH 10/12] Fix prefix trimming Previously the code would strip the prefix repeatedly, making "prefixprefixret" into "ret" instead of "prefixret" --- src/preproc.rs | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/src/preproc.rs b/src/preproc.rs index a597d51..d4296c1 100644 --- a/src/preproc.rs +++ b/src/preproc.rs @@ -189,7 +189,13 @@ fn remove_common_tokens(input: Cow<str>) -> Cow<str> { // pass 2: remove that substring lines .iter() - .map(|s| s.trim_start_matches(most_common).trim()) + .map(|line| { + if line.starts_with(most_common) { + &line[most_common.len()..] + } else { + line + }.trim() + }) .collect::<Vec<_>>() .join("\n") .into() From 8e566b0b001a7cafa153a22e64f0ad1be6ef45c5 Mon Sep 17 00:00:00 2001 From: AnthonyMikh Date: Wed, 18 Dec 2019 17:20:04 +0300 Subject: [PATCH 11/12] Speed up extracting the common prefix `std::str::Chars` is slow since it has to construct codepoints and thus should be avoided. With small adjustments, finding a common byte prefix is sufficient and compiles into more efficient code (the compiler vectorizes the comparison loop) --- src/preproc.rs | 47 ++++++++++++++++++++++++++++++++++++----------- 1 file changed, 36 insertions(+), 11 deletions(-) diff --git a/src/preproc.rs b/src/preproc.rs index d4296c1..c91806f 100644 --- a/src/preproc.rs +++ b/src/preproc.rs @@ -127,19 +127,44 @@ fn trim(input: Cow<str>) -> Cow<str> { // Aggressive preprocessors +// Cut prefix of string near given byte index. +// If given index doesn't lie at char boundary, +// returns the biggest prefix with length not exceeding idx. +// If index is bigger than length of string, returns the whole string. +fn trim_byte_adjusted(s: &str, idx: usize) -> &str { + if idx >= s.len() { + return s + } + + if let Some(sub) = s.get(..idx) { + sub + } else { + // Inspect bytes before index + let trailing_continuation = s.as_bytes()[..idx] + .iter() + .rev() + // Multibyte characters are encoded in UTF-8 in the following manner: + // first byte | rest of bytes + // 1..10xxxxx 10xxxxxx + // ^^^^ number of ones is equal to number of bytes in codepoint + // Number of 10xxxxxx bytes in codepoint is at most 3 in valid UTF-8-encoded string, + // so this loop actually runs only a few iterations + .take_while(|&byte| byte & 0b1100_0000 == 0b1000_0000) + .count(); + // Subtract 1 to take the first byte in codepoint into account + &s[..idx - trailing_continuation - 1] + } +} + fn lcs_substr<'a>(f_line: &'a str, s_line: &'a str) -> &'a str { - // grab character iterators from both strings - let f_line_chars = f_line.char_indices(); - let s_line_chars = s_line.chars(); - - // zip them together and find the common substring from the start - let common_len = f_line_chars - .zip(s_line_chars) - .take_while(|&((_i, f), s)| f == s) - .last() - .map_or(0, |((i, f), _s)| i + f.len_utf8()); + // find the length of common prefix in byte representations of strings + let prefix_len = f_line.as_bytes() + .iter() + .zip(s_line.as_bytes()) + .take_while(|(&f, &s)| f == s) + .count(); - f_line[..common_len].trim() + trim_byte_adjusted(f_line, prefix_len).trim() } fn remove_common_tokens(input: Cow<str>) -> Cow<str> { From ab584f7f19cdfa0de5eeb56b3bf120b09f6af55f Mon Sep 17 00:00:00 2001 From: AnthonyMikh Date: Fri, 20 Dec 2019 10:54:04 +0300 Subject: [PATCH 12/12] Add test for trim_byte_adjusted --- src/preproc.rs | 34 ++++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) diff --git a/src/preproc.rs b/src/preproc.rs index c91806f..94e0f28 100644 --- a/src/preproc.rs +++ b/src/preproc.rs @@ -296,6 +296,40 @@ fn collapse_whitespace(input: Cow<str>) -> Cow<str> { mod
tests { use super::*; + #[test] + fn trim_byte_adjusted_respects_multibyte_characters() { + let input = "RustКраб橙蟹🦀"; + let expected = [ + "", + "R", + "Ru", + "Rus", + "Rust", + "Rust", + "RustК", + "RustК", + "RustКр", + "RustКр", + "RustКра", + "RustКра", + "RustКраб", + "RustКраб", + "RustКраб", + "RustКраб橙", + "RustКраб橙", + "RustКраб橙", + "RustКраб橙蟹", + "RustКраб橙蟹", + "RustКраб橙蟹", + "RustКраб橙蟹", + "RustКраб橙蟹🦀", + ]; + + for (i, &outcome) in expected.iter().enumerate() { + assert_eq!(outcome, trim_byte_adjusted(input, i)) + } + } + #[test] fn greatest_substring_removal() { // the funky string syntax \n\ is to add a newline but skip the