Skip to content

Commit

Permalink
Auto merge of rust-lang#128463 - GrigorenkoPV:perfect-hash, r=<try>
Browse files Browse the repository at this point in the history
rustc_errors: use perfect hashing for character replacements

The correctness of code in rust-lang#128200 relies on an array being sorted (so that it can be used in binary search later), which is currently enforced with `// tidy-alphabetical` (and characters being written in `\u{XXXX}` form), as well as lack of duplicate entries with conflicting keys, which is not currently enforced.

A const assert or a test can be added checking that (implemented in rust-lang#128465).

But this PR tries to use [perfect hashing](https://en.wikipedia.org/wiki/Perfect_hash_function) instead.

The performance implications are unclear. Asymptotically it's faster, but in reality we should just benchmark. Plus if there are no significant performance wins, this entire things is probably not even worse the additional dependencies it brings.

UPD: funnily enough, there's a PR optimizing the binary search implementation (rust-lang#128254) in the queue right now. So I guess we have to wait until that is merged too before benchmarking this.
  • Loading branch information
bors committed Aug 6, 2024
2 parents 60d1465 + 4108ac4 commit 1de00dc
Show file tree
Hide file tree
Showing 4 changed files with 70 additions and 53 deletions.
15 changes: 15 additions & 0 deletions Cargo.lock
Original file line number Diff line number Diff line change
Expand Up @@ -2712,6 +2712,7 @@ version = "0.11.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ade2d8b8f33c7333b51bcf0428d37e217e9f32192ae4772156f65063b8ce03dc"
dependencies = [
"phf_macros",
"phf_shared 0.11.2",
]

Expand Down Expand Up @@ -2745,6 +2746,19 @@ dependencies = [
"rand",
]

[[package]]
name = "phf_macros"
version = "0.11.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3444646e286606587e49f3bcf1679b8cef1dc2c5ecc29ddacaffc305180d464b"
dependencies = [
"phf_generator 0.11.2",
"phf_shared 0.11.2",
"proc-macro2",
"quote",
"syn 2.0.67",
]

[[package]]
name = "phf_shared"
version = "0.10.0"
Expand Down Expand Up @@ -3653,6 +3667,7 @@ version = "0.0.0"
dependencies = [
"annotate-snippets 0.10.2",
"derive_setters",
"phf",
"rustc_ast",
"rustc_ast_pretty",
"rustc_data_structures",
Expand Down
1 change: 1 addition & 0 deletions compiler/rustc_errors/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ edition = "2021"
# tidy-alphabetical-start
annotate-snippets = "0.10"
derive_setters = "0.1.6"
phf = { version = "0.11.2", features = ["macros"] }
rustc_ast = { path = "../rustc_ast" }
rustc_ast_pretty = { path = "../rustc_ast_pretty" }
rustc_data_structures = { path = "../rustc_data_structures" }
Expand Down
102 changes: 49 additions & 53 deletions compiler/rustc_errors/src/emitter.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2583,66 +2583,62 @@ fn num_decimal_digits(num: usize) -> usize {
}

// We replace some characters so the CLI output is always consistent and underlines aligned.
// Keep the following list in sync with `rustc_span::char_width`.
// ATTENTION: keep lexicografically sorted so that the binary search will work
const OUTPUT_REPLACEMENTS: &[(char, &str)] = &[
// tidy-alphabetical-start
const OUTPUT_REPLACEMENTS: phf::Map<char, &'static str> = phf::phf_map![
// In terminals without Unicode support the following will be garbled, but in *all* terminals
// the underlying codepoint will be as well. We could gate this replacement behind a "unicode
// support" gate.
('\0', "␀"),
('\u{0001}', "␁"),
('\u{0002}', "␂"),
('\u{0003}', "␃"),
('\u{0004}', "␄"),
('\u{0005}', "␅"),
('\u{0006}', "␆"),
('\u{0007}', "␇"),
('\u{0008}', "␈"),
('\u{0009}', " "), // We do our own tab replacement
('\u{000b}', "␋"),
('\u{000c}', "␌"),
('\u{000d}', "␍"),
('\u{000e}', "␎"),
('\u{000f}', "␏"),
('\u{0010}', "␐"),
('\u{0011}', "␑"),
('\u{0012}', "␒"),
('\u{0013}', "␓"),
('\u{0014}', "␔"),
('\u{0015}', "␕"),
('\u{0016}', "␖"),
('\u{0017}', "␗"),
('\u{0018}', "␘"),
('\u{0019}', "␙"),
('\u{001a}', "␚"),
('\u{001b}', "␛"),
('\u{001c}', "␜"),
('\u{001d}', "␝"),
('\u{001e}', "␞"),
('\u{001f}', "␟"),
('\u{007f}', "␡"),
('\u{200d}', ""), // Replace ZWJ for consistent terminal output of grapheme clusters.
('\u{202a}', "�"), // The following unicode text flow control characters are inconsistently
('\u{202b}', "�"), // supported across CLIs and can cause confusion due to the bytes on disk
('\u{202c}', "�"), // not corresponding to the visible source code, so we replace them always.
('\u{202d}', "�"),
('\u{202e}', "�"),
('\u{2066}', "�"),
('\u{2067}', "�"),
('\u{2068}', "�"),
('\u{2069}', "�"),
// tidy-alphabetical-end
'\0' => "␀",
'\t' => " ", // We do our own tab replacement
'\r' => "␍",
'\u{0001}' => "␁",
'\u{0002}' => "␂",
'\u{0003}' => "␃",
'\u{0004}' => "␄",
'\u{0005}' => "␅",
'\u{0006}' => "␆",
'\u{0007}' => "␇",
'\u{0008}' => "␈",
'\u{000b}' => "␋",
'\u{000c}' => "␌",
'\u{000e}' => "␎",
'\u{000f}' => "␏",
'\u{0010}' => "␐",
'\u{0011}' => "␑",
'\u{0012}' => "␒",
'\u{0013}' => "␓",
'\u{0014}' => "␔",
'\u{0015}' => "␕",
'\u{0016}' => "␖",
'\u{0017}' => "␗",
'\u{0018}' => "␘",
'\u{0019}' => "␙",
'\u{001a}' => "␚",
'\u{001b}' => "␛",
'\u{001c}' => "␜",
'\u{001d}' => "␝",
'\u{001e}' => "␞",
'\u{001f}' => "␟",
'\u{007f}' => "␡",
'\u{200d}' => "", // Replace ZWJ for consistent terminal output of grapheme clusters.
'\u{202a}' => "�", // The following unicode text flow control characters are inconsistently
'\u{202b}' => "�", // supported across CLIs and can cause confusion due to the bytes on disk
'\u{202c}' => "�", // not corresponding to the visible source code, so we replace them always.
'\u{202d}' => "�",
'\u{202e}' => "�",
'\u{2066}' => "�",
'\u{2067}' => "�",
'\u{2068}' => "�",
'\u{2069}' => "�",
];

fn normalize_whitespace(s: &str) -> String {
// Scan the input string for a character in the ordered table above. If it's present, replace
// it with it's alternative string (it can be more than 1 char!). Otherwise, retain the input
// char. At the end, allocate all chars into a string in one operation.
// Scan the input string for a character in the replacement table above.
// If it's present, replace it with its alternative string (it can be more than 1 char!).
// Otherwise, retain the input char.
s.chars().fold(String::with_capacity(s.len()), |mut s, c| {
match OUTPUT_REPLACEMENTS.binary_search_by_key(&c, |(k, _)| *k) {
Ok(i) => s.push_str(OUTPUT_REPLACEMENTS[i].1),
_ => s.push(c),
match OUTPUT_REPLACEMENTS.get(&c) {
Some(r) => s.push_str(r),
None => s.push(c),
}
s
})
Expand Down
5 changes: 5 additions & 0 deletions src/tools/tidy/src/deps.rs
Original file line number Diff line number Diff line change
Expand Up @@ -345,6 +345,10 @@ const PERMITTED_RUSTC_DEPENDENCIES: &[&str] = &[
"parking_lot_core",
"pathdiff",
"perf-event-open-sys",
"phf",
"phf_generator",
"phf_macros",
"phf_shared",
"pin-project-lite",
"polonius-engine",
"portable-atomic", // dependency for platforms doesn't support `AtomicU64` in std
Expand Down Expand Up @@ -386,6 +390,7 @@ const PERMITTED_RUSTC_DEPENDENCIES: &[&str] = &[
"sha2",
"sharded-slab",
"shlex",
"siphasher",
"smallvec",
"snap",
"stable_deref_trait",
Expand Down

0 comments on commit 1de00dc

Please sign in to comment.