From b1edc7d13f72880fd0ac569403a409e5f7961d5f Mon Sep 17 00:00:00 2001
From: Alisa Sireneva <me@purplesyringa.moe>
Date: Sat, 27 Jul 2024 00:22:11 +0300
Subject: [PATCH] Optimize position search in error path

Translating an index into a line/column pair takes considerable time.
Notably, the JSON benchmark modified to run on malformed data spends
around 50% of the CPU time generating the error object.

While it is generally assumed that the cold path is quite slow, such a
drastic pessimization may be unexpected, especially when a faster
implementation exists.

Using vectorized routines provided by the memchr crate increases
performance of the failure path by 2x on average.

Old implementation:
				DOM         STRUCT
	data/canada.json        122 MB/s    168 MB/s
	data/citm_catalog.json  135 MB/s    195 MB/s
	data/twitter.json       142 MB/s    226 MB/s

New implementation:
				DOM         STRUCT
	data/canada.json        216 MB/s    376 MB/s
	data/citm_catalog.json  238 MB/s    736 MB/s
	data/twitter.json       210 MB/s    492 MB/s

In comparison, the performance of the happy path is:

				DOM         STRUCT
	data/canada.json        283 MB/s    416 MB/s
	data/citm_catalog.json  429 MB/s    864 MB/s
	data/twitter.json       275 MB/s    541 MB/s

While this introduces a new dependency, memchr is much faster to compile
than serde, so compile time does not increase significantly.
Additionally, memchr provides a more efficient SWAR-based implementation
of both the memchr and count routines even without std, providing
benefits for embedded uses as well.
---
 Cargo.toml  |  3 ++-
 src/read.rs | 19 +++++++------------
 2 files changed, 9 insertions(+), 13 deletions(-)

diff --git a/Cargo.toml b/Cargo.toml
index 6a8e74526..8f8b45582 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -14,6 +14,7 @@ rust-version = "1.56"
 [dependencies]
 indexmap = { version = "2.2.3", optional = true }
 itoa = "1.0"
+memchr = { version = "2", default-features = false }
 ryu = "1.0"
 serde = { version = "1.0.194", default-features = false }
 
@@ -45,7 +46,7 @@ features = ["raw_value"]
 [features]
 default = ["std"]
 
-std = ["serde/std"]
+std = ["memchr/std", "serde/std"]
 
 # Provide integration for heap-allocated collections without depending on the
 # rest of the Rust standard library.
diff --git a/src/read.rs b/src/read.rs
index a426911c7..e03e13f28 100644
--- a/src/read.rs
+++ b/src/read.rs
@@ -415,19 +415,14 @@ impl<'a> SliceRead<'a> {
     }
 
     fn position_of_index(&self, i: usize) -> Position {
-        let mut position = Position { line: 1, column: 0 };
-        for ch in &self.slice[..i] {
-            match *ch {
-                b'\n' => {
-                    position.line += 1;
-                    position.column = 0;
-                }
-                _ => {
-                    position.column += 1;
-                }
-            }
+        let start_of_line = match memchr::memrchr(b'\n', &self.slice[..i]) {
+            Some(position) => position + 1,
+            None => 0,
+        };
+        Position {
+            line: 1 + memchr::memchr_iter(b'\n', &self.slice[..start_of_line]).count(),
+            column: i - start_of_line,
         }
-        position
     }
 
     /// The big optimization here over IoRead is that if the string contains no