fix(stream): Don't break out of a UTF-8 character part-way

This fixes a soundness issue where we create invalid UTF-8 data and then call `str::from_utf8_unchecked` on it on release builds. This ensures we ignore input up to the start of a UTF-8 sequence and never stop mid-way through one. Fixes #156
rust-cli · Jan 12, 2024 · a54cb5b · a54cb5b
1 parent 51ffc2d
commit a54cb5b
Showing 1 changed file with 4 additions and 18 deletions.
diff --git a/crates/anstream/src/adapter/strip.rs b/crates/anstream/src/adapter/strip.rs
@@ -114,19 +114,19 @@ impl<'s> Iterator for StripStrIter<'s> {
 #[inline]
 fn next_str<'s>(bytes: &mut &'s [u8], state: &mut State) -> Option<&'s str> {
     let offset = bytes.iter().copied().position(|b| {
-        let (next_state, action) = state_change(*state, b);
+        let (next_state, action) = dbg!(state_change(*state, b));
         if next_state != State::Anywhere {
             *state = next_state;
         }
-        is_printable_str(action, b)
+        is_printable_bytes(action, b)
     });
     let (_, next) = bytes.split_at(offset.unwrap_or(bytes.len()));
     *bytes = next;
     *state = State::Ground;
 
     let offset = bytes.iter().copied().position(|b| {
         let (_next_state, action) = state_change(State::Ground, b);
-        !is_printable_str(action, b)
+        !(is_printable_bytes(action, b) || is_utf8_continuation(b))
     });
     let (printable, next) = bytes.split_at(offset.unwrap_or(bytes.len()));
     *bytes = next;
@@ -153,19 +153,6 @@ unsafe fn from_utf8_unchecked<'b>(bytes: &'b [u8], safety_justification: &'stati
     }
 }
 
-#[inline]
-fn is_printable_str(action: Action, byte: u8) -> bool {
-    // VT320 considered 0x7f to be `Print`able but we expect to be working in UTF-8 systems and not
-    // ISO Latin-1, making it DEL and non-printable
-    const DEL: u8 = 0x7f;
-    (action == Action::Print && byte != DEL)
-        || action == Action::BeginUtf8
-        // since we know the input is valid UTF-8, the only thing  we can do with
-        // continuations is to print them
-        || is_utf8_continuation(byte)
-        || (action == Action::Execute && byte.is_ascii_whitespace())
-}
-
 #[inline]
 fn is_utf8_continuation(b: u8) -> bool {
     matches!(b, 0x80..=0xbf)
@@ -474,13 +461,12 @@ mod test {
     }
 
     #[test]
-    #[should_panic]
     fn test_strip_str_handles_broken_sequence() {
         // valid utf8: \xc3\xb6 then \x1b then \xf0\x9f\x98\x80
         let s = "ö\x1b😀hello😀goodbye";
         let mut it = strip_str(s);
         assert_eq!("ö", it.next().unwrap());
-        assert_eq!("😀hello😀goodbye", it.next().unwrap());
+        assert_eq!("ello😀goodbye", it.next().unwrap());
     }
 
     proptest! {