// Incremental UTF-8 decoding helpers, added by this patch as `src/utf8.rs`.
//
// The same patch also changes two other files:
//   * Cargo.toml: removes the `utf-8 = "0.7"` dependency.
//   * src/lib.rs: removes `extern crate utf8;` and adds `mod utf8;`.

use std::cmp;
use std::str;

/// The replacement character, U+FFFD. In lossy decoding, insert it for every decoding error.
pub const REPLACEMENT_CHARACTER: &'static str = "\u{FFFD}";

/// Reason why `decode()` could not interpret its whole input as UTF-8.
///
/// Both variants carry `valid_prefix`: the part of the input, before the problem,
/// that was successfully decoded.
#[derive(Debug, Copy, Clone)]
pub enum DecodeError<'a> {
    /// In lossy decoding insert `valid_prefix`, then `"\u{FFFD}"`,
    /// then call `decode()` again with `remaining_input`.
    Invalid {
        valid_prefix: &'a str,
        invalid_sequence: &'a [u8],
        remaining_input: &'a [u8],
    },

    /// Call the `incomplete_suffix.try_complete` method with more input when available.
    /// If no more input is available, this is an invalid byte sequence.
    Incomplete {
        valid_prefix: &'a str,
        incomplete_suffix: Incomplete,
    },
}

/// The first bytes of a code point whose remaining bytes have not been seen yet.
///
/// Only the first `buffer_len` bytes of `buffer` are meaningful.
#[derive(Debug, Copy, Clone)]
pub struct Incomplete {
    pub buffer: [u8; 4],
    pub buffer_len: u8,
}

/// Decodes `input` as UTF-8.
///
/// Returns `Ok` with the whole input as a `&str` when it is entirely valid.
/// Otherwise returns the valid prefix together with either the invalid byte
/// sequence and the input following it (`DecodeError::Invalid`), or the
/// truncated trailing sequence copied into an `Incomplete`
/// (`DecodeError::Incomplete`).
pub fn decode<'a>(input: &'a [u8]) -> Result<&'a str, DecodeError<'a>> {
    let error = match str::from_utf8(input) {
        Ok(valid) => return Ok(valid),
        Err(error) => error,
    };

    // FIXME: separate function from here to guide inlining?
    let (valid, after_valid) = input.split_at(error.valid_up_to());
    // SAFETY: `Utf8Error::valid_up_to` is the length of the prefix that
    // `str::from_utf8` has just verified to be valid UTF-8.
    let valid = unsafe { str::from_utf8_unchecked(valid) };

    match error.error_len() {
        // A definite error: the next `invalid_sequence_length` bytes can never be valid.
        Some(invalid_sequence_length) => {
            let (invalid, rest) = after_valid.split_at(invalid_sequence_length);
            Err(DecodeError::Invalid {
                valid_prefix: valid,
                invalid_sequence: invalid,
                remaining_input: rest,
            })
        }
        // The input ended in the middle of a code point.
        None => Err(DecodeError::Incomplete {
            valid_prefix: valid,
            incomplete_suffix: Incomplete::new(after_valid),
        }),
    }
}

impl Incomplete {
    /// Copies `bytes` into a fresh buffer.
    ///
    /// Panics (slice bounds check) if `bytes` is longer than the 4-byte buffer.
    fn new(bytes: &[u8]) -> Self {
        let mut buffer = [0, 0, 0, 0];
        let len = bytes.len();
        buffer[..len].copy_from_slice(bytes);
        Incomplete {
            buffer,
            buffer_len: len as u8,
        }
    }

    /// * `None`: still incomplete, call `try_complete` again with more input.
    ///   If no more input is available, this is invalid byte sequence.
    /// * `Some((result, remaining_input))`: We’re done with this `Incomplete`.
    ///   To keep decoding, pass `remaining_input` to `decode()`.
    ///
    /// `result` is `Ok` with the decoded text or `Err` with the invalid bytes.
    /// Because input is copied until the 4-byte buffer is full, the decoded text
    /// may contain further characters after the one that was incomplete.
    pub fn try_complete<'input>(
        &mut self,
        input: &'input [u8],
    ) -> Option<(Result<&str, &[u8]>, &'input [u8])> {
        let (consumed, opt_result) = self.try_complete_offsets(input);
        let result = opt_result?;
        let remaining_input = &input[consumed..];
        let result_bytes = self.take_buffer();
        let result = match result {
            // SAFETY: `try_complete_offsets` returns `Ok(())` only after setting
            // `buffer_len` to a length that `str::from_utf8` reported as valid,
            // and `take_buffer` returns exactly those bytes.
            Ok(()) => Ok(unsafe { str::from_utf8_unchecked(result_bytes) }),
            Err(()) => Err(result_bytes),
        };
        Some((result, remaining_input))
    }

    /// Returns the buffered bytes and resets `buffer_len` to zero.
    fn take_buffer(&mut self) -> &[u8] {
        let len = self.buffer_len as usize;
        self.buffer_len = 0;
        &self.buffer[..len]
    }

    /// (consumed_from_input, None): not enough input
    /// (consumed_from_input, Some(Err(()))): error bytes in buffer
    /// (consumed_from_input, Some(Ok(()))): UTF-8 string in buffer
    ///
    /// In every case `buffer_len` is updated to the number of buffer bytes
    /// that belong to the reported result.
    fn try_complete_offsets(&mut self, input: &[u8]) -> (usize, Option<Result<(), ()>>) {
        let initial_buffer_len = self.buffer_len as usize;
        let copied_from_input;
        {
            // Fill the free tail of the buffer with as much input as fits.
            let unwritten = &mut self.buffer[initial_buffer_len..];
            copied_from_input = cmp::min(unwritten.len(), input.len());
            unwritten[..copied_from_input].copy_from_slice(&input[..copied_from_input]);
        }
        let spliced = &self.buffer[..initial_buffer_len + copied_from_input];
        match str::from_utf8(spliced) {
            Ok(_) => {
                self.buffer_len = spliced.len() as u8;
                (copied_from_input, Some(Ok(())))
            }
            Err(error) => {
                let valid_up_to = error.valid_up_to();
                if valid_up_to > 0 {
                    // Only the valid prefix counts as consumed; bytes copied
                    // beyond it are handed back to the caller via the offset.
                    let consumed = valid_up_to
                        .checked_sub(initial_buffer_len)
                        .expect("valid prefix is shorter than the previously buffered bytes");
                    self.buffer_len = valid_up_to as u8;
                    (consumed, Some(Ok(())))
                } else {
                    match error.error_len() {
                        Some(invalid_sequence_length) => {
                            let consumed = invalid_sequence_length
                                .checked_sub(initial_buffer_len)
                                .expect(
                                    "invalid sequence is shorter than the previously buffered bytes",
                                );
                            self.buffer_len = invalid_sequence_length as u8;
                            (consumed, Some(Err(())))
                        }
                        None => {
                            self.buffer_len = spliced.len() as u8;
                            (copied_from_input, None)
                        }
                    }
                }
            }
        }
    }
}