Skip to content

Commit

Permalink
codec: AnyDelimiter codec (#3406)
Browse files Browse the repository at this point in the history
  • Loading branch information
r-zig authored Feb 23, 2021
1 parent 8efed43 commit c9d2a36
Show file tree
Hide file tree
Showing 3 changed files with 492 additions and 1 deletion.
263 changes: 263 additions & 0 deletions tokio-util/src/codec/any_delimiter_codec.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,263 @@
use crate::codec::decoder::Decoder;
use crate::codec::encoder::Encoder;

use bytes::{Buf, BufMut, Bytes, BytesMut};
use std::{cmp, fmt, io, str, usize};

const DEFAULT_SEEK_DELIMITERS: &[u8] = b",;\n\r";
const DEFAULT_SEQUENCE_WRITER: &[u8] = b",";
/// A simple [`Decoder`] and [`Encoder`] implementation that splits up data into chunks based on any character in the given delimiter string.
///
/// [`Decoder`]: crate::codec::Decoder
/// [`Encoder`]: crate::codec::Encoder
///
/// # Example
/// Decode string of bytes containing various different delimiters.
///
/// [`BytesMut`]: bytes::BytesMut
/// [`Error`]: std::io::Error
///
/// ```
/// use tokio_util::codec::{AnyDelimiterCodec, Decoder};
/// use bytes::{BufMut, BytesMut};
///
/// #
/// # #[tokio::main(flavor = "current_thread")]
/// # async fn main() -> Result<(), std::io::Error> {
/// let mut codec = AnyDelimiterCodec::new(b",;\r\n".to_vec(),b";".to_vec());
/// let buf = &mut BytesMut::new();
/// buf.reserve(200);
/// buf.put_slice(b"chunk 1,chunk 2;chunk 3\n\r");
/// assert_eq!("chunk 1", codec.decode(buf).unwrap().unwrap());
/// assert_eq!("chunk 2", codec.decode(buf).unwrap().unwrap());
/// assert_eq!("chunk 3", codec.decode(buf).unwrap().unwrap());
/// assert_eq!("", codec.decode(buf).unwrap().unwrap());
/// assert_eq!(None, codec.decode(buf).unwrap());
/// # Ok(())
/// # }
/// ```
///
#[derive(Clone, Debug, Eq, PartialEq, Ord, PartialOrd, Hash)]
pub struct AnyDelimiterCodec {
// Stored index of the next index to examine for the delimiter character.
// This is used to optimize searching.
// For example, if `decode` was called with `abc` and the delimiter is '{}', it would hold `3`,
// because that is the next index to examine.
// The next time `decode` is called with `abcde}`, the method will
// only look at `de}` before returning.
next_index: usize,

/// The maximum length for a given chunk. If `usize::MAX`, chunks will be
/// read until a delimiter character is reached.
max_length: usize,

/// Are we currently discarding the remainder of a chunk which was over
/// the length limit?
is_discarding: bool,

/// The bytes that are using for search during decode
seek_delimiters: Vec<u8>,

/// The bytes that are using for encoding
sequence_writer: Vec<u8>,
}

impl AnyDelimiterCodec {
/// Returns a `AnyDelimiterCodec` for splitting up data into chunks.
///
/// # Note
///
/// The returned `AnyDelimiterCodec` will not have an upper bound on the length
/// of a buffered chunk. See the documentation for [`new_with_max_length`]
/// for information on why this could be a potential security risk.
///
/// [`new_with_max_length`]: crate::codec::AnyDelimiterCodec::new_with_max_length()
pub fn new(seek_delimiters: Vec<u8>, sequence_writer: Vec<u8>) -> AnyDelimiterCodec {
AnyDelimiterCodec {
next_index: 0,
max_length: usize::MAX,
is_discarding: false,
seek_delimiters,
sequence_writer,
}
}

/// Returns a `AnyDelimiterCodec` with a maximum chunk length limit.
///
/// If this is set, calls to `AnyDelimiterCodec::decode` will return a
/// [`AnyDelimiterCodecError`] when a chunk exceeds the length limit. Subsequent calls
/// will discard up to `limit` bytes from that chunk until a delimiter
/// character is reached, returning `None` until the delimiter over the limit
/// has been fully discarded. After that point, calls to `decode` will
/// function as normal.
///
/// # Note
///
/// Setting a length limit is highly recommended for any `AnyDelimiterCodec` which
/// will be exposed to untrusted input. Otherwise, the size of the buffer
/// that holds the chunk currently being read is unbounded. An attacker could
/// exploit this unbounded buffer by sending an unbounded amount of input
/// without any delimiter characters, causing unbounded memory consumption.
///
/// [`AnyDelimiterCodecError`]: crate::codec::AnyDelimiterCodecError
pub fn new_with_max_length(
seek_delimiters: Vec<u8>,
sequence_writer: Vec<u8>,
max_length: usize,
) -> Self {
AnyDelimiterCodec {
max_length,
..AnyDelimiterCodec::new(seek_delimiters, sequence_writer)
}
}

/// Returns the maximum chunk length when decoding.
///
/// ```
/// use std::usize;
/// use tokio_util::codec::AnyDelimiterCodec;
///
/// let codec = AnyDelimiterCodec::new(b",;\n".to_vec(), b";".to_vec());
/// assert_eq!(codec.max_length(), usize::MAX);
/// ```
/// ```
/// use tokio_util::codec::AnyDelimiterCodec;
///
/// let codec = AnyDelimiterCodec::new_with_max_length(b",;\n".to_vec(), b";".to_vec(), 256);
/// assert_eq!(codec.max_length(), 256);
/// ```
pub fn max_length(&self) -> usize {
self.max_length
}
}

impl Decoder for AnyDelimiterCodec {
type Item = Bytes;
type Error = AnyDelimiterCodecError;

fn decode(&mut self, buf: &mut BytesMut) -> Result<Option<Bytes>, AnyDelimiterCodecError> {
loop {
// Determine how far into the buffer we'll search for a delimiter. If
// there's no max_length set, we'll read to the end of the buffer.
let read_to = cmp::min(self.max_length.saturating_add(1), buf.len());

let new_chunk_offset = buf[self.next_index..read_to].iter().position(|b| {
self.seek_delimiters
.iter()
.any(|delimiter| *b == *delimiter)
});

match (self.is_discarding, new_chunk_offset) {
(true, Some(offset)) => {
// If we found a new chunk, discard up to that offset and
// then stop discarding. On the next iteration, we'll try
// to read a chunk normally.
buf.advance(offset + self.next_index + 1);
self.is_discarding = false;
self.next_index = 0;
}
(true, None) => {
// Otherwise, we didn't find a new chunk, so we'll discard
// everything we read. On the next iteration, we'll continue
// discarding up to max_len bytes unless we find a new chunk.
buf.advance(read_to);
self.next_index = 0;
if buf.is_empty() {
return Ok(None);
}
}
(false, Some(offset)) => {
// Found a chunk!
let new_chunk_index = offset + self.next_index;
self.next_index = 0;
let mut chunk = buf.split_to(new_chunk_index + 1);
chunk.truncate(chunk.len() - 1);
let chunk = chunk.freeze();
return Ok(Some(chunk));
}
(false, None) if buf.len() > self.max_length => {
// Reached the maximum length without finding a
// new chunk, return an error and start discarding on the
// next call.
self.is_discarding = true;
return Err(AnyDelimiterCodecError::MaxChunkLengthExceeded);
}
(false, None) => {
// We didn't find a chunk or reach the length limit, so the next
// call will resume searching at the current offset.
self.next_index = read_to;
return Ok(None);
}
}
}
}

fn decode_eof(&mut self, buf: &mut BytesMut) -> Result<Option<Bytes>, AnyDelimiterCodecError> {
Ok(match self.decode(buf)? {
Some(frame) => Some(frame),
None => {
// return remaining data, if any
if buf.is_empty() {
None
} else {
let chunk = buf.split_to(buf.len());
self.next_index = 0;
Some(chunk.freeze())
}
}
})
}
}

impl<T> Encoder<T> for AnyDelimiterCodec
where
T: AsRef<str>,
{
type Error = AnyDelimiterCodecError;

fn encode(&mut self, chunk: T, buf: &mut BytesMut) -> Result<(), AnyDelimiterCodecError> {
let chunk = chunk.as_ref();
buf.reserve(chunk.len() + 1);
buf.put(chunk.as_bytes());
buf.put(self.sequence_writer.as_ref());

Ok(())
}
}

impl Default for AnyDelimiterCodec {
fn default() -> Self {
Self::new(
DEFAULT_SEEK_DELIMITERS.to_vec(),
DEFAULT_SEQUENCE_WRITER.to_vec(),
)
}
}

/// An error occured while encoding or decoding a chunk.
#[derive(Debug)]
pub enum AnyDelimiterCodecError {
/// The maximum chunk length was exceeded.
MaxChunkLengthExceeded,
/// An IO error occurred.
Io(io::Error),
}

impl fmt::Display for AnyDelimiterCodecError {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
match self {
AnyDelimiterCodecError::MaxChunkLengthExceeded => {
write!(f, "max chunk length exceeded")
}
AnyDelimiterCodecError::Io(e) => write!(f, "{}", e),
}
}
}

impl From<io::Error> for AnyDelimiterCodecError {
fn from(e: io::Error) -> AnyDelimiterCodecError {
AnyDelimiterCodecError::Io(e)
}
}

impl std::error::Error for AnyDelimiterCodecError {}
3 changes: 3 additions & 0 deletions tokio-util/src/codec/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -285,3 +285,6 @@ pub use self::length_delimited::{LengthDelimitedCodec, LengthDelimitedCodecError

mod lines_codec;
pub use self::lines_codec::{LinesCodec, LinesCodecError};

mod any_delimiter_codec;
pub use self::any_delimiter_codec::{AnyDelimiterCodec, AnyDelimiterCodecError};
Loading

0 comments on commit c9d2a36

Please sign in to comment.