Skip to content

Commit

Permalink
Additional tooling access methods, trait impls, and a bug fix (#780)
Browse files Browse the repository at this point in the history
* Additional tooling access methods, trait impls, and a bug fix

* Renames the `RawReaderType` enum to `IonEncoding` because it is
  generally useful in other contexts too.
* Adds the ability for system and raw readers using `AnyEncoding`
  to report the encoding they're currently using.
* Adds `value_span` and `annotations_span` methods to `LazyValue`
  in addition to `span`, which includes both the annotations and
  the value.
* The binary 1.0 and 1.1 writers will now write `f64`s as `f32`s
  to save space when it can be done losslessly.
* Adds `WriteAsIon` implementations for `LazyList`, `LazySExp`,
  and `LazyStruct`.
* Adds `IntoIterator` impls for `LazyList`, `LazySExp` and
  `LazyStruct`. Previously, they only existed for borrowed (`&`)
  references to those types.
* Fixes a bug in the `StreamingRawReader` that could cause
  a value's annotations span to be overwritten if reading that
  value consumed all of the data remaining in the buffer.
* Renames the feature-gated `LazyValue::lower` method (which
  returns a `LazyExpandedValue`) to `LazyValue::expanded()`
  so I could add a `raw()` method alongside it that returns
  the underlying `LazyRawValue` when applicable.

* Adds SmallestFloatRepr trait for f64, f32

* `unsafe` explanations

---------

Co-authored-by: Zack Slayton <[email protected]>
  • Loading branch information
zslayton and zslayton authored May 31, 2024
1 parent 1439b07 commit 80088b4
Show file tree
Hide file tree
Showing 18 changed files with 417 additions and 98 deletions.
93 changes: 69 additions & 24 deletions src/lazy/any_encoding.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,6 @@
use std::fmt::Debug;
use std::ops::Range;

use bumpalo::Bump as BumpAllocator;

use crate::lazy::any_encoding::RawReaderKind::{Binary_1_0, Text_1_0};
use crate::lazy::binary::raw::annotations_iterator::RawBinaryAnnotationsIterator as RawBinaryAnnotationsIterator_1_0;
use crate::lazy::binary::raw::r#struct::{
LazyRawBinaryFieldName_1_0, LazyRawBinaryStruct_1_0, RawBinaryStructIterator_1_0,
Expand Down Expand Up @@ -48,15 +45,16 @@ use crate::lazy::text::raw::sequence::{
LazyRawTextList_1_0, LazyRawTextSExp_1_0, RawTextListIterator_1_0, RawTextSExpIterator_1_0,
};
use crate::lazy::text::raw::v1_1::reader::{
LazyRawTextFieldName_1_1, LazyRawTextList_1_1, LazyRawTextSExp_1_1, LazyRawTextStruct_1_1,
MacroIdRef, RawTextEExpression_1_1, RawTextSequenceCacheIterator_1_1,
LazyRawTextFieldName_1_1, LazyRawTextList_1_1, LazyRawTextReader_1_1, LazyRawTextSExp_1_1,
LazyRawTextStruct_1_1, MacroIdRef, RawTextEExpression_1_1, RawTextSequenceCacheIterator_1_1,
RawTextStructCacheIterator_1_1,
};
use crate::lazy::text::value::{
LazyRawTextValue_1_0, LazyRawTextValue_1_1, LazyRawTextVersionMarker_1_0,
LazyRawTextVersionMarker_1_1, RawTextAnnotationsIterator,
};
use crate::{IonResult, IonType, RawSymbolRef};
use bumpalo::Bump as BumpAllocator;

/// An implementation of the `LazyDecoder` trait that can read any encoding of Ion.
#[derive(Debug, Clone, Copy)]
Expand All @@ -67,7 +65,7 @@ pub struct AnyEncoding;
// underlying type.
impl Decoder for AnyEncoding {
type Reader<'data> = LazyRawAnyReader<'data>;
type ReaderSavedState = RawReaderType;
type ReaderSavedState = IonEncoding;
type Value<'top> = LazyRawAnyValue<'top>;
type SExp<'top> = LazyRawAnySExp<'top>;
type List<'top> = LazyRawAnyList<'top>;
Expand Down Expand Up @@ -262,45 +260,68 @@ pub struct LazyRawAnyReader<'data> {
}

impl<'data> LazyRawAnyReader<'data> {
fn detect_encoding(data: &[u8]) -> RawReaderType {
fn detect_encoding(data: &[u8]) -> IonEncoding {
const BINARY_1_0_IVM: &[u8] = &[0xEA, 0x01, 0x00, 0xE0];

match *data {
[0xE0, 0x01, 0x00, 0xEA, ..] => RawReaderType::Binary_1_0,
[0xE0, 0x01, 0x01, 0xEA, ..] => RawReaderType::Binary_1_1,
_ => RawReaderType::Text_1_0,
[0xE0, 0x01, 0x00, 0xEA, ..] => IonEncoding::Binary_1_0,
[0xE0, 0x01, 0x01, 0xEA, ..] => IonEncoding::Binary_1_1,
_ => IonEncoding::Text_1_0,
}
}
}

/// The encoding-specific raw reader held in a `LazyRawAnyReader`'s `encoding` field.
/// There is one variant per concrete reader type.
pub enum RawReaderKind<'data> {
/// Wraps a `LazyRawTextReader_1_0`.
Text_1_0(LazyRawTextReader_1_0<'data>),
/// Wraps a `LazyRawBinaryReader_1_0`.
Binary_1_0(LazyRawBinaryReader_1_0<'data>),
/// Wraps a `LazyRawTextReader_1_1`.
Text_1_1(LazyRawTextReader_1_1<'data>),
/// Wraps a `LazyRawBinaryReader_1_1`.
Binary_1_1(LazyRawBinaryReader_1_1<'data>),
}

#[derive(Default, Copy, Clone)]
pub enum RawReaderType {
#[non_exhaustive]
pub enum IonEncoding {
// In the absence of a binary IVM, readers must assume Ion 1.0 text data until a
// text Ion 1.1 version marker is found.
#[default]
Text_1_0,
Binary_1_0,
Text_1_1,
Binary_1_1,
}

impl IonEncoding {
pub fn is_text(&self) -> bool {
use IonEncoding::*;
matches!(*self, Text_1_0 | Text_1_1)
}

pub fn is_binary(&self) -> bool {
use IonEncoding::*;
matches!(*self, Binary_1_0 | Binary_1_1)
}
}

impl<'data> From<LazyRawTextReader_1_0<'data>> for LazyRawAnyReader<'data> {
fn from(reader: LazyRawTextReader_1_0<'data>) -> Self {
LazyRawAnyReader {
encoding: Text_1_0(reader),
encoding: RawReaderKind::Text_1_0(reader),
}
}
}

impl<'data> From<LazyRawTextReader_1_1<'data>> for LazyRawAnyReader<'data> {
    /// Wraps an Ion 1.1 text reader in the `Text_1_1` variant of `RawReaderKind`.
    fn from(reader: LazyRawTextReader_1_1<'data>) -> Self {
        let encoding = RawReaderKind::Text_1_1(reader);
        Self { encoding }
    }
}

impl<'data> From<LazyRawBinaryReader_1_0<'data>> for LazyRawAnyReader<'data> {
fn from(reader: LazyRawBinaryReader_1_0<'data>) -> Self {
LazyRawAnyReader {
encoding: Binary_1_0(reader),
encoding: RawReaderKind::Binary_1_0(reader),
}
}
}
Expand All @@ -322,21 +343,24 @@ impl<'data> LazyRawReader<'data, AnyEncoding> for LazyRawAnyReader<'data> {
fn resume_at_offset(
data: &'data [u8],
offset: usize,
mut raw_reader_type: RawReaderType,
mut raw_reader_type: IonEncoding,
) -> Self {
if offset == 0 {
// If we're at the beginning of the stream, the provided `raw_reader_type` may be a
// default. We need to inspect the bytes to see if we should override it.
raw_reader_type = Self::detect_encoding(data);
}
match raw_reader_type {
RawReaderType::Text_1_0 => {
IonEncoding::Text_1_0 => {
LazyRawTextReader_1_0::resume_at_offset(data, offset, ()).into()
}
RawReaderType::Binary_1_0 => {
IonEncoding::Binary_1_0 => {
LazyRawBinaryReader_1_0::resume_at_offset(data, offset, ()).into()
}
RawReaderType::Binary_1_1 => {
// A saved `Text_1_1` state must be resumed with the 1.1 text reader. Using the 1.0
// reader here would convert into `RawReaderKind::Text_1_0`, so the resumed reader
// would report (and parse as) the wrong encoding.
IonEncoding::Text_1_1 => {
    LazyRawTextReader_1_1::resume_at_offset(data, offset, ()).into()
}
IonEncoding::Binary_1_1 => {
LazyRawBinaryReader_1_1::resume_at_offset(data, offset, ()).into()
}
}
Expand All @@ -353,6 +377,7 @@ impl<'data> LazyRawReader<'data, AnyEncoding> for LazyRawAnyReader<'data> {
match &mut self.encoding {
Text_1_0(r) => Ok(r.next(allocator)?.into()),
Binary_1_0(r) => Ok(r.next()?.into()),
Text_1_1(r) => Ok(r.next(allocator)?.into()),
Binary_1_1(r) => Ok(r.next()?.into()),
}
}
Expand All @@ -361,9 +386,10 @@ impl<'data> LazyRawReader<'data, AnyEncoding> for LazyRawAnyReader<'data> {
fn save_state(&self) -> <AnyEncoding as Decoder>::ReaderSavedState {
use RawReaderKind::*;
match &self.encoding {
Text_1_0(_) => RawReaderType::Text_1_0,
Binary_1_0(_) => RawReaderType::Binary_1_0,
Binary_1_1(_) => RawReaderType::Binary_1_1,
Text_1_0(_) => IonEncoding::Text_1_0,
Binary_1_0(_) => IonEncoding::Binary_1_0,
Text_1_1(_) => IonEncoding::Text_1_1,
Binary_1_1(_) => IonEncoding::Binary_1_1,
}
}

Expand All @@ -372,6 +398,7 @@ impl<'data> LazyRawReader<'data, AnyEncoding> for LazyRawAnyReader<'data> {
match &self.encoding {
Text_1_0(r) => r.position(),
Binary_1_0(r) => r.position(),
Text_1_1(r) => r.position(),
Binary_1_1(r) => r.position(),
}
}
Expand Down Expand Up @@ -724,6 +751,24 @@ impl<'top> LazyRawValue<'top, AnyEncoding> for LazyRawAnyValue<'top> {
Binary_1_1(v) => Ok(v.read()?.into()),
}
}

fn annotations_span(&self) -> Span<'top> {
match &self.encoding {
LazyRawValueKind::Text_1_0(v) => v.annotations_span(),
LazyRawValueKind::Binary_1_0(v) => v.annotations_span(),
LazyRawValueKind::Text_1_1(v) => v.annotations_span(),
LazyRawValueKind::Binary_1_1(v) => v.annotations_span(),
}
}

fn value_span(&self) -> Span<'top> {
match &self.encoding {
LazyRawValueKind::Text_1_0(v) => v.value_span(),
LazyRawValueKind::Binary_1_0(v) => v.value_span(),
LazyRawValueKind::Text_1_1(v) => v.value_span(),
LazyRawValueKind::Binary_1_1(v) => v.value_span(),
}
}
}

// ===== Annotations =====
Expand Down Expand Up @@ -763,10 +808,10 @@ impl<'top> LazyRawAnyList<'top> {
pub fn as_value(&self) -> LazyRawAnyValue<'top> {
use LazyRawListKind::*;
match self.encoding {
Text_1_0(_) => todo!(),
Binary_1_0(s) => s.as_value().into(),
Text_1_1(_) => todo!(),
Binary_1_1(_) => todo!(),
Text_1_0(l) => l.as_value().into(),
Binary_1_0(l) => l.as_value().into(),
Text_1_1(l) => l.as_value().into(),
Binary_1_1(l) => l.as_value().into(),
}
}
}
Expand Down
16 changes: 16 additions & 0 deletions src/lazy/binary/raw/v1_1/value.rs
Original file line number Diff line number Diff line change
Expand Up @@ -117,6 +117,22 @@ impl<'top> LazyRawValue<'top, BinaryEncoding_1_1> for LazyRawBinaryValue_1_1<'to
fn read(&self) -> IonResult<RawValueRef<'top, BinaryEncoding_1_1>> {
self.read()
}

/// Returns the span of input bytes holding this value's annotations.
///
/// When the value has no annotations, the result is an empty span whose offset is the
/// value's header offset.
fn annotations_span(&self) -> Span<'top> {
    match self.encoded_value.annotations_range() {
        // No annotations: an empty slice positioned at the opcode.
        None => Span::with_offset(self.encoded_value.header_offset, &[]),
        Some(stream_range) => {
            // The range is expressed in stream offsets; subtracting the buffer's own
            // `offset()` yields indexes into the buffer's bytes.
            let buffer_start = self.input.offset();
            let local_start = stream_range.start - buffer_start;
            let local_end = stream_range.end - buffer_start;
            Span::with_offset(
                stream_range.start,
                &self.input.bytes()[local_start..local_end],
            )
        }
    }
}

/// Returns the span of input bytes holding this value without its annotations.
fn value_span(&self) -> Span<'top> {
    let stream_range = self.encoded_value.unannotated_value_range();
    // Convert stream offsets into indexes relative to the start of the buffer.
    let buffer_start = self.input.offset();
    let local_start = stream_range.start - buffer_start;
    let local_end = stream_range.end - buffer_start;
    Span::with_offset(
        stream_range.start,
        &self.input.bytes()[local_start..local_end],
    )
}
}

impl<'top> LazyRawBinaryValue_1_1<'top> {
Expand Down
15 changes: 15 additions & 0 deletions src/lazy/binary/raw/value.rs
Original file line number Diff line number Diff line change
Expand Up @@ -115,6 +115,21 @@ impl<'top> LazyRawValue<'top, BinaryEncoding_1_0> for LazyRawBinaryValue_1_0<'to
fn read(&self) -> IonResult<RawValueRef<'top, BinaryEncoding_1_0>> {
self.read()
}

/// Returns the span of input bytes holding this value's annotations.
///
/// When the value has no annotations, the result is an empty span whose offset is the
/// value's header offset.
fn annotations_span(&self) -> Span<'top> {
    if let Some(stream_range) = self.encoded_value.annotations_range() {
        // Translate stream offsets into indexes relative to the start of the buffer.
        let buffer_start = self.input.offset();
        let local_start = stream_range.start - buffer_start;
        let local_end = stream_range.end - buffer_start;
        Span::with_offset(
            stream_range.start,
            &self.input.bytes()[local_start..local_end],
        )
    } else {
        // No annotations: an empty slice positioned at the opcode.
        Span::with_offset(self.encoded_value.header_offset, &[])
    }
}

/// Returns the span of input bytes holding this value without its annotations.
fn value_span(&self) -> Span<'top> {
    let stream_range = self.encoded_value.unannotated_value_range();
    // Translate stream offsets into indexes relative to the start of the buffer.
    let buffer_start = self.input.offset();
    let local_start = stream_range.start - buffer_start;
    let local_end = stream_range.end - buffer_start;
    Span::with_offset(
        stream_range.start,
        &self.input.bytes()[local_start..local_end],
    )
}
}

#[derive(Copy, Clone)]
Expand Down
4 changes: 4 additions & 0 deletions src/lazy/decoder.rs
Original file line number Diff line number Diff line change
Expand Up @@ -376,6 +376,10 @@ pub trait LazyRawValue<'top, D: Decoder>:
fn is_null(&self) -> bool;
fn annotations(&self) -> D::AnnotationsIterator<'top>;
fn read(&self) -> IonResult<RawValueRef<'top, D>>;

/// Returns the span of input that holds this value's annotations, excluding the value itself.
fn annotations_span(&self) -> Span<'top>;

/// Returns the span of input that holds this value, excluding any annotations.
fn value_span(&self) -> Span<'top>;
}

pub trait LazyRawSequence<'top, D: Decoder>:
Expand Down
7 changes: 7 additions & 0 deletions src/lazy/encoder/binary/v1_0/value_writer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -180,6 +180,13 @@ impl<'value, 'top> BinaryValueWriter_1_0<'value, 'top> {
return Ok(());
}

// See if this value can be losslessly encoded in 4 bytes instead of 8
let float32 = value as f32;
if float32 as f64 == value {
// No data lost during cast; write it as an f32 instead.
return self.write_f32(float32);
}

self.push_byte(0x48);
self.push_bytes(&value.to_be_bytes());
Ok(())
Expand Down
59 changes: 42 additions & 17 deletions src/lazy/encoder/binary/v1_1/value_writer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ use crate::lazy::encoder::value_writer::{delegate_value_writer_to_self, Annotata
use crate::lazy::text::raw::v1_1::reader::MacroIdRef;
use crate::raw_symbol_ref::AsRawSymbolRef;
use crate::result::IonFailure;
use crate::types::float::{FloatRepr, SmallestFloatRepr};
use crate::{Decimal, Int, IonResult, IonType, RawSymbolRef, SymbolId, Timestamp};

/// The initial size of the bump-allocated buffer created to hold a container's child elements.
Expand Down Expand Up @@ -155,26 +156,34 @@ impl<'value, 'top> BinaryValueWriter_1_1<'value, 'top> {
// TODO: write_f16(...)

pub fn write_f32(mut self, value: f32) -> IonResult<()> {
if value == 0f32 && !value.is_sign_negative() {
self.push_byte(0x5A);
return Ok(());
match value.smallest_repr() {
FloatRepr::Zero => {
self.push_byte(0x5A);
}
FloatRepr::Single(f) => {
self.push_byte(0x5C);
self.push_bytes(&f.to_le_bytes());
}
FloatRepr::Double(_) => unreachable!("smallest repr for f32 cannot be f64"),
}
self.push_byte(0x5C);
// Float endianness is an open question.
// See: https://github.com/amazon-ion/ion-docs/issues/294
self.push_bytes(&value.to_le_bytes());
Ok(())
}

pub fn write_f64(mut self, value: f64) -> IonResult<()> {
if value == 0f64 && !value.is_sign_negative() {
self.push_byte(0x5A);
return Ok(());
match value.smallest_repr() {
FloatRepr::Zero => {
self.push_byte(0x5A);
}
FloatRepr::Single(f) => {
self.push_byte(0x5C);
self.push_bytes(&f.to_le_bytes());
}
FloatRepr::Double(f) => {
self.push_byte(0x5D);
self.push_bytes(&f.to_le_bytes());
}
}
self.push_byte(0x5D);
// Float endianness is an open question.
// See: https://github.com/amazon-ion/ion-docs/issues/294
self.push_bytes(&value.to_le_bytes());

Ok(())
}

Expand Down Expand Up @@ -823,17 +832,18 @@ impl<'value, 'top> BinaryAnnotatedValueWriter_1_1<'value, 'top> {

#[cfg(test)]
mod tests {

use crate::lazy::encoder::annotate::{Annotatable, Annotated};
use crate::lazy::encoder::annotation_seq::AnnotationSeq;
use crate::lazy::encoder::binary::v1_1::writer::LazyRawBinaryWriter_1_1;
use crate::lazy::encoder::value_writer::ValueWriter;
use crate::lazy::encoder::value_writer::{SequenceWriter, StructWriter};
use crate::lazy::encoder::write_as_ion::{WriteAsIon, WriteAsSExp};
use crate::raw_symbol_ref::AsRawSymbolRef;
use crate::types::float::{FloatRepr, SmallestFloatRepr};
use crate::{
Decimal, Element, Int, IonResult, IonType, Null, RawSymbolRef, SymbolId, Timestamp,
};
use num_traits::FloatConst;

fn encoding_test(
test: impl FnOnce(&mut LazyRawBinaryWriter_1_1<&mut Vec<u8>>) -> IonResult<()>,
Expand Down Expand Up @@ -971,10 +981,25 @@ mod tests {
f64::INFINITY,
f64::NEG_INFINITY,
f64::NAN,
f64::PI(),
f64::E(),
f64::EPSILON,
];
for value in test_f64s {
let mut expected_encoding = vec![0x5D];
expected_encoding.extend_from_slice(&value.to_le_bytes()[..]);
let mut expected_encoding = vec![];
match value.smallest_repr() {
FloatRepr::Zero => {
expected_encoding.push(0x5A);
}
FloatRepr::Single(f) => {
expected_encoding.push(0x5C);
expected_encoding.extend_from_slice(&f.to_le_bytes()[..]);
}
FloatRepr::Double(f) => {
expected_encoding.push(0x5D);
expected_encoding.extend_from_slice(&f.to_le_bytes()[..]);
}
}
encoding_test(
|writer: &mut LazyRawBinaryWriter_1_1<&mut Vec<u8>>| {
writer.write(value)?;
Expand Down
Loading

0 comments on commit 80088b4

Please sign in to comment.