From afa6d90a3bd6a28e7c66d7b427eb7334fc15b1cf Mon Sep 17 00:00:00 2001 From: Ian Joiner <14581281+iajoiner@users.noreply.github.com> Date: Mon, 11 Nov 2024 01:04:04 -0500 Subject: [PATCH] feat: add `Table` and `table_utility.rs` --- crates/proof-of-sql/src/base/database/mod.rs | 8 + .../proof-of-sql/src/base/database/table.rs | 97 +++++ .../src/base/database/table_test.rs | 208 +++++++++++ .../src/base/database/table_utility.rs | 337 ++++++++++++++++++ 4 files changed, 650 insertions(+) create mode 100644 crates/proof-of-sql/src/base/database/table.rs create mode 100644 crates/proof-of-sql/src/base/database/table_test.rs create mode 100644 crates/proof-of-sql/src/base/database/table_utility.rs diff --git a/crates/proof-of-sql/src/base/database/mod.rs b/crates/proof-of-sql/src/base/database/mod.rs index 5516fb61b..38159a4ef 100644 --- a/crates/proof-of-sql/src/base/database/mod.rs +++ b/crates/proof-of-sql/src/base/database/mod.rs @@ -56,6 +56,14 @@ pub(crate) use owned_table::OwnedTableError; mod owned_table_test; pub mod owned_table_utility; +mod table; +pub use table::Table; +#[cfg(test)] +pub(crate) use table::TableError; +#[cfg(test)] +mod table_test; +pub mod table_utility; + /// TODO: add docs pub(crate) mod expression_evaluation; mod expression_evaluation_error; diff --git a/crates/proof-of-sql/src/base/database/table.rs b/crates/proof-of-sql/src/base/database/table.rs new file mode 100644 index 000000000..ecf7f67e3 --- /dev/null +++ b/crates/proof-of-sql/src/base/database/table.rs @@ -0,0 +1,97 @@ +use super::Column; +use crate::base::{map::IndexMap, scalar::Scalar}; +use proof_of_sql_parser::Identifier; +use snafu::Snafu; + +/// An error that occurs when working with tables. +#[derive(Snafu, Debug, PartialEq, Eq)] +pub enum TableError { + /// The columns have different lengths. + #[snafu(display("Columns have different lengths"))] + ColumnLengthMismatch, +} +/// A table of data, with schema included. 
This is simply a map from `Identifier` to `Column`, +/// where column order matters. +/// This is primarily used as an internal result that is used before +/// converting to the final result in either Arrow format or JSON. +/// This is the analog of an arrow [`RecordBatch`](arrow::record_batch::RecordBatch). +#[derive(Debug, Clone, Eq)] +pub struct Table<'a, S: Scalar> { + table: IndexMap>, +} +impl<'a, S: Scalar> Table<'a, S> { + /// Creates a new [`Table`], returning [`TableError::ColumnLengthMismatch`] if the columns do not all have the same length. + pub fn try_new(table: IndexMap>) -> Result { + if table.is_empty() { + return Ok(Self { table }); + } + let num_rows = table[0].len(); + if table.values().any(|column| column.len() != num_rows) { + Err(TableError::ColumnLengthMismatch) + } else { + Ok(Self { table }) + } + } + /// Creates a new [`Table`] from an iterator of `(Identifier, Column)` pairs, with the same length check as [`Table::try_new`]. + pub fn try_from_iter)>>( + iter: T, + ) -> Result { + Self::try_new(IndexMap::from_iter(iter)) + } + /// Number of columns in the table. + #[must_use] + pub fn num_columns(&self) -> usize { + self.table.len() + } + /// Number of rows in the table (`0` if the table has no columns). + #[must_use] + pub fn num_rows(&self) -> usize { + if self.table.is_empty() { + 0 + } else { + self.table[0].len() + } + } + /// Whether the table has no columns. + #[must_use] + pub fn is_empty(&self) -> bool { + self.table.is_empty() + } + /// Consumes the table and returns its columns as an `IndexMap` + #[must_use] + pub fn into_inner(self) -> IndexMap> { + self.table + } + /// Returns a reference to the columns of this table as an `IndexMap` + #[must_use] + pub fn inner_table(&self) -> &IndexMap> { + &self.table + } + /// Returns the column names of this table as an Iterator + pub fn column_names(&self) -> impl Iterator { + self.table.keys() + } +} + +// Note: we modify the default PartialEq for IndexMap to also check for column ordering. +// This is to align with the behaviour of a `RecordBatch`. 
+impl PartialEq for Table<'_, S> { + fn eq(&self, other: &Self) -> bool { + self.table == other.table + && self + .table + .keys() + .zip(other.table.keys()) + .all(|(a, b)| a == b) + } +} + +#[cfg(test)] +impl<'a, S: Scalar> core::ops::Index<&str> for Table<'a, S> { + type Output = Column<'a, S>; + fn index(&self, index: &str) -> &Self::Output { + self.table + .get(&index.parse::().unwrap()) + .unwrap() + } +} diff --git a/crates/proof-of-sql/src/base/database/table_test.rs b/crates/proof-of-sql/src/base/database/table_test.rs new file mode 100644 index 000000000..9f6055ef5 --- /dev/null +++ b/crates/proof-of-sql/src/base/database/table_test.rs @@ -0,0 +1,208 @@ +use crate::{ + base::{ + database::{table_utility::*, Column, Table, TableError}, + map::IndexMap, + scalar::test_scalar::TestScalar, + }, + proof_primitive::dory::DoryScalar, +}; +use bumpalo::Bump; +use proof_of_sql_parser::{ + posql_time::{PoSQLTimeUnit, PoSQLTimeZone}, + Identifier, +}; + +#[test] +fn we_can_create_a_table_with_no_columns() { + let table = Table::::try_new(IndexMap::default()).unwrap(); + assert_eq!(table.num_columns(), 0); +} +#[test] +fn we_can_create_an_empty_table() { + let alloc = Bump::new(); + let borrowed_table = table::([ + bigint("bigint", [0; 0], &alloc), + int128("decimal", [0; 0], &alloc), + varchar("varchar", ["0"; 0], &alloc), + scalar("scalar", [0; 0], &alloc), + boolean("boolean", [true; 0], &alloc), + ]); + let mut table = IndexMap::default(); + table.insert(Identifier::try_new("bigint").unwrap(), Column::BigInt(&[])); + table.insert(Identifier::try_new("decimal").unwrap(), Column::Int128(&[])); + table.insert( + Identifier::try_new("varchar").unwrap(), + Column::VarChar((&[], &[])), + ); + table.insert(Identifier::try_new("scalar").unwrap(), Column::Scalar(&[])); + table.insert( + Identifier::try_new("boolean").unwrap(), + Column::Boolean(&[]), + ); + assert_eq!(borrowed_table.into_inner(), table); +} + +#[test] +fn we_can_create_a_table_with_data() { + let alloc = 
Bump::new(); + + let borrowed_table = table::([ + bigint( + "bigint", + [0_i64, 1, 2, 3, 4, 5, 6, i64::MIN, i64::MAX], + &alloc, + ), + int128( + "decimal", + [0_i128, 1, 2, 3, 4, 5, 6, i128::MIN, i128::MAX], + &alloc, + ), + varchar( + "varchar", + ["0", "1", "2", "3", "4", "5", "6", "7", "8"], + &alloc, + ), + scalar("scalar", [0, 1, 2, 3, 4, 5, 6, 7, 8], &alloc), + boolean( + "boolean", + [true, false, true, false, true, false, true, false, true], + &alloc, + ), + timestamptz( + "time_stamp", + PoSQLTimeUnit::Second, + PoSQLTimeZone::Utc, + [0_i64, 1, 2, 3, 4, 5, 6, i64::MIN, i64::MAX], + &alloc, + ), + ]); + + let mut expected_table = IndexMap::default(); + + let time_stamp_data = alloc.alloc_slice_copy(&[0_i64, 1, 2, 3, 4, 5, 6, i64::MIN, i64::MAX]); + expected_table.insert( + Identifier::try_new("time_stamp").unwrap(), + Column::TimestampTZ(PoSQLTimeUnit::Second, PoSQLTimeZone::Utc, time_stamp_data), + ); + + let bigint_data = alloc.alloc_slice_copy(&[0_i64, 1, 2, 3, 4, 5, 6, i64::MIN, i64::MAX]); + expected_table.insert( + Identifier::try_new("bigint").unwrap(), + Column::BigInt(bigint_data), + ); + + let decimal_data = alloc.alloc_slice_copy(&[0_i128, 1, 2, 3, 4, 5, 6, i128::MIN, i128::MAX]); + expected_table.insert( + Identifier::try_new("decimal").unwrap(), + Column::Int128(decimal_data), + ); + + let varchar_data: Vec<&str> = ["0", "1", "2", "3", "4", "5", "6", "7", "8"] + .iter() + .map(|&s| alloc.alloc_str(s) as &str) + .collect(); + let varchar_str_slice = alloc.alloc_slice_clone(&varchar_data); + let varchar_scalars: Vec = varchar_data.iter().map(Into::into).collect(); + let varchar_scalars_slice = alloc.alloc_slice_clone(&varchar_scalars); + expected_table.insert( + Identifier::try_new("varchar").unwrap(), + Column::VarChar((varchar_str_slice, varchar_scalars_slice)), + ); + + let scalar_data: Vec = (0..=8).map(DoryScalar::from).collect(); + let scalar_slice = alloc.alloc_slice_copy(&scalar_data); + expected_table.insert( + 
Identifier::try_new("scalar").unwrap(), + Column::Scalar(scalar_slice), + ); + + let boolean_data = + alloc.alloc_slice_copy(&[true, false, true, false, true, false, true, false, true]); + expected_table.insert( + Identifier::try_new("boolean").unwrap(), + Column::Boolean(boolean_data), + ); + + assert_eq!(borrowed_table.into_inner(), expected_table); +} + +#[test] +fn we_get_inequality_between_tables_with_differing_column_order() { + let alloc = Bump::new(); + + let table_a: Table<'_, TestScalar> = table([ + bigint("a", [0; 0], &alloc), + int128("b", [0; 0], &alloc), + varchar("c", ["0"; 0], &alloc), + boolean("d", [false; 0], &alloc), + timestamptz( + "time_stamp", + PoSQLTimeUnit::Second, + PoSQLTimeZone::Utc, + [0_i64; 0], + &alloc, + ), + ]); + + let table_b: Table<'_, TestScalar> = table([ + boolean("d", [false; 0], &alloc), + int128("b", [0; 0], &alloc), + bigint("a", [0; 0], &alloc), + varchar("c", ["0"; 0], &alloc), + timestamptz( + "time_stamp", + PoSQLTimeUnit::Second, + PoSQLTimeZone::Utc, + [0_i64; 0], + &alloc, + ), + ]); + + assert_ne!(table_a, table_b); +} + +#[test] +fn we_get_inequality_between_tables_with_differing_data() { + let alloc = Bump::new(); + + let table_a: Table<'_, DoryScalar> = table([ + bigint("a", [0], &alloc), + int128("b", [0], &alloc), + varchar("c", ["0"], &alloc), + boolean("d", [true], &alloc), + timestamptz( + "time_stamp", + PoSQLTimeUnit::Second, + PoSQLTimeZone::Utc, + [1_625_072_400], + &alloc, + ), + ]); + + let table_b: Table<'_, DoryScalar> = table([ + bigint("a", [1], &alloc), + int128("b", [0], &alloc), + varchar("c", ["0"], &alloc), + boolean("d", [true], &alloc), + timestamptz( + "time_stamp", + PoSQLTimeUnit::Second, + PoSQLTimeZone::Utc, + [1_625_076_000], + &alloc, + ), + ]); + + assert_ne!(table_a, table_b); +} + +#[test] +fn we_cannot_create_a_table_with_differing_column_lengths() { + assert!(matches!( + Table::::try_from_iter([ + ("a".parse().unwrap(), Column::BigInt(&[0])), + ("b".parse().unwrap(), 
Column::BigInt(&[])), + ]), + Err(TableError::ColumnLengthMismatch) + )); +} diff --git a/crates/proof-of-sql/src/base/database/table_utility.rs b/crates/proof-of-sql/src/base/database/table_utility.rs new file mode 100644 index 000000000..28669119c --- /dev/null +++ b/crates/proof-of-sql/src/base/database/table_utility.rs @@ -0,0 +1,337 @@ +//! Utility functions for creating [`Table`]s and [`Column`]s. +//! These functions are primarily intended for use in tests. +//! +//! # Example +//! ``` +//! use bumpalo::Bump; +//! use proof_of_sql::base::{database::table_utility::*, scalar::Curve25519Scalar}; +//! let alloc = Bump::new(); +//! let result = table::([ +//! bigint("a", [1, 2, 3], &alloc), +//! boolean("b", [true, false, true], &alloc), +//! int128("c", [1, 2, 3], &alloc), +//! scalar("d", [1, 2, 3], &alloc), +//! varchar("e", ["a", "b", "c"], &alloc), +//! decimal75("f", 12, 1, [1, 2, 3], &alloc), +//! ]); +//! ``` +use super::{Column, Table}; +use crate::base::scalar::Scalar; +use alloc::{string::String, vec::Vec}; +use bumpalo::Bump; +use core::ops::Deref; +use proof_of_sql_parser::{ + posql_time::{PoSQLTimeUnit, PoSQLTimeZone}, + Identifier, +}; + +/// Creates a [`Table`] from a list of `(Identifier, Column)` pairs. +/// This is a convenience wrapper around [`Table::try_from_iter`] primarily for use in tests and +/// intended to be used along with the other methods in this module (e.g. [bigint], [boolean], etc). +/// The function will panic if [`Table::try_from_iter`] returns an error. See [`Table::try_from_iter`] for more details. 
+/// +/// # Example +/// ``` +/// use bumpalo::Bump; +/// use proof_of_sql::base::{database::table_utility::*, scalar::Curve25519Scalar}; +/// let alloc = Bump::new(); +/// let result = table::([ +/// bigint("a", [1, 2, 3], &alloc), +/// boolean("b", [true, false, true], &alloc), +/// int128("c", [1, 2, 3], &alloc), +/// scalar("d", [1, 2, 3], &alloc), +/// varchar("e", ["a", "b", "c"], &alloc), +/// decimal75("f", 12, 1, [1, 2, 3], &alloc), +/// ]); +/// ``` +/// +/// # Panics +/// - Panics if converting the iterator into a `Table<'a, S>` fails. +pub fn table<'a, S: Scalar>( + iter: impl IntoIterator)>, +) -> Table<'a, S> { + Table::try_from_iter(iter).unwrap() +} + +/// Creates a `(Identifier, Column)` pair for a tinyint column. +/// This is primarily intended for use in conjunction with [`table`]. +/// # Example +/// ``` +/// use bumpalo::Bump; +/// use proof_of_sql::base::{database::table_utility::*, scalar::Curve25519Scalar}; +/// let alloc = Bump::new(); +/// let result = table::([ +/// tinyint("a", [1_i8, 2, 3], &alloc), +/// ]); +/// ``` +/// # Panics +/// - Panics if `name.parse()` fails to convert the name into an `Identifier`. +pub fn tinyint<'a, S: Scalar>( + name: impl Deref, + data: impl IntoIterator>, + alloc: &'a Bump, +) -> (Identifier, Column<'a, S>) { + let transformed_data: Vec = data.into_iter().map(Into::into).collect(); + let alloc_data = alloc.alloc_slice_copy(&transformed_data); + (name.parse().unwrap(), Column::TinyInt(alloc_data)) +} + +/// Creates a `(Identifier, Column)` pair for a smallint column. +/// This is primarily intended for use in conjunction with [`table`]. +/// +/// # Example +/// ```rust +/// use bumpalo::Bump; +/// use proof_of_sql::base::{database::table_utility::*, scalar::Curve25519Scalar}; +/// let alloc = Bump::new(); +/// let result = table::([ +/// smallint("a", [1_i16, 2, 3], &alloc), +/// ]); +/// ``` +/// +/// # Panics +/// - Panics if `name.parse()` fails to convert the name into an `Identifier`. 
+pub fn smallint<'a, S: Scalar>( + name: impl Deref, + data: impl IntoIterator>, + alloc: &'a Bump, +) -> (Identifier, Column<'a, S>) { + let transformed_data: Vec = data.into_iter().map(Into::into).collect(); + let alloc_data = alloc.alloc_slice_copy(&transformed_data); + (name.parse().unwrap(), Column::SmallInt(alloc_data)) +} + +/// Creates a `(Identifier, Column)` pair for an int column. +/// This is primarily intended for use in conjunction with [`table`]. +/// +/// # Example +/// ```rust +/// use bumpalo::Bump; +/// use proof_of_sql::base::{database::table_utility::*, scalar::Curve25519Scalar}; +/// let alloc = Bump::new(); +/// let result = table::([ +/// int("a", [1, 2, 3], &alloc), +/// ]); +/// ``` +/// +/// # Panics +/// - Panics if `name.parse()` fails to convert the name into an `Identifier`. +pub fn int<'a, S: Scalar>( + name: impl Deref, + data: impl IntoIterator>, + alloc: &'a Bump, +) -> (Identifier, Column<'a, S>) { + let transformed_data: Vec = data.into_iter().map(Into::into).collect(); + let alloc_data = alloc.alloc_slice_copy(&transformed_data); + (name.parse().unwrap(), Column::Int(alloc_data)) +} + +/// Creates a `(Identifier, Column)` pair for a bigint column. +/// This is primarily intended for use in conjunction with [`table`]. +/// +/// # Example +/// ```rust +/// use bumpalo::Bump; +/// use proof_of_sql::base::{database::table_utility::*, scalar::Curve25519Scalar}; +/// let alloc = Bump::new(); +/// let result = table::([ +/// bigint("a", [1, 2, 3], &alloc), +/// ]); +/// ``` +/// +/// # Panics +/// - Panics if `name.parse()` fails to convert the name into an `Identifier`. 
+#[allow(clippy::missing_panics_doc)] +pub fn bigint<'a, S: Scalar>( + name: impl Deref, + data: impl IntoIterator>, + alloc: &'a Bump, +) -> (Identifier, Column<'a, S>) { + let transformed_data: Vec = data.into_iter().map(Into::into).collect(); + let alloc_data = alloc.alloc_slice_copy(&transformed_data); + (name.parse().unwrap(), Column::BigInt(alloc_data)) +} + +/// Creates a `(Identifier, Column)` pair for a boolean column. +/// This is primarily intended for use in conjunction with [`table`]. +/// +/// # Example +/// ``` +/// use bumpalo::Bump; +/// use proof_of_sql::base::{database::table_utility::*, scalar::Curve25519Scalar}; +/// let alloc = Bump::new(); +/// let result = table::([ +/// boolean("a", [true, false, true], &alloc), +/// ]); +/// ``` +/// +/// # Panics +/// - Panics if `name.parse()` fails to convert the name into an `Identifier`. +pub fn boolean<'a, S: Scalar>( + name: impl Deref, + data: impl IntoIterator>, + alloc: &'a Bump, +) -> (Identifier, Column<'a, S>) { + let transformed_data: Vec = data.into_iter().map(Into::into).collect(); + let alloc_data = alloc.alloc_slice_copy(&transformed_data); + (name.parse().unwrap(), Column::Boolean(alloc_data)) +} + +/// Creates a `(Identifier, Column)` pair for an int128 column. +/// This is primarily intended for use in conjunction with [`table`]. +/// +/// # Example +/// ``` +/// use bumpalo::Bump; +/// use proof_of_sql::base::{database::table_utility::*, scalar::Curve25519Scalar}; +/// let alloc = Bump::new(); +/// let result = table::([ +/// int128("a", [1, 2, 3], &alloc), +/// ]); +/// ``` +/// +/// # Panics +/// - Panics if `name.parse()` fails to convert the name into an `Identifier`. 
+pub fn int128<'a, S: Scalar>( + name: impl Deref, + data: impl IntoIterator>, + alloc: &'a Bump, +) -> (Identifier, Column<'a, S>) { + let transformed_data: Vec = data.into_iter().map(Into::into).collect(); + let alloc_data = alloc.alloc_slice_copy(&transformed_data); + (name.parse().unwrap(), Column::Int128(alloc_data)) +} + +/// Creates a `(Identifier, Column)` pair for a scalar column. +/// This is primarily intended for use in conjunction with [`table`]. +/// +/// # Example +/// ``` +/// use bumpalo::Bump; +/// use proof_of_sql::base::{database::table_utility::*, scalar::Curve25519Scalar}; +/// let alloc = Bump::new(); +/// let result = table::([ +/// scalar("a", [1, 2, 3], &alloc), +/// ]); +/// ``` +/// +/// # Panics +/// - Panics if `name.parse()` fails to convert the name into an `Identifier`. +pub fn scalar<'a, S: Scalar>( + name: impl Deref, + data: impl IntoIterator>, + alloc: &'a Bump, +) -> (Identifier, Column<'a, S>) { + let transformed_data: Vec = data.into_iter().map(Into::into).collect(); + let alloc_data = alloc.alloc_slice_copy(&transformed_data); + (name.parse().unwrap(), Column::Scalar(alloc_data)) +} + +/// Creates a `(Identifier, Column)` pair for a varchar column. +/// This is primarily intended for use in conjunction with [`table`]. +/// # Example +/// ``` +/// use bumpalo::Bump; +/// use proof_of_sql::base::{database::table_utility::*, scalar::Curve25519Scalar}; +/// let alloc = Bump::new(); +/// let result = table::([ +/// varchar("a", ["a", "b", "c"], &alloc), +/// ]); +/// ``` +/// +/// # Panics +/// - Panics if `name.parse()` fails to convert the name into an `Identifier`. 
+pub fn varchar<'a, S: Scalar>( + name: impl Deref, + data: impl IntoIterator>, + alloc: &'a Bump, +) -> (Identifier, Column<'a, S>) { + let strings: Vec<&'a str> = data + .into_iter() + .map(|item| { + let string = item.into(); + alloc.alloc_str(&string) as &'a str + }) + .collect(); + let alloc_strings = alloc.alloc_slice_clone(&strings); + let scalars: Vec = strings.into_iter().map(Into::into).collect(); + let alloc_scalars = alloc.alloc_slice_copy(&scalars); + ( + name.parse().unwrap(), + Column::VarChar((alloc_strings, alloc_scalars)), + ) +} + +/// Creates a `(Identifier, Column)` pair for a decimal75 column. +/// This is primarily intended for use in conjunction with [`table`]. +/// # Example +/// ``` +/// use bumpalo::Bump; +/// use proof_of_sql::base::{database::table_utility::*, scalar::Curve25519Scalar}; +/// let alloc = Bump::new(); +/// let result = table::([ +/// decimal75("a", 12, 1, [1, 2, 3], &alloc), +/// ]); +/// ``` +/// +/// # Panics +/// - Panics if `name.parse()` fails to convert the name into an `Identifier`. +/// - Panics if creating the `Precision` from the specified precision value fails. +pub fn decimal75<'a, S: Scalar>( + name: impl Deref, + precision: u8, + scale: i8, + data: impl IntoIterator>, + alloc: &'a Bump, +) -> (Identifier, Column<'a, S>) { + let transformed_data: Vec = data.into_iter().map(Into::into).collect(); + let alloc_data = alloc.alloc_slice_copy(&transformed_data); + ( + name.parse().unwrap(), + Column::Decimal75( + crate::base::math::decimal::Precision::new(precision).unwrap(), + scale, + alloc_data, + ), + ) +} + +/// Creates a `(Identifier, Column)` pair for a timestamp column. +/// This is primarily intended for use in conjunction with [`table`]. +/// +/// # Parameters +/// - `name`: The name of the column. +/// - `time_unit`: The time unit of the timestamps. +/// - `timezone`: The timezone for the timestamps. 
+/// - `data`: The data for the column, provided as an iterator over `i64` values representing time since the unix epoch. +/// - `alloc`: The bump allocator to use for allocating the column data. +/// +/// # Example +/// ``` +/// use bumpalo::Bump; +/// use proof_of_sql::base::{database::table_utility::*, +/// scalar::Curve25519Scalar, +/// }; +/// use proof_of_sql_parser::{ +/// posql_time::{PoSQLTimeZone, PoSQLTimeUnit}}; +/// +/// let alloc = Bump::new(); +/// let result = table::([ +/// timestamptz("event_time", PoSQLTimeUnit::Second, PoSQLTimeZone::Utc, vec![1625072400, 1625076000, 1625079600], &alloc), +/// ]); +/// ``` +/// +/// # Panics +/// - Panics if `name.parse()` fails to convert the name into an `Identifier`. +pub fn timestamptz<'a, S: Scalar>( + name: impl Deref, + time_unit: PoSQLTimeUnit, + timezone: PoSQLTimeZone, + data: impl IntoIterator, + alloc: &'a Bump, +) -> (Identifier, Column<'a, S>) { + let vec_data: Vec = data.into_iter().collect(); + let alloc_data = alloc.alloc_slice_copy(&vec_data); + ( + name.parse().unwrap(), + Column::TimestampTZ(time_unit, timezone, alloc_data), + ) +}