Skip to content

Commit

Permalink
feat: add Table and table_utility.rs
Browse files Browse the repository at this point in the history
  • Loading branch information
iajoiner committed Nov 11, 2024
1 parent fdb4c9e commit afa6d90
Show file tree
Hide file tree
Showing 4 changed files with 650 additions and 0 deletions.
8 changes: 8 additions & 0 deletions crates/proof-of-sql/src/base/database/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,14 @@ pub(crate) use owned_table::OwnedTableError;
mod owned_table_test;
pub mod owned_table_utility;

mod table;
pub use table::Table;
#[cfg(test)]
pub(crate) use table::TableError;
#[cfg(test)]
mod table_test;
pub mod table_utility;

/// TODO: add docs
pub(crate) mod expression_evaluation;
mod expression_evaluation_error;
Expand Down
97 changes: 97 additions & 0 deletions crates/proof-of-sql/src/base/database/table.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
use super::Column;
use crate::base::{map::IndexMap, scalar::Scalar};
use proof_of_sql_parser::Identifier;
use snafu::Snafu;

/// An error that occurs when working with tables.
///
/// Returned by [`Table::try_new`] and [`Table::try_from_iter`] when the
/// provided columns cannot form a well-formed table.
#[derive(Snafu, Debug, PartialEq, Eq)]
pub enum TableError {
    /// The columns have different lengths.
    #[snafu(display("Columns have different lengths"))]
    ColumnLengthMismatch,
}
/// A table of data, with schema included. This is simply a map from `Identifier` to `Column`,
/// where column order matters.
/// This is primarily used as an internal result that is used before
/// converting to the final result in either Arrow format or JSON.
/// This is the analog of an arrow [`RecordBatch`](arrow::record_batch::RecordBatch).
// `Eq` is derived, but `PartialEq` is implemented manually below so that
// equality also requires matching column order.
#[derive(Debug, Clone, Eq)]
pub struct Table<'a, S: Scalar> {
    // Ordered map from column name to column data. The constructors
    // guarantee that every column has the same length.
    table: IndexMap<Identifier, Column<'a, S>>,
}
impl<'a, S: Scalar> Table<'a, S> {
    /// Creates a new [`Table`] from a map of column names to columns.
    ///
    /// An empty map is accepted and yields a table with no columns.
    ///
    /// # Errors
    /// Returns [`TableError::ColumnLengthMismatch`] if the columns do not
    /// all share the same length.
    pub fn try_new(table: IndexMap<Identifier, Column<'a, S>>) -> Result<Self, TableError> {
        let mut lengths = table.values().map(|column| column.len());
        match lengths.next() {
            // At least two distinct lengths: the map cannot form a table.
            Some(first_len) if lengths.any(|len| len != first_len) => {
                Err(TableError::ColumnLengthMismatch)
            }
            // Either no columns at all, or every column matches the first.
            _ => Ok(Self { table }),
        }
    }
    /// Creates a new [`Table`] from an iterator of `(name, column)` pairs.
    ///
    /// # Errors
    /// Returns [`TableError::ColumnLengthMismatch`] if the columns do not
    /// all share the same length.
    pub fn try_from_iter<T: IntoIterator<Item = (Identifier, Column<'a, S>)>>(
        iter: T,
    ) -> Result<Self, TableError> {
        Self::try_new(iter.into_iter().collect())
    }
    /// Number of columns in the table.
    #[must_use]
    pub fn num_columns(&self) -> usize {
        self.table.len()
    }
    /// Number of rows in the table, or zero when the table has no columns.
    #[must_use]
    pub fn num_rows(&self) -> usize {
        // All columns share the same length, so the first one is authoritative.
        self.table.values().next().map_or(0, |column| column.len())
    }
    /// Whether the table has no columns.
    #[must_use]
    pub fn is_empty(&self) -> bool {
        self.table.is_empty()
    }
    /// Consumes the table and returns the underlying `IndexMap` of columns.
    #[must_use]
    pub fn into_inner(self) -> IndexMap<Identifier, Column<'a, S>> {
        self.table
    }
    /// Returns a reference to the underlying `IndexMap` of columns.
    #[must_use]
    pub fn inner_table(&self) -> &IndexMap<Identifier, Column<'a, S>> {
        &self.table
    }
    /// Returns an iterator over the column names, in column order.
    pub fn column_names(&self) -> impl Iterator<Item = &Identifier> {
        self.table.keys()
    }
}

// Note: we modify the default PartialEq for IndexMap to also check for column ordering.
// This is to align with the behaviour of a `RecordBatch`.
impl<S: Scalar> PartialEq for Table<'_, S> {
    fn eq(&self, other: &Self) -> bool {
        // Map equality (same key set, equal columns) guarantees equal key
        // counts, so comparing the key iterators afterwards is purely an
        // ordering check.
        self.table == other.table && self.table.keys().eq(other.table.keys())
    }
}

// Test-only convenience so tests can write `table["col"]`.
// Panics if `index` is not a valid identifier or names no column.
#[cfg(test)]
impl<'a, S: Scalar> core::ops::Index<&str> for Table<'a, S> {
    type Output = Column<'a, S>;
    fn index(&self, index: &str) -> &Self::Output {
        let name = index.parse::<Identifier>().unwrap();
        self.table.get(&name).unwrap()
    }
}
208 changes: 208 additions & 0 deletions crates/proof-of-sql/src/base/database/table_test.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,208 @@
use crate::{
base::{
database::{table_utility::*, Column, Table, TableError},
map::IndexMap,
scalar::test_scalar::TestScalar,
},
proof_primitive::dory::DoryScalar,
};
use bumpalo::Bump;
use proof_of_sql_parser::{
posql_time::{PoSQLTimeUnit, PoSQLTimeZone},
Identifier,
};

#[test]
fn we_can_create_a_table_with_no_columns() {
    // A column-free table is valid; it must also report zero rows and be
    // empty, not just have zero columns.
    let table = Table::<TestScalar>::try_new(IndexMap::default()).unwrap();
    assert_eq!(table.num_columns(), 0);
    assert_eq!(table.num_rows(), 0);
    assert!(table.is_empty());
}
#[test]
fn we_can_create_an_empty_table() {
    let alloc = Bump::new();
    // Build a table whose columns all have zero rows via the utility helpers.
    let borrowed_table = table::<DoryScalar>([
        bigint("bigint", [0; 0], &alloc),
        int128("decimal", [0; 0], &alloc),
        varchar("varchar", ["0"; 0], &alloc),
        scalar("scalar", [0; 0], &alloc),
        boolean("boolean", [true; 0], &alloc),
    ]);
    // Reconstruct the expected map by hand: same names, empty columns.
    let mut expected = IndexMap::default();
    for (name, column) in [
        ("bigint", Column::BigInt(&[])),
        ("decimal", Column::Int128(&[])),
        ("varchar", Column::VarChar((&[], &[]))),
        ("scalar", Column::Scalar(&[])),
        ("boolean", Column::Boolean(&[])),
    ] {
        expected.insert(Identifier::try_new(name).unwrap(), column);
    }
    assert_eq!(borrowed_table.into_inner(), expected);
}

#[test]
fn we_can_create_a_table_with_data() {
    let alloc = Bump::new();

    // Build a table through the `table_utility` helpers, covering every
    // column variant the helpers expose, including extreme values
    // (i64::MIN/MAX, i128::MIN/MAX).
    let borrowed_table = table::<DoryScalar>([
        bigint(
            "bigint",
            [0_i64, 1, 2, 3, 4, 5, 6, i64::MIN, i64::MAX],
            &alloc,
        ),
        int128(
            "decimal",
            [0_i128, 1, 2, 3, 4, 5, 6, i128::MIN, i128::MAX],
            &alloc,
        ),
        varchar(
            "varchar",
            ["0", "1", "2", "3", "4", "5", "6", "7", "8"],
            &alloc,
        ),
        scalar("scalar", [0, 1, 2, 3, 4, 5, 6, 7, 8], &alloc),
        boolean(
            "boolean",
            [true, false, true, false, true, false, true, false, true],
            &alloc,
        ),
        timestamptz(
            "time_stamp",
            PoSQLTimeUnit::Second,
            PoSQLTimeZone::Utc,
            [0_i64, 1, 2, 3, 4, 5, 6, i64::MIN, i64::MAX],
            &alloc,
        ),
    ]);

    // Reconstruct the same columns by hand. Note that the insertion order
    // below (`time_stamp` first) differs from the builder order above; the
    // final `assert_eq!` compares plain `IndexMap`s, whose default equality
    // does not depend on entry order (unlike `Table`'s own `PartialEq`).
    let mut expected_table = IndexMap::default();

    let time_stamp_data = alloc.alloc_slice_copy(&[0_i64, 1, 2, 3, 4, 5, 6, i64::MIN, i64::MAX]);
    expected_table.insert(
        Identifier::try_new("time_stamp").unwrap(),
        Column::TimestampTZ(PoSQLTimeUnit::Second, PoSQLTimeZone::Utc, time_stamp_data),
    );

    let bigint_data = alloc.alloc_slice_copy(&[0_i64, 1, 2, 3, 4, 5, 6, i64::MIN, i64::MAX]);
    expected_table.insert(
        Identifier::try_new("bigint").unwrap(),
        Column::BigInt(bigint_data),
    );

    let decimal_data = alloc.alloc_slice_copy(&[0_i128, 1, 2, 3, 4, 5, 6, i128::MIN, i128::MAX]);
    expected_table.insert(
        Identifier::try_new("decimal").unwrap(),
        Column::Int128(decimal_data),
    );

    // A `VarChar` column carries a pair: the string slices plus their scalar
    // encodings, so both halves are materialized in the arena here.
    let varchar_data: Vec<&str> = ["0", "1", "2", "3", "4", "5", "6", "7", "8"]
        .iter()
        .map(|&s| alloc.alloc_str(s) as &str)
        .collect();
    let varchar_str_slice = alloc.alloc_slice_clone(&varchar_data);
    let varchar_scalars: Vec<DoryScalar> = varchar_data.iter().map(Into::into).collect();
    let varchar_scalars_slice = alloc.alloc_slice_clone(&varchar_scalars);
    expected_table.insert(
        Identifier::try_new("varchar").unwrap(),
        Column::VarChar((varchar_str_slice, varchar_scalars_slice)),
    );

    let scalar_data: Vec<DoryScalar> = (0..=8).map(DoryScalar::from).collect();
    let scalar_slice = alloc.alloc_slice_copy(&scalar_data);
    expected_table.insert(
        Identifier::try_new("scalar").unwrap(),
        Column::Scalar(scalar_slice),
    );

    let boolean_data =
        alloc.alloc_slice_copy(&[true, false, true, false, true, false, true, false, true]);
    expected_table.insert(
        Identifier::try_new("boolean").unwrap(),
        Column::Boolean(boolean_data),
    );

    assert_eq!(borrowed_table.into_inner(), expected_table);
}

#[test]
fn we_get_inequality_between_tables_with_differing_column_order() {
    let alloc = Bump::new();

    // Identical columns and (empty) data, but the first four columns appear
    // in a different order than in `original` below.
    let shuffled: Table<'_, TestScalar> = table([
        boolean("d", [false; 0], &alloc),
        int128("b", [0; 0], &alloc),
        bigint("a", [0; 0], &alloc),
        varchar("c", ["0"; 0], &alloc),
        timestamptz(
            "time_stamp",
            PoSQLTimeUnit::Second,
            PoSQLTimeZone::Utc,
            [0_i64; 0],
            &alloc,
        ),
    ]);

    let original: Table<'_, TestScalar> = table([
        bigint("a", [0; 0], &alloc),
        int128("b", [0; 0], &alloc),
        varchar("c", ["0"; 0], &alloc),
        boolean("d", [false; 0], &alloc),
        timestamptz(
            "time_stamp",
            PoSQLTimeUnit::Second,
            PoSQLTimeZone::Utc,
            [0_i64; 0],
            &alloc,
        ),
    ]);

    // `Table` equality is order-sensitive, so these must compare unequal.
    assert_ne!(original, shuffled);
}

#[test]
fn we_get_inequality_between_tables_with_differing_data() {
    let alloc = Bump::new();

    let base: Table<'_, DoryScalar> = table([
        bigint("a", [0], &alloc),
        int128("b", [0], &alloc),
        varchar("c", ["0"], &alloc),
        boolean("d", [true], &alloc),
        timestamptz(
            "time_stamp",
            PoSQLTimeUnit::Second,
            PoSQLTimeZone::Utc,
            [1_625_072_400],
            &alloc,
        ),
    ]);

    // Same schema and column order as `base`, but the values in columns
    // "a" and "time_stamp" differ.
    let altered: Table<'_, DoryScalar> = table([
        bigint("a", [1], &alloc),
        int128("b", [0], &alloc),
        varchar("c", ["0"], &alloc),
        boolean("d", [true], &alloc),
        timestamptz(
            "time_stamp",
            PoSQLTimeUnit::Second,
            PoSQLTimeZone::Utc,
            [1_625_076_000],
            &alloc,
        ),
    ]);

    assert_ne!(base, altered);
}

#[test]
fn we_cannot_create_a_table_with_differing_column_lengths() {
    // A one-row column and a zero-row column cannot coexist in one table.
    let result = Table::<TestScalar>::try_from_iter([
        ("a".parse().unwrap(), Column::BigInt(&[0])),
        ("b".parse().unwrap(), Column::BigInt(&[])),
    ]);
    assert_eq!(result, Err(TableError::ColumnLengthMismatch));
}
Loading

0 comments on commit afa6d90

Please sign in to comment.