Skip to content

Commit

Permalink
Add tests and fixes for Daft integration (apache#381)
Browse files Browse the repository at this point in the history
* Implement to_daft on Table instead of Scan

* Add integration tests

---------

Co-authored-by: Jay Chia <[email protected]@users.noreply.github.com>
  • Loading branch information
jaychia and Jay Chia authored Feb 7, 2024
1 parent a7794ca commit cec051f
Show file tree
Hide file tree
Showing 2 changed files with 32 additions and 10 deletions.
20 changes: 10 additions & 10 deletions pyiceberg/table/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -1025,6 +1025,16 @@ def __repr__(self) -> str:
result_str = f"{table_name}(\n {schema_str}\n),\n{partition_str},\n{sort_order_str},\n{snapshot_str}"
return result_str

def to_daft(self) -> daft.DataFrame:
"""Read a Daft DataFrame lazily from this Iceberg table.
Returns:
daft.DataFrame: Unmaterialized Daft Dataframe created from the Iceberg table
"""
import daft

return daft.read_iceberg(self)


class StaticTable(Table):
"""Load a table directly from a metadata file (i.e., without using a catalog)."""
Expand Down Expand Up @@ -1382,16 +1392,6 @@ def to_ray(self) -> ray.data.dataset.Dataset:

return ray.data.from_arrow(self.to_arrow())

def to_daft(self) -> daft.DataFrame:
"""Read a Daft DataFrame lazily from this Iceberg table.
Returns:
daft.DataFrame: Unmaterialized Daft Dataframe created from the Iceberg table
"""
import daft

return daft.read_iceberg(self)


class MoveOperation(Enum):
First = 1
Expand Down
22 changes: 22 additions & 0 deletions tests/integration/test_reads.py
Original file line number Diff line number Diff line change
Expand Up @@ -178,6 +178,28 @@ def test_pyarrow_limit(catalog: Catalog) -> None:
assert len(full_result) == 10


@pytest.mark.integration
@pytest.mark.filterwarnings("ignore")
@pytest.mark.parametrize('catalog', [pytest.lazy_fixture('catalog_hive'), pytest.lazy_fixture('catalog_rest')])
def test_daft_nan(catalog: Catalog) -> None:
table_test_null_nan_rewritten = catalog.load_table("default.test_null_nan_rewritten")
df = table_test_null_nan_rewritten.to_daft()
assert df.count_rows() == 3
assert math.isnan(df.to_pydict()["col_numeric"][0])


@pytest.mark.integration
@pytest.mark.parametrize('catalog', [pytest.lazy_fixture('catalog_hive'), pytest.lazy_fixture('catalog_rest')])
def test_daft_nan_rewritten(catalog: Catalog) -> None:
table_test_null_nan_rewritten = catalog.load_table("default.test_null_nan_rewritten")
df = table_test_null_nan_rewritten.to_daft()
df = df.where(df["col_numeric"].float.is_nan())
df = df.select("idx", "col_numeric")
assert df.count_rows() == 1
assert df.to_pydict()["idx"][0] == 1
assert math.isnan(df.to_pydict()["col_numeric"][0])


@pytest.mark.integration
@pytest.mark.filterwarnings("ignore")
@pytest.mark.parametrize('catalog', [pytest.lazy_fixture('catalog_hive'), pytest.lazy_fixture('catalog_rest')])
Expand Down

0 comments on commit cec051f

Please sign in to comment.