From f0bcf0ffc0c3207ab535ad712dbb1e8af904e78d Mon Sep 17 00:00:00 2001 From: Chris Fonnesbeck Date: Thu, 15 Aug 2024 17:23:37 -0500 Subject: [PATCH 1/5] Added support for polars dataframes and series --- conda-envs/environment-dev.yml | 1 + conda-envs/environment-docs.yml | 1 + conda-envs/environment-jax.yml | 1 + conda-envs/windows-environment-dev.yml | 1 + conda-envs/windows-environment-test.yml | 1 + pymc/data.py | 5 +++-- pymc/pytensorf.py | 15 +++++++++++---- requirements-dev.txt | 1 + tests/test_data.py | 15 +++++++++++++++ tests/test_pytensorf.py | 14 ++++++++++++++ 10 files changed, 49 insertions(+), 6 deletions(-) diff --git a/conda-envs/environment-dev.yml b/conda-envs/environment-dev.yml index 85e6694a95..1caf4ad3f9 100644 --- a/conda-envs/environment-dev.yml +++ b/conda-envs/environment-dev.yml @@ -13,6 +13,7 @@ dependencies: - numpy>=1.15.0 - pandas>=0.24.0 - pip +- polars>=1.5.0 - pytensor>=2.25.1,<2.26 - python-graphviz - networkx diff --git a/conda-envs/environment-docs.yml b/conda-envs/environment-docs.yml index 86097c5ab3..c46131a136 100644 --- a/conda-envs/environment-docs.yml +++ b/conda-envs/environment-docs.yml @@ -11,6 +11,7 @@ dependencies: - numpy>=1.15.0 - pandas>=0.24.0 - pip +- polars>=1.5.0 - pytensor>=2.25.1,<2.26 - python-graphviz - rich>=13.7.1 diff --git a/conda-envs/environment-jax.yml b/conda-envs/environment-jax.yml index 97d25dd5b8..a61d396f7d 100644 --- a/conda-envs/environment-jax.yml +++ b/conda-envs/environment-jax.yml @@ -20,6 +20,7 @@ dependencies: - numpyro>=0.8.0 - pandas>=0.24.0 - pip +- polars>=1.5.0 - pytensor>=2.25.1,<2.26 - python-graphviz - networkx diff --git a/conda-envs/windows-environment-dev.yml b/conda-envs/windows-environment-dev.yml index 6d785e2cac..b4fd350077 100644 --- a/conda-envs/windows-environment-dev.yml +++ b/conda-envs/windows-environment-dev.yml @@ -13,6 +13,7 @@ dependencies: - numpy>=1.15.0 - pandas>=0.24.0 - pip +- polars>=1.5.0 - pytensor>=2.25.1,<2.26 - python-graphviz - networkx diff --git 
a/conda-envs/windows-environment-test.yml b/conda-envs/windows-environment-test.yml index fd17c31711..c451d60b2d 100644 --- a/conda-envs/windows-environment-test.yml +++ b/conda-envs/windows-environment-test.yml @@ -16,6 +16,7 @@ dependencies: - numpy>=1.15.0 - pandas>=0.24.0 - pip +- polars>=1.5.0 - pytensor>=2.25.1,<2.26 - python-graphviz - networkx diff --git a/pymc/data.py b/pymc/data.py index 7e306f19e3..98ae964d66 100644 --- a/pymc/data.py +++ b/pymc/data.py @@ -22,6 +22,7 @@ import numpy as np import pandas as pd +import polars as pl import pytensor import pytensor.tensor as pt import xarray as xr @@ -204,7 +205,7 @@ def Minibatch(variable: TensorVariable, *variables: TensorVariable, batch_size: def determine_coords( model, - value: pd.DataFrame | pd.Series | xr.DataArray, + value: pd.DataFrame | pd.Series | pl.DataFrame | pl.Series | xr.DataArray, dims: Sequence[str | None] | None = None, coords: dict[str, Sequence | np.ndarray] | None = None, ) -> tuple[dict[str, Sequence | np.ndarray], Sequence[str | None]]: @@ -348,7 +349,7 @@ def Data( ---------- name : str The name for this variable. - value : array_like or pandas.Series, pandas.Dataframe + value : array_like or pandas.Series, pandas.DataFrame, polars.DataFrame, polars.Series, xarray.DataArray A value to associate with this variable. 
dims : str or tuple of str, optional Dimension names of the random variables (as opposed to the shapes of these diff --git a/pymc/pytensorf.py b/pymc/pytensorf.py index cc7204c28a..90d71cc5a1 100644 --- a/pymc/pytensorf.py +++ b/pymc/pytensorf.py @@ -18,6 +18,7 @@ import numpy as np import pandas as pd +import polars as pl import pytensor import pytensor.tensor as pt import scipy.sparse as sps @@ -96,12 +97,16 @@ def convert_generator_data(data) -> TensorVariable: def convert_data(data) -> np.ndarray | Variable: ret: np.ndarray | Variable - if hasattr(data, "to_numpy") and hasattr(data, "isnull"): + if hasattr(data, "to_numpy"): # typically, but not limited to pandas objects vals = data.to_numpy() - null_data = data.isnull() + if hasattr(data, "is_null"): + # polars DataFrame or Series + null_data = data.is_null() + else: + null_data = data.isnull() if hasattr(null_data, "to_numpy"): - # pandas Series + # pandas or polars Series mask = null_data.to_numpy() else: # pandas Index @@ -144,7 +149,9 @@ def convert_data(data) -> np.ndarray | Variable: @_as_tensor_variable.register(pd.Series) @_as_tensor_variable.register(pd.DataFrame) -def dataframe_to_tensor_variable(df: pd.DataFrame, *args, **kwargs) -> TensorVariable: +@_as_tensor_variable.register(pl.DataFrame) +@_as_tensor_variable.register(pl.Series) +def dataframe_to_tensor_variable(df: pd.DataFrame | pl.DataFrame, *args, **kwargs) -> TensorVariable: return pt.as_tensor_variable(df.to_numpy(), *args, **kwargs) diff --git a/requirements-dev.txt b/requirements-dev.txt index 082eab73ce..1cc26419ae 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -15,6 +15,7 @@ numdifftools>=0.9.40 numpy>=1.15.0 numpydoc pandas>=0.24.0 +polars>=1.5.0 polyagamma pre-commit>=2.8.0 pytensor>=2.25.1,<2.26 diff --git a/tests/test_data.py b/tests/test_data.py index c8472359f1..2d3b3dd7fd 100644 --- a/tests/test_data.py +++ b/tests/test_data.py @@ -404,6 +404,21 @@ def test_implicit_coords_dataframe(self, seeded_test): assert 
"columns" in pmodel.coords assert pmodel.named_vars_to_dims == {"observations": ("rows", "columns")} + def test_implicit_coords_polars_dataframe(self, seeded_test): + pl = pytest.importorskip("polars") + N_rows = 5 + N_cols = 7 + df_data = pl.DataFrame({f"Column {c+1}": np.random.normal(size=(N_rows,)) for c in range(N_cols)}) + df_data = df_data.with_row_index("rows") + + # infer coordinates from index and columns of the DataFrame + with pm.Model() as pmodel: + pm.Data("observations", df_data, dims=("rows", "columns"), infer_dims_and_coords=True) + + assert "rows" in pmodel.coords + assert "columns" in pmodel.coords + assert pmodel.named_vars_to_dims == {"observations": ("rows", "columns")} + def test_implicit_coords_xarray(self): xr = pytest.importorskip("xarray") data = xr.DataArray([[1, 2, 3], [4, 5, 6]], dims=("y", "x")) diff --git a/tests/test_pytensorf.py b/tests/test_pytensorf.py index e8881451bf..3a13beccfb 100644 --- a/tests/test_pytensorf.py +++ b/tests/test_pytensorf.py @@ -16,6 +16,7 @@ import numpy.ma as ma import numpy.testing as npt import pandas as pd +import polars as pl import pytensor import pytensor.tensor as pt import pytest @@ -66,6 +67,19 @@ def test_pd_dataframe_as_tensor_variable(np_array: np.ndarray) -> None: df = pd.DataFrame(np_array) np.testing.assert_array_equal(x=pt.as_tensor_variable(x=df).eval(), y=np_array) +@pytest.mark.parametrize( + argnames="np_array", + argvalues=[ + np.array([[1.0], [2.0], [-1.0]]), + np.array([[1.0, 1.0, 1.0], [0.0, 0.0, 0.0]]), + np.ones(shape=(10, 1)), + ], +) +def test_polars_dataframe_as_tensor_variable(np_array: np.ndarray) -> None: + pl = pytest.importorskip("polars") + df = pl.DataFrame(np_array) + np.testing.assert_array_equal(x=pt.as_tensor_variable(x=df).eval(), y=np_array) + @pytest.mark.parametrize( argnames="np_array", From 4dc60528e344b4232e0a4f11d33d5e04f30ae0f0 Mon Sep 17 00:00:00 2001 From: Chris Fonnesbeck Date: Thu, 15 Aug 2024 20:53:58 -0500 Subject: [PATCH 2/5] Make polars optional --- 
pymc/data.py | 5 ++--- pymc/pytensorf.py | 46 +++++++++++++++++++++++++++++++--------------- 2 files changed, 33 insertions(+), 18 deletions(-) diff --git a/pymc/data.py b/pymc/data.py index 98ae964d66..13393c4924 100644 --- a/pymc/data.py +++ b/pymc/data.py @@ -21,8 +21,7 @@ from typing import cast import numpy as np -import pandas as pd -import polars as pl +from numpy.typing import ArrayLike import pytensor import pytensor.tensor as pt import xarray as xr @@ -205,7 +204,7 @@ def Minibatch(variable: TensorVariable, *variables: TensorVariable, batch_size: def determine_coords( model, - value: pd.DataFrame | pd.Series | pl.DataFrame | pl.Series | xr.DataArray, + value: ArrayLike, dims: Sequence[str | None] | None = None, coords: dict[str, Sequence | np.ndarray] | None = None, ) -> tuple[dict[str, Sequence | np.ndarray], Sequence[str | None]]: diff --git a/pymc/pytensorf.py b/pymc/pytensorf.py index 90d71cc5a1..69ed5a25c4 100644 --- a/pymc/pytensorf.py +++ b/pymc/pytensorf.py @@ -18,7 +18,10 @@ import numpy as np import pandas as pd -import polars as pl +try: + import polars as pl +except ImportError: + pl = None import pytensor import pytensor.tensor as pt import scipy.sparse as sps @@ -97,16 +100,12 @@ def convert_generator_data(data) -> TensorVariable: def convert_data(data) -> np.ndarray | Variable: ret: np.ndarray | Variable - if hasattr(data, "to_numpy"): + if hasattr(data, "to_numpy") and hasattr(data, "isnull"): # typically, but not limited to pandas objects vals = data.to_numpy() - if hasattr(data, "is_null"): - # polars DataFrame or Series - null_data = data.is_null() - else: - null_data = data.isnull() + null_data = data.isnull() if hasattr(null_data, "to_numpy"): - # pandas or polars Series + # pandas Series mask = null_data.to_numpy() else: # pandas Index @@ -116,6 +115,18 @@ def convert_data(data) -> np.ndarray | Variable: ret = np.ma.MaskedArray(vals, mask) else: ret = vals + elif hasattr(data, "to_numpy") and hasattr(data, "null_count"): + vals =
data.to_numpy() + try: + null_data = data.is_null() + except AttributeError: + null_data = data.with_columns(pl.all().is_null()) + mask = null_data.to_numpy() + if mask.any(): + # there are missing values + ret = np.ma.MaskedArray(vals, mask) + else: + ret = vals elif isinstance(data, np.ndarray): if isinstance(data, np.ma.MaskedArray): if not data.mask.any(): @@ -146,13 +157,18 @@ def convert_data(data) -> np.ndarray | Variable: # Otherwise we only convert the precision. return smarttypeX(ret) - -@_as_tensor_variable.register(pd.Series) -@_as_tensor_variable.register(pd.DataFrame) -@_as_tensor_variable.register(pl.DataFrame) -@_as_tensor_variable.register(pl.Series) -def dataframe_to_tensor_variable(df: pd.DataFrame | pl.DataFrame, *args, **kwargs) -> TensorVariable: - return pt.as_tensor_variable(df.to_numpy(), *args, **kwargs) +if pl is not None: + @_as_tensor_variable.register(pd.Series) + @_as_tensor_variable.register(pd.DataFrame) + @_as_tensor_variable.register(pl.DataFrame) + @_as_tensor_variable.register(pl.Series) + def dataframe_to_tensor_variable(df: pd.DataFrame | pl.DataFrame, *args, **kwargs) -> TensorVariable: + return pt.as_tensor_variable(df.to_numpy(), *args, **kwargs) +else: + @_as_tensor_variable.register(pd.Series) + @_as_tensor_variable.register(pd.DataFrame) + def dataframe_to_tensor_variable(df: pd.DataFrame, *args, **kwargs) -> TensorVariable: + return pt.as_tensor_variable(df.to_numpy(), *args, **kwargs) def extract_obs_data(x: TensorVariable) -> np.ndarray: From f2ed37e5e0764726fd643be6fbf1b06d04535103 Mon Sep 17 00:00:00 2001 From: Chris Fonnesbeck Date: Thu, 15 Aug 2024 20:56:48 -0500 Subject: [PATCH 3/5] Remove polars from envs --- conda-envs/environment-dev.yml | 1 - conda-envs/environment-docs.yml | 1 - conda-envs/windows-environment-dev.yml | 1 - conda-envs/windows-environment-test.yml | 1 - requirements-dev.txt | 1 - 5 files changed, 5 deletions(-) diff --git a/conda-envs/environment-dev.yml b/conda-envs/environment-dev.yml index 
1caf4ad3f9..85e6694a95 100644 --- a/conda-envs/environment-dev.yml +++ b/conda-envs/environment-dev.yml @@ -13,7 +13,6 @@ dependencies: - numpy>=1.15.0 - pandas>=0.24.0 - pip -- polars>=1.5.0 - pytensor>=2.25.1,<2.26 - python-graphviz - networkx diff --git a/conda-envs/environment-docs.yml b/conda-envs/environment-docs.yml index c46131a136..86097c5ab3 100644 --- a/conda-envs/environment-docs.yml +++ b/conda-envs/environment-docs.yml @@ -11,7 +11,6 @@ dependencies: - numpy>=1.15.0 - pandas>=0.24.0 - pip -- polars>=1.5.0 - pytensor>=2.25.1,<2.26 - python-graphviz - rich>=13.7.1 diff --git a/conda-envs/windows-environment-dev.yml b/conda-envs/windows-environment-dev.yml index b4fd350077..6d785e2cac 100644 --- a/conda-envs/windows-environment-dev.yml +++ b/conda-envs/windows-environment-dev.yml @@ -13,7 +13,6 @@ dependencies: - numpy>=1.15.0 - pandas>=0.24.0 - pip -- polars>=1.5.0 - pytensor>=2.25.1,<2.26 - python-graphviz - networkx diff --git a/conda-envs/windows-environment-test.yml b/conda-envs/windows-environment-test.yml index c451d60b2d..fd17c31711 100644 --- a/conda-envs/windows-environment-test.yml +++ b/conda-envs/windows-environment-test.yml @@ -16,7 +16,6 @@ dependencies: - numpy>=1.15.0 - pandas>=0.24.0 - pip -- polars>=1.5.0 - pytensor>=2.25.1,<2.26 - python-graphviz - networkx diff --git a/requirements-dev.txt b/requirements-dev.txt index 1cc26419ae..082eab73ce 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -15,7 +15,6 @@ numdifftools>=0.9.40 numpy>=1.15.0 numpydoc pandas>=0.24.0 -polars>=1.5.0 polyagamma pre-commit>=2.8.0 pytensor>=2.25.1,<2.26 From ae3834277fae9882ad969ca745d13d800aeef29c Mon Sep 17 00:00:00 2001 From: Chris Fonnesbeck Date: Thu, 15 Aug 2024 20:57:20 -0500 Subject: [PATCH 4/5] Remove polars from envs --- conda-envs/environment-jax.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/conda-envs/environment-jax.yml b/conda-envs/environment-jax.yml index a61d396f7d..97d25dd5b8 100644 --- a/conda-envs/environment-jax.yml 
+++ b/conda-envs/environment-jax.yml @@ -20,7 +20,6 @@ dependencies: - numpyro>=0.8.0 - pandas>=0.24.0 - pip -- polars>=1.5.0 - pytensor>=2.25.1,<2.26 - python-graphviz - networkx From f304035f76422cced428bae266bb708835275fa0 Mon Sep 17 00:00:00 2001 From: Chris Fonnesbeck Date: Thu, 15 Aug 2024 20:58:49 -0500 Subject: [PATCH 5/5] Remove polars import --- tests/test_pytensorf.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/test_pytensorf.py b/tests/test_pytensorf.py index 3a13beccfb..709f2728a9 100644 --- a/tests/test_pytensorf.py +++ b/tests/test_pytensorf.py @@ -16,7 +16,6 @@ import numpy.ma as ma import numpy.testing as npt import pandas as pd -import polars as pl import pytensor import pytensor.tensor as pt import pytest