chore: add compatibility to pandas>=2,numpy>=2
- align requirements.txt with pyproject.toml
- remove calls to np.string_, which no longer exists in numpy >= 2.0.0
- remove calls to pd._testing.makeMixedDataFrame, which no longer exists in recent pandas versions
- fix install and test commands in documentation for developers
- replace np.mean with a column-wise version
- drop pandas dependency constraint <2
- require histogrammar>=1.0.34
- require Python >= 3.9 in pyproject.toml
- add PySpark 3.5.3 to test pipeline matrix
- update test pipeline matrix: exclude Python 3.8, include Python 3.12
- add test notebook output to .gitignore
- switch to importlib from pkg_resources
- install project dependencies after pyspark in spark build tests
- run mean and std calculations only on numeric columns
- add dependency version constraints to Spark tests
mkopec87 committed Jan 5, 2025
1 parent ac79d21 commit 3df42f5
Showing 15 changed files with 110 additions and 55 deletions.
25 changes: 16 additions & 9 deletions .github/workflows/build.yml
@@ -10,7 +10,7 @@ jobs:
strategy:
matrix:
os: [ubuntu-latest]
python: ['3.8', '3.9', '3.10', '3.11']
python: ['3.9', '3.10', '3.11', '3.12']
runs-on: ${{ matrix.os }}

steps:
@@ -40,16 +40,18 @@ jobs:
strategy:
matrix:
include:
# - SPARK_VERSION: "2.4.8"
# HADOOP_VERSION: "2.7"
# JAVA_VERSION: "8"
# python: "3.7"
# os: ubuntu-latest
- SPARK_VERSION: "3.3.2"
HADOOP_VERSION: "3"
JAVA_VERSION: "11"
python: "3.8"
python: "3.9"
os: ubuntu-latest
dependency_constraints: "pandas<2,numpy<2"
- SPARK_VERSION: "3.5.4"
HADOOP_VERSION: "3"
JAVA_VERSION: "11"
python: "3.12"
os: ubuntu-latest
dependency_constraints: "pandas>=2,numpy>=2"
runs-on: ${{ matrix.os }}
name: ${{ matrix.os }}, Spark ${{ matrix.SPARK_VERSION}}, Python ${{ matrix.python }}

@@ -67,10 +69,9 @@ jobs:
/home/runner/work/spark.tgz
~/.cache/pip
key: ${{ runner.os }}-spark-${{ matrix.SPARK_VERSION }}-hadoop${{ matrix.HADOOP_VERSION }}-java${{ matrix.JAVA_VERSION }}-${{ hashFiles('**/pyproject.toml') }}
- name: Install dependencies
- name: Install pip and setuptools
run: |
python -m pip install --upgrade pip setuptools
pip install -e .[test]
- name: Download spark
if: steps.cache-spark.outputs.cache-hit != 'true'
env:
@@ -93,6 +94,12 @@ jobs:
# https://github.com/python-poetry/poetry/issues/6792
pip3 install "pypandoc<1.8"
pip install "pyspark==${SPARK_VERSION}"
- name: Install Spark-related dependency versions
run: |
pip install ${{ matrix.dependency_constraints }}
- name: Install project dependencies
run: |
pip install -e .[test]
- name: Test with pytest (spark-specific)
env:
BUILD_DIR: "/home/runner/work/" #${{ github.workspace }}
2 changes: 2 additions & 0 deletions .gitignore
@@ -147,3 +147,5 @@ docs/build
# Developer's playground
/playground/
.ruff_cache/

notebooks/report.html
4 changes: 2 additions & 2 deletions docs/source/developing.rst
@@ -27,8 +27,8 @@ For this you'll need to install our test requirements:
.. code-block:: bash
cd popmon/
pip install -r requirements-test.txt
python setup.py test
pip install -r .[test]
pytest
That's it!

8 changes: 4 additions & 4 deletions popmon/analysis/functions.py
@@ -83,7 +83,7 @@ def expanding_mean(df, shift: int = 1):
:param int shift: size of shift. default is 1.
:return: df with expanding means of columns
"""
return df.shift(shift).expanding().mean()
return df.shift(shift).expanding().mean(numeric_only=True)


def expanding_std(df, shift: int = 1):
@@ -95,7 +95,7 @@ def expanding_std(df, shift: int = 1):
:param int shift: size of shift. default is 1.
:return: df with expanding std of columns
"""
return df.shift(shift).expanding().std()
return df.shift(shift).expanding().std(numeric_only=True)


def expanding_apply(df, func, shift: int = 1, *args, **kwargs):
@@ -123,7 +123,7 @@ def rolling_std(df, window, shift: int = 1):
:param int window: size of rolling window.
:return: df with rolling std of columns
"""
return df.shift(shift).rolling(window).std()
return df.shift(shift).rolling(window).std(numeric_only=True)


def rolling_mean(df, window, shift: int = 1):
@@ -136,7 +136,7 @@ def rolling_mean(df, window, shift: int = 1):
:param int window: size of rolling window.
:return: df with rolling mean of columns
"""
return df.shift(shift).rolling(window).mean()
return df.shift(shift).rolling(window).mean(numeric_only=True)


def rolling_apply(df, window, func, shift: int = 1, *args, **kwargs):
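Passing numeric_only=True keeps these expanding and rolling aggregations working when the profiled DataFrame also carries non-numeric columns: pandas 2 no longer drops such columns silently but raises instead. A minimal sketch of the difference, using a made-up two-column frame:

```python
import numpy as np
import pandas as pd

df = pd.DataFrame({"x": np.arange(5, dtype=float), "label": list("abcde")})

# pandas >= 2 errors out when an expanding/rolling aggregation hits the
# non-numeric "label" column; numeric_only=True simply skips it.
print(df.expanding().mean(numeric_only=True))
print(df.rolling(window=2).std(numeric_only=True))
```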
4 changes: 1 addition & 3 deletions popmon/analysis/profiling/profiles.py
@@ -186,9 +186,7 @@ def replace(bl):
if len(bin_labels) == 0 or len(bin_labels) > 4 or np.sum(bin_entries) == 0:
return np.nan
if not np.all([isinstance(bl, (bool, np.bool_)) for bl in bin_labels]):
if not np.all(
[isinstance(bl, (str, np.str_, np.string_)) for bl in bin_labels]
):
if not np.all([isinstance(bl, (str, np.str_, np.bytes_)) for bl in bin_labels]):
return np.nan
# all strings from hereon
n_true = (bin_labels == "True").sum() + (bin_labels == "true").sum()
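NumPy 2.0 removed the np.string_ alias; np.bytes_ is the same byte-string scalar type and exists on both NumPy 1.x and 2.x, so the label check keeps its old behaviour. A quick illustration with made-up labels:

```python
import numpy as np

bin_labels = np.array(["True", "false", "True"])

# np.string_ was merely an alias of np.bytes_ and is gone in NumPy >= 2.0;
# using np.bytes_ directly works on both major versions.
all_strings = np.all([isinstance(bl, (str, np.str_, np.bytes_)) for bl in bin_labels])
print(all_strings)  # True
```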
6 changes: 3 additions & 3 deletions popmon/analysis/profiling/pull_calculator.py
@@ -16,7 +16,7 @@
# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

from functools import partial

import numpy as np
import pandas as pd
@@ -233,8 +233,8 @@ def __init__(
:param kwargs: (dict, optional): residual kwargs passed on to mean and std functions
"""
super().__init__(
np.mean,
np.std,
partial(pd.DataFrame.mean, numeric_only=True),
partial(pd.DataFrame.std, numeric_only=True, ddof=0),
reference_key,
assign_to_key,
store_key,
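Swapping np.mean/np.std for functools.partial over the DataFrame methods keeps the reference statistics column-wise and numeric-only under pandas 2, while ddof=0 preserves the population standard deviation that np.std computed by default. Roughly what the two new callables do, on an illustrative frame:

```python
from functools import partial

import pandas as pd

mean_func = partial(pd.DataFrame.mean, numeric_only=True)
std_func = partial(pd.DataFrame.std, numeric_only=True, ddof=0)

df = pd.DataFrame({"a": [1.0, 2.0, 3.0], "b": ["x", "y", "z"]})

print(mean_func(df))  # per-column mean; the non-numeric column "b" is skipped
print(std_func(df))   # population std (ddof=0), matching np.std's default
```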
18 changes: 18 additions & 0 deletions popmon/notebooks/__init__.py
@@ -0,0 +1,18 @@
# Copyright (c) 2023 ING Analytics Wholesale Banking
#
# Permission is hereby granted, free of charge, to any person obtaining a copy of
# this software and associated documentation files (the "Software"), to deal in
# the Software without restriction, including without limitation the rights to
# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
# the Software, and to permit persons to whom the Software is furnished to do so,
# subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
27 changes: 10 additions & 17 deletions popmon/resources.py
@@ -20,37 +20,30 @@

# Resources lookup file for popmon
import json
import pathlib
from importlib import resources

from jinja2 import Environment, FileSystemLoader
from pkg_resources import resource_filename

import popmon
from popmon import notebooks, test_data, visualization

# data files that are shipped with popmon.
_DATA = {
_.name: _
for _ in pathlib.Path(resource_filename(popmon.__name__, "test_data")).glob("*")
}
_DATA = {_.name: _ for _ in resources.files(test_data).iterdir()}

# Tutorial notebooks
_NOTEBOOK = {
_.name: _
for _ in pathlib.Path(resource_filename(popmon.__name__, "notebooks")).glob(
"*.ipynb"
)
p.name: p for p in resources.files(notebooks).iterdir() if p.suffix == ".ipynb"
}

# Resource types
_RESOURCES = {"data": _DATA, "notebook": _NOTEBOOK}

# Environment for visualization templates' directory
_TEMPLATES_ENV = Environment(
loader=FileSystemLoader(
resource_filename(popmon.__name__, "visualization/templates")
),
autoescape=True,
)
ref = resources.files(visualization) / "templates"
with resources.as_file(ref) as templates_dir_path:
_TEMPLATES_ENV = Environment(
loader=FileSystemLoader(templates_dir_path),
autoescape=True,
)
_TEMPLATES_ENV.filters["fmt_metric"] = lambda x: x.replace("_", " ")


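The new __init__.py files under popmon/notebooks and popmon/test_data turn those directories into importable packages, which is what importlib.resources.files() needs in place of the removed pkg_resources lookup. A minimal sketch of the pattern, assuming popmon is installed:

```python
from importlib import resources

from popmon import test_data, visualization

# Enumerate packaged data files (replaces resource_filename() + glob).
data_files = {p.name: p for p in resources.files(test_data).iterdir()}
print(sorted(data_files))

# For APIs that need a real filesystem path (e.g. jinja2's FileSystemLoader),
# as_file() guarantees one, extracting to a temp dir if the package is zipped.
with resources.as_file(resources.files(visualization) / "templates") as templates_dir:
    print(templates_dir)
```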
18 changes: 18 additions & 0 deletions popmon/test_data/__init__.py
@@ -0,0 +1,18 @@
# Copyright (c) 2023 ING Analytics Wholesale Banking
#
# Permission is hereby granted, free of charge, to any person obtaining a copy of
# this software and associated documentation files (the "Software"), to deal in
# the Software without restriction, including without limitation the rights to
# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
# the Software, and to permit persons to whom the Software is furnished to do so,
# subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
6 changes: 3 additions & 3 deletions pyproject.toml
@@ -17,14 +17,14 @@ keywords = [
"ipython"
]
readme = "README.rst"
requires-python = ">=3.7"
requires-python = ">=3.9"
authors = [{name = "ING Analytics Wholesale Banking", email = "[email protected]"}]
license = {type = "MIT", file = "LICENSE"}
dependencies = [
"numpy>=1.18.0",
"pandas>=0.25.1,<2",
"pandas>=0.25.1",
"scipy>=1.5.2",
"histogrammar>=1.0.32",
"histogrammar>=1.0.34",
"phik",
"jinja2",
"tqdm",
7 changes: 4 additions & 3 deletions requirements.txt
@@ -1,12 +1,13 @@
numpy>=1.18.0
pandas>=0.25.1
scipy>=1.5.2
histogrammar>=1.0.32
histogrammar>=1.0.34
phik
jinja2
tqdm
plotly>=5.8.0
joblib>=0.14.0
htmlmin
pydantic
typing_extensions
pydantic>=2
pydantic-settings
typing_extensions
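pydantic-settings enters the requirements alongside pydantic>=2 because BaseSettings moved out of the core pydantic package in v2. A hedged sketch of the import change — the settings class and its fields below are made up for illustration:

```python
# pydantic v1: from pydantic import BaseSettings
# pydantic v2: BaseSettings lives in the separate pydantic-settings package.
from pydantic_settings import BaseSettings


class ReportSettings(BaseSettings):
    # hypothetical fields, only to show the pattern
    title: str = "popmon report"
    show_stats: bool = True


print(ReportSettings().title)
```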
15 changes: 10 additions & 5 deletions tests/popmon/analysis/profiling/test_apply_func.py
@@ -13,6 +13,11 @@
from popmon.base import Pipeline


def mean(x):
"""Column-wise np.mean."""
return np.mean(x, axis=0)


def get_test_data():
df = pd.DataFrame()
df["a"] = np.arange(100)
@@ -25,7 +30,7 @@ def test_pull():

module1 = ApplyFunc(apply_to_key="to_profile")
module1.add_apply_func(np.std, suffix="_std", entire=True)
module1.add_apply_func(np.mean, suffix="_mean", entire=True)
module1.add_apply_func(mean, suffix="_mean", entire=True)

module2 = ApplyFunc(apply_to_key="to_profile", features=["asc_numbers"])
module2.add_apply_func(
@@ -57,7 +62,7 @@ def func(x):
)

module.add_apply_func(np.std, entire=True)
module.add_apply_func(np.mean, entire=True)
module.add_apply_func(mean, entire=True)
module.add_apply_func(func)

datastore = module.transform(datastore)
@@ -77,7 +82,7 @@ def test_variance_comparer():
apply_to_key="to_profile", features=["the_feature", "dummy_feature"]
)
module1.add_apply_func(np.std, suffix="_std", entire=True)
module1.add_apply_func(np.mean, suffix="_mean", entire=True)
module1.add_apply_func(mean, suffix="_mean", entire=True)

module2 = ApplyFunc(
apply_to_key="to_profile", features=["the_feature", "dummy_feature"]
@@ -171,7 +176,7 @@ def test_apply_func():

apply_funcs = [
{"func": np.std, "features": [feature], "metrics": ["a", "b"], "entire": True},
{"func": np.mean, "features": [feature], "metrics": ["a", "b"], "entire": True},
{"func": mean, "features": [feature], "metrics": ["a", "b"], "entire": True},
]

d = apply_func(
@@ -195,7 +200,7 @@ def test_apply_func_array():

apply_funcs = [
{"func": np.std, "features": [feature], "metrics": ["a", "b"], "entire": True},
{"func": np.mean, "features": [feature], "metrics": ["a", "b"], "entire": True},
{"func": mean, "features": [feature], "metrics": ["a", "b"], "entire": True},
]

f, p = apply_func_array(
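The module-level mean() wrapper pins axis=0 because, with pandas >= 2, np.mean(df) (which forwards axis=None) collapses a DataFrame to a single scalar, whereas the tests expect the old per-column result. A toy comparison:

```python
import numpy as np
import pandas as pd

df = pd.DataFrame({"a": [1.0, 2.0, 3.0], "b": [10.0, 20.0, 30.0]})

print(np.mean(df))          # pandas >= 2: one scalar over all values (11.0)
print(np.mean(df, axis=0))  # per-column means: a -> 2.0, b -> 20.0
```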
10 changes: 5 additions & 5 deletions tests/popmon/analysis/test_hist_numpy.py
@@ -2,6 +2,7 @@
import numpy as np
import pandas as pd
import pytest
from conftest import make_mixed_dataframe

from popmon.analysis.hist_numpy import (
assert_similar_hists,
@@ -30,7 +31,7 @@ def get_test_histograms1():
"""Get set 1 of test histograms"""
# dummy dataset with mixed types
# convert timestamp (col D) to nanosec since 1970-1-1
df = pd._testing.makeMixedDataFrame()
df = make_mixed_dataframe()
df["date"] = df["D"].apply(to_ns)
df["boolT"] = True
df["boolF"] = False
@@ -55,8 +56,7 @@ def get_test_histograms1():
def get_test_histograms2():
"""Get set 2 of test histograms"""
# dummy dataset with mixed types
# convert timestamp (col D) to nanosec since 1970-1-1
df = pd._testing.makeMixedDataFrame()
df = make_mixed_dataframe()

# building 1d-, 2d-histogram (iteratively)
hist1 = hg.Categorize(unit("C"))
@@ -351,7 +351,7 @@ def test_check_similar_hists():
"""
# dummy dataset with mixed types
# convert timestamp (col D) to nanosec since 1970-1-1
df = pd._testing.makeMixedDataFrame()
df = make_mixed_dataframe()
df["date"] = df["D"].apply(to_ns)

# building 1d-, 2d-, and 3d-histogram (iteratively)
@@ -391,7 +391,7 @@ def test_assert_similar_hists():
"""
# dummy dataset with mixed types
# convert timestamp (col D) to nanosec since 1970-1-1
df = pd._testing.makeMixedDataFrame()
df = make_mixed_dataframe()
df["date"] = df["D"].apply(to_ns)

# building 1d-, 2d-, and 3d-histogram (iteratively)
12 changes: 12 additions & 0 deletions tests/popmon/conftest.py
@@ -4,6 +4,7 @@
import numpy as np
import pandas as pd
import pytest
from pandas.core.indexes.datetimes import bdate_range

from popmon import resources

@@ -88,3 +89,14 @@ def pytest_configure():
df = pd.read_csv(resources.data(CSV_FILE))
df["date"] = pd.to_datetime(df["date"])
pytest.test_df = df


def make_mixed_dataframe() -> pd.DataFrame:
return pd.DataFrame(
{
"A": [0.0, 1.0, 2.0, 3.0, 4.0],
"B": [0.0, 1.0, 0.0, 1.0, 0.0],
"C": ["foo1", "foo2", "foo3", "foo4", "foo5"],
"D": bdate_range("1/1/2009", periods=5),
}
)
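make_mixed_dataframe() is a drop-in stand-in for the removed pd._testing.makeMixedDataFrame(), reproducing its column names and dtypes; a quick sanity check:

```python
from conftest import make_mixed_dataframe  # the helper defined above

df = make_mixed_dataframe()
print(df.dtypes)
# A           float64
# B           float64
# C            object
# D    datetime64[ns]
```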
