diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 4872f19a..9ee61f00 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -10,7 +10,7 @@ jobs: strategy: matrix: os: [ubuntu-latest] - python: ['3.8', '3.9', '3.10', '3.11'] + python: ['3.9', '3.10', '3.11', '3.12'] runs-on: ${{ matrix.os }} steps: @@ -40,16 +40,18 @@ jobs: strategy: matrix: include: -# - SPARK_VERSION: "2.4.8" -# HADOOP_VERSION: "2.7" -# JAVA_VERSION: "8" -# python: "3.7" -# os: ubuntu-latest - SPARK_VERSION: "3.3.2" HADOOP_VERSION: "3" JAVA_VERSION: "11" - python: "3.8" + python: "3.9" os: ubuntu-latest + dependency_constraints: "pandas<2,numpy<2" + - SPARK_VERSION: "3.5.4" + HADOOP_VERSION: "3" + JAVA_VERSION: "11" + python: "3.12" + os: ubuntu-latest + dependency_constraints: "pandas>=2,numpy>=2" runs-on: ${{ matrix.os }} name: ${{ matrix.os }}, Spark ${{ matrix.SPARK_VERSION}}, Python ${{ matrix.python }} @@ -67,10 +69,9 @@ jobs: /home/runner/work/spark.tgz ~/.cache/pip key: ${{ runner.os }}-spark-${{ matrix.SPARK_VERSION }}-hadoop${{ matrix.HADOOP_VERSION }}-java${{ matrix.JAVA_VERSION }}-${{ hashFiles('**/pyproject.toml') }} - - name: Install dependencies + - name: Install pip and setuptools run: | python -m pip install --upgrade pip setuptools - pip install -e .[test] - name: Download spark if: steps.cache-spark.outputs.cache-hit != 'true' env: @@ -93,6 +94,12 @@ jobs: # https://github.com/python-poetry/poetry/issues/6792 pip3 install "pypandoc<1.8" pip install "pyspark==${SPARK_VERSION}" + - name: Install Spark-related dependency versions + run: | + pip install ${{ matrix.dependency_constraints }} + - name: Install project dependencies + run: | + pip install -e .[test] - name: Test with pytest (spark-specific) env: BUILD_DIR: "/home/runner/work/" #${{ github.workspace }} diff --git a/.gitignore b/.gitignore index 195607b9..cd7d46cd 100644 --- a/.gitignore +++ b/.gitignore @@ -147,3 +147,5 @@ docs/build # Developer's playground /playground/ 
.ruff_cache/ + +notebooks/report.html \ No newline at end of file diff --git a/docs/source/developing.rst b/docs/source/developing.rst index 15dc7fc2..99d8eb2e 100644 --- a/docs/source/developing.rst +++ b/docs/source/developing.rst @@ -27,8 +27,8 @@ For this you'll need to install our test requirements: .. code-block:: bash cd popmon/ - pip install -r requirements-test.txt - python setup.py test + pip install -e .[test] + pytest That's it! diff --git a/popmon/analysis/functions.py b/popmon/analysis/functions.py index 6d28f75c..a8ad6716 100644 --- a/popmon/analysis/functions.py +++ b/popmon/analysis/functions.py @@ -83,7 +83,7 @@ def expanding_mean(df, shift: int = 1): :param int shift: size of shift. default is 1. :return: df with expanding means of columns """ - return df.shift(shift).expanding().mean() + return df.shift(shift).expanding().mean(numeric_only=True) def expanding_std(df, shift: int = 1): @@ -95,7 +95,7 @@ def expanding_std(df, shift: int = 1): :param int shift: size of shift. default is 1. :return: df with expanding std of columns """ - return df.shift(shift).expanding().std() + return df.shift(shift).expanding().std(numeric_only=True) def expanding_apply(df, func, shift: int = 1, *args, **kwargs): @@ -123,7 +123,7 @@ def rolling_std(df, window, shift: int = 1): :param int window: size of rolling window. :return: df with rolling std of columns """ - return df.shift(shift).rolling(window).std() + return df.shift(shift).rolling(window).std(numeric_only=True) def rolling_mean(df, window, shift: int = 1): @@ -136,7 +136,7 @@ def rolling_mean(df, window, shift: int = 1): :param int window: size of rolling window. 
:return: df with rolling mean of columns """ - return df.shift(shift).rolling(window).mean() + return df.shift(shift).rolling(window).mean(numeric_only=True) def rolling_apply(df, window, func, shift: int = 1, *args, **kwargs): diff --git a/popmon/analysis/profiling/profiles.py b/popmon/analysis/profiling/profiles.py index 4dbe2aad..747abd3c 100644 --- a/popmon/analysis/profiling/profiles.py +++ b/popmon/analysis/profiling/profiles.py @@ -186,9 +186,7 @@ def replace(bl): if len(bin_labels) == 0 or len(bin_labels) > 4 or np.sum(bin_entries) == 0: return np.nan if not np.all([isinstance(bl, (bool, np.bool_)) for bl in bin_labels]): - if not np.all( - [isinstance(bl, (str, np.str_, np.string_)) for bl in bin_labels] - ): + if not np.all([isinstance(bl, (str, np.str_, np.bytes_)) for bl in bin_labels]): return np.nan # all strings from hereon n_true = (bin_labels == "True").sum() + (bin_labels == "true").sum() diff --git a/popmon/analysis/profiling/pull_calculator.py b/popmon/analysis/profiling/pull_calculator.py index b1d0e0a6..e6488307 100644 --- a/popmon/analysis/profiling/pull_calculator.py +++ b/popmon/analysis/profiling/pull_calculator.py @@ -16,7 +16,7 @@ # COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER # IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- +from functools import partial import numpy as np import pandas as pd @@ -233,8 +233,8 @@ def __init__( :param kwargs: (dict, optional): residual kwargs passed on to mean and std functions """ super().__init__( - np.mean, - np.std, + partial(pd.DataFrame.mean, numeric_only=True), + partial(pd.DataFrame.std, numeric_only=True, ddof=0), reference_key, assign_to_key, store_key, diff --git a/popmon/notebooks/__init__.py b/popmon/notebooks/__init__.py new file mode 100644 index 00000000..bc081466 --- /dev/null +++ b/popmon/notebooks/__init__.py @@ -0,0 +1,18 @@ +# Copyright (c) 2023 ING Analytics Wholesale Banking +# +# Permission is hereby granted, free of charge, to any person obtaining a copy of +# this software and associated documentation files (the "Software"), to deal in +# the Software without restriction, including without limitation the rights to +# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of +# the Software, and to permit persons to whom the Software is furnished to do so, +# subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS +# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER +# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
diff --git a/popmon/resources.py b/popmon/resources.py index 14a39857..542da722 100644 --- a/popmon/resources.py +++ b/popmon/resources.py @@ -20,37 +20,30 @@ # Resources lookup file for popmon import json -import pathlib +from importlib import resources from jinja2 import Environment, FileSystemLoader -from pkg_resources import resource_filename -import popmon +from popmon import notebooks, test_data, visualization # data files that are shipped with popmon. -_DATA = { - _.name: _ - for _ in pathlib.Path(resource_filename(popmon.__name__, "test_data")).glob("*") -} +_DATA = {_.name: _ for _ in resources.files(test_data).iterdir()} # Tutorial notebooks _NOTEBOOK = { - _.name: _ - for _ in pathlib.Path(resource_filename(popmon.__name__, "notebooks")).glob( - "*.ipynb" - ) + p.name: p for p in resources.files(notebooks).iterdir() if p.suffix == ".ipynb" } # Resource types _RESOURCES = {"data": _DATA, "notebook": _NOTEBOOK} # Environment for visualization templates' directory -_TEMPLATES_ENV = Environment( - loader=FileSystemLoader( - resource_filename(popmon.__name__, "visualization/templates") - ), - autoescape=True, -) +ref = resources.files(visualization) / "templates" +with resources.as_file(ref) as templates_dir_path: + _TEMPLATES_ENV = Environment( + loader=FileSystemLoader(templates_dir_path), + autoescape=True, + ) _TEMPLATES_ENV.filters["fmt_metric"] = lambda x: x.replace("_", " ") diff --git a/popmon/test_data/__init__.py b/popmon/test_data/__init__.py new file mode 100644 index 00000000..bc081466 --- /dev/null +++ b/popmon/test_data/__init__.py @@ -0,0 +1,18 @@ +# Copyright (c) 2023 ING Analytics Wholesale Banking +# +# Permission is hereby granted, free of charge, to any person obtaining a copy of +# this software and associated documentation files (the "Software"), to deal in +# the Software without restriction, including without limitation the rights to +# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of +# the Software, 
and to permit persons to whom the Software is furnished to do so, +# subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS +# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER +# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. diff --git a/pyproject.toml b/pyproject.toml index 70bf35d2..6328d075 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -17,14 +17,14 @@ keywords = [ "ipython" ] readme = "README.rst" -requires-python = ">=3.7" +requires-python = ">=3.9" authors = [{name = "ING Analytics Wholesale Banking", email = "wbaa@ing.com"}] license = {type = "MIT", file = "LICENSE"} dependencies = [ "numpy>=1.18.0", - "pandas>=0.25.1,<2", + "pandas>=0.25.1", "scipy>=1.5.2", - "histogrammar>=1.0.32", + "histogrammar>=1.0.34", "phik", "jinja2", "tqdm", diff --git a/requirements.txt b/requirements.txt index 08a9d220..1c5ae8ed 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,12 +1,13 @@ numpy>=1.18.0 pandas>=0.25.1 scipy>=1.5.2 -histogrammar>=1.0.32 +histogrammar>=1.0.34 phik jinja2 tqdm plotly>=5.8.0 joblib>=0.14.0 htmlmin -pydantic -typing_extensions +pydantic>=2 +pydantic-settings +typing_extensions \ No newline at end of file diff --git a/tests/popmon/analysis/profiling/test_apply_func.py b/tests/popmon/analysis/profiling/test_apply_func.py index 556a0e7d..113e34bd 100644 --- a/tests/popmon/analysis/profiling/test_apply_func.py +++ b/tests/popmon/analysis/profiling/test_apply_func.py @@ -13,6 +13,11 @@ from popmon.base import Pipeline +def mean(x): + """Column-wise 
np.mean.""" + return np.mean(x, axis=0) + + def get_test_data(): df = pd.DataFrame() df["a"] = np.arange(100) @@ -25,7 +30,7 @@ def test_pull(): module1 = ApplyFunc(apply_to_key="to_profile") module1.add_apply_func(np.std, suffix="_std", entire=True) - module1.add_apply_func(np.mean, suffix="_mean", entire=True) + module1.add_apply_func(mean, suffix="_mean", entire=True) module2 = ApplyFunc(apply_to_key="to_profile", features=["asc_numbers"]) module2.add_apply_func( @@ -57,7 +62,7 @@ def func(x): ) module.add_apply_func(np.std, entire=True) - module.add_apply_func(np.mean, entire=True) + module.add_apply_func(mean, entire=True) module.add_apply_func(func) datastore = module.transform(datastore) @@ -77,7 +82,7 @@ def test_variance_comparer(): apply_to_key="to_profile", features=["the_feature", "dummy_feature"] ) module1.add_apply_func(np.std, suffix="_std", entire=True) - module1.add_apply_func(np.mean, suffix="_mean", entire=True) + module1.add_apply_func(mean, suffix="_mean", entire=True) module2 = ApplyFunc( apply_to_key="to_profile", features=["the_feature", "dummy_feature"] @@ -171,7 +176,7 @@ def test_apply_func(): apply_funcs = [ {"func": np.std, "features": [feature], "metrics": ["a", "b"], "entire": True}, - {"func": np.mean, "features": [feature], "metrics": ["a", "b"], "entire": True}, + {"func": mean, "features": [feature], "metrics": ["a", "b"], "entire": True}, ] d = apply_func( @@ -195,7 +200,7 @@ def test_apply_func_array(): apply_funcs = [ {"func": np.std, "features": [feature], "metrics": ["a", "b"], "entire": True}, - {"func": np.mean, "features": [feature], "metrics": ["a", "b"], "entire": True}, + {"func": mean, "features": [feature], "metrics": ["a", "b"], "entire": True}, ] f, p = apply_func_array( diff --git a/tests/popmon/analysis/test_hist_numpy.py b/tests/popmon/analysis/test_hist_numpy.py index d33da9de..477b27c1 100644 --- a/tests/popmon/analysis/test_hist_numpy.py +++ b/tests/popmon/analysis/test_hist_numpy.py @@ -2,6 +2,7 @@ import 
numpy as np import pandas as pd import pytest +from conftest import make_mixed_dataframe from popmon.analysis.hist_numpy import ( assert_similar_hists, @@ -30,7 +31,7 @@ def get_test_histograms1(): """Get set 1 of test histograms""" # dummy dataset with mixed types # convert timestamp (col D) to nanosec since 1970-1-1 - df = pd._testing.makeMixedDataFrame() + df = make_mixed_dataframe() df["date"] = df["D"].apply(to_ns) df["boolT"] = True df["boolF"] = False @@ -55,8 +56,7 @@ def get_test_histograms1(): def get_test_histograms2(): """Get set 2 of test histograms""" # dummy dataset with mixed types - # convert timestamp (col D) to nanosec since 1970-1-1 - df = pd._testing.makeMixedDataFrame() + df = make_mixed_dataframe() # building 1d-, 2d-histogram (iteratively) hist1 = hg.Categorize(unit("C")) @@ -351,7 +351,7 @@ def test_check_similar_hists(): """ # dummy dataset with mixed types # convert timestamp (col D) to nanosec since 1970-1-1 - df = pd._testing.makeMixedDataFrame() + df = make_mixed_dataframe() df["date"] = df["D"].apply(to_ns) # building 1d-, 2d-, and 3d-histogram (iteratively) @@ -391,7 +391,7 @@ def test_assert_similar_hists(): """ # dummy dataset with mixed types # convert timestamp (col D) to nanosec since 1970-1-1 - df = pd._testing.makeMixedDataFrame() + df = make_mixed_dataframe() df["date"] = df["D"].apply(to_ns) # building 1d-, 2d-, and 3d-histogram (iteratively) diff --git a/tests/popmon/conftest.py b/tests/popmon/conftest.py index b6b50b8b..dba9cd90 100644 --- a/tests/popmon/conftest.py +++ b/tests/popmon/conftest.py @@ -4,6 +4,7 @@ import numpy as np import pandas as pd import pytest +from pandas.core.indexes.datetimes import bdate_range from popmon import resources @@ -88,3 +89,14 @@ def pytest_configure(): df = pd.read_csv(resources.data(CSV_FILE)) df["date"] = pd.to_datetime(df["date"]) pytest.test_df = df + + +def make_mixed_dataframe() -> pd.DataFrame: + return pd.DataFrame( + { + "A": [0.0, 1.0, 2.0, 3.0, 4.0], + "B": [0.0, 1.0, 0.0, 
1.0, 0.0], + "C": ["foo1", "foo2", "foo3", "foo4", "foo5"], + "D": bdate_range("1/1/2009", periods=5), + } + ) diff --git a/tests/popmon/hist/test_histogram.py b/tests/popmon/hist/test_histogram.py index 18d83e17..721bff07 100644 --- a/tests/popmon/hist/test_histogram.py +++ b/tests/popmon/hist/test_histogram.py @@ -1,6 +1,7 @@ import histogrammar as hg import numpy as np import pandas as pd +from conftest import make_mixed_dataframe from popmon.hist.hist_utils import ( is_numeric, @@ -15,7 +16,7 @@ def get_test_data(): - df = pd._testing.makeMixedDataFrame() + df = make_mixed_dataframe() df["date"] = df["D"].apply(lambda x: pd.to_datetime(x).value) return df