chore: add compatibility to pandas>=2,numpy>=2
- align requirements.txt with pyproject.toml
- remove calls to np.string_, which no longer exists in numpy >= 2.0.0
- remove calls to pd._testing.makeMixedDataFrame, which no longer exists in recent pandas versions
- fix install and test commands in documentation for developers
- replace np.mean with a column-wise version
- drop pandas dependency constraint <2
- require histogrammar>=1.0.34
- require Python >= 3.9 in pyproject.toml
- add PySpark 3.5.3 to test pipeline matrix
- update test pipeline matrix: exclude Python 3.8, include Python 3.12
- add test notebook output to .gitignore
- switch to importlib from pkg_resources
- install project dependencies after pyspark in spark build tests
- run mean and std calculations only on numeric columns
- add dependency version constraints to Spark tests
mkopec87 committed Jan 5, 2025
1 parent ac79d21 commit 3df42f5
Showing 15 changed files with 110 additions and 55 deletions.
25 changes: 16 additions & 9 deletions .github/workflows/build.yml
@@ -10,7 +10,7 @@ jobs:
strategy:
matrix:
os: [ubuntu-latest]
python: ['3.8', '3.9', '3.10', '3.11']
python: ['3.9', '3.10', '3.11', '3.12']
runs-on: ${{ matrix.os }}

steps:
@@ -40,16 +40,18 @@ jobs:
strategy:
matrix:
include:
# - SPARK_VERSION: "2.4.8"
# HADOOP_VERSION: "2.7"
# JAVA_VERSION: "8"
# python: "3.7"
# os: ubuntu-latest
- SPARK_VERSION: "3.3.2"
HADOOP_VERSION: "3"
JAVA_VERSION: "11"
python: "3.8"
python: "3.9"
os: ubuntu-latest
dependency_constraints: "pandas<2,numpy<2"
- SPARK_VERSION: "3.5.4"
HADOOP_VERSION: "3"
JAVA_VERSION: "11"
python: "3.12"
os: ubuntu-latest
dependency_constraints: "pandas>=2,numpy>=2"
runs-on: ${{ matrix.os }}
name: ${{ matrix.os }}, Spark ${{ matrix.SPARK_VERSION}}, Python ${{ matrix.python }}

@@ -67,10 +69,9 @@ jobs:
/home/runner/work/spark.tgz
~/.cache/pip
key: ${{ runner.os }}-spark-${{ matrix.SPARK_VERSION }}-hadoop${{ matrix.HADOOP_VERSION }}-java${{ matrix.JAVA_VERSION }}-${{ hashFiles('**/pyproject.toml') }}
- name: Install dependencies
- name: Install pip and setuptools
run: |
python -m pip install --upgrade pip setuptools
pip install -e .[test]
- name: Download spark
if: steps.cache-spark.outputs.cache-hit != 'true'
env:
@@ -93,6 +94,12 @@ jobs:
# https://github.com/python-poetry/poetry/issues/6792
pip3 install "pypandoc<1.8"
pip install "pyspark==${SPARK_VERSION}"
- name: Install Spark-related dependency versions
run: |
pip install ${{ matrix.dependency_constraints }}
- name: Install project dependencies
run: |
pip install -e .[test]
- name: Test with pytest (spark-specific)
env:
BUILD_DIR: "/home/runner/work/" #${{ github.workspace }}
2 changes: 2 additions & 0 deletions .gitignore
@@ -147,3 +147,5 @@ docs/build
# Developer's playground
/playground/
.ruff_cache/

notebooks/report.html
4 changes: 2 additions & 2 deletions docs/source/developing.rst
@@ -27,8 +27,8 @@ For this you'll need to install our test requirements:
.. code-block:: bash
cd popmon/
pip install -r requirements-test.txt
python setup.py test
pip install -r .[test]
pytest
That's it!

8 changes: 4 additions & 4 deletions popmon/analysis/functions.py
@@ -83,7 +83,7 @@ def expanding_mean(df, shift: int = 1):
:param int shift: size of shift. default is 1.
:return: df with expanding means of columns
"""
return df.shift(shift).expanding().mean()
return df.shift(shift).expanding().mean(numeric_only=True)


def expanding_std(df, shift: int = 1):
@@ -95,7 +95,7 @@ def expanding_std(df, shift: int = 1):
:param int shift: size of shift. default is 1.
:return: df with expanding std of columns
"""
return df.shift(shift).expanding().std()
return df.shift(shift).expanding().std(numeric_only=True)


def expanding_apply(df, func, shift: int = 1, *args, **kwargs):
@@ -123,7 +123,7 @@ def rolling_std(df, window, shift: int = 1):
:param int window: size of rolling window.
:return: df with rolling std of columns
"""
return df.shift(shift).rolling(window).std()
return df.shift(shift).rolling(window).std(numeric_only=True)


def rolling_mean(df, window, shift: int = 1):
@@ -136,7 +136,7 @@ def rolling_mean(df, window, shift: int = 1):
:param int window: size of rolling window.
:return: df with rolling mean of columns
"""
return df.shift(shift).rolling(window).mean()
return df.shift(shift).rolling(window).mean(numeric_only=True)


def rolling_apply(df, window, func, shift: int = 1, *args, **kwargs):
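Passing numeric_only=True keeps these expanding and rolling aggregations working when the profiled DataFrame also carries non-numeric columns: pandas 2 no longer drops such columns silently but raises instead. A minimal sketch of the difference, using a made-up two-column frame:

```python
import numpy as np
import pandas as pd

df = pd.DataFrame({"x": np.arange(5, dtype=float), "label": list("abcde")})

# pandas >= 2 errors out when an expanding/rolling aggregation hits the
# non-numeric "label" column; numeric_only=True simply skips it.
print(df.expanding().mean(numeric_only=True))
print(df.rolling(window=2).std(numeric_only=True))
```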
4 changes: 1 addition & 3 deletions popmon/analysis/profiling/profiles.py
@@ -186,9 +186,7 @@ def replace(bl):
if len(bin_labels) == 0 or len(bin_labels) > 4 or np.sum(bin_entries) == 0:
return np.nan
if not np.all([isinstance(bl, (bool, np.bool_)) for bl in bin_labels]):
if not np.all(
[isinstance(bl, (str, np.str_, np.string_)) for bl in bin_labels]
):
if not np.all([isinstance(bl, (str, np.str_, np.bytes_)) for bl in bin_labels]):
return np.nan
# all strings from hereon
n_true = (bin_labels == "True").sum() + (bin_labels == "true").sum()
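NumPy 2.0 removed the np.string_ alias; np.bytes_ is the same byte-string scalar type and exists on both NumPy 1.x and 2.x, so the label check keeps its old behaviour. A quick illustration with made-up labels:

```python
import numpy as np

bin_labels = np.array(["True", "false", "True"])

# np.string_ was merely an alias of np.bytes_ and is gone in NumPy >= 2.0;
# using np.bytes_ directly works on both major versions.
all_strings = np.all([isinstance(bl, (str, np.str_, np.bytes_)) for bl in bin_labels])
print(all_strings)  # True
```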
6 changes: 3 additions & 3 deletions popmon/analysis/profiling/pull_calculator.py
@@ -16,7 +16,7 @@
# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

from functools import partial

import numpy as np
import pandas as pd
@@ -233,8 +233,8 @@ def __init__(
:param kwargs: (dict, optional): residual kwargs passed on to mean and std functions
"""
super().__init__(
np.mean,
np.std,
partial(pd.DataFrame.mean, numeric_only=True),
partial(pd.DataFrame.std, numeric_only=True, ddof=0),
reference_key,
assign_to_key,
store_key,
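Swapping np.mean/np.std for functools.partial over the DataFrame methods keeps the reference statistics column-wise and numeric-only under pandas 2, while ddof=0 preserves the population standard deviation that np.std computed by default. Roughly what the two new callables do, on an illustrative frame:

```python
from functools import partial

import pandas as pd

mean_func = partial(pd.DataFrame.mean, numeric_only=True)
std_func = partial(pd.DataFrame.std, numeric_only=True, ddof=0)

df = pd.DataFrame({"a": [1.0, 2.0, 3.0], "b": ["x", "y", "z"]})

print(mean_func(df))  # per-column mean; the non-numeric column "b" is skipped
print(std_func(df))   # population std (ddof=0), matching np.std's default
```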
18 changes: 18 additions & 0 deletions popmon/notebooks/__init__.py
@@ -0,0 +1,18 @@
# Copyright (c) 2023 ING Analytics Wholesale Banking
#
# Permission is hereby granted, free of charge, to any person obtaining a copy of
# this software and associated documentation files (the "Software"), to deal in
# the Software without restriction, including without limitation the rights to
# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
# the Software, and to permit persons to whom the Software is furnished to do so,
# subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
27 changes: 10 additions & 17 deletions popmon/resources.py
@@ -20,37 +20,30 @@

# Resources lookup file for popmon
import json
import pathlib
from importlib import resources

from jinja2 import Environment, FileSystemLoader
from pkg_resources import resource_filename

import popmon
from popmon import notebooks, test_data, visualization

# data files that are shipped with popmon.
_DATA = {
_.name: _
for _ in pathlib.Path(resource_filename(popmon.__name__, "test_data")).glob("*")
}
_DATA = {_.name: _ for _ in resources.files(test_data).iterdir()}

# Tutorial notebooks
_NOTEBOOK = {
_.name: _
for _ in pathlib.Path(resource_filename(popmon.__name__, "notebooks")).glob(
"*.ipynb"
)
p.name: p for p in resources.files(notebooks).iterdir() if p.suffix == ".ipynb"
}

# Resource types
_RESOURCES = {"data": _DATA, "notebook": _NOTEBOOK}

# Environment for visualization templates' directory
_TEMPLATES_ENV = Environment(
loader=FileSystemLoader(
resource_filename(popmon.__name__, "visualization/templates")
),
autoescape=True,
)
ref = resources.files(visualization) / "templates"
with resources.as_file(ref) as templates_dir_path:
_TEMPLATES_ENV = Environment(
loader=FileSystemLoader(templates_dir_path),
autoescape=True,
)
_TEMPLATES_ENV.filters["fmt_metric"] = lambda x: x.replace("_", " ")


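The new __init__.py files under popmon/notebooks and popmon/test_data turn those directories into importable packages, which is what importlib.resources.files() needs in place of the removed pkg_resources lookup. A minimal sketch of the pattern, assuming popmon is installed:

```python
from importlib import resources

from popmon import test_data, visualization

# Enumerate packaged data files (replaces resource_filename() + glob).
data_files = {p.name: p for p in resources.files(test_data).iterdir()}
print(sorted(data_files))

# For APIs that need a real filesystem path (e.g. jinja2's FileSystemLoader),
# as_file() guarantees one, extracting to a temp dir if the package is zipped.
with resources.as_file(resources.files(visualization) / "templates") as templates_dir:
    print(templates_dir)
```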
18 changes: 18 additions & 0 deletions popmon/test_data/__init__.py
@@ -0,0 +1,18 @@
# Copyright (c) 2023 ING Analytics Wholesale Banking
#
# Permission is hereby granted, free of charge, to any person obtaining a copy of
# this software and associated documentation files (the "Software"), to deal in
# the Software without restriction, including without limitation the rights to
# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
# the Software, and to permit persons to whom the Software is furnished to do so,
# subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
6 changes: 3 additions & 3 deletions pyproject.toml
@@ -17,14 +17,14 @@ keywords = [
"ipython"
]
readme = "README.rst"
requires-python = ">=3.7"
requires-python = ">=3.9"
authors = [{name = "ING Analytics Wholesale Banking", email = "[email protected]"}]
license = {type = "MIT", file = "LICENSE"}
dependencies = [
"numpy>=1.18.0",
"pandas>=0.25.1,<2",
"pandas>=0.25.1",
"scipy>=1.5.2",
"histogrammar>=1.0.32",
"histogrammar>=1.0.34",
"phik",
"jinja2",
"tqdm",
7 changes: 4 additions & 3 deletions requirements.txt
@@ -1,12 +1,13 @@
numpy>=1.18.0
pandas>=0.25.1
scipy>=1.5.2
histogrammar>=1.0.32
histogrammar>=1.0.34
phik
jinja2
tqdm
plotly>=5.8.0
joblib>=0.14.0
htmlmin
pydantic
typing_extensions
pydantic>=2
pydantic-settings
typing_extensions
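pydantic-settings enters the requirements alongside pydantic>=2 because BaseSettings moved out of the core pydantic package in v2. A hedged sketch of the import change — the settings class and its fields below are made up for illustration:

```python
# pydantic v1: from pydantic import BaseSettings
# pydantic v2: BaseSettings lives in the separate pydantic-settings package.
from pydantic_settings import BaseSettings


class ReportSettings(BaseSettings):
    # hypothetical fields, only to show the pattern
    title: str = "popmon report"
    show_stats: bool = True


print(ReportSettings().title)
```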
15 changes: 10 additions & 5 deletions tests/popmon/analysis/profiling/test_apply_func.py
@@ -13,6 +13,11 @@
from popmon.base import Pipeline


def mean(x):
"""Column-wise np.mean."""
return np.mean(x, axis=0)


def get_test_data():
df = pd.DataFrame()
df["a"] = np.arange(100)
@@ -25,7 +30,7 @@ def test_pull():

module1 = ApplyFunc(apply_to_key="to_profile")
module1.add_apply_func(np.std, suffix="_std", entire=True)
module1.add_apply_func(np.mean, suffix="_mean", entire=True)
module1.add_apply_func(mean, suffix="_mean", entire=True)

module2 = ApplyFunc(apply_to_key="to_profile", features=["asc_numbers"])
module2.add_apply_func(
@@ -57,7 +62,7 @@ def func(x):
)

module.add_apply_func(np.std, entire=True)
module.add_apply_func(np.mean, entire=True)
module.add_apply_func(mean, entire=True)
module.add_apply_func(func)

datastore = module.transform(datastore)
@@ -77,7 +82,7 @@ def test_variance_comparer():
apply_to_key="to_profile", features=["the_feature", "dummy_feature"]
)
module1.add_apply_func(np.std, suffix="_std", entire=True)
module1.add_apply_func(np.mean, suffix="_mean", entire=True)
module1.add_apply_func(mean, suffix="_mean", entire=True)

module2 = ApplyFunc(
apply_to_key="to_profile", features=["the_feature", "dummy_feature"]
@@ -171,7 +176,7 @@ def test_apply_func():

apply_funcs = [
{"func": np.std, "features": [feature], "metrics": ["a", "b"], "entire": True},
{"func": np.mean, "features": [feature], "metrics": ["a", "b"], "entire": True},
{"func": mean, "features": [feature], "metrics": ["a", "b"], "entire": True},
]

d = apply_func(
@@ -195,7 +200,7 @@ def test_apply_func_array():

apply_funcs = [
{"func": np.std, "features": [feature], "metrics": ["a", "b"], "entire": True},
{"func": np.mean, "features": [feature], "metrics": ["a", "b"], "entire": True},
{"func": mean, "features": [feature], "metrics": ["a", "b"], "entire": True},
]

f, p = apply_func_array(
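The module-level mean() wrapper pins axis=0 because, with pandas >= 2, np.mean(df) (which forwards axis=None) collapses a DataFrame to a single scalar, whereas the tests expect the old per-column result. A toy comparison:

```python
import numpy as np
import pandas as pd

df = pd.DataFrame({"a": [1.0, 2.0, 3.0], "b": [10.0, 20.0, 30.0]})

print(np.mean(df))          # pandas >= 2: one scalar over all values (11.0)
print(np.mean(df, axis=0))  # per-column means: a -> 2.0, b -> 20.0
```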
10 changes: 5 additions & 5 deletions tests/popmon/analysis/test_hist_numpy.py
@@ -2,6 +2,7 @@
import numpy as np
import pandas as pd
import pytest
from conftest import make_mixed_dataframe

from popmon.analysis.hist_numpy import (
assert_similar_hists,
@@ -30,7 +31,7 @@ def get_test_histograms1():
"""Get set 1 of test histograms"""
# dummy dataset with mixed types
# convert timestamp (col D) to nanosec since 1970-1-1
df = pd._testing.makeMixedDataFrame()
df = make_mixed_dataframe()
df["date"] = df["D"].apply(to_ns)
df["boolT"] = True
df["boolF"] = False
@@ -55,8 +56,7 @@ def get_test_histograms1():
def get_test_histograms2():
"""Get set 2 of test histograms"""
# dummy dataset with mixed types
# convert timestamp (col D) to nanosec since 1970-1-1
df = pd._testing.makeMixedDataFrame()
df = make_mixed_dataframe()

# building 1d-, 2d-histogram (iteratively)
hist1 = hg.Categorize(unit("C"))
@@ -351,7 +351,7 @@ def test_check_similar_hists():
"""
# dummy dataset with mixed types
# convert timestamp (col D) to nanosec since 1970-1-1
df = pd._testing.makeMixedDataFrame()
df = make_mixed_dataframe()
df["date"] = df["D"].apply(to_ns)

# building 1d-, 2d-, and 3d-histogram (iteratively)
@@ -391,7 +391,7 @@ def test_assert_similar_hists():
"""
# dummy dataset with mixed types
# convert timestamp (col D) to nanosec since 1970-1-1
df = pd._testing.makeMixedDataFrame()
df = make_mixed_dataframe()
df["date"] = df["D"].apply(to_ns)

# building 1d-, 2d-, and 3d-histogram (iteratively)
12 changes: 12 additions & 0 deletions tests/popmon/conftest.py
@@ -4,6 +4,7 @@
import numpy as np
import pandas as pd
import pytest
from pandas.core.indexes.datetimes import bdate_range

from popmon import resources

@@ -88,3 +89,14 @@ def pytest_configure():
df = pd.read_csv(resources.data(CSV_FILE))
df["date"] = pd.to_datetime(df["date"])
pytest.test_df = df


def make_mixed_dataframe() -> pd.DataFrame:
return pd.DataFrame(
{
"A": [0.0, 1.0, 2.0, 3.0, 4.0],
"B": [0.0, 1.0, 0.0, 1.0, 0.0],
"C": ["foo1", "foo2", "foo3", "foo4", "foo5"],
"D": bdate_range("1/1/2009", periods=5),
}
)
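make_mixed_dataframe() is a drop-in stand-in for the removed pd._testing.makeMixedDataFrame(), reproducing its column names and dtypes; a quick sanity check:

```python
from conftest import make_mixed_dataframe  # the helper defined above

df = make_mixed_dataframe()
print(df.dtypes)
# A           float64
# B           float64
# C            object
# D    datetime64[ns]
```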
