Merge branch 'koaning:main' into examples

koaning · Mar 25, 2024 · a6cec13 · a6cec13
2 parents 5097b30 + d321198
commit a6cec13
Show file tree

Hide file tree

Showing 19 changed files with 174 additions and 61 deletions.
diff --git a/.github/ISSUE_TEMPLATE/feature-request-template.md b/.github/ISSUE_TEMPLATE/feature-request-template.md
@@ -1,5 +1,5 @@
 ---
-name: Feature Request Template
+name: New Feature Request
 about: This is a template for a Feature Request
 title: "[FEATURE]"
 labels: enhancement

diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md
@@ -1,24 +1,25 @@
-Before working on a large PR, please check with @koaning or @MBrouns that they agree with the direction of the PR. This discussion should take place in a Github issue before working on the PR, unless it's a minor change like spelling in the docs. 
+Before working on a large PR, please check with @FBruzzesi or @koaning to confirm that they agree with the direction of the PR. This discussion should take place in a [Github issue](https://github.com/koaning/scikit-lego/issues/new/choose) before working on the PR, unless it's a minor change like spelling in the docs. 
 
 # Description
 
 Please include a summary of the change and which issue is fixed. Please also include relevant motivation and context.
 
-Fixes # (issue)
+Fixes #(issue)
 
 ## Type of change
+
 - [ ] Bug fix (non-breaking change which fixes an issue)
 - [ ] New feature (non-breaking change which adds functionality)
 - [ ] Breaking change (fix or feature that would cause existing functionality to not work as expected)
 
 
-# Checklist:
+## Checklist:
 
-- [ ] My code follows the style guidelines (flake8)
+- [ ] My code follows the style guidelines (ruff)
 - [ ] I have commented my code, particularly in hard-to-understand areas
 - [ ] I have made corresponding changes to the documentation (also to the readme.md)
 - [ ] I have added tests that prove my fix is effective or that my feature works
 - [ ] I have added tests to check whether the new feature adheres to the sklearn convention
 - [ ] New and existing unit tests pass locally with my changes
 
-If you feel your PR is ready for a review, ping @koaning or @mbrouns. 
+If you feel your PR is ready for a review, ping @FBruzzesi or @koaning.
diff --git a/.github/workflows/dependencies.yml b/.github/workflows/dependencies.yml
@@ -15,33 +15,34 @@ jobs:
       uses: actions/setup-python@v5
       with:
         python-version: "3.10"
+    - name: Install uv
+      run: curl -LsSf https://astral.sh/uv/install.sh | sh
     - name: Install dependencies
       run: |
-        python -m pip install --upgrade pip setuptools wheel
-        python -m pip install pytest
+        uv pip install pytest setuptools wheel --system
     - name: Run Base Install
       run: |
-        python -m pip install -e .
+        uv pip install -e . --system
     - name: Run Checks
       run: |
         python tests/scripts/check_pip.py missing cvxpy
         python tests/scripts/check_pip.py installed scikit-learn
         python tests/scripts/import_all.py
     - name: Install cvxpy
       run: |
-        python -m pip install -e ".[cvxpy]"
+        uv pip install -e ".[cvxpy]" --system
     - name: Run Checks
       run: |
         python tests/scripts/check_pip.py installed cvxpy scikit-learn
         python tests/scripts/import_all.py
     - name: Install All
       run: |
-        python -m pip install -e ".[all]"
+        uv pip install -e ".[all]" --system
     - name: Run Checks
       run: |
         python tests/scripts/check_pip.py installed cvxpy formulaic scikit-learn umap-learn
     - name: Docs can Build
       run: |
         sudo apt-get update && sudo apt-get install pandoc
-        python -m pip install -e ".[docs]"
+        uv pip install -e ".[docs]" --system
         mkdocs build
diff --git a/.github/workflows/schedule-dependencies.yml b/.github/workflows/schedule-dependencies.yml
@@ -1,8 +1,10 @@
 name: Cron Test Dependencies
 
 on:
+  workflow_dispatch:
   schedule:
     - cron: "0 0 * * *"
+
 
 jobs:
   cron:
@@ -15,17 +17,22 @@ jobs:
     steps:
     - name: Checkout source code
       uses: actions/checkout@v4
+    - name: Install uv (Unix)
+      if: runner.os != 'Windows'
+      run: curl -LsSf https://astral.sh/uv/install.sh | sh
+    - name: Install uv (Windows)
+      if: runner.os == 'Windows'
+      run: powershell -c "irm https://astral.sh/uv/install.ps1 | iex"
     - name: Set up Python ${{ matrix.python-version }}
       uses: actions/setup-python@v5
       with:
         python-version: ${{ matrix.python-version }}
     - name: Install dependencies
       run: |
-        python -m pip install --upgrade pip
-        python -m pip install wheel
-        pip install ${{ matrix.pre-release-dependencies }} scikit-lego
-        pip freeze
+        # The steps above create no virtualenv, so every `uv pip` call (including
+        # `freeze`) must target the interpreter from setup-python via `--system`.
+        uv pip install wheel --system
+        uv pip install ${{ matrix.pre-release-dependencies }} scikit-lego --system
+        uv pip freeze --system
     - name: Test with pytest
       run: |
-        pip install -e ".[test]"
+        uv pip install -e ".[test]" --system
         make test
diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
@@ -22,14 +22,17 @@ jobs:
     steps:
     - name: Checkout source code
       uses: actions/checkout@v4
+    - name: Install uv (Unix)
+      if: runner.os != 'Windows'
+      run: curl -LsSf https://astral.sh/uv/install.sh | sh
+    - name: Install uv (Windows)
+      if: runner.os == 'Windows'
+      run: powershell -c "irm https://astral.sh/uv/install.ps1 | iex"
     - name: Set up Python ${{ matrix.python-version }}
       uses: actions/setup-python@v5
       with:
         python-version: ${{ matrix.python-version }}
     - name: Install dependencies
-      run: |
-        python -m pip install --upgrade pip --no-cache-dir
-        python -m pip install -e ".[test]"
+      run: uv pip install -e ".[test]" --system
     - name: Test with pytest
-      run: |
-        make test
+      run: make test
diff --git a/docs/_scripts/cross-validation.py b/docs/_scripts/cross-validation.py
@@ -204,3 +204,28 @@ def print_folds(cv, X, y, groups):
 grid.best_estimator_.get_params()["reg__alpha"]
 # 0.8
 # --8<-- [end:grid-search]
+
+
+
+######################################## ClusterKfold ####################################
+##########################################################################################
+
+# --8<-- [start:cluster-fold-start]
+from sklego.model_selection import ClusterFoldValidation
+from sklearn.cluster import KMeans
+
+clusterer = KMeans(n_clusters=5, random_state=42)
+folder = ClusterFoldValidation(clusterer)
+# --8<-- [end:cluster-fold-start]
+
+
+# --8<-- [start:cluster-fold-plot]
+import matplotlib.pylab as plt
+import numpy as np
+
+X_orig = np.random.uniform(0, 1, (1000, 2))
+for i, split in enumerate(folder.split(X_orig)):
+    x_train, x_valid = split
+    plt.scatter(X_orig[x_valid, 0], X_orig[x_valid, 1], label=f"split {i}")
+plt.legend();
+# --8<-- [end:cluster-fold-plot]
diff --git a/docs/_static/cross-validation/kfold.png b/docs/_static/cross-validation/kfold.png
diff --git a/docs/contribution.md b/docs/contribution.md
@@ -20,7 +20,7 @@ This means we're usually open to ideas to add here but there are a few things to
 
 When writing a new feature there's some more
 [details with regard to how scikit learn likes to have its parts implemented][scikit-develop].
-We will display the a sample implementation of the `ColumnSelector` below. Please review all comments marked as Important.
+We will display a sample implementation of the `ColumnSelector` below. Please review all comments marked as Important.
 
 ```py hl_lines="19-22 24-28 46-51 65-69 77-78 83-85" linenums="1"
 from sklearn.base import BaseEstimator, TransformerMixin, MetaEstimatorMixin

diff --git a/docs/user-guide/cross-validation.md b/docs/user-guide/cross-validation.md
@@ -127,5 +127,40 @@ To use `GroupTimeSeriesSplit` with sklearn's [GridSearchCV](https://scikit-learn
 --8<-- "docs/_scripts/cross-validation.py:grid-search"
 ```
 
+## Cluster-Kfold
+
+The [ClusterFoldValidation][clusterfold-api] object is a cross-validator that splits the data into `n_splits` folds, where each fold is determined by a clustering algorithm. This is not a common pattern, probably more like an anti-pattern really, but it might be useful when you want to make sure that the train and test sets are very distinct. This can be seen as a way to make it harder for the algorithm to perform well, because the training sets are sampled differently than the test sets.
+
+### Example
+
+Here's how you could set up a cross validator that uses KMeans.
+
+```py title="Using Kmeans to generate folds"
+--8<-- "docs/_scripts/cross-validation.py:cluster-fold-start"
+```
+
+You can also use other clustering methods, but the nice thing about KMeans is that it demos well. Here's how it would generate folds on a uniform dataset.
+
+```py title="Using Kmeans to generate folds"
+--8<-- "docs/_scripts/cross-validation.py:cluster-fold-plot"
+```
+
+![example-1](../_static/cross-validation/kfold.png)
+
+As you can see, each split will focus on a cluster of the data. Hopefully this also makes it clear that this method will ensure that each validation set will be rather distinct from the train set. These sets are not only exclusive, but they are also from a different region of the data by design.
+
+Note that this image is mostly for illustrative purposes because you typically won't directly generate these folds yourself. Instead you'd use a helper function like `cross_val_score` or `GridSearchCV` to do this for you.
+
+```py title="More realistic example"
+from sklearn.model_selection import cross_val_score
+
+# Given an existing pipeline and X,y dataset, you probably would do something like this:
+fold_method = KlusterFoldValidation(
+    KMeans(n_cluster=5, random_state=42)
+)
+cross_val_score(pipeline, X, y, cv=fold_method)
+```
+
 [time-gap-split-api]: ../../api/model-selection#sklego.model_selection.TimeGapSplit
 [group-ts-split-api]: ../../api/model-selection#sklego.model_selection.GroupTimeSeriesSplit
+[clusterfold-api]: ../../api/model-selection#sklego.model_selection.ClusterFoldValidation
diff --git a/docs/user-guide/linear-models.md b/docs/user-guide/linear-models.md
@@ -105,7 +105,7 @@ We've turned the array into a dataframe so that we can apply the [`ColumnSelecto
 
 --8<-- "docs/_static/linear-models/grid.html"
 
-You can see that the `ProbWeightRegression` indeeds sums to one.
+You can see that the `ProbWeightRegression` indeed sums to one.
 
 ```py
 --8<-- "docs/_scripts/linear-models.py:prob-weight-coefs"

diff --git a/docs/user-guide/mixture-methods.md b/docs/user-guide/mixture-methods.md
@@ -4,7 +4,7 @@ Gaussian Mixture Models (GMMs) are flexible building blocks for other machine le
 
 This is in part because they are great approximations for general probability distributions but also because they remain somewhat interpretable even when the dataset gets very complex.
 
-This package makes use of GMMs to construct other algorithms.
+This package makes use of GMMs to construct other algorithms. In addition to the [GMMClassifier][gmm-classifier-api] and [GMMOutlierDetector][gmm-outlier-detector-api], this library also features a [BayesianGMMClassifier][bayes_gmm-classifier-api] and a [BayesianGMMOutlierDetector][bayes_gmm-outlier-detector-api]. These methods offer pretty much the same API, but have internal methods to figure out how many components to estimate. They tend to take significantly more time to train, so alternatively you may also try doing a proper grid search to figure out the best number of components for your use-case.
 
 ## Classification
 
@@ -59,4 +59,6 @@ As a sidenote: this image was generated with some dummy data, but its code can b
     ```
 
 [gmm-classifier-api]: ../../api/mixture#sklego.mixture.gmm_classifier.GMMClassifier
+[bayes_gmm-classifier-api]: ../../api/mixture#sklego.mixture.bayesian_gmm_classifier.BayesianGMMClassifier
 [gmm-outlier-detector-api]: ../../api/mixture#sklego.mixture.gmm_outlier_detector.GMMOutlierDetector
+[bayes_gmm-outlier-detector-api]: ../../api/mixture#sklego.mixture.bayesian_gmm_detector.BayesianGMMOutlierDetector
diff --git a/docs/user-guide/preprocessing.md b/docs/user-guide/preprocessing.md
@@ -253,6 +253,14 @@ Now let's see what occurs when we add a constraint that enforces the feature to
 
 If these features are now passed to a model that supports monotonicity constraints then we can build models with guarantees.
 
+## Outlier Removal
+
+The [`OutlierRemover`][outlier-remover-api] class is a transformer that uses an outlier detector estimator to remove outliers from your dataset, and it does so during training time only. This can be useful in scenarios where outliers in the training data can negatively impact the performance of your model. By removing these outliers during training, your model can learn from a "clean" dataset that may lead to better performance.
+
+It's important to note that this transformer only removes outliers during training. This means that when you use your trained model to predict on new data, the new data will not have any outliers removed. This is useful because in a real-world scenario, new data may contain outliers and you would want your model to be able to handle these cases.
+
+The `OutlierRemover` class is initialized with an `outlier_detector` estimator, and a boolean flag `refit`. The outlier detector should be a scikit-learn compatible estimator that implements `.fit()` and `.predict()` methods. The refit flag determines whether the underlying estimator is fitted during `OutlierRemover.fit()`.
+
 [estimator-transformer-api]: ../../api/meta#sklego.meta.estimator_transformer.EstimatorTransformer
 [meta-module]: ../../api/meta
 [id-transformer-api]: ../../api/preprocessing#sklego.preprocessing.identitytransformer.IdentityTransformer
@@ -261,6 +269,7 @@ If these features are now passed to a model that supports monotonicity constrain
 [rbf-api]: ../../api/preprocessing#sklego.preprocessing.repeatingbasis.RepeatingBasisFunction
 [interval-encoder-api]: ../../api/preprocessing#sklego.preprocessing.intervalencoder.IntervalEncoder
 [decay-section]: ../../user-guide/meta#decayed-estimation
+[outlier-remover-api]: ../../api/preprocessing#sklego.preprocessing.outlier_remover.OutlierRemover
 
 [formulaic-docs]: https://matthewwardrop.github.io/formulaic/
 [formulaic-formulas]: https://matthewwardrop.github.io/formulaic/formulas/
diff --git a/sklego/mixture/bayesian_gmm_detector.py b/sklego/mixture/bayesian_gmm_detector.py
@@ -12,7 +12,11 @@ class BayesianGMMOutlierDetector(OutlierMixin, BaseEstimator):
     """The `BayesianGMMOutlierDetector` trains a Bayesian Gaussian Mixture model on a dataset `X`. Once a density is
     trained we can evaluate the likelihood scores to see if it is deemed likely.
 
-    By giving a threshold this model might then label outliers if their likelihood score is too low.
+    By providing a `threshold` this model might then label outliers if their likelihood score is too low.
+
+    !!! note
+        The parameters other than `threshold` and `method` are an exact copy of the parameters in
+        [sklearn.mixture.BayesianGaussianMixture]( https://scikit-learn.org/stable/modules/generated/sklearn.mixture.BayesianGaussianMixture.html).
 
     Parameters
     ----------
@@ -28,10 +32,6 @@ class BayesianGMMOutlierDetector(OutlierMixin, BaseEstimator):
             If you select `method="stddev"` then the threshold value represents the
             numbers of standard deviations before calling something an outlier.
 
-    !!! note
-        The other parameters are an exact copy of the parameters in
-        [sklearn.mixture.BayesianGaussianMixture]( https://scikit-learn.org/stable/modules/generated/sklearn.mixture.BayesianGaussianMixture.html).
-
     Attributes
     ----------
     gmm_ : BayesianGaussianMixture

diff --git a/sklego/mixture/gmm_outlier_detector.py b/sklego/mixture/gmm_outlier_detector.py
@@ -12,7 +12,11 @@ class GMMOutlierDetector(OutlierMixin, BaseEstimator):
     """The `GMMDetector` trains a Gaussian Mixture model on a dataset `X`. Once a density is trained we can evaluate the
     likelihood scores to see if it is deemed likely.
 
-    By giving a threshold this model might then label outliers if their likelihood score is too low.
+    By providing a `threshold` this model might then label outliers if their likelihood score is too low.
+
+    !!! note
+        The parameters other than `threshold` and `method` are an exact copy of the parameters in
+        [sklearn.mixture.GaussianMixture]( https://scikit-learn.org/stable/modules/generated/sklearn.mixture.GaussianMixture.html).
 
     Parameters
     ----------
@@ -28,10 +32,6 @@ class GMMOutlierDetector(OutlierMixin, BaseEstimator):
             If you select `method="stddev"` then the threshold value represents the
             numbers of standard deviations before calling something an outlier.
 
-    !!! note
-        The other parameters are an exact copy of the parameters in
-        [sklearn.mixture.GaussianMixture]( https://scikit-learn.org/stable/modules/generated/sklearn.mixture.GaussianMixture.html).
-
     Attributes
     ----------
     gmm_ : GaussianMixture

diff --git a/sklego/model_selection.py b/sklego/model_selection.py
@@ -248,8 +248,20 @@ def get_split_info(X, indices, j, part, summary):
         return pd.DataFrame(summary)
 
 
-class KlusterFoldValidation:
-    """KlusterFold cross validator. Create folds based on provided cluster method
+def KlusterFoldValidation(**kwargs):
+    warn(
+        "Please use `ClusterFoldValidation` instead of `KlusterFoldValidation`."
+        "We will use correct spelling going forward and `KlusterFoldValidation` will be deprecated.",
+        DeprecationWarning,
+    )
+    return ClusterFoldValidation(**kwargs)
+
+
+class ClusterFoldValidation:
+    """Cross validator that creates folds based on provided cluster method.
+    This ensures that data points in the same cluster are not split across different folds.
+    
+    !!! info "New in version 0.9.0"
 
     Parameters
     ----------

diff --git a/sklego/pandas_utils.py b/sklego/pandas_utils.py
@@ -141,7 +141,8 @@ def log_step_extra(
     **log_func_kwargs: dict
         Keyword arguments to be passed to `log_functions`
 
-    Returns:
+    Returns
+    -------
     Callable
         The decorated function.