Added examples to models in 'mixture' folder

Added examples to models in 'mixture' folder - GMMClassifier() - BayesianGMMClassifier() - GMMOutlierDetector() - BayesianGMMOutlierDetector()
koaning · Dec 3, 2024 · 7245e72 · 7245e72
1 parent 6e5f948
commit 7245e72
Show file tree

Hide file tree

Showing 4 changed files with 158 additions and 0 deletions.
diff --git a/sklego/mixture/bayesian_gmm_classifier.py b/sklego/mixture/bayesian_gmm_classifier.py
@@ -22,6 +22,48 @@ class BayesianGMMClassifier(BaseEstimator, ClassifierMixin):
         A dictionary of Bayesian Gaussian Mixture Models, one for each class.
     classes_ : np.ndarray of shape (n_classes,)
         The classes seen during `fit`.
+
+    Examples
+    --------
+
+    ```python
+    import numpy as np
+    import matplotlib.pyplot as plt
+    from sklego.mixture import BayesianGMMClassifier
+
+    # Generate dataset
+    np.random.seed(1)
+    group0 = np.random.normal(0, 3, (1000, 2))
+    group1 = np.random.normal(2.5, 2, (500, 2))
+    data = np.vstack([group0, group1])
+
+    y = np.hstack([np.zeros((group0.shape[0],), dtype=int), np.ones((group1.shape[0],), dtype=int)])
+    # Create and fit the BayesianGMMClassifier model
+    bgmm = BayesianGMMClassifier(n_components=2, random_state=1)
+    bgmm.fit(data, y)
+
+    # Predict the class label of every training sample (one mixture with n_components=2 is fitted per class)
+    labels = bgmm.predict(data)
+
+    # Classify a new point into one of two clusters
+    p = np.array([[1.5, 0.5]])
+    p_prob = bgmm.predict_proba(p)  # predicted probability that p belongs to each class
+    print(f'Probability point p belongs to group0 is {p_prob[0,0]:.2f}')
+    print(f'Probability point p belongs to group1 is {p_prob[0,1]:.2f}')
+
+    print(f'It is more probable that point p belongs to group{np.argmax(p_prob)}')
+
+    plt.scatter(group0[:,0], group0[:,1], c='y', label='group0')
+    plt.scatter(group1[:,0], group1[:,1], c='r', label='group1')
+    plt.scatter(p[:,0], p[:,1], c='black', marker='x', label='new datapoint')
+    plt.title('Distribution of dataset')
+    plt.legend()
+    plt.show()
+
+    ### Probability point p belongs to group0 is 0.38
+    ### Probability point p belongs to group1 is 0.62
+    ### It is more probable that point p belongs to group1
+    ```
     """
 
     def __init__(

diff --git a/sklego/mixture/bayesian_gmm_detector.py b/sklego/mixture/bayesian_gmm_detector.py
@@ -38,6 +38,43 @@ class BayesianGMMOutlierDetector(OutlierMixin, BaseEstimator):
         The trained Bayesian Gaussian Mixture Model.
     likelihood_threshold_ : float
         The threshold value used to determine if something is an outlier.
+
+    Examples
+    --------
+
+    ```python
+
+    import numpy as np
+    import matplotlib.pyplot as plt
+    from sklego.mixture import BayesianGMMOutlierDetector
+
+    # Generate dataset, it consists of two clusters
+    np.random.seed(1)
+    group0 = np.random.normal(0, 3, (10, 2))
+    group1 = np.random.normal(2.5, 2, (5, 2))
+    data = np.vstack([group0, group1])
+
+    y = np.hstack([np.zeros((group0.shape[0],), dtype=int), np.ones((group1.shape[0],), dtype=int)])
+
+    # Create and fit the BayesianGMMOutlierDetector model with threshold=0.9
+    bgmm = BayesianGMMOutlierDetector(threshold=0.9, n_components=2, random_state=1)
+    bgmm.fit(data, y)
+
+    # Classify a new point as outlier or not
+    p = np.array([[4.5, 0.5]])
+    p_pred = bgmm.predict(p)  # predict the label for p: 1 for an inlier, -1 for an outlier
+    print('The point is an outlier if the score is -1, inlier if the score is 1')
+    print(f'The score for this point is {p_pred}.')
+
+    plt.scatter(data[:,0], data[:,1], c='y', label='train set')
+    plt.scatter(p[:,0], p[:,1], c='black', marker='x', label='new datapoint')
+    plt.title('Distribution of dataset')
+    plt.legend()
+    plt.show()
+
+    ### The point is an outlier if the score is -1, inlier if the score is 1
+    ### The score for this point is [1].
+    ```
     """
 
     _ALLOWED_METHODS = ("quantile", "stddev")

diff --git a/sklego/mixture/gmm_classifier.py b/sklego/mixture/gmm_classifier.py
@@ -23,6 +23,49 @@ class GMMClassifier(BaseEstimator, ClassifierMixin):
         A dictionary of Gaussian Mixture Models, one for each class.
     classes_ : np.ndarray of shape (n_classes,)
         The classes seen during `fit`.
+
+    Examples
+    --------
+
+    ```python
+    import numpy as np
+    import matplotlib.pyplot as plt
+    from sklego.mixture import GMMClassifier
+
+    # Generate dataset
+    np.random.seed(1)
+    group0 = np.random.normal(0, 3, (1000, 2))
+    group1 = np.random.normal(2.5, 2, (500, 2))
+    data = np.vstack([group0, group1])
+
+    y = np.hstack([np.zeros((group0.shape[0],), dtype=int), np.ones((group1.shape[0],), dtype=int)])
+
+    # Create and fit the GMMClassifier model
+    gmm = GMMClassifier(n_components=2, random_state=1)
+    gmm.fit(data, y)
+
+    # Predict the class label of every training sample (one mixture with n_components=2 is fitted per class)
+    labels = gmm.predict(data)
+
+    # Classify a new point into one of two clusters
+    p = np.array([[1.5, 0.5]])
+    p_prob = gmm.predict_proba(p)  # predicted probability that p belongs to each class
+    print(f'Probability point p belongs to group0 is {p_prob[0,0]:.2f}')
+    print(f'Probability point p belongs to group1 is {p_prob[0,1]:.2f}')
+
+    print(f'It is more probable that point p belongs to group{np.argmax(p_prob)}')
+
+    plt.scatter(group0[:,0], group0[:,1], c='y', label='group0')
+    plt.scatter(group1[:,0], group1[:,1], c='r', label='group1')
+    plt.scatter(p[:,0], p[:,1], c='black', marker='x', label='new datapoint')
+    plt.title('Distribution of dataset')
+    plt.legend()
+    plt.show()
+
+    ### Probability point p belongs to group0 is 0.41
+    ### Probability point p belongs to group1 is 0.59
+    ### It is more probable that point p belongs to group1
+    ```
     """
 
     def __init__(

diff --git a/sklego/mixture/gmm_outlier_detector.py b/sklego/mixture/gmm_outlier_detector.py
@@ -38,6 +38,42 @@ class GMMOutlierDetector(OutlierMixin, BaseEstimator):
         The trained Gaussian Mixture model.
     likelihood_threshold_ : float
         The threshold value used to determine if something is an outlier.
+
+    Examples
+    --------
+
+    ```python
+    import numpy as np
+    import matplotlib.pyplot as plt
+    from sklego.mixture import GMMOutlierDetector
+
+    # Generate dataset, it consists of two clusters
+    np.random.seed(1)
+    group0 = np.random.normal(0, 3, (10, 2))
+    group1 = np.random.normal(2.5, 2, (5, 2))
+    data = np.vstack([group0, group1])
+
+    y = np.hstack([np.zeros((group0.shape[0],), dtype=int), np.ones((group1.shape[0],), dtype=int)])
+
+    # Create and fit the GMMOutlierDetector model
+    gmm = GMMOutlierDetector(threshold=0.9, n_components=2, random_state=1)
+    gmm.fit(data, y)
+
+    # Classify a new point as outlier or not
+    p = np.array([[4.5, 0.5]])
+    p_pred = gmm.predict(p)  # predict the label for p: 1 for an inlier, -1 for an outlier
+    print('The point is an outlier if the score is -1, inlier if the score is 1')
+    print(f'The score for this point is {p_pred}.')
+
+    plt.scatter(data[:,0], data[:,1], c='y', label='train set')
+    plt.scatter(p[:,0], p[:,1], c='black', marker='x', label='new datapoint')
+    plt.title('Distribution of dataset')
+    plt.legend()
+    plt.show()
+
+    ### The point is an outlier if the score is -1, inlier if the score is 1
+    ### The score for this point is [-1].
+    ```
     """
 
     _ALLOWED_METHODS = ("quantile", "stddev")