Skip to content

Commit

Permalink
Added examples to models in 'mixture' folder
Browse files Browse the repository at this point in the history
Added examples to models in 'mixture' folder

- GMMClassifier()
- BayesianGMMClassifier()
- GMMOutlierDetector()
- BayesianGMMOutlierDetector()
  • Loading branch information
mkalimeri committed Dec 3, 2024
1 parent 6e5f948 commit 7245e72
Show file tree
Hide file tree
Showing 4 changed files with 158 additions and 0 deletions.
42 changes: 42 additions & 0 deletions sklego/mixture/bayesian_gmm_classifier.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,48 @@ class BayesianGMMClassifier(BaseEstimator, ClassifierMixin):
A dictionary of Bayesian Gaussian Mixture Models, one for each class.
classes_ : np.ndarray of shape (n_classes,)
The classes seen during `fit`.
Examples
--------
```python
import numpy as np
import matplotlib.pyplot as plt
from sklego.mixture import BayesianGMMClassifier
# Generate dataset
np.random.seed(1)
group0 = np.random.normal(0, 3, (1000, 2))
group1 = np.random.normal(2.5, 2, (500, 2))
data = np.vstack([group0, group1])
y = np.hstack([np.zeros((group0.shape[0],), dtype=int), np.ones((group1.shape[0],), dtype=int)])
# Create and fit the BayesianGMMClassifier model
bgmm = BayesianGMMClassifier(n_components=2, random_state=1)
bgmm.fit(data, y)
# Classify the train dataset into the two classes seen during fit (each class is modelled with n_components=2 mixture components)
labels = bgmm.predict(data)
# Classify a new point into one of the two classes
p = np.array([[1.5, 0.5]])
p_prob= bgmm.predict_proba(p) # predict the probabilities p belongs to each class
print(f'Probability point p belongs to group0 is {p_prob[0,0]:.2f}')
print(f'Probability point p belongs to group1 is {p_prob[0,1]:.2f}')
print(f'It is more probable that point p belongs to group{np.argmax(p_prob)}')
plt.scatter(group0[:,0], group0[:,1], c='y', label='group0')
plt.scatter(group1[:,0], group1[:,1], c='r', label='group1')
plt.scatter(p[:,0], p[:,1], c='black', marker='x', label='new datapoint')
plt.title('Distribution of dataset')
plt.legend()
plt.show()
### Probability point p belongs to group0 is 0.38
### Probability point p belongs to group1 is 0.62
### It is more probable that point p belongs to group1
```
"""

def __init__(
Expand Down
37 changes: 37 additions & 0 deletions sklego/mixture/bayesian_gmm_detector.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,43 @@ class BayesianGMMOutlierDetector(OutlierMixin, BaseEstimator):
The trained Bayesian Gaussian Mixture Model.
likelihood_threshold_ : float
The threshold value used to determine if something is an outlier.
Examples
--------
```python
import numpy as np
import matplotlib.pyplot as plt
from sklego.mixture import BayesianGMMOutlierDetector
# Generate dataset; it consists of two clusters
np.random.seed(1)
group0 = np.random.normal(0, 3, (10, 2))
group1 = np.random.normal(2.5, 2, (5, 2))
data = np.vstack([group0, group1])
y = np.hstack([np.zeros((group0.shape[0],), dtype=int), np.ones((group1.shape[0],), dtype=int)])
# Create and fit the BayesianGMMOutlierDetector model with threshold=0.9
bgmm = BayesianGMMOutlierDetector(threshold=0.9, n_components=2, random_state=1)
bgmm.fit(data, y)
# Classify a new point as outlier or not
p = np.array([[4.5, 0.5]])
p_pred= bgmm.predict(p) # predict whether p is an outlier (-1) or an inlier (1)
print('The point is an outlier if the score is -1, inlier if the score is 1')
print(f'The score for this point is {p_pred}.')
plt.scatter(data[:,0], data[:,1], c='y', label='train set')
plt.scatter(p[:,0], p[:,1], c='black', marker='x', label='new datapoint')
plt.title('Distribution of dataset')
plt.legend()
plt.show()
### The point is an outlier if the score is -1, inlier if the score is 1
### The score for this point is [1].
```
"""

_ALLOWED_METHODS = ("quantile", "stddev")
Expand Down
43 changes: 43 additions & 0 deletions sklego/mixture/gmm_classifier.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,49 @@ class GMMClassifier(BaseEstimator, ClassifierMixin):
A dictionary of Gaussian Mixture Models, one for each class.
classes_ : np.ndarray of shape (n_classes,)
The classes seen during `fit`.
Examples
--------
```python
import numpy as np
import matplotlib.pyplot as plt
from sklego.mixture import GMMClassifier
# Generate dataset
np.random.seed(1)
group0 = np.random.normal(0, 3, (1000, 2))
group1 = np.random.normal(2.5, 2, (500, 2))
data = np.vstack([group0, group1])
y = np.hstack([np.zeros((group0.shape[0],), dtype=int), np.ones((group1.shape[0],), dtype=int)])
# Create and fit the GMMClassifier model
gmm = GMMClassifier(n_components=2, random_state=1)
gmm.fit(data, y)
# Classify the train dataset into the two classes seen during fit (each class is modelled with n_components=2 mixture components)
labels = gmm.predict(data)
# Classify a new point into one of the two classes
p = np.array([[1.5, 0.5]])
p_prob= gmm.predict_proba(p) # predict the probabilities p belongs to each class
print(f'Probability point p belongs to group0 is {p_prob[0,0]:.2f}')
print(f'Probability point p belongs to group1 is {p_prob[0,1]:.2f}')
print(f'It is more probable that point p belongs to group{np.argmax(p_prob)}')
plt.scatter(group0[:,0], group0[:,1], c='y', label='group0')
plt.scatter(group1[:,0], group1[:,1], c='r', label='group1')
plt.scatter(p[:,0], p[:,1], c='black', marker='x', label='new datapoint')
plt.title('Distribution of dataset')
plt.legend()
plt.show()
### Probability point p belongs to group0 is 0.41
### Probability point p belongs to group1 is 0.59
### It is more probable that point p belongs to group1
```
"""

def __init__(
Expand Down
36 changes: 36 additions & 0 deletions sklego/mixture/gmm_outlier_detector.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,42 @@ class GMMOutlierDetector(OutlierMixin, BaseEstimator):
The trained Gaussian Mixture model.
likelihood_threshold_ : float
The threshold value used to determine if something is an outlier.
Examples
--------
```python
import numpy as np
import matplotlib.pyplot as plt
from sklego.mixture import GMMOutlierDetector
# Generate dataset; it consists of two clusters
np.random.seed(1)
group0 = np.random.normal(0, 3, (10, 2))
group1 = np.random.normal(2.5, 2, (5, 2))
data = np.vstack([group0, group1])
y = np.hstack([np.zeros((group0.shape[0],), dtype=int), np.ones((group1.shape[0],), dtype=int)])
# Create and fit the GMMOutlierDetector model
gmm = GMMOutlierDetector(threshold=0.9, n_components=2, random_state=1)
gmm.fit(data, y)
# Classify a new point as outlier or not
p = np.array([[4.5, 0.5]])
p_pred= gmm.predict(p) # predict whether p is an outlier (-1) or an inlier (1)
print('The point is an outlier if the score is -1, inlier if the score is 1')
print(f'The score for this point is {p_pred}.')
plt.scatter(data[:,0], data[:,1], c='y', label='train set')
plt.scatter(p[:,0], p[:,1], c='black', marker='x', label='new datapoint')
plt.title('Distribution of dataset')
plt.legend()
plt.show()
### The point is an outlier if the score is -1, inlier if the score is 1
### The score for this point is [-1].
```
"""

_ALLOWED_METHODS = ("quantile", "stddev")
Expand Down

0 comments on commit 7245e72

Please sign in to comment.