Skip to content

Commit

Permalink
Merge pull request #128 from NiklasMelton/issue-127-improve-prepare-data
Browse files Browse the repository at this point in the history
Issue-227 improve error message and add helper functions
  • Loading branch information
NiklasMelton authored Jan 10, 2025
2 parents 2610414 + 6442d95 commit fc0ed56
Show file tree
Hide file tree
Showing 8 changed files with 253 additions and 41 deletions.
91 changes: 68 additions & 23 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -76,17 +76,25 @@ Here are some quick examples to get you started with AdaptiveResonanceLib:
```python
from artlib import FuzzyART
import numpy as np
from tensorflow.keras.datasets import mnist

# Your dataset
train_X = np.array([...]) # shape (n_samples, n_features)
test_X = np.array([...])
# Load the MNIST dataset
n_dim = 28*28
(X_train, _), (X_test, _) = mnist.load_data()
X_train = X_train.reshape((-1, n_dim)) # flatten images
X_test = X_test.reshape((-1, n_dim))

# Initialize the Fuzzy ART model
model = FuzzyART(rho=0.7, alpha = 0.0, beta=1.0)

# (Optional) Tell the model the data limits for normalization
lower_bounds = np.array([0.]*n_dim)
upper_bounds = np.array([255.]*n_dim)
model.set_data_bounds(lower_bounds, upper_bounds)

# Prepare Data
train_X_prep = model.prepare_data(train_X)
test_X_prep = model.prepare_data(test_X)
train_X_prep = model.prepare_data(X_train)
test_X_prep = model.prepare_data(X_test)

# Fit the model
model.fit(train_X_prep)
Expand All @@ -100,25 +108,32 @@ predictions = model.predict(test_X_prep)
```python
from artlib import GaussianART, SimpleARTMAP
import numpy as np
from tensorflow.keras.datasets import mnist

# Your dataset
train_X = np.array([...]) # shape (n_samples, n_features)
train_y = np.array([...]) # shape (n_samples, ), must be integers
test_X = np.array([...])
# Load the MNIST dataset
n_dim = 28*28
(X_train, y_train), (X_test, y_test) = mnist.load_data()
X_train = X_train.reshape((-1, n_dim)) # flatten images
X_test = X_test.reshape((-1, n_dim))

# Initialize the Gaussian ART model
sigma_init = np.array([0.5]*train_X.shape[1]) # variance estimate for each feature
sigma_init = np.array([0.5]*X_train.shape[1]) # variance estimate for each feature
module_a = GaussianART(rho=0.0, sigma_init=sigma_init)

# (Optional) Tell the model the data limits for normalization
lower_bounds = np.array([0.]*n_dim)
upper_bounds = np.array([255.]*n_dim)
module_a.set_data_bounds(lower_bounds, upper_bounds)

# Initialize the SimpleARTMAP model
model = SimpleARTMAP(module_a=module_a)

# Prepare Data
train_X_prep = model.prepare_data(train_X)
test_X_prep = model.prepare_data(test_X)
train_X_prep = model.prepare_data(X_train)
test_X_prep = model.prepare_data(X_test)

# Fit the model
model.fit(train_X_prep, train_y)
model.fit(train_X_prep, y_train)

# Predict data labels
predictions = model.predict(test_X_prep)
Expand All @@ -131,22 +146,22 @@ from artlib import FuzzyART, HypersphereART, FusionART
import numpy as np

# Your dataset
train_X = np.array([...]) # shape (n_samples, n_features_X)
train_y = np.array([...]) # shape (n_samples, n_features_y)
X_train = np.array([...]) # shape (n_samples, n_features_X)
y_train = np.array([...]) # shape (n_samples, n_features_y)
test_X = np.array([...])

# Initialize the Fuzzy ART model
module_x = FuzzyART(rho=0.0, alpha = 0.0, beta=1.0)

# Initialize the Hypersphere ART model
r_hat = 0.5*np.sqrt(train_X.shape[1]) # no restriction on hypersphere size
r_hat = 0.5*np.sqrt(X_train.shape[1]) # no restriction on hypersphere size
module_y = HypersphereART(rho=0.0, alpha = 0.0, beta=1.0, r_hat=r_hat)

# Initialize the FusionARTMAP model
gamma_values = [0.5, 0.5] # equal weight to both channels
channel_dims = [
2*train_X.shape[1], # fuzzy ART complement codes data so channel dim is 2*n_features
train_y.shape[1]
2*X_train.shape[1], # fuzzy ART complement codes data so channel dim is 2*n_features
y_train.shape[1]
]
model = FusionART(
modules=[module_x, module_y],
Expand All @@ -155,18 +170,48 @@ model = FusionART(
)

# Prepare Data
train_Xy = model.join_channel_data(channel_data=[train_X, train_y])
train_Xy = model.join_channel_data(channel_data=[X_train, y_train])
train_Xy_prep = model.prepare_data(train_Xy)
test_Xy = model.join_channel_data(channel_data=[train_X], skip_channels=[1])
test_Xy = model.join_channel_data(channel_data=[X_train], skip_channels=[1])
test_Xy_prep = model.prepare_data(test_Xy)

# Fit the model
model.fit(train_X_prep, train_y)
model.fit(train_Xy_prep)

# Predict y-channel values and clip X values outside previously observed ranges
pred_y = model.predict_regression(test_Xy_prep, target_channels=[1], clip=True)
```

# Predict y-channel values
pred_y = model.predict_regression(test_Xy_prep, target_channels=[1])
### Data Normalization

AdaptiveResonanceLib models require feature data to be normalized between 0.0
and 1.0, inclusive. This requires identifying the boundaries of the data space.

If the first batch of your training data is representative of the entire data space,
you don't need to do anything and artlib will identify the data bounds automatically.
However, this will often not be sufficient, and one of the following workarounds will
be needed:

Users can manually set the bounds using the following code snippet or similar:
```python
# Set the boundaries of your data for normalization
lower_bounds = np.array([0.]*n_features)
upper_bounds = np.array([1.]*n_features)
model.set_data_bounds(lower_bounds, upper_bounds)
```

Or users can present all batches of data to the model for automatic
boundary identification:
```python
# Find the boundaries across every batch, then register them with the model.
# find_data_bounds only returns the bounds; set_data_bounds is what stores them.
lower_bounds, upper_bounds = model.find_data_bounds(train_X, test_X)
model.set_data_bounds(lower_bounds, upper_bounds)
```

If only the boundaries of your testing data are unknown, you can call
`model.predict()` with `clip=True` to clip testing data to the bounds seen during
training. Only use this if you understand what you are doing.

<!-- END quick-start -->

<!-- START documentation -->
Expand Down
66 changes: 63 additions & 3 deletions artlib/common/BaseART.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ def __init__(self, params: Dict):
self.weight_sample_counter_: List[int] = []
self.d_min_ = None
self.d_max_ = None
self.is_fitted_ = False

def __getattr__(self, key):
if key in self.params:
Expand Down Expand Up @@ -106,6 +107,46 @@ def set_params(self, **params):
self.validate_params(local_params)
return self

def set_data_bounds(self, lower_bounds: np.ndarray, upper_bounds: np.ndarray):
    """Manually set the data bounds used for normalization.

    The bounds are stored as given on ``d_min_`` and ``d_max_``. They can only
    be changed while the model has not been fitted yet.

    Parameters
    ----------
    lower_bounds : np.ndarray
        The lower bounds for each column.
    upper_bounds : np.ndarray
        The upper bounds for each column.

    Raises
    ------
    ValueError
        If the model is already marked as fitted (``is_fitted_`` is truthy).

    """
    already_fitted = self.is_fitted_
    if not already_fitted:
        # Safe to (re)define the data limits: nothing has been learned yet.
        self.d_min_, self.d_max_ = lower_bounds, upper_bounds
        return
    raise ValueError("Cannot change data limits after fit.")

def find_data_bounds(
    self, *data_batches: np.ndarray
) -> Tuple[np.ndarray, np.ndarray]:
    """Find per-column data bounds for normalization from several data batches.

    The batches are stacked row-wise and the column-wise minimum and maximum
    are returned. The bounds are only returned, not stored on the model; pass
    them to ``set_data_bounds`` to apply them.

    Parameters
    ----------
    *data_batches : np.ndarray
        Batches of data, each with the same number of columns. They may be
        given as separate positional arguments
        (``find_data_bounds(X_a, X_b)``) or as a single list/tuple of batches
        (``find_data_bounds([X_a, X_b])``).

    Returns
    -------
    tuple[np.ndarray, np.ndarray]
        Lower and upper bounds for data.

    Raises
    ------
    ValueError
        If no data batches are provided.

    """
    batches = data_batches
    # A single list/tuple argument is treated as the collection of batches.
    # Without this, np.vstack would turn a list of equally shaped 2-D batches
    # into one 3-D array and the reduction below would run over the batch
    # axis instead of the sample axis.
    if len(batches) == 1 and isinstance(batches[0], (list, tuple)):
        batches = tuple(batches[0])
    if not batches:
        raise ValueError(
            "At least one data batch is required to find the data bounds."
        )

    all_data = np.vstack(batches)
    lower_bounds = np.min(all_data, axis=0)
    upper_bounds = np.max(all_data, axis=0)

    return lower_bounds, upper_bounds

def prepare_data(self, X: np.ndarray) -> np.ndarray:
"""Prepare data for clustering.
Expand Down Expand Up @@ -187,8 +228,23 @@ def validate_data(self, X: np.ndarray):
- X: data set
"""
assert np.all(X >= 0), "Data has not been normalized"
assert np.all(X <= 1.0), "Data has not been normalized"
normalization_message = (
"Data has not been normalized or was not normalized "
"correctly. All values must fall between 0 and 1, "
"inclusively."
)
if self.is_fitted_:
normalization_message += (
"\nThis appears to not be the first batch of "
"data. Data boundaries must be calculated for "
"the entire data space. Prior to fitting, use "
"BaseART.set_data_bounds() to manually set the "
"bounds for your data or use "
"BaseART.find_data_bounds() to identify the "
"bounds automatically for multiple batches."
)
assert np.all(X >= 0), normalization_message
assert np.all(X <= 1.0), normalization_message
self.check_dimensions(X)

def category_choice(
Expand Down Expand Up @@ -742,13 +798,15 @@ def fit_gif(
self.post_fit(X)
return self

def predict(self, X: np.ndarray) -> np.ndarray:
def predict(self, X: np.ndarray, clip: bool = False) -> np.ndarray:
"""Predict labels for the data.
Parameters
----------
X : np.ndarray
The dataset.
clip : bool
clip the input values to be between the previously seen data limits
Returns
-------
Expand All @@ -757,6 +815,8 @@ def predict(self, X: np.ndarray) -> np.ndarray:
"""
check_is_fitted(self)
if clip:
X = np.clip(X, self.d_min_, self.d_max_)
self.validate_data(X)
self.check_dimensions(X)

Expand Down
10 changes: 8 additions & 2 deletions artlib/common/BaseARTMAP.py
Original file line number Diff line number Diff line change
Expand Up @@ -138,13 +138,15 @@ def partial_fit(
"""
raise NotImplementedError

def predict(self, X: np.ndarray) -> np.ndarray:
def predict(self, X: np.ndarray, clip: bool = False) -> np.ndarray:
"""Predict labels for the data.
Parameters
----------
X : np.ndarray
Dataset A.
clip : bool
clip the input values to be between the previously seen data limits
Returns
-------
Expand All @@ -154,13 +156,17 @@ def predict(self, X: np.ndarray) -> np.ndarray:
"""
raise NotImplementedError

def predict_ab(self, X: np.ndarray) -> tuple[np.ndarray, np.ndarray]:
def predict_ab(
self, X: np.ndarray, clip: bool = False
) -> tuple[np.ndarray, np.ndarray]:
"""Predict labels for the data, both A-side and B-side.
Parameters
----------
X : np.ndarray
Dataset A.
clip : bool
clip the input values to be between the previously seen data limits
Returns
-------
Expand Down
14 changes: 11 additions & 3 deletions artlib/fusion/FusionART.py
Original file line number Diff line number Diff line change
Expand Up @@ -528,13 +528,17 @@ def step_pred(self, x, skip_channels: List[int] = []) -> int:
c_ = int(np.argmax(T))
return c_

def predict(self, X: np.ndarray, skip_channels: List[int] = []) -> np.ndarray:
def predict(
self, X: np.ndarray, clip: bool = False, skip_channels: List[int] = []
) -> np.ndarray:
"""Predict labels for the input data.
Parameters
----------
X : np.ndarray
Input dataset.
clip : bool
clip the input values to be between the previously seen data limits
skip_channels : list of int, optional
Channels to skip (default is []).
Expand All @@ -545,6 +549,8 @@ def predict(self, X: np.ndarray, skip_channels: List[int] = []) -> np.ndarray:
"""
check_is_fitted(self)
if clip:
X = np.clip(X, self.d_min_, self.d_max_)
self.validate_data(X)
self.check_dimensions(X)

Expand Down Expand Up @@ -673,14 +679,16 @@ def get_channel_centers(self, channel: int) -> List[np.ndarray]:
return self.modules[channel].get_cluster_centers()

def predict_regression(
self, X: np.ndarray, target_channels: List[int] = [-1]
self, X: np.ndarray, clip: bool = False, target_channels: List[int] = [-1]
) -> Union[np.ndarray, List[np.ndarray]]:
"""Predict regression values for the input data using the target channels.
Parameters
----------
X : np.ndarray
Input dataset.
clip : bool
clip the input values to be between the previously seen data limits
target_channels : list of int, optional
List of target channels to use for regression. If negative values are used,
they are considered as channels counting backward from the last channel.
Expand All @@ -695,7 +703,7 @@ def predict_regression(
"""
target_channels = [self.n + k if k < 0 else k for k in target_channels]
C = self.predict(X, skip_channels=target_channels)
C = self.predict(X, clip=clip, skip_channels=target_channels)
centers = [self.get_channel_centers(k) for k in target_channels]
if len(target_channels) == 1:
return np.array([centers[0][c] for c in C])
Expand Down
8 changes: 6 additions & 2 deletions artlib/hierarchical/DeepARTMAP.py
Original file line number Diff line number Diff line change
Expand Up @@ -410,13 +410,17 @@ def partial_fit(
x_i += 1
return self

def predict(self, X: Union[np.ndarray, list[np.ndarray]]) -> list[np.ndarray]:
def predict(
self, X: Union[np.ndarray, list[np.ndarray]], clip: bool = False
) -> list[np.ndarray]:
"""Predict the labels for the input data.
Parameters
----------
X : np.ndarray or list of np.ndarray
The input data set for prediction.
clip : bool
clip the input values to be between the previously seen data limits
Returns
-------
Expand All @@ -428,7 +432,7 @@ def predict(self, X: Union[np.ndarray, list[np.ndarray]]) -> list[np.ndarray]:
x = X[-1]
else:
x = X
pred_a, pred_b = self.layers[-1].predict_ab(x)
pred_a, pred_b = self.layers[-1].predict_ab(x, clip=clip)
pred = [pred_a, pred_b]
for layer in self.layers[:-1][::-1]:
pred.append(layer.map_a2b(pred[-1]))
Expand Down
Loading

0 comments on commit fc0ed56

Please sign in to comment.