Merge pull request #128 from NiklasMelton/issue-127-improve-prepare-data

Issue-227 improve error message and add helper functions
NiklasMelton · Jan 10, 2025 · fc0ed56 · fc0ed56
2 parents 2610414 + 6442d95
commit fc0ed56
Show file tree

Hide file tree

Showing 8 changed files with 253 additions and 41 deletions.
diff --git a/README.md b/README.md
@@ -76,17 +76,25 @@ Here are some quick examples to get you started with AdaptiveResonanceLib:
 ```python
 from artlib import FuzzyART
 import numpy as np
+from tensorflow.keras.datasets import mnist
 
-# Your dataset
-train_X = np.array([...]) # shape (n_samples, n_features)
-test_X = np.array([...])
+# Load the MNIST dataset
+n_dim = 28*28
+(X_train, _), (X_test, _) = mnist.load_data()
+X_train = X_train.reshape((-1, n_dim)) # flatten images
+X_test = X_test.reshape((-1, n_dim))
 
 # Initialize the Fuzzy ART model
 model = FuzzyART(rho=0.7, alpha = 0.0, beta=1.0)
 
+# (Optional) Tell the model the data limits for normalization
+lower_bounds = np.array([0.]*n_dim)
+upper_bounds = np.array([255.]*n_dim)
+model.set_data_bounds(lower_bounds, upper_bounds)
+
 # Prepare Data
-train_X_prep = model.prepare_data(train_X)
-test_X_prep = model.prepare_data(test_X)
+train_X_prep = model.prepare_data(X_train)
+test_X_prep = model.prepare_data(X_test)
 
 # Fit the model
 model.fit(train_X_prep)
@@ -100,25 +108,32 @@ predictions = model.predict(test_X_prep)
 ```python
 from artlib import GaussianART, SimpleARTMAP
 import numpy as np
+from tensorflow.keras.datasets import mnist
 
-# Your dataset
-train_X = np.array([...]) # shape (n_samples, n_features)
-train_y = np.array([...]) # shape (n_samples, ), must be integers
-test_X = np.array([...])
+# Load the MNIST dataset
+n_dim = 28*28
+(X_train, y_train), (X_test, y_test) = mnist.load_data()
+X_train = X_train.reshape((-1, n_dim)) # flatten images
+X_test = X_test.reshape((-1, n_dim))
 
 # Initialize the Gaussian ART model
-sigma_init = np.array([0.5]*train_X.shape[1]) # variance estimate for each feature
+sigma_init = np.array([0.5]*X_train.shape[1]) # variance estimate for each feature
 module_a = GaussianART(rho=0.0, sigma_init=sigma_init)
 
+# (Optional) Tell the model the data limits for normalization
+lower_bounds = np.array([0.]*n_dim)
+upper_bounds = np.array([255.]*n_dim)
+module_a.set_data_bounds(lower_bounds, upper_bounds)
+
 # Initialize the SimpleARTMAP model
 model = SimpleARTMAP(module_a=module_a)
 
 # Prepare Data
-train_X_prep = model.prepare_data(train_X)
-test_X_prep = model.prepare_data(test_X)
+train_X_prep = model.prepare_data(X_train)
+test_X_prep = model.prepare_data(X_test)
 
 # Fit the model
-model.fit(train_X_prep, train_y)
+model.fit(train_X_prep, y_train)
 
 # Predict data labels
 predictions = model.predict(test_X_prep)
@@ -131,22 +146,22 @@ from artlib import FuzzyART, HypersphereART, FusionART
 import numpy as np
 
 # Your dataset
-train_X = np.array([...]) # shape (n_samples, n_features_X)
-train_y = np.array([...]) # shape (n_samples, n_features_y)
+X_train = np.array([...]) # shape (n_samples, n_features_X)
+y_train = np.array([...]) # shape (n_samples, n_features_y)
 test_X = np.array([...])
 
 # Initialize the Fuzzy ART model
 module_x = FuzzyART(rho=0.0, alpha = 0.0, beta=1.0)
 
 # Initialize the Hypersphere ART model
-r_hat = 0.5*np.sqrt(train_X.shape[1]) # no restriction on hyperpshere size
+r_hat = 0.5*np.sqrt(X_train.shape[1]) # no restriction on hypersphere size
 module_y = HypersphereART(rho=0.0, alpha = 0.0, beta=1.0, r_hat=r_hat)
 
 # Initialize the FusionARTMAP model
 gamma_values = [0.5, 0.5] # equal weight to both channels
 channel_dims = [
-  2*train_X.shape[1], # fuzzy ART complement codes data so channel dim is 2*n_features
-  train_y.shape[1]
+  2*X_train.shape[1], # fuzzy ART complement codes data so channel dim is 2*n_features
+  y_train.shape[1]
 ]
 model = FusionART(
   modules=[module_x, module_y],
@@ -155,18 +170,48 @@ model = FusionART(
 )
 
 # Prepare Data
-train_Xy = model.join_channel_data(channel_data=[train_X, train_y])
+train_Xy = model.join_channel_data(channel_data=[X_train, y_train])
 train_Xy_prep = model.prepare_data(train_Xy)
-test_Xy = model.join_channel_data(channel_data=[train_X], skip_channels=[1])
+test_Xy = model.join_channel_data(channel_data=[X_train], skip_channels=[1])
 test_Xy_prep = model.prepare_data(test_Xy)
 
 # Fit the model
-model.fit(train_X_prep, train_y)
+model.fit(train_Xy_prep)
+
+# Predict y-channel values and clip X values outside previously observed ranges
+pred_y = model.predict_regression(test_Xy_prep, target_channels=[1], clip=True)
+```
 
-# Predict y-channel values
-pred_y = model.predict_regression(test_Xy_prep, target_channels=[1])
+### Data Normalization
+
+AdaptiveResonanceLib models require feature data to be normalized to the range
+0.0 to 1.0, inclusive. This requires identifying the boundaries of the data space.
+
+If the first batch of your training data is representative of the entire data space,
+you don't need to do anything and artlib will identify the data bounds automatically.
+However, this will often not be sufficient, and one of the following workarounds
+will be needed:
+
+Users can manually set the bounds using the following code snippet or similar:
+```python
+# Set the boundaries of your data for normalization
+lower_bounds = np.array([0.]*n_features)
+upper_bounds = np.array([1.]*n_features)
+model.set_data_bounds(lower_bounds, upper_bounds)
 ```
 
+Or users can compute the bounds from all batches of data and then pass
+them to the model:
+```python
+# Find the boundaries of your data for normalization and apply them
+lower_bounds, upper_bounds = model.find_data_bounds(train_X, test_X)
+model.set_data_bounds(lower_bounds, upper_bounds)
+```
+
+If only the boundaries of your testing data are unknown, you can call
+`model.predict()` with `clip=True` to clip testing data to the bounds seen during
+training. Only use this if you understand what you are doing.
+
 <!-- END quick-start -->
 
 <!-- START documentation -->

diff --git a/artlib/common/BaseART.py b/artlib/common/BaseART.py
@@ -28,6 +28,7 @@ def __init__(self, params: Dict):
         self.weight_sample_counter_: List[int] = []
         self.d_min_ = None
         self.d_max_ = None
+        self.is_fitted_ = False
 
     def __getattr__(self, key):
         if key in self.params:
@@ -106,6 +107,46 @@ def set_params(self, **params):
         self.validate_params(local_params)
         return self
 
+    def set_data_bounds(self, lower_bounds: np.ndarray, upper_bounds: np.ndarray):
+        """Manually set the data bounds for normalization.
+
+        Parameters
+        ----------
+        lower_bounds : np.ndarray
+            The lower bounds for each column.
+
+        upper_bounds : np.ndarray
+            The upper bounds for each column.
+
+        """
+        if self.is_fitted_:
+            raise ValueError("Cannot change data limits after fit.")
+        self.d_min_ = lower_bounds
+        self.d_max_ = upper_bounds
+
+    def find_data_bounds(
+        self, *data_batches: list[np.ndarray]
+    ) -> Tuple[np.ndarray, np.ndarray]:
+        """Automatically find the data bounds for normalization from a list of data
+        batches.
+
+        Parameters
+        ----------
+        *data_batches : list[np.ndarray]
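+            Batches of data, one array per positional argument. The batches
+            are stacked row-wise, so all must share the same number of
+            columns. The bounds are only returned, not stored on the model.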
+
+        Returns
+        -------
+        tuple[np.ndarray, np.ndarray]
+            Lower and upper bounds for data.
+
+        """
+        all_data = np.vstack(data_batches)
+        lower_bounds = np.min(all_data, axis=0)
+        upper_bounds = np.max(all_data, axis=0)
+
+        return lower_bounds, upper_bounds
+
     def prepare_data(self, X: np.ndarray) -> np.ndarray:
         """Prepare data for clustering.
 
@@ -187,8 +228,23 @@ def validate_data(self, X: np.ndarray):
         - X: data set
 
         """
-        assert np.all(X >= 0), "Data has not been normalized"
-        assert np.all(X <= 1.0), "Data has not been normalized"
+        normalization_message = (
+            "Data has not been normalized or was not normalized "
+            "correctly. All values must fall between 0 and 1, "
+            "inclusively."
+        )
+        if self.is_fitted_:
+            normalization_message += (
+                "\nThis appears to not be the first batch of "
+                "data. Data boundaries must be calculated for "
+                "the entire data space. Prior to fitting, use "
+                "BaseART.set_data_bounds() to manually set the "
+                "bounds for your data or use "
+                "BaseART.find_data_bounds() to identify the "
+                "bounds automatically for multiple batches."
+            )
+        assert np.all(X >= 0), normalization_message
+        assert np.all(X <= 1.0), normalization_message
         self.check_dimensions(X)
 
     def category_choice(
@@ -742,13 +798,15 @@ def fit_gif(
             self.post_fit(X)
             return self
 
-    def predict(self, X: np.ndarray) -> np.ndarray:
+    def predict(self, X: np.ndarray, clip: bool = False) -> np.ndarray:
         """Predict labels for the data.
 
         Parameters
         ----------
         X : np.ndarray
             The dataset.
+        clip : bool
+            clip the input values to be between the previously seen data limits
 
         Returns
         -------
@@ -757,6 +815,8 @@ def predict(self, X: np.ndarray) -> np.ndarray:
 
         """
         check_is_fitted(self)
+        if clip:
+            X = np.clip(X, self.d_min_, self.d_max_)
         self.validate_data(X)
         self.check_dimensions(X)
 

diff --git a/artlib/common/BaseARTMAP.py b/artlib/common/BaseARTMAP.py
@@ -138,13 +138,15 @@ def partial_fit(
         """
         raise NotImplementedError
 
-    def predict(self, X: np.ndarray) -> np.ndarray:
+    def predict(self, X: np.ndarray, clip: bool = False) -> np.ndarray:
         """Predict labels for the data.
 
         Parameters
         ----------
         X : np.ndarray
             Dataset A.
+        clip : bool
+            clip the input values to be between the previously seen data limits
 
         Returns
         -------
@@ -154,13 +156,17 @@ def predict(self, X: np.ndarray) -> np.ndarray:
         """
         raise NotImplementedError
 
-    def predict_ab(self, X: np.ndarray) -> tuple[np.ndarray, np.ndarray]:
+    def predict_ab(
+        self, X: np.ndarray, clip: bool = False
+    ) -> tuple[np.ndarray, np.ndarray]:
         """Predict labels for the data, both A-side and B-side.
 
         Parameters
         ----------
         X : np.ndarray
             Dataset A.
+        clip : bool
+            clip the input values to be between the previously seen data limits
 
         Returns
         -------

diff --git a/artlib/fusion/FusionART.py b/artlib/fusion/FusionART.py
@@ -528,13 +528,17 @@ def step_pred(self, x, skip_channels: List[int] = []) -> int:
         c_ = int(np.argmax(T))
         return c_
 
-    def predict(self, X: np.ndarray, skip_channels: List[int] = []) -> np.ndarray:
+    def predict(
+        self, X: np.ndarray, clip: bool = False, skip_channels: List[int] = []
+    ) -> np.ndarray:
         """Predict labels for the input data.
 
         Parameters
         ----------
         X : np.ndarray
             Input dataset.
+        clip : bool
+            clip the input values to be between the previously seen data limits
         skip_channels : list of int, optional
             Channels to skip (default is []).
 
@@ -545,6 +549,8 @@ def predict(self, X: np.ndarray, skip_channels: List[int] = []) -> np.ndarray:
 
         """
         check_is_fitted(self)
+        if clip:
+            X = np.clip(X, self.d_min_, self.d_max_)
         self.validate_data(X)
         self.check_dimensions(X)
 
@@ -673,14 +679,16 @@ def get_channel_centers(self, channel: int) -> List[np.ndarray]:
         return self.modules[channel].get_cluster_centers()
 
     def predict_regression(
-        self, X: np.ndarray, target_channels: List[int] = [-1]
+        self, X: np.ndarray, clip: bool = False, target_channels: List[int] = [-1]
     ) -> Union[np.ndarray, List[np.ndarray]]:
         """Predict regression values for the input data using the target channels.
 
         Parameters
         ----------
         X : np.ndarray
             Input dataset.
+        clip : bool
+            clip the input values to be between the previously seen data limits
         target_channels : list of int, optional
             List of target channels to use for regression. If negative values are used,
             they are considered as channels counting backward from the last channel.
@@ -695,7 +703,7 @@ def predict_regression(
 
         """
         target_channels = [self.n + k if k < 0 else k for k in target_channels]
-        C = self.predict(X, skip_channels=target_channels)
+        C = self.predict(X, clip=clip, skip_channels=target_channels)
         centers = [self.get_channel_centers(k) for k in target_channels]
         if len(target_channels) == 1:
             return np.array([centers[0][c] for c in C])

diff --git a/artlib/hierarchical/DeepARTMAP.py b/artlib/hierarchical/DeepARTMAP.py
@@ -410,13 +410,17 @@ def partial_fit(
             x_i += 1
         return self
 
-    def predict(self, X: Union[np.ndarray, list[np.ndarray]]) -> list[np.ndarray]:
+    def predict(
+        self, X: Union[np.ndarray, list[np.ndarray]], clip: bool = False
+    ) -> list[np.ndarray]:
         """Predict the labels for the input data.
 
         Parameters
         ----------
         X : np.ndarray or list of np.ndarray
             The input data set for prediction.
+        clip : bool
+            clip the input values to be between the previously seen data limits
 
         Returns
         -------
@@ -428,7 +432,7 @@ def predict(self, X: Union[np.ndarray, list[np.ndarray]]) -> list[np.ndarray]:
             x = X[-1]
         else:
             x = X
-        pred_a, pred_b = self.layers[-1].predict_ab(x)
+        pred_a, pred_b = self.layers[-1].predict_ab(x, clip=clip)
         pred = [pred_a, pred_b]
         for layer in self.layers[:-1][::-1]:
             pred.append(layer.map_a2b(pred[-1]))