From ad6d979e32f3d850e3ec74c95912246b4af554f4 Mon Sep 17 00:00:00 2001
From: Daniel <dcolinmorgan@gmail.com>
Date: Thu, 29 Feb 2024 17:43:00 +0800
Subject: [PATCH] (fix) cuml CountVectorizer: enforce string length lower limit (>3 chars)

---
 cu_cat/_gap_encoder.py | 25 +++++++++++++++++++------
 1 file changed, 19 insertions(+), 6 deletions(-)

diff --git a/cu_cat/_gap_encoder.py b/cu_cat/_gap_encoder.py
index c44417977..5dd41f72f 100644
--- a/cu_cat/_gap_encoder.py
+++ b/cu_cat/_gap_encoder.py
@@ -200,6 +200,7 @@ def _init_vars(self, X) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
         the topics W.
         """
         self.Xt_ = df_type(X)
+        X = X[X.str.len() >3]  # cudf CV has trouble with shorter strings
         # if deps.cudf and parse_version(cuml.__version__) > parse_version("23.04"):
             # X.apply(lambda x: str((x)).zfill(4)) ## need at least >3 chars for gap encoder
         # cuml.set_global_output_type('cupy')
@@ -238,8 +239,12 @@ def _init_vars(self, X) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
             unq_X, lookup = np.unique(X.astype(str), return_inverse=True)
         elif 'cudf' in str(getmodule(X)) and 'cuml' in self.engine:
             unq_X = X.unique()
-            tmp, lookup = np.unique(X.to_arrow(), return_inverse=True)
-        unq_V = self.ngrams_count_.fit_transform(unq_X)
+            tmp, lookup = np.unique(X.to_pandas(), return_inverse=True)
+        try:
+            unq_V = self.ngrams_count_.fit_transform(unq_X)
+        except IndexError:
+            unq_X = unq_X[unq_X.str.len() > 3]  # cuml CV has trouble with shorter strings
+            unq_V = self.ngrams_count_.fit_transform(unq_X)
         if self.add_words:  # Add word counts to unq_V
             unq_V2 = self.word_count_.fit_transform(unq_X)
             unq_V = sparse.hstack((unq_V, unq_V2), format="csr")
@@ -349,6 +354,7 @@ def fit(self, X, y=None) -> "GapEncoderColumn":
         # Check if first item has str or np.str_ type
 
         self.Xt_= df_type(X)
+        X = X[X.str.len() >3]
         # Make n-grams counts matrix unq_V
         # if deps.cudf and parse_version(cuml.__version__) > parse_version("23.04"):
         #     X = X.replace('nan',np.nan).fillna('0o0o0')
@@ -540,6 +546,7 @@ def _add_unseen_keys_to_H_dict(self, X) -> None:
             unseen_X = cudf.Series(unseen_X)
         else:
             unseen_X = np.setdiff1d(X.astype(str), np.array([*self.H_dict_]))
+        
         if unseen_X.size > 0:
             unseen_V = self.ngrams_count_.transform(unseen_X)
             if self.add_words:
@@ -570,6 +577,7 @@ def transform(self, X) -> np.array:
             Transformed input.
         """
         t = time()
+        X = X[X.str.len() >3]
         check_is_fitted(self, "H_dict_")
         # Check if first item has str or np.str_ type
         # if deps.cudf and parse_version(cuml.__version__) > parse_version("23.04"):
@@ -581,7 +589,12 @@ def transform(self, X) -> np.array:
             unq_X = X.unique()
             self.gmem = get_gpu_memory()[0]
         # Build the n-grams counts matrix V for the string data to encode
-        unq_V = self.ngrams_count_.transform(unq_X)#.astype(str))
+        try:
+            unq_V = self.ngrams_count_.transform(unq_X)
+        except IndexError:
+            unq_X = unq_X[unq_X.str.len() > 3]  # cuml CV has trouble with shorter strings
+            unq_V = self.ngrams_count_.transform(unq_X)
+        # unq_V = self.ngrams_count_.transform(unq_X)#.astype(str))
         if self.add_words:  # Add words counts
             unq_V2 = self.word_count_.transform(unq_X.astype(str))
             unq_V = sparse.hstack((unq_V, unq_V2), format="csr")
@@ -950,7 +963,6 @@ def fit(self, X, y=None) -> "GapEncoder":
         :class:`~cu_cat.GapEncoder`
             Fitted :class:`~cu_cat.GapEncoder` instance (self).
         """
-
         X, y = make_safe_gpu_dataframes(X, None, self.engine)
 
         # Check that n_samples >= n_components
@@ -1013,6 +1025,7 @@ def transform(self, X) -> np.array:
         """
         check_is_fitted(self, "fitted_models_")
         # Check input data shape
+
         X = check_input(X)
         X = self._handle_missing(X)
         X_enc = []
@@ -1115,7 +1128,7 @@ def _multiplicative_update_w(
         W = cp.multiply(A, cp.reciprocal(B))
         if rescale_W:
             _rescale_W(W, A)
-        gc.collect()
+        # gc.collect()
 
     else:
         try:
@@ -1164,7 +1177,7 @@ def _multiplicative_update_w_smallfast(
         if rescale_W:
             _rescale_W(W, A)
         del C,R,T,Ht,Vt
-        gc.collect()
+        # gc.collect()
         cp._default_memory_pool.free_all_blocks()
 
     else: