You signed in with another tab or window. Reload to refresh your session. You signed out in another tab or window. Reload to refresh your session. You switched accounts on another tab or window. Reload to refresh your session. Dismiss alert
When a preprocessing pipeline is run on a text that ends up empty after preprocessing, it raises an IndexError.
This is because Preprocessor.run uses sklearn Pipeline().fit_transform instead of Pipeline().transform. At first sight, each call to Preprocessor().run has one string as input, so there is no need to fit: transforming should be enough.
🔬 How To Reproduce
Steps to reproduce the behavior:
text = "@tweeteruser 😢"
Preprocessor().run(text)
Should get empty string, but raises IndexError
Environment
OS: macOS
Python version: 3.10
Screenshots
IndexError Traceback (most recent call last)
Cell In[65], line 11
9 text = df.loc[5, "rawContent"]
10 text = "@fannyguinochet 😢"
---> 11 Preprocessor().run(text)
File ~/miniconda3/envs/subway-failure-prediction/lib/python3.10/site-packages/nlpretext/preprocessor.py:86, in Preprocessor.run(self, text)
82 operations = [
83 {"operation": operation, "args": None} for operation in operations_to_pipe
84 ]
85 self.pipeline = self.build_pipeline(operations)
---> 86 text = self.pipeline.fit_transform(text)
87 return text
File ~/miniconda3/envs/subway-failure-prediction/lib/python3.10/site-packages/sklearn/pipeline.py:437, in Pipeline.fit_transform(self, X, y, **fit_params)
410 """Fit the model and transform with the final estimator.
411
412 Fits all the transformers one after the other and transform the
(...)
434 Transformed samples.
435 """
436 fit_params_steps = self._check_fit_params(**fit_params)
--> 437 Xt = self._fit(X, y, **fit_params_steps)
439 last_step = self._final_estimator
440 with _print_elapsed_time("Pipeline", self._log_message(len(self.steps) - 1)):
File ~/miniconda3/envs/subway-failure-prediction/lib/python3.10/site-packages/sklearn/pipeline.py:359, in Pipeline._fit(self, X, y, **fit_params_steps)
357 cloned_transformer = clone(transformer)
358 # Fit or load from cache the current transformer
--> 359 X, fitted_transformer = fit_transform_one_cached(
360 cloned_transformer,
361 X,
362 y,
363 None,
364 message_clsname="Pipeline",
365 message=self._log_message(step_idx),
366 **fit_params_steps[name],
367 )
368 # Replace the transformer of the step with the fitted
369 # transformer. This is necessary when loading the transformer
370 # from the cache.
371 self.steps[step_idx] = (name, fitted_transformer)
File ~/miniconda3/envs/subway-failure-prediction/lib/python3.10/site-packages/sklearn/pipeline.py:893, in _fit_transform_one(transformer, X, y, weight, message_clsname, message, **fit_params)
891 with _print_elapsed_time(message_clsname, message):
892 if hasattr(transformer, "fit_transform"):
--> 893 res = transformer.fit_transform(X, y, **fit_params)
894 else:
895 res = transformer.fit(X, y, **fit_params).transform(X)
File ~/miniconda3/envs/subway-failure-prediction/lib/python3.10/site-packages/sklearn/utils/_set_output.py:142, in _wrap_method_output.<locals>.wrapped(self, X, *args, **kwargs)
140 @wraps(f)
141 def wrapped(self, X, *args, **kwargs):
--> 142 data_to_wrap = f(self, X, *args, **kwargs)
143 if isinstance(data_to_wrap, tuple):
144 # only wrap the first output for cross decomposition
145 return (
146 _wrap_data_with_container(method, data_to_wrap[0], X, self),
147 *data_to_wrap[1:],
148 )
File ~/miniconda3/envs/subway-failure-prediction/lib/python3.10/site-packages/sklearn/base.py:859, in TransformerMixin.fit_transform(self, X, y, **fit_params)
855 # non-optimized default implementation; override when a better
856 # method is possible for a given clustering algorithm
857 if y is None:
858 # fit method of arity 1 (unsupervised transformation)
--> 859 return self.fit(X, **fit_params).transform(X)
860 else:
861 # fit method of arity 2 (supervised transformation)
862 return self.fit(X, y, **fit_params).transform(X)
File ~/miniconda3/envs/subway-failure-prediction/lib/python3.10/site-packages/sklearn/preprocessing/_function_transformer.py:217, in FunctionTransformer.fit(self, X, y)
199 """Fit transformer by checking X.
200
201 If validate is True, X will be checked.
(...)
214 FunctionTransformer class instance.
215 """
216 self._validate_params()
--> 217 X = self._check_input(X, reset=True)
218 if self.check_inverse and not (self.func is None or self.inverse_func is None):
219 self._check_inverse_transform(X)
File ~/miniconda3/envs/subway-failure-prediction/lib/python3.10/site-packages/sklearn/preprocessing/_function_transformer.py:168, in FunctionTransformer._check_input(self, X, reset)
163 return self._validate_data(X, accept_sparse=self.accept_sparse, reset=reset)
164 elif reset:
165 # Set feature_names_in_ and n_features_in_ even if validate=False
166 # We run this only when reset==True to store the attributes but not
167 # validate them, because validate=False
--> 168 self._check_n_features(X, reset=reset)
169 self._check_feature_names(X, reset=reset)
170 return X
File ~/miniconda3/envs/subway-failure-prediction/lib/python3.10/site-packages/sklearn/base.py:347, in BaseEstimator._check_n_features(self, X, reset)
330 """Set the n_features_in_ attribute, or check against it.
331
332 Parameters
(...)
344 should set reset=False.
345 """
346 try:
--> 347 n_features = _num_features(X)
348 except TypeError as e:
349 if not reset and hasattr(self, "n_features_in_"):
File ~/miniconda3/envs/subway-failure-prediction/lib/python3.10/site-packages/sklearn/utils/validation.py:304, in _num_features(X)
301 raise TypeError(message)
302 return X.shape[1]
--> 304 first_sample = X[0]
306 # Do not consider an array-like of strings or dicts to be a 2D array
307 if isinstance(first_sample, (str, bytes, dict)):
IndexError: string index out of range
📈 Expected behavior
📎 Additional context
The text was updated successfully, but these errors were encountered:
🐛 Bug Report
When a preprocessing pipeline is run on a text that ends up empty after preprocessing, it raises an IndexError.
This is because Preprocessor.run uses sklearn Pipeline().fit_transform instead of Pipeline().transform. At first sight, each call to Preprocessor().run has one string as input, so there is no need to fit: transforming should be enough.
🔬 How To Reproduce
Steps to reproduce the behavior:
Should get empty string, but raises IndexError
Environment
Screenshots
IndexError Traceback (most recent call last)
Cell In[65], line 11
9 text = df.loc[5, "rawContent"]
10 text = "@fannyguinochet 😢"
---> 11 Preprocessor().run(text)
File ~/miniconda3/envs/subway-failure-prediction/lib/python3.10/site-packages/nlpretext/preprocessor.py:86, in Preprocessor.run(self, text)
82 operations = [
83 {"operation": operation, "args": None} for operation in operations_to_pipe
84 ]
85 self.pipeline = self.build_pipeline(operations)
---> 86 text = self.pipeline.fit_transform(text)
87 return text
File ~/miniconda3/envs/subway-failure-prediction/lib/python3.10/site-packages/sklearn/pipeline.py:437, in Pipeline.fit_transform(self, X, y, **fit_params)
410 """Fit the model and transform with the final estimator.
411
412 Fits all the transformers one after the other and transform the
(...)
434 Transformed samples.
435 """
436 fit_params_steps = self._check_fit_params(**fit_params)
--> 437 Xt = self._fit(X, y, **fit_params_steps)
439 last_step = self._final_estimator
440 with _print_elapsed_time("Pipeline", self._log_message(len(self.steps) - 1)):
File ~/miniconda3/envs/subway-failure-prediction/lib/python3.10/site-packages/sklearn/pipeline.py:359, in Pipeline._fit(self, X, y, **fit_params_steps)
357 cloned_transformer = clone(transformer)
358 # Fit or load from cache the current transformer
--> 359 X, fitted_transformer = fit_transform_one_cached(
360 cloned_transformer,
361 X,
362 y,
363 None,
364 message_clsname="Pipeline",
365 message=self._log_message(step_idx),
366 **fit_params_steps[name],
367 )
368 # Replace the transformer of the step with the fitted
369 # transformer. This is necessary when loading the transformer
370 # from the cache.
371 self.steps[step_idx] = (name, fitted_transformer)
File ~/miniconda3/envs/subway-failure-prediction/lib/python3.10/site-packages/joblib/memory.py:349, in NotMemorizedFunc.__call__(self, *args, **kwargs)
348 def __call__(self, *args, **kwargs):
--> 349 return self.func(*args, **kwargs)
File ~/miniconda3/envs/subway-failure-prediction/lib/python3.10/site-packages/sklearn/pipeline.py:893, in _fit_transform_one(transformer, X, y, weight, message_clsname, message, **fit_params)
891 with _print_elapsed_time(message_clsname, message):
892 if hasattr(transformer, "fit_transform"):
--> 893 res = transformer.fit_transform(X, y, **fit_params)
894 else:
895 res = transformer.fit(X, y, **fit_params).transform(X)
File ~/miniconda3/envs/subway-failure-prediction/lib/python3.10/site-packages/sklearn/utils/_set_output.py:142, in _wrap_method_output.<locals>.wrapped(self, X, *args, **kwargs)
140 @wraps(f)
141 def wrapped(self, X, *args, **kwargs):
--> 142 data_to_wrap = f(self, X, *args, **kwargs)
143 if isinstance(data_to_wrap, tuple):
144 # only wrap the first output for cross decomposition
145 return (
146 _wrap_data_with_container(method, data_to_wrap[0], X, self),
147 *data_to_wrap[1:],
148 )
File ~/miniconda3/envs/subway-failure-prediction/lib/python3.10/site-packages/sklearn/base.py:859, in TransformerMixin.fit_transform(self, X, y, **fit_params)
855 # non-optimized default implementation; override when a better
856 # method is possible for a given clustering algorithm
857 if y is None:
858 # fit method of arity 1 (unsupervised transformation)
--> 859 return self.fit(X, **fit_params).transform(X)
860 else:
861 # fit method of arity 2 (supervised transformation)
862 return self.fit(X, y, **fit_params).transform(X)
File ~/miniconda3/envs/subway-failure-prediction/lib/python3.10/site-packages/sklearn/preprocessing/_function_transformer.py:217, in FunctionTransformer.fit(self, X, y)
199 """Fit transformer by checking X.
200
201 If validate is True, X will be checked.
(...)
214 FunctionTransformer class instance.
215 """
216 self._validate_params()
--> 217 X = self._check_input(X, reset=True)
218 if self.check_inverse and not (self.func is None or self.inverse_func is None):
219 self._check_inverse_transform(X)
File ~/miniconda3/envs/subway-failure-prediction/lib/python3.10/site-packages/sklearn/preprocessing/_function_transformer.py:168, in FunctionTransformer._check_input(self, X, reset)
163 return self._validate_data(X, accept_sparse=self.accept_sparse, reset=reset)
164 elif reset:
165 # Set feature_names_in_ and n_features_in_ even if validate=False
166 # We run this only when reset==True to store the attributes but not
167 # validate them, because validate=False
--> 168 self._check_n_features(X, reset=reset)
169 self._check_feature_names(X, reset=reset)
170 return X
File ~/miniconda3/envs/subway-failure-prediction/lib/python3.10/site-packages/sklearn/base.py:347, in BaseEstimator._check_n_features(self, X, reset)
330 """Set the n_features_in_ attribute, or check against it.
331
332 Parameters
(...)
344 should set reset=False.
345 """
346 try:
--> 347 n_features = _num_features(X)
348 except TypeError as e:
349 if not reset and hasattr(self, "n_features_in_"):
File ~/miniconda3/envs/subway-failure-prediction/lib/python3.10/site-packages/sklearn/utils/validation.py:304, in _num_features(X)
301 raise TypeError(message)
302 return X.shape[1]
--> 304 first_sample = X[0]
306 # Do not consider an array-like of strings or dicts to be a 2D array
307 if isinstance(first_sample, (str, bytes, dict)):
IndexError: string index out of range
📈 Expected behavior
📎 Additional context
The text was updated successfully, but these errors were encountered: