fix: preprocess output format & some spelling mistakes (#358)

* spaceship: format y as pd.Series; fix a misspelled parameter name in xgb
* s3e11: format y as pd.Series
* spaceship: format of y & nn fit
* spaceship: fix misspelled parameter name in xgb
* ci issue
microsoft · Sep 26, 2024 · b8b2cd6 · b8b2cd6
1 parent b054017
commit b8b2cd6
Show file tree

Hide file tree

Showing 6 changed files with 18 additions and 4 deletions.
diff --git a/rdagent/scenarios/kaggle/experiment/playground-series-s3e11_template/fea_share_preprocess.py b/rdagent/scenarios/kaggle/experiment/playground-series-s3e11_template/fea_share_preprocess.py
@@ -16,6 +16,8 @@ def preprocess_script():
         y_valid = pd.read_pickle("/kaggle/input/y_valid.pkl")
         X_test = pd.read_pickle("/kaggle/input/X_test.pkl")
         others = pd.read_pickle("/kaggle/input/others.pkl")
+        y_train = pd.Series(y_train).reset_index(drop=True)
+        y_valid = pd.Series(y_valid).reset_index(drop=True)
 
         return X_train, X_valid, y_train, y_valid, X_test, *others
 
@@ -38,6 +40,8 @@ def preprocess_script():
     X_train, X_valid, y_train, y_valid = train_test_split(
         train[most_important_features], train["log_cost"], test_size=0.2, random_state=2023
     )
+    y_train = pd.Series(y_train).reset_index(drop=True)
+    y_valid = pd.Series(y_valid).reset_index(drop=True)
 
     # test
     test = pd.read_csv("/kaggle/input/test.csv")

diff --git a/rdagent/scenarios/kaggle/experiment/playground-series-s4e8_template/fea_share_preprocess.py b/rdagent/scenarios/kaggle/experiment/playground-series-s4e8_template/fea_share_preprocess.py
@@ -79,12 +79,16 @@ def preprocess_script():
         y_valid = pd.read_pickle("/kaggle/input/y_valid.pkl")
         X_test = pd.read_pickle("/kaggle/input/X_test.pkl")
         others = pd.read_pickle("/kaggle/input/others.pkl")
+        y_train = pd.Series(y_train).reset_index(drop=True)
+        y_valid = pd.Series(y_valid).reset_index(drop=True)
 
         return X_train, X_valid, y_train, y_valid, X_test, *others
     X_train, X_valid, y_train, y_valid = prepreprocess()
 
     # Fit the preprocessor on the training data
     preprocessor, numerical_cols, categorical_cols = preprocess_fit(X_train)
+    y_train = pd.Series(y_train).reset_index(drop=True)
+    y_valid = pd.Series(y_valid).reset_index(drop=True)
 
     # Preprocess the train, validation, and test data
     X_train = preprocess_transform(X_train, preprocessor, numerical_cols, categorical_cols)

diff --git a/rdagent/scenarios/kaggle/experiment/playground-series-s4e8_template/model/model_xgboost.py b/rdagent/scenarios/kaggle/experiment/playground-series-s4e8_template/model/model_xgboost.py
@@ -20,7 +20,7 @@ def fit(X_train: pd.DataFrame, y_train: pd.DataFrame, X_valid: pd.DataFrame, y_v
 
     # TODO: for quick running....
     params = {
-        "nthred": -1,
+        "nthread": -1,
     }
     num_round = 180
 

diff --git a/rdagent/scenarios/kaggle/experiment/spaceship-titanic_template/fea_share_preprocess.py b/rdagent/scenarios/kaggle/experiment/spaceship-titanic_template/fea_share_preprocess.py
@@ -85,9 +85,13 @@ def preprocess_script():
         y_valid = pd.read_pickle("/kaggle/input/y_valid.pkl")
         X_test = pd.read_pickle("/kaggle/input/X_test.pkl")
         others = pd.read_pickle("/kaggle/input/others.pkl")
+        y_train = pd.Series(y_train).reset_index(drop=True)
+        y_valid = pd.Series(y_valid).reset_index(drop=True)
 
         return X_train, X_valid, y_train, y_valid, X_test, *others
     X_train, X_valid, y_train, y_valid = prepreprocess()
+    y_train = pd.Series(y_train).reset_index(drop=True)
+    y_valid = pd.Series(y_valid).reset_index(drop=True)
 
     # Fit the preprocessor on the training data
     preprocessor, label_encoders = preprocess_fit(X_train)

diff --git a/rdagent/scenarios/kaggle/experiment/spaceship-titanic_template/model/model_nn.py b/rdagent/scenarios/kaggle/experiment/spaceship-titanic_template/model/model_nn.py
@@ -37,10 +37,12 @@ def fit(X_train, y_train, X_valid, y_valid):
 
     # Convert to TensorDataset and create DataLoader
     train_dataset = TensorDataset(
-        torch.tensor(X_train.to_numpy(), dtype=torch.float32), torch.tensor(y_train.reshape(-1), dtype=torch.float32)
+        torch.tensor(X_train.to_numpy(), dtype=torch.float32),
+        torch.tensor(y_train.to_numpy().reshape(-1), dtype=torch.float32),
     )
     valid_dataset = TensorDataset(
-        torch.tensor(X_valid.to_numpy(), dtype=torch.float32), torch.tensor(y_valid.reshape(-1), dtype=torch.float32)
+        torch.tensor(X_valid.to_numpy(), dtype=torch.float32),
+        torch.tensor(y_valid.to_numpy().reshape(-1), dtype=torch.float32),
     )
     train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
     valid_loader = DataLoader(valid_dataset, batch_size=32, shuffle=False)

diff --git a/rdagent/scenarios/kaggle/experiment/spaceship-titanic_template/model/model_xgboost.py b/rdagent/scenarios/kaggle/experiment/spaceship-titanic_template/model/model_xgboost.py
@@ -20,7 +20,7 @@ def fit(X_train: pd.DataFrame, y_train: pd.DataFrame, X_valid: pd.DataFrame, y_v
 
     # TODO: for quick running....
     params = {
-        "nthred": -1,
+        "nthread": -1,
     }
     num_round = 100