From da752ec806e6f5f5679bc27ac1c072ed9a319251 Mon Sep 17 00:00:00 2001 From: Haoran Pan <167847254+TPLin22@users.noreply.github.com> Date: Sun, 22 Sep 2024 22:11:29 +0800 Subject: [PATCH] feat: add more templates for kaggle (#291) * init for forest-cover-type-prediction * add nn model for forest-cover-type-prediction * add cross_validation for forest-cover-type-prediction * edit path to file * CI issues * CI Issue * edit dir name * fix a bug in s4e8 ensemble & init spaceship-titanic * add nn model for s4e8 & spaceship-titanic * init for s4e9 * ci issues * ci issue --- .../cross_validation.py | 101 ++++++++++++++++ .../fea_share_preprocess.py | 72 ++++++++++++ .../feature/feature.py | 23 ++++ .../model/model_nn.py | 78 ++++++++++++ .../model/model_randomforest.py | 53 +++++++++ .../model/model_xgboost.py | 41 +++++++ .../train.py | 104 ++++++++++++++++ .../cross_validation_tpl.py | 87 -------------- .../model/model_nn.py | 76 ++++++++++++ .../model/model_xgboost.py | 2 +- .../playground-series-s4e8_template/train.py | 21 +--- .../fea_share_preprocess.py | 108 +++++++++++++++++ .../feature/feature.py | 23 ++++ .../model/model_nn.py | 78 ++++++++++++ .../model/model_randomforest.py | 48 ++++++++ .../model/model_xgboost.py | 37 ++++++ .../playground-series-s4e9_template/train.py | 103 ++++++++++++++++ .../model/model_randomforest.py | 4 +- .../fea_share_preprocess.py | 111 ++++++++++++++++++ .../feature/feature.py | 23 ++++ .../model/model_nn.py | 76 ++++++++++++ .../model/model_randomforest.py | 54 +++++++++ .../model/model_xgboost.py | 40 +++++++ .../spaceship-titanic_template/train.py | 109 +++++++++++++++++ 24 files changed, 1366 insertions(+), 106 deletions(-) create mode 100644 rdagent/scenarios/kaggle/experiment/forest-cover-type-prediction_template/cross_validation.py create mode 100644 rdagent/scenarios/kaggle/experiment/forest-cover-type-prediction_template/fea_share_preprocess.py create mode 100644 rdagent/scenarios/kaggle/experiment/forest-cover-type-prediction_template/feature/feature.py create mode 100644 rdagent/scenarios/kaggle/experiment/forest-cover-type-prediction_template/model/model_nn.py create mode 100644 rdagent/scenarios/kaggle/experiment/forest-cover-type-prediction_template/model/model_randomforest.py create mode 100644 rdagent/scenarios/kaggle/experiment/forest-cover-type-prediction_template/model/model_xgboost.py create mode 100644 rdagent/scenarios/kaggle/experiment/forest-cover-type-prediction_template/train.py delete mode 100644 rdagent/scenarios/kaggle/experiment/playground-series-s4e8_template/cross_validation_tpl.py create mode 100644 rdagent/scenarios/kaggle/experiment/playground-series-s4e8_template/model/model_nn.py create mode 100644 rdagent/scenarios/kaggle/experiment/playground-series-s4e9_template/fea_share_preprocess.py create mode 100644 rdagent/scenarios/kaggle/experiment/playground-series-s4e9_template/feature/feature.py create mode 100644 rdagent/scenarios/kaggle/experiment/playground-series-s4e9_template/model/model_nn.py create mode 100644 rdagent/scenarios/kaggle/experiment/playground-series-s4e9_template/model/model_randomforest.py create mode 100644 rdagent/scenarios/kaggle/experiment/playground-series-s4e9_template/model/model_xgboost.py create mode 100644 rdagent/scenarios/kaggle/experiment/playground-series-s4e9_template/train.py create mode 100644 rdagent/scenarios/kaggle/experiment/spaceship-titanic_template/fea_share_preprocess.py create mode 100644 rdagent/scenarios/kaggle/experiment/spaceship-titanic_template/feature/feature.py create 
mode 100644 rdagent/scenarios/kaggle/experiment/spaceship-titanic_template/model/model_nn.py create mode 100644 rdagent/scenarios/kaggle/experiment/spaceship-titanic_template/model/model_randomforest.py create mode 100644 rdagent/scenarios/kaggle/experiment/spaceship-titanic_template/model/model_xgboost.py create mode 100644 rdagent/scenarios/kaggle/experiment/spaceship-titanic_template/train.py diff --git a/rdagent/scenarios/kaggle/experiment/forest-cover-type-prediction_template/cross_validation.py b/rdagent/scenarios/kaggle/experiment/forest-cover-type-prediction_template/cross_validation.py new file mode 100644 index 00000000..3085077b --- /dev/null +++ b/rdagent/scenarios/kaggle/experiment/forest-cover-type-prediction_template/cross_validation.py @@ -0,0 +1,101 @@ +import importlib.util +import random +from pathlib import Path + +import numpy as np +import pandas as pd +from scipy import stats +from sklearn.impute import SimpleImputer +from sklearn.metrics import accuracy_score +from sklearn.model_selection import KFold + +# Set random seed for reproducibility +SEED = 42 +random.seed(SEED) +np.random.seed(SEED) +DIRNAME = Path(__file__).absolute().resolve().parent + + +def import_module_from_path(module_name, module_path): + spec = importlib.util.spec_from_file_location(module_name, module_path) + module = importlib.util.module_from_spec(spec) + spec.loader.exec_module(module) + return module + + +# 1) Preprocess the data +data_df = pd.read_csv( + "/data/userdata/v-haoranpan/RD-Agent/git_ignore_folder/data/forest-cover-type-prediction/train.csv" +) +data_df = data_df.drop(["Id"], axis=1) + +X_train = data_df.drop(["Cover_Type"], axis=1) +y_train = data_df["Cover_Type"] - 1 + +# Set up KFold +kf = KFold(n_splits=5, shuffle=True, random_state=SEED) + +# Store results +accuracies = [] + +# 3) Train and evaluate using KFold +fold_number = 1 +for train_index, valid_index in kf.split(X_train): + print(f"Starting fold {fold_number}...") + + X_train_l, X_valid_l = [], [] # Reset feature lists for each fold + X_tr, X_val = X_train.iloc[train_index], X_train.iloc[valid_index] + y_tr, y_val = y_train.iloc[train_index], y_train.iloc[valid_index] + + # Feature engineering + for f in DIRNAME.glob("feature/feat*.py"): + cls = import_module_from_path(f.stem, f).feature_engineering_cls() + cls.fit(X_tr) + X_train_f = cls.transform(X_tr) + X_valid_f = cls.transform(X_val) + + X_train_l.append(X_train_f) + X_valid_l.append(X_valid_f) + + X_tr = pd.concat(X_train_l, axis=1, keys=[f"feature_{i}" for i in range(len(X_train_l))]) + X_val = pd.concat(X_valid_l, axis=1, keys=[f"feature_{i}" for i in range(len(X_valid_l))]) + + print("Shape of X_tr: ", X_tr.shape, " Shape of X_val: ", X_val.shape) + + # Replace inf and -inf with NaN + X_tr.replace([np.inf, -np.inf], np.nan, inplace=True) + X_val.replace([np.inf, -np.inf], np.nan, inplace=True) + + # Impute missing values + imputer = SimpleImputer(strategy="mean") + X_tr = pd.DataFrame(imputer.fit_transform(X_tr), columns=X_tr.columns) + X_val = pd.DataFrame(imputer.transform(X_val), columns=X_val.columns) + + # Remove duplicate columns + X_tr = X_tr.loc[:, ~X_tr.columns.duplicated()] + X_val = X_val.loc[:, ~X_val.columns.duplicated()] + + # Train the model + model_l = [] # list[tuple[model, predict_func]] + for f in DIRNAME.glob("model/model*.py"): + m = import_module_from_path(f.stem, f) + model_l.append((m.fit(X_tr, y_tr, X_val, y_val), m.predict)) + + # Evaluate the model on the validation set + y_valid_pred_l = [] + for model, predict_func in model_l: + 
y_valid_pred = predict_func(model, X_val) + y_valid_pred_l.append(y_valid_pred) + + # Majority vote ensemble + y_valid_pred_ensemble = stats.mode(y_valid_pred_l, axis=0)[0].flatten() + + # Compute metrics + accuracy = accuracy_score(y_val, y_valid_pred_ensemble) + accuracies.append(accuracy) + print(f"Fold {fold_number} accuracy: {accuracy}") + + fold_number += 1 + +# Print average accuracy +print(f"Average accuracy across folds: {np.mean(accuracies)}") diff --git a/rdagent/scenarios/kaggle/experiment/forest-cover-type-prediction_template/fea_share_preprocess.py b/rdagent/scenarios/kaggle/experiment/forest-cover-type-prediction_template/fea_share_preprocess.py new file mode 100644 index 00000000..8ec1dc03 --- /dev/null +++ b/rdagent/scenarios/kaggle/experiment/forest-cover-type-prediction_template/fea_share_preprocess.py @@ -0,0 +1,72 @@ +import os + +import numpy as np +import pandas as pd +from sklearn.impute import SimpleImputer +from sklearn.model_selection import train_test_split + + +def prepreprocess(): + """ + This method loads the data, drops the unnecessary columns, and splits it into train and validation sets. + """ + # Load and preprocess the data + data_df = pd.read_csv("/kaggle/input/train.csv") + data_df = data_df.drop(["Id"], axis=1) + + X = data_df.drop(["Cover_Type"], axis=1) + y = data_df["Cover_Type"] - 1 + + # Split the data into training and validation sets + X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.20, random_state=42) + + return X_train, X_valid, y_train, y_valid + + +def preprocess_script(): + """ + This method applies the preprocessing steps to the training, validation, and test datasets. + """ + if os.path.exists("X_train.pkl"): + X_train = pd.read_pickle("X_train.pkl") + X_valid = pd.read_pickle("X_valid.pkl") + y_train = pd.read_pickle("y_train.pkl") + y_valid = pd.read_pickle("y_valid.pkl") + X_test = pd.read_pickle("X_test.pkl") + ids = pd.read_pickle("ids.pkl") + + return X_train, X_valid, y_train, y_valid, X_test, ids + + X_train, X_valid, y_train, y_valid = prepreprocess() + + # Load and preprocess the test data + submission_df = pd.read_csv("/kaggle/input/test.csv") + ids = submission_df["Id"] + X_test = submission_df.drop(["Id"], axis=1) + + return X_train, X_valid, y_train, y_valid, X_test, ids + + +def clean_and_impute_data(X_train, X_valid, X_test): + """ + Handles inf and -inf values by replacing them with NaN, + then imputes missing values using the mean strategy. + Also removes duplicate columns. 
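The cross-validation script above combines the per-fold model outputs by majority vote through scipy.stats.mode. A minimal, self-contained sketch of that voting step, using hypothetical integer class predictions from three models (not part of the template itself), behaves as follows:

    import numpy as np
    from scipy import stats

    # Hypothetical label predictions from three models on five validation rows
    preds = np.array([
        [0, 1, 2, 1, 0],
        [0, 2, 2, 1, 1],
        [0, 1, 2, 0, 1],
    ])

    # Vote across models (axis=0) for each sample; depending on the SciPy version
    # the reduced axis may be kept, so flatten() gives a 1-D label vector either way.
    majority = stats.mode(preds, axis=0)[0].flatten()
    print(majority)  # [0 1 2 1 1]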
+ """ + # Replace inf and -inf with NaN + X_train.replace([np.inf, -np.inf], np.nan, inplace=True) + X_valid.replace([np.inf, -np.inf], np.nan, inplace=True) + X_test.replace([np.inf, -np.inf], np.nan, inplace=True) + + # Impute missing values + imputer = SimpleImputer(strategy="mean") + X_train = pd.DataFrame(imputer.fit_transform(X_train), columns=X_train.columns) + X_valid = pd.DataFrame(imputer.transform(X_valid), columns=X_valid.columns) + X_test = pd.DataFrame(imputer.transform(X_test), columns=X_test.columns) + + # Remove duplicate columns + X_train = X_train.loc[:, ~X_train.columns.duplicated()] + X_valid = X_valid.loc[:, ~X_valid.columns.duplicated()] + X_test = X_test.loc[:, ~X_test.columns.duplicated()] + + return X_train, X_valid, X_test diff --git a/rdagent/scenarios/kaggle/experiment/forest-cover-type-prediction_template/feature/feature.py b/rdagent/scenarios/kaggle/experiment/forest-cover-type-prediction_template/feature/feature.py new file mode 100644 index 00000000..8ae043ac --- /dev/null +++ b/rdagent/scenarios/kaggle/experiment/forest-cover-type-prediction_template/feature/feature.py @@ -0,0 +1,23 @@ +import pandas as pd + +""" +Here is the feature engineering code for each task, with a class that has a fit and transform method. +Remember +""" + + +class IdentityFeature: + def fit(self, train_df: pd.DataFrame): + """ + Fit the feature engineering model to the training data. + """ + pass + + def transform(self, X: pd.DataFrame): + """ + Transform the input data. + """ + return X + + +feature_engineering_cls = IdentityFeature diff --git a/rdagent/scenarios/kaggle/experiment/forest-cover-type-prediction_template/model/model_nn.py b/rdagent/scenarios/kaggle/experiment/forest-cover-type-prediction_template/model/model_nn.py new file mode 100644 index 00000000..53214626 --- /dev/null +++ b/rdagent/scenarios/kaggle/experiment/forest-cover-type-prediction_template/model/model_nn.py @@ -0,0 +1,78 @@ +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch.utils.data import DataLoader, TensorDataset +from tqdm import tqdm + +# Check if a GPU is available +device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + + +# Modified model for multi-class classification +class HybridFeatureInteractionModel(nn.Module): + def __init__(self, num_features, num_classes): + super(HybridFeatureInteractionModel, self).__init__() + self.fc1 = nn.Linear(num_features, 128) + self.bn1 = nn.BatchNorm1d(128) + self.fc2 = nn.Linear(128, 64) + self.bn2 = nn.BatchNorm1d(64) + self.fc3 = nn.Linear(64, num_classes) # Output nodes equal to num_classes + self.dropout = nn.Dropout(0.3) + + def forward(self, x): + x = F.relu(self.bn1(self.fc1(x))) + x = F.relu(self.bn2(self.fc2(x))) + x = self.dropout(x) + x = self.fc3(x) # No activation here, use CrossEntropyLoss + return x + + +# Training function +def fit(X_train, y_train, X_valid, y_valid): + num_features = X_train.shape[1] + num_classes = len(np.unique(y_train)) # Determine number of classes + model = HybridFeatureInteractionModel(num_features, num_classes).to(device) + criterion = nn.CrossEntropyLoss() # Use CrossEntropyLoss for multi-class + optimizer = torch.optim.Adam(model.parameters(), lr=0.001) + + # Convert to TensorDataset and create DataLoader + train_dataset = TensorDataset( + torch.tensor(X_train.to_numpy(), dtype=torch.float32), torch.tensor(y_train.to_numpy(), dtype=torch.long) + ) + valid_dataset = TensorDataset( + torch.tensor(X_valid.to_numpy(), dtype=torch.float32), 
torch.tensor(y_valid.to_numpy(), dtype=torch.long) + ) + train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True) + valid_loader = DataLoader(valid_dataset, batch_size=32, shuffle=False) + + # Train the model + model.train() + for epoch in range(5): # just for quick run + print(f"Epoch {epoch + 1}/5") + epoch_loss = 0 + for X_batch, y_batch in tqdm(train_loader, desc="Training", leave=False): + X_batch, y_batch = X_batch.to(device), y_batch.to(device) + optimizer.zero_grad() + outputs = model(X_batch) + loss = criterion(outputs, y_batch) + loss.backward() + optimizer.step() + epoch_loss += loss.item() + print(f"End of epoch {epoch + 1}, Avg Loss: {epoch_loss / len(train_loader):.4f}") + + return model + + +# Prediction function +def predict(model, X): + model.eval() + predictions = [] + with torch.no_grad(): + X_tensor = torch.tensor(X.values, dtype=torch.float32).to(device) + for i in tqdm(range(0, len(X_tensor), 32), desc="Predicting", leave=False): + batch = X_tensor[i : i + 32] + pred = model(batch) + pred = torch.argmax(pred, dim=1).cpu().numpy() # Use argmax to get class + predictions.extend(pred) + return np.array(predictions) diff --git a/rdagent/scenarios/kaggle/experiment/forest-cover-type-prediction_template/model/model_randomforest.py b/rdagent/scenarios/kaggle/experiment/forest-cover-type-prediction_template/model/model_randomforest.py new file mode 100644 index 00000000..879867f7 --- /dev/null +++ b/rdagent/scenarios/kaggle/experiment/forest-cover-type-prediction_template/model/model_randomforest.py @@ -0,0 +1,53 @@ +""" +Motivation of the model: +The Random Forest model is chosen for its robustness and ability to handle large datasets with higher dimensionality. +It reduces overfitting by averaging multiple decision trees and typically performs well out of the box, making it a good +baseline model for many classification tasks. +""" + +import pandas as pd +from sklearn.ensemble import RandomForestClassifier +from sklearn.metrics import accuracy_score + + +def select(X: pd.DataFrame) -> pd.DataFrame: + """ + Select relevant features. To be used in fit & predict function. + """ + # For now, we assume all features are relevant. This can be expanded to feature selection logic. + return X + + +def fit(X_train: pd.DataFrame, y_train: pd.Series, X_valid: pd.DataFrame, y_valid: pd.Series): + """ + Define and train the Random Forest model. Merge feature selection into the pipeline. + """ + # Initialize the Random Forest model + model = RandomForestClassifier(n_estimators=100, random_state=32, n_jobs=-1) + + # Select features (if any feature selection is needed) + X_train_selected = select(X_train) + X_valid_selected = select(X_valid) + + # Fit the model + model.fit(X_train_selected, y_train) + + # Validate the model + y_valid_pred = model.predict(X_valid_selected) + accuracy = accuracy_score(y_valid, y_valid_pred) + print(f"Validation Accuracy: {accuracy:.4f}") + + return model + + +def predict(model, X): + """ + Keep feature selection's consistency and make predictions. 
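Each model/model_*.py file above exposes the same three functions (select(X), fit(X_train, y_train, X_valid, y_valid) and predict(model, X)), which is what lets train.py and cross_validation.py discover and ensemble the models generically via DIRNAME.glob("model/model*.py"). A sketch of one more module that would satisfy this contract (hypothetical, using scikit-learn's LogisticRegression; not part of the patch):

    # Hypothetical model/model_logreg.py, shown only to illustrate the shared contract
    import pandas as pd
    from sklearn.linear_model import LogisticRegression


    def select(X: pd.DataFrame) -> pd.DataFrame:
        # Placeholder for feature-selection logic; keep all features for now
        return X


    def fit(X_train: pd.DataFrame, y_train: pd.Series, X_valid: pd.DataFrame, y_valid: pd.Series):
        model = LogisticRegression(max_iter=1000)
        model.fit(select(X_train), y_train)
        return model


    def predict(model, X: pd.DataFrame):
        return model.predict(select(X))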
+ """ + # Select features (if any feature selection is needed) + X_selected = select(X) + + # Predict using the trained model + y_pred = model.predict(X_selected) + + return y_pred diff --git a/rdagent/scenarios/kaggle/experiment/forest-cover-type-prediction_template/model/model_xgboost.py b/rdagent/scenarios/kaggle/experiment/forest-cover-type-prediction_template/model/model_xgboost.py new file mode 100644 index 00000000..932aa92f --- /dev/null +++ b/rdagent/scenarios/kaggle/experiment/forest-cover-type-prediction_template/model/model_xgboost.py @@ -0,0 +1,41 @@ +""" +motivation of the model +""" + +import pandas as pd +import xgboost as xgb + + +def select(X: pd.DataFrame) -> pd.DataFrame: + # Ignore feature selection logic + return X + + +def fit(X_train: pd.DataFrame, y_train: pd.DataFrame, X_valid: pd.DataFrame, y_valid: pd.DataFrame): + """Define and train the model. Merge feature_select""" + X_train = select(X_train) + X_valid = select(X_valid) + dtrain = xgb.DMatrix(X_train, label=y_train) + dvalid = xgb.DMatrix(X_valid, label=y_valid) + + params = { + "objective": "multi:softmax", # Use softmax for multi-class classification + "num_class": len(set(y_train)), # Number of classes + "nthread": -1, + } + num_round = 20 + + evallist = [(dtrain, "train"), (dvalid, "eval")] + bst = xgb.train(params, dtrain, num_round, evallist) + + return bst + + +def predict(model, X): + """ + Keep feature select's consistency. + """ + X = select(X) + dtest = xgb.DMatrix(X) + y_pred = model.predict(dtest) + return y_pred.astype(int) diff --git a/rdagent/scenarios/kaggle/experiment/forest-cover-type-prediction_template/train.py b/rdagent/scenarios/kaggle/experiment/forest-cover-type-prediction_template/train.py new file mode 100644 index 00000000..104cb4c7 --- /dev/null +++ b/rdagent/scenarios/kaggle/experiment/forest-cover-type-prediction_template/train.py @@ -0,0 +1,104 @@ +import importlib.util +import random +from pathlib import Path + +import numpy as np +import pandas as pd +from fea_share_preprocess import clean_and_impute_data, preprocess_script +from scipy import stats +from sklearn.metrics import accuracy_score, matthews_corrcoef +from sklearn.preprocessing import LabelEncoder + +# Set random seed for reproducibility +SEED = 42 +random.seed(SEED) +np.random.seed(SEED) +DIRNAME = Path(__file__).absolute().resolve().parent + + +# support various method for metrics calculation +def compute_metrics_for_classification(y_true, y_pred): + """Compute accuracy metric for classification.""" + accuracy = accuracy_score(y_true, y_pred) + return accuracy + + +def compute_metrics_for_classification(y_true, y_pred): + """Compute MCC for classification.""" + mcc = matthews_corrcoef(y_true, y_pred) + return mcc + + +def import_module_from_path(module_name, module_path): + spec = importlib.util.spec_from_file_location(module_name, module_path) + module = importlib.util.module_from_spec(spec) + spec.loader.exec_module(module) + return module + + +# 1) Preprocess the data +# TODO 如果已经做过数据预处理了,不需要再做了 +X_train, X_valid, y_train, y_valid, X_test, ids = preprocess_script() + +# 2) Auto feature engineering +X_train_l, X_valid_l = [], [] +X_test_l = [] + +for f in DIRNAME.glob("feature/feat*.py"): + cls = import_module_from_path(f.stem, f).feature_engineering_cls() + cls.fit(X_train) + X_train_f = cls.transform(X_train) + X_valid_f = cls.transform(X_valid) + X_test_f = cls.transform(X_test) + + X_train_l.append(X_train_f) + X_valid_l.append(X_valid_f) + X_test_l.append(X_test_f) + +X_train = pd.concat(X_train_l, axis=1, 
keys=[f"feature_{i}" for i in range(len(X_train_l))]) +X_valid = pd.concat(X_valid_l, axis=1, keys=[f"feature_{i}" for i in range(len(X_valid_l))]) +X_test = pd.concat(X_test_l, axis=1, keys=[f"feature_{i}" for i in range(len(X_test_l))]) + +print(X_train.shape, X_valid.shape, X_test.shape) + +# Handle inf and -inf values +X_train, X_valid, X_test = clean_and_impute_data(X_train, X_valid, X_test) + +# 3) Train the model +model_l = [] # list[tuple[model, predict_func]] +for f in DIRNAME.glob("model/model*.py"): + m = import_module_from_path(f.stem, f) + model_l.append((m.fit(X_train, y_train, X_valid, y_valid), m.predict)) + +# 4) Evaluate the model on the validation set +y_valid_pred_l = [] +for model, predict_func in model_l: + y_valid_pred = predict_func(model, X_valid) + y_valid_pred_l.append(y_valid_pred) + print(y_valid_pred) + print(y_valid_pred.shape) + +# 5) Ensemble +# Majority vote ensemble +y_valid_pred_ensemble = stats.mode(y_valid_pred_l, axis=0)[0].flatten() + +# Compute metrics +accuracy = accuracy_score(y_valid, y_valid_pred_ensemble) +print(f"final accuracy on valid set: {accuracy}") + +# 6) Save the validation metrics +pd.Series(data=[accuracy], index=["multi-class accuracy"]).to_csv("submission_score.csv") + +# 7) Make predictions on the test set and save them +y_test_pred_l = [] +for model, predict_func in model_l: + y_test_pred_l.append(predict_func(model, X_test)) + +# For multiclass classification, use the mode of the predictions +y_test_pred = stats.mode(y_test_pred_l, axis=0)[0].flatten() + 1 + + +submission_result = pd.DataFrame(y_test_pred, columns=["Cover_Type"]) +submission_result.insert(0, "Id", ids) + +submission_result.to_csv("submission.csv", index=False) diff --git a/rdagent/scenarios/kaggle/experiment/playground-series-s4e8_template/cross_validation_tpl.py b/rdagent/scenarios/kaggle/experiment/playground-series-s4e8_template/cross_validation_tpl.py deleted file mode 100644 index 90ec0c2a..00000000 --- a/rdagent/scenarios/kaggle/experiment/playground-series-s4e8_template/cross_validation_tpl.py +++ /dev/null @@ -1,87 +0,0 @@ -from pathlib import Path - -import numpy as np -import pandas as pd -from sklearn.model_selection import KFold -from sklearn.preprocessing import LabelEncoder - -from rdagent.scenarios.kaggle.experiment.meta_tpl.fea_share_preprocess import preprocess - - -def compute_metrics_for_classification(y_true, y_pred): - """Compute MCC for classification.""" - from sklearn.metrics import matthews_corrcoef - - return matthews_corrcoef(y_true, y_pred) - - -def perform_kfold_cross_validation(X, y, n_splits=2, random_seed=42): - kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_seed) - fold_metrics = [] - - DIRNAME = Path(__file__).absolute().resolve().parent - - for fold, (train_idx, valid_idx) in enumerate(kf.split(X)): - X_train_fold, X_valid_fold = X.iloc[train_idx], X.iloc[valid_idx] - y_train_fold, y_valid_fold = y[train_idx], y[valid_idx] - - # TODO: Preprocess and Feature Engineering before K-Fold CV - - # Preprocess the data - X_train_fold = preprocess(X_train_fold) - X_valid_fold = preprocess(X_valid_fold) - - # Feature Engineering - X_train_l_fold, X_valid_l_fold = [], [] - for f in DIRNAME.glob("feat*.py"): - m = __import__(f.name.strip(".py")) - X_train_fold = m.feat_eng(X_train_fold) - X_valid_fold = m.feat_eng(X_valid_fold) - - X_train_l_fold.append(X_train_fold) - X_valid_l_fold.append(X_valid_fold) - - X_train_fold = pd.concat(X_train_l_fold, axis=1) - X_valid_fold = pd.concat(X_valid_l_fold, axis=1) - - # Align 
features - X_valid_fold = X_valid_fold.reindex(columns=X_train_fold.columns, fill_value=0) - - # Train and evaluate models - mcc_scores = [] - model_l = [] # Reinitialize model list - for f in DIRNAME.glob("model*.py"): - m = __import__(f.name.strip(".py")) - model = m.fit(X_train_fold, y_train_fold, X_valid_fold, y_valid_fold) - y_valid_pred = m.predict(model, X_valid_fold) - mcc = compute_metrics_for_classification(y_valid_fold, y_valid_pred) - mcc_scores.append(mcc) - print(f"Fold {fold+1}, Model {f.name}: MCC = {mcc}") - - # Store the average MCC score for this fold - avg_mcc = np.mean(mcc_scores) - fold_metrics.append(avg_mcc) - print(f"Fold {fold+1} average MCC: {avg_mcc}") - - # Calculate the overall average MCC - overall_avg_mcc = np.mean(fold_metrics) - result_df = pd.DataFrame({"Overall Average MCC": [overall_avg_mcc]}) - result_df.to_csv(f"path/to/playground-series-s4e8/cv_score_{f.name.strip('.py')}.csv", index=False) - - print(f"Overall Average MCC across all folds: {overall_avg_mcc}") - return overall_avg_mcc - - -# This allows the script to be run directly -if __name__ == "__main__": - # Load and preprocess the data - data_df = pd.read_csv("path/to/playground-series-s4e8/train.csv") - data_df = data_df.drop(["id"], axis=1) - - X = data_df.drop(["class"], axis=1) - y = data_df[["class"]] - - label_encoder = LabelEncoder() - # transfrom y to 1D - y = label_encoder.fit_transform(y) - result = perform_kfold_cross_validation(X, y) diff --git a/rdagent/scenarios/kaggle/experiment/playground-series-s4e8_template/model/model_nn.py b/rdagent/scenarios/kaggle/experiment/playground-series-s4e8_template/model/model_nn.py new file mode 100644 index 00000000..00431100 --- /dev/null +++ b/rdagent/scenarios/kaggle/experiment/playground-series-s4e8_template/model/model_nn.py @@ -0,0 +1,76 @@ +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch.utils.data import DataLoader, TensorDataset +from tqdm import tqdm + +# Check if a GPU is available +device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + + +# Restored three-layer model structure +class HybridFeatureInteractionModel(nn.Module): + def __init__(self, num_features): + super(HybridFeatureInteractionModel, self).__init__() + self.fc1 = nn.Linear(num_features, 128) + self.bn1 = nn.BatchNorm1d(128) + self.fc2 = nn.Linear(128, 64) + self.bn2 = nn.BatchNorm1d(64) + self.fc3 = nn.Linear(64, 1) + self.dropout = nn.Dropout(0.3) + + def forward(self, x): + x = F.relu(self.bn1(self.fc1(x))) + x = F.relu(self.bn2(self.fc2(x))) + x = self.dropout(x) + x = torch.sigmoid(self.fc3(x)) + return x + + +# Training function +def fit(X_train, y_train, X_valid, y_valid): + num_features = X_train.shape[1] + model = HybridFeatureInteractionModel(num_features).to(device) + criterion = nn.BCELoss() # Binary classification problem + optimizer = torch.optim.Adam(model.parameters(), lr=0.001) + + # Convert to TensorDataset and create DataLoader + train_dataset = TensorDataset( + torch.tensor(X_train.to_numpy(), dtype=torch.float32), torch.tensor(y_train.reshape(-1), dtype=torch.float32) + ) + valid_dataset = TensorDataset( + torch.tensor(X_valid.to_numpy(), dtype=torch.float32), torch.tensor(y_valid.reshape(-1), dtype=torch.float32) + ) + train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True) + valid_loader = DataLoader(valid_dataset, batch_size=32, shuffle=False) + + # Train the model + model.train() + for epoch in range(5): + print(f"Epoch {epoch + 1}/5") + epoch_loss = 0 + for 
X_batch, y_batch in tqdm(train_loader, desc="Training", leave=False): + X_batch, y_batch = X_batch.to(device), y_batch.to(device) # Move data to the device + optimizer.zero_grad() + outputs = model(X_batch).squeeze(1) # Reshape outputs to [32] + loss = criterion(outputs, y_batch) # Adjust target shape + loss.backward() + optimizer.step() + epoch_loss += loss.item() + print(f"End of epoch {epoch + 1}, Avg Loss: {epoch_loss / len(train_loader):.4f}") + + return model + + +# Prediction function +def predict(model, X): + model.eval() + predictions = [] + with torch.no_grad(): + X_tensor = torch.tensor(X.values, dtype=torch.float32).to(device) # Move data to the device + for i in tqdm(range(0, len(X_tensor), 32), desc="Predicting", leave=False): + batch = X_tensor[i : i + 32] # Predict in batches + pred = model(batch).squeeze().cpu().numpy() # Move results back to CPU + predictions.extend(pred) + return np.array(predictions) # Return boolean predictions diff --git a/rdagent/scenarios/kaggle/experiment/playground-series-s4e8_template/model/model_xgboost.py b/rdagent/scenarios/kaggle/experiment/playground-series-s4e8_template/model/model_xgboost.py index 56b81c9a..a70fa680 100644 --- a/rdagent/scenarios/kaggle/experiment/playground-series-s4e8_template/model/model_xgboost.py +++ b/rdagent/scenarios/kaggle/experiment/playground-series-s4e8_template/model/model_xgboost.py @@ -22,7 +22,7 @@ def fit(X_train: pd.DataFrame, y_train: pd.DataFrame, X_valid: pd.DataFrame, y_v params = { "nthred": -1, } - num_round = 200 + num_round = 180 evallist = [(dtrain, "train"), (dvalid, "eval")] bst = xgb.train(params, dtrain, num_round, evallist) diff --git a/rdagent/scenarios/kaggle/experiment/playground-series-s4e8_template/train.py b/rdagent/scenarios/kaggle/experiment/playground-series-s4e8_template/train.py index 79975ba7..ea32b625 100644 --- a/rdagent/scenarios/kaggle/experiment/playground-series-s4e8_template/train.py +++ b/rdagent/scenarios/kaggle/experiment/playground-series-s4e8_template/train.py @@ -16,12 +16,6 @@ # support various method for metrics calculation -def compute_metrics_for_classification(y_true, y_pred): - """Compute accuracy metric for classification.""" - accuracy = accuracy_score(y_true, y_pred) - return accuracy - - def compute_metrics_for_classification(y_true, y_pred): """Compute MCC for classification.""" mcc = matthews_corrcoef(y_true, y_pred) @@ -102,20 +96,15 @@ def import_module_from_path(module_name, module_path): pd.Series(data=[mcc], index=["MCC"]).to_csv("submission_score.csv") # 7) Make predictions on the test set and save them -label_encoder = LabelEncoder() -label_encoder.fit(y_train) -y_test_pred_bool_l = [] +y_test_pred_l = [] for m, m_pred in model_l: - y_test_pred_bool_l.append( - m_pred(m, X_test).astype(int) - ) # TODO Make this an ensemble. Currently it uses the last prediction + y_test_pred_l.append(m_pred(m, X_test)) # TODO Make this an ensemble. 
Currently it uses the last prediction -y_test_pred = np.mean(y_test_pred_bool_l, axis=0) +y_test_pred = np.mean(y_test_pred_l, axis=0) y_test_pred = (y_test_pred > 0.5).astype(int) -y_test_pred_labels = label_encoder.inverse_transform(y_test_pred) # 将整数转换回 'e' 或 'p' - -submission_result = pd.DataFrame({"id": passenger_ids, "class": y_test_pred_labels}) +y_test_pred_labels = np.where(y_test_pred == 1, "p", "e") # 将整数转换回 'e' 或 'p' # 8) Submit predictions for the test set +submission_result = pd.DataFrame({"id": passenger_ids, "class": y_test_pred_labels}) submission_result.to_csv("submission.csv", index=False) diff --git a/rdagent/scenarios/kaggle/experiment/playground-series-s4e9_template/fea_share_preprocess.py b/rdagent/scenarios/kaggle/experiment/playground-series-s4e9_template/fea_share_preprocess.py new file mode 100644 index 00000000..dd045340 --- /dev/null +++ b/rdagent/scenarios/kaggle/experiment/playground-series-s4e9_template/fea_share_preprocess.py @@ -0,0 +1,108 @@ +import os + +import pandas as pd +from sklearn.compose import ColumnTransformer +from sklearn.impute import SimpleImputer +from sklearn.model_selection import train_test_split +from sklearn.pipeline import Pipeline +from sklearn.preprocessing import OneHotEncoder + + +def prepreprocess(): + """ + This method loads the data, drops the unnecessary columns, and splits it into train and validation sets. + """ + # Load and preprocess the data + data_df = pd.read_csv("/kaggle/input/train.csv") + data_df = data_df.drop(["id"], axis=1) + + X = data_df.drop(["price"], axis=1) + y = data_df["price"] + + # Split the data into training and validation sets + X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.10, random_state=42) + + return X_train, X_valid, y_train, y_valid + + +def preprocess_fit(X_train: pd.DataFrame): + """ + Fits the preprocessor on the training data and returns the fitted preprocessor. + """ + # Identify numerical and categorical features + numerical_cols = [cname for cname in X_train.columns if X_train[cname].dtype in ["int64", "float64"]] + categorical_cols = [cname for cname in X_train.columns if X_train[cname].dtype == "object"] + + # Define preprocessors for numerical and categorical features + categorical_transformer = Pipeline( + steps=[ + ("imputer", SimpleImputer(strategy="most_frequent")), + ("onehot", OneHotEncoder(handle_unknown="ignore")), + ] + ) + + numerical_transformer = Pipeline(steps=[("imputer", SimpleImputer(strategy="mean"))]) + + # Combine preprocessing steps + preprocessor = ColumnTransformer( + transformers=[ + ("cat", categorical_transformer, categorical_cols), + ("num", numerical_transformer, numerical_cols), + ] + ) + + # Fit the preprocessor on the training data + preprocessor.fit(X_train) + + return preprocessor + + +def preprocess_transform(X: pd.DataFrame, preprocessor): + """ + Transforms the given DataFrame using the fitted preprocessor. + Ensures the processed data has consistent features across train, validation, and test sets. 
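The s4e9 preprocess_fit / preprocess_transform pair fits a ColumnTransformer on the training split and then, as the function body below shows, rebuilds the output column names by hand from the one-hot encoder categories plus the numerical columns, relying on the ("cat", ...), ("num", ...) ordering. On reasonably recent scikit-learn (roughly 1.1+, where SimpleImputer also implements get_feature_names_out), the fitted preprocessor can report prefixed output names itself; a sketch of that alternative, assuming the preprocessor object returned by preprocess_fit:

    import pandas as pd


    def preprocess_transform_alt(X: pd.DataFrame, preprocessor) -> pd.DataFrame:
        # Transform and densify (OneHotEncoder outputs a sparse matrix by default)
        X_array = preprocessor.transform(X).toarray()
        # Let the fitted ColumnTransformer report its own output column names
        # (they come back prefixed, e.g. "cat__<col>_<category>", "num__<col>")
        feature_names = preprocessor.get_feature_names_out()
        return pd.DataFrame(X_array, columns=feature_names, index=X.index)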
+ """ + # Transform the data using the fitted preprocessor + X_array = preprocessor.transform(X).toarray() + + # Get feature names for the columns in the transformed data + categorical_cols = [cname for cname in X.columns if X[cname].dtype == "object"] + feature_names = preprocessor.named_transformers_["cat"]["onehot"].get_feature_names_out( + categorical_cols + ).tolist() + [cname for cname in X.columns if X[cname].dtype in ["int64", "float64"]] + + # Convert arrays back to DataFrames + X_transformed = pd.DataFrame(X_array, columns=feature_names, index=X.index) + + return X_transformed + + +def preprocess_script(): + """ + This method applies the preprocessing steps to the training, validation, and test datasets. + """ + if os.path.exists("X_train.pkl"): + X_train = pd.read_pickle("X_train.pkl") + X_valid = pd.read_pickle("X_valid.pkl") + y_train = pd.read_pickle("y_train.pkl") + y_valid = pd.read_pickle("y_valid.pkl") + X_test = pd.read_pickle("X_test.pkl") + passenger_ids = pd.read_pickle("passenger_ids.pkl") + + return X_train, X_valid, y_train, y_valid, X_test, passenger_ids + X_train, X_valid, y_train, y_valid = prepreprocess() + + # Fit the preprocessor on the training data + preprocessor = preprocess_fit(X_train) + + # Preprocess the train, validation, and test data + X_train = preprocess_transform(X_train, preprocessor) + X_valid = preprocess_transform(X_valid, preprocessor) + + # Load and preprocess the test data + submission_df = pd.read_csv("/kaggle/input/test.csv") + ids = submission_df["id"] + submission_df = submission_df.drop(["id"], axis=1) + X_test = preprocess_transform(submission_df, preprocessor) + + return X_train, X_valid, y_train, y_valid, X_test, ids diff --git a/rdagent/scenarios/kaggle/experiment/playground-series-s4e9_template/feature/feature.py b/rdagent/scenarios/kaggle/experiment/playground-series-s4e9_template/feature/feature.py new file mode 100644 index 00000000..8ae043ac --- /dev/null +++ b/rdagent/scenarios/kaggle/experiment/playground-series-s4e9_template/feature/feature.py @@ -0,0 +1,23 @@ +import pandas as pd + +""" +Here is the feature engineering code for each task, with a class that has a fit and transform method. +Remember +""" + + +class IdentityFeature: + def fit(self, train_df: pd.DataFrame): + """ + Fit the feature engineering model to the training data. + """ + pass + + def transform(self, X: pd.DataFrame): + """ + Transform the input data. 
+ """ + return X + + +feature_engineering_cls = IdentityFeature diff --git a/rdagent/scenarios/kaggle/experiment/playground-series-s4e9_template/model/model_nn.py b/rdagent/scenarios/kaggle/experiment/playground-series-s4e9_template/model/model_nn.py new file mode 100644 index 00000000..b45175e8 --- /dev/null +++ b/rdagent/scenarios/kaggle/experiment/playground-series-s4e9_template/model/model_nn.py @@ -0,0 +1,78 @@ +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch.utils.data import DataLoader, TensorDataset +from tqdm import tqdm + +# Check if a GPU is available +device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + + +# Modified model for regression +class HybridFeatureInteractionModel(nn.Module): + def __init__(self, num_features): + super(HybridFeatureInteractionModel, self).__init__() + self.fc1 = nn.Linear(num_features, 128) + self.bn1 = nn.BatchNorm1d(128) + self.fc2 = nn.Linear(128, 64) + self.bn2 = nn.BatchNorm1d(64) + self.fc3 = nn.Linear(64, 1) # Output a single value for regression + self.dropout = nn.Dropout(0.3) + + def forward(self, x): + x = F.relu(self.bn1(self.fc1(x))) + x = F.relu(self.bn2(self.fc2(x))) + x = self.dropout(x) + x = self.fc3(x) # No activation for regression + return x + + +# Training function +def fit(X_train, y_train, X_valid, y_valid): + num_features = X_train.shape[1] + model = HybridFeatureInteractionModel(num_features).to(device) + criterion = nn.MSELoss() # Use MSELoss for regression + optimizer = torch.optim.Adam(model.parameters(), lr=0.001) + + # Convert to TensorDataset and create DataLoader + train_dataset = TensorDataset( + torch.tensor(X_train.to_numpy(), dtype=torch.float32), + torch.tensor(y_train.to_numpy().reshape(-1), dtype=torch.float32), # Convert to NumPy array + ) + valid_dataset = TensorDataset( + torch.tensor(X_valid.to_numpy(), dtype=torch.float32), + torch.tensor(y_valid.to_numpy().reshape(-1), dtype=torch.float32), # Convert to NumPy array + ) + train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True) + valid_loader = DataLoader(valid_dataset, batch_size=32, shuffle=False) + + # Train the model + model.train() + for epoch in range(5): + print(f"Epoch {epoch + 1}/5") + epoch_loss = 0 + for X_batch, y_batch in tqdm(train_loader, desc="Training", leave=False): + X_batch, y_batch = X_batch.to(device), y_batch.to(device) # Move data to the device + optimizer.zero_grad() + outputs = model(X_batch).squeeze(1) # Reshape outputs to [32] + loss = criterion(outputs, y_batch) # Adjust target shape + loss.backward() + optimizer.step() + epoch_loss += loss.item() + print(f"End of epoch {epoch + 1}, Avg Loss: {epoch_loss / len(train_loader):.4f}") + + return model + + +# Prediction function +def predict(model, X): + model.eval() + predictions = [] + with torch.no_grad(): + X_tensor = torch.tensor(X.values, dtype=torch.float32).to(device) # Move data to the device + for i in tqdm(range(0, len(X_tensor), 32), desc="Predicting", leave=False): + batch = X_tensor[i : i + 32] # Predict in batches + pred = model(batch).squeeze().cpu().numpy() # Move results back to CPU + predictions.extend(pred) + return np.array(predictions) # Return predicted values diff --git a/rdagent/scenarios/kaggle/experiment/playground-series-s4e9_template/model/model_randomforest.py b/rdagent/scenarios/kaggle/experiment/playground-series-s4e9_template/model/model_randomforest.py new file mode 100644 index 00000000..867e6b64 --- /dev/null +++ 
b/rdagent/scenarios/kaggle/experiment/playground-series-s4e9_template/model/model_randomforest.py @@ -0,0 +1,48 @@ +import numpy as np +import pandas as pd +from sklearn.ensemble import RandomForestRegressor +from sklearn.metrics import mean_squared_error + + +def select(X: pd.DataFrame) -> pd.DataFrame: + """ + Select relevant features. To be used in fit & predict function. + """ + # For now, we assume all features are relevant. This can be expanded to feature selection logic. + return X + + +def fit(X_train: pd.DataFrame, y_train: pd.Series, X_valid: pd.DataFrame, y_valid: pd.Series): + """ + Define and train the Random Forest model. Merge feature selection into the pipeline. + """ + # Initialize the Random Forest model + model = RandomForestRegressor(n_estimators=100, random_state=32, n_jobs=-1) + + # Select features (if any feature selection is needed) + X_train_selected = select(X_train) + X_valid_selected = select(X_valid) + + # Fit the model + model.fit(X_train_selected, y_train) + + # Validate the model + y_valid_pred = model.predict(X_valid_selected) + mse = mean_squared_error(y_valid, y_valid_pred) + rmse = np.sqrt(mse) + print(f"Validation RMSE: {rmse:.4f}") + + return model + + +def predict(model, X): + """ + Keep feature selection's consistency and make predictions. + """ + # Select features (if any feature selection is needed) + X_selected = select(X) + + # Predict using the trained model + y_pred = model.predict(X_selected) + + return y_pred diff --git a/rdagent/scenarios/kaggle/experiment/playground-series-s4e9_template/model/model_xgboost.py b/rdagent/scenarios/kaggle/experiment/playground-series-s4e9_template/model/model_xgboost.py new file mode 100644 index 00000000..84a8f5e2 --- /dev/null +++ b/rdagent/scenarios/kaggle/experiment/playground-series-s4e9_template/model/model_xgboost.py @@ -0,0 +1,37 @@ +import pandas as pd +import xgboost as xgb + + +def select(X: pd.DataFrame) -> pd.DataFrame: + # Ignore feature selection logic + return X + + +def fit(X_train: pd.DataFrame, y_train: pd.DataFrame, X_valid: pd.DataFrame, y_valid: pd.DataFrame): + """Define and train the model. Merge feature_select""" + X_train = select(X_train) + X_valid = select(X_valid) + dtrain = xgb.DMatrix(X_train, label=y_train) + dvalid = xgb.DMatrix(X_valid, label=y_valid) + + # Parameters for regression + params = { + "objective": "reg:squarederror", # Use squared error for regression + "nthread": -1, + } + num_round = 200 + + evallist = [(dtrain, "train"), (dvalid, "eval")] + bst = xgb.train(params, dtrain, num_round, evallist) + + return bst + + +def predict(model, X): + """ + Keep feature select's consistency. 
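The s4e9 XGBoost fit above already passes a (dvalid, "eval") watchlist to xgb.train but always runs the fixed 200 rounds. An optional variant (a sketch, not part of the patch) is to let that validation set stop training early via early_stopping_rounds:

    import xgboost as xgb


    def fit_with_early_stopping(X_train, y_train, X_valid, y_valid):
        dtrain = xgb.DMatrix(X_train, label=y_train)
        dvalid = xgb.DMatrix(X_valid, label=y_valid)
        params = {"objective": "reg:squarederror", "nthread": -1}
        bst = xgb.train(
            params,
            dtrain,
            num_boost_round=1000,                        # upper bound; training may stop earlier
            evals=[(dtrain, "train"), (dvalid, "eval")],
            early_stopping_rounds=50,                    # stop after 50 rounds without improvement on "eval"
        )
        # bst.best_iteration holds the best round found on the validation set
        return bst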
+ """ + X = select(X) + dtest = xgb.DMatrix(X) + y_pred = model.predict(dtest) + return y_pred diff --git a/rdagent/scenarios/kaggle/experiment/playground-series-s4e9_template/train.py b/rdagent/scenarios/kaggle/experiment/playground-series-s4e9_template/train.py new file mode 100644 index 00000000..3e98be58 --- /dev/null +++ b/rdagent/scenarios/kaggle/experiment/playground-series-s4e9_template/train.py @@ -0,0 +1,103 @@ +import importlib.util +import random +from pathlib import Path + +import numpy as np +import pandas as pd +from fea_share_preprocess import preprocess_script +from sklearn.metrics import mean_squared_error +from sklearn.preprocessing import LabelEncoder + +# Set random seed for reproducibility +SEED = 42 +random.seed(SEED) +np.random.seed(SEED) +DIRNAME = Path(__file__).absolute().resolve().parent + + +def compute_rmse(y_true, y_pred): + """Compute RMSE for regression.""" + mse = mean_squared_error(y_true, y_pred) + rmse = np.sqrt(mse) + return rmse + + +def import_module_from_path(module_name, module_path): + spec = importlib.util.spec_from_file_location(module_name, module_path) + module = importlib.util.module_from_spec(spec) + spec.loader.exec_module(module) + return module + + +# 1) Preprocess the data +X_train, X_valid, y_train, y_valid, X_test, ids = preprocess_script() + +# 2) Auto feature engineering +X_train_l, X_valid_l = [], [] +X_test_l = [] + +for f in DIRNAME.glob("feature/feat*.py"): + cls = import_module_from_path(f.stem, f).feature_engineering_cls() + cls.fit(X_train) + X_train_f = cls.transform(X_train) + X_valid_f = cls.transform(X_valid) + X_test_f = cls.transform(X_test) + + X_train_l.append(X_train_f) + X_valid_l.append(X_valid_f) + X_test_l.append(X_test_f) + +X_train = pd.concat(X_train_l, axis=1, keys=[f"feature_{i}" for i in range(len(X_train_l))]) +X_valid = pd.concat(X_valid_l, axis=1, keys=[f"feature_{i}" for i in range(len(X_valid_l))]) +X_test = pd.concat(X_test_l, axis=1, keys=[f"feature_{i}" for i in range(len(X_test_l))]) + +print(X_train.shape, X_valid.shape, X_test.shape) + +# Handle inf and -inf values +X_train.replace([np.inf, -np.inf], np.nan, inplace=True) +X_valid.replace([np.inf, -np.inf], np.nan, inplace=True) +X_test.replace([np.inf, -np.inf], np.nan, inplace=True) + +from sklearn.impute import SimpleImputer + +imputer = SimpleImputer(strategy="mean") + +X_train = pd.DataFrame(imputer.fit_transform(X_train), columns=X_train.columns) +X_valid = pd.DataFrame(imputer.transform(X_valid), columns=X_valid.columns) +X_test = pd.DataFrame(imputer.transform(X_test), columns=X_test.columns) + +# Remove duplicate columns +X_train = X_train.loc[:, ~X_train.columns.duplicated()] +X_valid = X_valid.loc[:, ~X_valid.columns.duplicated()] +X_test = X_test.loc[:, ~X_test.columns.duplicated()] + +# 3) Train the model +model_l = [] # list[tuple[model, predict_func,]] +for f in DIRNAME.glob("model/model*.py"): + m = import_module_from_path(f.stem, f) + model_l.append((m.fit(X_train, y_train, X_valid, y_valid), m.predict)) + +# 4) Evaluate the model on the validation set +y_valid_pred_l = [] +for model, predict_func in model_l: + y_valid_pred_l.append(predict_func(model, X_valid)) + +# 5) Ensemble +y_valid_pred = np.mean(y_valid_pred_l, axis=0) + +rmse = compute_rmse(y_valid, y_valid_pred) +print("Final RMSE on validation set: ", rmse) + +# 6) Save the validation RMSE +pd.Series(data=[rmse], index=["RMSE"]).to_csv("submission_score.csv") + +# 7) Make predictions on the test set and save them +y_test_pred_l = [] +for m, m_pred in model_l: + 
y_test_pred_l.append(m_pred(m, X_test)) + +y_test_pred = np.mean(y_test_pred_l, axis=0) + +# 8) Submit predictions for the test set +submission_result = pd.DataFrame({"id": ids, "price": y_test_pred}) +submission_result.to_csv("submission.csv", index=False) diff --git a/rdagent/scenarios/kaggle/experiment/sf-crime_template/model/model_randomforest.py b/rdagent/scenarios/kaggle/experiment/sf-crime_template/model/model_randomforest.py index 806dd7ae..33ed3eb7 100644 --- a/rdagent/scenarios/kaggle/experiment/sf-crime_template/model/model_randomforest.py +++ b/rdagent/scenarios/kaggle/experiment/sf-crime_template/model/model_randomforest.py @@ -34,8 +34,8 @@ def fit(X_train: pd.DataFrame, y_train: pd.Series, X_valid: pd.DataFrame, y_vali # Validate the model y_valid_pred = model.predict(X_valid_selected) - accuracy = accuracy_score(y_valid, y_valid_pred) - print(f"Validation Accuracy: {accuracy:.4f}") + # accuracy = accuracy_score(y_valid, y_valid_pred) + # print(f"Validation Accuracy: {accuracy:.4f}") return model diff --git a/rdagent/scenarios/kaggle/experiment/spaceship-titanic_template/fea_share_preprocess.py b/rdagent/scenarios/kaggle/experiment/spaceship-titanic_template/fea_share_preprocess.py new file mode 100644 index 00000000..02536382 --- /dev/null +++ b/rdagent/scenarios/kaggle/experiment/spaceship-titanic_template/fea_share_preprocess.py @@ -0,0 +1,111 @@ +import os + +import pandas as pd +from sklearn.compose import ColumnTransformer +from sklearn.impute import SimpleImputer +from sklearn.model_selection import train_test_split +from sklearn.pipeline import Pipeline +from sklearn.preprocessing import LabelEncoder, OneHotEncoder + + +def prepreprocess(): + """ + This method loads the data, drops the unnecessary columns, and splits it into train and validation sets. + """ + # Load and preprocess the data + data_df = pd.read_csv("/kaggle/input/train.csv") + data_df = data_df.drop(["PassengerId"], axis=1) + + X = data_df.drop(["Transported"], axis=1) + y = data_df[["Transported"]] + + label_encoder = LabelEncoder() + y = label_encoder.fit_transform(y) # Convert class labels to numeric + + # Split the data into training and validation sets + X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.10, random_state=42) + + return X_train, X_valid, y_train, y_valid + + +def preprocess_fit(X_train: pd.DataFrame): + """ + Fits the preprocessor on the training data and returns the fitted preprocessor. + """ + # Identify numerical and categorical features + numerical_cols = [cname for cname in X_train.columns if X_train[cname].dtype in ["int64", "float64"]] + categorical_cols = [cname for cname in X_train.columns if X_train[cname].dtype == "object"] + + # Define preprocessors for numerical and categorical features + categorical_transformer = Pipeline( + steps=[ + ("imputer", SimpleImputer(strategy="most_frequent")), + ("onehot", OneHotEncoder(handle_unknown="ignore")), + ] + ) + + numerical_transformer = Pipeline(steps=[("imputer", SimpleImputer(strategy="mean"))]) + + # Combine preprocessing steps + preprocessor = ColumnTransformer( + transformers=[ + ("cat", categorical_transformer, categorical_cols), + ("num", numerical_transformer, numerical_cols), + ] + ) + + # Fit the preprocessor on the training data + preprocessor.fit(X_train) + + return preprocessor + + +def preprocess_transform(X: pd.DataFrame, preprocessor): + """ + Transforms the given DataFrame using the fitted preprocessor. 
+ Ensures the processed data has consistent features across train, validation, and test sets. + """ + # Transform the data using the fitted preprocessor + X_array = preprocessor.transform(X).toarray() + + # Get feature names for the columns in the transformed data + categorical_cols = [cname for cname in X.columns if X[cname].dtype == "object"] + feature_names = preprocessor.named_transformers_["cat"]["onehot"].get_feature_names_out( + categorical_cols + ).tolist() + [cname for cname in X.columns if X[cname].dtype in ["int64", "float64"]] + + # Convert arrays back to DataFrames + X_transformed = pd.DataFrame(X_array, columns=feature_names, index=X.index) + + return X_transformed + + +def preprocess_script(): + """ + This method applies the preprocessing steps to the training, validation, and test datasets. + """ + if os.path.exists("X_train.pkl"): + X_train = pd.read_pickle("X_train.pkl") + X_valid = pd.read_pickle("X_valid.pkl") + y_train = pd.read_pickle("y_train.pkl") + y_valid = pd.read_pickle("y_valid.pkl") + X_test = pd.read_pickle("X_test.pkl") + passenger_ids = pd.read_pickle("passenger_ids.pkl") + + return X_train, X_valid, y_train, y_valid, X_test, passenger_ids + X_train, X_valid, y_train, y_valid = prepreprocess() + + # Fit the preprocessor on the training data + preprocessor = preprocess_fit(X_train) + + # Preprocess the train, validation, and test data + X_train = preprocess_transform(X_train, preprocessor) + X_valid = preprocess_transform(X_valid, preprocessor) + + # Load and preprocess the test data + submission_df = pd.read_csv("/kaggle/input/test.csv") + passenger_ids = submission_df["PassengerId"] + submission_df = submission_df.drop(["PassengerId"], axis=1) + X_test = preprocess_transform(submission_df, preprocessor) + + return X_train, X_valid, y_train, y_valid, X_test, passenger_ids diff --git a/rdagent/scenarios/kaggle/experiment/spaceship-titanic_template/feature/feature.py b/rdagent/scenarios/kaggle/experiment/spaceship-titanic_template/feature/feature.py new file mode 100644 index 00000000..8ae043ac --- /dev/null +++ b/rdagent/scenarios/kaggle/experiment/spaceship-titanic_template/feature/feature.py @@ -0,0 +1,23 @@ +import pandas as pd + +""" +Here is the feature engineering code for each task, with a class that has a fit and transform method. +Remember +""" + + +class IdentityFeature: + def fit(self, train_df: pd.DataFrame): + """ + Fit the feature engineering model to the training data. + """ + pass + + def transform(self, X: pd.DataFrame): + """ + Transform the input data. 
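The feature/feature.py files ship only the IdentityFeature pass-through; any class with the same fit/transform surface, assigned to a module-level feature_engineering_cls and saved as feature/feat*.py, will be picked up by the glob in train.py. A hypothetical example that adds one derived column (a sketch, not part of the patch):

    # Hypothetical feature/feat_rowsum.py: appends the sum of the numeric columns
    # as one extra feature, using the same fit/transform interface as IdentityFeature.
    import pandas as pd


    class RowSumFeature:
        def fit(self, train_df: pd.DataFrame):
            # Remember which columns were numeric at fit time
            self.num_cols = train_df.select_dtypes("number").columns.tolist()

        def transform(self, X: pd.DataFrame):
            X = X.copy()
            X["numeric_row_sum"] = X[self.num_cols].sum(axis=1)
            return X


    feature_engineering_cls = RowSumFeature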
+ """ + return X + + +feature_engineering_cls = IdentityFeature diff --git a/rdagent/scenarios/kaggle/experiment/spaceship-titanic_template/model/model_nn.py b/rdagent/scenarios/kaggle/experiment/spaceship-titanic_template/model/model_nn.py new file mode 100644 index 00000000..00431100 --- /dev/null +++ b/rdagent/scenarios/kaggle/experiment/spaceship-titanic_template/model/model_nn.py @@ -0,0 +1,76 @@ +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch.utils.data import DataLoader, TensorDataset +from tqdm import tqdm + +# Check if a GPU is available +device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + + +# Restored three-layer model structure +class HybridFeatureInteractionModel(nn.Module): + def __init__(self, num_features): + super(HybridFeatureInteractionModel, self).__init__() + self.fc1 = nn.Linear(num_features, 128) + self.bn1 = nn.BatchNorm1d(128) + self.fc2 = nn.Linear(128, 64) + self.bn2 = nn.BatchNorm1d(64) + self.fc3 = nn.Linear(64, 1) + self.dropout = nn.Dropout(0.3) + + def forward(self, x): + x = F.relu(self.bn1(self.fc1(x))) + x = F.relu(self.bn2(self.fc2(x))) + x = self.dropout(x) + x = torch.sigmoid(self.fc3(x)) + return x + + +# Training function +def fit(X_train, y_train, X_valid, y_valid): + num_features = X_train.shape[1] + model = HybridFeatureInteractionModel(num_features).to(device) + criterion = nn.BCELoss() # Binary classification problem + optimizer = torch.optim.Adam(model.parameters(), lr=0.001) + + # Convert to TensorDataset and create DataLoader + train_dataset = TensorDataset( + torch.tensor(X_train.to_numpy(), dtype=torch.float32), torch.tensor(y_train.reshape(-1), dtype=torch.float32) + ) + valid_dataset = TensorDataset( + torch.tensor(X_valid.to_numpy(), dtype=torch.float32), torch.tensor(y_valid.reshape(-1), dtype=torch.float32) + ) + train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True) + valid_loader = DataLoader(valid_dataset, batch_size=32, shuffle=False) + + # Train the model + model.train() + for epoch in range(5): + print(f"Epoch {epoch + 1}/5") + epoch_loss = 0 + for X_batch, y_batch in tqdm(train_loader, desc="Training", leave=False): + X_batch, y_batch = X_batch.to(device), y_batch.to(device) # Move data to the device + optimizer.zero_grad() + outputs = model(X_batch).squeeze(1) # Reshape outputs to [32] + loss = criterion(outputs, y_batch) # Adjust target shape + loss.backward() + optimizer.step() + epoch_loss += loss.item() + print(f"End of epoch {epoch + 1}, Avg Loss: {epoch_loss / len(train_loader):.4f}") + + return model + + +# Prediction function +def predict(model, X): + model.eval() + predictions = [] + with torch.no_grad(): + X_tensor = torch.tensor(X.values, dtype=torch.float32).to(device) # Move data to the device + for i in tqdm(range(0, len(X_tensor), 32), desc="Predicting", leave=False): + batch = X_tensor[i : i + 32] # Predict in batches + pred = model(batch).squeeze().cpu().numpy() # Move results back to CPU + predictions.extend(pred) + return np.array(predictions) # Return boolean predictions diff --git a/rdagent/scenarios/kaggle/experiment/spaceship-titanic_template/model/model_randomforest.py b/rdagent/scenarios/kaggle/experiment/spaceship-titanic_template/model/model_randomforest.py new file mode 100644 index 00000000..3c64a094 --- /dev/null +++ b/rdagent/scenarios/kaggle/experiment/spaceship-titanic_template/model/model_randomforest.py @@ -0,0 +1,54 @@ +""" +Motivation of the model: +The Random Forest model is chosen for its robustness 
and ability to handle large datasets with higher dimensionality. +It reduces overfitting by averaging multiple decision trees and typically performs well out of the box, making it a good +baseline model for many classification tasks. +""" + +import pandas as pd +from sklearn.ensemble import RandomForestClassifier +from sklearn.metrics import accuracy_score + + +def select(X: pd.DataFrame) -> pd.DataFrame: + """ + Select relevant features. To be used in fit & predict function. + """ + # For now, we assume all features are relevant. This can be expanded to feature selection logic. + return X + + +def fit(X_train: pd.DataFrame, y_train: pd.Series, X_valid: pd.DataFrame, y_valid: pd.Series): + """ + Define and train the Random Forest model. Merge feature selection into the pipeline. + """ + # Initialize the Random Forest model + model = RandomForestClassifier(n_estimators=100, random_state=32, n_jobs=-1) + + # Select features (if any feature selection is needed) + X_train_selected = select(X_train) + X_valid_selected = select(X_valid) + + # Fit the model + model.fit(X_train_selected, y_train) + + # Validate the model + y_valid_pred = model.predict(X_valid_selected) + accuracy = accuracy_score(y_valid, y_valid_pred) + print(f"Validation Accuracy: {accuracy:.4f}") + + return model + + +def predict(model, X): + """ + Keep feature selection's consistency and make predictions. + """ + # Select features (if any feature selection is needed) + X_selected = select(X) + + # Predict using the trained model + y_pred_prob = model.predict_proba(X_selected)[:, 1] + + # Apply threshold to get boolean predictions + return y_pred_prob diff --git a/rdagent/scenarios/kaggle/experiment/spaceship-titanic_template/model/model_xgboost.py b/rdagent/scenarios/kaggle/experiment/spaceship-titanic_template/model/model_xgboost.py new file mode 100644 index 00000000..a70fa680 --- /dev/null +++ b/rdagent/scenarios/kaggle/experiment/spaceship-titanic_template/model/model_xgboost.py @@ -0,0 +1,40 @@ +""" +motivation of the model +""" + +import pandas as pd +import xgboost as xgb + + +def select(X: pd.DataFrame) -> pd.DataFrame: + # Ignore feature selection logic + return X + + +def fit(X_train: pd.DataFrame, y_train: pd.DataFrame, X_valid: pd.DataFrame, y_valid: pd.DataFrame): + """Define and train the model. Merge feature_select""" + X_train = select(X_train) + X_valid = select(X_valid) + dtrain = xgb.DMatrix(X_train, label=y_train) + dvalid = xgb.DMatrix(X_valid, label=y_valid) + + # TODO: for quick running.... + params = { + "nthred": -1, + } + num_round = 180 + + evallist = [(dtrain, "train"), (dvalid, "eval")] + bst = xgb.train(params, dtrain, num_round, evallist) + + return bst + + +def predict(model, X): + """ + Keep feature select's consistency. 
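The Spaceship Titanic XGBoost model above sets no objective (and spells the thread parameter "nthred", which XGBoost does not recognise; the key is "nthread"), so xgb.train falls back to reg:squarederror and predict returns raw regression scores rather than probabilities. Thresholding those scores at 0.5 in train.py still works loosely on 0/1 labels, but an explicit binary objective keeps the outputs in [0, 1]. A hedged sketch of the same fit with that change:

    import xgboost as xgb


    def fit_binary(X_train, y_train, X_valid, y_valid):
        dtrain = xgb.DMatrix(X_train, label=y_train)
        dvalid = xgb.DMatrix(X_valid, label=y_valid)
        params = {
            "objective": "binary:logistic",  # predictions become probabilities in [0, 1]
            "eval_metric": "logloss",
            "nthread": -1,
        }
        bst = xgb.train(params, dtrain, num_boost_round=180, evals=[(dtrain, "train"), (dvalid, "eval")])
        return bst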
+ """ + X = select(X) + dtest = xgb.DMatrix(X) + y_pred_prob = model.predict(dtest) + return y_pred_prob diff --git a/rdagent/scenarios/kaggle/experiment/spaceship-titanic_template/train.py b/rdagent/scenarios/kaggle/experiment/spaceship-titanic_template/train.py new file mode 100644 index 00000000..261ea364 --- /dev/null +++ b/rdagent/scenarios/kaggle/experiment/spaceship-titanic_template/train.py @@ -0,0 +1,109 @@ +import importlib.util +import random +from pathlib import Path + +import numpy as np +import pandas as pd +from fea_share_preprocess import preprocess_script +from sklearn.metrics import accuracy_score, matthews_corrcoef +from sklearn.preprocessing import LabelEncoder + +# Set random seed for reproducibility +SEED = 42 +random.seed(SEED) +np.random.seed(SEED) +DIRNAME = Path(__file__).absolute().resolve().parent + + +# support various method for metrics calculation +def compute_metrics_for_classification(y_true, y_pred): + """Compute accuracy metric for classification.""" + accuracy = accuracy_score(y_true, y_pred) + return accuracy + + +def import_module_from_path(module_name, module_path): + spec = importlib.util.spec_from_file_location(module_name, module_path) + module = importlib.util.module_from_spec(spec) + spec.loader.exec_module(module) + return module + + +# 1) Preprocess the data +# TODO 如果已经做过数据预处理了,不需要再做了 +X_train, X_valid, y_train, y_valid, X_test, passenger_ids = preprocess_script() + +# 2) Auto feature engineering +X_train_l, X_valid_l = [], [] +X_test_l = [] + +for f in DIRNAME.glob("feature/feat*.py"): + cls = import_module_from_path(f.stem, f).feature_engineering_cls() + cls.fit(X_train) + X_train_f = cls.transform(X_train) + X_valid_f = cls.transform(X_valid) + X_test_f = cls.transform(X_test) + + X_train_l.append(X_train_f) + X_valid_l.append(X_valid_f) + X_test_l.append(X_test_f) + +X_train = pd.concat(X_train_l, axis=1, keys=[f"feature_{i}" for i in range(len(X_train_l))]) +X_valid = pd.concat(X_valid_l, axis=1, keys=[f"feature_{i}" for i in range(len(X_valid_l))]) +X_test = pd.concat(X_test_l, axis=1, keys=[f"feature_{i}" for i in range(len(X_test_l))]) + +print(X_train.shape, X_valid.shape, X_test.shape) + +# Handle inf and -inf values +X_train.replace([np.inf, -np.inf], np.nan, inplace=True) +X_valid.replace([np.inf, -np.inf], np.nan, inplace=True) +X_test.replace([np.inf, -np.inf], np.nan, inplace=True) + +from sklearn.impute import SimpleImputer + +imputer = SimpleImputer(strategy="mean") + +X_train = pd.DataFrame(imputer.fit_transform(X_train), columns=X_train.columns) +X_valid = pd.DataFrame(imputer.transform(X_valid), columns=X_valid.columns) +X_test = pd.DataFrame(imputer.transform(X_test), columns=X_test.columns) + +# Remove duplicate columns +X_train = X_train.loc[:, ~X_train.columns.duplicated()] +X_valid = X_valid.loc[:, ~X_valid.columns.duplicated()] +X_test = X_test.loc[:, ~X_test.columns.duplicated()] + +# 3) Train the model +model_l = [] # list[tuple[model, predict_func,]] +for f in DIRNAME.glob("model/model*.py"): + m = import_module_from_path(f.stem, f) + model_l.append((m.fit(X_train, y_train, X_valid, y_valid), m.predict)) + +# 4) Evaluate the model on the validation set +y_valid_pred_l = [] +for model, predict_func in model_l: + y_valid_pred_l.append(predict_func(model, X_valid)) + +# 5) Ensemble +# TODO: ensemble method in a script +# Average the predictions and apply a threshold to determine class labels +y_valid_pred = np.mean(y_valid_pred_l, axis=0) +y_valid_pred = (y_valid_pred > 0.5).astype(int) + +mcc = 
compute_metrics_for_classification(y_valid, y_valid_pred) +print("Final accuracy on validation set: ", mcc) + +# 6) Save the validation accuracy +pd.Series(data=[mcc], index=["accuracy"]).to_csv("submission_score.csv") + +# 7) Make predictions on the test set and save them +y_test_pred_l = [] +for m, m_pred in model_l: + y_test_pred_l.append(m_pred(m, X_test)) + +y_test_pred = np.mean(y_test_pred_l, axis=0) +y_test_pred = (y_test_pred > 0.5).astype(bool) + +submission_result = pd.DataFrame({"PassengerId": passenger_ids, "Transported": y_test_pred}) + +# 8) Submit predictions for the test set +submission_result.to_csv("submission.csv", index=False)
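For the binary templates (s4e8 and Spaceship Titanic), the ensemble in train.py is a soft vote: the per-model scores are averaged and then thresholded at 0.5. A minimal standalone sketch of that step with hypothetical scores from two models:

    import numpy as np

    # Hypothetical per-model scores for four test rows
    y_test_pred_l = [
        np.array([0.2, 0.7, 0.9, 0.4]),
        np.array([0.3, 0.6, 0.8, 0.6]),
    ]

    # Soft vote: average the scores, then threshold at 0.5 for the final label
    y_test_pred = (np.mean(y_test_pred_l, axis=0) > 0.5).astype(bool)
    print(y_test_pred)  # [False  True  True False]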