feat: add more templates for kaggle (#291)
* init for forest-cover-type-prediction
* add nn model for forest-cover-type-prediction
* add cross_validation for forest-cover-type-prediction
* edit path to file
* CI issues
* CI Issue
* edit dir name
* fix a bug in s4e8 ensemble & init spaceship-titanic
* add nn model for s4e8 & spaceship-titanic
* init for s4e9
* ci issues
* ci issue
Showing 24 changed files with 1,366 additions and 106 deletions.
101 changes: 101 additions & 0 deletions
...ent/scenarios/kaggle/experiment/forest-cover-type-prediction_template/cross_validation.py
@@ -0,0 +1,101 @@
import importlib.util
import random
from pathlib import Path

import numpy as np
import pandas as pd
from scipy import stats
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold

# Set random seed for reproducibility
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
DIRNAME = Path(__file__).absolute().resolve().parent


def import_module_from_path(module_name, module_path):
    spec = importlib.util.spec_from_file_location(module_name, module_path)
    module = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(module)
    return module


# 1) Preprocess the data
# NOTE: this is an environment-specific absolute path; the other templates in
# this commit read from /kaggle/input instead.
data_df = pd.read_csv(
    "/data/userdata/v-haoranpan/RD-Agent/git_ignore_folder/data/forest-cover-type-prediction/train.csv"
)
data_df = data_df.drop(["Id"], axis=1)

X_train = data_df.drop(["Cover_Type"], axis=1)
y_train = data_df["Cover_Type"] - 1  # shift labels to 0-based for the models

# 2) Set up KFold
kf = KFold(n_splits=5, shuffle=True, random_state=SEED)

# Store results
accuracies = []

# 3) Train and evaluate using KFold
fold_number = 1
for train_index, valid_index in kf.split(X_train):
    print(f"Starting fold {fold_number}...")

    X_train_l, X_valid_l = [], []  # Reset feature lists for each fold
    X_tr, X_val = X_train.iloc[train_index], X_train.iloc[valid_index]
    y_tr, y_val = y_train.iloc[train_index], y_train.iloc[valid_index]

    # Feature engineering: apply every feature module shipped with the template
    for f in DIRNAME.glob("feature/feat*.py"):
        cls = import_module_from_path(f.stem, f).feature_engineering_cls()
        cls.fit(X_tr)
        X_train_f = cls.transform(X_tr)
        X_valid_f = cls.transform(X_val)

        X_train_l.append(X_train_f)
        X_valid_l.append(X_valid_f)

    X_tr = pd.concat(X_train_l, axis=1, keys=[f"feature_{i}" for i in range(len(X_train_l))])
    X_val = pd.concat(X_valid_l, axis=1, keys=[f"feature_{i}" for i in range(len(X_valid_l))])

    print("Shape of X_tr: ", X_tr.shape, " Shape of X_val: ", X_val.shape)

    # Replace inf and -inf with NaN
    X_tr.replace([np.inf, -np.inf], np.nan, inplace=True)
    X_val.replace([np.inf, -np.inf], np.nan, inplace=True)

    # Impute missing values (statistics fitted on the training fold only)
    imputer = SimpleImputer(strategy="mean")
    X_tr = pd.DataFrame(imputer.fit_transform(X_tr), columns=X_tr.columns)
    X_val = pd.DataFrame(imputer.transform(X_val), columns=X_val.columns)

    # Remove duplicate columns
    X_tr = X_tr.loc[:, ~X_tr.columns.duplicated()]
    X_val = X_val.loc[:, ~X_val.columns.duplicated()]

    # Train every model module shipped with the template
    model_l = []  # list[tuple[model, predict_func]]
    for f in DIRNAME.glob("model/model*.py"):
        m = import_module_from_path(f.stem, f)
        model_l.append((m.fit(X_tr, y_tr, X_val, y_val), m.predict))

    # Evaluate the models on the validation set
    y_valid_pred_l = []
    for model, predict_func in model_l:
        y_valid_pred = predict_func(model, X_val)
        y_valid_pred_l.append(y_valid_pred)

    # Majority vote ensemble
    y_valid_pred_ensemble = stats.mode(y_valid_pred_l, axis=0)[0].flatten()

    # Compute metrics
    accuracy = accuracy_score(y_val, y_valid_pred_ensemble)
    accuracies.append(accuracy)
    print(f"Fold {fold_number} accuracy: {accuracy}")

    fold_number += 1

# Print average accuracy
print(f"Average accuracy across folds: {np.mean(accuracies)}")
72 changes: 72 additions & 0 deletions
...scenarios/kaggle/experiment/forest-cover-type-prediction_template/fea_share_preprocess.py
@@ -0,0 +1,72 @@
import os

import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split


def prepreprocess():
    """
    This method loads the data, drops the unnecessary columns, and splits it into train and validation sets.
    """
    # Load and preprocess the data
    data_df = pd.read_csv("/kaggle/input/train.csv")
    data_df = data_df.drop(["Id"], axis=1)

    X = data_df.drop(["Cover_Type"], axis=1)
    y = data_df["Cover_Type"] - 1  # shift labels to 0-based

    # Split the data into training and validation sets
    X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.20, random_state=42)

    return X_train, X_valid, y_train, y_valid


def preprocess_script():
    """
    This method applies the preprocessing steps to the training, validation, and test datasets.
    """
    # Reuse cached splits if an earlier step has pickled them alongside this script
    if os.path.exists("X_train.pkl"):
        X_train = pd.read_pickle("X_train.pkl")
        X_valid = pd.read_pickle("X_valid.pkl")
        y_train = pd.read_pickle("y_train.pkl")
        y_valid = pd.read_pickle("y_valid.pkl")
        X_test = pd.read_pickle("X_test.pkl")
        ids = pd.read_pickle("ids.pkl")

        return X_train, X_valid, y_train, y_valid, X_test, ids

    X_train, X_valid, y_train, y_valid = prepreprocess()

    # Load and preprocess the test data
    submission_df = pd.read_csv("/kaggle/input/test.csv")
    ids = submission_df["Id"]
    X_test = submission_df.drop(["Id"], axis=1)

    return X_train, X_valid, y_train, y_valid, X_test, ids


def clean_and_impute_data(X_train, X_valid, X_test):
    """
    Handles inf and -inf values by replacing them with NaN,
    then imputes missing values using the mean strategy.
    Also removes duplicate columns.
    """
    # Replace inf and -inf with NaN
    X_train.replace([np.inf, -np.inf], np.nan, inplace=True)
    X_valid.replace([np.inf, -np.inf], np.nan, inplace=True)
    X_test.replace([np.inf, -np.inf], np.nan, inplace=True)

    # Impute missing values (statistics fitted on the training set only)
    imputer = SimpleImputer(strategy="mean")
    X_train = pd.DataFrame(imputer.fit_transform(X_train), columns=X_train.columns)
    X_valid = pd.DataFrame(imputer.transform(X_valid), columns=X_valid.columns)
    X_test = pd.DataFrame(imputer.transform(X_test), columns=X_test.columns)

    # Remove duplicate columns
    X_train = X_train.loc[:, ~X_train.columns.duplicated()]
    X_valid = X_valid.loc[:, ~X_valid.columns.duplicated()]
    X_test = X_test.loc[:, ~X_test.columns.duplicated()]

    return X_train, X_valid, X_test
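How the two helpers are meant to compose is left implicit, so here is a minimal usage sketch (an assumed caller, not part of this commit):

# Assuming this runs next to the template files:
from fea_share_preprocess import clean_and_impute_data, preprocess_script

X_train, X_valid, y_train, y_valid, X_test, ids = preprocess_script()
X_train, X_valid, X_test = clean_and_impute_data(X_train, X_valid, X_test)

The imputer statistics are fitted on X_train only, so the validation and test sets are transformed without leaking their own means.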
23 changes: 23 additions & 0 deletions
rdagent/scenarios/kaggle/experiment/forest-cover-type-prediction_template/feature/feature.py
@@ -0,0 +1,23 @@
import pandas as pd

"""
Here is the feature engineering code for each task, with a class that has a fit and transform method.
Remember to assign the class to the module-level `feature_engineering_cls` name so the loader can find it.
"""


class IdentityFeature:
    def fit(self, train_df: pd.DataFrame):
        """
        Fit the feature engineering model to the training data.
        """
        pass

    def transform(self, X: pd.DataFrame):
        """
        Transform the input data.
        """
        return X


feature_engineering_cls = IdentityFeature
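IdentityFeature is the no-op baseline; the cross-validation script globs for feature/feat*.py and instantiates each module's feature_engineering_cls. As a sketch of a concrete drop-in (a hypothetical feature/feat_distance.py; the hydrology column names are assumed from the competition's standard schema):

import numpy as np
import pandas as pd


class DistanceFeature:
    def fit(self, train_df: pd.DataFrame):
        # Stateless transform: nothing to learn from the training data.
        pass

    def transform(self, X: pd.DataFrame):
        X = X.copy()
        # Straight-line distance to surface water from the two hydrology offsets.
        X["Distance_To_Hydrology"] = np.sqrt(
            X["Horizontal_Distance_To_Hydrology"] ** 2 + X["Vertical_Distance_To_Hydrology"] ** 2
        )
        return X


feature_engineering_cls = DistanceFeature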
78 changes: 78 additions & 0 deletions
rdagent/scenarios/kaggle/experiment/forest-cover-type-prediction_template/model/model_nn.py
@@ -0,0 +1,78 @@
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset
from tqdm import tqdm

# Check if a GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


# Modified model for multi-class classification
class HybridFeatureInteractionModel(nn.Module):
    def __init__(self, num_features, num_classes):
        super(HybridFeatureInteractionModel, self).__init__()
        self.fc1 = nn.Linear(num_features, 128)
        self.bn1 = nn.BatchNorm1d(128)
        self.fc2 = nn.Linear(128, 64)
        self.bn2 = nn.BatchNorm1d(64)
        self.fc3 = nn.Linear(64, num_classes)  # Output nodes equal to num_classes
        self.dropout = nn.Dropout(0.3)

    def forward(self, x):
        x = F.relu(self.bn1(self.fc1(x)))
        x = F.relu(self.bn2(self.fc2(x)))
        x = self.dropout(x)
        x = self.fc3(x)  # No activation here, use CrossEntropyLoss
        return x


# Training function
def fit(X_train, y_train, X_valid, y_valid):
    num_features = X_train.shape[1]
    num_classes = len(np.unique(y_train))  # Determine number of classes
    model = HybridFeatureInteractionModel(num_features, num_classes).to(device)
    criterion = nn.CrossEntropyLoss()  # Use CrossEntropyLoss for multi-class
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

    # Convert to TensorDataset and create DataLoader
    train_dataset = TensorDataset(
        torch.tensor(X_train.to_numpy(), dtype=torch.float32), torch.tensor(y_train.to_numpy(), dtype=torch.long)
    )
    valid_dataset = TensorDataset(
        torch.tensor(X_valid.to_numpy(), dtype=torch.float32), torch.tensor(y_valid.to_numpy(), dtype=torch.long)
    )
    train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
    # NOTE: valid_loader is built but not consumed below; it is available for
    # adding per-epoch validation or early stopping to this quick template.
    valid_loader = DataLoader(valid_dataset, batch_size=32, shuffle=False)

    # Train the model
    model.train()
    for epoch in range(5):  # just for quick run
        print(f"Epoch {epoch + 1}/5")
        epoch_loss = 0
        for X_batch, y_batch in tqdm(train_loader, desc="Training", leave=False):
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            optimizer.zero_grad()
            outputs = model(X_batch)
            loss = criterion(outputs, y_batch)
            loss.backward()
            optimizer.step()
            epoch_loss += loss.item()
        print(f"End of epoch {epoch + 1}, Avg Loss: {epoch_loss / len(train_loader):.4f}")

    return model


# Prediction function
def predict(model, X):
    model.eval()
    predictions = []
    with torch.no_grad():
        X_tensor = torch.tensor(X.values, dtype=torch.float32).to(device)
        for i in tqdm(range(0, len(X_tensor), 32), desc="Predicting", leave=False):
            batch = X_tensor[i : i + 32]
            pred = model(batch)
            pred = torch.argmax(pred, dim=1).cpu().numpy()  # Use argmax to get class
            predictions.extend(pred)
    return np.array(predictions)
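To sanity-check the fit/predict contract, a quick smoke test on synthetic data (shapes and class count are made up; any DataFrame/Series pair with 0-based integer labels works):

import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
X_tr = pd.DataFrame(rng.normal(size=(256, 10)))
y_tr = pd.Series(rng.integers(0, 7, size=256))
X_va = pd.DataFrame(rng.normal(size=(64, 10)))
y_va = pd.Series(rng.integers(0, 7, size=64))

model = fit(X_tr, y_tr, X_va, y_va)  # five quick epochs on CPU or GPU
preds = predict(model, X_va)         # class indices in [0, 7)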
53 changes: 53 additions & 0 deletions
...arios/kaggle/experiment/forest-cover-type-prediction_template/model/model_randomforest.py
@@ -0,0 +1,53 @@
"""
Motivation of the model:
The Random Forest model is chosen for its robustness and ability to handle large datasets with higher dimensionality.
It reduces overfitting by averaging multiple decision trees and typically performs well out of the box, making it a good
baseline model for many classification tasks.
"""

import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score


def select(X: pd.DataFrame) -> pd.DataFrame:
    """
    Select relevant features. To be used in fit & predict function.
    """
    # For now, we assume all features are relevant. This can be expanded to feature selection logic.
    return X


def fit(X_train: pd.DataFrame, y_train: pd.Series, X_valid: pd.DataFrame, y_valid: pd.Series):
    """
    Define and train the Random Forest model. Merge feature selection into the pipeline.
    """
    # Initialize the Random Forest model
    model = RandomForestClassifier(n_estimators=100, random_state=32, n_jobs=-1)

    # Select features (if any feature selection is needed)
    X_train_selected = select(X_train)
    X_valid_selected = select(X_valid)

    # Fit the model
    model.fit(X_train_selected, y_train)

    # Validate the model
    y_valid_pred = model.predict(X_valid_selected)
    accuracy = accuracy_score(y_valid, y_valid_pred)
    print(f"Validation Accuracy: {accuracy:.4f}")

    return model


def predict(model, X):
    """
    Keep feature selection's consistency and make predictions.
    """
    # Select features (if any feature selection is needed)
    X_selected = select(X)

    # Predict using the trained model
    y_pred = model.predict(X_selected)

    return y_pred
41 changes: 41 additions & 0 deletions
.../scenarios/kaggle/experiment/forest-cover-type-prediction_template/model/model_xgboost.py
@@ -0,0 +1,41 @@
"""
Motivation of the model
"""

import pandas as pd
import xgboost as xgb


def select(X: pd.DataFrame) -> pd.DataFrame:
    # Ignore feature selection logic
    return X


def fit(X_train: pd.DataFrame, y_train: pd.Series, X_valid: pd.DataFrame, y_valid: pd.Series):
    """Define and train the model. Merge feature_select"""
    X_train = select(X_train)
    X_valid = select(X_valid)
    dtrain = xgb.DMatrix(X_train, label=y_train)
    dvalid = xgb.DMatrix(X_valid, label=y_valid)

    params = {
        "objective": "multi:softmax",  # Use softmax for multi-class classification
        "num_class": len(set(y_train)),  # Number of classes (y is a Series of labels)
        "nthread": -1,
    }
    num_round = 20

    evallist = [(dtrain, "train"), (dvalid, "eval")]
    bst = xgb.train(params, dtrain, num_round, evallist)

    return bst


def predict(model, X):
    """
    Keep feature select's consistency.
    """
    X = select(X)
    dtest = xgb.DMatrix(X)
    y_pred = model.predict(dtest)
    # multi:softmax returns the predicted class labels as floats, so cast to int
    return y_pred.astype(int)
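Because the objective is multi:softmax, model.predict returns hard class labels (as floats), hence the astype(int). A probability-based variant is a small change; here is a self-contained sketch with synthetic data (shapes and the 7-class count mirror this competition):

import numpy as np
import xgboost as xgb

rng = np.random.default_rng(0)
X = rng.normal(size=(200, 10))
y = rng.integers(0, 7, size=200)
dtrain = xgb.DMatrix(X[:160], label=y[:160])
dvalid = xgb.DMatrix(X[160:], label=y[160:])

# "multi:softprob" yields an (n_samples, num_class) probability matrix
# instead of hard labels; argmax recovers the class indices.
params = {"objective": "multi:softprob", "num_class": 7, "nthread": -1}
bst = xgb.train(params, dtrain, num_boost_round=20, evals=[(dtrain, "train"), (dvalid, "eval")])
probs = bst.predict(dvalid)  # shape: (40, 7)
y_pred = np.argmax(probs, axis=1)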