feat: add more templates for kaggle (#291)
* init for forest-cover-type-prediction

* add nn model for forest-cover-type-prediction

* add cross_validation for forest-cover-type-prediction

* edit path to file

* CI issues

* CI Issue

* edit dir name

* fix a bug in s4e8 ensemble & init spaceship-titanic

* add nn model for s4e8 & spaceship-titanic

* init for s4e9

* ci issues

* ci issue
TPLin22 authored Sep 22, 2024
1 parent c8efdd5 commit da752ec
Showing 24 changed files with 1,366 additions and 106 deletions.
@@ -0,0 +1,101 @@
import importlib.util
import random
from pathlib import Path

import numpy as np
import pandas as pd
from scipy import stats
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold

# Set random seed for reproducibility
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
DIRNAME = Path(__file__).absolute().resolve().parent


def import_module_from_path(module_name, module_path):
    spec = importlib.util.spec_from_file_location(module_name, module_path)
    module = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(module)
    return module


# 1) Preprocess the data
data_df = pd.read_csv(
"/data/userdata/v-haoranpan/RD-Agent/git_ignore_folder/data/forest-cover-type-prediction/train.csv"
)
data_df = data_df.drop(["Id"], axis=1)

X_train = data_df.drop(["Cover_Type"], axis=1)
y_train = data_df["Cover_Type"] - 1

# 2) Set up KFold
kf = KFold(n_splits=5, shuffle=True, random_state=SEED)

# Store results
accuracies = []

# 3) Train and evaluate using KFold
fold_number = 1
for train_index, valid_index in kf.split(X_train):
    print(f"Starting fold {fold_number}...")

    X_train_l, X_valid_l = [], []  # Reset feature lists for each fold
    X_tr, X_val = X_train.iloc[train_index], X_train.iloc[valid_index]
    y_tr, y_val = y_train.iloc[train_index], y_train.iloc[valid_index]

    # Feature engineering
    for f in DIRNAME.glob("feature/feat*.py"):
        cls = import_module_from_path(f.stem, f).feature_engineering_cls()
        cls.fit(X_tr)
        X_train_f = cls.transform(X_tr)
        X_valid_f = cls.transform(X_val)

        X_train_l.append(X_train_f)
        X_valid_l.append(X_valid_f)

    X_tr = pd.concat(X_train_l, axis=1, keys=[f"feature_{i}" for i in range(len(X_train_l))])
    X_val = pd.concat(X_valid_l, axis=1, keys=[f"feature_{i}" for i in range(len(X_valid_l))])

    print("Shape of X_tr: ", X_tr.shape, " Shape of X_val: ", X_val.shape)

    # Replace inf and -inf with NaN
    X_tr.replace([np.inf, -np.inf], np.nan, inplace=True)
    X_val.replace([np.inf, -np.inf], np.nan, inplace=True)

    # Impute missing values
    imputer = SimpleImputer(strategy="mean")
    X_tr = pd.DataFrame(imputer.fit_transform(X_tr), columns=X_tr.columns)
    X_val = pd.DataFrame(imputer.transform(X_val), columns=X_val.columns)

    # Remove duplicate columns
    X_tr = X_tr.loc[:, ~X_tr.columns.duplicated()]
    X_val = X_val.loc[:, ~X_val.columns.duplicated()]

    # Train the model
    model_l = []  # list[tuple[model, predict_func]]
    for f in DIRNAME.glob("model/model*.py"):
        m = import_module_from_path(f.stem, f)
        model_l.append((m.fit(X_tr, y_tr, X_val, y_val), m.predict))

    # Evaluate the model on the validation set
    y_valid_pred_l = []
    for model, predict_func in model_l:
        y_valid_pred = predict_func(model, X_val)
        y_valid_pred_l.append(y_valid_pred)

    # Majority vote ensemble
    y_valid_pred_ensemble = stats.mode(y_valid_pred_l, axis=0)[0].flatten()

    # Compute metrics
    accuracy = accuracy_score(y_val, y_valid_pred_ensemble)
    accuracies.append(accuracy)
    print(f"Fold {fold_number} accuracy: {accuracy}")

    fold_number += 1

# Print average accuracy
print(f"Average accuracy across folds: {np.mean(accuracies)}")
@@ -0,0 +1,72 @@
import os

import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split


def prepreprocess():
    """
    This method loads the data, drops the unnecessary columns, and splits it into train and validation sets.
    """
    # Load and preprocess the data
    data_df = pd.read_csv("/kaggle/input/train.csv")
    data_df = data_df.drop(["Id"], axis=1)

    X = data_df.drop(["Cover_Type"], axis=1)
    y = data_df["Cover_Type"] - 1

    # Split the data into training and validation sets
    X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.20, random_state=42)

    return X_train, X_valid, y_train, y_valid


def preprocess_script():
    """
    This method loads the cached train/validation/test splits if they exist; otherwise it builds them from the raw competition data.
    """
    if os.path.exists("X_train.pkl"):
        X_train = pd.read_pickle("X_train.pkl")
        X_valid = pd.read_pickle("X_valid.pkl")
        y_train = pd.read_pickle("y_train.pkl")
        y_valid = pd.read_pickle("y_valid.pkl")
        X_test = pd.read_pickle("X_test.pkl")
        ids = pd.read_pickle("ids.pkl")

        return X_train, X_valid, y_train, y_valid, X_test, ids

    X_train, X_valid, y_train, y_valid = prepreprocess()

    # Load and preprocess the test data
    submission_df = pd.read_csv("/kaggle/input/test.csv")
    ids = submission_df["Id"]
    X_test = submission_df.drop(["Id"], axis=1)

    return X_train, X_valid, y_train, y_valid, X_test, ids


def clean_and_impute_data(X_train, X_valid, X_test):
    """
    Handles inf and -inf values by replacing them with NaN,
    then imputes missing values using the mean strategy.
    Also removes duplicate columns.
    """
    # Replace inf and -inf with NaN
    X_train.replace([np.inf, -np.inf], np.nan, inplace=True)
    X_valid.replace([np.inf, -np.inf], np.nan, inplace=True)
    X_test.replace([np.inf, -np.inf], np.nan, inplace=True)

    # Impute missing values
    imputer = SimpleImputer(strategy="mean")
    X_train = pd.DataFrame(imputer.fit_transform(X_train), columns=X_train.columns)
    X_valid = pd.DataFrame(imputer.transform(X_valid), columns=X_valid.columns)
    X_test = pd.DataFrame(imputer.transform(X_test), columns=X_test.columns)

    # Remove duplicate columns
    X_train = X_train.loc[:, ~X_train.columns.duplicated()]
    X_valid = X_valid.loc[:, ~X_valid.columns.duplicated()]
    X_test = X_test.loc[:, ~X_test.columns.duplicated()]

    return X_train, X_valid, X_test
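A minimal usage sketch of how these helpers chain together (assuming the /kaggle/input CSVs or the cached pickle files are available):

X_train, X_valid, y_train, y_valid, X_test, ids = preprocess_script()
X_train, X_valid, X_test = clean_and_impute_data(X_train, X_valid, X_test)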
@@ -0,0 +1,23 @@
import pandas as pd

"""
Here is the feature engineering code for each task, with a class that has a fit and transform method.
Remember
"""


class IdentityFeature:
    def fit(self, train_df: pd.DataFrame):
        """
        Fit the feature engineering model to the training data.
        """
        pass

    def transform(self, X: pd.DataFrame):
        """
        Transform the input data.
        """
        return X


feature_engineering_cls = IdentityFeature
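For illustration, a non-identity feature class that would plug into the same fit/transform interface could look like the sketch below (hypothetical, not part of this commit; the two hydrology distance columns are assumed to be present in the Forest Cover Type data):

import numpy as np
import pandas as pd


class HydrologyDistanceFeature:
    """Hypothetical example: derive a straight-line distance to hydrology from the two raw distance columns."""

    def fit(self, train_df: pd.DataFrame):
        # Stateless transformation; nothing to learn from the training data.
        pass

    def transform(self, X: pd.DataFrame):
        X = X.copy()
        cols = {"Horizontal_Distance_To_Hydrology", "Vertical_Distance_To_Hydrology"}
        if cols.issubset(X.columns):
            X["Euclidean_Distance_To_Hydrology"] = np.hypot(
                X["Horizontal_Distance_To_Hydrology"], X["Vertical_Distance_To_Hydrology"]
            )
        return X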
@@ -0,0 +1,78 @@
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset
from tqdm import tqdm

# Check if a GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


# Modified model for multi-class classification
class HybridFeatureInteractionModel(nn.Module):
    def __init__(self, num_features, num_classes):
        super(HybridFeatureInteractionModel, self).__init__()
        self.fc1 = nn.Linear(num_features, 128)
        self.bn1 = nn.BatchNorm1d(128)
        self.fc2 = nn.Linear(128, 64)
        self.bn2 = nn.BatchNorm1d(64)
        self.fc3 = nn.Linear(64, num_classes)  # Output nodes equal to num_classes
        self.dropout = nn.Dropout(0.3)

    def forward(self, x):
        x = F.relu(self.bn1(self.fc1(x)))
        x = F.relu(self.bn2(self.fc2(x)))
        x = self.dropout(x)
        x = self.fc3(x)  # No activation here, use CrossEntropyLoss
        return x


# Training function
def fit(X_train, y_train, X_valid, y_valid):
    num_features = X_train.shape[1]
    num_classes = len(np.unique(y_train))  # Determine number of classes
    model = HybridFeatureInteractionModel(num_features, num_classes).to(device)
    criterion = nn.CrossEntropyLoss()  # Use CrossEntropyLoss for multi-class
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

    # Convert to TensorDataset and create DataLoader
    train_dataset = TensorDataset(
        torch.tensor(X_train.to_numpy(), dtype=torch.float32), torch.tensor(y_train.to_numpy(), dtype=torch.long)
    )
    valid_dataset = TensorDataset(
        torch.tensor(X_valid.to_numpy(), dtype=torch.float32), torch.tensor(y_valid.to_numpy(), dtype=torch.long)
    )
    train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
    valid_loader = DataLoader(valid_dataset, batch_size=32, shuffle=False)

    # Train the model
    model.train()
    for epoch in range(5):  # just for quick run
        print(f"Epoch {epoch + 1}/5")
        epoch_loss = 0
        for X_batch, y_batch in tqdm(train_loader, desc="Training", leave=False):
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            optimizer.zero_grad()
            outputs = model(X_batch)
            loss = criterion(outputs, y_batch)
            loss.backward()
            optimizer.step()
            epoch_loss += loss.item()
        print(f"End of epoch {epoch + 1}, Avg Loss: {epoch_loss / len(train_loader):.4f}")

    return model


# Prediction function
def predict(model, X):
    model.eval()
    predictions = []
    with torch.no_grad():
        X_tensor = torch.tensor(X.values, dtype=torch.float32).to(device)
        for i in tqdm(range(0, len(X_tensor), 32), desc="Predicting", leave=False):
            batch = X_tensor[i : i + 32]
            pred = model(batch)
            pred = torch.argmax(pred, dim=1).cpu().numpy()  # Use argmax to get class
            predictions.extend(pred)
    return np.array(predictions)
@@ -0,0 +1,53 @@
"""
Motivation of the model:
The Random Forest model is chosen for its robustness and ability to handle large datasets with higher dimensionality.
It reduces overfitting by averaging multiple decision trees and typically performs well out of the box, making it a good
baseline model for many classification tasks.
"""

import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score


def select(X: pd.DataFrame) -> pd.DataFrame:
    """
    Select relevant features. To be used in both the fit and predict functions.
    """
    # For now, we assume all features are relevant. This can be expanded to feature selection logic.
    return X


def fit(X_train: pd.DataFrame, y_train: pd.Series, X_valid: pd.DataFrame, y_valid: pd.Series):
    """
    Define and train the Random Forest model. Merge feature selection into the pipeline.
    """
    # Initialize the Random Forest model
    model = RandomForestClassifier(n_estimators=100, random_state=32, n_jobs=-1)

    # Select features (if any feature selection is needed)
    X_train_selected = select(X_train)
    X_valid_selected = select(X_valid)

    # Fit the model
    model.fit(X_train_selected, y_train)

    # Validate the model
    y_valid_pred = model.predict(X_valid_selected)
    accuracy = accuracy_score(y_valid, y_valid_pred)
    print(f"Validation Accuracy: {accuracy:.4f}")

    return model


def predict(model, X):
    """
    Keep feature selection consistent with training and make predictions.
    """
    # Select features (if any feature selection is needed)
    X_selected = select(X)

    # Predict using the trained model
    y_pred = model.predict(X_selected)

    return y_pred
@@ -0,0 +1,41 @@
"""
motivation of the model
"""

import pandas as pd
import xgboost as xgb


def select(X: pd.DataFrame) -> pd.DataFrame:
    # Ignore feature selection logic
    return X


def fit(X_train: pd.DataFrame, y_train: pd.Series, X_valid: pd.DataFrame, y_valid: pd.Series):
    """Define and train the model, merging feature selection into the pipeline."""
    X_train = select(X_train)
    X_valid = select(X_valid)
    dtrain = xgb.DMatrix(X_train, label=y_train)
    dvalid = xgb.DMatrix(X_valid, label=y_valid)

    params = {
        "objective": "multi:softmax",  # Use softmax for multi-class classification
        "num_class": len(set(y_train)),  # Number of classes
        "nthread": -1,
    }
    num_round = 20

    evallist = [(dtrain, "train"), (dvalid, "eval")]
    bst = xgb.train(params, dtrain, num_round, evallist)

    return bst


def predict(model, X):
    """
    Keep feature selection consistent with training and make predictions.
    """
    X = select(X)
    dtest = xgb.DMatrix(X)
    y_pred = model.predict(dtest)
    return y_pred.astype(int)
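Since the targets are shifted to a 0-based range during preprocessing (y = Cover_Type - 1), predictions from any of these models have to be shifted back before building a submission. A minimal sketch, assuming the ids Series from preprocess_script and an ensembled y_test_pred array (the latter name is hypothetical):

import pandas as pd

# y_test_pred holds 0-based class indices; restore the original Cover_Type labels before writing out.
submission = pd.DataFrame({"Id": ids, "Cover_Type": y_test_pred + 1})
submission.to_csv("submission.csv", index=False)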