Skip to content

Commit

Permalink
feat: supporting Mnist competition (#375)
Browse files Browse the repository at this point in the history
* Initialised a version

* Fixes

* feat mnist & fix some bugs(with TODO)

---------

Co-authored-by: TPLin22 <[email protected]>
  • Loading branch information
xisen-w and TPLin22 authored Sep 27, 2024
1 parent c41686f commit e958a34
Show file tree
Hide file tree
Showing 5 changed files with 234 additions and 0 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
import os

import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split


def prepreprocess():
"""
This method loads the data, drops the unnecessary columns, and splits it into train and validation sets.
"""
# Load and preprocess the data
data_df = pd.read_csv("/kaggle/input/train.csv")
# data_df = data_df.drop(["ImageId"], axis=1)

X = data_df.drop(["label"], axis=1)
y = data_df["label"]

# Split the data into training and validation sets
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.20, random_state=42)

return X_train, X_valid, y_train, y_valid


def preprocess_script():
"""
This method applies the preprocessing steps to the training, validation, and test datasets.
"""
if os.path.exists("/kaggle/input/X_train.pkl"):
X_train = pd.read_pickle("/kaggle/input/X_train.pkl")
X_valid = pd.read_pickle("/kaggle/input/X_valid.pkl")
y_train = pd.read_pickle("/kaggle/input/y_train.pkl")
y_valid = pd.read_pickle("/kaggle/input/y_valid.pkl")
X_test = pd.read_pickle("/kaggle/input/X_test.pkl")
others = pd.read_pickle("/kaggle/input/others.pkl")

return X_train, X_valid, y_train, y_valid, X_test, *others

X_train, X_valid, y_train, y_valid = prepreprocess()

# Load and preprocess the test data
submission_df = pd.read_csv("/kaggle/input/test.csv")
# ids = submission_df["ImageId"]
X_test = submission_df

X_train = X_train / 255
X_valid = X_valid / 255
X_test = X_test / 255

return X_train, X_valid, y_train, y_valid, X_test


def clean_and_impute_data(X_train, X_valid, X_test):
"""
Handles inf and -inf values by replacing them with NaN,
then imputes missing values using the mean strategy.
Also removes duplicate columns.
"""
# Replace inf and -inf with NaN
X_train.replace([np.inf, -np.inf], np.nan, inplace=True)
X_valid.replace([np.inf, -np.inf], np.nan, inplace=True)
X_test.replace([np.inf, -np.inf], np.nan, inplace=True)

# Impute missing values
imputer = SimpleImputer(strategy="mean")
X_train = pd.DataFrame(imputer.fit_transform(X_train), columns=X_train.columns)
X_valid = pd.DataFrame(imputer.transform(X_valid), columns=X_valid.columns)
X_test = pd.DataFrame(imputer.transform(X_test), columns=X_test.columns)

# Remove duplicate columns
X_train = X_train.loc[:, ~X_train.columns.duplicated()]
X_valid = X_valid.loc[:, ~X_valid.columns.duplicated()]
X_test = X_test.loc[:, ~X_test.columns.duplicated()]

return X_train, X_valid, X_test
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
import pandas as pd

"""
Here is the feature engineering code for each task, with a class that has a fit and transform method.
Remember
"""


class IdentityFeature:
def fit(self, train_df: pd.DataFrame):
"""
Fit the feature engineering model to the training data.
"""
pass

def transform(self, X: pd.DataFrame):
"""
Transform the input data.
"""
return X


feature_engineering_cls = IdentityFeature
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
"""
motivation of the model
"""

import pandas as pd
import xgboost as xgb


def fit(X_train, y_train, X_valid, y_valid):
"""Define and train the model. Merge feature_select"""
dtrain = xgb.DMatrix(X_train, label=y_train)
dvalid = xgb.DMatrix(X_valid, label=y_valid)

params = {
"objective": "multi:softmax",
"eval_metric": "mlogloss",
"num_class": 10,
"nthread": -1,
"tree_method": "gpu_hist",
"device": "cuda",
}
num_round = 100

evallist = [(dtrain, "train"), (dvalid, "eval")]
model = xgb.train(params, dtrain, num_round, evallist, early_stopping_rounds=10)

return model


def predict(model, X):
"""
Keep feature select's consistency.
"""
dtest = xgb.DMatrix(X)
return model.predict(dtest).astype(int)
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
import pandas as pd


def select(X: pd.DataFrame) -> pd.DataFrame:
"""
Select relevant features. To be used in fit & predict function.
"""
# For now, we assume all features are relevant. This can be expanded to feature selection logic.
if X.columns.nlevels == 1:
return X
X.columns = ["_".join(str(col)).strip() for col in X.columns.values]
return X
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
import importlib.util
import random
from pathlib import Path

import numpy as np
import pandas as pd
from fea_share_preprocess import clean_and_impute_data, preprocess_script
from sklearn.metrics import accuracy_score

# Set random seed for reproducibility
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
DIRNAME = Path(__file__).absolute().resolve().parent


def compute_metrics_for_classification(y_true, y_pred):
"""Compute accuracy for classification."""
return accuracy_score(y_true, y_pred)


def import_module_from_path(module_name, module_path):
spec = importlib.util.spec_from_file_location(module_name, module_path)
module = importlib.util.module_from_spec(spec)
spec.loader.exec_module(module)
return module


# 1) Preprocess the data
X_train, X_valid, y_train, y_valid, X_test = preprocess_script()

# 2) Auto feature engineering
X_train_l, X_valid_l = [], []
X_test_l = []

for f in DIRNAME.glob("feature/feat*.py"):
cls = import_module_from_path(f.stem, f).feature_engineering_cls()
cls.fit(X_train)
X_train_f = cls.transform(X_train)
X_valid_f = cls.transform(X_valid)
X_test_f = cls.transform(X_test)

if X_train_f.shape[-1] == X_valid_f.shape[-1] and X_train_f.shape[-1] == X_test_f.shape[-1]:
X_train_l.append(X_train_f)
X_valid_l.append(X_valid_f)
X_test_l.append(X_test_f)

X_train = pd.concat(X_train_l, axis=1, keys=[f"feature_{i}" for i in range(len(X_train_l))])
X_valid = pd.concat(X_valid_l, axis=1, keys=[f"feature_{i}" for i in range(len(X_valid_l))])
X_test = pd.concat(X_test_l, axis=1, keys=[f"feature_{i}" for i in range(len(X_test_l))])

print(X_train.shape, X_valid.shape, X_test.shape)

# Handle inf and -inf values
X_train, X_valid, X_test = clean_and_impute_data(X_train, X_valid, X_test)


model_l = [] # list[tuple[model, predict_func]]
for f in DIRNAME.glob("model/model*.py"):
select_python_path = f.with_name(f.stem.replace("model", "select") + f.suffix)
select_m = import_module_from_path(select_python_path.stem, select_python_path)
X_train_selected = select_m.select(X_train.copy())
X_valid_selected = select_m.select(X_valid.copy())

m = import_module_from_path(f.stem, f)
model_l.append((m.fit(X_train_selected, y_train, X_valid_selected, y_valid), m.predict, select_m))

# 4) Evaluate the model on the validation set
metrics_all = []
for model, predict_func, select_m in model_l:
X_valid_selected = select_m.select(X_valid.copy())
y_valid_pred = predict_func(model, X_valid_selected)
accuracy = accuracy_score(y_valid, y_valid_pred)
print(f"final accuracy on valid set: {accuracy}")
metrics_all.append(accuracy)

# 5) Save the validation accuracy
min_index = np.argmax(metrics_all)
pd.Series(data=[metrics_all[min_index]], index=["multi-class accuracy"]).to_csv("submission_score.csv")

# 6) Submit predictions for the test
ids = range(1, len(X_test) + 1)

# TODO: fix selection
print(X_valid_selected.columns)
y_test_pred = model_l[min_index][1](model_l[min_index][0], model_l[min_index][2].select(X_test))
submission_result = pd.DataFrame({"ImageId": ids, "Label": y_test_pred})
submission_result.to_csv("submission.csv", index=False)

0 comments on commit e958a34

Please sign in to comment.