diff --git a/rdagent/app/kaggle/loop.py b/rdagent/app/kaggle/loop.py index 2f1111ff..e448366b 100644 --- a/rdagent/app/kaggle/loop.py +++ b/rdagent/app/kaggle/loop.py @@ -35,20 +35,16 @@ def __init__(self, PROP_SETTING: BasePropSetting): with logger.tag("init"): scen: Scenario = import_class(PROP_SETTING.scen)(PROP_SETTING.competition) logger.log_object(scen, tag="scenario") - knowledge_base = ( import_class(PROP_SETTING.knowledge_base)(PROP_SETTING.knowledge_base_path, scen) if PROP_SETTING.knowledge_base != "" else None ) logger.log_object(knowledge_base, tag="knowledge_base") - self.hypothesis_gen: HypothesisGen = import_class(PROP_SETTING.hypothesis_gen)(scen) logger.log_object(self.hypothesis_gen, tag="hypothesis generator") - self.hypothesis2experiment: Hypothesis2Experiment = import_class(PROP_SETTING.hypothesis2experiment)() logger.log_object(self.hypothesis2experiment, tag="hypothesis2experiment") - self.feature_coder: Developer = import_class(PROP_SETTING.feature_coder)(scen) logger.log_object(self.feature_coder, tag="feature coder") self.model_feature_selection_coder: Developer = import_class(PROP_SETTING.model_feature_selection_coder)( @@ -57,12 +53,10 @@ def __init__(self, PROP_SETTING: BasePropSetting): logger.log_object(self.model_feature_selection_coder, tag="model feature selection coder") self.model_coder: Developer = import_class(PROP_SETTING.model_coder)(scen) logger.log_object(self.model_coder, tag="model coder") - self.feature_runner: Developer = import_class(PROP_SETTING.feature_runner)(scen) logger.log_object(self.feature_runner, tag="feature runner") self.model_runner: Developer = import_class(PROP_SETTING.model_runner)(scen) logger.log_object(self.model_runner, tag="model runner") - self.summarizer: HypothesisExperiment2Feedback = import_class(PROP_SETTING.summarizer)(scen) logger.log_object(self.summarizer, tag="summarizer") self.trace = KGTrace(scen=scen, knowledge_base=knowledge_base) @@ -88,7 +82,6 @@ def running(self, prev_out: 
dict[str, Any]): else: exp = self.model_runner.develop(prev_out["coding"]) logger.log_object(exp, tag="runner result") - if KAGGLE_IMPLEMENT_SETTING.competition in [ "optiver-realized-volatility-prediction", "covid19-global-forecasting-week-1", @@ -99,7 +92,6 @@ def running(self, prev_out: dict[str, Any]): ) except Exception as e: logger.error(f"Merge python files to one file failed: {e}") - if KAGGLE_IMPLEMENT_SETTING.auto_submit: csv_path = exp.experiment_workspace.workspace_path / "submission.csv" try: @@ -129,21 +121,16 @@ def running(self, prev_out: dict[str, Any]): def main(path=None, step_n=None, competition=None): """ Auto R&D Evolving loop for models in a kaggle{} scenario. - You can continue running session by - .. code-block:: bash - dotenv run -- python rdagent/app/kaggle/loop.py [--competition titanic] $LOG_PATH/__session__/1/0_propose --step_n 1 # `step_n` is a optional parameter rdagent kaggle --competition playground-series-s4e8 # You are encouraged to use this one. - """ if competition: KAGGLE_IMPLEMENT_SETTING.competition = competition download_data(competition=competition, local_path=KAGGLE_IMPLEMENT_SETTING.local_data_path) else: logger.error("Please specify competition name.") - if path is None: kaggle_loop = KaggleRDLoop(KAGGLE_IMPLEMENT_SETTING) else: diff --git a/rdagent/log/ui/app.py b/rdagent/log/ui/app.py index 8020e30c..3f6dd5d0 100644 --- a/rdagent/log/ui/app.py +++ b/rdagent/log/ui/app.py @@ -88,6 +88,9 @@ if "lround" not in state: state.lround = 0 # RD Loop Round +if "times" not in state: + state.times = defaultdict(lambda: defaultdict(list)) + if "erounds" not in state: state.erounds = defaultdict(int) # Evolving Rounds in each RD Loop @@ -186,6 +189,17 @@ def get_msgs_until(end_func: Callable[[Message], bool] = lambda _: True): ) state.msgs[state.lround][msg.tag].append(msg) + + # Update Times + if "init" in tags: + state.times[state.lround]["init"].append(msg.timestamp) + if "r" in tags: + 
state.times[state.lround]["r"].append(msg.timestamp) + if "d" in tags: + state.times[state.lround]["d"].append(msg.timestamp) + if "ef" in tags: + state.times[state.lround]["ef"].append(msg.timestamp) + # Stop Getting Logs if end_func(msg): break @@ -224,6 +238,7 @@ def refresh(same_trace: bool = False): state.last_msg = None state.current_tags = [] state.alpha158_metrics = None + state.times = defaultdict(lambda: defaultdict(list)) def evolving_feedback_window(wsf: FactorSingleFeedback | ModelCoderFeedback): @@ -741,6 +756,18 @@ def evolving_window(): st.markdown(state.scenario.rich_style_description + css, unsafe_allow_html=True) +def show_times(round: int): + for k, v in state.times[round].items(): + if len(v) > 1: + diff = v[-1] - v[0] + else: + diff = v[0] - v[0] + total_seconds = diff.seconds + seconds = total_seconds % 60 + minutes = total_seconds // 60 + st.markdown(f"**:blue[{k}]**: :red[**{minutes}**] minutes :orange[**{seconds}**] seconds") + + if state.scenario is not None: summary_window() @@ -754,8 +781,12 @@ def evolving_window(): round = st.radio("**Loops**", horizontal=True, options=r_options, index=state.lround - 1) else: round = 1 + + show_times(round) rf_c, d_c = st.columns([2, 2]) elif isinstance(state.scenario, GeneralModelScenario): + show_times(round) + rf_c = st.container() d_c = st.container() round = 1 diff --git a/rdagent/scenarios/kaggle/experiment/digit-recognizer_template/model/select_lightgbm.py b/rdagent/scenarios/kaggle/experiment/digit-recognizer_template/model/select_lightgbm.py new file mode 100644 index 00000000..f230f130 --- /dev/null +++ b/rdagent/scenarios/kaggle/experiment/digit-recognizer_template/model/select_lightgbm.py @@ -0,0 +1,12 @@ +import pandas as pd + + +def select(X: pd.DataFrame) -> pd.DataFrame: + """ + Select relevant features. To be used in fit & predict function. + """ + # For now, we assume all features are relevant. This can be expanded to feature selection logic. 
+ if X.columns.nlevels == 1: + return X + X.columns = ["_".join(str(i) for i in col).strip() for col in X.columns.values] + return X diff --git a/rdagent/scenarios/kaggle/experiment/digit-recognizer_template/model/select_randomforest.py b/rdagent/scenarios/kaggle/experiment/digit-recognizer_template/model/select_randomforest.py new file mode 100644 index 00000000..f230f130 --- /dev/null +++ b/rdagent/scenarios/kaggle/experiment/digit-recognizer_template/model/select_randomforest.py @@ -0,0 +1,12 @@ +import pandas as pd + + +def select(X: pd.DataFrame) -> pd.DataFrame: + """ + Select relevant features. To be used in fit & predict function. + """ + # For now, we assume all features are relevant. This can be expanded to feature selection logic. + if X.columns.nlevels == 1: + return X + X.columns = ["_".join(str(i) for i in col).strip() for col in X.columns.values] + return X diff --git a/rdagent/scenarios/kaggle/experiment/digit-recognizer_template/train.py b/rdagent/scenarios/kaggle/experiment/digit-recognizer_template/train.py index c7572f09..c57cbe4a 100644 --- a/rdagent/scenarios/kaggle/experiment/digit-recognizer_template/train.py +++ b/rdagent/scenarios/kaggle/experiment/digit-recognizer_template/train.py @@ -75,14 +75,14 @@ def import_module_from_path(module_name, module_path): metrics_all.append(accuracy) # 5) Save the validation accuracy -min_index = np.argmax(metrics_all) -pd.Series(data=[metrics_all[min_index]], index=["multi-class accuracy"]).to_csv("submission_score.csv") +max_index = np.argmax(metrics_all) +pd.Series(data=[metrics_all[max_index]], index=["multi-class accuracy"]).to_csv("submission_score.csv") # 6) Submit predictions for the test ids = range(1, len(X_test) + 1) # TODO: fix selection print(X_valid_selected.columns) -y_test_pred = model_l[min_index][1](model_l[min_index][0], model_l[min_index][2].select(X_test)).flatten() +y_test_pred = model_l[max_index][1](model_l[max_index][0], model_l[max_index][2].select(X_test)).flatten() 
submission_result = pd.DataFrame({"ImageId": ids, "Label": y_test_pred}) submission_result.to_csv("submission.csv", index=False) diff --git a/rdagent/scenarios/kaggle/experiment/forest-cover-type-prediction_template/model/model_nn.py b/rdagent/scenarios/kaggle/experiment/forest-cover-type-prediction_template/model/model_nn.py deleted file mode 100644 index 13ddc8e5..00000000 --- a/rdagent/scenarios/kaggle/experiment/forest-cover-type-prediction_template/model/model_nn.py +++ /dev/null @@ -1,78 +0,0 @@ -import numpy as np -import pandas as pd -import torch -import torch.nn as nn -import torch.optim as optim -from torch.utils.data import DataLoader, TensorDataset - - -# Define the neural network model with Batch Normalization -class NeuralNetwork(nn.Module): - def __init__(self, input_size, num_classes): - super(NeuralNetwork, self).__init__() - self.layer1 = nn.Linear(input_size, 128) - self.bn1 = nn.BatchNorm1d(128) - self.layer2 = nn.Linear(128, 64) - self.bn2 = nn.BatchNorm1d(64) - self.layer3 = nn.Linear(64, num_classes) - - def forward(self, x): - x = torch.relu(self.bn1(self.layer1(x))) - x = torch.relu(self.bn2(self.layer2(x))) - x = torch.softmax(self.layer3(x), dim=1) - return x - - -def fit(X_train: pd.DataFrame, y_train: pd.DataFrame, X_valid: pd.DataFrame, y_valid: pd.DataFrame): - # Convert data to PyTorch tensors - X_train_tensor = torch.tensor(X_train.values, dtype=torch.float32) - y_train_tensor = torch.tensor(y_train.values, dtype=torch.long) - X_valid_tensor = torch.tensor(X_valid.values, dtype=torch.float32) - y_valid_tensor = torch.tensor(y_valid.values, dtype=torch.long) - - # Create datasets and dataloaders - train_dataset = TensorDataset(X_train_tensor, y_train_tensor) - valid_dataset = TensorDataset(X_valid_tensor, y_valid_tensor) - train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True) - valid_loader = DataLoader(valid_dataset, batch_size=32, shuffle=False) - - # Initialize the model, loss function and optimizer - model = 
NeuralNetwork(input_size=X_train.shape[1], num_classes=len(set(y_train))) - criterion = nn.CrossEntropyLoss() - optimizer = optim.Adam(model.parameters(), lr=0.001) - - # Train the model - num_epochs = 150 - for epoch in range(num_epochs): - model.train() - for X_batch, y_batch in train_loader: - optimizer.zero_grad() - outputs = model(X_batch) - loss = criterion(outputs, y_batch) - loss.backward() - optimizer.step() - - # Validate the model - model.eval() - valid_loss = 0 - correct = 0 - with torch.no_grad(): - for X_batch, y_batch in valid_loader: - outputs = model(X_batch) - valid_loss += criterion(outputs, y_batch).item() - _, predicted = torch.max(outputs, 1) - correct += (predicted == y_batch).sum().item() - - accuracy = correct / len(valid_loader.dataset) - print(f"Epoch {epoch+1}/{num_epochs}, Validation Accuracy: {accuracy:.4f}") - - return model - - -def predict(model, X): - X_tensor = torch.tensor(X.values, dtype=torch.float32) - model.eval() - with torch.no_grad(): - outputs = model(X_tensor) - _, predicted = torch.max(outputs, 1) - return predicted.numpy().reshape(-1, 1) diff --git a/rdagent/scenarios/kaggle/experiment/forest-cover-type-prediction_template/model/select_lightgbm.py b/rdagent/scenarios/kaggle/experiment/forest-cover-type-prediction_template/model/select_lightgbm.py new file mode 100644 index 00000000..f230f130 --- /dev/null +++ b/rdagent/scenarios/kaggle/experiment/forest-cover-type-prediction_template/model/select_lightgbm.py @@ -0,0 +1,12 @@ +import pandas as pd + + +def select(X: pd.DataFrame) -> pd.DataFrame: + """ + Select relevant features. To be used in fit & predict function. + """ + # For now, we assume all features are relevant. This can be expanded to feature selection logic. 
+ if X.columns.nlevels == 1: + return X + X.columns = ["_".join(str(i) for i in col).strip() for col in X.columns.values] + return X diff --git a/rdagent/scenarios/kaggle/experiment/forest-cover-type-prediction_template/train.py b/rdagent/scenarios/kaggle/experiment/forest-cover-type-prediction_template/train.py index 1bacd3d0..619cb2e2 100644 --- a/rdagent/scenarios/kaggle/experiment/forest-cover-type-prediction_template/train.py +++ b/rdagent/scenarios/kaggle/experiment/forest-cover-type-prediction_template/train.py @@ -76,12 +76,12 @@ def import_module_from_path(module_name, module_path): metrics_all.append(accuracy) # 5) Save the validation accuracy -min_index = np.argmax(metrics_all) -pd.Series(data=[metrics_all[min_index]], index=["multi-class accuracy"]).to_csv("submission_score.csv") +max_index = np.argmax(metrics_all) +pd.Series(data=[metrics_all[max_index]], index=["multi-class accuracy"]).to_csv("submission_score.csv") # 6) Make predictions on the test set and save them -X_test_selected = model_l[min_index][2].select(X_test.copy()) -y_test_pred = model_l[min_index][1](model_l[min_index][0], X_test_selected).flatten() + 1 +X_test_selected = model_l[max_index][2].select(X_test.copy()) +y_test_pred = model_l[max_index][1](model_l[max_index][0], X_test_selected).flatten() + 1 # 7) Submit predictions for the test set diff --git a/rdagent/scenarios/kaggle/experiment/optiver-realized-volatility-prediction_template/model/select_lightgbm.py b/rdagent/scenarios/kaggle/experiment/optiver-realized-volatility-prediction_template/model/select_lightgbm.py new file mode 100644 index 00000000..f230f130 --- /dev/null +++ b/rdagent/scenarios/kaggle/experiment/optiver-realized-volatility-prediction_template/model/select_lightgbm.py @@ -0,0 +1,12 @@ +import pandas as pd + + +def select(X: pd.DataFrame) -> pd.DataFrame: + """ + Select relevant features. To be used in fit & predict function. + """ + # For now, we assume all features are relevant. 
This can be expanded to feature selection logic. + if X.columns.nlevels == 1: + return X + X.columns = ["_".join(str(i) for i in col).strip() for col in X.columns.values] + return X diff --git a/rdagent/scenarios/kaggle/experiment/optiver-realized-volatility-prediction_template/model/select_nn.py b/rdagent/scenarios/kaggle/experiment/optiver-realized-volatility-prediction_template/model/select_nn.py new file mode 100644 index 00000000..f230f130 --- /dev/null +++ b/rdagent/scenarios/kaggle/experiment/optiver-realized-volatility-prediction_template/model/select_nn.py @@ -0,0 +1,12 @@ +import pandas as pd + + +def select(X: pd.DataFrame) -> pd.DataFrame: + """ + Select relevant features. To be used in fit & predict function. + """ + # For now, we assume all features are relevant. This can be expanded to feature selection logic. + if X.columns.nlevels == 1: + return X + X.columns = ["_".join(str(i) for i in col).strip() for col in X.columns.values] + return X diff --git a/rdagent/scenarios/kaggle/experiment/playground-series-s3e11_template/model/select_lightgbm.py b/rdagent/scenarios/kaggle/experiment/playground-series-s3e11_template/model/select_lightgbm.py new file mode 100644 index 00000000..f230f130 --- /dev/null +++ b/rdagent/scenarios/kaggle/experiment/playground-series-s3e11_template/model/select_lightgbm.py @@ -0,0 +1,12 @@ +import pandas as pd + + +def select(X: pd.DataFrame) -> pd.DataFrame: + """ + Select relevant features. To be used in fit & predict function. + """ + # For now, we assume all features are relevant. This can be expanded to feature selection logic. 
+ if X.columns.nlevels == 1: + return X + X.columns = ["_".join(str(i) for i in col).strip() for col in X.columns.values] + return X diff --git a/rdagent/scenarios/kaggle/experiment/playground-series-s3e11_template/model/select_nn.py b/rdagent/scenarios/kaggle/experiment/playground-series-s3e11_template/model/select_nn.py new file mode 100644 index 00000000..f230f130 --- /dev/null +++ b/rdagent/scenarios/kaggle/experiment/playground-series-s3e11_template/model/select_nn.py @@ -0,0 +1,12 @@ +import pandas as pd + + +def select(X: pd.DataFrame) -> pd.DataFrame: + """ + Select relevant features. To be used in fit & predict function. + """ + # For now, we assume all features are relevant. This can be expanded to feature selection logic. + if X.columns.nlevels == 1: + return X + X.columns = ["_".join(str(i) for i in col).strip() for col in X.columns.values] + return X diff --git a/rdagent/scenarios/kaggle/experiment/playground-series-s3e14_template/fea_share_preprocess.py b/rdagent/scenarios/kaggle/experiment/playground-series-s3e14_template/fea_share_preprocess.py new file mode 100644 index 00000000..6b615386 --- /dev/null +++ b/rdagent/scenarios/kaggle/experiment/playground-series-s3e14_template/fea_share_preprocess.py @@ -0,0 +1,38 @@ +import os + +import numpy as np # linear algebra +import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv) +from sklearn.model_selection import train_test_split + + +def preprocess_script(): + """ + This method applies the preprocessing steps to the training, validation, and test datasets. 
+ """ + if os.path.exists("/kaggle/input/X_train.pkl"): + X_train = pd.read_pickle("/kaggle/input/X_train.pkl") + X_valid = pd.read_pickle("/kaggle/input/X_valid.pkl") + y_train = pd.read_pickle("/kaggle/input/y_train.pkl") + y_valid = pd.read_pickle("/kaggle/input/y_valid.pkl") + X_test = pd.read_pickle("/kaggle/input/X_test.pkl") + others = pd.read_pickle("/kaggle/input/others.pkl") + y_train = pd.Series(y_train).reset_index(drop=True) + y_valid = pd.Series(y_valid).reset_index(drop=True) + + return X_train, X_valid, y_train, y_valid, X_test, *others + + # train + train = pd.read_csv("/kaggle/input/train.csv") + X_train, X_valid, y_train, y_valid = train_test_split( + train.drop(["yield", "id"], axis=1), train["yield"], test_size=0.2, random_state=2023 + ) + y_train = pd.Series(y_train).reset_index(drop=True) + y_valid = pd.Series(y_valid).reset_index(drop=True) + + # test + test = pd.read_csv("/kaggle/input/test.csv") + + ids = test["id"] + X_test = test.drop(["id"], axis=1) + + return X_train, X_valid, y_train, y_valid, X_test, ids diff --git a/rdagent/scenarios/kaggle/experiment/playground-series-s3e14_template/feature/feature.py b/rdagent/scenarios/kaggle/experiment/playground-series-s3e14_template/feature/feature.py new file mode 100644 index 00000000..8ae043ac --- /dev/null +++ b/rdagent/scenarios/kaggle/experiment/playground-series-s3e14_template/feature/feature.py @@ -0,0 +1,23 @@ +import pandas as pd + +""" +Here is the feature engineering code for each task, with a class that has a fit and transform method. +Remember +""" + + +class IdentityFeature: + def fit(self, train_df: pd.DataFrame): + """ + Fit the feature engineering model to the training data. + """ + pass + + def transform(self, X: pd.DataFrame): + """ + Transform the input data. 
+ """ + return X + + +feature_engineering_cls = IdentityFeature diff --git a/rdagent/scenarios/kaggle/experiment/playground-series-s3e14_template/model/model_randomforest.py b/rdagent/scenarios/kaggle/experiment/playground-series-s3e14_template/model/model_randomforest.py new file mode 100644 index 00000000..82b6712a --- /dev/null +++ b/rdagent/scenarios/kaggle/experiment/playground-series-s3e14_template/model/model_randomforest.py @@ -0,0 +1,27 @@ +import pandas as pd +from sklearn.ensemble import RandomForestRegressor + + +def fit(X_train: pd.DataFrame, y_train: pd.DataFrame, X_valid: pd.DataFrame, y_valid: pd.DataFrame): + """Define and train the Random Forest model. Merge feature_select""" + rf_params = { + "n_estimators": 100, + "max_depth": 10, + "min_samples_split": 2, + "min_samples_leaf": 1, + "max_features": "sqrt", + "random_state": 2023, + "n_jobs": -1, + "verbose": 1, + } + model = RandomForestRegressor(**rf_params) + model.fit(X_train, y_train) + return model + + +def predict(model, X_test): + """ + Keep feature select's consistency. + """ + y_pred = model.predict(X_test) + return y_pred.reshape(-1, 1) diff --git a/rdagent/scenarios/kaggle/experiment/playground-series-s3e14_template/model/model_xgboost.py b/rdagent/scenarios/kaggle/experiment/playground-series-s3e14_template/model/model_xgboost.py new file mode 100644 index 00000000..16cb7c34 --- /dev/null +++ b/rdagent/scenarios/kaggle/experiment/playground-series-s3e14_template/model/model_xgboost.py @@ -0,0 +1,34 @@ +""" +motivation of the model +""" + +import pandas as pd +import xgboost as xgb + + +def fit(X_train: pd.DataFrame, y_train: pd.DataFrame, X_valid: pd.DataFrame, y_valid: pd.DataFrame): + """Define and train the model. 
Merge feature_select""" + xgb_params = { + "n_estimators": 280, + "learning_rate": 0.05, + "max_depth": 10, + "subsample": 1.0, + "colsample_bytree": 1.0, + "tree_method": "hist", + "enable_categorical": True, + "verbosity": 1, + "min_child_weight": 3, + "base_score": 4.6, + "random_state": 2023, + } + model = xgb.XGBRegressor(**xgb_params) + model.fit(X_train, y_train) + return model + + +def predict(model, X_test): + """ + Keep feature select's consistency. + """ + y_pred = model.predict(X_test) + return y_pred.reshape(-1, 1) diff --git a/rdagent/scenarios/kaggle/experiment/playground-series-s3e14_template/model/select_lightgbm.py b/rdagent/scenarios/kaggle/experiment/playground-series-s3e14_template/model/select_lightgbm.py new file mode 100644 index 00000000..f230f130 --- /dev/null +++ b/rdagent/scenarios/kaggle/experiment/playground-series-s3e14_template/model/select_lightgbm.py @@ -0,0 +1,12 @@ +import pandas as pd + + +def select(X: pd.DataFrame) -> pd.DataFrame: + """ + Select relevant features. To be used in fit & predict function. + """ + # For now, we assume all features are relevant. This can be expanded to feature selection logic. + if X.columns.nlevels == 1: + return X + X.columns = ["_".join(str(i) for i in col).strip() for col in X.columns.values] + return X diff --git a/rdagent/scenarios/kaggle/experiment/playground-series-s3e14_template/model/select_nn.py b/rdagent/scenarios/kaggle/experiment/playground-series-s3e14_template/model/select_nn.py new file mode 100644 index 00000000..f230f130 --- /dev/null +++ b/rdagent/scenarios/kaggle/experiment/playground-series-s3e14_template/model/select_nn.py @@ -0,0 +1,12 @@ +import pandas as pd + + +def select(X: pd.DataFrame) -> pd.DataFrame: + """ + Select relevant features. To be used in fit & predict function. + """ + # For now, we assume all features are relevant. This can be expanded to feature selection logic. 
+ if X.columns.nlevels == 1: + return X + X.columns = ["_".join(str(i) for i in col).strip() for col in X.columns.values] + return X diff --git a/rdagent/scenarios/kaggle/experiment/playground-series-s3e14_template/model/select_randomforest.py b/rdagent/scenarios/kaggle/experiment/playground-series-s3e14_template/model/select_randomforest.py new file mode 100644 index 00000000..f230f130 --- /dev/null +++ b/rdagent/scenarios/kaggle/experiment/playground-series-s3e14_template/model/select_randomforest.py @@ -0,0 +1,12 @@ +import pandas as pd + + +def select(X: pd.DataFrame) -> pd.DataFrame: + """ + Select relevant features. To be used in fit & predict function. + """ + # For now, we assume all features are relevant. This can be expanded to feature selection logic. + if X.columns.nlevels == 1: + return X + X.columns = ["_".join(str(i) for i in col).strip() for col in X.columns.values] + return X diff --git a/rdagent/scenarios/kaggle/experiment/playground-series-s3e14_template/model/select_xgboost.py b/rdagent/scenarios/kaggle/experiment/playground-series-s3e14_template/model/select_xgboost.py new file mode 100644 index 00000000..f230f130 --- /dev/null +++ b/rdagent/scenarios/kaggle/experiment/playground-series-s3e14_template/model/select_xgboost.py @@ -0,0 +1,12 @@ +import pandas as pd + + +def select(X: pd.DataFrame) -> pd.DataFrame: + """ + Select relevant features. To be used in fit & predict function. + """ + # For now, we assume all features are relevant. This can be expanded to feature selection logic. 
+ if X.columns.nlevels == 1: + return X + X.columns = ["_".join(str(i) for i in col).strip() for col in X.columns.values] + return X diff --git a/rdagent/scenarios/kaggle/experiment/playground-series-s3e14_template/train.py b/rdagent/scenarios/kaggle/experiment/playground-series-s3e14_template/train.py new file mode 100644 index 00000000..802f2cc2 --- /dev/null +++ b/rdagent/scenarios/kaggle/experiment/playground-series-s3e14_template/train.py @@ -0,0 +1,76 @@ +import importlib.util +from pathlib import Path + +import numpy as np +import pandas as pd +from fea_share_preprocess import preprocess_script +from sklearn.metrics import mean_absolute_error + +DIRNAME = Path(__file__).absolute().resolve().parent + + +def import_module_from_path(module_name, module_path): + spec = importlib.util.spec_from_file_location(module_name, module_path) + module = importlib.util.module_from_spec(spec) + spec.loader.exec_module(module) + return module + + +# 1) Preprocess the data +X_train, X_valid, y_train, y_valid, X_test, ids = preprocess_script() + +# 2) Auto feature engineering +X_train_l, X_valid_l = [], [] +X_test_l = [] + +for f in DIRNAME.glob("feature/feat*.py"): + cls = import_module_from_path(f.stem, f).feature_engineering_cls() + cls.fit(X_train) + X_train_f = cls.transform(X_train) + X_valid_f = cls.transform(X_valid) + X_test_f = cls.transform(X_test) + + if X_train_f.shape[-1] == X_valid_f.shape[-1] and X_train_f.shape[-1] == X_test_f.shape[-1]: + X_train_l.append(X_train_f) + X_valid_l.append(X_valid_f) + X_test_l.append(X_test_f) + +if len(X_train_l) > 1: + X_train = pd.concat(X_train_l, axis=1, keys=[f"feature_{i}" for i in range(len(X_train_l))]) + X_valid = pd.concat(X_valid_l, axis=1, keys=[f"feature_{i}" for i in range(len(X_valid_l))]) + X_test = pd.concat(X_test_l, axis=1, keys=[f"feature_{i}" for i in range(len(X_test_l))]) + + +# 3) Train the model +model_l = [] # list[tuple[model, predict_func]] +for f in DIRNAME.glob("model/model*.py"): + 
select_python_path = f.with_name(f.stem.replace("model", "select") + f.suffix) + select_m = import_module_from_path(select_python_path.stem, select_python_path) + X_train_selected = select_m.select(X_train.copy()) + X_valid_selected = select_m.select(X_valid.copy()) + + m = import_module_from_path(f.stem, f) + model_l.append((m.fit(X_train_selected, y_train, X_valid_selected, y_valid), m.predict, select_m)) + +# 4) Evaluate the model on the validation set +metrics_all = [] +for model, predict_func, select_m in model_l: + X_valid_selected = select_m.select(X_valid.copy()) + y_valid_pred = predict_func(model, X_valid_selected) + metrics = mean_absolute_error(y_valid, y_valid_pred) + print(f"MAE on valid set: {metrics}") + metrics_all.append(metrics) + +# 5) Save the validation accuracy +min_index = np.argmin(metrics_all) +pd.Series(data=[metrics_all[min_index]], index=["MAE"]).to_csv("submission_score.csv") + +# 6) Make predictions on the test set and save them +X_test_selected = model_l[min_index][2].select(X_test.copy()) +y_test_pred = model_l[min_index][1](model_l[min_index][0], X_test_selected) + +# 7) Submit predictions for the test set +submission_result = pd.DataFrame(y_test_pred, columns=["yield"]) +submission_result.insert(0, "id", ids) + +submission_result.to_csv("submission.csv", index=False) diff --git a/rdagent/scenarios/kaggle/experiment/playground-series-s3e16_template/fea_share_preprocess.py b/rdagent/scenarios/kaggle/experiment/playground-series-s3e16_template/fea_share_preprocess.py new file mode 100644 index 00000000..90e3b8ed --- /dev/null +++ b/rdagent/scenarios/kaggle/experiment/playground-series-s3e16_template/fea_share_preprocess.py @@ -0,0 +1,45 @@ +import os + +import numpy as np # linear algebra +import pandas as pd # data processing, CSV file I/O (e.g. 
pd.read_csv) +from sklearn.model_selection import train_test_split +from sklearn.preprocessing import LabelEncoder + + +def preprocess_script(): + """ + This method applies the preprocessing steps to the training, validation, and test datasets. + """ + if os.path.exists("/kaggle/input/X_train.pkl"): + X_train = pd.read_pickle("/kaggle/input/X_train.pkl") + X_valid = pd.read_pickle("/kaggle/input/X_valid.pkl") + y_train = pd.read_pickle("/kaggle/input/y_train.pkl") + y_valid = pd.read_pickle("/kaggle/input/y_valid.pkl") + X_test = pd.read_pickle("/kaggle/input/X_test.pkl") + others = pd.read_pickle("/kaggle/input/others.pkl") + y_train = pd.Series(y_train).reset_index(drop=True) + y_valid = pd.Series(y_valid).reset_index(drop=True) + + return X_train, X_valid, y_train, y_valid, X_test, *others + + # train + train = pd.read_csv("/kaggle/input/train.csv") + + le = LabelEncoder() + train["Sex"] = le.fit_transform(train["Sex"]) + + X_train, X_valid, y_train, y_valid = train_test_split( + train.drop(["Age", "id"], axis=1), train["Age"], test_size=0.2, random_state=2023 + ) + y_train = pd.Series(y_train).reset_index(drop=True) + y_valid = pd.Series(y_valid).reset_index(drop=True) + + # test + test = pd.read_csv("/kaggle/input/test.csv") + + test["Sex"] = le.transform(test["Sex"]) + ids = test["id"] + + X_test = test.drop(["id"], axis=1) + + return X_train, X_valid, y_train, y_valid, X_test, ids diff --git a/rdagent/scenarios/kaggle/experiment/playground-series-s3e16_template/feature/feature.py b/rdagent/scenarios/kaggle/experiment/playground-series-s3e16_template/feature/feature.py new file mode 100644 index 00000000..8ae043ac --- /dev/null +++ b/rdagent/scenarios/kaggle/experiment/playground-series-s3e16_template/feature/feature.py @@ -0,0 +1,23 @@ +import pandas as pd + +""" +Here is the feature engineering code for each task, with a class that has a fit and transform method. 
+Remember +""" + + +class IdentityFeature: + def fit(self, train_df: pd.DataFrame): + """ + Fit the feature engineering model to the training data. + """ + pass + + def transform(self, X: pd.DataFrame): + """ + Transform the input data. + """ + return X + + +feature_engineering_cls = IdentityFeature diff --git a/rdagent/scenarios/kaggle/experiment/playground-series-s3e16_template/model/model_randomforest.py b/rdagent/scenarios/kaggle/experiment/playground-series-s3e16_template/model/model_randomforest.py new file mode 100644 index 00000000..82b6712a --- /dev/null +++ b/rdagent/scenarios/kaggle/experiment/playground-series-s3e16_template/model/model_randomforest.py @@ -0,0 +1,27 @@ +import pandas as pd +from sklearn.ensemble import RandomForestRegressor + + +def fit(X_train: pd.DataFrame, y_train: pd.DataFrame, X_valid: pd.DataFrame, y_valid: pd.DataFrame): + """Define and train the Random Forest model. Merge feature_select""" + rf_params = { + "n_estimators": 100, + "max_depth": 10, + "min_samples_split": 2, + "min_samples_leaf": 1, + "max_features": "sqrt", + "random_state": 2023, + "n_jobs": -1, + "verbose": 1, + } + model = RandomForestRegressor(**rf_params) + model.fit(X_train, y_train) + return model + + +def predict(model, X_test): + """ + Keep feature select's consistency. + """ + y_pred = model.predict(X_test) + return y_pred.reshape(-1, 1) diff --git a/rdagent/scenarios/kaggle/experiment/playground-series-s3e16_template/model/model_xgboost.py b/rdagent/scenarios/kaggle/experiment/playground-series-s3e16_template/model/model_xgboost.py new file mode 100644 index 00000000..16cb7c34 --- /dev/null +++ b/rdagent/scenarios/kaggle/experiment/playground-series-s3e16_template/model/model_xgboost.py @@ -0,0 +1,34 @@ +""" +motivation of the model +""" + +import pandas as pd +import xgboost as xgb + + +def fit(X_train: pd.DataFrame, y_train: pd.DataFrame, X_valid: pd.DataFrame, y_valid: pd.DataFrame): + """Define and train the model. 
Merge feature_select""" + xgb_params = { + "n_estimators": 280, + "learning_rate": 0.05, + "max_depth": 10, + "subsample": 1.0, + "colsample_bytree": 1.0, + "tree_method": "hist", + "enable_categorical": True, + "verbosity": 1, + "min_child_weight": 3, + "base_score": 4.6, + "random_state": 2023, + } + model = xgb.XGBRegressor(**xgb_params) + model.fit(X_train, y_train) + return model + + +def predict(model, X_test): + """ + Keep feature select's consistency. + """ + y_pred = model.predict(X_test) + return y_pred.reshape(-1, 1) diff --git a/rdagent/scenarios/kaggle/experiment/playground-series-s3e16_template/model/select_lightgbm.py b/rdagent/scenarios/kaggle/experiment/playground-series-s3e16_template/model/select_lightgbm.py new file mode 100644 index 00000000..f230f130 --- /dev/null +++ b/rdagent/scenarios/kaggle/experiment/playground-series-s3e16_template/model/select_lightgbm.py @@ -0,0 +1,12 @@ +import pandas as pd + + +def select(X: pd.DataFrame) -> pd.DataFrame: + """ + Select relevant features. To be used in fit & predict function. + """ + # For now, we assume all features are relevant. This can be expanded to feature selection logic. + if X.columns.nlevels == 1: + return X + X.columns = ["_".join(str(i) for i in col).strip() for col in X.columns.values] + return X diff --git a/rdagent/scenarios/kaggle/experiment/playground-series-s3e16_template/model/select_nn.py b/rdagent/scenarios/kaggle/experiment/playground-series-s3e16_template/model/select_nn.py new file mode 100644 index 00000000..f230f130 --- /dev/null +++ b/rdagent/scenarios/kaggle/experiment/playground-series-s3e16_template/model/select_nn.py @@ -0,0 +1,12 @@ +import pandas as pd + + +def select(X: pd.DataFrame) -> pd.DataFrame: + """ + Select relevant features. To be used in fit & predict function. + """ + # For now, we assume all features are relevant. This can be expanded to feature selection logic. 
+ if X.columns.nlevels == 1: + return X + X.columns = ["_".join(str(i) for i in col).strip() for col in X.columns.values] + return X diff --git a/rdagent/scenarios/kaggle/experiment/playground-series-s3e16_template/model/select_randomforest.py b/rdagent/scenarios/kaggle/experiment/playground-series-s3e16_template/model/select_randomforest.py new file mode 100644 index 00000000..f230f130 --- /dev/null +++ b/rdagent/scenarios/kaggle/experiment/playground-series-s3e16_template/model/select_randomforest.py @@ -0,0 +1,12 @@ +import pandas as pd + + +def select(X: pd.DataFrame) -> pd.DataFrame: + """ + Select relevant features. To be used in fit & predict function. + """ + # For now, we assume all features are relevant. This can be expanded to feature selection logic. + if X.columns.nlevels == 1: + return X + X.columns = ["_".join(str(i) for i in col).strip() for col in X.columns.values] + return X diff --git a/rdagent/scenarios/kaggle/experiment/playground-series-s3e16_template/model/select_xgboost.py b/rdagent/scenarios/kaggle/experiment/playground-series-s3e16_template/model/select_xgboost.py new file mode 100644 index 00000000..f230f130 --- /dev/null +++ b/rdagent/scenarios/kaggle/experiment/playground-series-s3e16_template/model/select_xgboost.py @@ -0,0 +1,12 @@ +import pandas as pd + + +def select(X: pd.DataFrame) -> pd.DataFrame: + """ + Select relevant features. To be used in fit & predict function. + """ + # For now, we assume all features are relevant. This can be expanded to feature selection logic. 
+ if X.columns.nlevels == 1: + return X + X.columns = ["_".join(str(i) for i in col).strip() for col in X.columns.values] + return X diff --git a/rdagent/scenarios/kaggle/experiment/playground-series-s3e16_template/train.py b/rdagent/scenarios/kaggle/experiment/playground-series-s3e16_template/train.py new file mode 100644 index 00000000..e04091ee --- /dev/null +++ b/rdagent/scenarios/kaggle/experiment/playground-series-s3e16_template/train.py @@ -0,0 +1,76 @@ +import importlib.util +from pathlib import Path + +import numpy as np +import pandas as pd +from fea_share_preprocess import preprocess_script +from sklearn.metrics import mean_absolute_error + +DIRNAME = Path(__file__).absolute().resolve().parent + + +def import_module_from_path(module_name, module_path): + spec = importlib.util.spec_from_file_location(module_name, module_path) + module = importlib.util.module_from_spec(spec) + spec.loader.exec_module(module) + return module + + +# 1) Preprocess the data +X_train, X_valid, y_train, y_valid, X_test, ids = preprocess_script() + +# 2) Auto feature engineering +X_train_l, X_valid_l = [], [] +X_test_l = [] + +for f in DIRNAME.glob("feature/feat*.py"): + cls = import_module_from_path(f.stem, f).feature_engineering_cls() + cls.fit(X_train) + X_train_f = cls.transform(X_train) + X_valid_f = cls.transform(X_valid) + X_test_f = cls.transform(X_test) + + if X_train_f.shape[-1] == X_valid_f.shape[-1] and X_train_f.shape[-1] == X_test_f.shape[-1]: + X_train_l.append(X_train_f) + X_valid_l.append(X_valid_f) + X_test_l.append(X_test_f) + +if len(X_train_l) > 1: + X_train = pd.concat(X_train_l, axis=1, keys=[f"feature_{i}" for i in range(len(X_train_l))]) + X_valid = pd.concat(X_valid_l, axis=1, keys=[f"feature_{i}" for i in range(len(X_valid_l))]) + X_test = pd.concat(X_test_l, axis=1, keys=[f"feature_{i}" for i in range(len(X_test_l))]) + + +# 3) Train the model +model_l = [] # list[tuple[model, predict_func]] +for f in DIRNAME.glob("model/model*.py"): + 
select_python_path = f.with_name(f.stem.replace("model", "select") + f.suffix) + select_m = import_module_from_path(select_python_path.stem, select_python_path) + X_train_selected = select_m.select(X_train.copy()) + X_valid_selected = select_m.select(X_valid.copy()) + + m = import_module_from_path(f.stem, f) + model_l.append((m.fit(X_train_selected, y_train, X_valid_selected, y_valid), m.predict, select_m)) + +# 4) Evaluate the model on the validation set +metrics_all = [] +for model, predict_func, select_m in model_l: + X_valid_selected = select_m.select(X_valid.copy()) + y_valid_pred = predict_func(model, X_valid_selected) + metrics = mean_absolute_error(y_valid, y_valid_pred) + print(f"MAE on valid set: {metrics}") + metrics_all.append(metrics) + +# 5) Save the validation accuracy +min_index = np.argmin(metrics_all) +pd.Series(data=[metrics_all[min_index]], index=["MAE"]).to_csv("submission_score.csv") + +# 6) Make predictions on the test set and save them +X_test_selected = model_l[min_index][2].select(X_test.copy()) +y_test_pred = model_l[min_index][1](model_l[min_index][0], X_test_selected) + +# 7) Submit predictions for the test set +submission_result = pd.DataFrame(np.round(y_test_pred).astype(int), columns=["Age"]) +submission_result.insert(0, "id", ids) + +submission_result.to_csv("submission.csv", index=False) diff --git a/rdagent/scenarios/kaggle/experiment/playground-series-s3e26_template/model/select_lightgbm.py b/rdagent/scenarios/kaggle/experiment/playground-series-s3e26_template/model/select_lightgbm.py new file mode 100644 index 00000000..f230f130 --- /dev/null +++ b/rdagent/scenarios/kaggle/experiment/playground-series-s3e26_template/model/select_lightgbm.py @@ -0,0 +1,12 @@ +import pandas as pd + + +def select(X: pd.DataFrame) -> pd.DataFrame: + """ + Select relevant features. To be used in fit & predict function. + """ + # For now, we assume all features are relevant. This can be expanded to feature selection logic. 
+ if X.columns.nlevels == 1: + return X + X.columns = ["_".join(str(i) for i in col).strip() for col in X.columns.values] + return X diff --git a/rdagent/scenarios/kaggle/experiment/playground-series-s3e26_template/model/select_nn.py b/rdagent/scenarios/kaggle/experiment/playground-series-s3e26_template/model/select_nn.py new file mode 100644 index 00000000..f230f130 --- /dev/null +++ b/rdagent/scenarios/kaggle/experiment/playground-series-s3e26_template/model/select_nn.py @@ -0,0 +1,12 @@ +import pandas as pd + + +def select(X: pd.DataFrame) -> pd.DataFrame: + """ + Select relevant features. To be used in fit & predict function. + """ + # For now, we assume all features are relevant. This can be expanded to feature selection logic. + if X.columns.nlevels == 1: + return X + X.columns = ["_".join(str(i) for i in col).strip() for col in X.columns.values] + return X diff --git a/rdagent/scenarios/kaggle/experiment/playground-series-s4e5_template/fea_share_preprocess.py b/rdagent/scenarios/kaggle/experiment/playground-series-s4e5_template/fea_share_preprocess.py new file mode 100644 index 00000000..8987d00e --- /dev/null +++ b/rdagent/scenarios/kaggle/experiment/playground-series-s4e5_template/fea_share_preprocess.py @@ -0,0 +1,70 @@ +import os + +import pandas as pd +from sklearn.compose import ColumnTransformer +from sklearn.impute import SimpleImputer +from sklearn.model_selection import train_test_split +from sklearn.pipeline import Pipeline + + +def prepreprocess(): + data_df = pd.read_csv("/kaggle/input/train.csv") + data_df = data_df.drop(["id"], axis=1) + + X = data_df.drop(["FloodProbability"], axis=1) + y = data_df["FloodProbability"] + + X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.10, random_state=42) + + return X_train, X_valid, y_train, y_valid + + +def preprocess_fit(X_train: pd.DataFrame): + numerical_cols = [cname for cname in X_train.columns if X_train[cname].dtype in ["int64", "float64"]] + + numerical_transformer = 
Pipeline(steps=[("imputer", SimpleImputer(strategy="mean"))]) + + preprocessor = ColumnTransformer( + transformers=[ + ("num", numerical_transformer, numerical_cols), + ] + ) + + preprocessor.fit(X_train) + + return preprocessor, numerical_cols + + +def preprocess_transform(X: pd.DataFrame, preprocessor, numerical_cols): + X_transformed = preprocessor.transform(X) + + # Convert arrays back to DataFrames + X_transformed = pd.DataFrame(X_transformed, columns=numerical_cols, index=X.index) + + return X_transformed + + +def preprocess_script(): + if os.path.exists("/kaggle/input/X_train.pkl"): + X_train = pd.read_pickle("/kaggle/input/X_train.pkl") + X_valid = pd.read_pickle("/kaggle/input/X_valid.pkl") + y_train = pd.read_pickle("/kaggle/input/y_train.pkl") + y_valid = pd.read_pickle("/kaggle/input/y_valid.pkl") + X_test = pd.read_pickle("/kaggle/input/X_test.pkl") + others = pd.read_pickle("/kaggle/input/others.pkl") + + return X_train, X_valid, y_train, y_valid, X_test, *others + + X_train, X_valid, y_train, y_valid = prepreprocess() + + preprocessor, numerical_cols = preprocess_fit(X_train) + + X_train = preprocess_transform(X_train, preprocessor, numerical_cols) + X_valid = preprocess_transform(X_valid, preprocessor, numerical_cols) + + submission_df = pd.read_csv("/kaggle/input/test.csv") + ids = submission_df["id"] + submission_df = submission_df.drop(["id"], axis=1) + X_test = preprocess_transform(submission_df, preprocessor, numerical_cols) + + return X_train, X_valid, y_train, y_valid, X_test, ids diff --git a/rdagent/scenarios/kaggle/experiment/playground-series-s4e5_template/feature/feature.py b/rdagent/scenarios/kaggle/experiment/playground-series-s4e5_template/feature/feature.py new file mode 100644 index 00000000..8ae043ac --- /dev/null +++ b/rdagent/scenarios/kaggle/experiment/playground-series-s4e5_template/feature/feature.py @@ -0,0 +1,23 @@ +import pandas as pd + +""" +Here is the feature engineering code for each task, with a class that has a fit 
and transform method. +Remember +""" + + +class IdentityFeature: + def fit(self, train_df: pd.DataFrame): + """ + Fit the feature engineering model to the training data. + """ + pass + + def transform(self, X: pd.DataFrame): + """ + Transform the input data. + """ + return X + + +feature_engineering_cls = IdentityFeature diff --git a/rdagent/scenarios/kaggle/experiment/playground-series-s4e5_template/model/model_randomforest.py b/rdagent/scenarios/kaggle/experiment/playground-series-s4e5_template/model/model_randomforest.py new file mode 100644 index 00000000..bf1b273d --- /dev/null +++ b/rdagent/scenarios/kaggle/experiment/playground-series-s4e5_template/model/model_randomforest.py @@ -0,0 +1,33 @@ +import numpy as np +import pandas as pd +from sklearn.ensemble import RandomForestRegressor +from sklearn.metrics import mean_squared_error + + +def fit(X_train: pd.DataFrame, y_train: pd.Series, X_valid: pd.DataFrame, y_valid: pd.Series): + """ + Define and train the Random Forest model. Merge feature selection into the pipeline. + """ + # Initialize the Random Forest model + model = RandomForestRegressor(n_estimators=100, random_state=32, n_jobs=-1) + + # Fit the model + model.fit(X_train, y_train) + + # Validate the model + y_valid_pred = model.predict(X_valid) + mse = mean_squared_error(y_valid, y_valid_pred) + rmse = np.sqrt(mse) + print(f"Validation RMSE: {rmse:.4f}") + + return model + + +def predict(model, X): + """ + Keep feature selection's consistency and make predictions. 
+ """ + # Predict using the trained model + y_pred = model.predict(X) + + return y_pred.reshape(-1, 1) diff --git a/rdagent/scenarios/kaggle/experiment/playground-series-s4e5_template/model/model_xgboost.py b/rdagent/scenarios/kaggle/experiment/playground-series-s4e5_template/model/model_xgboost.py new file mode 100644 index 00000000..7e517fb0 --- /dev/null +++ b/rdagent/scenarios/kaggle/experiment/playground-series-s4e5_template/model/model_xgboost.py @@ -0,0 +1,34 @@ +import pandas as pd +import xgboost as xgb + + +def fit(X_train: pd.DataFrame, y_train: pd.DataFrame, X_valid: pd.DataFrame, y_valid: pd.DataFrame): + """Define and train the model. Merge feature_select""" + dtrain = xgb.DMatrix(X_train, label=y_train) + dvalid = xgb.DMatrix(X_valid, label=y_valid) + + # Parameters for regression + params = { + "objective": "reg:squarederror", # Use squared error for regression + "nthread": -1, + "n_estimators": 8000, + "tree_method": "gpu_hist", + "device": "cuda", + "max_depth": 10, + "learning_rate": 0.01, + } + num_round = 5000 + + evallist = [(dtrain, "train"), (dvalid, "eval")] + bst = xgb.train(params, dtrain, num_round, evallist) + + return bst + + +def predict(model, X): + """ + Keep feature select's consistency. + """ + dtest = xgb.DMatrix(X) + y_pred = model.predict(dtest) + return y_pred.reshape(-1, 1) diff --git a/rdagent/scenarios/kaggle/experiment/playground-series-s4e5_template/model/select_lightgbm.py b/rdagent/scenarios/kaggle/experiment/playground-series-s4e5_template/model/select_lightgbm.py new file mode 100644 index 00000000..f230f130 --- /dev/null +++ b/rdagent/scenarios/kaggle/experiment/playground-series-s4e5_template/model/select_lightgbm.py @@ -0,0 +1,12 @@ +import pandas as pd + + +def select(X: pd.DataFrame) -> pd.DataFrame: + """ + Select relevant features. To be used in fit & predict function. + """ + # For now, we assume all features are relevant. This can be expanded to feature selection logic. 
+ if X.columns.nlevels == 1: + return X + X.columns = ["_".join(str(i) for i in col).strip() for col in X.columns.values] + return X diff --git a/rdagent/scenarios/kaggle/experiment/playground-series-s4e5_template/model/select_nn.py b/rdagent/scenarios/kaggle/experiment/playground-series-s4e5_template/model/select_nn.py new file mode 100644 index 00000000..f230f130 --- /dev/null +++ b/rdagent/scenarios/kaggle/experiment/playground-series-s4e5_template/model/select_nn.py @@ -0,0 +1,12 @@ +import pandas as pd + + +def select(X: pd.DataFrame) -> pd.DataFrame: + """ + Select relevant features. To be used in fit & predict function. + """ + # For now, we assume all features are relevant. This can be expanded to feature selection logic. + if X.columns.nlevels == 1: + return X + X.columns = ["_".join(str(i) for i in col).strip() for col in X.columns.values] + return X diff --git a/rdagent/scenarios/kaggle/experiment/playground-series-s4e5_template/model/select_randomforest.py b/rdagent/scenarios/kaggle/experiment/playground-series-s4e5_template/model/select_randomforest.py new file mode 100644 index 00000000..f230f130 --- /dev/null +++ b/rdagent/scenarios/kaggle/experiment/playground-series-s4e5_template/model/select_randomforest.py @@ -0,0 +1,12 @@ +import pandas as pd + + +def select(X: pd.DataFrame) -> pd.DataFrame: + """ + Select relevant features. To be used in fit & predict function. + """ + # For now, we assume all features are relevant. This can be expanded to feature selection logic. 
+ if X.columns.nlevels == 1: + return X + X.columns = ["_".join(str(i) for i in col).strip() for col in X.columns.values] + return X diff --git a/rdagent/scenarios/kaggle/experiment/playground-series-s4e5_template/model/select_xgboost.py b/rdagent/scenarios/kaggle/experiment/playground-series-s4e5_template/model/select_xgboost.py new file mode 100644 index 00000000..f230f130 --- /dev/null +++ b/rdagent/scenarios/kaggle/experiment/playground-series-s4e5_template/model/select_xgboost.py @@ -0,0 +1,12 @@ +import pandas as pd + + +def select(X: pd.DataFrame) -> pd.DataFrame: + """ + Select relevant features. To be used in fit & predict function. + """ + # For now, we assume all features are relevant. This can be expanded to feature selection logic. + if X.columns.nlevels == 1: + return X + X.columns = ["_".join(str(i) for i in col).strip() for col in X.columns.values] + return X diff --git a/rdagent/scenarios/kaggle/experiment/playground-series-s4e5_template/train.py b/rdagent/scenarios/kaggle/experiment/playground-series-s4e5_template/train.py new file mode 100644 index 00000000..2d9a3a95 --- /dev/null +++ b/rdagent/scenarios/kaggle/experiment/playground-series-s4e5_template/train.py @@ -0,0 +1,99 @@ +import importlib.util +import random +from pathlib import Path + +import numpy as np +import pandas as pd +from fea_share_preprocess import preprocess_script +from sklearn.metrics import r2_score + +# Set random seed for reproducibility +SEED = 42 +random.seed(SEED) +np.random.seed(SEED) +DIRNAME = Path(__file__).absolute().resolve().parent + + +def compute_r2(y_true, y_pred): + """Compute R² score for regression.""" + return r2_score(y_true, y_pred) + + +def import_module_from_path(module_name, module_path): + spec = importlib.util.spec_from_file_location(module_name, module_path) + module = importlib.util.module_from_spec(spec) + spec.loader.exec_module(module) + return module + + +# 1) Preprocess the data +X_train, X_valid, y_train, y_valid, X_test, ids = 
preprocess_script() + +# 2) Auto feature engineering +X_train_l, X_valid_l = [], [] +X_test_l = [] + +for f in DIRNAME.glob("feature/feat*.py"): + cls = import_module_from_path(f.stem, f).feature_engineering_cls() + cls.fit(X_train) + X_train_f = cls.transform(X_train) + X_valid_f = cls.transform(X_valid) + X_test_f = cls.transform(X_test) + + if X_train_f.shape[-1] == X_valid_f.shape[-1] and X_train_f.shape[-1] == X_test_f.shape[-1]: + X_train_l.append(X_train_f) + X_valid_l.append(X_valid_f) + X_test_l.append(X_test_f) + +X_train = pd.concat(X_train_l, axis=1, keys=[f"feature_{i}" for i in range(len(X_train_l))]) +X_valid = pd.concat(X_valid_l, axis=1, keys=[f"feature_{i}" for i in range(len(X_valid_l))]) +X_test = pd.concat(X_test_l, axis=1, keys=[f"feature_{i}" for i in range(len(X_test_l))]) + +print(X_train.shape, X_valid.shape, X_test.shape) + +# Handle inf and -inf values +X_train.replace([np.inf, -np.inf], np.nan, inplace=True) +X_valid.replace([np.inf, -np.inf], np.nan, inplace=True) +X_test.replace([np.inf, -np.inf], np.nan, inplace=True) + +from sklearn.impute import SimpleImputer + +imputer = SimpleImputer(strategy="mean") + +X_train = pd.DataFrame(imputer.fit_transform(X_train), columns=X_train.columns) +X_valid = pd.DataFrame(imputer.transform(X_valid), columns=X_valid.columns) +X_test = pd.DataFrame(imputer.transform(X_test), columns=X_test.columns) + + +# 3) Train the model +model_l = [] # list[tuple[model, predict_func,]] +for f in DIRNAME.glob("model/model*.py"): + select_python_path = f.with_name(f.stem.replace("model", "select") + f.suffix) + select_m = import_module_from_path(select_python_path.stem, select_python_path) + X_train_selected = select_m.select(X_train.copy()) + X_valid_selected = select_m.select(X_valid.copy()) + + m = import_module_from_path(f.stem, f) + model_name = f.stem + model_l.append((m.fit(X_train_selected, y_train, X_valid_selected, y_valid), m.predict, select_m, model_name)) + +# 4) Evaluate the model on the validation 
set +metrics_all = [] +for model, predict_func, select_m, model_name in model_l: + X_valid_selected = select_m.select(X_valid.copy()) + y_valid_pred = predict_func(model, X_valid_selected) + r2 = compute_r2(y_valid, y_valid_pred) + print(f"R2 on valid set for {model_name}: {r2}") + metrics_all.append(r2) + +# 5) Save the validation accuracy +max_index = np.argmax(metrics_all) +pd.Series(data=[metrics_all[max_index]], index=["R2"]).to_csv("submission_score.csv") + +# 6) Make predictions on the test set and save them +X_test_selected = model_l[max_index][2].select(X_test.copy()) +y_test_pred = model_l[max_index][1](model_l[max_index][0], X_test_selected).ravel() + +# 7) Submit predictions for the test set +submission_result = pd.DataFrame({"id": ids, "FloodProbability": y_test_pred}) +submission_result.to_csv("submission.csv", index=False) diff --git a/rdagent/scenarios/kaggle/experiment/playground-series-s4e8_template/model/select_lightgbm.py b/rdagent/scenarios/kaggle/experiment/playground-series-s4e8_template/model/select_lightgbm.py new file mode 100644 index 00000000..f230f130 --- /dev/null +++ b/rdagent/scenarios/kaggle/experiment/playground-series-s4e8_template/model/select_lightgbm.py @@ -0,0 +1,12 @@ +import pandas as pd + + +def select(X: pd.DataFrame) -> pd.DataFrame: + """ + Select relevant features. To be used in fit & predict function. + """ + # For now, we assume all features are relevant. This can be expanded to feature selection logic. 
+ if X.columns.nlevels == 1: + return X + X.columns = ["_".join(str(i) for i in col).strip() for col in X.columns.values] + return X diff --git a/rdagent/scenarios/kaggle/experiment/playground-series-s4e8_template/model/select_nn.py b/rdagent/scenarios/kaggle/experiment/playground-series-s4e8_template/model/select_nn.py new file mode 100644 index 00000000..f230f130 --- /dev/null +++ b/rdagent/scenarios/kaggle/experiment/playground-series-s4e8_template/model/select_nn.py @@ -0,0 +1,12 @@ +import pandas as pd + + +def select(X: pd.DataFrame) -> pd.DataFrame: + """ + Select relevant features. To be used in fit & predict function. + """ + # For now, we assume all features are relevant. This can be expanded to feature selection logic. + if X.columns.nlevels == 1: + return X + X.columns = ["_".join(str(i) for i in col).strip() for col in X.columns.values] + return X diff --git a/rdagent/scenarios/kaggle/experiment/playground-series-s4e8_template/train.py b/rdagent/scenarios/kaggle/experiment/playground-series-s4e8_template/train.py index 10c98a31..42d24dc2 100644 --- a/rdagent/scenarios/kaggle/experiment/playground-series-s4e8_template/train.py +++ b/rdagent/scenarios/kaggle/experiment/playground-series-s4e8_template/train.py @@ -88,12 +88,12 @@ def import_module_from_path(module_name, module_path): metrics_all.append(metrics) # 5) Save the validation accuracy -min_index = np.argmax(metrics_all) -pd.Series(data=[metrics_all[min_index]], index=["MCC"]).to_csv("submission_score.csv") +max_index = np.argmax(metrics_all) +pd.Series(data=[metrics_all[max_index]], index=["MCC"]).to_csv("submission_score.csv") # 6) Make predictions on the test set and save them -X_test_selected = model_l[min_index][2].select(X_test.copy()) -y_test_pred = model_l[min_index][1](model_l[min_index][0], X_test_selected) +X_test_selected = model_l[max_index][2].select(X_test.copy()) +y_test_pred = model_l[max_index][1](model_l[max_index][0], X_test_selected) y_test_pred = (y_test_pred > 
0.5).astype(int) y_test_pred_labels = np.where(y_test_pred == 1, "p", "e") # 将整数转换回 'e' 或 'p' diff --git a/rdagent/scenarios/kaggle/experiment/playground-series-s4e9_template/model/select_lightgbm.py b/rdagent/scenarios/kaggle/experiment/playground-series-s4e9_template/model/select_lightgbm.py new file mode 100644 index 00000000..f230f130 --- /dev/null +++ b/rdagent/scenarios/kaggle/experiment/playground-series-s4e9_template/model/select_lightgbm.py @@ -0,0 +1,12 @@ +import pandas as pd + + +def select(X: pd.DataFrame) -> pd.DataFrame: + """ + Select relevant features. To be used in fit & predict function. + """ + # For now, we assume all features are relevant. This can be expanded to feature selection logic. + if X.columns.nlevels == 1: + return X + X.columns = ["_".join(str(i) for i in col).strip() for col in X.columns.values] + return X diff --git a/rdagent/scenarios/kaggle/experiment/playground-series-s4e9_template/model/select_nn.py b/rdagent/scenarios/kaggle/experiment/playground-series-s4e9_template/model/select_nn.py new file mode 100644 index 00000000..f230f130 --- /dev/null +++ b/rdagent/scenarios/kaggle/experiment/playground-series-s4e9_template/model/select_nn.py @@ -0,0 +1,12 @@ +import pandas as pd + + +def select(X: pd.DataFrame) -> pd.DataFrame: + """ + Select relevant features. To be used in fit & predict function. + """ + # For now, we assume all features are relevant. This can be expanded to feature selection logic. 
+ if X.columns.nlevels == 1: + return X + X.columns = ["_".join(str(i) for i in col).strip() for col in X.columns.values] + return X diff --git a/rdagent/scenarios/kaggle/experiment/sf-crime_template/model/select_lightgbm.py b/rdagent/scenarios/kaggle/experiment/sf-crime_template/model/select_lightgbm.py new file mode 100644 index 00000000..f230f130 --- /dev/null +++ b/rdagent/scenarios/kaggle/experiment/sf-crime_template/model/select_lightgbm.py @@ -0,0 +1,12 @@ +import pandas as pd + + +def select(X: pd.DataFrame) -> pd.DataFrame: + """ + Select relevant features. To be used in fit & predict function. + """ + # For now, we assume all features are relevant. This can be expanded to feature selection logic. + if X.columns.nlevels == 1: + return X + X.columns = ["_".join(str(i) for i in col).strip() for col in X.columns.values] + return X diff --git a/rdagent/scenarios/kaggle/experiment/sf-crime_template/model/select_nn.py b/rdagent/scenarios/kaggle/experiment/sf-crime_template/model/select_nn.py new file mode 100644 index 00000000..f230f130 --- /dev/null +++ b/rdagent/scenarios/kaggle/experiment/sf-crime_template/model/select_nn.py @@ -0,0 +1,12 @@ +import pandas as pd + + +def select(X: pd.DataFrame) -> pd.DataFrame: + """ + Select relevant features. To be used in fit & predict function. + """ + # For now, we assume all features are relevant. This can be expanded to feature selection logic. 
+ if X.columns.nlevels == 1: + return X + X.columns = ["_".join(str(i) for i in col).strip() for col in X.columns.values] + return X diff --git a/rdagent/scenarios/kaggle/experiment/spaceship-titanic_template/model/model_nn.py b/rdagent/scenarios/kaggle/experiment/spaceship-titanic_template/model/model_nn.py deleted file mode 100644 index 138881c8..00000000 --- a/rdagent/scenarios/kaggle/experiment/spaceship-titanic_template/model/model_nn.py +++ /dev/null @@ -1,78 +0,0 @@ -import numpy as np -import torch -import torch.nn as nn -import torch.nn.functional as F -from torch.utils.data import DataLoader, TensorDataset -from tqdm import tqdm - -# Check if a GPU is available -device = torch.device("cuda" if torch.cuda.is_available() else "cpu") - - -# Restored three-layer model structure -class FeatureInteractionModel(nn.Module): - def __init__(self, num_features): - super(FeatureInteractionModel, self).__init__() - self.fc1 = nn.Linear(num_features, 128) - self.bn1 = nn.BatchNorm1d(128) - self.fc2 = nn.Linear(128, 64) - self.bn2 = nn.BatchNorm1d(64) - self.fc3 = nn.Linear(64, 1) - self.dropout = nn.Dropout(0.3) - - def forward(self, x): - x = F.relu(self.bn1(self.fc1(x))) - x = F.relu(self.bn2(self.fc2(x))) - x = self.dropout(x) - x = torch.sigmoid(self.fc3(x)) - return x - - -# Training function -def fit(X_train, y_train, X_valid, y_valid): - num_features = X_train.shape[1] - model = FeatureInteractionModel(num_features).to(device) - criterion = nn.BCELoss() # Binary classification problem - optimizer = torch.optim.Adam(model.parameters(), lr=0.001) - - # Convert to TensorDataset and create DataLoader - train_dataset = TensorDataset( - torch.tensor(X_train.to_numpy(), dtype=torch.float32), - torch.tensor(y_train.to_numpy().reshape(-1), dtype=torch.float32), - ) - valid_dataset = TensorDataset( - torch.tensor(X_valid.to_numpy(), dtype=torch.float32), - torch.tensor(y_valid.to_numpy().reshape(-1), dtype=torch.float32), - ) - train_loader = DataLoader(train_dataset, 
batch_size=32, shuffle=True) - valid_loader = DataLoader(valid_dataset, batch_size=32, shuffle=False) - - # Train the model - model.train() - for epoch in range(50): - print(f"Epoch {epoch + 1}/50") - epoch_loss = 0 - for X_batch, y_batch in tqdm(train_loader, desc="Training", leave=False): - X_batch, y_batch = X_batch.to(device), y_batch.to(device) # Move data to the device - optimizer.zero_grad() - outputs = model(X_batch).squeeze(1) # Reshape outputs to [32] - loss = criterion(outputs, y_batch) # Adjust target shape - loss.backward() - optimizer.step() - epoch_loss += loss.item() - print(f"End of epoch {epoch + 1}, Avg Loss: {epoch_loss / len(train_loader):.4f}") - - return model - - -# Prediction function -def predict(model, X): - model.eval() - predictions = [] - with torch.no_grad(): - X_tensor = torch.tensor(X.values, dtype=torch.float32).to(device) # Move data to the device - for i in tqdm(range(0, len(X_tensor), 32), desc="Predicting", leave=False): - batch = X_tensor[i : i + 32] # Predict in batches - pred = model(batch).squeeze().cpu().numpy() # Move results back to CPU - predictions.extend(pred) - return np.array(predictions).reshape(-1, 1) # Return predictions diff --git a/rdagent/scenarios/kaggle/experiment/spaceship-titanic_template/model/select_lightgbm.py b/rdagent/scenarios/kaggle/experiment/spaceship-titanic_template/model/select_lightgbm.py new file mode 100644 index 00000000..f230f130 --- /dev/null +++ b/rdagent/scenarios/kaggle/experiment/spaceship-titanic_template/model/select_lightgbm.py @@ -0,0 +1,12 @@ +import pandas as pd + + +def select(X: pd.DataFrame) -> pd.DataFrame: + """ + Select relevant features. To be used in fit & predict function. + """ + # For now, we assume all features are relevant. This can be expanded to feature selection logic. 
+ if X.columns.nlevels == 1: + return X + X.columns = ["_".join(str(i) for i in col).strip() for col in X.columns.values] + return X diff --git a/rdagent/scenarios/kaggle/experiment/spaceship-titanic_template/train.py b/rdagent/scenarios/kaggle/experiment/spaceship-titanic_template/train.py index fe8e3ad5..c94403b5 100644 --- a/rdagent/scenarios/kaggle/experiment/spaceship-titanic_template/train.py +++ b/rdagent/scenarios/kaggle/experiment/spaceship-titanic_template/train.py @@ -95,12 +95,12 @@ def import_module_from_path(module_name, module_path): # 5) Save the validation accuracy -min_index = np.argmax(metrics_all) -pd.Series(data=[metrics_all[min_index]], index=["MCC"]).to_csv("submission_score.csv") +max_index = np.argmax(metrics_all) +pd.Series(data=[metrics_all[max_index]], index=["MCC"]).to_csv("submission_score.csv") # 6) Make predictions on the test set and save them -X_test_selected = model_l[min_index][2].select(X_test.copy()) -y_test_pred = model_l[min_index][1](model_l[min_index][0], X_test_selected) +X_test_selected = model_l[max_index][2].select(X_test.copy()) +y_test_pred = model_l[max_index][1](model_l[max_index][0], X_test_selected) y_test_pred = (y_test_pred > 0.5).astype(bool) y_test_pred = y_test_pred.ravel() diff --git a/rdagent/scenarios/kaggle/experiment/statoil-iceberg-classifier-challenge_template/model/select_lightgbm.py b/rdagent/scenarios/kaggle/experiment/statoil-iceberg-classifier-challenge_template/model/select_lightgbm.py new file mode 100644 index 00000000..f230f130 --- /dev/null +++ b/rdagent/scenarios/kaggle/experiment/statoil-iceberg-classifier-challenge_template/model/select_lightgbm.py @@ -0,0 +1,12 @@ +import pandas as pd + + +def select(X: pd.DataFrame) -> pd.DataFrame: + """ + Select relevant features. To be used in fit & predict function. + """ + # For now, we assume all features are relevant. This can be expanded to feature selection logic. 
+ if X.columns.nlevels == 1: + return X + X.columns = ["_".join(str(i) for i in col).strip() for col in X.columns.values] + return X diff --git a/rdagent/scenarios/kaggle/kaggle_crawler.py b/rdagent/scenarios/kaggle/kaggle_crawler.py index be85fdaa..b9201d98 100644 --- a/rdagent/scenarios/kaggle/kaggle_crawler.py +++ b/rdagent/scenarios/kaggle/kaggle_crawler.py @@ -1,4 +1,5 @@ # %% +import bisect import json import subprocess import time @@ -114,6 +115,26 @@ def leaderboard_scores(competition: str) -> list[float]: return [float(x.score) for x in ll] +def score_rank(competition: str, score: float) -> tuple[int, float]: + """ + Return + ------ + rank: int + rank_percent: float + """ + scores = leaderboard_scores(competition) + if scores[0] < scores[-1]: # Ascending order + rank = bisect.bisect_right(scores, score) + else: # Descending order + scores = scores[::-1] # Reverse the list to use bisect + rank = len(scores) - bisect.bisect_right(scores, score) + + rank = rank + 1 + rank_percent = rank / len(scores) * 100 + + return rank, rank_percent + + def download_notebooks( competition: str, local_path: str = "/data/userdata/share/kaggle/notebooks", num: int = 15 ) -> None: @@ -294,5 +315,6 @@ def collect_knowledge_texts(local_path: str = "/data/userdata/share/kaggle") -> # name = c.ref.split("/")[-1] # crawl_descriptions(name) res = leaderboard_scores(competition="playground-series-s4e8") - + rank, rank_percent = score_rank(competition="playground-series-s4e8", score=0.9832) + print(rank, rank_percent) # %% diff --git a/rdagent/scenarios/kaggle/prompts.yaml b/rdagent/scenarios/kaggle/prompts.yaml index 29659b01..e7452713 100644 --- a/rdagent/scenarios/kaggle/prompts.yaml +++ b/rdagent/scenarios/kaggle/prompts.yaml @@ -50,41 +50,55 @@ hypothesis_output_format: |- hypothesis_specification: Feature engineering: |- Action: Feature engineering - description: We engineer the features for the sake of best model performance on the basis of engineering the most influential 
features. - type_of_feature_and_data_characteristics: - - Clearly define the feature type being introduced. - - Highlight the specific data patterns or characteristics the feature captures. - - Keep it focused—omit unnecessary details. - start_with_simple_features: - - Begin with straightforward and impactful features. - - Briefly explain why these features are expected to work. - - Avoid combining complex features at the outset. - increase_complexity_gradually: - - Add more complex features only after gathering experimental results. - - Discuss potential advantages and the trade-offs involved. - - Combine features only after simpler ones are tested and validated. - new_directions_and_optimizations: - - Justify any new direction based on data analysis or domain knowledge. - - Focus on one new direction at a time for clarity. - - If a hypothesis shows optimization potential (even without surpassing previous best results), explain why and proceed. - feature_library_and_task_management: - - Include features that improve performance in the feature library. - - Each generation should focus on 1-3 feature tasks, balancing simplicity with complexity. + + Description: We engineer the features for the sake of best model performance on the basis of engineering the most influential features. + + 1. Type of Feature and Data Characteristics: + - Clearly define the type of feature being introduced. + - Explain what data characteristics or patterns this feature captures. + - Keep descriptions focused, avoiding redundant details to ensure clarity. + + 2. Simple and Effective Features First: + - Start by introducing features that are simple yet likely to be effective. + - Provide a concise explanation of why these features are expected to perform well. + - Avoid complex or combined features during the initial stages. + + 3. Gradual Complexity Increase: + - After initial feature testing, introduce more complex features. 
+ - Discuss both the potential benefits and any additional complexities of these features. + - Begin combining features only after simpler ones have been tested and validated. + + 4. New Directions and Optimizations: + - If results suggest a need for a new approach, explain why, using data analysis, domain knowledge, or observed patterns. + - Propose one new direction per iteration for clarity and focus. + - If a previous hypothesis did not surpass the previous best but shows promise, continue in the same direction with optimizations. + - Emphasize that features that outperform previous best results are added to the feature library, avoiding redundant work. + + 5. 1-3 Feature Tasks per Generation: + - Each generation should produce 1-3 feature tasks. + - Maintain a balance between simplicity and complexity to develop a diverse and robust feature library. Feature processing: |- Action: Feature processing - Define_the_processing_method: - - Clearly state the type of feature processing. - - Explain how this processing captures data patterns or improves feature usefulness. - - Avoid redundant details. - Begin_with_simple_processing: - - Start with simple, effective processing methods. - - Concisely explain why these methods should improve model performance. - - Introduce complex processing only after gathering experimental results. - Introduce_complexity_gradually: - - Add more sophisticated processing methods step-by-step, after validation. - - Discuss the advantages, challenges, and trade-offs of advanced processing. - - Validate simpler methods before combining them with complex ones. + + 1. Feature Transformation and Normalization: + - Clearly define any transformations applied to features (e.g., scaling, normalization, log transforms). + - Explain how these transformations improve the data's suitability for the model. + - Ensure transformations do not introduce unnecessary complexity early on. + + 2. 
Handling Missing Values and Outliers: + - Define any imputation methods used for missing data (e.g., mean, median, or more complex methods). + - Explain how outliers are handled (e.g., clipping, removal, or transformation). + - Ensure these processes are straightforward, enhancing data quality without overcomplicating early feature processing. + + 3. Feature Interactions and Combinations: + - After testing individual features, introduce combinations or interactions. + - Discuss the potential advantages of feature interaction terms (e.g., polynomial or multiplicative features). + - Ensure interactions are only applied after simpler, individual features have been processed. + + 4. 1-3 Feature Tasks per Generation: + - Each generation should produce 1-3 feature tasks. + - Maintain a balance between simplicity and complexity to develop a diverse and robust feature library. Model feature selection: |- Selection_based_on_model_type: @@ -95,33 +109,37 @@ hypothesis_specification: - Clarify how the selected features complement the model's strengths and handle its potential weaknesses. Model tuning: |- - Explain the hypothesis clearly with valuable information. What kind of model are you building/tuning? What do you think is true? How you are revising and why? What are some innvations? - Focus_on_architecture_or_hyper_parameter_tuning_or_both: - - Focus on designing new model architectures one at a time OR hyper-parameter tuning OR both. - - Each hypothesis should introduce a novel architecture or a significant modification to an existing one, while leveraging previous experiences and the hypothesis history. - - Optimize one model at a time, iterating until its potential is fully explored. Switch to a new model only when you believe the current model’s potential has been exhausted. - Specific_to_model_type: - - Note that any types of tuning or model design must be specific to the model types available in our workspace. 
- - Clearly define the model type (e.g., Neural Network Models (eg, MLP, CNN, RNN, LSTM, GRU etc.), XGBoost, RandomForest, LightGBM) and the architecture/tuning being introduced. - - Ensure the architecture or tuning aligns with the data characteristics and the strengths or limitations of the specific model. - Rationale_behind_architecture_and_tuning: - - Explain the innovation or reasoning behind the architectural design or tuning approach. - - Justify how the new structure or parameter change captures data patterns more effectively, improves learning efficiency, or enhances predictive power. - Start_simple_innovate_gradually: - - Start with innovative yet simple changes to ensure each iteration is well-tested and the results are well-understood. - - Gradually introduce more complex architectural changes or hyper-parameter adjustments based on gathered results and insights. - Introduce_one_innovation_at_a_time: - - Focus on testing one key innovation at a time to isolate its impact on performance. - - Avoid combining multiple innovations in a single iteration to maintain clarity in performance results. - Balance_innovation_with_performance: - - Strive for a balance between creative design and practical, effective performance. - - If a design or tuning shows strong performance, document it in a "library" for future iterations. - Iterative_testing_and_refinement: - - After each test, evaluate and refine the model architecture or tuning based on observed performance and data patterns. - - If a hypothesis shows potential but doesn't surpass previous results, continue optimizing in that direction. - Hypothesis_statement: - - For each hypothesis, specify the exact innovation or tuning approach and explain why it's expected to enhance performance for the chosen model type. - + Explain the hypothesis clearly with valuable information. What kind of model are you building/tuning? What do you think is true? How are you revising and why? What are some innovations? 
Base your hypothesis on the previous history and your understanding of the model code. "Tune" means changing the model architecture or hyperparameters. + Focus_on_architecture_or_hyper_parameter_tuning_or_both: + - Focus on designing new model architectures one at a time OR hyper-parameter tuning OR both. + - Each hypothesis should introduce a novel architecture or a significant modification to an existing one, while leveraging previous experiences and the hypothesis history. + - Optimize one model at a time, iterating until its potential is fully explored. Switch to a new model only when you believe the current model’s potential has been exhausted. + Specific_to_model_type: + - Note that any types of tuning or model design must be specific to the model types available in our workspace. + - Clearly define the model type (e.g., Neural Network Models (MLP, CNN, RNN, LSTM, GRU etc.), XGBoost, RandomForest, LightGBM) and the architecture/tuning being introduced. + - Ensure the architecture or tuning aligns with the data characteristics and the strengths or limitations of the specific model. + Rationale_behind_architecture_and_tuning: + - Explain the innovation or reasoning behind the architectural design or tuning approach. + - Justify how the new structure or parameter change captures data patterns more effectively, improves learning efficiency, or enhances predictive power. + Start_simple_innovate_gradually: + - Start with innovative yet simple changes to ensure each iteration is well-tested and the results are well-understood. + - Gradually introduce more complex architectural changes or hyper-parameter adjustments based on gathered results and insights. + Introduce_one_innovation_at_a_time: + - Focus on testing one key innovation at a time to isolate its impact on performance. + - Avoid combining multiple innovations in a single iteration to maintain clarity in performance results. 
+ Hypothesis_statement: + - For each hypothesis, specify the exact innovation or tuning approach and explain why it's expected to enhance performance for the chosen model type. E.g., instead of a general "Adjusting", specify the direction & extent. + Hypothesis_examples: (Please note that they are only examples) + 1. "Increasing the dropout rate in an MLP from 0.2 to 0.5 will help reduce overfitting and improve model generalization on validation data." + 2. "Adding a skip connection to the CNN architecture will allow deeper layers to receive gradients more effectively, preventing vanishing gradients and improving feature learning." + 3. "Doubling the GRU hidden units from 128 to 256 will allow the model to capture more complex temporal dependencies in time-series data, potentially improving accuracy." + 4. "Switching the LSTM optimizer from Adam to SGD with momentum will slow down convergence, leading to more stable and refined learning over time in sparse data environments." + 5. "Reducing the learning rate in a LightGBM model from 0.05 to 0.01 will slow down the learning process, allowing for better generalization on larger datasets." + 6. "Incorporating a self-attention layer into the RNN model will enhance its ability to focus on important parts of the input sequence, improving sequence-to-sequence translation accuracy." + 7. "Increasing the maximum depth of trees in a RandomForest model from 10 to 20 will allow the model to capture more complex patterns in high-dimensional data, improving performance." + 8. "Replacing the ReLU activation with Leaky ReLU in a CNN will prevent the dying ReLU problem and improve the model’s ability to learn from negative values." + 9. "Introducing early stopping in XGBoost with a patience of 10 rounds will prevent overfitting by halting training when the validation error no longer improves." + 10. 
"Expanding the CNN kernel size from 3x3 to 5x5 in early layers will help capture larger spatial dependencies in image data, enhancing performance on image classification tasks." feature_experiment_output_format: |- According to the hypothesis, please help user design one or more feature engineering tasks. diff --git a/rdagent/scenarios/kaggle/proposal/proposal.py b/rdagent/scenarios/kaggle/proposal/proposal.py index 517d5fdb..44a923a5 100644 --- a/rdagent/scenarios/kaggle/proposal/proposal.py +++ b/rdagent/scenarios/kaggle/proposal/proposal.py @@ -241,8 +241,12 @@ def prepare_context(self, trace: Trace) -> Tuple[dict, bool]: ), "hypothesis_output_format": prompt_dict["hypothesis_output_format"], "hypothesis_specification": ( - f"next experiment action is {action}" if self.scen.if_action_choosing_based_on_UCB else None, - prompt_dict["hypothesis_specification"][action], + { + "next_experiment_action": f"next experiment action is {action}", + "specification": prompt_dict["hypothesis_specification"][action], + } + if self.scen.if_action_choosing_based_on_UCB + else None ), } return context_dict, True