feat: supporting various Kaggle competitions & scenarios for RD-Agent (…

…#409) * Fixes on kaggle output * feat: add kaggle s3e14 template (#394) * add s3e14 template * fix CI * Initialisation of a template of competition * add kaggle s3e16 template (#396) * get kaggle competition scores (#397) * Adding a new competition s4e6 * feat: s4e5 (#400) * init for s4e5 * edit s4e5 * ci issue * feat: S4e3 (#402) * Initialisation of a template of competition * Adding a new competition s4e6 * Competition Initialised * Fixed to make sure that now it runs * Fixing for CI * correct evaluation (#403) * find rank in leaderboard (#405) * fix: model templates for KG scenario (#408) * fix feature selection for some models * feat select template * Updating the prompts for a more powerful model tuning * refine the prompt * fix: template error in s4e6 * feat: show simple execution time in demo (#410) * show time in kaggle demo * change color * fix a small bug * edit loop.py and proposal * delete useless files * CI issues * ci issue --------- Co-authored-by: XianBW <[email protected]> Co-authored-by: Haoran Pan <[email protected]> Co-authored-by: Way2Learn <[email protected]> Co-authored-by: WinstonLiyt <[email protected]> Co-authored-by: TPLin22 <[email protected]>
microsoft · Oct 15, 2024 · 75eea22 · 75eea22
1 parent 8f8afea
commit 75eea22
Show file tree

Hide file tree

Showing 55 changed files with 1,162 additions and 246 deletions.
diff --git a/rdagent/app/kaggle/loop.py b/rdagent/app/kaggle/loop.py
@@ -35,20 +35,16 @@ def __init__(self, PROP_SETTING: BasePropSetting):
         with logger.tag("init"):
             scen: Scenario = import_class(PROP_SETTING.scen)(PROP_SETTING.competition)
             logger.log_object(scen, tag="scenario")
-
             knowledge_base = (
                 import_class(PROP_SETTING.knowledge_base)(PROP_SETTING.knowledge_base_path, scen)
                 if PROP_SETTING.knowledge_base != ""
                 else None
             )
             logger.log_object(knowledge_base, tag="knowledge_base")
-
             self.hypothesis_gen: HypothesisGen = import_class(PROP_SETTING.hypothesis_gen)(scen)
             logger.log_object(self.hypothesis_gen, tag="hypothesis generator")
-
             self.hypothesis2experiment: Hypothesis2Experiment = import_class(PROP_SETTING.hypothesis2experiment)()
             logger.log_object(self.hypothesis2experiment, tag="hypothesis2experiment")
-
             self.feature_coder: Developer = import_class(PROP_SETTING.feature_coder)(scen)
             logger.log_object(self.feature_coder, tag="feature coder")
             self.model_feature_selection_coder: Developer = import_class(PROP_SETTING.model_feature_selection_coder)(
@@ -57,12 +53,10 @@ def __init__(self, PROP_SETTING: BasePropSetting):
             logger.log_object(self.model_feature_selection_coder, tag="model feature selection coder")
             self.model_coder: Developer = import_class(PROP_SETTING.model_coder)(scen)
             logger.log_object(self.model_coder, tag="model coder")
-
             self.feature_runner: Developer = import_class(PROP_SETTING.feature_runner)(scen)
             logger.log_object(self.feature_runner, tag="feature runner")
             self.model_runner: Developer = import_class(PROP_SETTING.model_runner)(scen)
             logger.log_object(self.model_runner, tag="model runner")
-
             self.summarizer: HypothesisExperiment2Feedback = import_class(PROP_SETTING.summarizer)(scen)
             logger.log_object(self.summarizer, tag="summarizer")
             self.trace = KGTrace(scen=scen, knowledge_base=knowledge_base)
@@ -88,7 +82,6 @@ def running(self, prev_out: dict[str, Any]):
             else:
                 exp = self.model_runner.develop(prev_out["coding"])
             logger.log_object(exp, tag="runner result")
-
             if KAGGLE_IMPLEMENT_SETTING.competition in [
                 "optiver-realized-volatility-prediction",
                 "covid19-global-forecasting-week-1",
@@ -99,7 +92,6 @@ def running(self, prev_out: dict[str, Any]):
                     )
                 except Exception as e:
                     logger.error(f"Merge python files to one file failed: {e}")
-
             if KAGGLE_IMPLEMENT_SETTING.auto_submit:
                 csv_path = exp.experiment_workspace.workspace_path / "submission.csv"
                 try:
@@ -129,21 +121,16 @@ def running(self, prev_out: dict[str, Any]):
 def main(path=None, step_n=None, competition=None):
     """
     Auto R&D Evolving loop for models in a kaggle{} scenario.
-
     You can continue running session by
-
     .. code-block:: bash
-
         dotenv run -- python rdagent/app/kaggle/loop.py [--competition titanic] $LOG_PATH/__session__/1/0_propose  --step_n 1   # `step_n` is a optional parameter
         rdagent kaggle --competition playground-series-s4e8  # You are encouraged to use this one.
-
     """
     if competition:
         KAGGLE_IMPLEMENT_SETTING.competition = competition
         download_data(competition=competition, local_path=KAGGLE_IMPLEMENT_SETTING.local_data_path)
     else:
         logger.error("Please specify competition name.")
-
     if path is None:
         kaggle_loop = KaggleRDLoop(KAGGLE_IMPLEMENT_SETTING)
     else:

diff --git a/rdagent/log/ui/app.py b/rdagent/log/ui/app.py
@@ -88,6 +88,9 @@
 if "lround" not in state:
     state.lround = 0  # RD Loop Round
 
+if "times" not in state:
+    state.times = defaultdict(lambda: defaultdict(list))
+
 if "erounds" not in state:
     state.erounds = defaultdict(int)  # Evolving Rounds in each RD Loop
 
@@ -186,6 +189,17 @@ def get_msgs_until(end_func: Callable[[Message], bool] = lambda _: True):
                             )
 
                     state.msgs[state.lround][msg.tag].append(msg)
+
+                    # Update Times
+                    if "init" in tags:
+                        state.times[state.lround]["init"].append(msg.timestamp)
+                    if "r" in tags:
+                        state.times[state.lround]["r"].append(msg.timestamp)
+                    if "d" in tags:
+                        state.times[state.lround]["d"].append(msg.timestamp)
+                    if "ef" in tags:
+                        state.times[state.lround]["ef"].append(msg.timestamp)
+
                     # Stop Getting Logs
                     if end_func(msg):
                         break
@@ -224,6 +238,7 @@ def refresh(same_trace: bool = False):
     state.last_msg = None
     state.current_tags = []
     state.alpha158_metrics = None
+    state.times = defaultdict(lambda: defaultdict(list))
 
 
 def evolving_feedback_window(wsf: FactorSingleFeedback | ModelCoderFeedback):
@@ -741,6 +756,18 @@ def evolving_window():
             st.markdown(state.scenario.rich_style_description + css, unsafe_allow_html=True)
 
 
+def show_times(round: int):
+    for k, v in state.times[round].items():
+        if len(v) > 1:
+            diff = v[-1] - v[0]
+        else:
+            diff = v[0] - v[0]
+        total_seconds = diff.seconds
+        seconds = total_seconds % 60
+        minutes = total_seconds // 60
+        st.markdown(f"**:blue[{k}]**: :red[**{minutes}**] minutes :orange[**{seconds}**] seconds")
+
+
 if state.scenario is not None:
     summary_window()
 
@@ -754,8 +781,12 @@ def evolving_window():
             round = st.radio("**Loops**", horizontal=True, options=r_options, index=state.lround - 1)
         else:
             round = 1
+
+        show_times(round)
         rf_c, d_c = st.columns([2, 2])
     elif isinstance(state.scenario, GeneralModelScenario):
+        show_times(round)
+
         rf_c = st.container()
         d_c = st.container()
         round = 1

diff --git a/rdagent/scenarios/kaggle/experiment/digit-recognizer_template/model/select_lightgbm.py b/rdagent/scenarios/kaggle/experiment/digit-recognizer_template/model/select_lightgbm.py
@@ -0,0 +1,12 @@
+import pandas as pd
+
+
+def select(X: pd.DataFrame) -> pd.DataFrame:
+    """
+    Select relevant features. To be used in fit & predict function.
+    """
+    # For now, we assume all features are relevant. This can be expanded to feature selection logic.
+    if X.columns.nlevels == 1:
+        return X
+    X.columns = ["_".join(str(i) for i in col).strip() for col in X.columns.values]
+    return X
diff --git a/rdagent/scenarios/kaggle/experiment/digit-recognizer_template/model/select_randomforest.py b/rdagent/scenarios/kaggle/experiment/digit-recognizer_template/model/select_randomforest.py
@@ -0,0 +1,12 @@
+import pandas as pd
+
+
+def select(X: pd.DataFrame) -> pd.DataFrame:
+    """
+    Select relevant features. To be used in fit & predict function.
+    """
+    # For now, we assume all features are relevant. This can be expanded to feature selection logic.
+    if X.columns.nlevels == 1:
+        return X
+    X.columns = ["_".join(str(i) for i in col).strip() for col in X.columns.values]
+    return X
diff --git a/rdagent/scenarios/kaggle/experiment/digit-recognizer_template/train.py b/rdagent/scenarios/kaggle/experiment/digit-recognizer_template/train.py
@@ -75,14 +75,14 @@ def import_module_from_path(module_name, module_path):
     metrics_all.append(accuracy)
 
 # 5) Save the validation accuracy
-min_index = np.argmax(metrics_all)
-pd.Series(data=[metrics_all[min_index]], index=["multi-class accuracy"]).to_csv("submission_score.csv")
+max_index = np.argmax(metrics_all)
+pd.Series(data=[metrics_all[max_index]], index=["multi-class accuracy"]).to_csv("submission_score.csv")
 
 # 6) Submit predictions for the test
 ids = range(1, len(X_test) + 1)
 
 # TODO: fix selection
 print(X_valid_selected.columns)
-y_test_pred = model_l[min_index][1](model_l[min_index][0], model_l[min_index][2].select(X_test)).flatten()
+y_test_pred = model_l[max_index][1](model_l[max_index][0], model_l[max_index][2].select(X_test)).flatten()
 submission_result = pd.DataFrame({"ImageId": ids, "Label": y_test_pred})
 submission_result.to_csv("submission.csv", index=False)
diff --git a/rdagent/scenarios/kaggle/experiment/forest-cover-type-prediction_template/model/model_nn.py b/rdagent/scenarios/kaggle/experiment/forest-cover-type-prediction_template/model/model_nn.py
diff --git a/...cenarios/kaggle/experiment/forest-cover-type-prediction_template/model/select_lightgbm.py b/...cenarios/kaggle/experiment/forest-cover-type-prediction_template/model/select_lightgbm.py
@@ -0,0 +1,12 @@
+import pandas as pd
+
+
+def select(X: pd.DataFrame) -> pd.DataFrame:
+    """
+    Select relevant features. To be used in fit & predict function.
+    """
+    # For now, we assume all features are relevant. This can be expanded to feature selection logic.
+    if X.columns.nlevels == 1:
+        return X
+    X.columns = ["_".join(str(i) for i in col).strip() for col in X.columns.values]
+    return X
diff --git a/rdagent/scenarios/kaggle/experiment/forest-cover-type-prediction_template/train.py b/rdagent/scenarios/kaggle/experiment/forest-cover-type-prediction_template/train.py
@@ -76,12 +76,12 @@ def import_module_from_path(module_name, module_path):
     metrics_all.append(accuracy)
 
 # 5) Save the validation accuracy
-min_index = np.argmax(metrics_all)
-pd.Series(data=[metrics_all[min_index]], index=["multi-class accuracy"]).to_csv("submission_score.csv")
+max_index = np.argmax(metrics_all)
+pd.Series(data=[metrics_all[max_index]], index=["multi-class accuracy"]).to_csv("submission_score.csv")
 
 # 6) Make predictions on the test set and save them
-X_test_selected = model_l[min_index][2].select(X_test.copy())
-y_test_pred = model_l[min_index][1](model_l[min_index][0], X_test_selected).flatten() + 1
+X_test_selected = model_l[max_index][2].select(X_test.copy())
+y_test_pred = model_l[max_index][1](model_l[max_index][0], X_test_selected).flatten() + 1
 
 
 # 7) Submit predictions for the test set

diff --git a/...aggle/experiment/optiver-realized-volatility-prediction_template/model/select_lightgbm.py b/...aggle/experiment/optiver-realized-volatility-prediction_template/model/select_lightgbm.py
@@ -0,0 +1,12 @@
+import pandas as pd
+
+
+def select(X: pd.DataFrame) -> pd.DataFrame:
+    """
+    Select relevant features. To be used in fit & predict function.
+    """
+    # For now, we assume all features are relevant. This can be expanded to feature selection logic.
+    if X.columns.nlevels == 1:
+        return X
+    X.columns = ["_".join(str(i) for i in col).strip() for col in X.columns.values]
+    return X
diff --git a/...rios/kaggle/experiment/optiver-realized-volatility-prediction_template/model/select_nn.py b/...rios/kaggle/experiment/optiver-realized-volatility-prediction_template/model/select_nn.py
@@ -0,0 +1,12 @@
+import pandas as pd
+
+
+def select(X: pd.DataFrame) -> pd.DataFrame:
+    """
+    Select relevant features. To be used in fit & predict function.
+    """
+    # For now, we assume all features are relevant. This can be expanded to feature selection logic.
+    if X.columns.nlevels == 1:
+        return X
+    X.columns = ["_".join(str(i) for i in col).strip() for col in X.columns.values]
+    return X
diff --git a/...ent/scenarios/kaggle/experiment/playground-series-s3e11_template/model/select_lightgbm.py b/...ent/scenarios/kaggle/experiment/playground-series-s3e11_template/model/select_lightgbm.py
@@ -0,0 +1,12 @@
+import pandas as pd
+
+
+def select(X: pd.DataFrame) -> pd.DataFrame:
+    """
+    Select relevant features. To be used in fit & predict function.
+    """
+    # For now, we assume all features are relevant. This can be expanded to feature selection logic.
+    if X.columns.nlevels == 1:
+        return X
+    X.columns = ["_".join(str(i) for i in col).strip() for col in X.columns.values]
+    return X
diff --git a/rdagent/scenarios/kaggle/experiment/playground-series-s3e11_template/model/select_nn.py b/rdagent/scenarios/kaggle/experiment/playground-series-s3e11_template/model/select_nn.py
@@ -0,0 +1,12 @@
+import pandas as pd
+
+
+def select(X: pd.DataFrame) -> pd.DataFrame:
+    """
+    Select relevant features. To be used in fit & predict function.
+    """
+    # For now, we assume all features are relevant. This can be expanded to feature selection logic.
+    if X.columns.nlevels == 1:
+        return X
+    X.columns = ["_".join(str(i) for i in col).strip() for col in X.columns.values]
+    return X
diff --git a/rdagent/scenarios/kaggle/experiment/playground-series-s3e14_template/fea_share_preprocess.py b/rdagent/scenarios/kaggle/experiment/playground-series-s3e14_template/fea_share_preprocess.py
@@ -0,0 +1,38 @@
+import os
+
+import numpy as np  # linear algebra
+import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)
+from sklearn.model_selection import train_test_split
+
+
+def preprocess_script():
+    """
+    This method applies the preprocessing steps to the training, validation, and test datasets.
+    """
+    if os.path.exists("/kaggle/input/X_train.pkl"):
+        X_train = pd.read_pickle("/kaggle/input/X_train.pkl")
+        X_valid = pd.read_pickle("/kaggle/input/X_valid.pkl")
+        y_train = pd.read_pickle("/kaggle/input/y_train.pkl")
+        y_valid = pd.read_pickle("/kaggle/input/y_valid.pkl")
+        X_test = pd.read_pickle("/kaggle/input/X_test.pkl")
+        others = pd.read_pickle("/kaggle/input/others.pkl")
+        y_train = pd.Series(y_train).reset_index(drop=True)
+        y_valid = pd.Series(y_valid).reset_index(drop=True)
+
+        return X_train, X_valid, y_train, y_valid, X_test, *others
+
+    # train
+    train = pd.read_csv("/kaggle/input/train.csv")
+    X_train, X_valid, y_train, y_valid = train_test_split(
+        train.drop(["yield", "id"], axis=1), train["yield"], test_size=0.2, random_state=2023
+    )
+    y_train = pd.Series(y_train).reset_index(drop=True)
+    y_valid = pd.Series(y_valid).reset_index(drop=True)
+
+    # test
+    test = pd.read_csv("/kaggle/input/test.csv")
+
+    ids = test["id"]
+    X_test = test.drop(["id"], axis=1)
+
+    return X_train, X_valid, y_train, y_valid, X_test, ids