feat: supporting various Kaggle competitions & scenarios for RD-Agent (#409)

* Fixes on Kaggle output

* feat: add kaggle s3e14 template (#394)

* add s3e14 template

* fix CI

* Initialise a competition template

* add kaggle s3e16 template (#396)

* get kaggle competition scores (#397)

* Adding a new competition s4e6

* feat: s4e5 (#400)

* init for s4e5

* edit s4e5

* ci issue

* feat: S4e3 (#402)

* Initialise a competition template

* Adding a new competition s4e6

* Competition Initialised

* Fixed to make sure it now runs

* Fixing for CI

* correct evaluation (#403)

* find rank in leaderboard (#405)

* fix: model templates for KG scenario (#408)

* fix feature selection for some models

* feat select template

* Update the prompts for more powerful model tuning

* refine the prompt

* fix: template error in s4e6

* feat: show simple execution time in demo (#410)

* show time in kaggle demo

* change color

* fix a small bug

* edit loop.py and proposal

* delete useless files

* CI issues

* ci issue

---------

Co-authored-by: XianBW <[email protected]>
Co-authored-by: Haoran Pan <[email protected]>
Co-authored-by: Way2Learn <[email protected]>
Co-authored-by: WinstonLiyt <[email protected]>
Co-authored-by: TPLin22 <[email protected]>
6 people authored Oct 15, 2024
1 parent 8f8afea commit 75eea22
Showing 55 changed files with 1,162 additions and 246 deletions.
13 changes: 0 additions & 13 deletions rdagent/app/kaggle/loop.py
@@ -35,20 +35,16 @@ def __init__(self, PROP_SETTING: BasePropSetting):
    with logger.tag("init"):
        scen: Scenario = import_class(PROP_SETTING.scen)(PROP_SETTING.competition)
        logger.log_object(scen, tag="scenario")

        knowledge_base = (
            import_class(PROP_SETTING.knowledge_base)(PROP_SETTING.knowledge_base_path, scen)
            if PROP_SETTING.knowledge_base != ""
            else None
        )
        logger.log_object(knowledge_base, tag="knowledge_base")

        self.hypothesis_gen: HypothesisGen = import_class(PROP_SETTING.hypothesis_gen)(scen)
        logger.log_object(self.hypothesis_gen, tag="hypothesis generator")

        self.hypothesis2experiment: Hypothesis2Experiment = import_class(PROP_SETTING.hypothesis2experiment)()
        logger.log_object(self.hypothesis2experiment, tag="hypothesis2experiment")

        self.feature_coder: Developer = import_class(PROP_SETTING.feature_coder)(scen)
        logger.log_object(self.feature_coder, tag="feature coder")
        self.model_feature_selection_coder: Developer = import_class(PROP_SETTING.model_feature_selection_coder)(
@@ -57,12 +53,10 @@ def __init__(self, PROP_SETTING: BasePropSetting):
        logger.log_object(self.model_feature_selection_coder, tag="model feature selection coder")
        self.model_coder: Developer = import_class(PROP_SETTING.model_coder)(scen)
        logger.log_object(self.model_coder, tag="model coder")

        self.feature_runner: Developer = import_class(PROP_SETTING.feature_runner)(scen)
        logger.log_object(self.feature_runner, tag="feature runner")
        self.model_runner: Developer = import_class(PROP_SETTING.model_runner)(scen)
        logger.log_object(self.model_runner, tag="model runner")

        self.summarizer: HypothesisExperiment2Feedback = import_class(PROP_SETTING.summarizer)(scen)
        logger.log_object(self.summarizer, tag="summarizer")
        self.trace = KGTrace(scen=scen, knowledge_base=knowledge_base)
@@ -88,7 +82,6 @@ def running(self, prev_out: dict[str, Any]):
    else:
        exp = self.model_runner.develop(prev_out["coding"])
    logger.log_object(exp, tag="runner result")

    if KAGGLE_IMPLEMENT_SETTING.competition in [
        "optiver-realized-volatility-prediction",
        "covid19-global-forecasting-week-1",
@@ -99,7 +92,6 @@ def running(self, prev_out: dict[str, Any]):
        )
    except Exception as e:
        logger.error(f"Merge python files to one file failed: {e}")

    if KAGGLE_IMPLEMENT_SETTING.auto_submit:
        csv_path = exp.experiment_workspace.workspace_path / "submission.csv"
        try:
@@ -129,21 +121,16 @@ def running(self, prev_out: dict[str, Any]):
def main(path=None, step_n=None, competition=None):
    """
    Auto R&D Evolving loop for models in a kaggle{} scenario.
    You can continue running a session by
    .. code-block:: bash
        dotenv run -- python rdagent/app/kaggle/loop.py [--competition titanic] $LOG_PATH/__session__/1/0_propose --step_n 1  # `step_n` is an optional parameter
        rdagent kaggle --competition playground-series-s4e8  # You are encouraged to use this one.
    """
    if competition:
        KAGGLE_IMPLEMENT_SETTING.competition = competition
        download_data(competition=competition, local_path=KAGGLE_IMPLEMENT_SETTING.local_data_path)
    else:
        logger.error("Please specify competition name.")

    if path is None:
        kaggle_loop = KaggleRDLoop(KAGGLE_IMPLEMENT_SETTING)
    else:
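All of the components above are wired from configuration strings rather than hard-coded classes. As a rough sketch of what an `import_class` helper like the one used here typically does (the semantics are assumed from usage, and the dotted-path value in the comment is illustrative, not taken from this diff):

import importlib


def import_class(path: str):
    # Resolve a dotted path such as "pkg.module.ClassName" to the class object.
    module_path, class_name = path.rsplit(".", 1)
    return getattr(importlib.import_module(module_path), class_name)


# Hypothetical usage mirroring the loop's wiring:
# gen_cls = import_class("rdagent.scenarios.kaggle.proposal.KGHypothesisGen")
# hypothesis_gen = gen_cls(scen)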
31 changes: 31 additions & 0 deletions rdagent/log/ui/app.py
@@ -88,6 +88,9 @@
if "lround" not in state:
    state.lround = 0 # RD Loop Round

if "times" not in state:
    state.times = defaultdict(lambda: defaultdict(list))

if "erounds" not in state:
    state.erounds = defaultdict(int) # Evolving Rounds in each RD Loop

@@ -186,6 +189,17 @@ def get_msgs_until(end_func: Callable[[Message], bool] = lambda _: True):
            )

        state.msgs[state.lround][msg.tag].append(msg)

        # Update Times
        if "init" in tags:
            state.times[state.lround]["init"].append(msg.timestamp)
        if "r" in tags:
            state.times[state.lround]["r"].append(msg.timestamp)
        if "d" in tags:
            state.times[state.lround]["d"].append(msg.timestamp)
        if "ef" in tags:
            state.times[state.lround]["ef"].append(msg.timestamp)

        # Stop Getting Logs
        if end_func(msg):
            break
@@ -224,6 +238,7 @@ def refresh(same_trace: bool = False):
    state.last_msg = None
    state.current_tags = []
    state.alpha158_metrics = None
    state.times = defaultdict(lambda: defaultdict(list))


def evolving_feedback_window(wsf: FactorSingleFeedback | ModelCoderFeedback):
@@ -741,6 +756,18 @@ def evolving_window():
st.markdown(state.scenario.rich_style_description + css, unsafe_allow_html=True)


def show_times(round: int):
    for k, v in state.times[round].items():
        if len(v) > 1:
            diff = v[-1] - v[0]
        else:
            diff = v[0] - v[0]
        total_seconds = diff.seconds
        seconds = total_seconds % 60
        minutes = total_seconds // 60
        st.markdown(f"**:blue[{k}]**: :red[**{minutes}**] minutes :orange[**{seconds}**] seconds")


if state.scenario is not None:
    summary_window()

@@ -754,8 +781,12 @@ def evolving_window():
        round = st.radio("**Loops**", horizontal=True, options=r_options, index=state.lround - 1)
    else:
        round = 1

    show_times(round)
    rf_c, d_c = st.columns([2, 2])
elif isinstance(state.scenario, GeneralModelScenario):
    show_times(round)

    rf_c = st.container()
    d_c = st.container()
    round = 1
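For reference, the per-stage duration that `show_times` renders is just the gap between the first and last logged timestamp for a tag. A minimal sketch of that arithmetic (the timestamps are invented for illustration); note that `timedelta.seconds` only covers the sub-day component, which is fine for loop stages that finish within 24 hours:

from datetime import datetime, timedelta

# Invented timestamps standing in for the msg.timestamp values of one stage
v = [datetime(2024, 10, 15, 12, 0, 5), datetime(2024, 10, 15, 12, 3, 42)]

diff = v[-1] - v[0] if len(v) > 1 else timedelta()  # zero when only one message was logged
minutes, seconds = divmod(diff.seconds, 60)  # .seconds ignores whole days
print(f"{minutes} minutes {seconds} seconds")  # -> 3 minutes 37 seconds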
@@ -0,0 +1,12 @@
import pandas as pd


def select(X: pd.DataFrame) -> pd.DataFrame:
    """
    Select relevant features. To be used in fit & predict function.
    """
    # For now, we assume all features are relevant. This can be expanded to feature selection logic.
    if X.columns.nlevels == 1:
        return X
    X.columns = ["_".join(str(i) for i in col).strip() for col in X.columns.values]
    return X
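A quick usage sketch of the template above, reusing the `select` defined just over it (the two-level columns are invented for illustration): when a feature pipeline produces a MultiIndex, the column tuples are flattened into single underscore-joined names so downstream models see flat labels.

import pandas as pd

# Invented two-level columns, e.g. from a groupby/aggregate feature step
X = pd.DataFrame([[0.1, 0.2]], columns=pd.MultiIndex.from_tuples([("f1", "mean"), ("f1", "std")]))

print(select(X).columns.tolist())  # -> ['f1_mean', 'f1_std']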
@@ -0,0 +1,12 @@
import pandas as pd


def select(X: pd.DataFrame) -> pd.DataFrame:
    """
    Select relevant features. To be used in fit & predict function.
    """
    # For now, we assume all features are relevant. This can be expanded to feature selection logic.
    if X.columns.nlevels == 1:
        return X
    X.columns = ["_".join(str(i) for i in col).strip() for col in X.columns.values]
    return X
@@ -75,14 +75,14 @@ def import_module_from_path(module_name, module_path):
     metrics_all.append(accuracy)

 # 5) Save the validation accuracy
-min_index = np.argmax(metrics_all)
-pd.Series(data=[metrics_all[min_index]], index=["multi-class accuracy"]).to_csv("submission_score.csv")
+max_index = np.argmax(metrics_all)
+pd.Series(data=[metrics_all[max_index]], index=["multi-class accuracy"]).to_csv("submission_score.csv")

 # 6) Submit predictions for the test
 ids = range(1, len(X_test) + 1)

 # TODO: fix selection
 print(X_valid_selected.columns)
-y_test_pred = model_l[min_index][1](model_l[min_index][0], model_l[min_index][2].select(X_test)).flatten()
+y_test_pred = model_l[max_index][1](model_l[max_index][0], model_l[max_index][2].select(X_test)).flatten()
 submission_result = pd.DataFrame({"ImageId": ids, "Label": y_test_pred})
 submission_result.to_csv("submission.csv", index=False)
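For context, the change in this hunk is about the index variable's name and meaning: `np.argmax` already returns the position of the highest validation accuracy, so the old `min_index` name was misleading rather than behaviorally wrong. A tiny illustration with invented scores:

import numpy as np

metrics_all = [0.91, 0.87, 0.93]
max_index = int(np.argmax(metrics_all))  # index of the best validation accuracy
print(max_index, metrics_all[max_index])  # -> 2 0.93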

This file was deleted.

@@ -0,0 +1,12 @@
import pandas as pd


def select(X: pd.DataFrame) -> pd.DataFrame:
    """
    Select relevant features. To be used in fit & predict function.
    """
    # For now, we assume all features are relevant. This can be expanded to feature selection logic.
    if X.columns.nlevels == 1:
        return X
    X.columns = ["_".join(str(i) for i in col).strip() for col in X.columns.values]
    return X
@@ -76,12 +76,12 @@ def import_module_from_path(module_name, module_path):
     metrics_all.append(accuracy)

 # 5) Save the validation accuracy
-min_index = np.argmax(metrics_all)
-pd.Series(data=[metrics_all[min_index]], index=["multi-class accuracy"]).to_csv("submission_score.csv")
+max_index = np.argmax(metrics_all)
+pd.Series(data=[metrics_all[max_index]], index=["multi-class accuracy"]).to_csv("submission_score.csv")

 # 6) Make predictions on the test set and save them
-X_test_selected = model_l[min_index][2].select(X_test.copy())
-y_test_pred = model_l[min_index][1](model_l[min_index][0], X_test_selected).flatten() + 1
+X_test_selected = model_l[max_index][2].select(X_test.copy())
+y_test_pred = model_l[max_index][1](model_l[max_index][0], X_test_selected).flatten() + 1


 # 7) Submit predictions for the test set
@@ -0,0 +1,12 @@
import pandas as pd


def select(X: pd.DataFrame) -> pd.DataFrame:
    """
    Select relevant features. To be used in fit & predict function.
    """
    # For now, we assume all features are relevant. This can be expanded to feature selection logic.
    if X.columns.nlevels == 1:
        return X
    X.columns = ["_".join(str(i) for i in col).strip() for col in X.columns.values]
    return X
@@ -0,0 +1,12 @@
import pandas as pd


def select(X: pd.DataFrame) -> pd.DataFrame:
    """
    Select relevant features. To be used in fit & predict function.
    """
    # For now, we assume all features are relevant. This can be expanded to feature selection logic.
    if X.columns.nlevels == 1:
        return X
    X.columns = ["_".join(str(i) for i in col).strip() for col in X.columns.values]
    return X
@@ -0,0 +1,12 @@
import pandas as pd


def select(X: pd.DataFrame) -> pd.DataFrame:
    """
    Select relevant features. To be used in fit & predict function.
    """
    # For now, we assume all features are relevant. This can be expanded to feature selection logic.
    if X.columns.nlevels == 1:
        return X
    X.columns = ["_".join(str(i) for i in col).strip() for col in X.columns.values]
    return X
@@ -0,0 +1,12 @@
import pandas as pd


def select(X: pd.DataFrame) -> pd.DataFrame:
    """
    Select relevant features. To be used in fit & predict function.
    """
    # For now, we assume all features are relevant. This can be expanded to feature selection logic.
    if X.columns.nlevels == 1:
        return X
    X.columns = ["_".join(str(i) for i in col).strip() for col in X.columns.values]
    return X
@@ -0,0 +1,38 @@
import os

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.model_selection import train_test_split


def preprocess_script():
    """
    This method applies the preprocessing steps to the training, validation, and test datasets.
    """
    if os.path.exists("/kaggle/input/X_train.pkl"):
        X_train = pd.read_pickle("/kaggle/input/X_train.pkl")
        X_valid = pd.read_pickle("/kaggle/input/X_valid.pkl")
        y_train = pd.read_pickle("/kaggle/input/y_train.pkl")
        y_valid = pd.read_pickle("/kaggle/input/y_valid.pkl")
        X_test = pd.read_pickle("/kaggle/input/X_test.pkl")
        others = pd.read_pickle("/kaggle/input/others.pkl")
        y_train = pd.Series(y_train).reset_index(drop=True)
        y_valid = pd.Series(y_valid).reset_index(drop=True)

        return X_train, X_valid, y_train, y_valid, X_test, *others

    # train
    train = pd.read_csv("/kaggle/input/train.csv")
    X_train, X_valid, y_train, y_valid = train_test_split(
        train.drop(["yield", "id"], axis=1), train["yield"], test_size=0.2, random_state=2023
    )
    y_train = pd.Series(y_train).reset_index(drop=True)
    y_valid = pd.Series(y_valid).reset_index(drop=True)

    # test
    test = pd.read_csv("/kaggle/input/test.csv")

    ids = test["id"]
    X_test = test.drop(["id"], axis=1)

    return X_train, X_valid, y_train, y_valid, X_test, ids
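A hypothetical driver for this template (the `/kaggle/input` layout and the `yield` target column come from the diff above; everything else is illustrative): on a fresh run it splits `train.csv` into train/validation sets, while on subsequent runs it short-circuits to the cached pickles.

# Illustrative only: expects the /kaggle/input files referenced above to exist.
X_train, X_valid, y_train, y_valid, X_test, ids = preprocess_script()
print(X_train.shape, X_valid.shape, X_test.shape)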