Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: improve_execution_time_in_kaggle_loop #279

Merged
merged 4 commits into from
Sep 20, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions rdagent/app/kaggle/loop.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
from rdagent.components.workflow.conf import BasePropSetting
from rdagent.components.workflow.rd_loop import RDLoop
from rdagent.core.developer import Developer
from rdagent.core.exception import ModelEmptyError
from rdagent.core.exception import FactorEmptyError, ModelEmptyError
from rdagent.core.proposal import (
Hypothesis2Experiment,
HypothesisExperiment2Feedback,
Expand Down Expand Up @@ -71,7 +71,7 @@ def running(self, prev_out: dict[str, Any]):
logger.log_object(exp, tag="runner result")
return exp

skip_loop_error = (ModelEmptyError,)
skip_loop_error = (ModelEmptyError, FactorEmptyError)


def main(path=None, step_n=None, competition=None):
Expand Down
2 changes: 0 additions & 2 deletions rdagent/app/qlib_rd_loop/factor_from_report.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,6 @@
extract_first_page_screenshot_from_pdf,
load_and_process_pdfs_by_langchain,
)
from rdagent.components.workflow.rd_loop import RDLoop
from rdagent.core.exception import FactorEmptyError
from rdagent.core.prompts import Prompts
from rdagent.core.proposal import Hypothesis
from rdagent.log import rdagent_logger as logger
Expand Down
15 changes: 1 addition & 14 deletions rdagent/components/coder/factor_coder/factor.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,19 +87,6 @@ def __init__(
self.executed_factor_value_dataframe = executed_factor_value_dataframe
self.raise_exception = raise_exception

@staticmethod
def link_data_to_workspace(data_path: Path, workspace_path: Path) -> None:
    """
    Symlink every entry found directly inside ``data_path`` into ``workspace_path``.

    Each link is named after the entry it points to. A file or symlink that already
    occupies that name in the workspace (including a dangling symlink) is removed
    before the new link is created.

    Parameters
    ----------
    data_path : Path
        Folder whose direct children are linked.
    workspace_path : Path
        Folder that receives the symlinks.
    """
    data_path = Path(data_path).absolute()  # in case of relative path that will be invalid when we change cwd.
    workspace_path = Path(workspace_path)
    for data_file_path in data_path.iterdir():
        workspace_data_file_path = workspace_path / data_file_path.name
        # `exists()` follows symlinks and is False for a dangling link; without the
        # `is_symlink()` check the stale link would stay in place and `ln -s` would
        # fail on it, unnoticed because of `check=False`.
        if workspace_data_file_path.exists() or workspace_data_file_path.is_symlink():
            workspace_data_file_path.unlink()
        subprocess.run(
            ["ln", "-s", data_file_path, workspace_data_file_path],
            check=False,
        )

def execute(self, store_result: bool = False, data_type: str = "Debug") -> Tuple[str, pd.DataFrame]:
"""
execute the implementation and get the factor value by the following steps:
Expand Down Expand Up @@ -154,7 +141,7 @@ def execute(self, store_result: bool = False, data_type: str = "Debug") -> Tuple
source_data_path.mkdir(exist_ok=True, parents=True)
code_path = self.workspace_path / f"factor.py"

self.link_data_to_workspace(source_data_path, self.workspace_path)
self.link_all_files_in_folder_to_workspace(source_data_path, self.workspace_path)

execution_feedback = self.FB_EXECUTION_SUCCEEDED
execution_success = False
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,8 @@ import numpy as np
import pandas as pd
from factor import feature_engineering_cls

if os.path.exists("valid.pkl"):
valid_df = pd.read_pickle("valid.pkl")
if os.path.exists("X_valid.pkl"):
valid_df = pd.read_pickle("X_valid.pkl").head(1000)
else:
raise FileNotFoundError("No valid data found.")

Expand Down
14 changes: 13 additions & 1 deletion rdagent/core/experiment.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,13 @@
from __future__ import annotations

import os
import shutil
import uuid
from abc import ABC, abstractmethod
from collections.abc import Sequence
from copy import deepcopy
from pathlib import Path
from typing import Any, Generic, Sequence, TypeVar
from typing import Any, Generic, TypeVar

from rdagent.core.conf import RD_AGENT_SETTINGS

Expand Down Expand Up @@ -111,6 +113,16 @@ def prepare(self) -> None:
"""
self.workspace_path.mkdir(parents=True, exist_ok=True)

@staticmethod
def link_all_files_in_folder_to_workspace(data_path: Path, workspace_path: Path) -> None:
    """
    Symlink every entry found directly inside ``data_path`` into ``workspace_path``.

    Each link is named after the entry it points to. A file or symlink that already
    occupies that name in the workspace (including a dangling symlink) is removed
    before the new link is created.

    Parameters
    ----------
    data_path : Path
        Folder whose direct children are linked.
    workspace_path : Path
        Folder that receives the symlinks.
    """
    data_path = Path(data_path).absolute()  # in case of relative path that will be invalid when we change cwd.
    workspace_path = Path(workspace_path)
    for data_file_path in data_path.iterdir():
        workspace_data_file_path = workspace_path / data_file_path.name
        # `exists()` follows symlinks and is False for a dangling link; without the
        # `is_symlink()` check such a link would survive and `os.symlink` would raise
        # FileExistsError.
        if workspace_data_file_path.exists() or workspace_data_file_path.is_symlink():
            workspace_data_file_path.unlink()
        os.symlink(data_file_path, workspace_data_file_path)

def inject_code(self, **files: str) -> None:
"""
Inject the code into the folder.
Expand Down
3 changes: 1 addition & 2 deletions rdagent/core/prompts.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,11 @@
from pathlib import Path # noqa: I001
from typing import Dict

import yaml

from rdagent.core.utils import SingletonBaseClass


class Prompts(SingletonBaseClass, Dict[str, str]):
class Prompts(SingletonBaseClass, dict[str, str]):
def __init__(self, file_path: Path) -> None:
super().__init__()
with file_path.open(encoding="utf8") as file:
Expand Down
2 changes: 1 addition & 1 deletion rdagent/core/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ def __new__(cls, *args: Any, **kwargs: Any) -> Any:
raise RDAgentException(exception_message)
class_name = [(-1, f"{cls.__module__}.{cls.__name__}")]
args_l = [(i, args[i]) for i in args]
kwargs_l = list(sorted(kwargs.items()))
kwargs_l = sorted(kwargs.items())
all_args = class_name + args_l + kwargs_l
kwargs_hash = hash(tuple(all_args))
if kwargs_hash not in cls._instance_dict:
Expand Down
28 changes: 18 additions & 10 deletions rdagent/scenarios/data_mining/proposal/model_proposal.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,14 +35,18 @@ def __init__(self, scen: Scenario) -> Tuple[dict, bool]:
super().__init__(scen)

def prepare_context(self, trace: Trace) -> Tuple[dict, bool]:
hypothesis_feedback = (
Environment(undefined=StrictUndefined)
.from_string(prompt_dict["hypothesis_and_feedback"])
.render(trace=trace)
hypothesis_and_feedback = (
(
Environment(undefined=StrictUndefined)
.from_string(prompt_dict["hypothesis_and_feedback"])
.render(trace=trace)
)
if len(trace.hist) > 0
else "No previous hypothesis and feedback available since it's the first round."
)
context_dict = {
"hypothesis_and_feedback": hypothesis_feedback,
"RAG": "",
"hypothesis_and_feedback": hypothesis_and_feedback,
"RAG": None,
"hypothesis_output_format": prompt_dict["hypothesis_output_format"],
"hypothesis_specification": prompt_dict["model_hypothesis_specification"],
}
Expand All @@ -67,9 +71,13 @@ def prepare_context(self, hypothesis: Hypothesis, trace: Trace) -> Tuple[dict, b
experiment_output_format = prompt_dict["model_experiment_output_format"]

hypothesis_and_feedback = (
Environment(undefined=StrictUndefined)
.from_string(prompt_dict["hypothesis_and_feedback"])
.render(trace=trace)
(
Environment(undefined=StrictUndefined)
.from_string(prompt_dict["hypothesis_and_feedback"])
.render(trace=trace)
)
if len(trace.hist) > 0
else "No previous hypothesis and feedback available since it's the first round."
)

experiment_list: List[ModelExperiment] = [t[1] for t in trace.hist]
Expand All @@ -84,7 +92,7 @@ def prepare_context(self, hypothesis: Hypothesis, trace: Trace) -> Tuple[dict, b
"hypothesis_and_feedback": hypothesis_and_feedback,
"experiment_output_format": experiment_output_format,
"target_list": model_list,
"RAG": ...,
"RAG": None,
}, True

def convert_response(self, response: str, trace: Trace) -> ModelExperiment:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,8 @@

import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.metrics import accuracy_score, matthews_corrcoef
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.preprocessing import LabelEncoder

from rdagent.scenarios.kaggle.experiment.meta_tpl.fea_share_preprocess import preprocess

Expand Down
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
import os

import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
Expand Down Expand Up @@ -82,6 +84,15 @@ def preprocess_script():
"""
This method applies the preprocessing steps to the training, validation, and test datasets.
"""
if os.path.exists("X_train.pkl"):
X_train = pd.read_pickle("X_train.pkl")
X_valid = pd.read_pickle("X_valid.pkl")
y_train = pd.read_pickle("y_train.pkl")
y_valid = pd.read_pickle("y_valid.pkl")
X_test = pd.read_pickle("X_test.pkl")
passenger_ids = pd.read_pickle("passenger_ids.pkl")

return X_train, X_valid, y_train, y_valid, X_test, passenger_ids
X_train, X_valid, y_train, y_valid = prepreprocess()

# Fit the preprocessor on the training data
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ def fit(X_train: pd.DataFrame, y_train: pd.Series, X_valid: pd.DataFrame, y_vali
Define and train the Random Forest model. Merge feature selection into the pipeline.
"""
# Initialize the Random Forest model
model = RandomForestClassifier(n_estimators=100, random_state=32)
model = RandomForestClassifier(n_estimators=100, random_state=32, n_jobs=-1)

# Select features (if any feature selection is needed)
X_train_selected = select(X_train)
Expand Down
15 changes: 9 additions & 6 deletions rdagent/scenarios/kaggle/experiment/meta_tpl/model/model_xgb.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,21 +6,23 @@
import xgboost as xgb


def select(X):
"""
Select relevant features. To be used in fit & predict function
"""
def select(X: pd.DataFrame) -> pd.DataFrame:
# Ignore feature selection logic
return X


def fit(X_train: pd.DataFrame, y_train: pd.DataFrame, X_valid: pd.DataFrame, y_valid: pd.DataFrame):
"""Define and train the model. Merge feature_select"""
X_train = select(X_train)
X_valid = select(X_valid)
dtrain = xgb.DMatrix(X_train, label=y_train)
dvalid = xgb.DMatrix(X_valid, label=y_valid)

# TODO: for quick running....
params = {}
num_round = 50
# XGBoost's thread-count parameter is spelled "nthread"; with a misspelled key
# the setting is not applied to the booster.
params = {
    "nthread": -1,
}
num_round = 200

evallist = [(dtrain, "train"), (dvalid, "eval")]
bst = xgb.train(params, dtrain, num_round, evallist)
Expand All @@ -32,6 +34,7 @@ def predict(model, X):
"""
Keep feature select's consistency.
"""
X = select(X)
dtest = xgb.DMatrix(X)
y_pred_prob = model.predict(dtest)
return y_pred_prob
9 changes: 5 additions & 4 deletions rdagent/scenarios/kaggle/experiment/prompts.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ kg_description_template:
"Target Description": "A description of the target variable to be predicted",
"Competition Features": "A dict of relevant features used in the competition and their descriptions (if available)", # if you are not sure about the meaning of the feature, please add a (guess) before the description. Importantly, your feature name should be exactly the same as the feature name in the dataset!
}
Since these might be very similar column names in data like one_hot_encoded columns, you can use some regex to group them together.


user: |-
Expand Down Expand Up @@ -144,7 +145,7 @@ kg_model_interface: |-
from xgboost import DMatrix


def select(self, X: pd.DataFrame) -> pd.DataFrame: ... # Implement feature selection logic
def select(X: pd.DataFrame) -> pd.DataFrame: ... # Implement feature selection logic


def fit(
Expand Down Expand Up @@ -178,7 +179,7 @@ kg_model_interface: |-
from sklearn.metrics import accuracy_score


def select(self, X: pd.DataFrame) -> pd.DataFrame: ... # Implement feature selection logic
def select(X: pd.DataFrame) -> pd.DataFrame: ... # Implement feature selection logic


def fit(
Expand Down Expand Up @@ -207,7 +208,7 @@ kg_model_interface: |-
from lightgbm import LGBMClassifier, LGBMRegressor


def select(self, X: pd.DataFrame) -> pd.DataFrame: ... # Implement feature selection logic
def select(X: pd.DataFrame) -> pd.DataFrame: ... # Implement feature selection logic


def fit(
Expand Down Expand Up @@ -247,7 +248,7 @@ kg_model_interface: |-
return x


def select(self, X: pd.DataFrame) -> pd.DataFrame: ... # Implement feature selection logic
def select(X: pd.DataFrame) -> pd.DataFrame: ... # Implement feature selection logic


def fit(X_train: pd.DataFrame, y_train: pd.DataFrame, X_valid: pd.DataFrame, y_valid: pd.DataFrame) -> torch.nn.Module:
Expand Down
24 changes: 19 additions & 5 deletions rdagent/scenarios/kaggle/experiment/scenario.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
import io
import json
import pickle
from pathlib import Path

import pandas as pd
Expand Down Expand Up @@ -93,9 +95,12 @@ def background(self) -> str:
def source_data(self) -> str:
data_folder = Path(FACTOR_IMPLEMENT_SETTINGS.data_folder) / self.competition

if (data_folder / "valid.pkl").exists():
X_valid = pd.read_pickle(data_folder / "valid.pkl")
return X_valid.head()
if (data_folder / "X_valid.pkl").exists():
X_valid = pd.read_pickle(data_folder / "X_valid.pkl")
buffer = io.StringIO()
X_valid.info(verbose=True, buf=buffer, show_counts=True)
data_info = buffer.getvalue()
return data_info

preprocess_experiment = KGFactorExperiment([])
(
Expand All @@ -108,8 +113,17 @@ def source_data(self) -> str:
) = preprocess_experiment.experiment_workspace.generate_preprocess_data()

data_folder.mkdir(exist_ok=True, parents=True)
X_valid.to_pickle(data_folder / "valid.pkl")
return X_valid.head()
# Cache every preprocessed split in the competition data folder so it can be loaded
# from disk instead of being regenerated. Each file is opened in a `with` block so
# the handle is flushed and closed as soon as its dump finishes.
for split_name, split_data in (
    ("X_train", X_train),
    ("X_valid", X_valid),
    ("y_train", y_train),
    ("y_valid", y_valid),
    ("X_test", X_test),
    ("passenger_ids", passenger_ids),
):
    with open(data_folder / f"{split_name}.pkl", "wb") as pkl_file:
        pickle.dump(split_data, pkl_file)

buffer = io.StringIO()
X_valid.info(verbose=True, buf=buffer, show_counts=True)
data_info = buffer.getvalue()
return data_info

@property
def output_format(self) -> str:
Expand Down
6 changes: 6 additions & 0 deletions rdagent/scenarios/kaggle/experiment/workspace.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import pandas as pd

from rdagent.app.kaggle.conf import KAGGLE_IMPLEMENT_SETTING
from rdagent.components.coder.factor_coder.config import FACTOR_IMPLEMENT_SETTINGS
from rdagent.core.experiment import FBWorkspace
from rdagent.log import rdagent_logger as logger
from rdagent.utils.env import KGDockerEnv
Expand Down Expand Up @@ -58,6 +59,11 @@ def generate_preprocess_data(

def execute(self, run_env: dict = {}, *args, **kwargs) -> str:
logger.info(f"Running the experiment in {self.workspace_path}")

# link the data to the workspace to speed up the preprocessing
source_data_path = Path(FACTOR_IMPLEMENT_SETTINGS.data_folder) / KAGGLE_IMPLEMENT_SETTING.competition
self.link_all_files_in_folder_to_workspace(source_data_path, self.workspace_path)

kgde = KGDockerEnv(KAGGLE_IMPLEMENT_SETTING.competition)
kgde.prepare()

Expand Down
Loading