Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: revert model and make SOTA model available to COSTEER #351

Merged
merged 4 commits into from
Sep 26, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion rdagent/components/coder/factor_coder/CoSTEER/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,7 +88,7 @@ def develop(self, exp: FactorExperiment) -> FactorExperiment:
self.rag = FactorGraphRAGStrategy(factor_knowledge_base)

# init intermediate items
factor_experiment = FactorEvolvingItem(sub_tasks=exp.sub_tasks)
factor_experiment = FactorEvolvingItem.from_experiment(exp)

self.evolve_agent = FactorRAGEvoAgent(
max_loop=self.max_loop,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -28,3 +28,10 @@ def __init__(
)
else:
self.sub_gt_implementations = sub_gt_implementations

@classmethod
def from_experiment(cls, exp: FactorExperiment) -> "FactorEvolvingItem":
    """Build an evolving item from an existing factor experiment.

    A new instance is created from ``exp.sub_tasks``; ``based_experiments``
    and ``experiment_workspace`` are then taken over from ``exp`` by
    reference (no copy is made), so the evolving item and ``exp`` share
    those objects.

    Args:
        exp: The experiment whose sub tasks, based experiments and
            workspace should be carried into the evolving item.

    Returns:
        The newly created evolving item (an instance of ``cls``).
    """
    ei = cls(sub_tasks=exp.sub_tasks)
    ei.based_experiments = exp.based_experiments
    ei.experiment_workspace = exp.experiment_workspace
    return ei
2 changes: 1 addition & 1 deletion rdagent/components/coder/model_coder/CoSTEER/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@ def develop(self, exp: ModelExperiment) -> ModelExperiment:
self.rag = ModelRAGStrategy(model_knowledge_base)

# init intermediate items
model_experiment = ModelEvolvingItem(sub_tasks=exp.sub_tasks)
model_experiment = ModelEvolvingItem.from_experiment(exp)

self.evolve_agent = ModelRAGEvoAgent(
max_loop=self.max_loop,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -27,3 +27,10 @@ def __init__(
)
else:
self.sub_gt_implementations = sub_gt_implementations

@classmethod
def from_experiment(cls, exp: ModelExperiment) -> "ModelEvolvingItem":
    """Create an evolving item that mirrors the given model experiment.

    The instance is constructed from ``exp.sub_tasks`` and then receives
    ``exp``'s ``based_experiments`` and ``experiment_workspace`` by
    reference (nothing is copied).
    """
    evolving_item = cls(sub_tasks=exp.sub_tasks)
    # Carry over the state that the constructor call above does not receive.
    for attr_name in ("based_experiments", "experiment_workspace"):
        setattr(evolving_item, attr_name, getattr(exp, attr_name))
    return evolving_item
41 changes: 24 additions & 17 deletions rdagent/components/coder/model_coder/CoSTEER/evolving_strategy.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,23 +30,31 @@ def implement_one_model(
self,
target_task: ModelTask,
queried_knowledge: ModelQueriedKnowledge = None,
exp: ModelExperiment = None, # Add this parameter
current_exp: ModelExperiment = None, # Add this parameter
) -> str:
model_information_str = target_task.get_task_information()
model_type = target_task.model_type

# Get the current code from the experiment using build_from_SOTA
current_code = ""
if exp is not None:
self.build_from_SOTA(exp)
model_file_mapping = {
"XGBoost": "model_xgb.py",
"RandomForest": "model_rf.py",
"LightGBM": "model_lgb.py",
"NN": "model_nn.py",
}
if model_type in model_file_mapping:
current_code = exp.experiment_workspace.code_dict.get(model_file_mapping[model_type], "")
if len(current_exp.based_experiments) == 0:
current_code = None
else:
current_code = ""
sota_exp_code_dict = current_exp.based_experiments[-1].experiment_workspace.code_dict
if target_task.version == 2:
model_file_mapping = {
"XGBoost": "model/model_xgboost.py",
"RandomForest": "model/model_randomforest.py",
"LightGBM": "model/model_lightgbm.py",
"NN": "model/model_nn.py",
}
if model_type in model_file_mapping:
current_code = sota_exp_code_dict.get(model_file_mapping[model_type], None)
elif "model.py" in sota_exp_code_dict:
current_code = sota_exp_code_dict["model.py"]
else:
current_code = None
elif target_task.version == 1:
current_code = sota_exp_code_dict.get("model.py", None)

if queried_knowledge is not None and model_information_str in queried_knowledge.success_task_to_knowledge_dict:
return queried_knowledge.success_task_to_knowledge_dict[model_information_str].implementation
Expand Down Expand Up @@ -74,7 +82,7 @@ def implement_one_model(
.render(
scenario=self.scen.get_scenario_all_desc(),
queried_former_failed_knowledge=queried_former_failed_knowledge_to_render,
current_code=current_code, # Add this line
current_code=current_code,
)
)

Expand All @@ -87,7 +95,6 @@ def implement_one_model(
)
.render(
model_information_str=model_information_str,
model_type=model_type, # Add model type to the prompt
queried_similar_successful_knowledge=queried_similar_successful_knowledge_to_render,
queried_former_failed_knowledge=queried_former_failed_knowledge_to_render,
)
Expand Down Expand Up @@ -124,7 +131,7 @@ def evolve(
queried_knowledge: ModelQueriedKnowledge | None = None,
**kwargs,
) -> ModelEvolvingItem:
# 1. Find the models that need to be evolved
# 1. Find the models that need to be evolved
to_be_finished_task_index = []
for index, target_model_task in enumerate(evo.sub_tasks):
target_model_task_desc = target_model_task.get_task_information()
Expand All @@ -140,7 +147,7 @@ def evolve(

result = multiprocessing_wrapper(
[
(self.implement_one_model, (evo.sub_tasks[target_index], queried_knowledge))
(self.implement_one_model, (evo.sub_tasks[target_index], queried_knowledge, evo))
for target_index in to_be_finished_task_index
],
n=RD_AGENT_SETTINGS.multi_proc_n,
Expand Down
9 changes: 7 additions & 2 deletions rdagent/components/coder/model_coder/prompts.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -52,9 +52,14 @@ evolving_strategy_model_coder:

You must write your code based on your latest former attempt below, which consists of your former code and code feedback. Read the former attempt carefully and do not modify the parts of your former code that are already correct.

{% if current_code %}
--------------Current code in the workspace:--------------- You need to tune the model based on this! If it is not None, do not write from scratch.
{% if current_code is not none %}
The user has written some code before. You should write the new code based on this code. Here is the latest code:
```python
{{ current_code }}
```
Your code should be very similar to the former code: at least ninety percent of your code should be the same as the former code! Do not modify the parts of the code that are already correct.
{% else %}
The user has not written any code before. You should write the new code from scratch.
{% endif %}

{% if queried_former_failed_knowledge|length != 0 %}
Expand Down
13 changes: 10 additions & 3 deletions rdagent/core/experiment.py
Original file line number Diff line number Diff line change
Expand Up @@ -184,15 +184,22 @@ def execute(self) -> object | None:
ASpecificWSForSubTasks = TypeVar("ASpecificWSForSubTasks", bound=Workspace)


class Experiment(ABC, Generic[ASpecificTask, ASpecificWSForExperiment, ASpecificWSForSubTasks]):
class Experiment(
ABC,
Generic[ASpecificTask, ASpecificWSForExperiment, ASpecificWSForSubTasks],
):
"""
The experiment is a sequence of tasks and the implementations of those tasks after they have been generated by the Developer.
"""

def __init__(
    self,
    sub_tasks: Sequence[ASpecificTask],
    based_experiments: Sequence[ASpecificWSForExperiment] | None = None,
) -> None:
    """Initialise an experiment from its sub tasks.

    Args:
        sub_tasks: The tasks that make up this experiment. One workspace
            slot (initially ``None``) is reserved per task.
        based_experiments: Optional sequence stored as
            ``self.based_experiments``. When omitted, every instance gets
            its own new empty list. A passed-in sequence is stored by
            reference, not copied.
    """
    self.sub_tasks = sub_tasks
    self.sub_workspace_list: list[ASpecificWSForSubTasks | None] = [None] * len(self.sub_tasks)
    # Use a ``None`` sentinel instead of a ``[]`` default: a literal list
    # default is created once and would be shared (and mutated) across every
    # instance that did not pass ``based_experiments`` explicitly.
    self.based_experiments: Sequence[ASpecificWSForExperiment] = (
        [] if based_experiments is None else based_experiments
    )
    self.result: object = None  # The result of the experiment, can be different types in different scenarios.
    self.experiment_workspace: ASpecificWSForExperiment | None = None

Expand Down
11 changes: 0 additions & 11 deletions rdagent/scenarios/kaggle/developer/runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,14 +23,6 @@


class KGCachedRunner(CachedRunner[ASpecificExp]):
def build_from_SOTA(self, exp: ASpecificExp) -> None:
    """Seed ``exp``'s workspace from the last entry of ``exp.based_experiments``.

    Does nothing when ``exp.based_experiments`` is empty. Otherwise the last
    based experiment's ``code_dict`` is injected into ``exp``'s workspace,
    its ``data_description`` is assigned by reference, and its
    ``model_description`` is assigned as a ``.copy()``.
    """
    if len(exp.based_experiments) == 0:
        return

    sota_workspace = exp.based_experiments[-1].experiment_workspace
    target_workspace = exp.experiment_workspace

    target_workspace.inject_code(**sota_workspace.code_dict)
    target_workspace.data_description = sota_workspace.data_description
    target_workspace.model_description = sota_workspace.model_description.copy()

def get_cache_key(self, exp: ASpecificExp) -> str:
codes = []
for f in sorted((exp.experiment_workspace.workspace_path / "feature").glob("*.py"), key=lambda x: x.name):
Expand All @@ -44,7 +36,6 @@ def init_develop(self, exp: KGFactorExperiment | KGModelExperiment) -> KGFactorE
"""
For the initial development, the experiment serves as a benchmark for feature engineering.
"""
self.build_from_SOTA(exp)
if RUNNER_SETTINGS.cache_result:
cache_hit, result = self.get_cache_result(exp)
if cache_hit:
Expand Down Expand Up @@ -94,7 +85,6 @@ class KGModelRunner(KGCachedRunner[KGModelExperiment]):
def develop(self, exp: KGModelExperiment) -> KGModelExperiment:
if exp.based_experiments and exp.based_experiments[-1].result is None:
exp.based_experiments[-1] = self.init_develop(exp.based_experiments[-1])
self.build_from_SOTA(exp)

sub_ws = exp.sub_workspace_list[0]
# TODO: There's a possibility of generating a hybrid model (lightgbm + xgboost), which results in having two items in the model_type list.
Expand Down Expand Up @@ -175,7 +165,6 @@ def extract_model_task_from_code(self, code: str) -> str:
def develop(self, exp: KGFactorExperiment) -> KGFactorExperiment:
if exp.based_experiments and exp.based_experiments[-1].result is None:
exp.based_experiments[-1] = self.init_develop(exp.based_experiments[-1])
self.build_from_SOTA(exp)
current_feature_file_count = len(list(exp.experiment_workspace.workspace_path.glob("feature/feature*.py")))
implemented_factor_count = 0
for sub_ws in exp.sub_workspace_list:
Expand Down
17 changes: 17 additions & 0 deletions rdagent/scenarios/kaggle/experiment/kaggle_experiment.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
from copy import deepcopy
from pathlib import Path

from rdagent.app.kaggle.conf import KAGGLE_IMPLEMENT_SETTING
Expand All @@ -20,6 +21,14 @@ def __init__(self, *args, **kwargs) -> None:
self.experiment_workspace = KGFBWorkspace(
template_folder_path=Path(__file__).parent / f"{KAGGLE_IMPLEMENT_SETTING.competition}_template"
)
if len(self.based_experiments) > 0:
self.experiment_workspace.inject_code(**self.based_experiments[-1].experiment_workspace.code_dict)
self.experiment_workspace.data_description = deepcopy(
self.based_experiments[-1].experiment_workspace.data_description
)
self.experiment_workspace.model_description = deepcopy(
self.based_experiments[-1].experiment_workspace.model_description
)


class KGFactorExperiment(FeatureExperiment[FactorTask, KGFBWorkspace, FactorFBWorkspace]):
Expand All @@ -28,3 +37,11 @@ def __init__(self, *args, **kwargs) -> None:
self.experiment_workspace = KGFBWorkspace(
template_folder_path=Path(__file__).parent / f"{KAGGLE_IMPLEMENT_SETTING.competition}_template"
)
if len(self.based_experiments) > 0:
self.experiment_workspace.inject_code(**self.based_experiments[-1].experiment_workspace.code_dict)
self.experiment_workspace.data_description = deepcopy(
self.based_experiments[-1].experiment_workspace.data_description
)
self.experiment_workspace.model_description = deepcopy(
self.based_experiments[-1].experiment_workspace.model_description
)
2 changes: 1 addition & 1 deletion rdagent/scenarios/kaggle/experiment/prompts.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,7 @@ kg_background: |-

For each loop, you need to help user decide which action item to choose and provide the corresponding code to implement the action item.

Most importantly, the output format & submission requirements are listed here: {submission_specifications}
So far, we only focus on Model tuning! So far, we only focus on Model tuning! So far, we only focus on Model tuning!

kg_feature_interface: |-
Your code should contain several parts:
Expand Down
1 change: 0 additions & 1 deletion rdagent/scenarios/kaggle/knowledge_management/graph.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,6 @@ def __init__(self, path: str | Path | None, scenario: KGScenario) -> None:
documents.append(f.read())
self.load_from_documents(documents=documents, scenario=scenario)
self.dump()
tmp = 1

def analyze_one_document(self, document_content: str, scenario: KGScenario) -> list:
session_system_prompt = (
Expand Down
16 changes: 9 additions & 7 deletions rdagent/scenarios/kaggle/proposal/proposal.py
Original file line number Diff line number Diff line change
Expand Up @@ -239,9 +239,9 @@ def prepare_context(self, trace: Trace) -> Tuple[dict, bool]:
"hypothesis_output_format": Environment(undefined=StrictUndefined)
.from_string(prompt_dict["hypothesis_output_format"])
.render(if_using_feature_selection=KAGGLE_IMPLEMENT_SETTING.if_using_feature_selection),
"hypothesis_specification": f"next experiment action is {action}"
if self.scen.if_action_choosing_based_on_UCB
else None,
"hypothesis_specification": (
f"next experiment action is {action}" if self.scen.if_action_choosing_based_on_UCB else None
),
}
return context_dict, True

Expand Down Expand Up @@ -314,8 +314,9 @@ def convert_feature_experiment(self, response: str, trace: Trace) -> KGFactorExp
)
)

exp = KGFactorExperiment(tasks)
exp.based_experiments = [KGFactorExperiment(sub_tasks=[])] + [t[1] for t in trace.hist if t[2]]
exp = KGFactorExperiment(
sub_tasks=tasks, based_experiments=([KGFactorExperiment(sub_tasks=[])] + [t[1] for t in trace.hist if t[2]])
)
return exp

def convert_model_experiment(self, response: str, trace: Trace) -> KGModelExperiment:
Expand All @@ -331,8 +332,9 @@ def convert_model_experiment(self, response: str, trace: Trace) -> KGModelExperi
version=2,
)
)
exp = KGModelExperiment(tasks)
exp.based_experiments = [KGModelExperiment(sub_tasks=[])] + [t[1] for t in trace.hist if t[2]]
exp = KGModelExperiment(
sub_tasks=tasks, based_experiments=([KGModelExperiment(sub_tasks=[])] + [t[1] for t in trace.hist if t[2]])
)
return exp

def convert_response(self, response: str, trace: Trace) -> ModelExperiment:
Expand Down