Skip to content

Commit

Permalink
feat: Factor Implement Search Enhancement (#294)
Browse files Browse the repository at this point in the history
* Search enhancement

* refactor: reorganize imports for consistency with isort

* reformatted by black

---------

Co-authored-by: Tim <[email protected]>
  • Loading branch information
taozhiwang and qew21 authored Sep 23, 2024
1 parent 32cc902 commit 4ecf25f
Show file tree
Hide file tree
Showing 5 changed files with 132 additions and 2 deletions.
7 changes: 7 additions & 0 deletions rdagent/components/coder/factor_coder/CoSTEER/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import json
import pickle
from pathlib import Path

Expand Down Expand Up @@ -49,6 +50,11 @@ def __init__(
if FACTOR_IMPLEMENT_SETTINGS.new_knowledge_base_path is not None
else None
)
self.data_tables_knowledge_path = (
Path(FACTOR_IMPLEMENT_SETTINGS.data_tables_knowledge_path)
if FACTOR_IMPLEMENT_SETTINGS.data_tables_knowledge_path is not None
else None
)
self.with_knowledge = with_knowledge
self.with_feedback = with_feedback
self.knowledge_self_gen = knowledge_self_gen
Expand All @@ -72,6 +78,7 @@ def load_or_init_knowledge_base(self, former_knowledge_base_path: Path = None, c
factor_knowledge_base = (
FactorGraphKnowledgeBase(
init_component_list=component_init_list,
data_set_knowledge_path=self.data_tables_knowledge_path,
)
if self.evolving_version == 2
else FactorKnowledgeBaseV1()
Expand Down
50 changes: 50 additions & 0 deletions rdagent/components/coder/factor_coder/CoSTEER/evolving_strategy.py
Original file line number Diff line number Diff line change
Expand Up @@ -183,6 +183,19 @@ def __init__(self, *args, **kwargs) -> None:
self.num_loop = 0
self.haveSelected = False

def _query_data_tables(self, user_prompt, session, max_attempts=10):
    """Ask the chat session for the relevant data tables and decode its JSON reply.

    The same ``user_prompt`` is sent up to ``max_attempts`` times; the first
    response that parses as JSON is returned. The prompt is not modified
    between attempts.

    Args:
        user_prompt: Prompt text forwarded to ``session.build_chat_completion``.
        session: Object exposing
            ``build_chat_completion(user_prompt=..., json_mode=...)``.
        max_attempts: Maximum number of completion requests. Defaults to 10,
            the previously hard-coded limit.

    Returns:
        The decoded JSON value of the first parseable response, or an empty
        dict when no attempt produced valid JSON.
    """
    for _ in range(max_attempts):
        response = session.build_chat_completion(
            user_prompt=user_prompt,
            json_mode=True,
        )
        try:
            return json.loads(response)
        except json.JSONDecodeError:
            # Malformed reply: ask again with the unchanged prompt.
            continue
    # Return an empty mapping instead of None so callers that iterate over
    # the result (``for key in result``) do not fail with a TypeError.
    return {}

def implement_one_factor(
self,
target_task: FactorTask,
Expand Down Expand Up @@ -218,6 +231,42 @@ def implement_one_factor(
queried_knowledge.former_traces[target_factor_task_information] if queried_knowledge is not None else []
)

queried_data_tables = (
queried_knowledge.data_set_knowledge_dict[target_factor_task_information]
if queried_knowledge is not None
else []
)
queried_data_tables_str = json.dumps(queried_data_tables, indent=2)

system_prompt = (
Environment(undefined=StrictUndefined)
.from_string(
implement_prompts["evolving_strategy_search_data_table_system_prompt"],
)
.render()
)

user_prompt = (
Environment(undefined=StrictUndefined)
.from_string(
implement_prompts["evolving_strategy_search_data_table"],
)
.render(
scenario=self.scen.get_scenario_all_desc(),
factor_information_str=target_factor_task_information,
data_tables=queried_data_tables_str,
)
)
session = APIBackend(use_chat_cache=FACTOR_IMPLEMENT_SETTINGS.coder_use_cache).build_chat_session(
session_system_prompt=system_prompt,
)

useful_data_table = self._query_data_tables(user_prompt, session)
selected_knowledge_dict = {}
for key in useful_data_table:
if key in queried_knowledge.data_set_knowledge_dict:
selected_knowledge_dict[key] = queried_knowledge.data_set_knowledge_dict[key]

queried_former_failed_knowledge_to_render = queried_former_failed_knowledge

system_prompt = (
Expand All @@ -228,6 +277,7 @@ def implement_one_factor(
.render(
scenario=self.scen.get_scenario_all_desc(),
queried_former_failed_knowledge=queried_former_failed_knowledge_to_render,
selected_knowledge_dict=selected_knowledge_dict,
)
)

Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from __future__ import annotations

import copy
import heapq
import json
import random
import re
Expand Down Expand Up @@ -204,11 +205,13 @@ def __init__(
former_traces: dict = {},
component_with_success_task: dict = {},
error_with_success_task: dict = {},
data_set_knowledge_dict: dict = {},
**kwargs,
) -> None:
self.former_traces = former_traces
self.component_with_success_task = component_with_success_task
self.error_with_success_task = error_with_success_task
self.data_set_knowledge_dict = data_set_knowledge_dict
super().__init__(**kwargs)


Expand Down Expand Up @@ -308,6 +311,10 @@ def query(self, evo: EvolvableSubjects, evolving_trace: list[EvoStep]) -> Querie
FACTOR_IMPLEMENT_SETTINGS.v2_query_error_limit,
knowledge_sampler=conf_knowledge_sampler,
)
factor_implementation_queried_graph_knowledge = self.dataset_query(
evo,
factor_implementation_queried_graph_knowledge,
)
return factor_implementation_queried_graph_knowledge

def analyze_component(
Expand Down Expand Up @@ -710,9 +717,37 @@ def error_query(

return factor_implementation_queried_graph_knowledge

def dataset_query(
    self,
    evo: EvolvableSubjects,
    factor_implementation_queried_graph_knowledge: FactorQueriedGraphKnowledge,
) -> QueriedKnowledge | None:
    """Attach the data tables most similar to each sub-task to the queried knowledge.

    Every entry of ``self.knowledgebase.data_set_knowledge_dict`` is rendered
    as ``"<key>: <json value>"`` and compared with each task's information
    string via ``calculate_embedding_distance_between_str_list``. The (at
    most) 10 entries with the largest similarity are stored under the task's
    information string in
    ``factor_implementation_queried_graph_knowledge.data_set_knowledge_dict``.

    Args:
        evo: Subject whose ``sub_tasks`` are iterated; each task must provide
            ``get_task_information()``.
        factor_implementation_queried_graph_knowledge: Queried-knowledge
            object that is updated in place.

    Returns:
        The same ``factor_implementation_queried_graph_knowledge`` object.
    """
    # These depend only on the knowledge base, so build them once instead of
    # once per task (and once per selected index for the key list).
    knowledge_dict = self.knowledgebase.data_set_knowledge_dict
    table_names = list(knowledge_dict)
    table_explanations = [f"{key}: {json.dumps(value)}" for key, value in knowledge_dict.items()]

    for target_factor_task in evo.sub_tasks:
        target_factor_task_information = target_factor_task.get_task_information()
        related_info = {}

        # With no data-table knowledge there is nothing to rank; skip the
        # embedding call but still record an (empty) entry for the task so
        # later lookups by task information do not raise KeyError.
        if table_explanations:
            similarity = calculate_embedding_distance_between_str_list(
                [target_factor_task_information],
                table_explanations,
            )[0]
            top_related_indexes = heapq.nlargest(
                10,
                range(len(similarity)),
                key=lambda i: similarity[i],
            )
            for index in top_related_indexes:
                key = table_names[index]
                related_info[key] = knowledge_dict[key]

        factor_implementation_queried_graph_knowledge.data_set_knowledge_dict[
            target_factor_task_information
        ] = related_info

    return factor_implementation_queried_graph_knowledge


class FactorGraphKnowledgeBase(KnowledgeBase):
def __init__(self, init_component_list=None) -> None:
def __init__(self, init_component_list=None, data_set_knowledge_path=None) -> None:
"""
Load knowledge, offer brief information of knowledge and common handle interfaces
"""
Expand Down Expand Up @@ -740,6 +775,12 @@ def __init__(self, init_component_list=None) -> None:
# store the task description to component nodes
self.task_to_component_nodes = {}

# data set: data set information
self.data_set_knowledge_dict = {}
if data_set_knowledge_path:
with open(data_set_knowledge_path, "r") as f:
self.data_set_knowledge_dict = json.load(f)

def get_all_nodes_by_label(self, label: str) -> list[UndirectedNode]:
return self.graph.get_all_nodes_by_label(label)

Expand Down
3 changes: 3 additions & 0 deletions rdagent/components/coder/factor_coder/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,9 @@ class Config:
knowledge_base_path: Union[str, None] = None
"""Path to the knowledge base"""

data_tables_knowledge_path: Union[str, None] = None
"""Path to the data tables knowledge"""

new_knowledge_base_path: Union[str, None] = None
"""Path to the new knowledge base"""

Expand Down
31 changes: 30 additions & 1 deletion rdagent/components/coder/factor_coder/prompts.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,8 @@ evolving_strategy_factor_implementation_v1_system: |-
1. The user might provide you with the correct code for similar factors. You should learn from this code to write the correct code.
2. The user might provide you with the former failed code and the corresponding feedback on that code. The feedback covers the execution, the code and the factor value. You should analyze the feedback and try to correct the latest code.
3. The user might provide you with suggestions for the latest failed code and some similar fail-to-correct pairs. Each pair contains the failed code with a similar error and the corresponding corrected version of the code. You should learn from these suggestions to write the correct code.
4. The user might provide you with some data table names and their explanations, as well as their corresponding data fields. You should find the data fields that contain the data needed for the factor implementation.
You must write your code based on your latest former attempt below, which consists of your former code and code feedback. You should read the former attempt carefully and must not modify the correct part of your former code.
{% if queried_former_failed_knowledge|length != 0 %}
Expand All @@ -57,6 +58,13 @@ evolving_strategy_factor_implementation_v1_system: |-
{{ queried_former_failed_knowledge[-1].feedback }}
{% endif %}
{% if selected_knowledge_dict|length != 0 %}
--------------Some datasets and their fields you may need:---------------
{% for selected_knowledge in selected_knowledge_dict.keys() %}
{{ selected_knowledge }}: {{ selected_knowledge_dict[selected_knowledge] }}
{% endfor %}
{% endif %}
Please respond with the code in the following JSON format. Here is an example structure for the JSON output:
{
"code": "The Python code as a string."
Expand Down Expand Up @@ -119,6 +127,27 @@ evolving_strategy_factor_implementation_v2_user: |-
{% endfor %}
{% endif %}
evolving_strategy_search_data_table_system_prompt: |-
The user will provide you with a task and some data. You need to determine which data tables are needed by the user.
Respond with your analysis in JSON format. The JSON schema should include:
{
"name of data table 1": ["name of data field 1", "name of data field 2"],
"name of data table 2": ["name of data field 1"]
}
evolving_strategy_search_data_table: |-
User is trying to implement some factors in the following scenario:
{{ scenario }}
User is doing the following task:
{{factor_information_str}}
Here are some data table names and their explanations.
You should find the data tables and data columns that contain the data needed for the factor implementation.
Please respond with the data table names in JSON format.
-------------------Data Tables-------------------
{{ data_tables }}
evolving_strategy_error_summary_v2_system: |-
User is trying to implement some factors in the following scenario:
{{ scenario }}
Expand Down

0 comments on commit 4ecf25f

Please sign in to comment.