microsoft · WinstonLiyt · Jul 24, 2024 · Jul 18, 2024 · Jul 18, 2024 · Jul 19, 2024
diff --git a/rdagent/app/qlib_rd_loop/factor.py b/rdagent/app/qlib_rd_loop/factor.py
@@ -36,7 +36,7 @@
 trace = Trace(scen=scen)
 for _ in range(PROP_SETTING.evolving_n):
     try:
-        with logger.tag("r"):  # research
+        with logger.tag("r"):
             hypothesis = hypothesis_gen.gen(trace)
             logger.log_object(hypothesis, tag="hypothesis generation")
 
@@ -49,6 +49,9 @@
 
         with logger.tag("ef"):
             exp = qlib_factor_runner.develop(exp)
+            if exp is None:
+                logger.error(f"Factor extraction failed.")
+                continue
             logger.log_object(exp, tag="factor runner result")
             feedback = qlib_factor_summarizer.generate_feedback(exp, hypothesis, trace)
             logger.log_object(feedback, tag="feedback")

diff --git a/rdagent/app/qlib_rd_loop/factor_from_report_sh.py b/rdagent/app/qlib_rd_loop/factor_from_report_sh.py
@@ -7,7 +7,7 @@
 import pandas as pd
 
 from rdagent.app.qlib_rd_loop.conf import PROP_SETTING
-from rdagent.components.document_reader.document_reader import load_and_process_pdfs_by_langchain
+from rdagent.components.document_reader.document_reader import extract_first_page_screenshot_from_pdf, load_and_process_pdfs_by_langchain
 from rdagent.core.prompts import Prompts
 from rdagent.core.scenario import Scenario
 from rdagent.core.utils import import_class
@@ -88,7 +88,11 @@ def extract_factors_and_implement(report_file_path: str) -> tuple:
             exp = FactorExperimentLoaderFromPDFfiles().load(report_file_path)
             if exp is None or exp.sub_tasks == []:
                 return None, None
-
+
+        with logger.tag("load_pdf_screenshot"):
+            pdf_screenshot = extract_first_page_screenshot_from_pdf(report_file_path)
+            logger.log_object(pdf_screenshot, tag="load_pdf_screenshot")
+
     docs_dict = load_and_process_pdfs_by_langchain(Path(report_file_path))
 
     factor_result = {
@@ -118,19 +122,30 @@ def extract_factors_and_implement(report_file_path: str) -> tuple:
             report_file_path = Path(file_path.replace(PROP_SETTING.origin_report_path, PROP_SETTING.local_report_path))
             if report_file_path.exists():
                 logger.info(f"Processing {report_file_path}")
-                exp, hypothesis = extract_factors_and_implement(str(report_file_path))
-                if exp is None:
-                    continue
-                exp.based_experiments = [t[1] for t in trace.hist if t[2]]
-                if len(exp.based_experiments) == 0:
-                    exp.based_experiments.append(QlibFactorExperiment(sub_tasks=[]))
-                exp = qlib_factor_coder.develop(exp)
-                exp = qlib_factor_runner.develop(exp)
-                if exp is None:
-                    logger.error(f"Factor extraction failed for {report_file_path}. Skipping to the next report.")
-                    continue
-                feedback = qlib_factor_summarizer.generate_feedback(exp, hypothesis, trace)
-
+
+                with logger.tag("r"):
+                    exp, hypothesis = extract_factors_and_implement(str(report_file_path))
+                    if exp is None:
+                        continue
+                    exp.based_experiments = [t[1] for t in trace.hist if t[2]]
+                    if len(exp.based_experiments) == 0:
+                        exp.based_experiments.append(QlibFactorExperiment(sub_tasks=[]))
+                    logger.log_object(hypothesis, tag="hypothesis generation")
+                    logger.log_object(exp.sub_tasks, tag="experiment generation")
+
+                with logger.tag("d"):
+                    exp = qlib_factor_coder.develop(exp)
+                    logger.log_object(exp.sub_workspace_list)
+
+                with logger.tag("ef"):
+                    exp = qlib_factor_runner.develop(exp)
+                    if exp is None:
+                        logger.error(f"Factor extraction failed for {report_file_path}. Skipping to the next report.")
+                        continue
+                    logger.log_object(exp, tag="factor runner result")
+                    feedback = qlib_factor_summarizer.generate_feedback(exp, hypothesis, trace)
+                    logger.log_object(feedback, tag="feedback")
+
                 trace.hist.append((hypothesis, exp, feedback))
                 logger.info(f"Processed {report_file_path}: Result: {exp}")
 

diff --git a/rdagent/components/coder/factor_coder/CoSTEER/evaluators.py b/rdagent/components/coder/factor_coder/CoSTEER/evaluators.py
@@ -437,21 +437,36 @@ def evaluate(
             else:
                 break
 
-        final_evaluation_dict = json.loads(
-            APIBackend().build_messages_and_create_chat_completion(
-                user_prompt=user_prompt,
-                system_prompt=system_prompt,
-                json_mode=True,
-            ),
-        )
-        if isinstance(final_evaluation_dict["final_decision"], str) and final_evaluation_dict[
-            "final_decision"
-        ].lower() in ("true", "false"):
-            final_evaluation_dict["final_decision"] = bool(final_evaluation_dict["final_decision"])
-        return (
-            final_evaluation_dict["final_decision"],
-            final_evaluation_dict["final_feedback"],
-        )
+        # TODO:  with retry_context(retry_n=3, except_list=[KeyError]):
+        final_evaluation_dict = None
+        attempts = 0
+        max_attempts = 3
+
+        while attempts < max_attempts:
+            try:
+                final_evaluation_dict = json.loads(
+                    APIBackend().build_messages_and_create_chat_completion(
+                        user_prompt=user_prompt,
+                        system_prompt=system_prompt,
+                        json_mode=True,
+                    ),
+                )
+                final_decision = final_evaluation_dict["final_decision"]
+                final_feedback = final_evaluation_dict["final_feedback"]
+
+                if isinstance(final_decision, str) and final_decision.lower() in ("true", "false"):
+                    final_decision = bool(final_decision)
+
+                return final_decision, final_feedback
+
+            except json.JSONDecodeError as e:
+                raise ValueError("Failed to decode JSON response from API.") from e
+            except KeyError as e:
+                attempts += 1
+                if attempts >= max_attempts:
+                    raise KeyError("Response from API is missing 'final_decision' or 'final_feedback' key after multiple attempts.") from e
+
+        return None, None
 
 
 class FactorSingleFeedback:

diff --git a/rdagent/components/document_reader/document_reader.py b/rdagent/components/document_reader/document_reader.py
@@ -1,6 +1,8 @@
 from __future__ import annotations
 
 from pathlib import Path
+import fitz
+from PIL import Image
 from typing import TYPE_CHECKING
 
 from azure.ai.formrecognizer import DocumentAnalysisClient
@@ -96,3 +98,11 @@ def load_and_process_pdfs_by_azure_document_intelligence(path: Path) -> dict[str
                     RD_AGENT_SETTINGS.azure_document_intelligence_endpoint,
                 )
     return content_dict
+
+def extract_first_page_screenshot_from_pdf(pdf_path: Path) -> Image.Image:
+    """Render page 0 of the PDF at ``pdf_path`` and return it as an RGB PIL image."""
+    # Context manager ensures the document handle is closed once the page is rendered.
+    with fitz.open(pdf_path) as doc:
+        pix = doc.load_page(0).get_pixmap()
+        image = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
+    return image
diff --git a/rdagent/scenarios/qlib/experiment/workspace.py b/rdagent/scenarios/qlib/experiment/workspace.py
@@ -17,12 +17,6 @@ def execute(self, qlib_config_name: str = "conf.yaml", run_env: dict = {}, *args
         qtde = QTDockerEnv()
         qtde.prepare()
 
-        # Run the Docker command
-        execute_log = qtde.run(
-            local_path=str(self.workspace_path),
-            entry="rm -r mlruns",
-            env=run_env,
-        )
         # Run the Qlib backtest
         execute_log = qtde.run(
             local_path=str(self.workspace_path),

diff --git a/rdagent/scenarios/qlib/factor_experiment_loader/pdf_loader.py b/rdagent/scenarios/qlib/factor_experiment_loader/pdf_loader.py
@@ -70,7 +70,7 @@ def classify_report_from_dict(
         if isinstance(value, str):
             content = value
         else:
-            logger.warning(f"输入格式不符合要求: {file_name}")
+            logger.warning(f"Input format does not meet the requirements: {file_name}")
             res_dict[file_name] = {"class": 0}
             continue
 
@@ -102,7 +102,7 @@ def classify_report_from_dict(
                 res = json.loads(res)
                 vote_list.append(int(res["class"]))
             except json.JSONDecodeError:
-                logger.warning(f"返回值无法解析: {file_name}")
+                logger.warning(f"Return value could not be parsed: {file_name}")
                 res_dict[file_name] = {"class": 0}
             count_0 = vote_list.count(0)
             count_1 = vote_list.count(1)
@@ -243,7 +243,7 @@ def extract_factors_from_report_dict(
     )
     for index, file_name in enumerate(file_name_list):
         final_report_factor_dict[file_name] = factor_dict_list[index]
-    logger.info(f"已经完成{len(final_report_factor_dict)}个报告的因子提取")
+    logger.info(f"Factor extraction completed for {len(final_report_factor_dict)} reports")
 
     return final_report_factor_dict
 
@@ -507,13 +507,24 @@ def deduplicate_factors_by_llm(  # noqa: C901, PLR0912
 
 class FactorExperimentLoaderFromPDFfiles(FactorExperimentLoader):
     def load(self, file_or_folder_path: Path) -> dict:
-        docs_dict = load_and_process_pdfs_by_langchain(Path(file_or_folder_path))
+        with logger.tag("docs"):
+            docs_dict = load_and_process_pdfs_by_langchain(Path(file_or_folder_path))
+            logger.log_object(docs_dict, tag="docs dict")
 
         selected_report_dict = classify_report_from_dict(report_dict=docs_dict, vote_time=1)
-        file_to_factor_result = extract_factors_from_report_dict(docs_dict, selected_report_dict)
-        factor_dict = merge_file_to_factor_dict_to_factor_dict(file_to_factor_result)
+
+        with logger.tag("file_to_factor_result"):
+            file_to_factor_result = extract_factors_from_report_dict(docs_dict, selected_report_dict)
+            logger.log_object(file_to_factor_result, tag="file_to_factor_result")
+
+        with logger.tag("factor_dict"):
+            factor_dict = merge_file_to_factor_dict_to_factor_dict(file_to_factor_result)
+            logger.log_object(factor_dict, tag="factor_dict")
+
+        with logger.tag("filtered_factor_dict"):
+            factor_viability, filtered_factor_dict = check_factor_viability(factor_dict)
+            logger.log_object(filtered_factor_dict, tag="filtered_factor_dict")
 
-        factor_viability, filtered_factor_dict = check_factor_viability(factor_dict)
         # factor_dict, duplication_names_list = deduplicate_factors_by_llm(factor_dict, factor_viability)
 
         return FactorExperimentLoaderFromDict().load(filtered_factor_dict)
diff --git a/rdagent/scenarios/qlib/prompts.yaml b/rdagent/scenarios/qlib/prompts.yaml
@@ -57,6 +57,61 @@ factor_hypothesis_specification: |-
     - "Combine value and momentum factors using a weighted average approach."
     - "Filter stocks by market capitalization before calculating the factors."
 
+factor_hypothesis_specification: |-
+  Additional Specifications:
+    - Hypotheses should grow and evolve based on the previous hypothesis. If there is no previous hypothesis, start with something simple.
+    - Gradually build upon previous hypotheses and feedback.
+    - Ensure that the hypothesis focuses on the creation and selection of factors in quantitative finance.
+    - Each hypothesis should address specific factor characteristics such as type (momentum, value, quality), calculation methods, or inclusion criteria.
+    - Avoid hypotheses related to model architecture or optimization processes.
+    - If a hypothesis can be improved further, refine it. If it achieves the desired results, explore a new direction. Previous factors exceeding SOTA (State of the Art) are preserved and combined with new factors for subsequent evaluations.
+
+  Guiding Principles:
+  1. Diversity and Depth:
+    - Ensure a wide range of factor types, incorporating various financial dimensions (e.g., momentum, value, quality, volatility, sentiment).
+    - Explore different calculation methods and inclusion criteria to understand their impact.
+    - Consider combining multiple factors or filtering criteria for more sophisticated hypotheses.
+
+  2. Iterative Improvement:
+    - Build upon previous hypotheses, incorporating feedback and observed results.
+    - Aim for continuous refinement and complexity over iterations, starting from basic factors to more advanced combinations and techniques.
+
+  3. Contextual Relevance:
+    - Tailor hypotheses to the specific financial context and current market conditions.
+    - Leverage domain knowledge and recent financial research to inform hypothesis creation.
+
+  Sample Hypotheses (Use the format for guidance, not the specific content):
+  - "Include a momentum factor based on the last 12 months' returns."
+  - "Add a value factor calculated as the book-to-market ratio."
+  - "Incorporate a quality factor derived from return on equity (ROE)."
+  - "Use a volatility factor based on the standard deviation of returns over the past 6 months."
+  - "Include a sentiment factor derived from news sentiment scores."
+  - "The momentum factor should be calculated using a 6-month look-back period."
+  - "Combine value and momentum factors using a weighted average approach."
+  - "Filter stocks by market capitalization before calculating the factors."
+  - "Explore a liquidity factor based on the trading volume and bid-ask spread."
+  - "Investigate the impact of an earnings surprise factor calculated from recent earnings announcements."
+  - "Develop a composite factor integrating ESG (Environmental, Social, Governance) scores with traditional financial metrics."
+
+  Detailed Workflow:
+  1. Initial Hypothesis:
+    - Begin with a simple factor, such as "Include a momentum factor based on the last 12 months' returns."
+
+  2. Refine Hypothesis:
+    - If the initial hypothesis is promising, refine it further, e.g., "The momentum factor should be calculated using a 6-month look-back period."
+
+  3. Combine Factors:
+    - As individual factors show potential, combine them, e.g., "Combine value and momentum factors using a weighted average approach."
+
+  4. Contextual Adjustments:
+    - Adjust factors based on market conditions or new financial insights, e.g., "Incorporate a quality factor derived from return on equity (ROE)."
+
+  5. Advanced Hypotheses:
+    - Explore sophisticated combinations or new types of factors, e.g., "Develop a composite factor integrating ESG scores with traditional financial metrics."
+
+  Remember: If a hypothesis achieves the desired results, start a new direction while preserving the effective factors from previous hypotheses. New evaluations should combine the newly proposed factors with previously successful factors that surpassed SOTA.
+
+
 factor_experiment_output_format: |-
   The output should follow JSON format. The schema is as follows:
   {
@@ -98,7 +153,7 @@ model_experiment_output_format: |-
 
 factor_feedback_generation:
   system: |-
-    You are a professional result analysis assistant on data driven R&D.
+    You are a professional result analysis assistant in data-driven R&D.
     The task is described in the following scenario:
     {{ scenario }}
     You will receive a hypothesis, multiple tasks with their factors, and some results.
@@ -121,8 +176,7 @@ factor_feedback_generation:
     {{ combined_result }}
     Analyze the combined result in the context of its ability to:
     1. Support or refute the hypothesis.
-    2. Show improvement or deterioration compared to the last experiment.
-    3. Demonstrate positive or negative effects when compared to Alpha158.
+    2. Show improvement or deterioration compared to the SOTA experiment.
 
     Evaluation Metrics Explanations:
     Below are the financial meanings of each metric, which should be used to judge the results:
@@ -136,8 +190,17 @@ factor_feedback_generation:
     - IC: Measures the correlation between predicted returns (\hat{y}) and actual returns (y), using Pearson correlation.
     - 1day.excess_return_with_cost.information_ratio: Evaluates the excess return per unit of risk considering transaction costs.
 
-    When judging the results, prioritize metrics that consider transaction costs (with cost), as they provide a more accurate representation of real-world performance. Among these, the annualized return considering transaction costs is particularly important as it gives a clear picture of long-term profitability.
-    Provide detailed feedback and recommend whether to replace the best result if the new factor proves superior.
+    When judging the results:
+    1. Prioritize metrics that consider transaction costs (with cost):
+        - These metrics provide a more accurate representation of real-world performance.
+    2. Evaluate all metrics:
+        - Compare the combined results against the current best results across all metrics to get a comprehensive view of performance.
+    3. Focus on the annualized return considering transaction costs:
+        - This metric is particularly important as it gives a clear picture of long-term profitability.
+    4. Recommendation for replacement:
+        - If the new factor demonstrates a significant improvement in the annualized return considering transaction costs, it should be recommended to replace the current best result, even if other metrics show minor variations.
+
+    Please provide detailed feedback and recommend whether to replace the best result if the new factor proves superior.
 
 model_feedback_generation:
   system: |-

diff --git a/requirements/package.txt b/requirements/package.txt
@@ -19,6 +19,7 @@ langchain
 tiktoken
 scikit-learn
 docker
+fitz  # Extract screenshots from pdf
 
 # azure identity related
 azure.identity