feat: Expanded benchmarking to 100 samples and added further analysis of the resulting metrics.
kurisu · Oct 24, 2024 · b561eb0 · b561eb0
1 parent af5828d
commit b561eb0
Show file tree

Hide file tree

Showing 7 changed files with 639 additions and 543 deletions.
diff --git a/README.md b/README.md
@@ -57,7 +57,11 @@ TBD
 
 TBD
 
-## Future Work
+## Related Research
+
+* [RETRO: Improving language models by retrieving from trillions of tokens](https://arxiv.org/abs/2112.04426)
+* [RETRO-pytorch](https://github.com/lucidrains/RETRO-pytorch)
+* [Why isn't Retro mainstream? State-of-the-art within reach](https://www.reddit.com/r/MachineLearning/comments/1cffgkt/d_why_isnt_retro_mainstream_stateoftheart_within/)
 
 TBD
 

diff --git a/app.py b/app.py
@@ -14,7 +14,10 @@
 )
 from tools.text_to_image import TextToImageTool
 from transformers import load_tool
-from prompts import DEFAULT_SQUAD_REACT_CODE_SYSTEM_PROMPT, FOCUSED_SQUAD_REACT_CODE_SYSTEM_PROMPT
+from prompts import (
+    DEFAULT_SQUAD_REACT_CODE_SYSTEM_PROMPT,
+    FOCUSED_SQUAD_REACT_CODE_SYSTEM_PROMPT,
+)
 from pygments.formatters import HtmlFormatter
 
 
@@ -65,7 +68,7 @@
     model_name=model_name,
     toolbox=TASK_SOLVING_TOOLBOX,
     system_prompt=system_prompt,
-    use_openai=True, # Use OpenAI instead of a local or HF model as the base LLM engine
+    use_openai=True,  # Use OpenAI instead of a local or HF model as the base LLM engine
 )
 
 app = None
@@ -130,24 +133,28 @@ def update_session(value, request: Request):
 
     return component
 
+
 from gradio.components import (
     Component as GradioComponent,
 )
-from gradio.components.chatbot import Chatbot, FileDataDict, FileData, ComponentMessage, FileMessage
+from gradio.components.chatbot import (
+    Chatbot,
+    FileDataDict,
+    FileData,
+    ComponentMessage,
+    FileMessage,
+)
+
 
 class CleanChatBot(Chatbot):
     def __init__(self, **kwargs):
         super().__init__(**kwargs)
 
     def _postprocess_content(
         self,
-        chat_message: str
-        | tuple
-        | list
-        | FileDataDict
-        | FileData
-        | GradioComponent
-        | None,
+        chat_message: (
+            str | tuple | list | FileDataDict | FileData | GradioComponent | None
+        ),
     ) -> str | FileMessage | ComponentMessage | None:
         response = super()._postprocess_content(chat_message)
         print(f"Post processing content: {response}")
@@ -156,6 +163,7 @@ def _postprocess_content(
             response.props["open"] = False
         return response
 
+
 with gr.Blocks(
     fill_height=True,
     css=".gradio-container .message .content {text-align: left;}"

diff --git a/benchmarking.ipynb b/benchmarking.ipynb
diff --git a/benchmarks/baseline.pkl b/benchmarks/baseline.pkl
diff --git a/benchmarks/focused.pkl b/benchmarks/focused.pkl
diff --git a/benchmarks/succinct.pkl b/benchmarks/succinct.pkl
diff --git a/samples/samples.pkl b/samples/samples.pkl