
Commit

Merge branch 'embeddings-benchmark:main' into main
sam-hey authored Dec 12, 2024
2 parents 67e5200 + 9c0b208 commit a3a126f
Showing 19 changed files with 2,304 additions and 82 deletions.
22 changes: 22 additions & 0 deletions README.md
@@ -401,6 +401,28 @@ results = mteb.load_results(models=models, tasks=tasks)
df = results_to_dataframe(results)
```

</details>


<details>
<summary> Annotate Contamination in the training data of a model </summary>

### Annotate Contamination

Have you found contamination in the training data of a model? Please let us know, either by opening an issue or, ideally, by submitting a PR
annotating the training datasets of the model:

```py
model_w_contamination = ModelMeta(
    name="model-with-contamination",
    ...
    training_datasets={
        "ArguAna":  # name of dataset within MTEB
            ["test"],  # the splits that have been trained on
    },
    ...
)
```


</details>

<details>
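As a rough illustration of how such a contamination annotation might be consumed downstream, the sketch below checks a model's `training_datasets` annotation against a benchmark's task names and reports the overlapping splits. The helper `flag_contaminated_tasks` is hypothetical and not part of the mteb API; it only demonstrates the shape of the data.

```py
# Hypothetical helper (not part of mteb): report tasks whose data a model may
# have seen, based on the ModelMeta.training_datasets annotation shown above.
def flag_contaminated_tasks(
    training_datasets: dict[str, list[str]] | None,
    benchmark_task_names: list[str],
) -> dict[str, list[str]]:
    if not training_datasets:
        return {}
    return {
        task: splits
        for task, splits in training_datasets.items()
        if task in benchmark_task_names
    }


overlap = flag_contaminated_tasks({"ArguAna": ["test"]}, ["ArguAna", "SciFact"])
print(overlap)  # {'ArguAna': ['test']}
```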
5 changes: 3 additions & 2 deletions docs/tasks.md

Large diffs are not rendered by default.

6 changes: 3 additions & 3 deletions mteb/evaluation/evaluators/RetrievalEvaluator.py
@@ -483,10 +483,10 @@ def __call__(
if self.is_cross_encoder:
return self.retriever.search_cross_encoder(corpus, queries, self.top_k)
elif (
- hasattr(self.retriever.model, "mteb_model_meta")
- and self.retriever.model.mteb_model_meta.name == "bm25s"
+ hasattr(self.retriever.model.model, "mteb_model_meta")
+ and self.retriever.model.model.mteb_model_meta.name == "bm25s"
):
- return self.retriever.model.search(
+ return self.retriever.model.model.search(
corpus,
queries,
self.top_k,
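The extra `.model` hop above suggests that `self.retriever.model` is itself a wrapper whose inner object carries `mteb_model_meta` and the `search` method. A minimal sketch of that assumed nesting (class names are illustrative, not mteb's actual classes):

```py
class InnerModel:
    """Stands in for the wrapped model that carries the metadata."""

    def __init__(self, mteb_model_meta):
        self.mteb_model_meta = mteb_model_meta

    def search(self, corpus, queries, top_k, **kwargs):
        ...  # model-specific search, e.g. bm25s


class SearchWrapper:
    """Stands in for `retriever.model`: it holds the inner model one level deeper."""

    def __init__(self, model: InnerModel):
        self.model = model


# With this nesting, the metadata is reached via retriever.model.model.mteb_model_meta,
# which matches the updated attribute access in the diff above.
```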
16 changes: 11 additions & 5 deletions mteb/leaderboard/app.py
@@ -9,7 +9,7 @@

import mteb
from mteb.caching import json_cache
- from mteb.leaderboard.figures import performance_size_plot
+ from mteb.leaderboard.figures import performance_size_plot, radar_chart
from mteb.leaderboard.table import scores_to_tables


@@ -218,10 +218,16 @@ def update_task_info(task_names: str) -> gr.DataFrame:
)
citation = gr.Markdown(update_citation, inputs=[benchmark_select])
with gr.Column():
- plot = gr.Plot(performance_size_plot, inputs=[summary_table])
- gr.Markdown(
-     "*We only display models that have been run on all tasks in the benchmark*"
- )
+ with gr.Tab("Performance-Size Plot"):
+     plot = gr.Plot(performance_size_plot, inputs=[summary_table])
+     gr.Markdown(
+         "*We only display models that have been run on all tasks in the benchmark*"
+     )
+ with gr.Tab("Top 5 Radar Chart"):
+     radar_plot = gr.Plot(radar_chart, inputs=[summary_table])
+     gr.Markdown(
+         "*We only display models that have been run on all task types in the benchmark*"
+     )
with gr.Tab("Summary"):
summary_table.render()
with gr.Tab("Performance per task"):
89 changes: 89 additions & 0 deletions mteb/leaderboard/figures.py
@@ -97,3 +97,92 @@ def performance_size_plot(df: pd.DataFrame) -> go.Figure:
margin=dict(b=20, t=10, l=20, r=10), # noqa
)
return fig


TOP_N = 5
task_types = [
"BitextMining",
"Classification",
"MultilabelClassification",
"Clustering",
"PairClassification",
"Reranking",
"Retrieval",
"STS",
"Summarization",
# "InstructionRetrieval",
# Not displayed because the scores are negative,
# which doesn't work well with the radar chart.
"Speed",
]

line_colors = [
"#EE4266",
"#00a6ed",
"#ECA72C",
"#B42318",
"#3CBBB1",
]
fill_colors = [
"rgba(238,66,102,0.2)",
"rgba(0,166,237,0.2)",
"rgba(236,167,44,0.2)",
"rgba(180,35,24,0.2)",
"rgba(60,187,177,0.2)",
]


def radar_chart(df: pd.DataFrame) -> go.Figure:
df = df.copy()
df["Model"] = df["Model"].map(parse_model_name)
# Remove whitespace
task_type_columns = [
column for column in df.columns if "".join(column.split()) in task_types
]
df = df[["Model", *task_type_columns]].set_index("Model")
df = df.replace("", np.nan)
df = df.dropna()
df = df.head(TOP_N)
df = df.iloc[::-1]
fig = go.Figure()
for i, (model_name, row) in enumerate(df.iterrows()):
fig.add_trace(
go.Scatterpolar(
name=model_name,
r=[row[task_type] for task_type in task_type_columns]
+ [row[task_type_columns[0]]],
theta=task_type_columns + [task_type_columns[0]],
showlegend=True,
mode="lines",
line=dict(width=2, color=line_colors[i]),
fill="toself",
fillcolor=fill_colors[i],
)
)
fig.update_layout(
font=dict(size=16, color="black"), # noqa
template="plotly_white",
polar=dict(
radialaxis=dict(
visible=True,
gridcolor="black",
linecolor="rgba(0,0,0,0)",
gridwidth=1,
showticklabels=False,
ticks="",
),
angularaxis=dict(
gridcolor="black", gridwidth=1.5, linecolor="rgba(0,0,0,0)"
),
),
legend=dict(
orientation="h",
yanchor="bottom",
y=-0.6,
xanchor="left",
x=-0.05,
entrywidthmode="fraction",
entrywidth=1 / 5,
),
)
return fig
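A hedged usage sketch for the new `radar_chart` helper. It assumes the input frame has a `Model` column plus one numeric column per task type, which is my reading of the code above rather than a documented contract, and that plain model-name strings pass through `parse_model_name` unchanged; in the leaderboard itself the frame comes from the summary table.

```py
import pandas as pd

from mteb.leaderboard.figures import radar_chart

# Illustrative scores only; real values come from the leaderboard summary table.
df = pd.DataFrame(
    {
        "Model": ["model-a", "model-b"],
        "Classification": [71.2, 69.8],
        "Retrieval": [55.4, 58.1],
        "STS": [80.3, 79.9],
    }
)

fig = radar_chart(df)
fig.show()
```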
23 changes: 9 additions & 14 deletions mteb/leaderboard/table.py
@@ -80,9 +80,9 @@ def get_means_per_types(df: pd.DataFrame) -> pd.DataFrame:
task_names_per_type = defaultdict(list)
for task_name, task_type in zip(df["task_name"], df["task_type"]):
task_names_per_type[task_type].append(task_name)
- groups = df.groupby(["model_name", "model_revision"])
+ groups = df.groupby("model_name")
records = []
- for (model_name, model_revision), group_data in groups:
+ for (model_name), group_data in groups:
name_to_score = dict(zip(group_data["task_name"], group_data["score"]))
for task_type, task_names in task_names_per_type.items():
type_mean = np.mean(
@@ -91,7 +91,6 @@ def get_means_per_types(df: pd.DataFrame) -> pd.DataFrame:
records.append(
dict( # noqa
model_name=model_name,
- model_revision=model_revision,
task_type=task_type,
score=type_mean,
)
@@ -125,24 +124,23 @@ def scores_to_tables(
)
mean_per_type = get_means_per_types(data)
mean_per_type = mean_per_type.pivot(
index=["model_name", "model_revision"], columns="task_type", values="score"
index="model_name", columns="task_type", values="score"
)
mean_per_type.columns = [
split_on_capital(column) for column in mean_per_type.columns
]
- per_task = data.pivot(
-     index=["model_name", "model_revision"], columns="task_name", values="score"
- )
+ per_task = data.pivot(index="model_name", columns="task_name", values="score")
to_remove = per_task.isna().all(axis="columns")
if search_query:
names = per_task.index.get_level_values("model_name")
names = pd.Series(names, index=per_task.index)
to_remove |= ~names.str.contains(search_query, regex=True)
+ models_to_remove = list(per_task[to_remove].index)
typed_mean = mean_per_type.mean(skipna=False, axis=1)
overall_mean = per_task.mean(skipna=False, axis=1)
joint_table = mean_per_type.copy()
- per_task = per_task[~to_remove]
- joint_table = joint_table[~to_remove]
+ per_task = per_task.drop(models_to_remove, axis=0)
+ joint_table = joint_table.drop(models_to_remove, axis=0)
joint_table.insert(0, "mean", overall_mean)
joint_table.insert(1, "mean_by_task_type", typed_mean)
joint_table["borda_rank"] = get_borda_rank(per_task)
@@ -166,10 +164,7 @@ def scores_to_tables(
model_metas.map(lambda m: format_n_parameters(m.n_parameters)),
)
joint_table = joint_table.sort_values("borda_rank", ascending=True)
- per_task = per_task.loc[
-     joint_table.set_index(["model_name", "model_revision"]).index
- ]
- joint_table = joint_table.drop(columns=["model_revision"])
+ per_task = per_task.loc[joint_table.set_index("model_name").index]
# Removing HF organization from model
joint_table["model_name"] = joint_table["model_name"].map(
lambda name: name.split("/")[-1]
@@ -189,7 +184,7 @@ def scores_to_tables(
"mean": "Mean (Task)",
}
)
- per_task = per_task.reset_index().drop(columns=["model_revision"])
+ per_task = per_task.reset_index()
per_task["model_name"] = per_task["model_name"].map(
lambda name: name.split("/")[-1]
)
7 changes: 4 additions & 3 deletions mteb/model_meta.py
@@ -75,8 +75,9 @@ class ModelMeta(BaseModel):
in the Latin script.
use_instructions: Whether the model uses instructions, e.g. for prompt-based models. This also includes models that require a specific format for
input such as "query: {document}" or "passage: {document}".
- zero_shot_benchmarks: A list of benchmarks on which the model has been evaluated in a zero-shot setting. By default we assume that all models
- are evaluated non-zero-shot unless specified otherwise.
+ training_datasets: A dictionary of the datasets that the model was trained on. Names should be given as they appear in `mteb`, for example
+ {"ArguAna": ["test"]} if the model was trained on the ArguAna test set. This field is used to determine whether a model generalizes zero-shot to
+ a benchmark, as well as to mark dataset contamination.
adapted_from: Name of the model from which this model is adapted from. For quantizations, fine-tunes, long doc extensions, etc.
superseded_by: Name of the model that supersedes this model, e.g. nvidia/NV-Embed-v2 supersedes v1.
"""
@@ -100,7 +101,7 @@ class ModelMeta(BaseModel):
reference: STR_URL | None = None
similarity_fn_name: DISTANCE_METRICS | None = None
use_instructions: bool | None = None
- zero_shot_benchmarks: list[str] | None = None
+ training_datasets: dict[str, list[str]] | None = None
adapted_from: str | None = None
superseded_by: str | None = None

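The updated docstring says `training_datasets` is used to decide whether a model generalizes zero-shot to a benchmark. A minimal sketch of that decision, under the assumption that "zero-shot" simply means no benchmark task appears among the annotated training datasets; the helper is illustrative, not mteb's implementation:

```py
def is_zero_shot_on(
    training_datasets: dict[str, list[str]] | None,
    benchmark_task_names: list[str],
) -> bool | None:
    """Return None when no annotation exists: absence of an annotation is not
    evidence that the model was evaluated zero-shot."""
    if training_datasets is None:
        return None
    return not any(task in training_datasets for task in benchmark_task_names)


print(is_zero_shot_on({"ArguAna": ["test"]}, ["ArguAna", "SciFact"]))  # False
print(is_zero_shot_on({"MSMARCO": ["train"]}, ["ArguAna", "SciFact"]))  # True
```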