Merge branch 'refs/heads/main' into v2.0.0
# Conflicts:
#	docs/tasks.md
#	mteb/abstasks/AbsTaskClassification.py
#	mteb/abstasks/AbsTaskClusteringFast.py
#	mteb/abstasks/AbsTaskInstructionRetrieval.py
#	mteb/abstasks/AbsTaskMultilabelClassification.py
#	mteb/abstasks/AbsTaskPairClassification.py
#	mteb/abstasks/AbsTaskReranking.py
#	mteb/abstasks/AbsTaskRetrieval.py
#	mteb/abstasks/AbsTaskSTS.py
#	mteb/descriptive_stats/InstructionRetrieval/Core17InstructionRetrieval.json
#	mteb/descriptive_stats/MultilabelClassification/MultiEURLEXMultilabelClassification.json
#	mteb/descriptive_stats/Reranking/AskUbuntuDupQuestions.json
#	mteb/descriptive_stats/Reranking/ESCIReranking.json
#	mteb/descriptive_stats/Reranking/WikipediaRerankingMultilingual.json
#	mteb/descriptive_stats/Retrieval/AppsRetrieval.json
#	mteb/descriptive_stats/Retrieval/BelebeleRetrieval.json
#	mteb/descriptive_stats/Retrieval/COIRCodeSearchNetRetrieval.json
#	mteb/descriptive_stats/Retrieval/CodeEditSearchRetrieval.json
#	mteb/descriptive_stats/Retrieval/CodeFeedbackMT.json
#	mteb/descriptive_stats/Retrieval/CodeFeedbackST.json
#	mteb/descriptive_stats/Retrieval/CodeSearchNetCCRetrieval.json
#	mteb/descriptive_stats/Retrieval/CodeSearchNetRetrieval.json
#	mteb/descriptive_stats/Retrieval/CodeTransOceanContest.json
#	mteb/descriptive_stats/Retrieval/CodeTransOceanDL.json
#	mteb/descriptive_stats/Retrieval/CosQA.json
#	mteb/descriptive_stats/Retrieval/JaqketRetrieval.json
#	mteb/descriptive_stats/Retrieval/NFCorpus.json
#	mteb/descriptive_stats/Retrieval/StackOverflowQA.json
#	mteb/descriptive_stats/Retrieval/SyntheticText2SQL.json
#	mteb/descriptive_stats/Retrieval/Touche2020.json
#	mteb/descriptive_stats/Retrieval/Touche2020Retrieval.v3.json
#	mteb/descriptive_stats/Retrieval/mFollowIRCrossLingualInstructionRetrieval.json
#	mteb/descriptive_stats/Retrieval/mFollowIRInstructionRetrieval.json
#	mteb/evaluation/MTEB.py
#	mteb/evaluation/evaluators/RetrievalEvaluator.py
#	mteb/leaderboard/app.py
#	mteb/leaderboard/figures.py
#	mteb/leaderboard/table.py
#	mteb/model_meta.py
#	mteb/models/arctic_models.py
#	mteb/models/e5_models.py
#	mteb/models/nomic_models.py
#	mteb/models/overview.py
#	mteb/models/sentence_transformers_models.py
#	mteb/tasks/Reranking/zho/CMTEBReranking.py
#	mteb/tasks/Retrieval/__init__.py
#	mteb/tasks/STS/por/SickBrSTS.py
#	pyproject.toml
#	tests/test_benchmark/mock_tasks.py
Samoed committed Dec 10, 2024
2 parents d0aa3a7 + e605c7b commit f16deb6
Showing 9 changed files with 2,009 additions and 30 deletions.
16 changes: 11 additions & 5 deletions mteb/leaderboard/app.py
@@ -9,7 +9,7 @@

 import mteb
 from mteb.caching import json_cache
-from mteb.leaderboard.figures import performance_size_plot
+from mteb.leaderboard.figures import performance_size_plot, radar_chart
 from mteb.leaderboard.table import scores_to_tables


@@ -218,10 +218,16 @@ def update_task_info(task_names: str) -> gr.DataFrame:
             )
             citation = gr.Markdown(update_citation, inputs=[benchmark_select])
         with gr.Column():
-            plot = gr.Plot(performance_size_plot, inputs=[summary_table])
-            gr.Markdown(
-                "*We only display models that have been run on all tasks in the benchmark*"
-            )
+            with gr.Tab("Performance-Size Plot"):
+                plot = gr.Plot(performance_size_plot, inputs=[summary_table])
+                gr.Markdown(
+                    "*We only display models that have been run on all tasks in the benchmark*"
+                )
+            with gr.Tab("Top 5 Radar Chart"):
+                radar_plot = gr.Plot(radar_chart, inputs=[summary_table])
+                gr.Markdown(
+                    "*We only display models that have been run on all task types in the benchmark*"
+                )
     with gr.Tab("Summary"):
         summary_table.render()
     with gr.Tab("Performance per task"):
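For readers unfamiliar with the gradio pattern introduced above: each plot now sits inside its own gr.Tab, so the performance-size plot and the new radar chart share one column. A minimal, self-contained sketch of that layout (illustrative only; example_plot and this standalone app are stand-ins, not code from the repository):

import gradio as gr
import plotly.express as px


def example_plot():
    # Stand-in for performance_size_plot / radar_chart in the real app.
    return px.line(x=[0, 1, 2], y=[1, 3, 2])


with gr.Blocks() as demo:
    with gr.Column():
        with gr.Tab("Performance-Size Plot"):
            gr.Plot(example_plot)  # gradio calls the function to populate the plot
        with gr.Tab("Top 5 Radar Chart"):
            gr.Plot(example_plot)

if __name__ == "__main__":
    demo.launch()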
89 changes: 89 additions & 0 deletions mteb/leaderboard/figures.py
@@ -97,3 +97,92 @@ def performance_size_plot(df: pd.DataFrame) -> go.Figure:
         margin=dict(b=20, t=10, l=20, r=10),  # noqa
     )
     return fig
+
+
+TOP_N = 5
+task_types = [
+    "BitextMining",
+    "Classification",
+    "MultilabelClassification",
+    "Clustering",
+    "PairClassification",
+    "Reranking",
+    "Retrieval",
+    "STS",
+    "Summarization",
+    # "InstructionRetrieval",
+    # Not displayed, because the scores are negative,
+    # doesn't work well with the radar chart.
+    "Speed",
+]
+
+line_colors = [
+    "#EE4266",
+    "#00a6ed",
+    "#ECA72C",
+    "#B42318",
+    "#3CBBB1",
+]
+fill_colors = [
+    "rgba(238,66,102,0.2)",
+    "rgba(0,166,237,0.2)",
+    "rgba(236,167,44,0.2)",
+    "rgba(180,35,24,0.2)",
+    "rgba(60,187,177,0.2)",
+]
+
+
+def radar_chart(df: pd.DataFrame) -> go.Figure:
+    df = df.copy()
+    df["Model"] = df["Model"].map(parse_model_name)
+    # Remove whitespace
+    task_type_columns = [
+        column for column in df.columns if "".join(column.split()) in task_types
+    ]
+    df = df[["Model", *task_type_columns]].set_index("Model")
+    df = df.replace("", np.nan)
+    df = df.dropna()
+    df = df.head(TOP_N)
+    df = df.iloc[::-1]
+    fig = go.Figure()
+    for i, (model_name, row) in enumerate(df.iterrows()):
+        fig.add_trace(
+            go.Scatterpolar(
+                name=model_name,
+                r=[row[task_type] for task_type in task_type_columns]
+                + [row[task_type_columns[0]]],
+                theta=task_type_columns + [task_type_columns[0]],
+                showlegend=True,
+                mode="lines",
+                line=dict(width=2, color=line_colors[i]),
+                fill="toself",
+                fillcolor=fill_colors[i],
+            )
+        )
+    fig.update_layout(
+        font=dict(size=16, color="black"),  # noqa
+        template="plotly_white",
+        polar=dict(
+            radialaxis=dict(
+                visible=True,
+                gridcolor="black",
+                linecolor="rgba(0,0,0,0)",
+                gridwidth=1,
+                showticklabels=False,
+                ticks="",
+            ),
+            angularaxis=dict(
+                gridcolor="black", gridwidth=1.5, linecolor="rgba(0,0,0,0)"
+            ),
+        ),
+        legend=dict(
+            orientation="h",
+            yanchor="bottom",
+            y=-0.6,
+            xanchor="left",
+            x=-0.05,
+            entrywidthmode="fraction",
+            entrywidth=1 / 5,
+        ),
+    )
+    return fig
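One detail of the added radar_chart worth spelling out: each trace appends its first value to both r and theta so the outline closes into a polygon instead of leaving a gap between the last and first task type. A standalone plotly sketch of that trick (illustrative only; the task names and scores are made up):

import plotly.graph_objects as go

categories = ["Classification", "Clustering", "Retrieval", "STS"]
scores = [0.71, 0.48, 0.55, 0.80]

fig = go.Figure(
    go.Scatterpolar(
        # Repeating the first point closes the outline, mirroring what
        # radar_chart does for each model's trace.
        r=scores + scores[:1],
        theta=categories + categories[:1],
        mode="lines",
        fill="toself",
        name="example-model",
    )
)
fig.update_layout(template="plotly_white")
fig.show()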
23 changes: 9 additions & 14 deletions mteb/leaderboard/table.py
@@ -80,9 +80,9 @@ def get_means_per_types(df: pd.DataFrame) -> pd.DataFrame:
     task_names_per_type = defaultdict(list)
     for task_name, task_type in zip(df["task_name"], df["task_type"]):
         task_names_per_type[task_type].append(task_name)
-    groups = df.groupby(["model_name", "model_revision"])
+    groups = df.groupby("model_name")
     records = []
-    for (model_name, model_revision), group_data in groups:
+    for (model_name), group_data in groups:
         name_to_score = dict(zip(group_data["task_name"], group_data["score"]))
         for task_type, task_names in task_names_per_type.items():
             type_mean = np.mean(
@@ -91,7 +91,6 @@ def get_means_per_types(df: pd.DataFrame) -> pd.DataFrame:
             records.append(
                 dict(  # noqa
                     model_name=model_name,
-                    model_revision=model_revision,
                     task_type=task_type,
                     score=type_mean,
                 )
@@ -125,24 +124,23 @@ def scores_to_tables(
     )
     mean_per_type = get_means_per_types(data)
     mean_per_type = mean_per_type.pivot(
-        index=["model_name", "model_revision"], columns="task_type", values="score"
+        index="model_name", columns="task_type", values="score"
     )
     mean_per_type.columns = [
         split_on_capital(column) for column in mean_per_type.columns
     ]
-    per_task = data.pivot(
-        index=["model_name", "model_revision"], columns="task_name", values="score"
-    )
+    per_task = data.pivot(index="model_name", columns="task_name", values="score")
     to_remove = per_task.isna().all(axis="columns")
     if search_query:
         names = per_task.index.get_level_values("model_name")
         names = pd.Series(names, index=per_task.index)
         to_remove |= ~names.str.contains(search_query, regex=True)
+    models_to_remove = list(per_task[to_remove].index)
     typed_mean = mean_per_type.mean(skipna=False, axis=1)
     overall_mean = per_task.mean(skipna=False, axis=1)
     joint_table = mean_per_type.copy()
-    per_task = per_task[~to_remove]
-    joint_table = joint_table[~to_remove]
+    per_task = per_task.drop(models_to_remove, axis=0)
+    joint_table = joint_table.drop(models_to_remove, axis=0)
     joint_table.insert(0, "mean", overall_mean)
     joint_table.insert(1, "mean_by_task_type", typed_mean)
     joint_table["borda_rank"] = get_borda_rank(per_task)
@@ -166,10 +164,7 @@
         model_metas.map(lambda m: format_n_parameters(m.n_parameters)),
     )
     joint_table = joint_table.sort_values("borda_rank", ascending=True)
-    per_task = per_task.loc[
-        joint_table.set_index(["model_name", "model_revision"]).index
-    ]
-    joint_table = joint_table.drop(columns=["model_revision"])
+    per_task = per_task.loc[joint_table.set_index("model_name").index]
     # Removing HF organization from model
     joint_table["model_name"] = joint_table["model_name"].map(
         lambda name: name.split("/")[-1]
@@ -189,7 +184,7 @@ def scores_to_tables(
             "mean": "Mean (Task)",
         }
     )
-    per_task = per_task.reset_index().drop(columns=["model_revision"])
+    per_task = per_task.reset_index()
     per_task["model_name"] = per_task["model_name"].map(
         lambda name: name.split("/")[-1]
     )
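For context on the table.py change: with model_revision dropped from the groupby and pivot keys, both tables are indexed by model_name alone, and rows to hide are now collected by label and removed with drop. A small pandas sketch of the resulting shape (the column names follow the diff; the data itself is made up):

import pandas as pd

data = pd.DataFrame(
    {
        "model_name": ["org/model-a", "org/model-a", "org/model-b"],
        "task_name": ["NFCorpus", "CosQA", "NFCorpus"],
        "score": [0.34, 0.41, 0.30],
    }
)

# One row per model, one column per task; the index is a flat Index named
# "model_name" instead of the previous (model_name, model_revision) MultiIndex.
per_task = data.pivot(index="model_name", columns="task_name", values="score")

# Dropping rows by label, as the updated scores_to_tables now does.
per_task = per_task.drop(["org/model-b"], axis=0)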
2 changes: 0 additions & 2 deletions mteb/model_meta.py
@@ -73,8 +73,6 @@ class ModelMeta(BaseModel):
             in the Latin script.
         use_instructions: Whether the model uses instructions E.g. for prompt-based models. This also include models that require a specific format for
             input such as "query: {document}" or "passage: {document}".
-        zero_shot_benchmarks: A list of benchmarks on which the model has been evaluated in a zero-shot setting. By default we assume that all models
-            are evaluated non-zero-shot unless specified otherwise.
         citation: The citation for the model. This is a bibtex string.
         training_datasets: A dictionary of datasets that the model was trained on. Names should be names as their appear in `mteb` for example
             {"ArguAna": ["test"]} if the model is trained on the ArguAna test set. This field is used to determine if a model generalizes zero-shot to
