Merge branch 'refs/heads/main' into v2.0.0
# Conflicts:
#	docs/tasks.md
#	mteb/abstasks/AbsTaskClassification.py
#	mteb/abstasks/AbsTaskClusteringFast.py
#	mteb/abstasks/AbsTaskInstructionRetrieval.py
#	mteb/abstasks/AbsTaskMultilabelClassification.py
#	mteb/abstasks/AbsTaskPairClassification.py
#	mteb/abstasks/AbsTaskReranking.py
#	mteb/abstasks/AbsTaskRetrieval.py
#	mteb/abstasks/AbsTaskSTS.py
#	mteb/descriptive_stats/InstructionRetrieval/Core17InstructionRetrieval.json
#	mteb/descriptive_stats/MultilabelClassification/MultiEURLEXMultilabelClassification.json
#	mteb/descriptive_stats/Reranking/AskUbuntuDupQuestions.json
#	mteb/descriptive_stats/Reranking/ESCIReranking.json
#	mteb/descriptive_stats/Reranking/WikipediaRerankingMultilingual.json
#	mteb/descriptive_stats/Retrieval/AppsRetrieval.json
#	mteb/descriptive_stats/Retrieval/BelebeleRetrieval.json
#	mteb/descriptive_stats/Retrieval/COIRCodeSearchNetRetrieval.json
#	mteb/descriptive_stats/Retrieval/CodeEditSearchRetrieval.json
#	mteb/descriptive_stats/Retrieval/CodeFeedbackMT.json
#	mteb/descriptive_stats/Retrieval/CodeFeedbackST.json
#	mteb/descriptive_stats/Retrieval/CodeSearchNetCCRetrieval.json
#	mteb/descriptive_stats/Retrieval/CodeSearchNetRetrieval.json
#	mteb/descriptive_stats/Retrieval/CodeTransOceanContest.json
#	mteb/descriptive_stats/Retrieval/CodeTransOceanDL.json
#	mteb/descriptive_stats/Retrieval/CosQA.json
#	mteb/descriptive_stats/Retrieval/JaqketRetrieval.json
#	mteb/descriptive_stats/Retrieval/NFCorpus.json
#	mteb/descriptive_stats/Retrieval/StackOverflowQA.json
#	mteb/descriptive_stats/Retrieval/SyntheticText2SQL.json
#	mteb/descriptive_stats/Retrieval/Touche2020.json
#	mteb/descriptive_stats/Retrieval/Touche2020Retrieval.v3.json
#	mteb/descriptive_stats/Retrieval/mFollowIRCrossLingualInstructionRetrieval.json
#	mteb/descriptive_stats/Retrieval/mFollowIRInstructionRetrieval.json
#	mteb/evaluation/MTEB.py
#	mteb/evaluation/evaluators/RetrievalEvaluator.py
#	mteb/leaderboard/app.py
#	mteb/leaderboard/figures.py
#	mteb/leaderboard/table.py
#	mteb/model_meta.py
#	mteb/models/arctic_models.py
#	mteb/models/e5_models.py
#	mteb/models/nomic_models.py
#	mteb/models/overview.py
#	mteb/models/sentence_transformers_models.py
#	mteb/tasks/Reranking/zho/CMTEBReranking.py
#	mteb/tasks/Retrieval/__init__.py
#	mteb/tasks/STS/por/SickBrSTS.py
#	pyproject.toml
#	tests/test_benchmark/mock_tasks.py
Samoed committed Dec 10, 2024
2 parents d0aa3a7 + e605c7b commit f16deb6
Showing 9 changed files with 2,009 additions and 30 deletions.
16 changes: 11 additions & 5 deletions mteb/leaderboard/app.py
@@ -9,7 +9,7 @@

 import mteb
 from mteb.caching import json_cache
-from mteb.leaderboard.figures import performance_size_plot
+from mteb.leaderboard.figures import performance_size_plot, radar_chart
 from mteb.leaderboard.table import scores_to_tables


@@ -218,10 +218,16 @@ def update_task_info(task_names: str) -> gr.DataFrame:
             )
             citation = gr.Markdown(update_citation, inputs=[benchmark_select])
         with gr.Column():
-            plot = gr.Plot(performance_size_plot, inputs=[summary_table])
-            gr.Markdown(
-                "*We only display models that have been run on all tasks in the benchmark*"
-            )
+            with gr.Tab("Performance-Size Plot"):
+                plot = gr.Plot(performance_size_plot, inputs=[summary_table])
+                gr.Markdown(
+                    "*We only display models that have been run on all tasks in the benchmark*"
+                )
+            with gr.Tab("Top 5 Radar Chart"):
+                radar_plot = gr.Plot(radar_chart, inputs=[summary_table])
+                gr.Markdown(
+                    "*We only display models that have been run on all task types in the benchmark*"
+                )
     with gr.Tab("Summary"):
         summary_table.render()
     with gr.Tab("Performance per task"):
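For readers unfamiliar with the gradio pattern introduced above: each plot now sits inside its own gr.Tab, so the performance-size plot and the new radar chart share one column. A minimal, self-contained sketch of that layout (illustrative only; example_plot and this standalone app are stand-ins, not code from the repository):

import gradio as gr
import plotly.express as px


def example_plot():
    # Stand-in for performance_size_plot / radar_chart in the real app.
    return px.line(x=[0, 1, 2], y=[1, 3, 2])


with gr.Blocks() as demo:
    with gr.Column():
        with gr.Tab("Performance-Size Plot"):
            gr.Plot(example_plot)  # gradio calls the function to populate the plot
        with gr.Tab("Top 5 Radar Chart"):
            gr.Plot(example_plot)

if __name__ == "__main__":
    demo.launch()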
89 changes: 89 additions & 0 deletions mteb/leaderboard/figures.py
@@ -97,3 +97,92 @@ def performance_size_plot(df: pd.DataFrame) -> go.Figure:
         margin=dict(b=20, t=10, l=20, r=10),  # noqa
     )
     return fig
+
+
+TOP_N = 5
+task_types = [
+    "BitextMining",
+    "Classification",
+    "MultilabelClassification",
+    "Clustering",
+    "PairClassification",
+    "Reranking",
+    "Retrieval",
+    "STS",
+    "Summarization",
+    # "InstructionRetrieval",
+    # Not displayed, because the scores are negative,
+    # doesn't work well with the radar chart.
+    "Speed",
+]
+
+line_colors = [
+    "#EE4266",
+    "#00a6ed",
+    "#ECA72C",
+    "#B42318",
+    "#3CBBB1",
+]
+fill_colors = [
+    "rgba(238,66,102,0.2)",
+    "rgba(0,166,237,0.2)",
+    "rgba(236,167,44,0.2)",
+    "rgba(180,35,24,0.2)",
+    "rgba(60,187,177,0.2)",
+]
+
+
+def radar_chart(df: pd.DataFrame) -> go.Figure:
+    df = df.copy()
+    df["Model"] = df["Model"].map(parse_model_name)
+    # Remove whitespace
+    task_type_columns = [
+        column for column in df.columns if "".join(column.split()) in task_types
+    ]
+    df = df[["Model", *task_type_columns]].set_index("Model")
+    df = df.replace("", np.nan)
+    df = df.dropna()
+    df = df.head(TOP_N)
+    df = df.iloc[::-1]
+    fig = go.Figure()
+    for i, (model_name, row) in enumerate(df.iterrows()):
+        fig.add_trace(
+            go.Scatterpolar(
+                name=model_name,
+                r=[row[task_type] for task_type in task_type_columns]
+                + [row[task_type_columns[0]]],
+                theta=task_type_columns + [task_type_columns[0]],
+                showlegend=True,
+                mode="lines",
+                line=dict(width=2, color=line_colors[i]),
+                fill="toself",
+                fillcolor=fill_colors[i],
+            )
+        )
+    fig.update_layout(
+        font=dict(size=16, color="black"),  # noqa
+        template="plotly_white",
+        polar=dict(
+            radialaxis=dict(
+                visible=True,
+                gridcolor="black",
+                linecolor="rgba(0,0,0,0)",
+                gridwidth=1,
+                showticklabels=False,
+                ticks="",
+            ),
+            angularaxis=dict(
+                gridcolor="black", gridwidth=1.5, linecolor="rgba(0,0,0,0)"
+            ),
+        ),
+        legend=dict(
+            orientation="h",
+            yanchor="bottom",
+            y=-0.6,
+            xanchor="left",
+            x=-0.05,
+            entrywidthmode="fraction",
+            entrywidth=1 / 5,
+        ),
+    )
+    return fig
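One detail of the added radar_chart worth spelling out: each trace appends its first value to both r and theta so the outline closes into a polygon instead of leaving a gap between the last and first task type. A standalone plotly sketch of that trick (illustrative only; the task names and scores are made up):

import plotly.graph_objects as go

categories = ["Classification", "Clustering", "Retrieval", "STS"]
scores = [0.71, 0.48, 0.55, 0.80]

fig = go.Figure(
    go.Scatterpolar(
        # Repeating the first point closes the outline, mirroring what
        # radar_chart does for each model's trace.
        r=scores + scores[:1],
        theta=categories + categories[:1],
        mode="lines",
        fill="toself",
        name="example-model",
    )
)
fig.update_layout(template="plotly_white")
fig.show()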
23 changes: 9 additions & 14 deletions mteb/leaderboard/table.py
@@ -80,9 +80,9 @@ def get_means_per_types(df: pd.DataFrame) -> pd.DataFrame:
     task_names_per_type = defaultdict(list)
     for task_name, task_type in zip(df["task_name"], df["task_type"]):
         task_names_per_type[task_type].append(task_name)
-    groups = df.groupby(["model_name", "model_revision"])
+    groups = df.groupby("model_name")
     records = []
-    for (model_name, model_revision), group_data in groups:
+    for (model_name), group_data in groups:
         name_to_score = dict(zip(group_data["task_name"], group_data["score"]))
         for task_type, task_names in task_names_per_type.items():
             type_mean = np.mean(
@@ -91,7 +91,6 @@ def get_means_per_types(df: pd.DataFrame) -> pd.DataFrame:
             records.append(
                 dict(  # noqa
                     model_name=model_name,
-                    model_revision=model_revision,
                     task_type=task_type,
                     score=type_mean,
                 )
@@ -125,24 +124,23 @@ def scores_to_tables(
     )
     mean_per_type = get_means_per_types(data)
     mean_per_type = mean_per_type.pivot(
-        index=["model_name", "model_revision"], columns="task_type", values="score"
+        index="model_name", columns="task_type", values="score"
     )
     mean_per_type.columns = [
         split_on_capital(column) for column in mean_per_type.columns
     ]
-    per_task = data.pivot(
-        index=["model_name", "model_revision"], columns="task_name", values="score"
-    )
+    per_task = data.pivot(index="model_name", columns="task_name", values="score")
     to_remove = per_task.isna().all(axis="columns")
     if search_query:
         names = per_task.index.get_level_values("model_name")
         names = pd.Series(names, index=per_task.index)
         to_remove |= ~names.str.contains(search_query, regex=True)
+    models_to_remove = list(per_task[to_remove].index)
     typed_mean = mean_per_type.mean(skipna=False, axis=1)
     overall_mean = per_task.mean(skipna=False, axis=1)
     joint_table = mean_per_type.copy()
-    per_task = per_task[~to_remove]
-    joint_table = joint_table[~to_remove]
+    per_task = per_task.drop(models_to_remove, axis=0)
+    joint_table = joint_table.drop(models_to_remove, axis=0)
     joint_table.insert(0, "mean", overall_mean)
     joint_table.insert(1, "mean_by_task_type", typed_mean)
     joint_table["borda_rank"] = get_borda_rank(per_task)
@@ -166,10 +164,7 @@
         model_metas.map(lambda m: format_n_parameters(m.n_parameters)),
     )
     joint_table = joint_table.sort_values("borda_rank", ascending=True)
-    per_task = per_task.loc[
-        joint_table.set_index(["model_name", "model_revision"]).index
-    ]
-    joint_table = joint_table.drop(columns=["model_revision"])
+    per_task = per_task.loc[joint_table.set_index("model_name").index]
     # Removing HF organization from model
     joint_table["model_name"] = joint_table["model_name"].map(
         lambda name: name.split("/")[-1]
@@ -189,7 +184,7 @@ def scores_to_tables(
             "mean": "Mean (Task)",
         }
     )
-    per_task = per_task.reset_index().drop(columns=["model_revision"])
+    per_task = per_task.reset_index()
     per_task["model_name"] = per_task["model_name"].map(
         lambda name: name.split("/")[-1]
     )
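For context on the table.py change: with model_revision dropped from the groupby and pivot keys, both tables are indexed by model_name alone, and rows to hide are now collected by label and removed with drop. A small pandas sketch of the resulting shape (the column names follow the diff; the data itself is made up):

import pandas as pd

data = pd.DataFrame(
    {
        "model_name": ["org/model-a", "org/model-a", "org/model-b"],
        "task_name": ["NFCorpus", "CosQA", "NFCorpus"],
        "score": [0.34, 0.41, 0.30],
    }
)

# One row per model, one column per task; the index is a flat Index named
# "model_name" instead of the previous (model_name, model_revision) MultiIndex.
per_task = data.pivot(index="model_name", columns="task_name", values="score")

# Dropping rows by label, as the updated scores_to_tables now does.
per_task = per_task.drop(["org/model-b"], axis=0)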
2 changes: 0 additions & 2 deletions mteb/model_meta.py
@@ -73,8 +73,6 @@ class ModelMeta(BaseModel):
             in the Latin script.
         use_instructions: Whether the model uses instructions E.g. for prompt-based models. This also include models that require a specific format for
             input such as "query: {document}" or "passage: {document}".
-        zero_shot_benchmarks: A list of benchmarks on which the model has been evaluated in a zero-shot setting. By default we assume that all models
-            are evaluated non-zero-shot unless specified otherwise.
         citation: The citation for the model. This is a bibtex string.
         training_datasets: A dictionary of datasets that the model was trained on. Names should be names as their appear in `mteb` for example
             {"ArguAna": ["test"]} if the model is trained on the ArguAna test set. This field is used to determine if a model generalizes zero-shot to
