
Commit

Merge branch 'embeddings-benchmark:main' into main
sam-hey authored Dec 12, 2024
2 parents 67e5200 + 9c0b208 commit a3a126f
Showing 19 changed files with 2,304 additions and 82 deletions.
22 changes: 22 additions & 0 deletions README.md
@@ -401,6 +401,28 @@ results = mteb.load_results(models=models, tasks=tasks)
df = results_to_dataframe(results)
```

</details>


<details>
<summary> Annotate Contamination in the training data of a model </summary>

### Annotate Contamination

Have you found contamination in the training data of a model? Please let us know, either by opening an issue or, ideally, by submitting a PR
annotating the training datasets of the model:

```py
model_w_contamination = ModelMeta(
    name="model-with-contamination",
    ...
    training_datasets={
        "ArguAna":  # name of dataset within MTEB
            ["test"],  # the splits that have been trained on
    },
    ...
)
```


</details>

<details>
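As a rough illustration of how such a contamination annotation might be consumed downstream, the sketch below checks a model's `training_datasets` annotation against a benchmark's task names and reports the overlapping splits. The helper `flag_contaminated_tasks` is hypothetical and not part of the mteb API; it only demonstrates the shape of the data.

```py
# Hypothetical helper (not part of mteb): report tasks whose data a model may
# have seen, based on the ModelMeta.training_datasets annotation shown above.
def flag_contaminated_tasks(
    training_datasets: dict[str, list[str]] | None,
    benchmark_task_names: list[str],
) -> dict[str, list[str]]:
    if not training_datasets:
        return {}
    return {
        task: splits
        for task, splits in training_datasets.items()
        if task in benchmark_task_names
    }


overlap = flag_contaminated_tasks({"ArguAna": ["test"]}, ["ArguAna", "SciFact"])
print(overlap)  # {'ArguAna': ['test']}
```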
5 changes: 3 additions & 2 deletions docs/tasks.md

Large diffs are not rendered by default.

6 changes: 3 additions & 3 deletions mteb/evaluation/evaluators/RetrievalEvaluator.py
@@ -483,10 +483,10 @@ def __call__(
if self.is_cross_encoder:
return self.retriever.search_cross_encoder(corpus, queries, self.top_k)
elif (
- hasattr(self.retriever.model, "mteb_model_meta")
- and self.retriever.model.mteb_model_meta.name == "bm25s"
+ hasattr(self.retriever.model.model, "mteb_model_meta")
+ and self.retriever.model.model.mteb_model_meta.name == "bm25s"
):
- return self.retriever.model.search(
+ return self.retriever.model.model.search(
corpus,
queries,
self.top_k,
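The extra `.model` hop above suggests that `self.retriever.model` is itself a wrapper whose inner object carries `mteb_model_meta` and the `search` method. A minimal sketch of that assumed nesting (class names are illustrative, not mteb's actual classes):

```py
class InnerModel:
    """Stands in for the wrapped model that carries the metadata."""

    def __init__(self, mteb_model_meta):
        self.mteb_model_meta = mteb_model_meta

    def search(self, corpus, queries, top_k, **kwargs):
        ...  # model-specific search, e.g. bm25s


class SearchWrapper:
    """Stands in for `retriever.model`: it holds the inner model one level deeper."""

    def __init__(self, model: InnerModel):
        self.model = model


# With this nesting, the metadata is reached via retriever.model.model.mteb_model_meta,
# which matches the updated attribute access in the diff above.
```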
16 changes: 11 additions & 5 deletions mteb/leaderboard/app.py
@@ -9,7 +9,7 @@

import mteb
from mteb.caching import json_cache
- from mteb.leaderboard.figures import performance_size_plot
+ from mteb.leaderboard.figures import performance_size_plot, radar_chart
from mteb.leaderboard.table import scores_to_tables


@@ -218,10 +218,16 @@ def update_task_info(task_names: str) -> gr.DataFrame:
)
citation = gr.Markdown(update_citation, inputs=[benchmark_select])
with gr.Column():
- plot = gr.Plot(performance_size_plot, inputs=[summary_table])
- gr.Markdown(
-     "*We only display models that have been run on all tasks in the benchmark*"
- )
+ with gr.Tab("Performance-Size Plot"):
+     plot = gr.Plot(performance_size_plot, inputs=[summary_table])
+     gr.Markdown(
+         "*We only display models that have been run on all tasks in the benchmark*"
+     )
+ with gr.Tab("Top 5 Radar Chart"):
+     radar_plot = gr.Plot(radar_chart, inputs=[summary_table])
+     gr.Markdown(
+         "*We only display models that have been run on all task types in the benchmark*"
+     )
with gr.Tab("Summary"):
summary_table.render()
with gr.Tab("Performance per task"):
89 changes: 89 additions & 0 deletions mteb/leaderboard/figures.py
@@ -97,3 +97,92 @@ def performance_size_plot(df: pd.DataFrame) -> go.Figure:
margin=dict(b=20, t=10, l=20, r=10), # noqa
)
return fig


TOP_N = 5
task_types = [
"BitextMining",
"Classification",
"MultilabelClassification",
"Clustering",
"PairClassification",
"Reranking",
"Retrieval",
"STS",
"Summarization",
# "InstructionRetrieval",
# Not displayed because the scores are negative,
# which doesn't work well with the radar chart.
"Speed",
]

line_colors = [
"#EE4266",
"#00a6ed",
"#ECA72C",
"#B42318",
"#3CBBB1",
]
fill_colors = [
"rgba(238,66,102,0.2)",
"rgba(0,166,237,0.2)",
"rgba(236,167,44,0.2)",
"rgba(180,35,24,0.2)",
"rgba(60,187,177,0.2)",
]


def radar_chart(df: pd.DataFrame) -> go.Figure:
df = df.copy()
df["Model"] = df["Model"].map(parse_model_name)
# Remove whitespace
task_type_columns = [
column for column in df.columns if "".join(column.split()) in task_types
]
df = df[["Model", *task_type_columns]].set_index("Model")
df = df.replace("", np.nan)
df = df.dropna()
df = df.head(TOP_N)
df = df.iloc[::-1]
fig = go.Figure()
for i, (model_name, row) in enumerate(df.iterrows()):
fig.add_trace(
go.Scatterpolar(
name=model_name,
r=[row[task_type] for task_type in task_type_columns]
+ [row[task_type_columns[0]]],
theta=task_type_columns + [task_type_columns[0]],
showlegend=True,
mode="lines",
line=dict(width=2, color=line_colors[i]),
fill="toself",
fillcolor=fill_colors[i],
)
)
fig.update_layout(
font=dict(size=16, color="black"), # noqa
template="plotly_white",
polar=dict(
radialaxis=dict(
visible=True,
gridcolor="black",
linecolor="rgba(0,0,0,0)",
gridwidth=1,
showticklabels=False,
ticks="",
),
angularaxis=dict(
gridcolor="black", gridwidth=1.5, linecolor="rgba(0,0,0,0)"
),
),
legend=dict(
orientation="h",
yanchor="bottom",
y=-0.6,
xanchor="left",
x=-0.05,
entrywidthmode="fraction",
entrywidth=1 / 5,
),
)
return fig
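A hedged usage sketch for the new `radar_chart` helper. It assumes the input frame has a `Model` column plus one numeric column per task type, which is my reading of the code above rather than a documented contract, and that plain model-name strings pass through `parse_model_name` unchanged; in the leaderboard itself the frame comes from the summary table.

```py
import pandas as pd

from mteb.leaderboard.figures import radar_chart

# Illustrative scores only; real values come from the leaderboard summary table.
df = pd.DataFrame(
    {
        "Model": ["model-a", "model-b"],
        "Classification": [71.2, 69.8],
        "Retrieval": [55.4, 58.1],
        "STS": [80.3, 79.9],
    }
)

fig = radar_chart(df)
fig.show()
```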
23 changes: 9 additions & 14 deletions mteb/leaderboard/table.py
@@ -80,9 +80,9 @@ def get_means_per_types(df: pd.DataFrame) -> pd.DataFrame:
task_names_per_type = defaultdict(list)
for task_name, task_type in zip(df["task_name"], df["task_type"]):
task_names_per_type[task_type].append(task_name)
- groups = df.groupby(["model_name", "model_revision"])
+ groups = df.groupby("model_name")
records = []
- for (model_name, model_revision), group_data in groups:
+ for (model_name), group_data in groups:
name_to_score = dict(zip(group_data["task_name"], group_data["score"]))
for task_type, task_names in task_names_per_type.items():
type_mean = np.mean(
@@ -91,7 +91,6 @@ def get_means_per_types(df: pd.DataFrame) -> pd.DataFrame:
records.append(
dict( # noqa
model_name=model_name,
- model_revision=model_revision,
task_type=task_type,
score=type_mean,
)
@@ -125,24 +124,23 @@ def scores_to_tables(
)
mean_per_type = get_means_per_types(data)
mean_per_type = mean_per_type.pivot(
index=["model_name", "model_revision"], columns="task_type", values="score"
index="model_name", columns="task_type", values="score"
)
mean_per_type.columns = [
split_on_capital(column) for column in mean_per_type.columns
]
- per_task = data.pivot(
-     index=["model_name", "model_revision"], columns="task_name", values="score"
- )
+ per_task = data.pivot(index="model_name", columns="task_name", values="score")
to_remove = per_task.isna().all(axis="columns")
if search_query:
names = per_task.index.get_level_values("model_name")
names = pd.Series(names, index=per_task.index)
to_remove |= ~names.str.contains(search_query, regex=True)
+ models_to_remove = list(per_task[to_remove].index)
typed_mean = mean_per_type.mean(skipna=False, axis=1)
overall_mean = per_task.mean(skipna=False, axis=1)
joint_table = mean_per_type.copy()
- per_task = per_task[~to_remove]
- joint_table = joint_table[~to_remove]
+ per_task = per_task.drop(models_to_remove, axis=0)
+ joint_table = joint_table.drop(models_to_remove, axis=0)
joint_table.insert(0, "mean", overall_mean)
joint_table.insert(1, "mean_by_task_type", typed_mean)
joint_table["borda_rank"] = get_borda_rank(per_task)
@@ -166,10 +164,7 @@ def scores_to_tables(
model_metas.map(lambda m: format_n_parameters(m.n_parameters)),
)
joint_table = joint_table.sort_values("borda_rank", ascending=True)
- per_task = per_task.loc[
-     joint_table.set_index(["model_name", "model_revision"]).index
- ]
- joint_table = joint_table.drop(columns=["model_revision"])
+ per_task = per_task.loc[joint_table.set_index("model_name").index]
# Removing HF organization from model
joint_table["model_name"] = joint_table["model_name"].map(
lambda name: name.split("/")[-1]
@@ -189,7 +184,7 @@ def scores_to_tables(
"mean": "Mean (Task)",
}
)
- per_task = per_task.reset_index().drop(columns=["model_revision"])
+ per_task = per_task.reset_index()
per_task["model_name"] = per_task["model_name"].map(
lambda name: name.split("/")[-1]
)
7 changes: 4 additions & 3 deletions mteb/model_meta.py
@@ -75,8 +75,9 @@ class ModelMeta(BaseModel):
in the Latin script.
use_instructions: Whether the model uses instructions, e.g. for prompt-based models. This also includes models that require a specific format for
input such as "query: {document}" or "passage: {document}".
- zero_shot_benchmarks: A list of benchmarks on which the model has been evaluated in a zero-shot setting. By default we assume that all models
- are evaluated non-zero-shot unless specified otherwise.
+ training_datasets: A dictionary of the datasets that the model was trained on. Names should be given as they appear in `mteb`, for example
+ {"ArguAna": ["test"]} if the model was trained on the ArguAna test set. This field is used to determine whether a model generalizes zero-shot to
+ a benchmark, as well as to mark dataset contamination.
adapted_from: Name of the model from which this model is adapted from. For quantizations, fine-tunes, long doc extensions, etc.
superseded_by: Name of the model that supersedes this model, e.g. nvidia/NV-Embed-v2 supersedes v1.
"""
@@ -100,7 +101,7 @@ class ModelMeta(BaseModel):
reference: STR_URL | None = None
similarity_fn_name: DISTANCE_METRICS | None = None
use_instructions: bool | None = None
- zero_shot_benchmarks: list[str] | None = None
+ training_datasets: dict[str, list[str]] | None = None
adapted_from: str | None = None
superseded_by: str | None = None

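The updated docstring says `training_datasets` is used to decide whether a model generalizes zero-shot to a benchmark. A minimal sketch of that decision, under the assumption that "zero-shot" simply means no benchmark task appears among the annotated training datasets; the helper is illustrative, not mteb's implementation:

```py
def is_zero_shot_on(
    training_datasets: dict[str, list[str]] | None,
    benchmark_task_names: list[str],
) -> bool | None:
    """Return None when no annotation exists: absence of an annotation is not
    evidence that the model was evaluated zero-shot."""
    if training_datasets is None:
        return None
    return not any(task in training_datasets for task in benchmark_task_names)


print(is_zero_shot_on({"ArguAna": ["test"]}, ["ArguAna", "SciFact"]))  # False
print(is_zero_shot_on({"MSMARCO": ["train"]}, ["ArguAna", "SciFact"]))  # True
```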