-
Notifications
You must be signed in to change notification settings - Fork 283
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge branch 'main' of https://github.com/embeddings-benchmark/mteb
- Loading branch information
Showing
67 changed files
with
2,489 additions
and
259 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,41 @@ | ||
# GitHub action for the task table generation.
#
# On every push to main this workflow runs docs/create_tasks_table.py and, if
# that changed anything in the working tree, commits docs/tasks.md and pushes it.

name: documentation

on:
  push:
    branches: [main]

jobs:
  create-table:
    runs-on: ubuntu-latest
    steps:
      # Check out using the RELEASE secret as the token.
      # NOTE(review): presumably this is what authorises the `git push` in the
      # last step — confirm against the repository's secret/branch settings.
      - uses: actions/checkout@v3
        with:
          token: ${{ secrets.RELEASE }}

      # Python 3.10 with pip caching enabled.
      - uses: actions/setup-python@v4
        with:
          python-version: "3.10"
          cache: "pip"

      - name: Install dependencies
        run: |
          make install
      # Runs the table generation script from the docs directory.
      - name: Create table
        run: python docs/create_tasks_table.py

      - name: Push table
        run: |
          git config --global user.email "github-actions[bot]@users.noreply.github.com"
          git config --global user.name "github-actions[bot]"
          # Check if changes exist
          # (`git diff --quiet` exits 0 when the working tree has no changes)
          if git diff --quiet; then
            echo "No changes detected"
          else
            # Only the generated table is staged and committed.
            git add docs/tasks.md
            git commit -m "Update tasks table"
            git push
          fi
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,15 @@ | ||
## Adding a new Leaderboard tab | ||
|
||
The MTEB Leaderboard is available [here](https://huggingface.co/spaces/mteb/leaderboard) and we love new leaderboard tabs. To add a new leaderboard tab: | ||
|
||
1. Open a PR in https://hf.co/datasets/mteb/results with: | ||
- All results added in existing model folders or new folders | ||
- Updated paths.json (see snippet results.py) | ||
- If adding any new models, their names added to results.py | ||
- If you have access to all models you are adding, you can also [add results via the metadata](https://github.com/embeddings-benchmark/mteb/blob/main/docs/adding_a_model.md) for all of them / some of them | ||
2. Open a PR at https://huggingface.co/spaces/mteb/leaderboard modifying app.py to add your tab: | ||
- Add any new models & their specs to the global lists | ||
- Add your tab, credits etc to where the other tabs are defined | ||
- If you're adding new results to existing models, remove those models from `EXTERNAL_MODEL_RESULTS.json` such that they can be reloaded with the new results and are not cached. | ||
- You may also have to uncomment `, download_mode='force_redownload', verification_mode="no_checks")` where the datasets are loaded to experiment locally without caching of results | ||
- Test that it runs and works locally as you desire with `python app.py`; **please add screenshots to the PR** |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,85 @@ | ||
from __future__ import annotations | ||
|
||
import re | ||
from pathlib import Path | ||
|
||
import mteb | ||
|
||
|
||
def author_from_bibtex(bibtex: str | None) -> str: | ||
"""Create (Authors, Year) from bibtex entry (author = {Authors}, year = {Year})""" | ||
if bibtex is None: | ||
return "" | ||
# get authors from bibtex (author = {Authors} or author={Authors}) | ||
authors = re.search(r"author\s*=\s*{([^}]*)}", bibtex) | ||
if authors is None: | ||
return "" | ||
authors = authors.group(1) | ||
authors = [a.split(", ") for a in authors.split(" and ")] | ||
author_str_w_et_al = ( | ||
authors[0][0] + " et al." if len(authors[0]) > 1 else authors[0][0] | ||
) | ||
# replace any newline characters | ||
author_str_w_et_al = author_str_w_et_al.replace("\n", " ") | ||
year = re.search(r"year\s*=\s*{([^}]*)}", bibtex) | ||
if year is None: | ||
return "" | ||
year_str = year.group(1) | ||
return f" ({author_str_w_et_al}, {year_str})" | ||
|
||
|
||
def task_to_markdown_row(task: mteb.AbsTask) -> str:
    """Render one task's metadata as a single markdown table row.

    The row has the columns name (linked to the reference when one is set and
    followed by the citation produced by ``author_from_bibtex``), languages,
    type, category, domains, number of samples and average character length.
    Falsy domains, sample counts and lengths are rendered as empty cells.
    """
    meta = task.metadata

    # First cell: task name, as a markdown link when a reference exists.
    if meta.reference:
        name_w_reference = f"[{meta.name}]({meta.reference})"
    else:
        name_w_reference = meta.name

    if meta.domains:
        domains = f"[{', '.join(meta.domains)}]"
    else:
        domains = ""

    n_samples = meta.n_samples or ""
    avg_character_length = meta.avg_character_length or ""

    name_w_reference = name_w_reference + author_from_bibtex(meta.bibtex_citation)

    return f"| {name_w_reference} | {meta.languages} | {meta.type} | {meta.category} | {domains} | {n_samples} | {avg_character_length} |"
|
||
|
||
def create_tasks_table(tasks: list[mteb.AbsTask]) -> str:
    """Build the full markdown tasks table: header, separator, one row per task.

    The result starts with a newline and every row (including the last) is
    terminated by a newline.
    """
    header = """
| Name | Languages | Type | Category | Domains | # Samples | Avg. Length (Char.) |
|------|-----------|------|----------|---------|-----------|---------------------|
"""
    rows = [task_to_markdown_row(task) + "\n" for task in tasks]
    return header + "".join(rows)
|
||
|
||
def insert_table(file_path, table):
    """Insert table between the <!-- TABLE START --> and <!-- TABLE END --> markers.

    The file is rewritten in place: whatever currently sits between the two
    markers is replaced by ``table``; the markers and all other content are
    kept unchanged.

    Args:
        file_path: Path of the markdown file to update.
        table: Text to place between the start and end markers.

    Raises:
        ValueError: If the start marker is missing, or no end marker follows it.
    """
    start = "<!-- TABLE START -->"
    end = "<!-- TABLE END -->"

    with open(file_path, "r", encoding="utf-8") as file:
        md = file.read()

    # str.index raises ValueError when a marker is absent; the end marker is
    # searched only after the start marker so the region is never inverted.
    region_start = md.index(start) + len(start)
    region_end = md.index(end, region_start)

    # Splice by position instead of str.replace: replace() substitutes every
    # occurrence of the old region text in the file, and with an empty region
    # it would insert the table between every character.
    md = md[:region_start] + table + md[region_end:]

    with open(file_path, "w", encoding="utf-8") as file:
        file.write(md)
|
||
|
||
def main():
    """Regenerate the tasks table and write it into ``tasks.md`` next to this script.

    All tasks returned by ``mteb.get_tasks()`` are sorted by their metadata
    name, rendered with ``create_tasks_table`` and spliced into the file with
    ``insert_table``.
    """
    ordered_tasks = sorted(mteb.get_tasks(), key=lambda task: task.metadata.name)
    rendered = create_tasks_table(ordered_tasks)

    # tasks.md lives in the same directory as this script.
    target = Path(__file__).parent / "tasks.md"
    insert_table(target, rendered)


if __name__ == "__main__":
    main()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
{"GitHub": "rbroc", "New dataset": 2} | ||
{"GitHub": "KennethEnevoldsen", "Review PR": 2} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
{"GitHub": "KennethEnevoldsen", "Coordination": 2} | ||
{"GitHub": "isaac-chung", "Review PR": 2} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
{"GitHub": "dipam7", "New dataset": 2} | ||
{"GitHub": "isaac-chung", "Review PR": 2} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
{"GitHub": "Akash190104", "New dataset": 2} | ||
{"GitHub": "isaac-chung", "Review PR": 2} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
{"GitHub": "Akash190104", "New dataset": 2} | ||
{"GitHub": "isaac-chung", "Review PR": 2} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
{"GitHub": "bp-high", "New dataset": 6} | ||
{"GitHub": "isaac-chung", "Review PR": 2} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
{"GitHub": "jaygala24", "New dataset": 30} | ||
{"GitHub": "digantamisra98", "New dataset": 20} | ||
{"GitHub": "KennethEnevoldsen", "Review PR": 2} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
{"GitHub": "isaac-chung", "Review PR": 2} | ||
{"GitHub": "KennethEnevoldsen", "Bug fixes": 3} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
{"GitHub": "jaygala24", "New dataset": 36} | ||
{"GitHub": "digantamisra98", "New dataset": 18} | ||
{"GitHub": "asparius", "Review PR": 2} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
{"GitHub": "isaac-chung", "Review PR": 2} | ||
{"GitHub": "asparius", "New dataset": 6} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
{"GitHub": "Akash190104", "New dataset": 2} | ||
{"GitHub": "asparius", "Review PR": 2} | ||
{"GitHub": "isaac-chung", "Review PR": 2} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
{"GitHub": "isaac-chung", "Review PR": 2} | ||
{"GitHub": "asparius", "New dataset": 6} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.