From bd2df889b7e19d98c7d6070c18c7e4c326cd8bd7 Mon Sep 17 00:00:00 2001 From: Kenneth Enevoldsen Date: Fri, 9 Aug 2024 13:52:24 +0200 Subject: [PATCH 1/8] Added integration tests for MTEB --- .github/workflows/test.yml | 38 ++++++++++++++++++++++++++++++++++++++ .gitignore | 8 ++++++++ makefile | 8 ++++++++ tests/results/results | 1 + tests/test_load_results.py | 26 ++++++++++++++++++++++++++ 5 files changed, 81 insertions(+) create mode 100644 .github/workflows/test.yml create mode 100644 .gitignore create mode 100644 makefile create mode 160000 tests/results/results create mode 100644 tests/test_load_results.py diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml new file mode 100644 index 000000000..7aa03920a --- /dev/null +++ b/.github/workflows/test.yml @@ -0,0 +1,38 @@ +# This workflow will: +# 1) install Python dependencies +# 2) run make test + + +name: Test +on: + push: + branches: [main] + pull_request: + +jobs: + test: + runs-on: ${{ matrix.os }} + strategy: + fail-fast: false + matrix: + os: [ubuntu-latest] #, macos-latest, windows-latest] + python-version: ["3.8"] # , "3.9", "3.10"] + + steps: + - uses: actions/checkout@v3 + + - name: Setup Python ${{ matrix.python-version }} + uses: actions/setup-python@v4 + with: + python-version: ${{ matrix.python-version }} + cache: "pip" + + - name: Install dependencies + shell: bash + run: | + make install-for-tests + + - name: Run tests + shell: bash + run: | + make test diff --git a/.gitignore b/.gitignore new file mode 100644 index 000000000..5de8130ff --- /dev/null +++ b/.gitignore @@ -0,0 +1,8 @@ +# python +__pycache__ + +# vscode +.vscode/ + +# tmp files +tmp.py diff --git a/makefile b/makefile new file mode 100644 index 000000000..241394463 --- /dev/null +++ b/makefile @@ -0,0 +1,8 @@ +install-for-tests: + @echo "--- Installing dependencies for tests ---" + # just use the dev dependencies from mteb to keep everything compatible + pip install mteb[dev] + +test: + @echo "--- Running tests ---" + pytest \ No newline at end of file diff --git a/tests/results/results b/tests/results/results new file mode 160000 index 000000000..0da15454b --- /dev/null +++ b/tests/results/results @@ -0,0 +1 @@ +Subproject commit 0da15454bdfbf1e9069adcb9f5a5f29c4d05223a diff --git a/tests/test_load_results.py b/tests/test_load_results.py new file mode 100644 index 000000000..50e296051 --- /dev/null +++ b/tests/test_load_results.py @@ -0,0 +1,26 @@ +import os +from pathlib import Path + +import mteb + + +def test_load_results(): + """Ensures that files can be loaded using MTEB""" + tests_path = Path(__file__).parent / "results" + + os.environ["MTEB_CACHE"] = str(tests_path) + + results = mteb.load_results(download_latest=False) + + assert isinstance(results, dict) + for model in results: + assert isinstance(results[model], dict) + for revision in results[model]: + assert isinstance(results[model][revision], list) + for result in results[model][revision]: + assert isinstance(result, mteb.MTEBResults) + + known_model = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2" + known_revision = "bf3bf13ab40c3157080a7ab344c831b9ad18b5eb" + assert known_model in results + assert known_revision in results[known_model] From 9f5506749d9908db8a3482b57f52a36f723d34a8 Mon Sep 17 00:00:00 2001 From: Kenneth Enevoldsen Date: Fri, 9 Aug 2024 14:00:52 +0200 Subject: [PATCH 2/8] unsure tests uses results folder --- .../model_metaInstruct.json | 1 - .../BrightRetrieval.json | 393 ------------------ results/results | 1 + tests/test_load_results.py | 2 +- 4 files changed, 2 insertions(+), 395 deletions(-) delete mode 100644 results/all-mpnet-base-v2/84f2bcc00d77236f9e89c8a360a00fb1139bf47d/model_metaInstruct.json delete mode 100644 results/all-mpnet-base-v2/no_revision_available/BrightRetrieval.json create mode 160000 results/results diff --git a/results/all-mpnet-base-v2/84f2bcc00d77236f9e89c8a360a00fb1139bf47d/model_metaInstruct.json b/results/all-mpnet-base-v2/84f2bcc00d77236f9e89c8a360a00fb1139bf47d/model_metaInstruct.json deleted file mode 100644 index c61f2fe7e..000000000 --- a/results/all-mpnet-base-v2/84f2bcc00d77236f9e89c8a360a00fb1139bf47d/model_metaInstruct.json +++ /dev/null @@ -1 +0,0 @@ -{"name": "sentence-transformers/all-mpnet-base-v2", "revision": "84f2bcc00d77236f9e89c8a360a00fb1139bf47d", "release_date": null, "languages": [], "n_parameters": null, "memory_usage": null, "max_tokens": null, "embed_dim": null, "license": null, "open_source": null, "similarity_fn_name": null, "framework": ["Sentence Transformers"], "loader": null} \ No newline at end of file diff --git a/results/all-mpnet-base-v2/no_revision_available/BrightRetrieval.json b/results/all-mpnet-base-v2/no_revision_available/BrightRetrieval.json deleted file mode 100644 index 701082f7f..000000000 --- a/results/all-mpnet-base-v2/no_revision_available/BrightRetrieval.json +++ /dev/null @@ -1,393 +0,0 @@ -{ - "dataset_revision": "a75a0eb", - "mteb_version": "1.12.79", - "scores": { - "standard": [ - { - "hf_subset": "robotics", - "languages": [ - "eng-Latn" - ], - "main_score": 0.08362, - "ndcg_at_1": 0.06931, - "ndcg_at_5": 0.07126, - "ndcg_at_10": 0.08362, - "ndcg_at_25": 0.10845, - "ndcg_at_50": 0.13101, - "ndcg_at_100": 0.14196, - "map_at_1": 0.02082, - "map_at_5": 0.03504, - "map_at_10": 0.04282, - "map_at_25": 0.05094, - "map_at_50": 0.05509, - "map_at_100": 0.05655, - "Recall_at_1": 0.02082, - "Recall_at_5": 0.06344, - "Recall_at_10": 0.11005, - "Recall_at_25": 0.20497, - "Recall_at_50": 0.29337, - "Recall_at_100": 0.33301, - "precision_at_1": 0.06931, - "precision_at_5": 0.05347, - "precision_at_10": 0.04455, - "precision_at_25": 0.03168, - "precision_at_50": 0.02297, - "precision_at_100": 0.01436, - "mrr": 0.13298 - }, - { - "hf_subset": "psychology", - "languages": [ - "eng-Latn" - ], - "main_score": 0.22626, - "ndcg_at_1": 0.21782, - "ndcg_at_5": 0.22305, - "ndcg_at_10": 0.22626, - "ndcg_at_25": 0.24536, - "ndcg_at_50": 0.2681, - "ndcg_at_100": 0.29678, - "map_at_1": 0.0837, - "map_at_5": 0.14288, - "map_at_10": 0.15946, - "map_at_25": 0.17579, - "map_at_50": 0.18426, - "map_at_100": 0.1886, - "Recall_at_1": 0.0837, - "Recall_at_5": 0.19787, - "Recall_at_10": 0.23901, - "Recall_at_25": 0.33513, - "Recall_at_50": 0.41065, - "Recall_at_100": 0.53237, - "precision_at_1": 0.21782, - "precision_at_5": 0.13663, - "precision_at_10": 0.10099, - "precision_at_25": 0.06178, - "precision_at_50": 0.04376, - "precision_at_100": 0.02891, - "mrr": 0.29408 - }, - { - "hf_subset": "leetcode", - "languages": [ - "eng-Latn" - ], - "main_score": 0.26404, - "ndcg_at_1": 0.26761, - "ndcg_at_5": 0.2424, - "ndcg_at_10": 0.26404, - "ndcg_at_25": 0.28704, - "ndcg_at_50": 0.2989, - "ndcg_at_100": 0.31162, - "map_at_1": 0.13826, - "map_at_5": 0.20132, - "map_at_10": 0.21148, - "map_at_25": 0.21764, - "map_at_50": 0.21966, - "map_at_100": 0.22098, - "Recall_at_1": 0.13826, - "Recall_at_5": 0.25587, - "Recall_at_10": 0.3135, - "Recall_at_25": 0.40329, - "Recall_at_50": 0.45599, - "Recall_at_100": 0.52641, - "precision_at_1": 0.26761, - "precision_at_5": 0.10704, - "precision_at_10": 0.06479, - "precision_at_25": 0.03127, - "precision_at_50": 0.01746, - "precision_at_100": 0.00986, - "mrr": 0.3321 - }, - { - "hf_subset": "biology", - "languages": [ - "eng-Latn" - ], - "main_score": 0.15523, - "ndcg_at_1": 0.15534, - "ndcg_at_5": 0.13501, - "ndcg_at_10": 0.15523, - "ndcg_at_25": 0.2024, - "ndcg_at_50": 0.22617, - "ndcg_at_100": 0.25811, - "map_at_1": 0.03541, - "map_at_5": 0.08866, - "map_at_10": 0.10351, - "map_at_25": 0.11985, - "map_at_50": 0.12583, - "map_at_100": 0.13017, - "Recall_at_1": 0.03541, - "Recall_at_5": 0.12522, - "Recall_at_10": 0.18074, - "Recall_at_25": 0.31393, - "Recall_at_50": 0.39392, - "Recall_at_100": 0.53202, - "precision_at_1": 0.15534, - "precision_at_5": 0.09515, - "precision_at_10": 0.07184, - "precision_at_25": 0.04893, - "precision_at_50": 0.03146, - "precision_at_100": 0.02058, - "mrr": 0.2375 - }, - { - "hf_subset": "theoremqa_questions", - "languages": [ - "eng-Latn" - ], - "main_score": 0.18494, - "ndcg_at_1": 0.18537, - "ndcg_at_5": 0.18113, - "ndcg_at_10": 0.18494, - "ndcg_at_25": 0.19987, - "ndcg_at_50": 0.20682, - "ndcg_at_100": 0.21592, - "map_at_1": 0.11016, - "map_at_5": 0.15883, - "map_at_10": 0.16069, - "map_at_25": 0.16496, - "map_at_50": 0.16595, - "map_at_100": 0.16696, - "Recall_at_1": 0.11016, - "Recall_at_5": 0.19919, - "Recall_at_10": 0.20772, - "Recall_at_25": 0.25772, - "Recall_at_50": 0.28972, - "Recall_at_100": 0.33728, - "precision_at_1": 0.18537, - "precision_at_5": 0.07122, - "precision_at_10": 0.03805, - "precision_at_25": 0.01951, - "precision_at_50": 0.01083, - "precision_at_100": 0.00629, - "mrr": 0.21758 - }, - { - "hf_subset": "economics", - "languages": [ - "eng-Latn" - ], - "main_score": 0.16636, - "ndcg_at_1": 0.16505, - "ndcg_at_5": 0.14591, - "ndcg_at_10": 0.16636, - "ndcg_at_25": 0.18945, - "ndcg_at_50": 0.20749, - "ndcg_at_100": 0.23965, - "map_at_1": 0.05228, - "map_at_5": 0.07664, - "map_at_10": 0.09459, - "map_at_25": 0.11053, - "map_at_50": 0.11815, - "map_at_100": 0.12454, - "Recall_at_1": 0.05228, - "Recall_at_5": 0.10281, - "Recall_at_10": 0.18708, - "Recall_at_25": 0.27945, - "Recall_at_50": 0.34849, - "Recall_at_100": 0.48289, - "precision_at_1": 0.16505, - "precision_at_5": 0.0932, - "precision_at_10": 0.0835, - "precision_at_25": 0.05981, - "precision_at_50": 0.04369, - "precision_at_100": 0.03087, - "mrr": 0.22919 - }, - { - "hf_subset": "stackoverflow", - "languages": [ - "eng-Latn" - ], - "main_score": 0.09484, - "ndcg_at_1": 0.06838, - "ndcg_at_5": 0.06862, - "ndcg_at_10": 0.09484, - "ndcg_at_25": 0.12448, - "ndcg_at_50": 0.1491, - "ndcg_at_100": 0.16954, - "map_at_1": 0.02429, - "map_at_5": 0.04045, - "map_at_10": 0.05131, - "map_at_25": 0.06022, - "map_at_50": 0.06623, - "map_at_100": 0.06927, - "Recall_at_1": 0.02429, - "Recall_at_5": 0.06315, - "Recall_at_10": 0.12689, - "Recall_at_25": 0.2269, - "Recall_at_50": 0.30656, - "Recall_at_100": 0.37871, - "precision_at_1": 0.06838, - "precision_at_5": 0.04274, - "precision_at_10": 0.04957, - "precision_at_25": 0.03692, - "precision_at_50": 0.02786, - "precision_at_100": 0.01923, - "mrr": 0.14046 - }, - { - "hf_subset": "pony", - "languages": [ - "eng-Latn" - ], - "main_score": 0.06946, - "ndcg_at_1": 0.07143, - "ndcg_at_5": 0.06671, - "ndcg_at_10": 0.06946, - "ndcg_at_25": 0.06597, - "ndcg_at_50": 0.09255, - "ndcg_at_100": 0.12776, - "map_at_1": 0.00385, - "map_at_5": 0.00879, - "map_at_10": 0.01334, - "map_at_25": 0.01756, - "map_at_50": 0.02258, - "map_at_100": 0.02717, - "Recall_at_1": 0.00385, - "Recall_at_5": 0.01548, - "Recall_at_10": 0.035, - "Recall_at_25": 0.06334, - "Recall_at_50": 0.12144, - "Recall_at_100": 0.19613, - "precision_at_1": 0.07143, - "precision_at_5": 0.06429, - "precision_at_10": 0.06964, - "precision_at_25": 0.05357, - "precision_at_50": 0.04714, - "precision_at_100": 0.04036, - "mrr": 0.17959 - }, - { - "hf_subset": "earth_science", - "languages": [ - "eng-Latn" - ], - "main_score": 0.20109, - "ndcg_at_1": 0.18966, - "ndcg_at_5": 0.17368, - "ndcg_at_10": 0.20109, - "ndcg_at_25": 0.23288, - "ndcg_at_50": 0.2666, - "ndcg_at_100": 0.28534, - "map_at_1": 0.08197, - "map_at_5": 0.12566, - "map_at_10": 0.14143, - "map_at_25": 0.15259, - "map_at_50": 0.15956, - "map_at_100": 0.1627, - "Recall_at_1": 0.08197, - "Recall_at_5": 0.17426, - "Recall_at_10": 0.25421, - "Recall_at_25": 0.33982, - "Recall_at_50": 0.45312, - "Recall_at_100": 0.52257, - "precision_at_1": 0.18966, - "precision_at_5": 0.09828, - "precision_at_10": 0.07414, - "precision_at_25": 0.04724, - "precision_at_50": 0.03466, - "precision_at_100": 0.02129, - "mrr": 0.275 - }, - { - "hf_subset": "theoremqa_theorems", - "languages": [ - "eng-Latn" - ], - "main_score": 0.12383, - "ndcg_at_1": 0.04615, - "ndcg_at_5": 0.09592, - "ndcg_at_10": 0.12383, - "ndcg_at_25": 0.14551, - "ndcg_at_50": 0.15875, - "ndcg_at_100": 0.18178, - "map_at_1": 0.03077, - "map_at_5": 0.07568, - "map_at_10": 0.08844, - "map_at_25": 0.09587, - "map_at_50": 0.09811, - "map_at_100": 0.10012, - "Recall_at_1": 0.03077, - "Recall_at_5": 0.13333, - "Recall_at_10": 0.21538, - "Recall_at_25": 0.28051, - "Recall_at_50": 0.34513, - "Recall_at_100": 0.46744, - "precision_at_1": 0.04615, - "precision_at_5": 0.04615, - "precision_at_10": 0.03385, - "precision_at_25": 0.02031, - "precision_at_50": 0.01169, - "precision_at_100": 0.008, - "mrr": 0.11205 - }, - { - "hf_subset": "sustainable_living", - "languages": [ - "eng-Latn" - ], - "main_score": 0.15336, - "ndcg_at_1": 0.12037, - "ndcg_at_5": 0.13605, - "ndcg_at_10": 0.15336, - "ndcg_at_25": 0.19799, - "ndcg_at_50": 0.22737, - "ndcg_at_100": 0.25441, - "map_at_1": 0.04805, - "map_at_5": 0.08993, - "map_at_10": 0.10352, - "map_at_25": 0.12019, - "map_at_50": 0.12823, - "map_at_100": 0.13246, - "Recall_at_1": 0.04805, - "Recall_at_5": 0.1437, - "Recall_at_10": 0.18752, - "Recall_at_25": 0.31095, - "Recall_at_50": 0.41411, - "Recall_at_100": 0.52431, - "precision_at_1": 0.12037, - "precision_at_5": 0.08519, - "precision_at_10": 0.06759, - "precision_at_25": 0.05074, - "precision_at_50": 0.03537, - "precision_at_100": 0.02333, - "mrr": 0.21499 - }, - { - "hf_subset": "aops", - "languages": [ - "eng-Latn" - ], - "main_score": 0.05325, - "ndcg_at_1": 0.05405, - "ndcg_at_5": 0.05143, - "ndcg_at_10": 0.05325, - "ndcg_at_25": 0.06836, - "ndcg_at_50": 0.08239, - "ndcg_at_100": 0.09377, - "map_at_1": 0.00928, - "map_at_5": 0.03117, - "map_at_10": 0.03654, - "map_at_25": 0.04088, - "map_at_50": 0.04328, - "map_at_100": 0.04486, - "Recall_at_1": 0.00928, - "Recall_at_5": 0.04231, - "Recall_at_10": 0.05792, - "Recall_at_25": 0.09905, - "Recall_at_50": 0.14602, - "Recall_at_100": 0.1908, - "precision_at_1": 0.05405, - "precision_at_5": 0.04324, - "precision_at_10": 0.03153, - "precision_at_25": 0.0191, - "precision_at_50": 0.01369, - "precision_at_100": 0.00874, - "mrr": 0.08466 - } - ] - }, - "task_name": "BrightRetrieval" -} \ No newline at end of file diff --git a/results/results b/results/results new file mode 160000 index 000000000..0da15454b --- /dev/null +++ b/results/results @@ -0,0 +1 @@ +Subproject commit 0da15454bdfbf1e9069adcb9f5a5f29c4d05223a diff --git a/tests/test_load_results.py b/tests/test_load_results.py index 50e296051..f79dcbaec 100644 --- a/tests/test_load_results.py +++ b/tests/test_load_results.py @@ -6,7 +6,7 @@ def test_load_results(): """Ensures that files can be loaded using MTEB""" - tests_path = Path(__file__).parent / "results" + tests_path = Path(__file__).parent.parent / "results" os.environ["MTEB_CACHE"] = str(tests_path) From 6fd556115b198fb5001575dee36ab853920b80a8 Mon Sep 17 00:00:00 2001 From: Kenneth Enevoldsen Date: Fri, 9 Aug 2024 14:03:28 +0200 Subject: [PATCH 3/8] remove file --- tests/results/results | 1 - 1 file changed, 1 deletion(-) delete mode 160000 tests/results/results diff --git a/tests/results/results b/tests/results/results deleted file mode 160000 index 0da15454b..000000000 --- a/tests/results/results +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 0da15454bdfbf1e9069adcb9f5a5f29c4d05223a From e1bbb85054f7c5d4d53dc7367eeece8bb98fc168 Mon Sep 17 00:00:00 2001 From: Kenneth Enevoldsen Date: Fri, 9 Aug 2024 14:05:49 +0200 Subject: [PATCH 4/8] remove repo --- results/results | 1 - 1 file changed, 1 deletion(-) delete mode 160000 results/results diff --git a/results/results b/results/results deleted file mode 160000 index 0da15454b..000000000 --- a/results/results +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 0da15454bdfbf1e9069adcb9f5a5f29c4d05223a From 1c9228af860b529e9bd1db603e4f2db2f6eeb8fe Mon Sep 17 00:00:00 2001 From: Kenneth Enevoldsen Date: Fri, 9 Aug 2024 14:24:33 +0200 Subject: [PATCH 5/8] Added mock cache dir --- tests/mock_cache_dir/readme.md | 1 + tests/mock_cache_dir/results/results | 1 + tests/test_load_results.py | 2 +- 3 files changed, 3 insertions(+), 1 deletion(-) create mode 100644 tests/mock_cache_dir/readme.md create mode 120000 tests/mock_cache_dir/results/results diff --git a/tests/mock_cache_dir/readme.md b/tests/mock_cache_dir/readme.md new file mode 100644 index 000000000..0742fa8f4 --- /dev/null +++ b/tests/mock_cache_dir/readme.md @@ -0,0 +1 @@ +this is just a mock cache directory, which implements a folder with a link to the results folder in root. \ No newline at end of file diff --git a/tests/mock_cache_dir/results/results b/tests/mock_cache_dir/results/results new file mode 120000 index 000000000..016a66419 --- /dev/null +++ b/tests/mock_cache_dir/results/results @@ -0,0 +1 @@ +../../../results \ No newline at end of file diff --git a/tests/test_load_results.py b/tests/test_load_results.py index f79dcbaec..994fc50d5 100644 --- a/tests/test_load_results.py +++ b/tests/test_load_results.py @@ -6,7 +6,7 @@ def test_load_results(): """Ensures that files can be loaded using MTEB""" - tests_path = Path(__file__).parent.parent / "results" + tests_path = Path(__file__).parent / "mock_cache_dir" os.environ["MTEB_CACHE"] = str(tests_path) From a57170a1741c64ba1cd4acbf8086dcdecbf6bf8c Mon Sep 17 00:00:00 2001 From: Kenneth Enevoldsen Date: Fri, 9 Aug 2024 14:28:37 +0200 Subject: [PATCH 6/8] Added a mock cache to make tests more clear --- makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/makefile b/makefile index 241394463..7f004dfc7 100644 --- a/makefile +++ b/makefile @@ -1,7 +1,7 @@ install-for-tests: @echo "--- Installing dependencies for tests ---" # just use the dev dependencies from mteb to keep everything compatible - pip install mteb[dev] + pip install mteb[dev]>=1.13.0 test: @echo "--- Running tests ---" From 73bda695407ab20c80dc45f5f89cebbaeabf5ca9 Mon Sep 17 00:00:00 2001 From: Kenneth Enevoldsen Date: Fri, 9 Aug 2024 14:51:32 +0200 Subject: [PATCH 7/8] Fixes makefile to also work for zsh --- makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/makefile b/makefile index 7f004dfc7..e19d7f403 100644 --- a/makefile +++ b/makefile @@ -1,7 +1,7 @@ install-for-tests: @echo "--- Installing dependencies for tests ---" # just use the dev dependencies from mteb to keep everything compatible - pip install mteb[dev]>=1.13.0 + pip install "mteb[dev]>=1.13.0" test: @echo "--- Running tests ---" From ef0e8c326c64d37665aa6901c912dbe7a2907007 Mon Sep 17 00:00:00 2001 From: Kenneth Enevoldsen Date: Fri, 9 Aug 2024 14:59:46 +0200 Subject: [PATCH 8/8] avoid caching --- .github/workflows/test.yml | 3 --- 1 file changed, 3 deletions(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 7aa03920a..ba8ff8d13 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -23,9 +23,6 @@ jobs: - name: Setup Python ${{ matrix.python-version }} uses: actions/setup-python@v4 - with: - python-version: ${{ matrix.python-version }} - cache: "pip" - name: Install dependencies shell: bash