-
Notifications
You must be signed in to change notification settings - Fork 283
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
5 changed files
with
186 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,99 @@ | ||
import datasets | ||
|
||
from mteb.abstasks import AbsTaskRetrieval, TaskMetadata | ||
import json | ||
|
||
class TurHistQuadRetrieval(AbsTaskRetrieval):
    """Retrieval task derived from the TurHistQuAD question-answering dataset.

    Every question is used as a query. Its context passage and its answer
    text are stored as corpus documents, and both are marked as relevant
    (score 1) for that query.
    """

    metadata = TaskMetadata(
        name="TurHistQuadRetrieval",
        dataset={
            "path": "asparius/TurHistQuAD",
            "revision": "2a2b8ddecf1189f530676244d0751e1d0a569e03",
        },
        description="Question Answering dataset on Ottoman History in Turkish",
        reference="https://github.com/okanvk/Turkish-Reading-Comprehension-Question-Answering-Dataset",
        type="Retrieval",
        category="p2p",
        eval_splits=["test"],
        eval_langs=["tur-Latn"],
        main_score="ndcg_at_10",
        date=("2021-01-01", "2021-10-13"),
        form=["written"],
        task_subtypes=["Question answering"],
        domains=["Encyclopaedic", "Non-fiction", "Academic"],
        license="MIT",
        socioeconomic_status="high",
        annotations_creators="derived",
        dialect=[],
        text_creation="found",
        bibtex_citation="""
@INPROCEEDINGS{9559013,
author={Soygazi, Fatih and Çiftçi, Okan and Kök, Uğurcan and Cengiz, Soner},
booktitle={2021 6th International Conference on Computer Science and Engineering (UBMK)},
title={THQuAD: Turkish Historic Question Answering Dataset for Reading Comprehension},
year={2021},
volume={},
number={},
pages={215-220},
keywords={Computer science;Computational modeling;Neural networks;Knowledge discovery;Information retrieval;Natural language processing;History;question answering;information retrieval;natural language understanding;deep learning;contextualized word embeddings},
doi={10.1109/UBMK52708.2021.9559013}}
""",
        n_samples={"test": 1330, "train": 14221},
        avg_character_length={"train": 1219.37, "test": 1513.83},
    )

    def load_data(self, **kwargs) -> None:
        """Load the dataset and transform it into a retrieval dataset.

        After the call the following attributes are populated, keyed by split:
            self.corpus = Dict[split, Dict[doc_id, Dict[str, str]]]
                doc_id => {"title": "", "text": <context or answer text>}
            self.queries = Dict[split, Dict[query_id, str]]
                query_id => question text
            self.relevant_docs = Dict[split, Dict[query_id, Dict[doc_id, int]]]
                query_id => {doc_id: 1} for the question's context and answer

        Does nothing if ``self.data_loaded`` is already true.
        """
        if self.data_loaded:
            return

        self.dataset = datasets.load_dataset(**self.metadata_dict["dataset"])

        self.corpus = {}
        self.relevant_docs = {}
        self.queries = {}

        for split in self.dataset:
            ds: datasets.Dataset = self.dataset[split]  # type: ignore
            ds = ds.shuffle(seed=42)
            max_samples = min(1024, len(ds))
            ds = ds.select(
                range(max_samples)
            )  # limit the dataset size to make sure the task does not take too long to run

            split_queries = {}
            split_corpus = {}
            split_relevant = {}
            # The text -> id map MUST be per split: ids restart at 0 for every
            # split and each split has its own corpus, so an id remembered from
            # another split would point at a missing or unrelated entry here.
            text2id = {}

            question = ds["question"]
            context = ds["context"]
            # NOTE(review): assumes every "answers" entry is a mapping whose
            # "text" value is hashable (a plain string) -- confirm against the
            # dataset schema.
            answer = [a["text"] for a in ds["answers"]]

            # Queries and documents draw their ids from the same counter.
            n = 0
            for q, cont, ans in zip(question, context, answer):
                q_id = str(n)
                split_queries[q_id] = q
                n += 1

                # Register the context first, then the answer; each distinct
                # text is stored in this split's corpus only once.
                for text in (cont, ans):
                    if text not in text2id:
                        text2id[text] = n
                        split_corpus[str(n)] = {"title": "", "text": text}
                        n += 1

                split_relevant[q_id] = {
                    str(text2id[ans]): 1,
                    str(text2id[cont]): 1,
                }  # only two correct matches

            self.queries[split] = split_queries
            self.relevant_docs[split] = split_relevant
            self.corpus[split] = split_corpus

        self.data_loaded = True
Empty file.
43 changes: 43 additions & 0 deletions
43
results/intfloat__multilingual-e5-small/TurHistQuadRetrieval.json
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,43 @@ | ||
{ | ||
"dataset_revision": "2a2b8ddecf1189f530676244d0751e1d0a569e03", | ||
"mteb_dataset_name": "TurHistQuadRetrieval", | ||
"mteb_version": "1.6.8", | ||
"test": { | ||
"evaluation_time": 97.69, | ||
"map_at_1": 0.21289, | ||
"map_at_10": 0.31645, | ||
"map_at_100": 0.32852, | ||
"map_at_1000": 0.33043, | ||
"map_at_20": 0.32291, | ||
"map_at_3": 0.28768, | ||
"map_at_5": 0.30272, | ||
"mrr_at_1": 0.42578, | ||
"mrr_at_10": 0.55599, | ||
"mrr_at_100": 0.56179, | ||
"mrr_at_1000": 0.56202, | ||
"mrr_at_20": 0.55955, | ||
"mrr_at_3": 0.53011, | ||
"mrr_at_5": 0.54495, | ||
"ndcg_at_1": 0.42578, | ||
"ndcg_at_10": 0.41696, | ||
"ndcg_at_100": 0.4662, | ||
"ndcg_at_1000": 0.50738, | ||
"ndcg_at_20": 0.4354, | ||
"ndcg_at_3": 0.36357, | ||
"ndcg_at_5": 0.38843, | ||
"precision_at_1": 0.42578, | ||
"precision_at_10": 0.09512, | ||
"precision_at_100": 0.01345, | ||
"precision_at_1000": 0.0019, | ||
"precision_at_20": 0.05352, | ||
"precision_at_3": 0.23633, | ||
"precision_at_5": 0.16152, | ||
"recall_at_1": 0.21289, | ||
"recall_at_10": 0.47559, | ||
"recall_at_100": 0.67236, | ||
"recall_at_1000": 0.95166, | ||
"recall_at_20": 0.53516, | ||
"recall_at_3": 0.35449, | ||
"recall_at_5": 0.40381 | ||
} | ||
} |
43 changes: 43 additions & 0 deletions
43
results/sentence-transformers__paraphrase-multilingual-MiniLM-L12-v2/TurHistQuadRetrieval.json
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,43 @@ | ||
{ | ||
"dataset_revision": "2a2b8ddecf1189f530676244d0751e1d0a569e03", | ||
"mteb_dataset_name": "TurHistQuadRetrieval", | ||
"mteb_version": "1.6.8", | ||
"test": { | ||
"evaluation_time": 64.8, | ||
"map_at_1": 0.10205, | ||
"map_at_10": 0.1676, | ||
"map_at_100": 0.1797, | ||
"map_at_1000": 0.18232, | ||
"map_at_20": 0.17366, | ||
"map_at_3": 0.14364, | ||
"map_at_5": 0.15814, | ||
"mrr_at_1": 0.2041, | ||
"mrr_at_10": 0.30393, | ||
"mrr_at_100": 0.31484, | ||
"mrr_at_1000": 0.31575, | ||
"mrr_at_20": 0.31044, | ||
"mrr_at_3": 0.27214, | ||
"mrr_at_5": 0.29274, | ||
"ndcg_at_1": 0.2041, | ||
"ndcg_at_10": 0.23708, | ||
"ndcg_at_100": 0.29725, | ||
"ndcg_at_1000": 0.35858, | ||
"ndcg_at_20": 0.25785, | ||
"ndcg_at_3": 0.18704, | ||
"ndcg_at_5": 0.21444, | ||
"precision_at_1": 0.2041, | ||
"precision_at_10": 0.06025, | ||
"precision_at_100": 0.01093, | ||
"precision_at_1000": 0.00192, | ||
"precision_at_20": 0.03682, | ||
"precision_at_3": 0.1263, | ||
"precision_at_5": 0.09746, | ||
"recall_at_1": 0.10205, | ||
"recall_at_10": 0.30127, | ||
"recall_at_100": 0.54639, | ||
"recall_at_1000": 0.95947, | ||
"recall_at_20": 0.36816, | ||
"recall_at_3": 0.18945, | ||
"recall_at_5": 0.24365 | ||
} | ||
} |