diff --git a/mteb/tasks/Retrieval/__init__.py b/mteb/tasks/Retrieval/__init__.py
index 3d6fef68fc..22418767a5 100644
--- a/mteb/tasks/Retrieval/__init__.py
+++ b/mteb/tasks/Retrieval/__init__.py
@@ -83,6 +83,7 @@
 from .spa.SpanishPassageRetrievalS2S import *
 from .swe.swedn_retrieval import *
 from .swe.swefaq_retrieval import *
+from .tur.TurHistQuad import *
 from .vie.VieQuADRetrieval import *
 from .zho.CMTEBRetrieval import *
 from .zho.LeCaRDv2Retrieval import *
diff --git a/mteb/tasks/Retrieval/tur/TurHistQuad.py b/mteb/tasks/Retrieval/tur/TurHistQuad.py
new file mode 100644
index 0000000000..4c930fcba6
--- /dev/null
+++ b/mteb/tasks/Retrieval/tur/TurHistQuad.py
@@ -0,0 +1,99 @@
+import datasets
+
+from mteb.abstasks import AbsTaskRetrieval, TaskMetadata
+import json
+
+class TurHistQuadRetrieval(AbsTaskRetrieval):
+    metadata = TaskMetadata(
+        name="TurHistQuadRetrieval",
+        dataset={
+            "path": "asparius/TurHistQuAD",
+            "revision": "2a2b8ddecf1189f530676244d0751e1d0a569e03",
+        },
+        description="Question Answering dataset on Ottoman History in Turkish",
+        reference="https://github.com/okanvk/Turkish-Reading-Comprehension-Question-Answering-Dataset",
+        type="Retrieval",
+        category="p2p",
+        eval_splits=["test"],
+        eval_langs=["tur-Latn"],
+        main_score="ndcg_at_10",
+        date=("2021-01-01", "2021-10-13"),
+        form=["written"],
+        task_subtypes=["Question answering"],
+        domains=["Encyclopaedic", "Non-fiction", "Academic"],
+        license="MIT",
+        socioeconomic_status="high",
+        annotations_creators="derived",
+        dialect=[],
+        text_creation="found",
+        bibtex_citation="""
+        @INPROCEEDINGS{9559013,
+            author={Soygazi, Fatih and Çiftçi, Okan and Kök, Uğurcan and Cengiz, Soner},
+            booktitle={2021 6th International Conference on Computer Science and Engineering (UBMK)},
+            title={THQuAD: Turkish Historic Question Answering Dataset for Reading Comprehension},
+            year={2021},
+            volume={},
+            number={},
+            pages={215-220},
+            keywords={Computer science;Computational modeling;Neural networks;Knowledge discovery;Information retrieval;Natural language processing;History;question answering;information retrieval;natural language understanding;deep learning;contextualized word embeddings},
+            doi={10.1109/UBMK52708.2021.9559013}}
+
+        """,
+        n_samples={"test": 1330, "train": 14221},
+        avg_character_length={"train": 1219.37, "test": 1513.83},
+    )
+
+    def load_data(self, **kwargs) -> None:
+        """Load the dataset and transform it into a retrieval dataset with the following attributes:
+
+        self.corpus = Dict[doc_id, Dict[str, str]]  # doc_id => dict with document data such as title and text
+        self.queries = Dict[query_id, str]  # query_id => query
+        self.relevant_docs = Dict[query_id, Dict[doc_id, score]]  # query_id => relevant doc ids with scores
+        """
+
+        if self.data_loaded:
+            return
+
+
+        self.dataset = datasets.load_dataset(**self.metadata_dict["dataset"])
+
+        self.corpus = {}
+        self.relevant_docs = {}
+        self.queries = {}
+        text2id = {}
+
+        for split in self.dataset:
+            ds: datasets.Dataset = self.dataset[split]  # type: ignore
+            ds = ds.shuffle(seed=42)
+            max_samples = min(1024, len(ds))
+            ds = ds.select(
+                range(max_samples)
+            )  # limit the dataset size to make sure the task does not take too long to run
+            self.queries[split] = {}
+            self.relevant_docs[split] = {}
+            self.corpus[split] = {}
+
+
+            question = ds["question"]
+            context = ds["context"]
+            answer = [a["text"] for a in ds["answers"]]
+
+            n = 0
+            for q, cont, ans in zip(question, context, answer):
+                self.queries[split][str(n)] = q
+                q_n = n
+                n += 1
+                if cont not in text2id:
+                    text2id[cont] = n
+                    self.corpus[split][str(n)] = {"title": "", "text": cont}
+                    n += 1
+                if ans not in text2id:
+                    text2id[ans] = n
+                    self.corpus[split][str(n)] = {"title": "", "text": ans}
+                    n += 1
+
+                self.relevant_docs[split][str(q_n)] = {
+                    str(text2id[ans]): 1,
+                    str(text2id[cont]): 1,
+                }  # each query has exactly two relevant documents: its context passage and its answer
+        self.data_loaded = True
\ No newline at end of file
diff --git a/mteb/tasks/Retrieval/tur/__init__.py b/mteb/tasks/Retrieval/tur/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/results/intfloat__multilingual-e5-small/TurHistQuadRetrieval.json b/results/intfloat__multilingual-e5-small/TurHistQuadRetrieval.json
new file mode 100644
index 0000000000..36bfd8d827
--- /dev/null
+++ b/results/intfloat__multilingual-e5-small/TurHistQuadRetrieval.json
@@ -0,0 +1,43 @@
+{
+    "dataset_revision": "2a2b8ddecf1189f530676244d0751e1d0a569e03",
+    "mteb_dataset_name": "TurHistQuadRetrieval",
+    "mteb_version": "1.6.8",
+    "test": {
+        "evaluation_time": 97.69,
+        "map_at_1": 0.21289,
+        "map_at_10": 0.31645,
+        "map_at_100": 0.32852,
+        "map_at_1000": 0.33043,
+        "map_at_20": 0.32291,
+        "map_at_3": 0.28768,
+        "map_at_5": 0.30272,
+        "mrr_at_1": 0.42578,
+        "mrr_at_10": 0.55599,
+        "mrr_at_100": 0.56179,
+        "mrr_at_1000": 0.56202,
+        "mrr_at_20": 0.55955,
+        "mrr_at_3": 0.53011,
+        "mrr_at_5": 0.54495,
+        "ndcg_at_1": 0.42578,
+        "ndcg_at_10": 0.41696,
+        "ndcg_at_100": 0.4662,
+        "ndcg_at_1000": 0.50738,
+        "ndcg_at_20": 0.4354,
+        "ndcg_at_3": 0.36357,
+        "ndcg_at_5": 0.38843,
+        "precision_at_1": 0.42578,
+        "precision_at_10": 0.09512,
+        "precision_at_100": 0.01345,
+        "precision_at_1000": 0.0019,
+        "precision_at_20": 0.05352,
+        "precision_at_3": 0.23633,
+        "precision_at_5": 0.16152,
+        "recall_at_1": 0.21289,
+        "recall_at_10": 0.47559,
+        "recall_at_100": 0.67236,
+        "recall_at_1000": 0.95166,
+        "recall_at_20": 0.53516,
+        "recall_at_3": 0.35449,
+        "recall_at_5": 0.40381
+    }
+}
\ No newline at end of file
diff --git a/results/sentence-transformers__paraphrase-multilingual-MiniLM-L12-v2/TurHistQuadRetrieval.json b/results/sentence-transformers__paraphrase-multilingual-MiniLM-L12-v2/TurHistQuadRetrieval.json
new file mode 100644
index 0000000000..ad75121cbe
--- /dev/null
+++ b/results/sentence-transformers__paraphrase-multilingual-MiniLM-L12-v2/TurHistQuadRetrieval.json
@@ -0,0 +1,43 @@
+{
+    "dataset_revision": "2a2b8ddecf1189f530676244d0751e1d0a569e03",
+    "mteb_dataset_name": "TurHistQuadRetrieval",
+    "mteb_version": "1.6.8",
+    "test": {
+        "evaluation_time": 64.8,
+        "map_at_1": 0.10205,
+        "map_at_10": 0.1676,
+        "map_at_100": 0.1797,
+        "map_at_1000": 0.18232,
+        "map_at_20": 0.17366,
+        "map_at_3": 0.14364,
+        "map_at_5": 0.15814,
+        "mrr_at_1": 0.2041,
+        "mrr_at_10": 0.30393,
+        "mrr_at_100": 0.31484,
+        "mrr_at_1000": 0.31575,
+        "mrr_at_20": 0.31044,
+        "mrr_at_3": 0.27214,
+        "mrr_at_5": 0.29274,
+        "ndcg_at_1": 0.2041,
+        "ndcg_at_10": 0.23708,
+        "ndcg_at_100": 0.29725,
+        "ndcg_at_1000": 0.35858,
+        "ndcg_at_20": 0.25785,
+        "ndcg_at_3": 0.18704,
+        "ndcg_at_5": 0.21444,
+        "precision_at_1": 0.2041,
+        "precision_at_10": 0.06025,
+        "precision_at_100": 0.01093,
+        "precision_at_1000": 0.00192,
+        "precision_at_20": 0.03682,
+        "precision_at_3": 0.1263,
+        "precision_at_5": 0.09746,
+        "recall_at_1": 0.10205,
+        "recall_at_10": 0.30127,
+        "recall_at_100": 0.54639,
+        "recall_at_1000": 0.95947,
+        "recall_at_20": 0.36816,
+        "recall_at_3": 0.18945,
+        "recall_at_5": 0.24365
+    }
+}
\ No newline at end of file
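
Note: the two result files above follow the output format of MTEB's evaluation runner. The snippet below is a minimal sketch of how such a file can be reproduced for the new task; it assumes the mteb (classic MTEB(tasks=...) API, as in version 1.6.x) and sentence-transformers packages are installed, and the model name and output folder are illustrative rather than part of this change.

# Minimal reproduction sketch (not part of the diff above).
# Assumes mteb and sentence-transformers are installed; model name and
# output folder are illustrative.
from mteb import MTEB
from sentence_transformers import SentenceTransformer

model_name = "intfloat/multilingual-e5-small"
model = SentenceTransformer(model_name)

# Select the new task by name and run the retrieval evaluation;
# this writes TurHistQuadRetrieval.json (main score: ndcg_at_10)
# into the given output folder.
evaluation = MTEB(tasks=["TurHistQuadRetrieval"])
evaluation.run(model, output_folder=f"results/{model_name.replace('/', '__')}")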