Skip to content

Commit

Permalink
First Turkish Retrieval dataset
Browse files Browse the repository at this point in the history
  • Loading branch information
asparius committed Apr 23, 2024
1 parent edef77b commit 08f71f7
Show file tree
Hide file tree
Showing 5 changed files with 186 additions and 0 deletions.
1 change: 1 addition & 0 deletions mteb/tasks/Retrieval/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,7 @@
from .spa.SpanishPassageRetrievalS2S import *
from .swe.swedn_retrieval import *
from .swe.swefaq_retrieval import *
from .tur.TurHistQuad import *
from .vie.VieQuADRetrieval import *
from .zho.CMTEBRetrieval import *
from .zho.LeCaRDv2Retrieval import *
99 changes: 99 additions & 0 deletions mteb/tasks/Retrieval/tur/TurHistQuad.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,99 @@
import datasets

from mteb.abstasks import AbsTaskRetrieval, TaskMetadata
import json

class TurHistQuadRetrieval(AbsTaskRetrieval):
    """Retrieval task built from TurHistQuAD, a Turkish QA dataset on Ottoman history.

    Each question is treated as a query; its context paragraph and its answer
    span are both indexed as relevant documents (two positives per query).
    """

    metadata = TaskMetadata(
        name="TurHistQuadRetrieval",
        dataset={
            "path": "asparius/TurHistQuAD",
            "revision": "2a2b8ddecf1189f530676244d0751e1d0a569e03",
        },
        description="Question Answering dataset on Ottoman History in Turkish",
        reference="https://github.com/okanvk/Turkish-Reading-Comprehension-Question-Answering-Dataset",
        type="Retrieval",
        category="p2p",
        eval_splits=["test"],
        eval_langs=["tur-Latn"],
        main_score="ndcg_at_10",
        date=("2021-01-01", "2021-10-13"),
        form=["written"],
        task_subtypes=["Question answering"],
        domains=["Encyclopaedic", "Non-fiction", "Academic"],
        license="MIT",
        socioeconomic_status="high",
        annotations_creators="derived",
        dialect=[],
        text_creation="found",
        bibtex_citation="""
@INPROCEEDINGS{9559013,
  author={Soygazi, Fatih and Çiftçi, Okan and Kök, Uğurcan and Cengiz, Soner},
  booktitle={2021 6th International Conference on Computer Science and Engineering (UBMK)},
  title={THQuAD: Turkish Historic Question Answering Dataset for Reading Comprehension},
  year={2021},
  volume={},
  number={},
  pages={215-220},
  keywords={Computer science;Computational modeling;Neural networks;Knowledge discovery;Information retrieval;Natural language processing;History;question answering;information retrieval;natural language understanding;deep learning;contextualized word embeddings},
  doi={10.1109/UBMK52708.2021.9559013}}
""",
        n_samples={"test": 1330, "train": 14221},
        avg_character_length={"train": 1219.37, "test": 1513.83},
    )

    def load_data(self, **kwargs) -> None:
        """Load TurHistQuAD and transform it into a retrieval dataset.

        Populates the following attributes (all keyed by split first):
            self.corpus: Dict[split, Dict[doc_id, Dict[str, str]]]
                doc_id -> {"title": ..., "text": ...}
            self.queries: Dict[split, Dict[query_id, str]]
                query_id -> question text
            self.relevant_docs: Dict[split, Dict[query_id, Dict[doc_id, int]]]
                query_id -> {doc_id: relevance score}
        """
        if self.data_loaded:
            return

        self.dataset = datasets.load_dataset(**self.metadata_dict["dataset"])

        self.corpus = {}
        self.relevant_docs = {}
        self.queries = {}

        for split in self.dataset:
            ds: datasets.Dataset = self.dataset[split]  # type: ignore
            ds = ds.shuffle(seed=42)
            # Limit the dataset size to make sure the task does not take
            # too long to run.
            max_samples = min(1024, len(ds))
            ds = ds.select(range(max_samples))

            self.queries[split] = {}
            self.relevant_docs[split] = {}
            self.corpus[split] = {}

            # Deduplication map (text -> doc id) must be PER SPLIT: ids are
            # numbered from 0 within each split, so a shared map would make
            # relevant_docs reference documents that were only added to an
            # earlier split's corpus (and whose id may collide with an
            # unrelated document in this split).
            text2id = {}

            questions = ds["question"]
            contexts = ds["context"]
            answers = [a["text"] for a in ds["answers"]]

            n = 0
            for question, context, answer in zip(questions, contexts, answers):
                query_id = str(n)
                self.queries[split][query_id] = question
                n += 1

                # Index the context paragraph once per unique text.
                if context not in text2id:
                    text2id[context] = n
                    self.corpus[split][str(n)] = {"title": "", "text": context}
                    n += 1

                # Index the answer span once per unique text.
                if answer not in text2id:
                    text2id[answer] = n
                    self.corpus[split][str(n)] = {"title": "", "text": answer}
                    n += 1

                # Only two correct matches per query: its context and answer.
                self.relevant_docs[split][query_id] = {
                    str(text2id[answer]): 1,
                    str(text2id[context]): 1,
                }

        self.data_loaded = True
Empty file.
43 changes: 43 additions & 0 deletions results/intfloat__multilingual-e5-small/TurHistQuadRetrieval.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
{
"dataset_revision": "2a2b8ddecf1189f530676244d0751e1d0a569e03",
"mteb_dataset_name": "TurHistQuadRetrieval",
"mteb_version": "1.6.8",
"test": {
"evaluation_time": 97.69,
"map_at_1": 0.21289,
"map_at_10": 0.31645,
"map_at_100": 0.32852,
"map_at_1000": 0.33043,
"map_at_20": 0.32291,
"map_at_3": 0.28768,
"map_at_5": 0.30272,
"mrr_at_1": 0.42578,
"mrr_at_10": 0.55599,
"mrr_at_100": 0.56179,
"mrr_at_1000": 0.56202,
"mrr_at_20": 0.55955,
"mrr_at_3": 0.53011,
"mrr_at_5": 0.54495,
"ndcg_at_1": 0.42578,
"ndcg_at_10": 0.41696,
"ndcg_at_100": 0.4662,
"ndcg_at_1000": 0.50738,
"ndcg_at_20": 0.4354,
"ndcg_at_3": 0.36357,
"ndcg_at_5": 0.38843,
"precision_at_1": 0.42578,
"precision_at_10": 0.09512,
"precision_at_100": 0.01345,
"precision_at_1000": 0.0019,
"precision_at_20": 0.05352,
"precision_at_3": 0.23633,
"precision_at_5": 0.16152,
"recall_at_1": 0.21289,
"recall_at_10": 0.47559,
"recall_at_100": 0.67236,
"recall_at_1000": 0.95166,
"recall_at_20": 0.53516,
"recall_at_3": 0.35449,
"recall_at_5": 0.40381
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
{
"dataset_revision": "2a2b8ddecf1189f530676244d0751e1d0a569e03",
"mteb_dataset_name": "TurHistQuadRetrieval",
"mteb_version": "1.6.8",
"test": {
"evaluation_time": 64.8,
"map_at_1": 0.10205,
"map_at_10": 0.1676,
"map_at_100": 0.1797,
"map_at_1000": 0.18232,
"map_at_20": 0.17366,
"map_at_3": 0.14364,
"map_at_5": 0.15814,
"mrr_at_1": 0.2041,
"mrr_at_10": 0.30393,
"mrr_at_100": 0.31484,
"mrr_at_1000": 0.31575,
"mrr_at_20": 0.31044,
"mrr_at_3": 0.27214,
"mrr_at_5": 0.29274,
"ndcg_at_1": 0.2041,
"ndcg_at_10": 0.23708,
"ndcg_at_100": 0.29725,
"ndcg_at_1000": 0.35858,
"ndcg_at_20": 0.25785,
"ndcg_at_3": 0.18704,
"ndcg_at_5": 0.21444,
"precision_at_1": 0.2041,
"precision_at_10": 0.06025,
"precision_at_100": 0.01093,
"precision_at_1000": 0.00192,
"precision_at_20": 0.03682,
"precision_at_3": 0.1263,
"precision_at_5": 0.09746,
"recall_at_1": 0.10205,
"recall_at_10": 0.30127,
"recall_at_100": 0.54639,
"recall_at_1000": 0.95947,
"recall_at_20": 0.36816,
"recall_at_3": 0.18945,
"recall_at_5": 0.24365
}
}

0 comments on commit 08f71f7

Please sign in to comment.