First Turkish Retrieval dataset

embeddings-benchmark · Apr 23, 2024 · 08f71f7 · 08f71f7
1 parent edef77b
commit 08f71f7
Show file tree

Hide file tree

Showing 5 changed files with 186 additions and 0 deletions.
diff --git a/mteb/tasks/Retrieval/__init__.py b/mteb/tasks/Retrieval/__init__.py
@@ -83,6 +83,7 @@
 from .spa.SpanishPassageRetrievalS2S import *
 from .swe.swedn_retrieval import *
 from .swe.swefaq_retrieval import *
+from .tur.TurHistQuad import *
 from .vie.VieQuADRetrieval import *
 from .zho.CMTEBRetrieval import *
 from .zho.LeCaRDv2Retrieval import *
diff --git a/mteb/tasks/Retrieval/tur/TurHistQuad.py b/mteb/tasks/Retrieval/tur/TurHistQuad.py
@@ -0,0 +1,99 @@
+import datasets
+
+from mteb.abstasks import AbsTaskRetrieval, TaskMetadata
+import json
+
+class TurHistQuadRetrieval(AbsTaskRetrieval):
+    """Retrieval task built from the TurHistQuAD question answering dataset.
+
+    Questions are used as queries; the context passages and the answer
+    texts form the corpus (see ``load_data``).
+    """
+
+    metadata = TaskMetadata(
+        name="TurHistQuadRetrieval",
+        dataset={
+            "path": "asparius/TurHistQuAD",
+            "revision": "2a2b8ddecf1189f530676244d0751e1d0a569e03",
+        },
+        description="Question Answering dataset on Ottoman History in Turkish",
+        reference="https://github.com/okanvk/Turkish-Reading-Comprehension-Question-Answering-Dataset",
+        type="Retrieval",
+        category="p2p",
+        eval_splits=["test"],
+        eval_langs=["tur-Latn"],
+        main_score="ndcg_at_10",
+        date=("2021-01-01", "2021-10-13"),
+        form=["written"],
+        task_subtypes=["Question answering"],
+        domains=["Encyclopaedic", "Non-fiction", "Academic"],
+        license="MIT",
+        socioeconomic_status="high",
+        annotations_creators="derived",
+        dialect=[],
+        text_creation="found",
+        bibtex_citation="""
+            @INPROCEEDINGS{9559013,
+                author={Soygazi, Fatih and Çiftçi, Okan and Kök, Uğurcan and Cengiz, Soner},
+                booktitle={2021 6th International Conference on Computer Science and Engineering (UBMK)}, 
+                title={THQuAD: Turkish Historic Question Answering Dataset for Reading Comprehension}, 
+                year={2021},
+                volume={},
+                number={},
+                pages={215-220},
+                keywords={Computer science;Computational modeling;Neural networks;Knowledge discovery;Information retrieval;Natural language processing;History;question answering;information retrieval;natural language understanding;deep learning;contextualized word embeddings},
+                doi={10.1109/UBMK52708.2021.9559013}}
+
+        """,
+        n_samples={"test": 1330, "train": 14221},
+        avg_character_length={"train": 1219.37, "test": 1513.83},
+    )
+
+    def load_data(self, **kwargs) -> None:
+        """Load the dataset and convert every split to the retrieval format.
+
+        Populates, per split:
+        self.corpus = Dict[doc_id, Dict[str, str]] #id => dict with document data like title and text
+        self.queries = Dict[query_id, str] #id => query
+        self.relevant_docs = Dict[query_id, Dict[doc_id, score]]
+
+        Each question is a query; its context passage and its answer text are
+        both added to the corpus and marked relevant with score 1.
+        """
+        if self.data_loaded:
+            return
+
+        self.dataset = datasets.load_dataset(**self.metadata_dict["dataset"])
+
+        self.corpus = {}
+        self.relevant_docs = {}
+        self.queries = {}
+
+        for split in self.dataset:
+            ds: datasets.Dataset = self.dataset[split]  # type: ignore
+            ds = ds.shuffle(seed=42)
+            # limit the dataset size to make sure the task does not take too long to run
+            ds = ds.select(range(min(1024, len(ds))))
+            queries = self.queries[split] = {}
+            qrels = self.relevant_docs[split] = {}
+            corpus = self.corpus[split] = {}
+            # Ids are numbered per split, so the text -> id map must be per
+            # split too: reusing ids from another split would point qrels at
+            # documents that are missing from (or different in) this corpus.
+            text2id = {}
+
+            answers = (a["text"] for a in ds["answers"])
+            n = 0  # single counter shared by query and document ids
+            for q, cont, ans in zip(ds["question"], ds["context"], answers):
+                q_id = str(n)
+                queries[q_id] = q
+                n += 1
+                for text in (cont, ans):
+                    if text not in text2id:
+                        text2id[text] = str(n)
+                        corpus[text2id[text]] = {"title": "", "text": text}
+                        n += 1
+                # only two correct matches
+                qrels[q_id] = {text2id[ans]: 1, text2id[cont]: 1}
+
+        self.data_loaded = True
diff --git a/mteb/tasks/Retrieval/tur/__init__.py b/mteb/tasks/Retrieval/tur/__init__.py
diff --git a/results/intfloat__multilingual-e5-small/TurHistQuadRetrieval.json b/results/intfloat__multilingual-e5-small/TurHistQuadRetrieval.json
@@ -0,0 +1,43 @@
+{
+  "dataset_revision": "2a2b8ddecf1189f530676244d0751e1d0a569e03",
+  "mteb_dataset_name": "TurHistQuadRetrieval",
+  "mteb_version": "1.6.8",
+  "test": {
+    "evaluation_time": 97.69,
+    "map_at_1": 0.21289,
+    "map_at_10": 0.31645,
+    "map_at_100": 0.32852,
+    "map_at_1000": 0.33043,
+    "map_at_20": 0.32291,
+    "map_at_3": 0.28768,
+    "map_at_5": 0.30272,
+    "mrr_at_1": 0.42578,
+    "mrr_at_10": 0.55599,
+    "mrr_at_100": 0.56179,
+    "mrr_at_1000": 0.56202,
+    "mrr_at_20": 0.55955,
+    "mrr_at_3": 0.53011,
+    "mrr_at_5": 0.54495,
+    "ndcg_at_1": 0.42578,
+    "ndcg_at_10": 0.41696,
+    "ndcg_at_100": 0.4662,
+    "ndcg_at_1000": 0.50738,
+    "ndcg_at_20": 0.4354,
+    "ndcg_at_3": 0.36357,
+    "ndcg_at_5": 0.38843,
+    "precision_at_1": 0.42578,
+    "precision_at_10": 0.09512,
+    "precision_at_100": 0.01345,
+    "precision_at_1000": 0.0019,
+    "precision_at_20": 0.05352,
+    "precision_at_3": 0.23633,
+    "precision_at_5": 0.16152,
+    "recall_at_1": 0.21289,
+    "recall_at_10": 0.47559,
+    "recall_at_100": 0.67236,
+    "recall_at_1000": 0.95166,
+    "recall_at_20": 0.53516,
+    "recall_at_3": 0.35449,
+    "recall_at_5": 0.40381
+  }
+}
diff --git a/...ts/sentence-transformers__paraphrase-multilingual-MiniLM-L12-v2/TurHistQuadRetrieval.json b/...ts/sentence-transformers__paraphrase-multilingual-MiniLM-L12-v2/TurHistQuadRetrieval.json
@@ -0,0 +1,43 @@
+{
+  "dataset_revision": "2a2b8ddecf1189f530676244d0751e1d0a569e03",
+  "mteb_dataset_name": "TurHistQuadRetrieval",
+  "mteb_version": "1.6.8",
+  "test": {
+    "evaluation_time": 64.8,
+    "map_at_1": 0.10205,
+    "map_at_10": 0.1676,
+    "map_at_100": 0.1797,
+    "map_at_1000": 0.18232,
+    "map_at_20": 0.17366,
+    "map_at_3": 0.14364,
+    "map_at_5": 0.15814,
+    "mrr_at_1": 0.2041,
+    "mrr_at_10": 0.30393,
+    "mrr_at_100": 0.31484,
+    "mrr_at_1000": 0.31575,
+    "mrr_at_20": 0.31044,
+    "mrr_at_3": 0.27214,
+    "mrr_at_5": 0.29274,
+    "ndcg_at_1": 0.2041,
+    "ndcg_at_10": 0.23708,
+    "ndcg_at_100": 0.29725,
+    "ndcg_at_1000": 0.35858,
+    "ndcg_at_20": 0.25785,
+    "ndcg_at_3": 0.18704,
+    "ndcg_at_5": 0.21444,
+    "precision_at_1": 0.2041,
+    "precision_at_10": 0.06025,
+    "precision_at_100": 0.01093,
+    "precision_at_1000": 0.00192,
+    "precision_at_20": 0.03682,
+    "precision_at_3": 0.1263,
+    "precision_at_5": 0.09746,
+    "recall_at_1": 0.10205,
+    "recall_at_10": 0.30127,
+    "recall_at_100": 0.54639,
+    "recall_at_1000": 0.95947,
+    "recall_at_20": 0.36816,
+    "recall_at_3": 0.18945,
+    "recall_at_5": 0.24365
+  }
+}