From 048677712943cc27158e2d7745882781c26bc2c8 Mon Sep 17 00:00:00 2001
From: Bhavish Pahwa <53102161+bp-high@users.noreply.github.com>
Date: Thu, 25 Apr 2024 15:00:03 +0530
Subject: [PATCH] Add Sanskrit Shlokas Dataset (#506)

* Add Sanskrit Shlokas Dataset

* Add model results and review changes

* Add model results and review changes

* Add points
---
Reviewer note (not part of the commit message): in
SanskritShlokasClassification.py the BibTeX abstract now writes `{\\%}`
instead of `{\%}`. The string literal is not raw, so `\%` was an invalid
escape sequence (SyntaxWarning on Python 3.12+); the runtime value of the
string is unchanged. The abbreviated blob id on that file's `index` line was
generated before this edit.

 docs/mmteb/points/506.jsonl                   |  2 +
 mteb/tasks/Classification/__init__.py         |  1 +
 .../san/SanskritShlokasClassification.py      | 56 +++++++++++++++++++
 mteb/tasks/Classification/san/__init__.py     |  0
 .../SanskritShlokasClassification.json        | 21 +++++++
 .../SanskritShlokasClassification.json        | 21 +++++++
 6 files changed, 101 insertions(+)
 create mode 100644 docs/mmteb/points/506.jsonl
 create mode 100644 mteb/tasks/Classification/san/SanskritShlokasClassification.py
 create mode 100644 mteb/tasks/Classification/san/__init__.py
 create mode 100644 results/intfloat__multilingual-e5-small/SanskritShlokasClassification.json
 create mode 100644 results/sentence-transformers__paraphrase-multilingual-MiniLM-L12-v2/SanskritShlokasClassification.json

diff --git a/docs/mmteb/points/506.jsonl b/docs/mmteb/points/506.jsonl
new file mode 100644
index 0000000000..3e944cadda
--- /dev/null
+++ b/docs/mmteb/points/506.jsonl
@@ -0,0 +1,2 @@
+{"GitHub": "bp-high", "New dataset": 2}
+{"GitHub": "imenelydiaker", "Review PR": 2}
\ No newline at end of file
diff --git a/mteb/tasks/Classification/__init__.py b/mteb/tasks/Classification/__init__.py
index d79a1e1152..d797c4c84f 100644
--- a/mteb/tasks/Classification/__init__.py
+++ b/mteb/tasks/Classification/__init__.py
@@ -62,6 +62,7 @@
 from .pan.PunjabiNewsClassification import *
 from .pol.PolishClassification import *
 from .ron.RomanianSentimentClassification import *
+from .san.SanskritShlokasClassification import *
 from .slk.SlovakSentimentClassification import *
 from .ssw.SiswatiNewsClassification import *
 from .swe.DalajClassification import *
diff --git a/mteb/tasks/Classification/san/SanskritShlokasClassification.py b/mteb/tasks/Classification/san/SanskritShlokasClassification.py
new file mode 100644
index 0000000000..6eba516a4d
--- /dev/null
+++ b/mteb/tasks/Classification/san/SanskritShlokasClassification.py
@@ -0,0 +1,56 @@
+from __future__ import annotations
+
+from mteb.abstasks import AbsTaskClassification
+from mteb.abstasks.TaskMetadata import TaskMetadata
+
+
+class SanskritShlokasClassification(AbsTaskClassification):
+    metadata = TaskMetadata(
+        name="SanskritShlokasClassification",
+        description="This data set contains ~500 Shlokas ",
+        reference="https://github.com/goru001/nlp-for-sanskrit",
+        dataset={
+            "path": "bpHigh/iNLTK_Sanskrit_Shlokas_Dataset",
+            "revision": "5a79d6472db143690c7ce6e974995d3610eee7f0",
+        },
+        type="Classification",
+        category="s2s",
+        date=("2019-01-01", "2020-01-01"),
+        eval_splits=["train", "validation"],
+        eval_langs=["san-Deva"],
+        main_score="accuracy",
+        form=["written"],
+        domains=["Religious"],
+        task_subtypes=["Topic classification"],
+        license="CC BY-SA 4.0",
+        socioeconomic_status="mixed",
+        annotations_creators="derived",
+        dialect=[],
+        text_creation="found",
+        bibtex_citation="""
+        @inproceedings{arora-2020-inltk,
+            title = "i{NLTK}: Natural Language Toolkit for Indic Languages",
+            author = "Arora, Gaurav",
+            editor = "Park, Eunjeong L. and
+              Hagiwara, Masato and
+              Milajevs, Dmitrijs and
+              Liu, Nelson F. and
+              Chauhan, Geeticka and
+              Tan, Liling",
+            booktitle = "Proceedings of Second Workshop for NLP Open Source Software (NLP-OSS)",
+            month = nov,
+            year = "2020",
+            address = "Online",
+            publisher = "Association for Computational Linguistics",
+            url = "https://aclanthology.org/2020.nlposs-1.10",
+            doi = "10.18653/v1/2020.nlposs-1.10",
+            pages = "66--71",
+            abstract = "We present iNLTK, an open-source NLP library consisting of pre-trained language models and out-of-the-box support for Data Augmentation, Textual Similarity, Sentence Embeddings, Word Embeddings, Tokenization and Text Generation in 13 Indic Languages. By using pre-trained models from iNLTK for text classification on publicly available datasets, we significantly outperform previously reported results. On these datasets, we also show that by using pre-trained models and data augmentation from iNLTK, we can achieve more than 95{\\%} of the previous best performance by using less than 10{\\%} of the training data. iNLTK is already being widely used by the community and has 40,000+ downloads, 600+ stars and 100+ forks on GitHub.",
+        }
+        """,
+        n_samples={"train": 383, "validation": 96},
+        avg_character_length={"train": 98.415, "validation": 96.635},
+    )
+
+    def dataset_transform(self):
+        self.dataset = self.dataset.rename_columns({"Sloka": "text", "Class": "label"})
diff --git a/mteb/tasks/Classification/san/__init__.py b/mteb/tasks/Classification/san/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/results/intfloat__multilingual-e5-small/SanskritShlokasClassification.json b/results/intfloat__multilingual-e5-small/SanskritShlokasClassification.json
new file mode 100644
index 0000000000..a0ddbc3dee
--- /dev/null
+++ b/results/intfloat__multilingual-e5-small/SanskritShlokasClassification.json
@@ -0,0 +1,21 @@
+{
+  "dataset_revision": "5a79d6472db143690c7ce6e974995d3610eee7f0",
+  "mteb_dataset_name": "SanskritShlokasClassification",
+  "mteb_version": "1.7.6",
+  "train": {
+    "accuracy": 0.7861618798955614,
+    "accuracy_stderr": 0.03234754078233619,
+    "evaluation_time": 5.71,
+    "f1": 0.785558565911011,
+    "f1_stderr": 0.0311303300468244,
+    "main_score": 0.7861618798955614
+  },
+  "validation": {
+    "accuracy": 0.775,
+    "accuracy_stderr": 0.0489028941429396,
+    "evaluation_time": 1.43,
+    "f1": 0.7820968680935053,
+    "f1_stderr": 0.046782872469135026,
+    "main_score": 0.775
+  }
+}
\ No newline at end of file
diff --git a/results/sentence-transformers__paraphrase-multilingual-MiniLM-L12-v2/SanskritShlokasClassification.json b/results/sentence-transformers__paraphrase-multilingual-MiniLM-L12-v2/SanskritShlokasClassification.json
new file mode 100644
index 0000000000..318fa39128
--- /dev/null
+++ b/results/sentence-transformers__paraphrase-multilingual-MiniLM-L12-v2/SanskritShlokasClassification.json
@@ -0,0 +1,21 @@
+{
+  "dataset_revision": "5a79d6472db143690c7ce6e974995d3610eee7f0",
+  "mteb_dataset_name": "SanskritShlokasClassification",
+  "mteb_version": "1.7.6",
+  "train": {
+    "accuracy": 0.6107049608355092,
+    "accuracy_stderr": 0.04994321503293052,
+    "evaluation_time": 8.12,
+    "f1": 0.6044402852208871,
+    "f1_stderr": 0.04864067065258704,
+    "main_score": 0.6107049608355092
+  },
+  "validation": {
+    "accuracy": 0.6583333333333334,
+    "accuracy_stderr": 0.06383980602518567,
+    "evaluation_time": 1.92,
+    "f1": 0.6425332989408176,
+    "f1_stderr": 0.06450800640092026,
+    "main_score": 0.6583333333333334
+  }
+}
\ No newline at end of file