Skip to content

Commit

Permalink
Add Sanskrit Shlokas Dataset (#506)
Browse files: browse the repository at this point in the history
* Add Sanskrit Shlokas Dataset

* Add model results and review changes

* Add model results and review changes

* Add points
  • Loading branch information
bp-high authored Apr 25, 2024
1 parent bfbe990 commit 0486777
Show file tree
Hide file tree
Showing 6 changed files with 101 additions and 0 deletions.
2 changes: 2 additions & 0 deletions docs/mmteb/points/506.jsonl
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
{"GitHub": "bp-high", "New dataset": 2}
{"GitHub": "imenelydiaker", "Review PR": 2}
1 change: 1 addition & 0 deletions mteb/tasks/Classification/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,7 @@
from .pan.PunjabiNewsClassification import *
from .pol.PolishClassification import *
from .ron.RomanianSentimentClassification import *
from .san.SanskritShlokasClassification import *
from .slk.SlovakSentimentClassification import *
from .ssw.SiswatiNewsClassification import *
from .swe.DalajClassification import *
Expand Down
56 changes: 56 additions & 0 deletions mteb/tasks/Classification/san/SanskritShlokasClassification.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
from __future__ import annotations

from mteb.abstasks import AbsTaskClassification
from mteb.abstasks.TaskMetadata import TaskMetadata


class SanskritShlokasClassification(AbsTaskClassification):
    """Classification task over the iNLTK Sanskrit Shlokas dataset.

    The ``metadata`` attribute pins the dataset by ``path`` and ``revision``,
    declares the ``train`` and ``validation`` splits for evaluation with
    ``accuracy`` as the main score, and carries the iNLTK citation.
    """

    metadata = TaskMetadata(
        name="SanskritShlokasClassification",
        description="This data set contains ~500 Shlokas ",
        reference="https://github.com/goru001/nlp-for-sanskrit",
        dataset={
            "path": "bpHigh/iNLTK_Sanskrit_Shlokas_Dataset",
            "revision": "5a79d6472db143690c7ce6e974995d3610eee7f0",
        },
        type="Classification",
        category="s2s",
        date=("2019-01-01", "2020-01-01"),
        eval_splits=["train", "validation"],
        eval_langs=["san-Deva"],
        main_score="accuracy",
        form=["written"],
        domains=["Religious"],
        task_subtypes=["Topic classification"],
        license="CC BY-SA 4.0",
        socioeconomic_status="mixed",
        annotations_creators="derived",
        dialect=[],
        text_creation="found",
        # Raw string: the abstract contains the LaTeX sequence ``\%``, which is
        # an invalid escape in a regular string literal (SyntaxWarning on
        # Python 3.12+). The resulting text is unchanged.
        bibtex_citation=r"""
@inproceedings{arora-2020-inltk,
title = "i{NLTK}: Natural Language Toolkit for Indic Languages",
author = "Arora, Gaurav",
editor = "Park, Eunjeong L. and
Hagiwara, Masato and
Milajevs, Dmitrijs and
Liu, Nelson F. and
Chauhan, Geeticka and
Tan, Liling",
booktitle = "Proceedings of Second Workshop for NLP Open Source Software (NLP-OSS)",
month = nov,
year = "2020",
address = "Online",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2020.nlposs-1.10",
doi = "10.18653/v1/2020.nlposs-1.10",
pages = "66--71",
abstract = "We present iNLTK, an open-source NLP library consisting of pre-trained language models and out-of-the-box support for Data Augmentation, Textual Similarity, Sentence Embeddings, Word Embeddings, Tokenization and Text Generation in 13 Indic Languages. By using pre-trained models from iNLTK for text classification on publicly available datasets, we significantly outperform previously reported results. On these datasets, we also show that by using pre-trained models and data augmentation from iNLTK, we can achieve more than 95{\%} of the previous best performance by using less than 10{\%} of the training data. iNLTK is already being widely used by the community and has 40,000+ downloads, 600+ stars and 100+ forks on GitHub.",
}
""",
        n_samples={"train": 383, "validation": 96},
        avg_character_length={"train": 98.415, "validation": 96.635},
    )

    def dataset_transform(self) -> None:
        """Rename the raw ``Sloka``/``Class`` columns to ``text``/``label``.

        Replaces ``self.dataset`` with the renamed dataset.
        """
        # NOTE(review): presumably "text"/"label" are the column names the
        # classification base class reads -- confirm against AbsTaskClassification.
        self.dataset = self.dataset.rename_columns({"Sloka": "text", "Class": "label"})
Empty file.
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
{
"dataset_revision": "5a79d6472db143690c7ce6e974995d3610eee7f0",
"mteb_dataset_name": "SanskritShlokasClassification",
"mteb_version": "1.7.6",
"train": {
"accuracy": 0.7861618798955614,
"accuracy_stderr": 0.03234754078233619,
"evaluation_time": 5.71,
"f1": 0.785558565911011,
"f1_stderr": 0.0311303300468244,
"main_score": 0.7861618798955614
},
"validation": {
"accuracy": 0.775,
"accuracy_stderr": 0.0489028941429396,
"evaluation_time": 1.43,
"f1": 0.7820968680935053,
"f1_stderr": 0.046782872469135026,
"main_score": 0.775
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
{
"dataset_revision": "5a79d6472db143690c7ce6e974995d3610eee7f0",
"mteb_dataset_name": "SanskritShlokasClassification",
"mteb_version": "1.7.6",
"train": {
"accuracy": 0.6107049608355092,
"accuracy_stderr": 0.04994321503293052,
"evaluation_time": 8.12,
"f1": 0.6044402852208871,
"f1_stderr": 0.04864067065258704,
"main_score": 0.6107049608355092
},
"validation": {
"accuracy": 0.6583333333333334,
"accuracy_stderr": 0.06383980602518567,
"evaluation_time": 1.92,
"f1": 0.6425332989408176,
"f1_stderr": 0.06450800640092026,
"main_score": 0.6583333333333334
}
}

0 comments on commit 0486777

Please sign in to comment.