From dc9ba24bfa48774e488df671bc2fb6df0080c2b3 Mon Sep 17 00:00:00 2001 From: Jay Gala Date: Wed, 24 Apr 2024 13:04:12 +0530 Subject: [PATCH] fix: add Clustering dataset for Indic languages (#532) * add Indic clustering dataset * update module import statement * add points for the contribution --- docs/mmteb/points/532.jsonl | 3 + mteb/tasks/Clustering/__init__.py | 1 + .../multilingual/IndicReviewsClusteringP2P.py | 86 +++++++++ .../IndicReviewsClusteringP2P.json | 164 ++++++++++++++++++ .../IndicReviewsClusteringP2P.json | 164 ++++++++++++++++++ 5 files changed, 418 insertions(+) create mode 100644 docs/mmteb/points/532.jsonl create mode 100644 mteb/tasks/Clustering/multilingual/IndicReviewsClusteringP2P.py create mode 100644 results/intfloat__multilingual-e5-small/IndicReviewsClusteringP2P.json create mode 100644 results/sentence-transformers__paraphrase-multilingual-MiniLM-L12-v2/IndicReviewsClusteringP2P.json diff --git a/docs/mmteb/points/532.jsonl b/docs/mmteb/points/532.jsonl new file mode 100644 index 0000000000..a01d28100f --- /dev/null +++ b/docs/mmteb/points/532.jsonl @@ -0,0 +1,3 @@ +{"GitHub": "jaygala24", "New dataset": 36} +{"GitHub": "digantamisra98", "New dataset": 18} +{"GitHub": "asparius", "Review PR": 2} \ No newline at end of file diff --git a/mteb/tasks/Clustering/__init__.py b/mteb/tasks/Clustering/__init__.py index 9999a7d284..1a5d743d8d 100644 --- a/mteb/tasks/Clustering/__init__.py +++ b/mteb/tasks/Clustering/__init__.py @@ -23,6 +23,7 @@ from .fra.HALClusteringS2S import * from .fra.MLSUMClusteringP2P import * from .fra.MLSUMClusteringS2S import * +from .multilingual.IndicReviewsClusteringP2P import * from .multilingual.MasakhaNEWSClusteringP2P import * from .multilingual.MasakhaNEWSClusteringS2S import * from .nob.snl_clustering import * diff --git a/mteb/tasks/Clustering/multilingual/IndicReviewsClusteringP2P.py b/mteb/tasks/Clustering/multilingual/IndicReviewsClusteringP2P.py new file mode 100644 index 0000000000..27c0a93d33 --- 
from __future__ import annotations

from typing import Any

import datasets
import numpy as np

from mteb.abstasks import AbsTaskClustering, MultilingualTask
from mteb.abstasks.TaskMetadata import TaskMetadata

# Maps the language key used in the dataset config name (``translation-<key>``)
# to the language-script code(s) reported in the task metadata.
_LANGUAGES = {
    "as": ["asm-Beng"],
    "bd": ["brx-Deva"],
    "bn": ["ben-Beng"],
    "gu": ["guj-Gujr"],
    "hi": ["hin-Deva"],
    "kn": ["kan-Knda"],
    "ml": ["mal-Mlym"],
    "mr": ["mar-Deva"],
    "or": ["ory-Orya"],
    "pa": ["pan-Guru"],
    "ta": ["tam-Taml"],
    "te": ["tel-Telu"],
    "ur": ["urd-Arab"],
}

# Number of clustering sets each language's test split is divided into.
_N_SETS = 5


def _split_into_sets(values: list[Any], n_sets: int) -> list[list[Any]]:
    """Split *values* into *n_sets* contiguous, near-equal plain Python lists."""
    # np.array_split (unlike np.split) tolerates lengths that are not an exact
    # multiple of n_sets; .tolist() converts the numpy scalars back to Python
    # objects.
    return [chunk.tolist() for chunk in np.array_split(values, n_sets)]


class IndicReviewsClusteringP2P(AbsTaskClustering, MultilingualTask):
    """Clustering task over the reviews of the ``ai4bharat/IndicSentiment`` dataset.

    For every language in ``_LANGUAGES`` the ``test`` split is rewritten into
    ``_N_SETS`` sets of review texts (``INDIC REVIEW``) paired with their
    ``GENERIC CATEGORIES`` labels.
    """

    metadata = TaskMetadata(
        name="IndicReviewsClusteringP2P",
        dataset={
            "path": "ai4bharat/IndicSentiment",
            "revision": "ccb472517ce32d103bba9d4f5df121ed5a6592a4",
        },
        # NOTE(review): the description says "14 sets", but _LANGUAGES has 13
        # entries and each one is split into 5 sets below -- confirm the
        # intended number before editing the text.
        description="Clustering of reviews from IndicSentiment dataset. Clustering of 14 sets on the generic categories label.",
        reference="https://arxiv.org/abs/2212.05409",
        type="Clustering",
        category="p2p",
        eval_splits=["test"],
        eval_langs=_LANGUAGES,
        main_score="v_measure",
        date=("2022-08-01", "2022-12-20"),
        form=["written"],
        domains=["Reviews"],
        task_subtypes=["Thematic clustering"],
        license="CC0",
        socioeconomic_status="mixed",
        annotations_creators="human-annotated",
        dialect=[],
        text_creation="machine-translated and verified",
        # NOTE(review): the entry gives year 2022 while the DOI is
        # 2023.acl-long.693 -- verify against the published paper.
        bibtex_citation="""@article{doddapaneni2022towards,
  title = {Towards Leaving No Indic Language Behind: Building Monolingual Corpora, Benchmark and Models for Indic Languages},
  author = {Sumanth Doddapaneni and Rahul Aralikatte and Gowtham Ramesh and Shreyansh Goyal and Mitesh M. Khapra and Anoop Kunchukuttan and Pratyush Kumar},
  journal = {Annual Meeting of the Association for Computational Linguistics},
  year = {2022},
  doi = {10.18653/v1/2023.acl-long.693}
}""",
        n_samples={"test": 1000},
        avg_character_length={"test": 137.6},
    )

    def load_data(self, **kwargs: Any) -> None:
        """Load one dataset config per language and reshape it for clustering.

        Each language in ``self.langs`` is loaded from the ``translation-<lang>``
        config, with the remaining ``load_dataset`` arguments taken from
        ``self.metadata_dict["dataset"]`` (presumably the ``path``/``revision``
        declared in ``metadata`` -- confirm against the base class). Does
        nothing when ``self.data_loaded`` is already set.

        Args:
            **kwargs: Accepted for interface compatibility; not used here.
        """
        if self.data_loaded:
            return
        self.dataset = {}
        for lang in self.langs:
            self.dataset[lang] = datasets.load_dataset(
                name=f"translation-{lang}",
                **self.metadata_dict["dataset"],
            )
        self.dataset_transform()
        self.data_loaded = True

    def dataset_transform(self) -> None:
        """Rewrite every language's ``test`` split into ``_N_SETS`` clustering sets.

        The ``validation`` split is dropped, and ``test`` is replaced by a
        dataset with two columns: ``sentences`` (lists of review texts) and
        ``labels`` (the matching lists of generic-category labels).
        """
        for lang in self.langs:
            # Only "test" is evaluated; a default avoids a KeyError if the
            # unused split is absent.
            self.dataset[lang].pop("validation", None)

            test_split = self.dataset[lang]["test"]
            texts = test_split["INDIC REVIEW"]
            labels = test_split["GENERIC CATEGORIES"]

            self.dataset[lang]["test"] = datasets.Dataset.from_dict(
                {
                    "sentences": _split_into_sets(texts, _N_SETS),
                    "labels": _split_into_sets(labels, _N_SETS),
                }
            )
+ 0.28230512576623834, + 0.2989567908000782 + ] + }, + "bn": { + "main_score": 0.44060339762415, + "v_measure": 0.44060339762415, + "v_measure_std": 0.026150909476873926, + "v_measures": [ + 0.44975702765370407, + 0.4376960948746971, + 0.46839926933251225, + 0.45500731078171, + 0.39215728547812656 + ] + }, + "evaluation_time": 13.01, + "gu": { + "main_score": 0.4102948154121003, + "v_measure": 0.4102948154121003, + "v_measure_std": 0.03360584068564712, + "v_measures": [ + 0.47056383269590996, + 0.38177659453392665, + 0.3938682169161623, + 0.4228053540586962, + 0.38246007885580596 + ] + }, + "hi": { + "main_score": 0.42015656983611543, + "v_measure": 0.42015656983611543, + "v_measure_std": 0.02912377481544926, + "v_measures": [ + 0.3937559772328873, + 0.458814670444042, + 0.3944729170258295, + 0.4523622937412655, + 0.40137699073655303 + ] + }, + "kn": { + "main_score": 0.3975630424710955, + "v_measure": 0.3975630424710955, + "v_measure_std": 0.02140617819951331, + "v_measures": [ + 0.3850406572744366, + 0.40359275814445467, + 0.43445254376123893, + 0.39409355934528506, + 0.3706356938300621 + ] + }, + "ml": { + "main_score": 0.4314312862442121, + "v_measure": 0.4314312862442121, + "v_measure_std": 0.041302009854990884, + "v_measures": [ + 0.41196613422164635, + 0.5001004626869213, + 0.4100078755888857, + 0.45352770176775214, + 0.3815542569558553 + ] + }, + "mr": { + "main_score": 0.4458930893881707, + "v_measure": 0.4458930893881707, + "v_measure_std": 0.05888071695406765, + "v_measures": [ + 0.4757921543008085, + 0.5082075884589856, + 0.4242430832374764, + 0.4798908624864185, + 0.34133175845716457 + ] + }, + "or": { + "main_score": 0.3830589085949544, + "v_measure": 0.3830589085949544, + "v_measure_std": 0.04126934650768488, + "v_measures": [ + 0.4221243155754502, + 0.38661861765826727, + 0.39138096733774613, + 0.41054735139122933, + 0.30462329101207913 + ] + }, + "pa": { + "main_score": 0.413428132829562, + "v_measure": 0.413428132829562, + "v_measure_std": 
0.027146722109158312, + "v_measures": [ + 0.43973719048831933, + 0.4314347809902511, + 0.4226783480036921, + 0.41060447414618134, + 0.3626858705193662 + ] + }, + "ta": { + "main_score": 0.41382186577400254, + "v_measure": 0.41382186577400254, + "v_measure_std": 0.024171284038429493, + "v_measures": [ + 0.43376351090526816, + 0.4154996135099985, + 0.36862557394761314, + 0.43561358809810585, + 0.415607042409027 + ] + }, + "te": { + "main_score": 0.39733522065922383, + "v_measure": 0.39733522065922383, + "v_measure_std": 0.009749907305329921, + "v_measures": [ + 0.4156969552896471, + 0.3938957052644776, + 0.39798349173903314, + 0.39107960908482675, + 0.38802034191813445 + ] + }, + "ur": { + "main_score": 0.42981092185025843, + "v_measure": 0.42981092185025843, + "v_measure_std": 0.04285550103152173, + "v_measures": [ + 0.4638832631582357, + 0.4800188678011598, + 0.43334328167870156, + 0.41448080785491787, + 0.3573283887582773 + ] + } + } +} \ No newline at end of file diff --git a/results/sentence-transformers__paraphrase-multilingual-MiniLM-L12-v2/IndicReviewsClusteringP2P.json b/results/sentence-transformers__paraphrase-multilingual-MiniLM-L12-v2/IndicReviewsClusteringP2P.json new file mode 100644 index 0000000000..68149023ed --- /dev/null +++ b/results/sentence-transformers__paraphrase-multilingual-MiniLM-L12-v2/IndicReviewsClusteringP2P.json @@ -0,0 +1,164 @@ +{ + "dataset_revision": "ccb472517ce32d103bba9d4f5df121ed5a6592a4", + "mteb_dataset_name": "IndicReviewsClusteringP2P", + "mteb_version": "1.7.17", + "test": { + "as": { + "main_score": 0.283905671281757, + "v_measure": 0.283905671281757, + "v_measure_std": 0.032274913722608364, + "v_measures": [ + 0.3351686632628645, + 0.26310907074553536, + 0.30794806251724083, + 0.26314424025728467, + 0.25015831962585944 + ] + }, + "bd": { + "main_score": 0.22260652203657538, + "v_measure": 0.22260652203657538, + "v_measure_std": 0.02890123894565602, + "v_measures": [ + 0.2304232640105884, + 0.22627495985348936, + 
0.2516254867627226, + 0.2372790542608185, + 0.16742984529525803 + ] + }, + "bn": { + "main_score": 0.3034799179608592, + "v_measure": 0.3034799179608592, + "v_measure_std": 0.03346069847546473, + "v_measures": [ + 0.35430616485912053, + 0.31628429923931345, + 0.3115401862689122, + 0.257240435033857, + 0.27802850440309274 + ] + }, + "evaluation_time": 13.59, + "gu": { + "main_score": 0.3804257151047565, + "v_measure": 0.3804257151047565, + "v_measure_std": 0.02545161711524506, + "v_measures": [ + 0.42776359209116405, + 0.38483239481742143, + 0.3668243435094857, + 0.36710660249766336, + 0.35560164260804794 + ] + }, + "hi": { + "main_score": 0.3849010518314793, + "v_measure": 0.3849010518314793, + "v_measure_std": 0.025111990543539492, + "v_measures": [ + 0.3609806522406112, + 0.4256069823171105, + 0.3655429234427713, + 0.4027078800548277, + 0.3696668211020759 + ] + }, + "kn": { + "main_score": 0.2303267125024783, + "v_measure": 0.2303267125024783, + "v_measure_std": 0.010307182059553333, + "v_measures": [ + 0.24642688441776245, + 0.2268546852320908, + 0.23575234209351617, + 0.22710984528274192, + 0.2154898054862801 + ] + }, + "ml": { + "main_score": 0.2362183628604662, + "v_measure": 0.2362183628604662, + "v_measure_std": 0.027070157428323114, + "v_measures": [ + 0.2385401128171075, + 0.2301995623276578, + 0.28597283229903037, + 0.20662457582081836, + 0.2197547310377169 + ] + }, + "mr": { + "main_score": 0.3850912366134934, + "v_measure": 0.3850912366134934, + "v_measure_std": 0.02926063271558728, + "v_measures": [ + 0.38672209986255046, + 0.4368109236452519, + 0.3776930013072469, + 0.34641801122206334, + 0.3778121470303543 + ] + }, + "or": { + "main_score": 0.3100703145390522, + "v_measure": 0.3100703145390522, + "v_measure_std": 0.02522979139705925, + "v_measures": [ + 0.3387910222952054, + 0.2846776768553455, + 0.3410268072768593, + 0.28378946921572007, + 0.30206659705213074 + ] + }, + "pa": { + "main_score": 0.2554815273575678, + "v_measure": 0.2554815273575678, 
+ "v_measure_std": 0.029895903897804933, + "v_measures": [ + 0.2600456410355764, + 0.24181465420838455, + 0.30238100793737344, + 0.26253845000393106, + 0.21062788360257317 + ] + }, + "ta": { + "main_score": 0.24788105574887198, + "v_measure": 0.24788105574887198, + "v_measure_std": 0.023804510375870325, + "v_measures": [ + 0.28696360541246535, + 0.2355649945178528, + 0.25775674177461794, + 0.2434396095403764, + 0.21568032749904725 + ] + }, + "te": { + "main_score": 0.26959972238540286, + "v_measure": 0.26959972238540286, + "v_measure_std": 0.03124530874079984, + "v_measures": [ + 0.3144137052036683, + 0.2386380609168765, + 0.2919046085689876, + 0.27105633398286605, + 0.2319859032546158 + ] + }, + "ur": { + "main_score": 0.388335332759503, + "v_measure": 0.388335332759503, + "v_measure_std": 0.03553956810109769, + "v_measures": [ + 0.44400344837034483, + 0.41530256871433574, + 0.349478209471759, + 0.36320654829927906, + 0.3696858889417963 + ] + } + } +} \ No newline at end of file