Skip to content

Commit

Permalink
fix: add Clustering dataset for Indic languages (#532)
Browse files: browse the repository at this point in the history
* add Indic clustering dataset

* update module import statement

* add points for the contribution
  • Loading branch information
jaygala24 authored Apr 24, 2024
1 parent a61de4c commit dc9ba24
Show file tree
Hide file tree
Showing 5 changed files with 418 additions and 0 deletions.
3 changes: 3 additions & 0 deletions docs/mmteb/points/532.jsonl
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
{"GitHub": "jaygala24", "New dataset": 36}
{"GitHub": "digantamisra98", "New dataset": 18}
{"GitHub": "asparius", "Review PR": 2}
1 change: 1 addition & 0 deletions mteb/tasks/Clustering/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
from .fra.HALClusteringS2S import *
from .fra.MLSUMClusteringP2P import *
from .fra.MLSUMClusteringS2S import *
from .multilingual.IndicReviewsClusteringP2P import *
from .multilingual.MasakhaNEWSClusteringP2P import *
from .multilingual.MasakhaNEWSClusteringS2S import *
from .nob.snl_clustering import *
Expand Down
86 changes: 86 additions & 0 deletions mteb/tasks/Clustering/multilingual/IndicReviewsClusteringP2P.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
from __future__ import annotations

from typing import Any

import datasets
import numpy as np

from mteb.abstasks import AbsTaskClustering, MultilingualTask
from mteb.abstasks.TaskMetadata import TaskMetadata

# Languages covered by this task. Each key is the short code appended to
# "translation-" to pick the dataset config in load_data; each value is a
# one-element list holding the matching language-script tag
# (presumably ISO 639-3 language + ISO 15924 script -- confirm against the
# TaskMetadata language schema).
_LANGUAGES = {
    code: [tag]
    for code, tag in (
        ("as", "asm-Beng"),
        ("bd", "brx-Deva"),
        ("bn", "ben-Beng"),
        ("gu", "guj-Gujr"),
        ("hi", "hin-Deva"),
        ("kn", "kan-Knda"),
        ("ml", "mal-Mlym"),
        ("mr", "mar-Deva"),
        ("or", "ory-Orya"),
        ("pa", "pan-Guru"),
        ("ta", "tam-Taml"),
        ("te", "tel-Telu"),
        ("ur", "urd-Arab"),
    )
}


class IndicReviewsClusteringP2P(AbsTaskClustering, MultilingualTask):
    """Clustering task over IndicSentiment reviews, one subset per language.

    For every language the "INDIC REVIEW" texts of the test split are
    clustered against their "GENERIC CATEGORIES" labels. The test split is
    cut into 5 consecutive chunks, each stored as one row of
    ``sentences`` / ``labels``.
    """

    metadata = TaskMetadata(
        name="IndicReviewsClusteringP2P",
        dataset={
            "path": "ai4bharat/IndicSentiment",
            "revision": "ccb472517ce32d103bba9d4f5df121ed5a6592a4",
        },
        # NOTE(review): the description says "14 sets" while _LANGUAGES lists
        # 13 language configs -- confirm the intended count before editing.
        description="Clustering of reviews from IndicSentiment dataset. Clustering of 14 sets on the generic categories label.",
        reference="https://arxiv.org/abs/2212.05409",
        type="Clustering",
        category="p2p",
        eval_splits=["test"],
        eval_langs=_LANGUAGES,
        main_score="v_measure",
        date=("2022-08-01", "2022-12-20"),
        form=["written"],
        domains=["Reviews"],
        task_subtypes=["Thematic clustering"],
        license="CC0",
        socioeconomic_status="mixed",
        annotations_creators="human-annotated",
        dialect=[],
        text_creation="machine-translated and verified",
        bibtex_citation="""@article{doddapaneni2022towards,
title = {Towards Leaving No Indic Language Behind: Building Monolingual Corpora, Benchmark and Models for Indic Languages},
author = {Sumanth Doddapaneni and Rahul Aralikatte and Gowtham Ramesh and Shreyansh Goyal and Mitesh M. Khapra and Anoop Kunchukuttan and Pratyush Kumar},
journal = {Annual Meeting of the Association for Computational Linguistics},
year = {2022},
doi = {10.18653/v1/2023.acl-long.693}
}""",
        n_samples={"test": 1000},
        avg_character_length={"test": 137.6},
    )

    def load_data(self, **kwargs: Any) -> None:
        """Load the ``translation-<lang>`` config for each language and reshape it.

        Does nothing when ``self.data_loaded`` is already truthy. ``kwargs``
        is accepted but not used by this method.
        """
        if self.data_loaded:
            return

        self.dataset = {}
        for lang in self.langs:
            # Path and revision come from the task metadata; only the config
            # name varies per language.
            self.dataset[lang] = datasets.load_dataset(
                name=f"translation-{lang}",
                **self.metadata_dict["dataset"],
            )

        self.dataset_transform()
        self.data_loaded = True

    def dataset_transform(self) -> None:
        """Rewrite each language's test split into the clustering layout.

        The reviews and their category labels are each divided into 5
        consecutive chunks; the resulting test split has one row per chunk
        with a ``sentences`` list and a parallel ``labels`` list.
        """
        n_sets = 5

        for lang in self.langs:
            per_lang = self.dataset[lang]

            # Only the test split is evaluated. Passing a default keeps this
            # from raising KeyError when a config has no validation split.
            per_lang.pop("validation", None)

            test_split = per_lang["test"]
            texts = test_split["INDIC REVIEW"]
            labels = test_split["GENERIC CATEGORIES"]

            # np.array_split tolerates lengths not divisible by n_sets
            # (the leading chunks get one extra item).
            new_format = {
                "sentences": [
                    chunk.tolist() for chunk in np.array_split(texts, n_sets)
                ],
                "labels": [
                    chunk.tolist() for chunk in np.array_split(labels, n_sets)
                ],
            }
            per_lang["test"] = datasets.Dataset.from_dict(new_format)
164 changes: 164 additions & 0 deletions results/intfloat__multilingual-e5-small/IndicReviewsClusteringP2P.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,164 @@
{
"dataset_revision": "ccb472517ce32d103bba9d4f5df121ed5a6592a4",
"mteb_dataset_name": "IndicReviewsClusteringP2P",
"mteb_version": "1.7.17",
"test": {
"as": {
"main_score": 0.38858031116616254,
"v_measure": 0.38858031116616254,
"v_measure_std": 0.03860410447319436,
"v_measures": [
0.44718331638993847,
0.376494146234758,
0.39350320433544006,
0.39838324584809703,
0.32733764302257895
]
},
"bd": {
"main_score": 0.3083789302070182,
"v_measure": 0.3083789302070182,
"v_measure_std": 0.018797624777147037,
"v_measures": [
0.30090855025003,
0.3267577057010023,
0.33296647851774197,
0.28230512576623834,
0.2989567908000782
]
},
"bn": {
"main_score": 0.44060339762415,
"v_measure": 0.44060339762415,
"v_measure_std": 0.026150909476873926,
"v_measures": [
0.44975702765370407,
0.4376960948746971,
0.46839926933251225,
0.45500731078171,
0.39215728547812656
]
},
"evaluation_time": 13.01,
"gu": {
"main_score": 0.4102948154121003,
"v_measure": 0.4102948154121003,
"v_measure_std": 0.03360584068564712,
"v_measures": [
0.47056383269590996,
0.38177659453392665,
0.3938682169161623,
0.4228053540586962,
0.38246007885580596
]
},
"hi": {
"main_score": 0.42015656983611543,
"v_measure": 0.42015656983611543,
"v_measure_std": 0.02912377481544926,
"v_measures": [
0.3937559772328873,
0.458814670444042,
0.3944729170258295,
0.4523622937412655,
0.40137699073655303
]
},
"kn": {
"main_score": 0.3975630424710955,
"v_measure": 0.3975630424710955,
"v_measure_std": 0.02140617819951331,
"v_measures": [
0.3850406572744366,
0.40359275814445467,
0.43445254376123893,
0.39409355934528506,
0.3706356938300621
]
},
"ml": {
"main_score": 0.4314312862442121,
"v_measure": 0.4314312862442121,
"v_measure_std": 0.041302009854990884,
"v_measures": [
0.41196613422164635,
0.5001004626869213,
0.4100078755888857,
0.45352770176775214,
0.3815542569558553
]
},
"mr": {
"main_score": 0.4458930893881707,
"v_measure": 0.4458930893881707,
"v_measure_std": 0.05888071695406765,
"v_measures": [
0.4757921543008085,
0.5082075884589856,
0.4242430832374764,
0.4798908624864185,
0.34133175845716457
]
},
"or": {
"main_score": 0.3830589085949544,
"v_measure": 0.3830589085949544,
"v_measure_std": 0.04126934650768488,
"v_measures": [
0.4221243155754502,
0.38661861765826727,
0.39138096733774613,
0.41054735139122933,
0.30462329101207913
]
},
"pa": {
"main_score": 0.413428132829562,
"v_measure": 0.413428132829562,
"v_measure_std": 0.027146722109158312,
"v_measures": [
0.43973719048831933,
0.4314347809902511,
0.4226783480036921,
0.41060447414618134,
0.3626858705193662
]
},
"ta": {
"main_score": 0.41382186577400254,
"v_measure": 0.41382186577400254,
"v_measure_std": 0.024171284038429493,
"v_measures": [
0.43376351090526816,
0.4154996135099985,
0.36862557394761314,
0.43561358809810585,
0.415607042409027
]
},
"te": {
"main_score": 0.39733522065922383,
"v_measure": 0.39733522065922383,
"v_measure_std": 0.009749907305329921,
"v_measures": [
0.4156969552896471,
0.3938957052644776,
0.39798349173903314,
0.39107960908482675,
0.38802034191813445
]
},
"ur": {
"main_score": 0.42981092185025843,
"v_measure": 0.42981092185025843,
"v_measure_std": 0.04285550103152173,
"v_measures": [
0.4638832631582357,
0.4800188678011598,
0.43334328167870156,
0.41448080785491787,
0.3573283887582773
]
}
}
}
Loading

0 comments on commit dc9ba24

Please sign in to comment.