Skip to content

Commit

Permalink
fix: Added Sentiment Analysis Bengali Dataset (#536)
Browse files Browse the repository at this point in the history
* Added Hindi sentiment analysis dataset
* Made changes based on review comments and added points to the points table
* linted correctly
* Bengali sentiment analysis dataset
* Added Sentiment Analysis Bengali Dataset
* deleted old Hindi sentiment files
* removed Hindi points
* updated all references, rerun tests, and added points
  • Loading branch information
Akash190104 authored Apr 24, 2024
1 parent ee96dc8 commit ba9bcaa
Show file tree
Hide file tree
Showing 5 changed files with 78 additions and 0 deletions.
3 changes: 3 additions & 0 deletions docs/mmteb/points/536.jsonl
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
{"GitHub": "Akash190104", "New dataset": 2}
{"GitHub": "asparius", "Review PR": 2}
{"GitHub": "isaac-chung", "Review PR": 2}
1 change: 1 addition & 0 deletions mteb/tasks/Classification/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
from .ara.TweetSarcasmClassification import *
from .bam.BambaraSentimentClassification import *
from .ben.BengaliHateSpeechClassification import *
from .ben.BengaliSentimentAnalysis import *
from .bul.BulgarianSentimentClassification import *
from .bul.BulgarianStoreReviewSentimentClassfication import *
from .ces.CzechSubjectivityClassification import *
Expand Down
44 changes: 44 additions & 0 deletions mteb/tasks/Classification/ben/BengaliSentimentAnalysis.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
from __future__ import annotations

from mteb.abstasks.AbsTaskClassification import AbsTaskClassification
from mteb.abstasks.TaskMetadata import TaskMetadata


class BengaliSentimentAnalysis(AbsTaskClassification):
    """Classification task over Bengali reviews labelled Positive or Negative.

    The task is described entirely by ``metadata``; the only custom logic is
    ``dataset_transform``, which swaps the loaded data for the result of
    ``stratified_subsampling`` on the ``"train"`` split.
    """

    metadata = TaskMetadata(
        # --- Identity and provenance ---
        name="BengaliSentimentAnalysis",
        description="dataset contains 3307 Negative reviews and 8500 Positive reviews collected and manually annotated from Youtube Bengali drama.",
        reference="https://data.mendeley.com/datasets/p6zc7krs37/4",
        dataset={
            "path": "Akash190104/bengali_sentiment_analysis",
            "revision": "a4b3685b1854cc26c554dda4c7cb918a36a6fb6c",
        },
        license="CC BY 4.0",
        bibtex_citation="""@inproceedings{sazzed2020cross,
title={Cross-lingual sentiment classification in low-resource Bengali language},
author={Sazzed, Salim},
booktitle={Proceedings of the Sixth Workshop on Noisy User-generated Text (W-NUT 2020)},
pages={50--60},
year={2020}
}""",
        # --- Evaluation setup ---
        type="Classification",
        category="s2s",
        task_subtypes=["Sentiment/Hate speech"],
        # Only a "train" split is declared, so it doubles as the eval split.
        eval_splits=["train"],
        eval_langs=["ben-Beng"],
        main_score="f1",
        # --- Corpus characteristics ---
        date=("2020-06-24", "2020-11-26"),
        form=["written"],
        dialect=[],
        domains=["Reviews"],
        socioeconomic_status="mixed",
        annotations_creators="human-annotated",
        text_creation="found",
        # --- Size statistics ---
        n_samples={"train": 11807},
        avg_character_length={"train": 69.66},
    )

    def dataset_transform(self):
        """Replace ``self.dataset`` with a subsample of its ``"train"`` split.

        Delegates to ``self.stratified_subsampling`` using ``self.seed``.
        """
        subsampled = self.stratified_subsampling(
            self.dataset,
            seed=self.seed,
            splits=["train"],
        )
        self.dataset = subsampled
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
{
"dataset_revision": "a4b3685b1854cc26c554dda4c7cb918a36a6fb6c",
"mteb_dataset_name": "BengaliSentimentAnalysis",
"mteb_version": "1.7.17",
"train": {
"accuracy": 0.853759765625,
"accuracy_stderr": 0.029660222421585114,
"ap": 0.9118492318156679,
"ap_stderr": 0.018562450430510975,
"evaluation_time": 66.98,
"f1": 0.8318878279557277,
"f1_stderr": 0.03000343346915389,
"main_score": 0.8318878279557277
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
{
"dataset_revision": "a4b3685b1854cc26c554dda4c7cb918a36a6fb6c",
"mteb_dataset_name": "BengaliSentimentAnalysis",
"mteb_version": "1.7.17",
"train": {
"accuracy": 0.6267578125,
"accuracy_stderr": 0.03611908523026426,
"ap": 0.7938445145047481,
"ap_stderr": 0.016033199516799757,
"evaluation_time": 52.43,
"f1": 0.6061352150370449,
"f1_stderr": 0.030922844922113873,
"main_score": 0.6061352150370449
}
}

0 comments on commit ba9bcaa

Please sign in to comment.