Skip to content

Commit

Permalink
fix: added the first thai classification dataset (#538)
Browse files Browse the repository at this point in the history
* added thai sentiment dataset

* all changes based on comments made. models rerun and points added

* Update mteb/tasks/Classification/tha/wisesight_sentiment_classification.py

* Update docs/mmteb/points/538.jsonl

---------

Co-authored-by: Kenneth Enevoldsen <[email protected]>
  • Loading branch information
Akash190104 and KennethEnevoldsen authored Apr 24, 2024
1 parent 716b17f commit 5f5935c
Show file tree
Hide file tree
Showing 6 changed files with 87 additions and 0 deletions.
2 changes: 2 additions & 0 deletions docs/mmteb/points/538.jsonl
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
{"GitHub": "Akash190104", "New dataset": 2}
{"GitHub": "KennethEnevoldsen", "Review PR": 2}
1 change: 1 addition & 0 deletions mteb/tasks/Classification/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,7 @@
from .slk.SlovakSentimentClassification import *
from .ssw.SiswatiNewsClassification import *
from .swe.SweRecClassification import *
from .tha.wisesight_sentiment_classification import *
from .tur.TurkishMovieSentimentClassification import *
from .tur.TurkishProductSentimentClassification import *
from .uig.UyghurSentimentClassification import *
Expand Down
Empty file.
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
from __future__ import annotations

from mteb.abstasks.AbsTaskClassification import AbsTaskClassification
from mteb.abstasks.TaskMetadata import TaskMetadata


class wisesight_sentiment_classification(AbsTaskClassification):
    """Sentiment classification task over the Thai Wisesight Sentiment Corpus.

    The task is evaluated on the ``test`` split only and scored with F1
    (see ``eval_splits`` and ``main_score`` in the metadata below).
    """

    metadata = TaskMetadata(
        name="wisesight_sentiment_classification",
        description="Wisesight Sentiment Corpus: Social media messages in Thai language with sentiment label (positive, neutral, negative, question)",
        reference="https://github.com/PyThaiNLP/wisesight-sentiment",
        dataset={
            "path": "wisesight_sentiment",
            "revision": "14aa5773afa135ba835cc5179bbc4a63657a42ae",
        },
        type="Classification",
        category="s2s",
        eval_splits=["test"],
        eval_langs=["tha-Thai"],
        main_score="f1",
        date=("2019-05-24", "2021-09-16"),
        form=["written"],
        dialect=[],
        domains=["Social", "News"],
        task_subtypes=["Sentiment/Hate speech"],
        license="cc0-1.0",
        socioeconomic_status="mixed",
        annotations_creators="expert-annotated",
        text_creation="found",
        bibtex_citation="""@software{bact_2019_3457447,
author = {Suriyawongkul, Arthit and
Chuangsuwanich, Ekapol and
Chormai, Pattarawat and
Polpanumas, Charin},
title = {PyThaiNLP/wisesight-sentiment: First release},
month = sep,
year = 2019,
publisher = {Zenodo},
version = {v1.0},
doi = {10.5281/zenodo.3457447},
url = {https://doi.org/10.5281/zenodo.3457447}
}
""",
        # Keyed by the evaluated split: only "test" is listed in eval_splits and
        # only "test" is subsampled in dataset_transform, so the sample count
        # belongs to "test" rather than "train".
        # NOTE(review): 2048 presumably is the size stratified_subsampling
        # produces by default -- confirm against AbsTask.
        n_samples={"test": 2048},
        # NOTE(review): it is not visible here which split 103.42 was measured
        # on, so the original "train" key is kept. Re-measure on the subsampled
        # "test" split and re-key it to match eval_splits.
        avg_character_length={"train": 103.42},
    )

    def dataset_transform(self):
        """Rename the raw columns and subsample the evaluation split.

        Every split has its ``texts`` column renamed to ``text`` and its
        ``category`` column renamed to ``label``. Afterwards ``self.dataset``
        is replaced by the result of ``self.stratified_subsampling`` applied
        to the ``test`` split, seeded with ``self.seed``.
        """
        # Only values are reassigned (no keys added or removed), so iterating
        # the mapping while updating it is safe.
        for split in self.dataset:
            self.dataset[split] = (
                self.dataset[split]
                .rename_column("texts", "text")
                .rename_column("category", "label")
            )

        self.dataset = self.stratified_subsampling(
            self.dataset,
            seed=self.seed,
            splits=["test"],
        )
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
{
"dataset_revision": "14aa5773afa135ba835cc5179bbc4a63657a42ae",
"mteb_dataset_name": "wisesight_sentiment_classification",
"mteb_version": "1.7.17",
"test": {
"accuracy": 0.374267578125,
"accuracy_stderr": 0.027660389391074545,
"evaluation_time": 117.94,
"f1": 0.3439113310260592,
"f1_stderr": 0.024188314643972506,
"main_score": 0.3439113310260592
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
{
"dataset_revision": "14aa5773afa135ba835cc5179bbc4a63657a42ae",
"mteb_dataset_name": "wisesight_sentiment_classification",
"mteb_version": "1.7.17",
"test": {
"accuracy": 0.361669921875,
"accuracy_stderr": 0.030422743273052427,
"evaluation_time": 64.73,
"f1": 0.322002704618304,
"f1_stderr": 0.02587338704093155,
"main_score": 0.322002704618304
}
}

0 comments on commit 5f5935c

Please sign in to comment.