embeddings-benchmark · KennethEnevoldsen · Apr 18, 2024 · Apr 18, 2024 · Apr 18, 2024
diff --git a/docs/mmteb/points.md b/docs/mmteb/points.md
@@ -2,7 +2,7 @@
 
 | GitHub            | Total points | New dataset | New task | Dataset annotations | (Bug)fixes | Running Models | Review PR |  Paper Writing | Ideation | Coordination |
 |-------------------| ------------ |-------------| -------- | ------------------- | ---------- | -------------- |-----------| -------------- | -------- | ------------- |
-| KennethEnevoldsen |              | 54          |          |                   8 |         18 |                | 33        |                |          |             5 |
+| KennethEnevoldsen |              | 54          |          |                   8 |         18 |                | 34        |                |          |             5 |
 | x-tabdeveloping   |              | 48          |          |                     |            |                |           |                |          |               |
 | imenelydiaker     |              | 88          |          |                     |          2 |                | 15        |                |          |               |
 | wissam-sib        |              | 88          |          |                     |            |                | 1         |                |          |               |
@@ -32,6 +32,7 @@
 | manandey          |              | 12          |          |                     |            |                |           |                |          |               |
 | isaac-chung       |              | 24          |          |                     |            |                | 2         |                |          |               |
 | asparius          |              | 8           |          |                     |            |                |           |                |          |               |
+| rbroc             |              | 6           |          |                     |            |                |           |                |          |               |
 
 
 Note that coordination and ideation is not included in the points yet, but is used to determine first and last authors.

diff --git a/mteb/abstasks/TaskMetadata.py b/mteb/abstasks/TaskMetadata.py
@@ -41,6 +41,7 @@
 TASK_DOMAIN = Literal[
     "Academic",
     "Blog",
+    "Constructed",
     "Encyclopaedic",
     "Fiction",
     "Government",

diff --git a/mteb/tasks/Classification/__init__.py b/mteb/tasks/Classification/__init__.py
@@ -21,6 +21,7 @@
 from .est.estonian_valence import *
 from .hin.HindiDiscourseClassification import *
 from .ind.IndonesianIdClickbaitClassification import *
+from .ita.ItaHateClassification import *
 from .jpn.WRIMEClassification import *
 from .mkd.MacedonianTweetSentimentClassification import *
 from .multilingual.AmazonCounterfactualClassification import *

diff --git a/mteb/tasks/Classification/ita/ItaHateClassification.py b/mteb/tasks/Classification/ita/ItaHateClassification.py
@@ -0,0 +1,87 @@
+from __future__ import annotations
+
+from mteb.abstasks.AbsTaskClassification import AbsTaskClassification
+from mteb.abstasks.TaskMetadata import TaskMetadata
+
+
+class ItaHateClassification(AbsTaskClassification):
+    """Binary hate speech classification on the Italian HateCheck test cases."""
+
+    metadata = TaskMetadata(
+        name="ItaHateClassification",
+        dataset={
+            "path": "Paul/hatecheck-italian",
+            "revision": "21e3d5c827cb60619a89988b24979850a7af85a5",
+        },
+        description="""Hate speech detection dataset with binary
+                       (hateful vs non-hateful) labels. Includes 25+ distinct types of hate
+                       and challenging non-hate. Multilingual dataset released as 10 unilingual models
+                     """,
+        reference="https://aclanthology.org/2022.woah-1.15/",
+        type="Classification",
+        category="s2s",
+        eval_splits=["test"],
+        eval_langs=["ita-Latn"],
+        main_score="accuracy",
+        date=("2021-11-01", "2022-02-28"),
+        form=["written"],
+        domains=["Constructed"],
+        task_subtypes=["Sentiment/Hate speech"],
+        license="cc-by-4.0",
+        socioeconomic_status="high",
+        annotations_creators="expert-annotated",
+        dialect=[],
+        text_creation="created",
+        # The backslash is doubled so the BibTeX accent \"o is not consumed as a
+        # Python escape sequence.
+        bibtex_citation="""
+        @inproceedings{rottger-etal-2022-multilingual,
+            title = "Multilingual {H}ate{C}heck: Functional Tests for Multilingual Hate Speech Detection Models",
+            author = {R{\\"o}ttger, Paul  and
+            Seelawi, Haitham  and
+            Nozza, Debora  and
+            Talat, Zeerak  and
+            Vidgen, Bertie},
+            editor = "Narang, Kanika  and
+            Mostafazadeh Davani, Aida  and
+            Mathias, Lambert  and
+            Vidgen, Bertie  and
+            Talat, Zeerak",
+            booktitle = "Proceedings of the Sixth Workshop on Online Abuse and Harms (WOAH)",
+            month = jul,
+            year = "2022",
+            address = "Seattle, Washington (Hybrid)",
+            publisher = "Association for Computational Linguistics",
+            url = "https://aclanthology.org/2022.woah-1.15",
+            doi = "10.18653/v1/2022.woah-1.15",
+            pages = "154--169",
+            abstract = "Hate speech detection models are typically evaluated on held-out test sets. However, this risks painting an incomplete and potentially misleading picture of model performance because of increasingly well-documented systematic gaps and biases in hate speech datasets. To enable more targeted diagnostic insights, recent research has thus introduced functional tests for hate speech detection models. However, these tests currently only exist for English-language content, which means that they cannot support the development of more effective models in other languages spoken by billions across the world. To help address this issue, we introduce Multilingual HateCheck (MHC), a suite of functional tests for multilingual hate speech detection models. MHC covers 34 functionalities across ten languages, which is more languages than any other hate speech dataset. To illustrate MHC{'}s utility, we train and test a high-performing multilingual hate speech detection model, and reveal critical model weaknesses for monolingual and cross-lingual applications.",
+        }
+        """,
+        n_samples={"test": 1845},
+        avg_character_length={"test": 50.4},
+    )
+
+    def dataset_transform(self):
+        """Rename columns and halve the test split with stratified sampling.
+
+        ``test_case`` and ``label_gold`` are renamed to ``text`` and ``label``,
+        and ``label`` and ``functionality`` are class-encoded. The original
+        ``test`` split is then divided 50/50 (seed 42), stratified by
+        ``functionality``, and the result of that split replaces
+        ``self.dataset``. Finally, every column other than ``text`` and
+        ``label`` is removed.
+        """
+        rename_dict = {"test_case": "text", "label_gold": "label"}
+        # Collected before renaming, so the names are the original column names.
+        remove_cols = [
+            col for col in self.dataset["test"].column_names if col not in rename_dict
+        ]
+        self.dataset = self.dataset.rename_columns(rename_dict)
+        self.dataset = self.dataset.class_encode_column("label")
+        # Encoded so it can serve as the stratification column below.
+        self.dataset = self.dataset.class_encode_column("functionality")
+        self.dataset = self.dataset["test"].train_test_split(
+            test_size=0.5, seed=42, stratify_by_column="functionality"
+        )  # balanced sampling across types of hate speech
+        self.dataset = self.dataset.remove_columns(remove_cols)
diff --git a/mteb/tasks/Classification/ita/__init__.py b/mteb/tasks/Classification/ita/__init__.py
diff --git a/results/intfloat__multilingual-e5-small/ItaHateClassification.json b/results/intfloat__multilingual-e5-small/ItaHateClassification.json
@@ -0,0 +1,15 @@
+{
+  "dataset_revision": "21e3d5c827cb60619a89988b24979850a7af85a5",
+  "mteb_dataset_name": "ItaHateClassification",
+  "mteb_version": "1.6.10",
+  "test": {
+    "accuracy": 0.5959349593495935,
+    "accuracy_stderr": 0.05178534901667928,
+    "ap": 0.3476262458086787,
+    "ap_stderr": 0.02235628028867616,
+    "evaluation_time": 13.67,
+    "f1": 0.5627839442396998,
+    "f1_stderr": 0.038675129943598265,
+    "main_score": 0.5959349593495935
+  }
+}
diff --git a/results/sentence-transformers__paraphrase-multilingual-MiniLM-L12-v2/ItaHateClassification.json b/results/sentence-transformers__paraphrase-multilingual-MiniLM-L12-v2/ItaHateClassification.json
@@ -0,0 +1,15 @@
+{
+  "dataset_revision": "21e3d5c827cb60619a89988b24979850a7af85a5",
+  "mteb_dataset_name": "ItaHateClassification",
+  "mteb_version": "1.6.10",
+  "test": {
+    "accuracy": 0.584010840108401,
+    "accuracy_stderr": 0.041498116799461505,
+    "ap": 0.3352188603107494,
+    "ap_stderr": 0.01879799601881252,
+    "evaluation_time": 10.0,
+    "f1": 0.5475091948731234,
+    "f1_stderr": 0.03215657704717439,
+    "main_score": 0.584010840108401
+  }
+}