-
Notifications
You must be signed in to change notification settings - Fork 292
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
* Added tasks from SEB * docs: fix link * fix: ran linting * fix typing for 3.8 * fixed annotation for v3.8
- Loading branch information
1 parent
76056b5
commit 39cff49
Showing
59 changed files
with
2,034 additions
and
3 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Empty file.
52 changes: 52 additions & 0 deletions
52
mteb/tasks/BitextMining/nb/norwegian_courts_bitext_mining.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,52 @@ | ||
from typing import Any | ||
|
||
import datasets | ||
|
||
from mteb.abstasks import AbsTaskBitextMining | ||
from mteb.abstasks.TaskMetadata import TaskMetadata | ||
|
||
|
||
class NorwegianCourtsBitextMining(AbsTaskBitextMining):
    """Bitext mining task over the Norwegian courts parallel corpus.

    The dataset pairs Bokmål (``nb``) sentences with their Nynorsk (``nn``)
    counterparts. After loading, the two language columns are renamed to
    ``sentence1`` / ``sentence2``.
    """

    metadata = TaskMetadata(
        name="NorwegianCourtsBitextMining",
        hf_hub_name="kaedrodrur/norwegian-courts",
        description="Nynorsk and Bokmål parallel corpus from Norwegian courts. ",
        reference="https://opus.nlpl.eu/ELRC-Courts_Norway-v1.php",
        type="BitextMining",
        category="s2s",
        eval_splits=["test"],
        eval_langs=["nb", "nn"],
        main_score="accuracy",
        revision="d79af07e969a6678fcbbe819956840425816468f",
        date=("2000-01-01", "2020-12-31"),  # approximate guess
        # NOTE(review): "spoken"/"Spoken" and "Political classification" look
        # odd for a court-text bitext task -- confirm against the values
        # TaskMetadata allows before changing them.
        form=["spoken"],
        domains=["Spoken"],
        task_subtypes=["Political classification"],
        license="openUnder-PSI",
        socioeconomic_status="high",
        annotations_creators="derived",  # best guess
        dialect=[],
        text_creation="found",
        bibtex_citation=None,
        n_samples={"test": 456},
        avg_character_length={"test": 82.11},
    )

    def load_data(self, **kwargs: Any) -> None:  # noqa: ARG002
        """Load the dataset from the HuggingFace hub and rename its columns.

        Does nothing when ``self.data_loaded`` is already truthy. Otherwise the
        dataset named by the ``hf_hub_name`` metadata entry is fetched at the
        pinned ``revision`` (``None`` if the metadata has no revision),
        transformed via :meth:`dataset_transform`, and ``self.data_loaded`` is
        set to ``True``.

        Args:
            **kwargs: Accepted for interface compatibility; unused here.
        """
        if self.data_loaded:
            return

        self.dataset = datasets.load_dataset(
            self.metadata_dict["hf_hub_name"],
            revision=self.metadata_dict.get("revision"),
        )
        self.dataset_transform()
        self.data_loaded = True

    def dataset_transform(self) -> None:
        """Rename the language columns: ``nb`` -> ``sentence1``, ``nn`` -> ``sentence2``."""
        self.dataset = self.dataset.rename_column("nb", "sentence1")
        self.dataset = self.dataset.rename_column("nn", "sentence2")
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,113 @@ | ||
from __future__ import annotations | ||
|
||
import random | ||
from itertools import islice | ||
from typing import Iterable, TypeVar | ||
|
||
import datasets | ||
|
||
from mteb.abstasks import AbsTaskClustering, TaskMetadata | ||
|
||
T = TypeVar("T") | ||
|
||
|
||
def batched(iterable: Iterable[T], n: int) -> Iterable[tuple[T, ...]]:
    """Yield successive tuples of at most ``n`` items taken from ``iterable``.

    Example: ``batched('ABCDEFG', 3)`` yields ``ABC``, ``DEF``, ``G``.
    The final tuple may be shorter than ``n``; an empty iterable yields
    nothing.

    Args:
        iterable: Source of items; it is consumed lazily.
        n: Maximum number of items per batch.

    Raises:
        ValueError: If ``n`` is smaller than one. Because this is a generator,
            the error surfaces when iteration starts, not at call time.
    """
    if n < 1:
        raise ValueError("n must be at least one")
    it = iter(iterable)
    # islice returns an empty tuple once the iterator is exhausted, which
    # ends the loop.
    while batch := tuple(islice(it, n)):
        yield batch
|
||
|
||
class SNLClustering(AbsTaskClustering):
    """Clustering task built from the ``navjordj/SNL_summarization`` dataset.

    Both the ``ingress`` and the ``article`` column of every row are used as
    documents, and the (normalized) ``category`` column provides the cluster
    label for each of them.
    """

    # The BibTeX entry is a raw string: "\o" and "\A" are not valid Python
    # escape sequences and would otherwise trigger an invalid-escape warning.
    metadata = TaskMetadata(
        name="SNLClustering",
        hf_hub_name="navjordj/SNL_summarization",
        description="Webscrabed articles from the Norwegian lexicon 'Det Store Norske Leksikon'. Uses articles categories as clusters.",
        reference="https://huggingface.co/datasets/navjordj/SNL_summarization",
        type="Clustering",
        category="p2p",
        eval_splits=["test"],
        eval_langs=["nb"],
        main_score="v_measure",
        revision="3d3d27aa7af8941408cefc3991ada5d12a4273d1",
        date=("2020-01-01", "2024-12-31"),  # best guess
        form=["written"],
        domains=["Encyclopaedic", "Non-fiction"],
        license=None,
        socioeconomic_status="high",
        annotations_creators="derived",
        dialect=[],
        task_subtypes=["Thematic clustering"],
        text_creation="found",
        bibtex_citation=r"""@mastersthesis{navjord2023beyond,
title={Beyond extractive: advancing abstractive automatic text summarization in Norwegian with transformers},
author={Navjord, J{\o}rgen Johnsen and Korsvik, Jon-Mikkel Ryen},
year={2023},
school={Norwegian University of Life Sciences, {\AA}s}
}""",
        n_samples={"test": 2048},
        avg_character_length={"test": 1101.30},
    )

    def load_data(self, **kwargs: dict) -> None:  # noqa: ARG002
        """Load the dataset from the HuggingFace hub and transform it.

        Does nothing when ``self.data_loaded`` is already truthy. Otherwise the
        dataset named by the ``hf_hub_name`` metadata entry is fetched at the
        pinned ``revision``, passed through :meth:`dataset_transform`, and
        ``self.data_loaded`` is set to ``True``.

        Args:
            **kwargs: Accepted for interface compatibility; unused here.
        """
        if self.data_loaded:
            return

        self.dataset: datasets.DatasetDict = datasets.load_dataset(
            self.metadata_dict["hf_hub_name"],
            revision=self.metadata_dict.get("revision"),
        )  # type: ignore

        self.dataset_transform()
        self.data_loaded = True

    def dataset_transform(self) -> None:
        """Rebuild every evaluation split as batches of documents and labels.

        For each split listed in ``eval_splits`` the ``ingress`` and ``article``
        texts are collected (in that order) together with their normalized
        category label, shuffled with a fixed local seed, and cut into batches
        of 512 of which only the first four are kept. The result replaces
        ``self.dataset`` with ``sentences`` / ``labels`` columns.
        """
        label_col = "category"
        text_cols = ("ingress", "article")

        ds = {}
        for split in self.metadata_dict["eval_splits"]:
            ds_split = self.dataset[split]
            split_labels = self.normalize_labels(ds_split[label_col])

            # Fresh accumulators per split: sharing them across iterations
            # would leak documents of an earlier split into later ones.
            documents: list = []
            labels: list = []
            for text_col in text_cols:
                documents.extend(ds_split[text_col])
                labels.extend(split_labels)

            assert len(documents) == len(labels)

            rng = random.Random(42)  # local only seed
            pairs = list(zip(documents, labels))
            rng.shuffle(pairs)
            # Unzip with comprehensions so that an empty split yields two empty
            # lists instead of failing on tuple unpacking.
            documents = [document for document, _ in pairs]
            labels = [label for _, label in pairs]

            # reduce size of dataset to not have too large datasets in the clustering task
            documents_batched = list(batched(documents, 512))[:4]
            labels_batched = list(batched(labels, 512))[:4]

            ds[split] = datasets.Dataset.from_dict(
                {
                    "sentences": documents_batched,
                    "labels": labels_batched,
                }
            )

        self.dataset = datasets.DatasetDict(ds)

    @staticmethod
    def normalize_labels(labels: list[str]) -> list[str]:
        """Truncate each comma-separated category path to its first three levels.

        Example label:
        ``Store norske leksikon,Kunst og estetikk,Musikk,Klassisk musikk,Internasjonale dirigenter``
        """
        # When using 2 levels there is 17 unique labels
        # When using 3 levels there is 121 unique labels
        return [",".join(label.split(",")[:3]) for label in labels]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,117 @@ | ||
from __future__ import annotations | ||
|
||
import random | ||
from itertools import islice | ||
from typing import Iterable, TypeVar | ||
|
||
import datasets | ||
|
||
from mteb.abstasks import AbsTaskClustering, TaskMetadata | ||
|
||
T = TypeVar("T") | ||
|
||
|
||
def batched(iterable: Iterable[T], n: int) -> Iterable[tuple[T, ...]]:
    """Split ``iterable`` into consecutive tuples holding up to ``n`` items.

    Example: ``batched('ABCDEFG', 3)`` yields ``ABC``, ``DEF``, ``G``.
    Only the last tuple can be shorter than ``n``; nothing is yielded for an
    empty iterable.

    Args:
        iterable: Source of items; it is consumed lazily.
        n: Maximum number of items per batch.

    Raises:
        ValueError: If ``n`` is smaller than one. Because this is a generator,
            the error surfaces when iteration starts, not at call time.
    """
    if n < 1:
        raise ValueError("n must be at least one")
    it = iter(iterable)
    # An exhausted iterator makes islice produce an empty (falsy) tuple,
    # which terminates the loop.
    while batch := tuple(islice(it, n)):
        yield batch
|
||
|
||
class VGClustering(AbsTaskClustering):
    """Clustering task built from the ``navjordj/VG_summarization`` dataset.

    The ``title``, ``ingress`` and ``article`` column of every row are all used
    as documents, and the first entry of the ``classes`` column provides the
    cluster label for each of them.
    """

    # The BibTeX entry is a raw string: "\o" and "\A" are not valid Python
    # escape sequences and would otherwise trigger an invalid-escape warning.
    metadata = TaskMetadata(
        name="VGClustering",
        hf_hub_name="navjordj/VG_summarization",
        description="Articles and their classes (e.g. sports) from VG news articles extracted from Norsk Aviskorpus.",
        reference="https://huggingface.co/datasets/navjordj/VG_summarization",
        type="Clustering",
        category="p2p",
        eval_splits=["test"],
        eval_langs=["nb"],
        main_score="v_measure",
        revision="d4c5a8ba10ae71224752c727094ac4c46947fa29",
        date=("2020-01-01", "2024-12-31"),  # best guess
        form=["written"],
        domains=["News", "Non-fiction"],
        license=None,
        socioeconomic_status="mixed",
        annotations_creators="derived",
        dialect=[],
        task_subtypes=["Thematic clustering"],
        text_creation="found",
        bibtex_citation=r"""@mastersthesis{navjord2023beyond,
title={Beyond extractive: advancing abstractive automatic text summarization in Norwegian with transformers},
author={Navjord, J{\o}rgen Johnsen and Korsvik, Jon-Mikkel Ryen},
year={2023},
school={Norwegian University of Life Sciences, {\AA}s}
}""",
        n_samples={"test": 2048},
        avg_character_length={"test": 1009.65},
    )

    def load_data(self, **kwargs: dict) -> None:  # noqa: ARG002
        """Load the dataset from the HuggingFace hub and transform it.

        Does nothing when ``self.data_loaded`` is already truthy. Otherwise the
        dataset named by the ``hf_hub_name`` metadata entry is fetched at the
        pinned ``revision``, passed through :meth:`dataset_transform`, and
        ``self.data_loaded`` is set to ``True``.

        Args:
            **kwargs: Accepted for interface compatibility; unused here.
        """
        if self.data_loaded:
            return

        self.dataset: datasets.DatasetDict = datasets.load_dataset(
            self.metadata_dict["hf_hub_name"],
            revision=self.metadata_dict.get("revision"),
        )  # type: ignore

        self.dataset_transform()
        self.data_loaded = True

    def dataset_transform(self) -> None:
        """Rebuild every evaluation split as batches of documents and labels.

        For each split listed in ``eval_splits`` the ``title``, ``ingress`` and
        ``article`` texts are collected (in that order) together with their
        normalized class label, shuffled with a fixed local seed, and cut into
        batches of 512 of which only the first four are kept. The result
        replaces ``self.dataset`` with ``sentences`` / ``labels`` columns.
        """
        label_col = "classes"
        text_cols = ("title", "ingress", "article")

        ds = {}
        for split in self.metadata_dict["eval_splits"]:
            ds_split = self.dataset[split]
            split_labels = self.normalize_labels(ds_split[label_col])

            # Fresh accumulators per split: sharing them across iterations
            # would leak documents of an earlier split into later ones.
            documents: list = []
            labels: list = []
            for text_col in text_cols:
                documents.extend(ds_split[text_col])
                labels.extend(split_labels)

            assert len(documents) == len(labels)

            rng = random.Random(1111)  # local only seed
            # resampling changes scores from 12.68, 11.30, 12.65 (sample model)
            pairs = list(zip(documents, labels))
            rng.shuffle(pairs)
            # Unzip with comprehensions so that an empty split yields two empty
            # lists instead of failing on tuple unpacking.
            documents = [document for document, _ in pairs]
            labels = [label for _, label in pairs]

            # reduce size of dataset to not have too large datasets in the clustering task
            # See:
            # https://github.com/KennethEnevoldsen/scandinavian-embedding-benchmark/pull/96
            # for a discussion on sizes
            documents_batched = list(batched(documents, 512))[:4]
            labels_batched = list(batched(labels, 512))[:4]

            ds[split] = datasets.Dataset.from_dict(
                {
                    "sentences": documents_batched,
                    "labels": labels_batched,
                }
            )

        self.dataset = datasets.DatasetDict(ds)

    @staticmethod
    def normalize_labels(labels: list[str]) -> list[str]:
        """Keep only the first comma-separated entry of each label string."""
        # Agreed on and debated in: https://github.com/KennethEnevoldsen/scandinavian-embedding-benchmark/issues/83
        return [label.split(",")[0] for label in labels]
Empty file.
Oops, something went wrong.