Skip to content

Commit

Permalink
Remove additional load_data
Browse files (browse the repository at this point in the history)
  • Loading branch information
gariepyalex committed Mar 29, 2024
1 parent a9b5808 commit 59abc83
Show file tree
Hide file tree
Showing 5 changed files with 5 additions and 74 deletions.
16 changes: 1 addition & 15 deletions docs/adding_a_dataset.md
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,7 @@ class VGClustering(AbsTaskClustering):
dataset={
"path": "navjordj/VG_summarization",
"revision": "d4c5a8ba10ae71224752c727094ac4c46947fa29",
}
},
date=("2012-01-01", "2020-01-01"),
form="written",
domains=["Academic", "Non-fiction"],
Expand All @@ -92,20 +92,6 @@ class VGClustering(AbsTaskClustering):
bibtex_citation= ... # removed for brevity
)

def load_data(self, **kwargs: dict): # noqa: ARG002
"""
Load dataset from HuggingFace hub
"""
if self.data_loaded:
return

self.dataset: datasets.DatasetDict = datasets.load_dataset(
**self.description["dataset"]
)

self.dataset_transform()
self.data_loaded = True

def dataset_transform(self):
splits = self.description["eval_splits"]

Expand Down
17 changes: 1 addition & 16 deletions mteb/tasks/Classification/da/DalajClassification.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,6 @@
# SuperLIM tasks
from __future__ import annotations

import datasets

from mteb.abstasks import AbsTaskClassification
from mteb.abstasks.TaskMetadata import TaskMetadata

Expand All @@ -13,6 +11,7 @@ class DalajClassification(AbsTaskClassification):
dataset={
"path": "AI-Sweden/SuperLim",
"revision": "7ebf0b4caa7b2ae39698a889de782c09e6f5ee56",
"name": "dalaj",
},
description="A Swedish dataset for linguistic acceptability. Available as a part of Superlim.",
reference="https://spraakbanken.gu.se/en/resources/superlim",
Expand Down Expand Up @@ -42,20 +41,6 @@ def metadata_dict(self) -> dict[str, str]:
metadata_dict["samples_per_label"] = 16
return metadata_dict

def load_data(self, **kwargs):
"""
Load dataset from HuggingFace hub
"""
if self.data_loaded:
return

self.dataset = datasets.load_dataset(
name="dalaj", # chose the relevant subset
**self.metadata_dict["dataset"],
)
self.dataset_transform()
self.data_loaded = True

def dataset_transform(self):
"""
This dataset consist of two columns of relevance, "original_sentence" and "corrected_sentence".
Expand Down
17 changes: 1 addition & 16 deletions — file name missing from this capture; the diff below is NordicLangClassification.py (inferred from the class at the `@@` hunk header, and from the commit totals of 5 additions / 74 deletions)
Original file line number Diff line number Diff line change
@@ -1,7 +1,5 @@
from __future__ import annotations

import datasets

from mteb.abstasks import AbsTaskClassification
from mteb.abstasks.TaskMetadata import TaskMetadata

Expand All @@ -14,6 +12,7 @@ class NordicLangClassification(AbsTaskClassification):
dataset={
"path": "strombergnlp/nordic_langid",
"revision": "e254179d18ab0165fdb6dbef91178266222bee2a",
"name": "10k",
},
type="Classification",
category="s2s",
Expand Down Expand Up @@ -41,20 +40,6 @@ def metadata_dict(self) -> dict[str, str]:
metadata_dict["samples_per_label"] = 32
return metadata_dict

def load_data(self, **kwargs):
"""
Load dataset from HuggingFace hub
"""
if self.data_loaded:
return

self.dataset = datasets.load_dataset(
name="10k",
**self.metadata_dict["dataset"],
)
self.dataset_transform()
self.data_loaded = True

def dataset_transform(self):
self.dataset = self.dataset.rename_column("sentence", "text")
self.dataset = self.dataset.rename_column("language", "label")
15 changes: 1 addition & 14 deletions mteb/tasks/Clustering/fr/AlloProfClusteringP2P.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ class AlloProfClusteringP2P(AbsTaskClustering):
dataset={
"path": "mteb/alloprof",
"revision": "392ba3f5bcc8c51f578786c1fc3dae648662cb9b",
"name": "documents",
},
type="Clustering",
category="p2p",
Expand All @@ -36,20 +37,6 @@ class AlloProfClusteringP2P(AbsTaskClustering):
avg_character_length=None,
)

def load_data(self, **kwargs):
"""
Load dataset from HuggingFace hub and convert it to the standard format.
"""
if self.data_loaded:
return

self.dataset = datasets.load_dataset(
name="documents",
**self.metadata_dict["dataset"],
)
self.dataset_transform()
self.data_loaded = True

def create_description(self, example):
example["text"] = example["title"] + " " + example["text"]
return example
Expand Down
14 changes: 1 addition & 13 deletions mteb/tasks/Clustering/fr/AlloProfClusteringS2S.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ class AlloProfClusteringS2S(AbsTaskClustering):
dataset={
"path": "mteb/alloprof",
"revision": "392ba3f5bcc8c51f578786c1fc3dae648662cb9b",
"name": "documents",
},
type="Clustering",
category="s2s",
Expand All @@ -36,19 +37,6 @@ class AlloProfClusteringS2S(AbsTaskClustering):
avg_character_length=None,
)

def load_data(self, **kwargs):
"""
Load dataset from HuggingFace hub and convert it to the standard format.
"""
if self.data_loaded:
return
self.dataset = datasets.load_dataset(
name="documents",
**self.metadata_dict["dataset"],
)
self.dataset_transform()
self.data_loaded = True

def dataset_transform(self):
"""
Convert to standard format
Expand Down

0 comments on commit 59abc83

Please sign in to comment.