From 59abc8383227f131ba36967fa135bb31e47b41e8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Alexandre=20Gari=C3=A9py?= Date: Fri, 29 Mar 2024 18:13:04 -0400 Subject: [PATCH] Remove additional load_data --- docs/adding_a_dataset.md | 16 +--------------- .../Classification/da/DalajClassification.py | 17 +---------------- .../multilingual/NordicLangClassification.py | 17 +---------------- .../Clustering/fr/AlloProfClusteringP2P.py | 15 +-------------- .../Clustering/fr/AlloProfClusteringS2S.py | 14 +------------- 5 files changed, 5 insertions(+), 74 deletions(-) diff --git a/docs/adding_a_dataset.md b/docs/adding_a_dataset.md index d5141dbf5c..de1033c024 100644 --- a/docs/adding_a_dataset.md +++ b/docs/adding_a_dataset.md @@ -79,7 +79,7 @@ class VGClustering(AbsTaskClustering): dataset={ "path": "navjordj/VG_summarization", "revision": "d4c5a8ba10ae71224752c727094ac4c46947fa29", - } + }, date=("2012-01-01", "2020-01-01"), form="written", domains=["Academic", "Non-fiction"], @@ -92,20 +92,6 @@ class VGClustering(AbsTaskClustering): bibtex_citation= ... # removed for brevity ) - def load_data(self, **kwargs: dict): # noqa: ARG002 - """ - Load dataset from HuggingFace hub - """ - if self.data_loaded: - return - - self.dataset: datasets.DatasetDict = datasets.load_dataset( - **self.description["dataset"] - ) - - self.dataset_transform() - self.data_loaded = True - def dataset_transform(self): splits = self.description["eval_splits"] diff --git a/mteb/tasks/Classification/da/DalajClassification.py b/mteb/tasks/Classification/da/DalajClassification.py index 5bc612591d..cf3444f059 100644 --- a/mteb/tasks/Classification/da/DalajClassification.py +++ b/mteb/tasks/Classification/da/DalajClassification.py @@ -1,8 +1,6 @@ # SuperLIM tasks from __future__ import annotations -import datasets - from mteb.abstasks import AbsTaskClassification from mteb.abstasks.TaskMetadata import TaskMetadata @@ -13,6 +11,7 @@ class DalajClassification(AbsTaskClassification): dataset={ "path": "AI-Sweden/SuperLim", "revision": "7ebf0b4caa7b2ae39698a889de782c09e6f5ee56", + "name": "dalaj", }, description="A Swedish dataset for linguistic acceptability. Available as a part of Superlim.", reference="https://spraakbanken.gu.se/en/resources/superlim", @@ -42,20 +41,6 @@ def metadata_dict(self) -> dict[str, str]: metadata_dict["samples_per_label"] = 16 return metadata_dict - def load_data(self, **kwargs): - """ - Load dataset from HuggingFace hub - """ - if self.data_loaded: - return - - self.dataset = datasets.load_dataset( - name="dalaj", # chose the relevant subset - **self.metadata_dict["dataset"], - ) - self.dataset_transform() - self.data_loaded = True - def dataset_transform(self): """ This dataset consist of two columns of relevance, "original_sentence" and "corrected_sentence". diff --git a/mteb/tasks/Classification/multilingual/NordicLangClassification.py b/mteb/tasks/Classification/multilingual/NordicLangClassification.py index 243672137d..79251167a1 100644 --- a/mteb/tasks/Classification/multilingual/NordicLangClassification.py +++ b/mteb/tasks/Classification/multilingual/NordicLangClassification.py @@ -1,7 +1,5 @@ from __future__ import annotations -import datasets - from mteb.abstasks import AbsTaskClassification from mteb.abstasks.TaskMetadata import TaskMetadata @@ -14,6 +12,7 @@ class NordicLangClassification(AbsTaskClassification): dataset={ "path": "strombergnlp/nordic_langid", "revision": "e254179d18ab0165fdb6dbef91178266222bee2a", + "name": "10k", }, type="Classification", category="s2s", @@ -41,20 +40,6 @@ def metadata_dict(self) -> dict[str, str]: metadata_dict["samples_per_label"] = 32 return metadata_dict - def load_data(self, **kwargs): - """ - Load dataset from HuggingFace hub - """ - if self.data_loaded: - return - - self.dataset = datasets.load_dataset( - name="10k", - **self.metadata_dict["dataset"], - ) - self.dataset_transform() - self.data_loaded = True - def dataset_transform(self): self.dataset = self.dataset.rename_column("sentence", "text") self.dataset = self.dataset.rename_column("language", "label") diff --git a/mteb/tasks/Clustering/fr/AlloProfClusteringP2P.py b/mteb/tasks/Clustering/fr/AlloProfClusteringP2P.py index 0974cbc974..a0245b5f2b 100644 --- a/mteb/tasks/Clustering/fr/AlloProfClusteringP2P.py +++ b/mteb/tasks/Clustering/fr/AlloProfClusteringP2P.py @@ -16,6 +16,7 @@ class AlloProfClusteringP2P(AbsTaskClustering): dataset={ "path": "mteb/alloprof", "revision": "392ba3f5bcc8c51f578786c1fc3dae648662cb9b", + "name": "documents", }, type="Clustering", category="p2p", @@ -36,20 +37,6 @@ class AlloProfClusteringP2P(AbsTaskClustering): avg_character_length=None, ) - def load_data(self, **kwargs): - """ - Load dataset from HuggingFace hub and convert it to the standard format. - """ - if self.data_loaded: - return - - self.dataset = datasets.load_dataset( - name="documents", - **self.metadata_dict["dataset"], - ) - self.dataset_transform() - self.data_loaded = True - def create_description(self, example): example["text"] = example["title"] + " " + example["text"] return example diff --git a/mteb/tasks/Clustering/fr/AlloProfClusteringS2S.py b/mteb/tasks/Clustering/fr/AlloProfClusteringS2S.py index 158baa8273..6f1128f0e2 100644 --- a/mteb/tasks/Clustering/fr/AlloProfClusteringS2S.py +++ b/mteb/tasks/Clustering/fr/AlloProfClusteringS2S.py @@ -16,6 +16,7 @@ class AlloProfClusteringS2S(AbsTaskClustering): dataset={ "path": "mteb/alloprof", "revision": "392ba3f5bcc8c51f578786c1fc3dae648662cb9b", + "name": "documents", }, type="Clustering", category="s2s", @@ -36,19 +37,6 @@ class AlloProfClusteringS2S(AbsTaskClustering): avg_character_length=None, ) - def load_data(self, **kwargs): - """ - Load dataset from HuggingFace hub and convert it to the standard format. - """ - if self.data_loaded: - return - self.dataset = datasets.load_dataset( - name="documents", - **self.metadata_dict["dataset"], - ) - self.dataset_transform() - self.data_loaded = True - def dataset_transform(self): """ Convert to standard format