Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: Normalize licenses including casing, uses of "-" etc. #1210

Merged
merged 2 commits into from
Sep 9, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion docs/adding_a_dataset.md
Original file line number Diff line number Diff line change
Expand Up @@ -104,7 +104,7 @@ class VGClustering(AbsTaskClustering):
form="Written",
domains=["Academic", "Non-fiction"],
task_subtypes=["Scientific Reranking"],
license="cc-by-nc",
license="cc-by-nc-4.0",
annotations_creators="derived",
dialect=[],
text_creation="found",
Expand Down
31 changes: 28 additions & 3 deletions mteb/abstasks/TaskMetadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,6 @@
"Web",
"Written",
"Programming",
None,
]

SAMPLE_CREATION_METHOD = Literal[
Expand Down Expand Up @@ -135,6 +134,32 @@
"sql",
]

# Closed set of accepted license identifiers; this list can be extended as needed.
# Identifiers are written in lowercase, similar to the Hugging Face datasets
# convention. "not specified" is also used when no license could be found.
LICENSES = Literal[
    "not specified",
    "mit",
    "cc-by-2.0",
    "cc-by-3.0",
    "cc-by-4.0",
    "cc-by-sa-3.0",
    "cc-by-sa-4.0",
    "cc-by-nc-4.0",
    "cc-by-nc-sa-3.0",
    "cc-by-nc-sa-4.0",
    "cc-by-nc-nd-4.0",
    "openrail",
    "openrail++",
    "odc-by",
    "afl-3.0",
    "apache-2.0",
    "cc-by-nd-2.1-jp",
    "cc0-1.0",
    "bsd-3-clause",
    "gpl-3.0",
    "cdla-sharing-1.0",
    "mpl-2.0",
]

METRIC_NAME = str
METRIC_VALUE = Union[int, float, Dict[str, Any]]
Expand Down Expand Up @@ -163,7 +188,7 @@ class TaskMetadata(BaseModel):
domains: The domains of the data. These includes "Non-fiction", "Social", "Fiction", "News", "Academic", "Blog", "Encyclopaedic",
"Government", "Legal", "Medical", "Poetry", "Religious", "Reviews", "Web", "Spoken", "Written". A dataset can belong to multiple domains.
task_subtypes: The subtypes of the task. E.g. includes "Sentiment/Hate speech", "Thematic Clustering". Feel free to update the list as needed.
license: The license of the data.
license: The license of the data specified as lowercase, e.g. "cc-by-nc-4.0". If the license is not specified, use "not specified". For custom licenses a URL is used.
socioeconomic_status: The socioeconomic status of the data. Includes "high", "medium", "low", "mixed".
annotations_creators: The type of the annotators. Includes "expert-annotated" (annotated by experts), "human-annotated" (annotated e.g. by
mturkers), "derived" (derived from structure in the data).
Expand Down Expand Up @@ -193,7 +218,7 @@ class TaskMetadata(BaseModel):
date: tuple[STR_DATE, STR_DATE] | None = None
domains: list[TASK_DOMAIN] | None = None
task_subtypes: list[TASK_SUBTYPE] | None = None
license: str | None = None
license: LICENSES | STR_URL | None = None

annotations_creators: ANNOTATOR_TYPE | None = None
dialect: list[str] | None = None
Expand Down
2 changes: 1 addition & 1 deletion mteb/tasks/BitextMining/dan/BornholmskBitextMining.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ class BornholmBitextMining(AbsTaskBitextMining):
main_score="f1",
date=("2019-01-01", "2019-12-31"),
domains=["Web", "Social", "Fiction", "Written"],
license="CC-BY-4.0",
license="cc-by-4.0",
task_subtypes=["Dialect pairing"],
annotations_creators="expert-annotated",
dialect=["da-dan-bornholm"],
Expand Down
2 changes: 1 addition & 1 deletion mteb/tasks/BitextMining/kat/TbilisiCityHallBitextMining.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ class TbilisiCityHallBitextMining(AbsTaskBitextMining, MultilingualTask):
reference="https://huggingface.co/datasets/jupyterjazz/tbilisi-city-hall-titles",
date=("2024-05-02", "2024-05-03"),
task_subtypes=[],
license="Not specified",
license="not specified",
annotations_creators="derived",
dialect=[],
bibtex_citation="",
Expand Down
2 changes: 1 addition & 1 deletion mteb/tasks/BitextMining/multilingual/BUCCBitextMining.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ class BUCCBitextMining(AbsTaskBitextMining, MultilingualTask):
date=("2017-01-01", "2018-12-31"),
domains=["Written"],
task_subtypes=[],
license="Unknown",
license="not specified",
annotations_creators="human-annotated",
dialect=[],
sample_creation="human-translated",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ class BUCCBitextMiningFast(AbsTaskBitextMining, MultilingualTask):
date=("2017-01-01", "2018-12-31"),
domains=["Written"],
task_subtypes=[],
license="Unknown",
license="not specified",
annotations_creators="human-annotated",
dialect=[],
sample_creation="human-translated",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -880,7 +880,7 @@ class BibleNLPBitextMining(AbsTaskBitextMining, MultilingualTask):
date=("1997-01-01", "2020-12-31"),
domains=["Religious", "Written"],
task_subtypes=[],
license="CC-BY-SA-4.0",
license="cc-by-sa-4.0",
annotations_creators="expert-annotated",
dialect=[],
sample_creation="created",
Expand Down
2 changes: 1 addition & 1 deletion mteb/tasks/BitextMining/multilingual/DiaBLaBitextMining.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ class DiaBLaBitextMining(AbsTaskBitextMining, MultilingualTask):
date=("2016-01-01", "2017-12-31"),
domains=["Social", "Written"],
task_subtypes=[],
license="CC BY-NC-SA 4.0",
license="cc-by-nc-sa-4.0",
annotations_creators="human-annotated",
dialect=[],
sample_creation="created",
Expand Down
2 changes: 1 addition & 1 deletion mteb/tasks/BitextMining/multilingual/FloresBitextMining.py
Original file line number Diff line number Diff line change
Expand Up @@ -255,7 +255,7 @@ class FloresBitextMining(AbsTaskBitextMining, MultilingualTask):
date=("2022-01-01", "2022-12-31"),
domains=["Non-fiction", "Encyclopaedic", "Written"],
task_subtypes=[],
license="CC BY-SA 4.0",
license="cc-by-sa-4.0",
annotations_creators="human-annotated",
dialect=[],
sample_creation="created",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,7 @@ class IN22ConvBitextMining(AbsTaskBitextMining, MultilingualTask):
date=("2022-10-01", "2023-03-01"),
domains=["Social", "Spoken", "Fiction", "Spoken"],
task_subtypes=[],
license="CC-BY-4.0",
license="cc-by-4.0",
annotations_creators="expert-annotated",
dialect=[],
sample_creation="created",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,7 @@ class IN22GenBitextMining(AbsTaskBitextMining, MultilingualTask):
"Written",
],
task_subtypes=[],
license="CC-BY-4.0",
license="cc-by-4.0",
annotations_creators="expert-annotated",
dialect=[],
sample_creation="created",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ class IWSLT2017BitextMining(AbsTaskBitextMining, MultilingualTask):
date=("2007-01-01", "2017-12-14"), # rough estimate
domains=["Non-fiction", "Fiction", "Written"],
task_subtypes=[],
license="CC-BY-NC-ND-4.0",
license="cc-by-nc-nd-4.0",
annotations_creators="expert-annotated",
dialect=[],
sample_creation="found",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -116,7 +116,7 @@ class IndicGenBenchFloresBitextMining(AbsTaskBitextMining, MultilingualTask):
date=("2023-10-01", "2024-05-01"),
domains=["Web", "News", "Written"],
task_subtypes=[],
license="CC-BY-SA-4.0",
license="cc-by-sa-4.0",
annotations_creators="expert-annotated",
dialect=[],
sample_creation="human-translated and localized",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ class LinceMTBitextMining(AbsTaskBitextMining, MultilingualTask):
date=("2019-01-01", "2020-01-01"),
domains=["Social", "Written"],
task_subtypes=[],
license="Unknown",
license="not specified",
annotations_creators="human-annotated",
dialect=[],
sample_creation="found",
Expand Down
2 changes: 1 addition & 1 deletion mteb/tasks/BitextMining/multilingual/NTREXBitextMining.py
Original file line number Diff line number Diff line change
Expand Up @@ -266,7 +266,7 @@ class NTREXBitextMining(AbsTaskBitextMining, MultilingualTask):
date=("2019-08-01", "2022-11-01"),
domains=["News", "Written"],
task_subtypes=[],
license="CC-BY-SA-4.0",
license="cc-by-sa-4.0",
annotations_creators="expert-annotated",
dialect=[],
sample_creation="human-translated and localized",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ class NollySentiBitextMining(AbsTaskBitextMining, MultilingualTask):
date=("2022-01-01", "2023-01-01"),
domains=["Social", "Reviews", "Written"],
task_subtypes=[],
license="CC BY-SA 4.0",
license="cc-by-sa-4.0",
annotations_creators="human-annotated",
dialect=[],
sample_creation="found",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ class NorwegianCourtsBitextMining(AbsTaskBitextMining):
date=("2020-01-01", "2020-12-31"),
domains=["Legal", "Written"],
task_subtypes=[],
license="CC BY 4.0",
license="cc-by-4.0",
annotations_creators="human-annotated",
dialect=[],
sample_creation="found",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ class NusaTranslationBitextMining(AbsTaskBitextMining, MultilingualTask):
date=("2021-08-01", "2022-07-01"),
domains=["Social", "Written"],
task_subtypes=[],
license="CC BY-SA 4.0",
license="cc-by-sa-4.0",
annotations_creators="human-annotated",
dialect=[],
sample_creation="created",
Expand Down
2 changes: 1 addition & 1 deletion mteb/tasks/BitextMining/multilingual/NusaXBitextMining.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ class NusaXBitextMining(AbsTaskBitextMining, MultilingualTask):
date=("2021-08-01", "2022-07-01"),
domains=["Reviews", "Written"],
task_subtypes=[],
license="CC BY-SA 4.0",
license="cc-by-sa-4.0",
annotations_creators="human-annotated",
dialect=[],
sample_creation="created",
Expand Down
2 changes: 1 addition & 1 deletion mteb/tasks/BitextMining/multilingual/PhincBitextMining.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ class PhincBitextMining(AbsTaskBitextMining, MultilingualTask):
date=("2019-01-01", "2020-01-01"),
domains=["Social", "Written"],
task_subtypes=[],
license="CC BY 4.0",
license="cc-by-4.0",
annotations_creators="human-annotated",
dialect=[],
sample_creation="found",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ class RomaTalesBitextMining(AbsTaskBitextMining, MultilingualTask):
), # Broad historical range for the creation of folk tales
domains=["Fiction", "Written"],
task_subtypes=[],
license="Not specified",
license="not specified",
annotations_creators="expert-annotated",
dialect=["Lovari"],
sample_creation="created",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -141,7 +141,7 @@ class TatoebaBitextMining(AbsTaskBitextMining, MultilingualTask):
"Written"
], # Tatoeba corpus includes a wide range of topics and domains
task_subtypes=[],
license="CC BY 2.0",
license="cc-by-2.0",
annotations_creators="human-annotated",
dialect=[], # No specific dialect mentioned
sample_creation="found",
Expand Down
2 changes: 1 addition & 1 deletion mteb/tasks/BitextMining/srn/SRNCorpusBitextMining.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ class SRNCorpusBitextMining(AbsTaskBitextMining, MultilingualTask):
date=("2022-04-01", "2022-07-31"),
domains=["Social", "Web", "Written"],
task_subtypes=[],
license="CC-BY-SA-4.0",
license="cc-by-sa-4.0",
annotations_creators="human-annotated",
dialect=[],
sample_creation="found",
Expand Down
2 changes: 1 addition & 1 deletion mteb/tasks/BitextMining/vie/VieMedEVBitextMining.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ class VieMedEVBitextMining(AbsTaskBitextMining):
date=("2024-08-28", "2022-03-28"),
domains=["Medical", "Written"],
task_subtypes=[],
license="cc-by-nc",
license="cc-by-nc-4.0", # version is assumed, but was previously unspecified
annotations_creators="expert-annotated",
dialect=[],
sample_creation="human-translated and localized",
Expand Down
2 changes: 1 addition & 1 deletion mteb/tasks/Classification/ara/AJGT.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ class AJGT(AbsTaskClassification):
date=("2021-01-01", "2022-01-25"),
domains=["Social", "Written"],
task_subtypes=["Sentiment/Hate speech"],
license="AFL",
license="afl-3.0",
annotations_creators="human-annotated",
dialect=["ara-arab-MSA", "ara-arab-JO"],
sample_creation="found",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ class HotelReviewSentimentClassification(AbsTaskClassification):
date=("2016-06-01", "2016-07-31"),
domains=["Reviews", "Written"],
task_subtypes=["Sentiment/Hate speech"],
license="Not specified",
license="not specified",
annotations_creators="derived",
dialect=["ara-arab-EG", "ara-arab-JO", "ara-arab-LB", "ara-arab-SA"],
sample_creation="found",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ class OnlineStoreReviewSentimentClassification(AbsTaskClassification):
date=("2024-05-01", "2024-05-15"),
domains=["Reviews", "Written"],
task_subtypes=["Sentiment/Hate speech"],
license="Not specified",
license="not specified",
annotations_creators="derived",
dialect=["ara-Arab-SA"],
sample_creation="found",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ class RestaurantReviewSentimentClassification(AbsTaskClassification):
date=("2014-01-01", "2015-01-01"),
domains=["Reviews", "Written"],
task_subtypes=["Sentiment/Hate speech"],
license="None specified",
license="not specified",
annotations_creators="derived",
dialect=["ara-arab-EG", "ara-arab-JO", "ara-arab-SA"],
sample_creation="found",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ class TweetEmotionClassification(AbsTaskClassification):
date=("2014-01-01", "2016-08-31"),
domains=["Social", "Written"],
task_subtypes=["Sentiment/Hate speech"],
license="Not specified",
license="not specified",
annotations_creators="human-annotated",
dialect=["ara-arab-EG", "ara-arab-LB", "ara-arab-JO", "ara-arab-SA"],
sample_creation="found",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ class TweetSarcasmClassification(AbsTaskClassification):
date=("2020-01-01", "2021-01-01"),
domains=["Social", "Written"],
task_subtypes=["Sentiment/Hate speech"],
license="MIT",
license="mit",
annotations_creators="human-annotated",
dialect=["ara-arab-EG", "ara-arab-LB", "ara-arab-MA", "ara-arab-SA"],
sample_creation="found",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ class BengaliDocumentClassification(AbsTaskClassification):
dialect=[],
domains=["News", "Written"],
task_subtypes=[],
license="CC BY-NC-SA 4.0",
license="cc-by-nc-sa-4.0",
annotations_creators="derived",
sample_creation="found",
bibtex_citation="""
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ class BengaliHateSpeechClassification(AbsTaskClassification):
dialect=[],
domains=["News", "Written"],
task_subtypes=["Sentiment/Hate speech"],
license="MIT",
license="mit",
annotations_creators="expert-annotated",
sample_creation="found",
bibtex_citation="""@inproceedings{karim2020BengaliNLP,
Expand Down
2 changes: 1 addition & 1 deletion mteb/tasks/Classification/ben/BengaliSentimentAnalysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ class BengaliSentimentAnalysis(AbsTaskClassification):
dialect=[],
domains=["Reviews", "Written"],
task_subtypes=["Sentiment/Hate speech"],
license="CC BY 4.0",
license="cc-by-4.0",
annotations_creators="human-annotated",
sample_creation="found",
bibtex_citation="""@inproceedings{sazzed2020cross,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ class CSFDCZMovieReviewSentimentClassification(AbsTaskClassification):
main_score="accuracy",
domains=["Reviews", "Written"],
task_subtypes=["Sentiment/Hate speech"],
license="CC-BY-SA-4.0",
license="cc-by-sa-4.0",
annotations_creators="derived",
dialect=[],
sample_creation="found",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ class CzechProductReviewSentimentClassification(AbsTaskClassification):
dialect=[],
domains=["Reviews", "Written"],
task_subtypes=["Sentiment/Hate speech"],
license="CC BY-NC-SA 4.0",
license="cc-by-nc-sa-4.0",
annotations_creators="derived",
sample_creation="found",
bibtex_citation="""
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ class CzechSoMeSentimentClassification(AbsTaskClassification):
dialect=[],
domains=["Reviews", "Written"],
task_subtypes=["Sentiment/Hate speech"],
license="CC BY-NC-SA 4.0",
license="cc-by-nc-sa-4.0",
annotations_creators="derived",
sample_creation="found",
bibtex_citation="""
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ class CzechSubjectivityClassification(AbsTaskClassification):
main_score="accuracy",
domains=["Reviews", "Written"],
task_subtypes=["Sentiment/Hate speech"],
license="Not specified",
license="not specified",
annotations_creators="human-annotated",
dialect=[],
sample_creation="found",
Expand Down
2 changes: 1 addition & 1 deletion mteb/tasks/Classification/dan/AngryTweetsClassification.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ class AngryTweetsClassification(AbsTaskClassification):
date=("2021-01-01", "2021-12-31"),
domains=["Social", "Written"],
task_subtypes=["Sentiment/Hate speech"],
license="CC-BY-4.0",
license="cc-by-4.0",
annotations_creators="human-annotated",
dialect=[],
sample_creation="found",
Expand Down
Loading
Loading