-
Notifications
You must be signed in to change notification settings - Fork 1.5k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
PiperOrigin-RevId: 524810057
- Loading branch information
Showing
8 changed files
with
135 additions
and
0 deletions.
There are no files selected for viewing
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,11 @@ | ||
`databricks-dolly-15k` is an open source dataset of instruction-following | ||
records used in training | ||
[databricks/dolly-v2-12b](https://huggingface.co/databricks/dolly-v2-12b) that | ||
was generated by thousands of Databricks employees in several of the behavioral | ||
categories outlined in the [InstructGPT](https://arxiv.org/abs/2203.02155) | ||
paper, including brainstorming, classification, closed QA, generation, | ||
information extraction, open QA, and summarization. | ||
|
||
This dataset can be used for any purpose, whether academic or commercial, under | ||
the terms of the | ||
[Creative Commons Attribution-ShareAlike 3.0 Unported License](https://creativecommons.org/licenses/by-sa/3.0/legalcode). |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,13 @@ | ||
content.data-type.dialogue # Contains dialogue data. | ||
content.data-type.text # Contains text data. | ||
content.language.en # Contains text in language English / en. | ||
ml.task.language-modeling # Relates to Language Modeling, a machine learning task. | ||
ml.task.language-modelling # Relates to Language Modelling, a machine learning task. | ||
ml.task.natural-language-understanding # Relates to Natural Language Understanding, a machine learning task. | ||
ml.task.open-domain-question-answering # Relates to Open Domain Question Answering, a machine learning task. | ||
ml.task.question-answering # Relates to Question Answering, a machine learning task. | ||
ml.task.sequence-modeling # Relates to Sequence Modeling, a machine learning task. | ||
ml.task.sequence-to-sequence-language-modeling # Relates to Sequence To Sequence Language Modeling, a machine learning task. | ||
ml.task.sequence-to-sequence-language-modelling # Relates to Sequence to Sequence Language Modelling, a machine learning task. | ||
ml.task.text-generation # Relates to Text Generation, a machine learning task. | ||
ml.task.text-summarization # Relates to Text Summarization, a machine learning task. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,15 @@ | ||
# coding=utf-8 | ||
# Copyright 2023 The TensorFlow Datasets Authors. | ||
# | ||
# Licensed under the Apache License, Version 2.0 (the "License"); | ||
# you may not use this file except in compliance with the License. | ||
# You may obtain a copy of the License at | ||
# | ||
# http://www.apache.org/licenses/LICENSE-2.0 | ||
# | ||
# Unless required by applicable law or agreed to in writing, software | ||
# distributed under the License is distributed on an "AS IS" BASIS, | ||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
# See the License for the specific language governing permissions and | ||
# limitations under the License. | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
https://github.com/databrickslabs/dolly/raw/master/data/databricks-dolly-15k.jsonl 13216570 e442d96b320010510c67a6cc0a86ea988a2a6f71c7570d1afbdf4fa631896e58 databricks-dolly-15k.jsonl |
60 changes: 60 additions & 0 deletions
60
tensorflow_datasets/datasets/databricks_dolly/databricks_dolly_dataset_builder.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,60 @@ | ||
# coding=utf-8 | ||
# Copyright 2023 The TensorFlow Datasets Authors. | ||
# | ||
# Licensed under the Apache License, Version 2.0 (the "License"); | ||
# you may not use this file except in compliance with the License. | ||
# You may obtain a copy of the License at | ||
# | ||
# http://www.apache.org/licenses/LICENSE-2.0 | ||
# | ||
# Unless required by applicable law or agreed to in writing, software | ||
# distributed under the License is distributed on an "AS IS" BASIS, | ||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
# See the License for the specific language governing permissions and | ||
# limitations under the License. | ||
|
||
"""databricks_dolly dataset.""" | ||
import json | ||
|
||
from etils import epath | ||
import tensorflow_datasets.public_api as tfds | ||
|
||
|
||
class Builder(tfds.core.GeneratorBasedBuilder):
  """DatasetBuilder for databricks_dolly dataset."""

  VERSION = tfds.core.Version('1.0.0')
  RELEASE_NOTES = {
      '1.0.0': 'Initial release.',
  }

  def _info(self) -> tfds.core.DatasetInfo:
    """Returns the dataset metadata."""
    return self.dataset_info_from_configs(
        features=tfds.features.FeaturesDict({
            'instruction': tfds.features.Text(),
            'context': tfds.features.Text(),
            'response': tfds.features.Text(),
            'category': tfds.features.Text(),
        }),
        homepage='https://github.com/databrickslabs/dolly',
        license='CC BY-SA 3.0',
    )

  def _split_generators(self, dl_manager: tfds.download.DownloadManager):
    """Downloads the JSONL file and returns the generator for `train`.

    Args:
      dl_manager: Download manager used to fetch the dataset file.

    Returns:
      A dict mapping the single split name `train` to its example generator.
    """
    path = dl_manager.download(
        {
            'train': 'https://github.com/databrickslabs/dolly/raw/master/data/databricks-dolly-15k.jsonl'
        }
    )
    return {
        'train': self._generate_examples(path['train']),
    }

  def _generate_examples(self, path):
    """Yields `(key, example)` pairs, one per non-blank line of the file.

    Args:
      path: Location of the JSON Lines file; each non-blank line is parsed as
        one JSON document.

    Yields:
      Tuples of the zero-based line index and the decoded JSON object.
    """
    with epath.Path(path).open() as f:
      for idx, line in enumerate(f):
        # Lines read from a file keep their trailing newline, so a blank line
        # is '\n' (truthy). Strip before testing, otherwise blank lines reach
        # `json.loads` and raise instead of being skipped.
        if not line.strip():
          continue
        content = json.loads(line)
        # `idx` counts every physical line, so keys stay unique even when
        # blank lines are skipped.
        yield idx, content
33 changes: 33 additions & 0 deletions
33
tensorflow_datasets/datasets/databricks_dolly/databricks_dolly_dataset_builder_test.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,33 @@ | ||
# coding=utf-8 | ||
# Copyright 2023 The TensorFlow Datasets Authors. | ||
# | ||
# Licensed under the Apache License, Version 2.0 (the "License"); | ||
# you may not use this file except in compliance with the License. | ||
# You may obtain a copy of the License at | ||
# | ||
# http://www.apache.org/licenses/LICENSE-2.0 | ||
# | ||
# Unless required by applicable law or agreed to in writing, software | ||
# distributed under the License is distributed on an "AS IS" BASIS, | ||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
# See the License for the specific language governing permissions and | ||
# limitations under the License. | ||
|
||
"""databricks_dolly dataset.""" | ||
|
||
from tensorflow_datasets.datasets.databricks_dolly import databricks_dolly_dataset_builder | ||
import tensorflow_datasets.public_api as tfds | ||
|
||
|
||
class DatabricksDollyTest(tfds.testing.DatasetBuilderTestCase):
  """Test configuration for the databricks_dolly dataset builder."""

  # Builder class exercised by the shared TFDS test case.
  DATASET_CLASS = databricks_dolly_dataset_builder.Builder

  # Expected number of examples per split.
  SPLITS = {'train': 2}

  # File name substituted for the download result of the `train` entry
  # (presumably resolved against the dummy_data directory by the test
  # harness -- confirm against the tfds.testing documentation).
  DL_EXTRACT_RESULT = {'train': 'databricks-dolly-15k.jsonl'}
|
||
|
||
# Hand control to the TFDS test runner only when executed as a script.
if __name__ == '__main__':
  tfds.testing.test_main()
2 changes: 2 additions & 0 deletions
2
tensorflow_datasets/datasets/databricks_dolly/dummy_data/databricks-dolly-15k.jsonl
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
{"instruction": "Who gave the UN the land in NY to build their HQ", "context": "", "response": "John D Rockerfeller", "category": "open_qa"} | ||
{"instruction": "Why mobile is bad for human", "context": "", "response": "We are always engaged one phone which is not good.", "category": "brainstorming"} |