Skip to content

Commit

Permalink
Add Databricks Dolly dataset
Browse files Browse the repository at this point in the history
PiperOrigin-RevId: 524810057
  • Loading branch information
tomvdw authored and The TensorFlow Datasets Authors committed Apr 19, 2023
1 parent 6762408 commit 27ed7a5
Show file tree
Hide file tree
Showing 8 changed files with 135 additions and 0 deletions.
Empty file.
11 changes: 11 additions & 0 deletions tensorflow_datasets/datasets/databricks_dolly/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
`databricks-dolly-15k` is an open source dataset of instruction-following
records used in training
[databricks/dolly-v2-12b](https://huggingface.co/databricks/dolly-v2-12b). The
records were generated by thousands of Databricks employees in several of the
behavioral categories outlined in the
[InstructGPT](https://arxiv.org/abs/2203.02155) paper, including brainstorming,
classification, closed QA, generation, information extraction, open QA, and
summarization.

This dataset can be used for any purpose, whether academic or commercial, under
the terms of the
[Creative Commons Attribution-ShareAlike 3.0 Unported License](https://creativecommons.org/licenses/by-sa/3.0/legalcode).
13 changes: 13 additions & 0 deletions tensorflow_datasets/datasets/databricks_dolly/TAGS.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
content.data-type.dialogue # Contains dialogue data.
content.data-type.text # Contains text data.
content.language.en # Contains text in language English / en.
ml.task.language-modeling # Relates to Language Modeling, a machine learning task.
ml.task.language-modelling # Relates to Language Modelling, a machine learning task.
ml.task.natural-language-understanding # Relates to Natural Language Understanding, a machine learning task.
ml.task.open-domain-question-answering # Relates to Open Domain Question Answering, a machine learning task.
ml.task.question-answering # Relates to Question Answering, a machine learning task.
ml.task.sequence-modeling # Relates to Sequence Modeling, a machine learning task.
ml.task.sequence-to-sequence-language-modeling # Relates to Sequence To Sequence Language Modeling, a machine learning task.
ml.task.sequence-to-sequence-language-modelling # Relates to Sequence to Sequence Language Modelling, a machine learning task.
ml.task.text-generation # Relates to Text Generation, a machine learning task.
ml.task.text-summarization # Relates to Text Summarization, a machine learning task.
15 changes: 15 additions & 0 deletions tensorflow_datasets/datasets/databricks_dolly/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
# coding=utf-8
# Copyright 2023 The TensorFlow Datasets Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
https://github.com/databrickslabs/dolly/raw/master/data/databricks-dolly-15k.jsonl 13216570 e442d96b320010510c67a6cc0a86ea988a2a6f71c7570d1afbdf4fa631896e58 databricks-dolly-15k.jsonl
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
# coding=utf-8
# Copyright 2023 The TensorFlow Datasets Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""databricks_dolly dataset."""
import json

from etils import epath
import tensorflow_datasets.public_api as tfds


class Builder(tfds.core.GeneratorBasedBuilder):
  """DatasetBuilder for databricks_dolly dataset.

  Each example is one record of the `databricks-dolly-15k.jsonl` file, exposed
  as four text features: `instruction`, `context`, `response` and `category`.
  """

  VERSION = tfds.core.Version('1.0.0')
  RELEASE_NOTES = {
      '1.0.0': 'Initial release.',
  }

  def _info(self) -> tfds.core.DatasetInfo:
    """Returns the dataset metadata."""
    return self.dataset_info_from_configs(
        features=tfds.features.FeaturesDict({
            'instruction': tfds.features.Text(),
            'context': tfds.features.Text(),
            'response': tfds.features.Text(),
            'category': tfds.features.Text(),
        }),
        homepage='https://github.com/databrickslabs/dolly',
        license='CC BY-SA 3.0',
    )

  def _split_generators(self, dl_manager: tfds.download.DownloadManager):
    """Downloads the JSONL file and returns the split generators.

    Args:
      dl_manager: Download manager used to fetch the dataset file.

    Returns:
      A dict mapping the single split name `'train'` to its example generator.
    """
    path = dl_manager.download(
        {
            'train': 'https://github.com/databrickslabs/dolly/raw/master/data/databricks-dolly-15k.jsonl'
        }
    )
    return {
        'train': self._generate_examples(path['train']),
    }

  def _generate_examples(self, path):
    """Yields one example per non-blank line of the JSONL file.

    Args:
      path: Path of the JSON-lines file; each non-blank line holds one JSON
        object.

    Yields:
      `(idx, content)` pairs, where `idx` is the zero-based line number in the
      file and `content` is the decoded JSON object of that line.
    """
    with epath.Path(path).open() as f:
      for idx, line in enumerate(f):
        # Lines read from a file keep their trailing newline, so a blank line
        # is '\n' (truthy). Strip before testing, otherwise the blank line
        # reaches `json.loads` and raises instead of being skipped.
        line = line.strip()
        if not line:
          continue
        yield idx, json.loads(line)
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
# coding=utf-8
# Copyright 2023 The TensorFlow Datasets Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""databricks_dolly dataset."""

from tensorflow_datasets.datasets.databricks_dolly import databricks_dolly_dataset_builder
import tensorflow_datasets.public_api as tfds


class DatabricksDollyTest(tfds.testing.DatasetBuilderTestCase):
  """Exercises the databricks_dolly builder through the shared test case."""

  # Builder class under test.
  DATASET_CLASS = databricks_dolly_dataset_builder.Builder

  # Split name -> example count; presumably matches the two records of the
  # dummy JSONL file shipped with this test.
  SPLITS = {'train': 2}

  # Dummy file name, keyed the same way as the builder's download request.
  DL_EXTRACT_RESULT = {
      'train': 'databricks-dolly-15k.jsonl',
  }


# Run the tests in this module when the file is executed as a script.
if __name__ == '__main__':
  tfds.testing.test_main()
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
{"instruction": "Who gave the UN the land in NY to build their HQ", "context": "", "response": "John D Rockerfeller", "category": "open_qa"}
{"instruction": "Why mobile is bad for human", "context": "", "response": "We are always engaged one phone which is not good.", "category": "brainstorming"}

0 comments on commit 27ed7a5

Please sign in to comment.