Add Databricks Dolly dataset

PiperOrigin-RevId: 524810057
tensorflow · Apr 19, 2023 · 27ed7a5 · 27ed7a5
1 parent 6762408
commit 27ed7a5
Show file tree

Hide file tree

Showing 8 changed files with 135 additions and 0 deletions.
diff --git a/tensorflow_datasets/datasets/databricks_dolly/CITATIONS.bib b/tensorflow_datasets/datasets/databricks_dolly/CITATIONS.bib
diff --git a/tensorflow_datasets/datasets/databricks_dolly/README.md b/tensorflow_datasets/datasets/databricks_dolly/README.md
@@ -0,0 +1,11 @@
+`databricks-dolly-15k` is an open source dataset of instruction-following
+records used in training
+[databricks/dolly-v2-12b](https://huggingface.co/databricks/dolly-v2-12b) that
+was generated by thousands of Databricks employees in several of the behavioral
+categories outlined in the [InstructGPT](https://arxiv.org/abs/2203.02155)
+paper, including brainstorming, classification, closed QA, generation,
+information extraction, open QA, and summarization.
+
+This dataset can be used for any purpose, whether academic or commercial, under
+the terms of the
+[Creative Commons Attribution-ShareAlike 3.0 Unported License](https://creativecommons.org/licenses/by-sa/3.0/legalcode).
diff --git a/tensorflow_datasets/datasets/databricks_dolly/TAGS.txt b/tensorflow_datasets/datasets/databricks_dolly/TAGS.txt
@@ -0,0 +1,13 @@
+content.data-type.dialogue # Contains dialogue data.
+content.data-type.text # Contains text data.
+content.language.en # Contains text in language English / en.
+ml.task.language-modeling # Relates to Language Modeling, a machine learning task.
+ml.task.language-modelling # Relates to Language Modelling, a machine learning task.
+ml.task.natural-language-understanding # Relates to Natural Language Understanding, a machine learning task.
+ml.task.open-domain-question-answering # Relates to Open Domain Question Answering, a machine learning task.
+ml.task.question-answering # Relates to Question Answering, a machine learning task.
+ml.task.sequence-modeling # Relates to Sequence Modeling, a machine learning task.
+ml.task.sequence-to-sequence-language-modeling # Relates to Sequence To Sequence Language Modeling, a machine learning task.
+ml.task.sequence-to-sequence-language-modelling # Relates to Sequence to Sequence Language Modelling, a machine learning task.
+ml.task.text-generation # Relates to Text Generation, a machine learning task.
+ml.task.text-summarization # Relates to Text Summarization, a machine learning task.
diff --git a/tensorflow_datasets/datasets/databricks_dolly/__init__.py b/tensorflow_datasets/datasets/databricks_dolly/__init__.py
@@ -0,0 +1,15 @@
+# coding=utf-8
+# Copyright 2023 The TensorFlow Datasets Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
diff --git a/tensorflow_datasets/datasets/databricks_dolly/checksums.tsv b/tensorflow_datasets/datasets/databricks_dolly/checksums.tsv
@@ -0,0 +1 @@
+https://github.com/databrickslabs/dolly/raw/master/data/databricks-dolly-15k.jsonl	13216570	e442d96b320010510c67a6cc0a86ea988a2a6f71c7570d1afbdf4fa631896e58	databricks-dolly-15k.jsonl
diff --git a/tensorflow_datasets/datasets/databricks_dolly/databricks_dolly_dataset_builder.py b/tensorflow_datasets/datasets/databricks_dolly/databricks_dolly_dataset_builder.py
@@ -0,0 +1,60 @@
+# coding=utf-8
+# Copyright 2023 The TensorFlow Datasets Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""databricks_dolly dataset."""
+import json
+
+from etils import epath
+import tensorflow_datasets.public_api as tfds
+
+
+class Builder(tfds.core.GeneratorBasedBuilder):
+  """DatasetBuilder for databricks_dolly dataset."""
+
+  VERSION = tfds.core.Version('1.0.0')
+  RELEASE_NOTES = {
+      '1.0.0': 'Initial release.',
+  }
+
+  def _info(self) -> tfds.core.DatasetInfo:
+    """Returns the dataset metadata."""
+    return self.dataset_info_from_configs(
+        features=tfds.features.FeaturesDict({
+            'instruction': tfds.features.Text(),
+            'context': tfds.features.Text(),
+            'response': tfds.features.Text(),
+            'category': tfds.features.Text(),
+        }),
+        homepage='https://github.com/databrickslabs/dolly',
+        license='CC BY-SA 3.0',
+    )
+
+  def _split_generators(self, dl_manager: tfds.download.DownloadManager):
+    path = dl_manager.download(
+        {
+            'train': 'https://github.com/databrickslabs/dolly/raw/master/data/databricks-dolly-15k.jsonl'
+        }
+    )
+    return {
+        'train': self._generate_examples(path['train']),
+    }
+
+  def _generate_examples(self, path):
+    with epath.Path(path).open() as f:
+      for idx, line in enumerate(f):
+        if not line:
+          continue
+        content = json.loads(line)
+        yield idx, content
diff --git a/tensorflow_datasets/datasets/databricks_dolly/databricks_dolly_dataset_builder_test.py b/tensorflow_datasets/datasets/databricks_dolly/databricks_dolly_dataset_builder_test.py
@@ -0,0 +1,33 @@
+# coding=utf-8
+# Copyright 2023 The TensorFlow Datasets Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""databricks_dolly dataset."""
+
+from tensorflow_datasets.datasets.databricks_dolly import databricks_dolly_dataset_builder
+import tensorflow_datasets.public_api as tfds
+
+
+class DatabricksDollyTest(tfds.testing.DatasetBuilderTestCase):
+  """Tests for databricks_dolly dataset."""
+
+  DATASET_CLASS = databricks_dolly_dataset_builder.Builder
+  SPLITS = {
+      'train': 2,
+  }
+  DL_EXTRACT_RESULT = {'train': 'databricks-dolly-15k.jsonl'}
+
+
+if __name__ == '__main__':
+  tfds.testing.test_main()
diff --git a/tensorflow_datasets/datasets/databricks_dolly/dummy_data/databricks-dolly-15k.jsonl b/tensorflow_datasets/datasets/databricks_dolly/dummy_data/databricks-dolly-15k.jsonl
@@ -0,0 +1,2 @@
+{"instruction": "Who gave the UN the land in NY to build their HQ", "context": "", "response": "John D Rockerfeller", "category": "open_qa"}
+{"instruction": "Why mobile is bad for human", "context": "", "response": "We are always engaged one phone which is not good.", "category": "brainstorming"}