Added the required files to generate an instructional Python dataset and updated `__init__.py`. This solves #297 (#3106)

---------

Co-authored-by: Andreas Köpf <[email protected]>
Co-authored-by: Oliver Stanley <[email protected]>
3 people authored May 13, 2023
1 parent a449ff7 commit 24856cd
Showing 4 changed files with 473 additions and 0 deletions.
1 change: 1 addition & 0 deletions data/datasets/__init__.py
@@ -27,6 +27,7 @@
"oa_stackexchange": "donfu/oa-stackexchange",
"stable_diffusion_instructional_dataset": "MadVoyager/stable_diffusion_instructional_dataset",
"ru_riddles_337": "0x22almostEvil/ru-riddles-377",
"instructional_codesearchnet_python": "Nan-Do/instructional_codesearchnet_python",
"tatoeba_mt_qna_oa": "0x22almostEvil/tatoeba-mt-qna-oa",
}

151 changes: 151 additions & 0 deletions data/datasets/instructional_codesearchnet_python/GenerateOpenAssistantInstructionResponseFormat.ipynb
@@ -0,0 +1,151 @@
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"provenance": []
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"language_info": {
"name": "python"
}
},
"cells": [
{
"cell_type": "code",
"source": [
"!pip install datasets tqdm"
],
"metadata": {
"id": "zLxBMw9Lsr6I"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "qyuuLNEzsaYR"
},
"outputs": [],
"source": [
"import gzip\n",
"import json\n",
"import pandas as pd\n",
"\n",
"from collections import defaultdict\n",
"from datasets import load_dataset\n",
"from tqdm.auto import tqdm\n",
"from random import random, randint\n",
"\n",
"ONE_STEP_OUPUT_CODE_TEMPLATES = [\n",
" \"Can you write a program in {lang} where it\\n\",\n",
" \"How would you implement a function in {lang} that\\n\",\n",
" \"Write a {lang} function for\\n\",\n",
" \"Can you create a {lang} program that\\n\",\n",
" \"Implement a function in {lang} to\\n\",\n",
" \"Write a {lang} script for\\n\",\n",
" \"How would you code a program in {lang} to\\n\",\n",
" \"Create a {lang} function for\\n\",\n",
" \"Write a {lang} program that can\\n\",\n",
" \"Can you implement a function in {lang} that\\n\",\n",
"]\n",
"\n",
"ONE_STEP_OUPUT_SUMMARY_TEMPLATES = [\n",
" \"Explain what the following {lang} code does\\n\",\n",
" \"Can you tell what is the following {lang} function doing\\n\",\n",
" \"Here you have a function in {lang}, explain what it does\\n\",\n",
" \"Make a summary of the following {lang} code\\n\",\n",
" \"Can you generate a brief explanation for the following {lang} code\\n\",\n",
" \"How would you explain what the following {lang} function does\\n\",\n",
" \"Can you generate the documentation for the following {lang} function\\n\",\n",
" \"Create a docstring for the following {lang} code\\n\",\n",
" \"Given the following {lang} function, write the documentation\\n\",\n",
" \"Write a docstring for the following {lang} function\\n\",\n",
"]\n",
"\n",
"\n",
"def remove_docstring(code_function):\n",
" triple_quotes = '\"\"\"'\n",
" lines = code_function.split(\"\\n\")\n",
"\n",
" c = lines[1].count(triple_quotes)\n",
" # There is no docstring\n",
" if c == 0:\n",
" return code_function\n",
" # One line dostring\n",
" if c == 2:\n",
" return \"\\n\".join([lines[0]] + lines[2:])\n",
"\n",
" idx = 2\n",
" while idx < len(lines) and triple_quotes not in lines[idx]:\n",
" idx += 1\n",
"\n",
" return \"\\n\".join([lines[0]] + lines[idx + 1 :])\n",
"\n",
"\n",
"lang = \"Python 3\"\n",
"data = defaultdict(list)\n",
"dataset = load_dataset(\"Nan-Do/codesearchnet-python\")\n",
"\n",
"for data_point in tqdm(dataset[\"train\"]):\n",
" code = data_point[\"original_string\"]\n",
" summary = data_point[\"summary\"]\n",
" data[\"SOURCE\"].append(\"codesearchnet\")\n",
" # Generate code\n",
" if random() > 0.5:\n",
" idx = randint(0, len(ONE_STEP_OUPUT_CODE_TEMPLATES) - 1)\n",
" template = ONE_STEP_OUPUT_CODE_TEMPLATES[idx].format(lang=lang) + summary\n",
" data[\"INSTRUCTION\"].append(template)\n",
" data[\"RESPONSE\"].append(code)\n",
" # Generate summary\n",
" else:\n",
" # We are generating the docstring or a summary so we better remove it from\n",
" # the function\n",
" if random() < 0.9:\n",
" code = remove_docstring(code)\n",
" idx = randint(0, len(ONE_STEP_OUPUT_SUMMARY_TEMPLATES) - 1)\n",
" template = ONE_STEP_OUPUT_SUMMARY_TEMPLATES[idx].format(lang=lang) + code\n",
" data[\"INSTRUCTION\"].append(template)\n",
" if idx <= 5:\n",
" data[\"RESPONSE\"].append(summary)\n",
" else:\n",
" data[\"RESPONSE\"].append('\"\"\"' + summary + '\"\"\"')\n",
"\n",
"df = pd.DataFrame(data=data)\n",
"df.to_parquet(\"dataset.parquet\", row_group_size=100, engine=\"pyarrow\", index=False)"
]
},
{
"cell_type": "code",
"source": [
"from huggingface_hub import notebook_login\n",
"\n",
"notebook_login()"
],
"metadata": {
"id": "_6jaUZRsy1-R"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"from datasets import Dataset\n",
"\n",
"ds = Dataset.from_parquet(\"dataset.parquet\")\n",
"ds.push_to_hub(\"Nan-Do/open-assistant-codesearchnet-python\")"
],
"metadata": {
"id": "DSHrvbF6tIyd"
},
"execution_count": null,
"outputs": []
}
]
}
26 changes: 26 additions & 0 deletions data/datasets/instructional_codesearchnet_python/README.md
@@ -0,0 +1,26 @@
This dataset is a template-generated instructional Python dataset built from an
annotated version of the code-search-net dataset. The annotated version of the
code-search-net dataset can be found
[here](https://huggingface.co/datasets/Nan-Do/codesearchnet-python).

The dataset contains around 450,000 annotated Python functions. It is split
into two blocks: in the first, the instruction is built from the annotated
summary and the expected response is the code; in the second, the instruction
presents the function and the expected response is a description of the
function or a docstring. For the second block, the docstring has been removed
from the function in 90% of the samples. The summaries were generated with this
[model](https://huggingface.co/Salesforce/codet5-base-codexglue-sum-python).
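
A quick way to inspect the generated records is to load them straight from the
Hub. The snippet below is a minimal sketch: it assumes the
`INSTRUCTION`/`RESPONSE`/`SOURCE` columns produced by the generation notebook
and the `Nan-Do/instructional_codesearchnet_python` repository id registered in
`data/datasets/__init__.py`.

```python
from datasets import load_dataset

# Hedged sketch: column names and the repository id are taken from this
# commit's notebook and __init__.py entry; adjust if the published dataset differs.
ds = load_dataset("Nan-Do/instructional_codesearchnet_python", split="train")

sample = ds[0]
print(sample["INSTRUCTION"][:120])  # template + summary (or template + code)
print(sample["RESPONSE"][:120])     # code (first block) or summary/docstring (second block)

# Rough split between the two blocks: in the first block the response is the
# function source rather than a natural-language summary.
responses = ds.select(range(1000))["RESPONSE"]
code_like = sum(1 for r in responses if r.lstrip().startswith(("def ", "async def ", "@")))
print(f"~{code_like / 10:.0f}% of the first 1000 responses look like code")
```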

**Note**: some summarisation tasks are very easy because the prompt still
contains the function's docstring, which is then used as the ground-truth
response. It may be useful to filter these out in the future, for example with
a check like the one sketched below.
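
A possible filter for those easy cases, as a rough sketch (it simply drops
samples whose ground-truth response already appears verbatim in the prompt; the
repository id is assumed as above):

```python
from datasets import load_dataset

ds = load_dataset("Nan-Do/instructional_codesearchnet_python", split="train")


def is_trivial(sample):
    # A summarisation prompt is "too easy" when the target summary is already
    # present, as a docstring, inside the code shown in the instruction.
    response = sample["RESPONSE"].strip('"').strip()
    return bool(response) and response in sample["INSTRUCTION"]


filtered = ds.filter(lambda s: not is_trivial(s))
print(len(ds), "->", len(filtered))
```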

### Summarize_codesearchnet_for_python.ipynb

This notebook is used to generate the annotated Python version of the
code-search-net dataset.
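
As a rough, hedged sketch (the notebook itself may differ in batching and
post-processing), the summaries could be produced with the CodeT5 checkpoint
mentioned above along these lines:

```python
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# Hedged sketch of the summarisation step using the checkpoint named in this
# README; the real notebook may use different generation settings.
checkpoint = "Salesforce/codet5-base-codexglue-sum-python"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

code = "def add(a, b):\n    return a + b"
inputs = tokenizer(code, return_tensors="pt", truncation=True, max_length=512)
summary_ids = model.generate(**inputs, max_length=48, num_beams=4)
print(tokenizer.decode(summary_ids[0], skip_special_tokens=True))
```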

### GenerateOpenAssistantInstructionResponseFormat.ipynb

This notebook is used to generate the Open-Assistant instructional dataset.
295 changes: 295 additions & 0 deletions data/datasets/instructional_codesearchnet_python/Summarize_codesearchnet_for_python.ipynb (large diff not rendered)
