Added the required files to generate an instructional Python dataset and updated `__init__.py`. This solves #297 (#3106)

---------

Co-authored-by: Andreas Köpf <[email protected]>
Co-authored-by: Oliver Stanley <[email protected]>
3 people authored May 13, 2023
1 parent a449ff7 commit 24856cd
Showing 4 changed files with 473 additions and 0 deletions.
1 change: 1 addition & 0 deletions data/datasets/__init__.py
@@ -27,6 +27,7 @@
"oa_stackexchange": "donfu/oa-stackexchange",
"stable_diffusion_instructional_dataset": "MadVoyager/stable_diffusion_instructional_dataset",
"ru_riddles_337": "0x22almostEvil/ru-riddles-377",
"instructional_codesearchnet_python": "Nan-Do/instructional_codesearchnet_python",
"tatoeba_mt_qna_oa": "0x22almostEvil/tatoeba-mt-qna-oa",
}

151 changes: 151 additions & 0 deletions data/datasets/instructional_codesearchnet_python/GenerateOpenAssistantInstructionResponseFormat.ipynb
@@ -0,0 +1,151 @@
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"provenance": []
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"language_info": {
"name": "python"
}
},
"cells": [
{
"cell_type": "code",
"source": [
"!pip install datasets tqdm"
],
"metadata": {
"id": "zLxBMw9Lsr6I"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "qyuuLNEzsaYR"
},
"outputs": [],
"source": [
"import gzip\n",
"import json\n",
"import pandas as pd\n",
"\n",
"from collections import defaultdict\n",
"from datasets import load_dataset\n",
"from tqdm.auto import tqdm\n",
"from random import random, randint\n",
"\n",
"ONE_STEP_OUPUT_CODE_TEMPLATES = [\n",
" \"Can you write a program in {lang} where it\\n\",\n",
" \"How would you implement a function in {lang} that\\n\",\n",
" \"Write a {lang} function for\\n\",\n",
" \"Can you create a {lang} program that\\n\",\n",
" \"Implement a function in {lang} to\\n\",\n",
" \"Write a {lang} script for\\n\",\n",
" \"How would you code a program in {lang} to\\n\",\n",
" \"Create a {lang} function for\\n\",\n",
" \"Write a {lang} program that can\\n\",\n",
" \"Can you implement a function in {lang} that\\n\",\n",
"]\n",
"\n",
"ONE_STEP_OUPUT_SUMMARY_TEMPLATES = [\n",
" \"Explain what the following {lang} code does\\n\",\n",
" \"Can you tell what is the following {lang} function doing\\n\",\n",
" \"Here you have a function in {lang}, explain what it does\\n\",\n",
" \"Make a summary of the following {lang} code\\n\",\n",
" \"Can you generate a brief explanation for the following {lang} code\\n\",\n",
" \"How would you explain what the following {lang} function does\\n\",\n",
" \"Can you generate the documentation for the following {lang} function\\n\",\n",
" \"Create a docstring for the following {lang} code\\n\",\n",
" \"Given the following {lang} function, write the documentation\\n\",\n",
" \"Write a docstring for the following {lang} function\\n\",\n",
"]\n",
"\n",
"\n",
"def remove_docstring(code_function):\n",
" triple_quotes = '\"\"\"'\n",
" lines = code_function.split(\"\\n\")\n",
"\n",
" c = lines[1].count(triple_quotes)\n",
" # There is no docstring\n",
" if c == 0:\n",
" return code_function\n",
" # One line dostring\n",
" if c == 2:\n",
" return \"\\n\".join([lines[0]] + lines[2:])\n",
"\n",
" idx = 2\n",
" while idx < len(lines) and triple_quotes not in lines[idx]:\n",
" idx += 1\n",
"\n",
" return \"\\n\".join([lines[0]] + lines[idx + 1 :])\n",
"\n",
"\n",
"lang = \"Python 3\"\n",
"data = defaultdict(list)\n",
"dataset = load_dataset(\"Nan-Do/codesearchnet-python\")\n",
"\n",
"for data_point in tqdm(dataset[\"train\"]):\n",
" code = data_point[\"original_string\"]\n",
" summary = data_point[\"summary\"]\n",
" data[\"SOURCE\"].append(\"codesearchnet\")\n",
" # Generate code\n",
" if random() > 0.5:\n",
" idx = randint(0, len(ONE_STEP_OUPUT_CODE_TEMPLATES) - 1)\n",
" template = ONE_STEP_OUPUT_CODE_TEMPLATES[idx].format(lang=lang) + summary\n",
" data[\"INSTRUCTION\"].append(template)\n",
" data[\"RESPONSE\"].append(code)\n",
" # Generate summary\n",
" else:\n",
" # We are generating the docstring or a summary so we better remove it from\n",
" # the function\n",
" if random() < 0.9:\n",
" code = remove_docstring(code)\n",
" idx = randint(0, len(ONE_STEP_OUPUT_SUMMARY_TEMPLATES) - 1)\n",
" template = ONE_STEP_OUPUT_SUMMARY_TEMPLATES[idx].format(lang=lang) + code\n",
" data[\"INSTRUCTION\"].append(template)\n",
" if idx <= 5:\n",
" data[\"RESPONSE\"].append(summary)\n",
" else:\n",
" data[\"RESPONSE\"].append('\"\"\"' + summary + '\"\"\"')\n",
"\n",
"df = pd.DataFrame(data=data)\n",
"df.to_parquet(\"dataset.parquet\", row_group_size=100, engine=\"pyarrow\", index=False)"
]
},
{
"cell_type": "code",
"source": [
"from huggingface_hub import notebook_login\n",
"\n",
"notebook_login()"
],
"metadata": {
"id": "_6jaUZRsy1-R"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"from datasets import Dataset\n",
"\n",
"ds = Dataset.from_parquet(\"dataset.parquet\")\n",
"ds.push_to_hub(\"Nan-Do/open-assistant-codesearchnet-python\")"
],
"metadata": {
"id": "DSHrvbF6tIyd"
},
"execution_count": null,
"outputs": []
}
]
}
26 changes: 26 additions & 0 deletions data/datasets/instructional_codesearchnet_python/README.md
@@ -0,0 +1,26 @@
This dataset is a template-generated instructional Python dataset built from an
annotated version of the code-search-net dataset. The annotated version of the
code-search-net dataset can be found
[here](https://huggingface.co/datasets/Nan-Do/codesearchnet-python).

The dataset contains around 450,000 annotated Python functions. It is split
into two blocks: in the first, the instruction is built from the annotated
summary and the expected response is the code; in the second, the instruction
presents the function and the expected response is a description of the
function or a docstring. For the second block, the docstring has been removed
from the function in 90% of the samples. The summaries were generated with this
[model](https://huggingface.co/Salesforce/codet5-base-codexglue-sum-python).
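
A quick way to inspect the generated records is to load them straight from the
Hub. The snippet below is a minimal sketch: it assumes the
`INSTRUCTION`/`RESPONSE`/`SOURCE` columns produced by the generation notebook
and the `Nan-Do/instructional_codesearchnet_python` repository id registered in
`data/datasets/__init__.py`.

```python
from datasets import load_dataset

# Hedged sketch: column names and the repository id are taken from this
# commit's notebook and __init__.py entry; adjust if the published dataset differs.
ds = load_dataset("Nan-Do/instructional_codesearchnet_python", split="train")

sample = ds[0]
print(sample["INSTRUCTION"][:120])  # template + summary (or template + code)
print(sample["RESPONSE"][:120])     # code (first block) or summary/docstring (second block)

# Rough split between the two blocks: in the first block the response is the
# function source rather than a natural-language summary.
responses = ds.select(range(1000))["RESPONSE"]
code_like = sum(1 for r in responses if r.lstrip().startswith(("def ", "async def ", "@")))
print(f"~{code_like / 10:.0f}% of the first 1000 responses look like code")
```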

**Note**: some summarisation tasks are very easy because the prompt still
contains the function's docstring, which is then used as the ground-truth
response. It may be useful to filter these out in the future, for example with
a check like the one sketched below.
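
A possible filter for those easy cases, as a rough sketch (it simply drops
samples whose ground-truth response already appears verbatim in the prompt; the
repository id is assumed as above):

```python
from datasets import load_dataset

ds = load_dataset("Nan-Do/instructional_codesearchnet_python", split="train")


def is_trivial(sample):
    # A summarisation prompt is "too easy" when the target summary is already
    # present, as a docstring, inside the code shown in the instruction.
    response = sample["RESPONSE"].strip('"').strip()
    return bool(response) and response in sample["INSTRUCTION"]


filtered = ds.filter(lambda s: not is_trivial(s))
print(len(ds), "->", len(filtered))
```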

### Summarize_codesearchnet_for_python.ipynb

This notebook is used to generate the annotated Python version of the
code-search-net dataset.
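
As a rough, hedged sketch (the notebook itself may differ in batching and
post-processing), the summaries could be produced with the CodeT5 checkpoint
mentioned above along these lines:

```python
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# Hedged sketch of the summarisation step using the checkpoint named in this
# README; the real notebook may use different generation settings.
checkpoint = "Salesforce/codet5-base-codexglue-sum-python"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

code = "def add(a, b):\n    return a + b"
inputs = tokenizer(code, return_tensors="pt", truncation=True, max_length=512)
summary_ids = model.generate(**inputs, max_length=48, num_beams=4)
print(tokenizer.decode(summary_ids[0], skip_special_tokens=True))
```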

### GenerateOpenAssistantInstructionResponseFormat.ipynb

This notebook is used to generate the Open-Assistant instructional dataset.
295 changes: 295 additions & 0 deletions data/datasets/instructional_codesearchnet_python/Summarize_codesearchnet_for_python.ipynb (large diff not rendered)
