From c77913e7b341cb65716641e1abc4aab70115e7da Mon Sep 17 00:00:00 2001
From: Giovanni1085 <gcolavizza@turing.ac.uk>
Date: Thu, 27 Jun 2019 19:31:02 +0100
Subject: [PATCH] minor updates and checks from merge of K's code

---
 .gitignore                         |   1 +
 README.md                          |   7 +-
 dataset/README.md                  |  15 +--
 dataset/das_classifier/README.md   |   3 +
 notebooks/DescriptiveFigures.ipynb | 174 +----------------------------
 requirements.txt                   |   7 ++
 6 files changed, 22 insertions(+), 185 deletions(-)

diff --git a/.gitignore b/.gitignore
index 148c606..df364bc 100644
--- a/.gitignore
+++ b/.gitignore
@@ -108,3 +108,4 @@ venv.bak/
 
 # mypy
 .mypy_cache/
+.Rproj.user
diff --git a/README.md b/README.md
index 34c20bb..058a276 100644
--- a/README.md
+++ b/README.md
@@ -1,8 +1,11 @@
 # das-public
 
-Data availability statements (public repo).
+Data availability statements (public repo). Pre-print: link to be added.
 
-See the [dataset folder](dataset) to create a dataset for analysis from the [PubMed Central OA collection](https://www.ncbi.nlm.nih.gov/pmc/tools/openftlist). See instead the [analysis folder](analysis) to replicate analytical results from the paper. The [dataset analysed in the paper](analysis/dataset/export_full.csv.zip) is provided, so that the two replication steps can be done independently. 
+* See the [dataset folder](dataset) to create a dataset for analysis from the [PubMed Central OA collection](https://www.ncbi.nlm.nih.gov/pmc/tools/openftlist).
+* See the [notebooks](notebooks) and [scripts](scripts) folders to replicate Figure 2 and get a descriptive overview of the dataset.
+* See the [analysis folder](analysis) to replicate analytical results from the paper. The [dataset analysed in the paper](analysis/dataset/export_full.csv.zip) is provided, so that the two replication steps can be done independently.
+* The [figures](figures) and [resources](resources) folders contain supporting files.
 
 ## Report issues
 
diff --git a/dataset/README.md b/dataset/README.md
index 14530f1..1c36416 100644
--- a/dataset/README.md
+++ b/dataset/README.md
@@ -25,17 +25,4 @@ Folder containing the necessary code to create a dataset for analysis from the P
 
 ## Requirements
 
-We used the following Python libraries (and versions):
-
-* Python 3.6+
-* Gensim 3.4.0
-* Matplotlib 3.0.2
-* Numpy 1.15.4
-* NLTK 3.4.1
-* Pandas 0.23.4
-* Pymongo 3.7.2
-* Scikit-learn 0.20.3
-* Scipy 1.1.0
-* Seaborn 0.9.0
-* BeautifulSoup 4.7.1
-* TextBlob 0.15.3
\ No newline at end of file
+See [requirements](../requirements.txt).
\ No newline at end of file
diff --git a/dataset/das_classifier/README.md b/dataset/das_classifier/README.md
index 91b5d2d..607f846 100644
--- a/dataset/das_classifier/README.md
+++ b/dataset/das_classifier/README.md
@@ -18,3 +18,6 @@ This folder contains code and data to classify DAS. Proceed as follows:
     - [glove embeddings (50d)](input/glove.6B.50d.txt.zip) glove embeddings used for one of the classifier (50d, compressed, please unzip before use).
 * OUTPUT: this folder will contain classification of DAS contained in [das_full.csv](input/das_full.csv) for all models, as well as the [overview file](output/overview_models_parameters.csv) to compare them (we provide an example of the latter).
 
+## Requirements
+
+See [requirements](../../requirements.txt).
\ No newline at end of file
diff --git a/notebooks/DescriptiveFigures.ipynb b/notebooks/DescriptiveFigures.ipynb
index 5c5a1b9..7caaf21 100644
--- a/notebooks/DescriptiveFigures.ipynb
+++ b/notebooks/DescriptiveFigures.ipynb
@@ -18,7 +18,7 @@
    "outputs": [],
    "source": [
     "### We start by importing a lot of important python packages and loading the csv into a pandas dataframe\n",
-    "from __future__ import print_function #We need this for something...\n",
+    "from __future__ import print_function\n",
     "\n",
     "import datetime as dt\n",
     "from IPython.display import display\n",
@@ -104,22 +104,6 @@
     "sns.set_palette(palette)\n",
     "sns.palplot(sns.color_palette())\n",
     "\n",
-    "# We need to make a more nuanced one though that works for the \n",
-    "# 5 das classes though :)\n",
-    "# Lets keep the first 3 colours the same, and then add in 5 more\n",
-    "# for the different classes.\n",
-    "# The 3rd colour will correspond to das class 0, and then the\n",
-    "# 4th - 8th will be classes 1 - 5.\n",
-    "# The main colour palette is linked at\n",
-    "# https://coolors.co/6a5acd-6495ed-d4012b-f46036-5f0f40\n",
-    "# The light colour palette was created by taking the alternate \n",
-    "# shade that was 5 steps above the main colour and is linked at\n",
-    "# https://coolors.co/ada5e3-aac5f5-e7748b-f9a891-a77c96\n",
-    "# The dark colour palette was created by taking the alternate \n",
-    "# shade that was 5 steps below the main colour and is linked at\n",
-    "# https://coolors.co/3a3270-375282-740118-86351e-340923\n",
-    "\n",
-    "\n",
     "# CLASS 1: https://coolors.co/574aa8-527ac2-ae0124-c84f2d-4e0d35\n",
     "# CLASS 2: https://coolors.co/6a5acd-6495ed-d4012b-f46036-5f0f40\n",
     "# CLASS 3 (2): https://coolors.co/8578d6-80a8f0-db2f51-f67c5a-7c3a62\n",
@@ -182,8 +166,7 @@
     "* `h_index_max`: maximum h-index of the authors at publication time\n",
     "* `n_index_mean`: mean h-index of the authors at publication time\n",
     "* `n_index_median`: median h-index of the authors at publication time\n",
-    "* `das_class`: das classified as ????\n",
-    "* `das_class_simple`: das classified as ????\n",
+    "* `das_class`: class assigned to the article's data availability statement (DAS)\n",
     "* `j_lower`: name of the journal all in lower case"
    ]
   },
@@ -4735,160 +4718,13 @@
     "            print (\"  -------\")\n",
     "        print (\"\\n============\")"
    ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 21,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "array([['Single-trial dynamics explain magnitude sensitive decision making',\n",
-       "        'Data are available here 10.17605/OSF.IO/29ZGP.'],\n",
-       "       ['Expertise-related functional brain network efficiency in healthy older adults',\n",
-       "        'The datasets supporting the conclusions of this article are available in the Open Science Framework repository: https://osf.io/zk6vs/?view_only=aab980f14f5445f5897f106f9d19133a.'],\n",
-       "       ['Measuring eye states in functional MRI',\n",
-       "        'All of our methods are outlined in the freely available MATLAB toolbox (http://www.neuro.uniklinikum-jena.de/Forschung/AG+Neuroimaging.html or https://sourceforge.net/projects/eye-state-fmri/files) under the GNU public license for non-commercial use and open-source development.']],\n",
-       "      dtype=object)"
-      ]
-     },
-     "execution_count": 21,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "df.loc[(df['das_class']==das_class) & (article_selection_mask), ['title', 'das']].sample(n=3).values"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## Make the big combo figure for the paper\n",
-    "\n",
-    "Make a plot that has the following dimensions:\n",
-    "\n",
-    "* Big grid: 24cm wide, 17cm high\n",
-    "* Top row: two columns, "
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 22,
-   "metadata": {},
-   "outputs": [
-    {
-     "ename": "KeyError",
-     "evalue": "'das_required_prop-2'",
-     "output_type": "error",
-     "traceback": [
-      "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
-      "\u001b[1;31mKeyError\u001b[0m                                  Traceback (most recent call last)",
-      "\u001b[1;32m<ipython-input-22-aad005d57751>\u001b[0m in \u001b[0;36m<module>\u001b[1;34m()\u001b[0m\n\u001b[1;32m----> 1\u001b[1;33m \u001b[0mdf_journal_das\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mloc\u001b[0m\u001b[1;33m[\u001b[0m\u001b[0mdf_journal_das\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;34m'N-tot'\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m>\u001b[0m\u001b[1;36m2000\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m:\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0msort_values\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mby\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;34m'das_required_prop-2'\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mascending\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;32mFalse\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mhead\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;36m10\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m",
-      "\u001b[1;32mC:\\ProgramData\\Anaconda3\\envs\\kwpython3\\lib\\site-packages\\pandas\\core\\frame.py\u001b[0m in \u001b[0;36msort_values\u001b[1;34m(self, by, axis, ascending, inplace, kind, na_position)\u001b[0m\n\u001b[0;32m   4717\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m   4718\u001b[0m             \u001b[0mby\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mby\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;36m0\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 4719\u001b[1;33m             \u001b[0mk\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_get_label_or_level_values\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mby\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0maxis\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0maxis\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m   4720\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m   4721\u001b[0m             \u001b[1;32mif\u001b[0m \u001b[0misinstance\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mascending\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m(\u001b[0m\u001b[0mtuple\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mlist\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
-      "\u001b[1;32mC:\\ProgramData\\Anaconda3\\envs\\kwpython3\\lib\\site-packages\\pandas\\core\\generic.py\u001b[0m in \u001b[0;36m_get_label_or_level_values\u001b[1;34m(self, key, axis)\u001b[0m\n\u001b[0;32m   1704\u001b[0m             \u001b[0mvalues\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0maxes\u001b[0m\u001b[1;33m[\u001b[0m\u001b[0maxis\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mget_level_values\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mkey\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_values\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m   1705\u001b[0m         \u001b[1;32melse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 1706\u001b[1;33m             \u001b[1;32mraise\u001b[0m \u001b[0mKeyError\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mkey\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m   1707\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m   1708\u001b[0m         \u001b[1;31m# Check for duplicates\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
-      "\u001b[1;31mKeyError\u001b[0m: 'das_required_prop-2'"
-     ]
-    }
-   ],
-   "source": [
-    "df_journal_das.loc[df_journal_das['N-tot']>2000, :].sort_values(by='das_required_prop-2', ascending=False).head(10)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "df_journal_das_corr = df_journal_das.corr()\n",
-    "\n",
-    "sns.heatmap(df_journal_das_corr)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "widgets.Select(\n",
-    "    options=['Linux', 'Windows', 'OSX'],\n",
-    "    value='OSX',\n",
-    "    # rows=10,\n",
-    "    description='OS:',\n",
-    "    disabled=False\n",
-    ")\n"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "widgets.IntSlider()\n"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "np.arange?"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "year_list = np.arange(2000,2019+1)\n",
-    "print (year_list)\n",
-    "\n",
-    "print (year_list[:-1])\n",
-    "\n",
-    "print (year_list[:-1:3])\n"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": []
   }
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "Python [conda env:kwpython3]",
+   "display_name": "Python 3",
    "language": "python",
-   "name": "conda-env-kwpython3-py"
+   "name": "python3"
   },
   "language_info": {
    "codemirror_mode": {
@@ -4900,7 +4736,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.6.4"
+   "version": "3.7.3"
   }
  },
  "nbformat": 4,
diff --git a/requirements.txt b/requirements.txt
index 15cd7d5..97ce013 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -4,3 +4,10 @@ matplotlib==3.*
 numpy
 pandas
 seaborn==0.9*
+gensim==3.4.*
+nltk==3.4.*
+pymongo==3.7.*
+scikit-learn
+scipy
+beautifulsoup4
+textblob