From c77913e7b341cb65716641e1abc4aab70115e7da Mon Sep 17 00:00:00 2001 From: Giovanni1085 Date: Thu, 27 Jun 2019 19:31:02 +0100 Subject: [PATCH] minor updates and checks from merge of K's code --- .gitignore | 1 + README.md | 7 +- dataset/README.md | 15 +-- dataset/das_classifier/README.md | 3 + notebooks/DescriptiveFigures.ipynb | 174 +---------------------------- requirements.txt | 7 ++ 6 files changed, 22 insertions(+), 185 deletions(-) diff --git a/.gitignore b/.gitignore index 148c606..df364bc 100644 --- a/.gitignore +++ b/.gitignore @@ -108,3 +108,4 @@ venv.bak/ # mypy .mypy_cache/ +.Rproj.user diff --git a/README.md b/README.md index 34c20bb..058a276 100644 --- a/README.md +++ b/README.md @@ -1,8 +1,11 @@ # das-public -Data availability statements (public repo). +Data availability statements (public repo). Pre-print: ADD. -See the [dataset folder](dataset) to create a dataset for analysis from the [PubMed Central OA collection](https://www.ncbi.nlm.nih.gov/pmc/tools/openftlist). See instead the [analysis folder](analysis) to replicate analytical results from the paper. The [dataset analysed in the paper](analysis/dataset/export_full.csv.zip) is provided, so that the two replication steps can be done independently. +* See the [dataset folder](dataset) to create a dataset for analysis from the [PubMed Central OA collection](https://www.ncbi.nlm.nih.gov/pmc/tools/openftlist). +* See the [notebooks](notebooks) and [scripts](scripts) folders to replicate Figure 2 and have a descriptive overview of the dataset. +* See the [analysis folder](analysis) to replicate analytical results from the paper. The [dataset analysed in the paper](analysis/dataset/export_full.csv.zip) is provided, so that the two replication steps can be done independently. +* The [figures](figures) and [resources](resources) folders contain supporting files. 
## Report issues diff --git a/dataset/README.md b/dataset/README.md index 14530f1..1c36416 100644 --- a/dataset/README.md +++ b/dataset/README.md @@ -25,17 +25,4 @@ Folder containing the necessary code to create a dataset for analysis from the P ## Requirements -We used the following Python libraries (and versions): - -* Python 3.6+ -* Gensim 3.4.0 -* Matplotlib 3.0.2 -* Numpy 1.15.4 -* NLTK 3.4.1 -* Pandas 0.23.4 -* Pymongo 3.7.2 -* Scikit-learn 0.20.3 -* Scipy 1.1.0 -* Seaborn 0.9.0 -* BeautifulSoup 4.7.1 -* TextBlob 0.15.3 \ No newline at end of file +See [requirements](../requirements.txt). \ No newline at end of file diff --git a/dataset/das_classifier/README.md b/dataset/das_classifier/README.md index 91b5d2d..607f846 100644 --- a/dataset/das_classifier/README.md +++ b/dataset/das_classifier/README.md @@ -18,3 +18,6 @@ This folder contains code and data to classify DAS. Proceed as follows: - [glove embeddings (50d)](input/glove.6B.50d.txt.zip) glove embeddings used for one of the classifier (50d, compressed, please unzip before use). * OUTPUT: this folder will contain classification of DAS contained in [das_full.csv](input/das_full.csv) for all models, as well as the [overview file](output/overview_models_parameters.csv) to compare them (we provide an example of the latter). +## Requirements + +See [requirements](../../requirements.txt). 
\ No newline at end of file diff --git a/notebooks/DescriptiveFigures.ipynb b/notebooks/DescriptiveFigures.ipynb index 5c5a1b9..7caaf21 100644 --- a/notebooks/DescriptiveFigures.ipynb +++ b/notebooks/DescriptiveFigures.ipynb @@ -18,7 +18,7 @@ "outputs": [], "source": [ "### We start by importing a lot of important python packages and loading the csv into a pandas dataframe\n", - "from __future__ import print_function #We need this for something...\n", + "from __future__ import print_function\n", "\n", "import datetime as dt\n", "from IPython.display import display\n", @@ -104,22 +104,6 @@ "sns.set_palette(palette)\n", "sns.palplot(sns.color_palette())\n", "\n", - "# We need to make a more nuanced one though that works for the \n", - "# 5 das classes though :)\n", - "# Lets keep the first 3 colours the same, and then add in 5 more\n", - "# for the different classes.\n", - "# The 3rd colour will correspond to das class 0, and then the\n", - "# 4th - 8th will be classes 1 - 5.\n", - "# The main colour palette is linked at\n", - "# https://coolors.co/6a5acd-6495ed-d4012b-f46036-5f0f40\n", - "# The light colour palette was created by taking the alternate \n", - "# shade that was 5 steps above the main colour and is linked at\n", - "# https://coolors.co/ada5e3-aac5f5-e7748b-f9a891-a77c96\n", - "# The dark colour palette was created by taking the alternate \n", - "# shade that was 5 steps below the main colour and is linked at\n", - "# https://coolors.co/3a3270-375282-740118-86351e-340923\n", - "\n", - "\n", "# CLASS 1: https://coolors.co/574aa8-527ac2-ae0124-c84f2d-4e0d35\n", "# CLASS 2: https://coolors.co/6a5acd-6495ed-d4012b-f46036-5f0f40\n", "# CLASS 3 (2): https://coolors.co/8578d6-80a8f0-db2f51-f67c5a-7c3a62\n", @@ -182,8 +166,7 @@ "* `h_index_max`: maximum h-index of the authors at publication time\n", "* `n_index_mean`: mean h-index of the authors at publication time\n", "* `n_index_median`: median h-index of the authors at publication time\n", - "* `das_class`: 
das classified as ????\n", - "* `das_class_simple`: das classified as ????\n", + "* `das_class`: das class\n", "* `j_lower`: name of the journal all in lower case" ] }, @@ -4735,160 +4718,13 @@ " print (\" -------\")\n", " print (\"\\n============\")" ] - }, - { - "cell_type": "code", - "execution_count": 21, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "array([['Single-trial dynamics explain magnitude sensitive decision making',\n", - " 'Data are available here 10.17605/OSF.IO/29ZGP.'],\n", - " ['Expertise-related functional brain network efficiency in healthy older adults',\n", - " 'The datasets supporting the conclusions of this article are available in the Open Science Framework repository: https://osf.io/zk6vs/?view_only=aab980f14f5445f5897f106f9d19133a.'],\n", - " ['Measuring eye states in functional MRI',\n", - " 'All of our methods are outlined in the freely available MATLAB toolbox (http://www.neuro.uniklinikum-jena.de/Forschung/AG+Neuroimaging.html or https://sourceforge.net/projects/eye-state-fmri/files) under the GNU public license for non-commercial use and open-source development.']],\n", - " dtype=object)" - ] - }, - "execution_count": 21, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df.loc[(df['das_class']==das_class) & (article_selection_mask), ['title', 'das']].sample(n=3).values" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Make the big combo figure for the paper\n", - "\n", - "Make a plot that has the following dimensions:\n", - "\n", - "* Big grid: 24cm wide, 17cm high\n", - "* Top row: two columns, " - ] - }, - { - "cell_type": "code", - "execution_count": 22, - "metadata": {}, - "outputs": [ - { - "ename": "KeyError", - "evalue": "'das_required_prop-2'", - "output_type": "error", - "traceback": [ - "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[1;31mKeyError\u001b[0m Traceback (most recent 
call last)", - "\u001b[1;32m\u001b[0m in \u001b[0;36m\u001b[1;34m()\u001b[0m\n\u001b[1;32m----> 1\u001b[1;33m \u001b[0mdf_journal_das\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mloc\u001b[0m\u001b[1;33m[\u001b[0m\u001b[0mdf_journal_das\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;34m'N-tot'\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m>\u001b[0m\u001b[1;36m2000\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m:\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0msort_values\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mby\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;34m'das_required_prop-2'\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mascending\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;32mFalse\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mhead\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;36m10\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m", - "\u001b[1;32mC:\\ProgramData\\Anaconda3\\envs\\kwpython3\\lib\\site-packages\\pandas\\core\\frame.py\u001b[0m in \u001b[0;36msort_values\u001b[1;34m(self, by, axis, ascending, inplace, kind, na_position)\u001b[0m\n\u001b[0;32m 4717\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 4718\u001b[0m \u001b[0mby\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mby\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;36m0\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 4719\u001b[1;33m \u001b[0mk\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_get_label_or_level_values\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mby\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0maxis\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0maxis\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 4720\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 4721\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0misinstance\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mascending\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m(\u001b[0m\u001b[0mtuple\u001b[0m\u001b[1;33m,\u001b[0m 
\u001b[0mlist\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", - "\u001b[1;32mC:\\ProgramData\\Anaconda3\\envs\\kwpython3\\lib\\site-packages\\pandas\\core\\generic.py\u001b[0m in \u001b[0;36m_get_label_or_level_values\u001b[1;34m(self, key, axis)\u001b[0m\n\u001b[0;32m 1704\u001b[0m \u001b[0mvalues\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0maxes\u001b[0m\u001b[1;33m[\u001b[0m\u001b[0maxis\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mget_level_values\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mkey\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_values\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1705\u001b[0m \u001b[1;32melse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 1706\u001b[1;33m \u001b[1;32mraise\u001b[0m \u001b[0mKeyError\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mkey\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 1707\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1708\u001b[0m \u001b[1;31m# Check for duplicates\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", - "\u001b[1;31mKeyError\u001b[0m: 'das_required_prop-2'" - ] - } - ], - "source": [ - "df_journal_das.loc[df_journal_das['N-tot']>2000, :].sort_values(by='das_required_prop-2', ascending=False).head(10)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "df_journal_das_corr = df_journal_das.corr()\n", - "\n", - "sns.heatmap(df_journal_das_corr)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "widgets.Select(\n", - " options=['Linux', 'Windows', 'OSX'],\n", - " value='OSX',\n", - " # rows=10,\n", - " description='OS:',\n", - " disabled=False\n", - ")\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - 
"source": [ - "widgets.IntSlider()\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "np.arange?" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "year_list = np.arange(2000,2019+1)\n", - "print (year_list)\n", - "\n", - "print (year_list[:-1])\n", - "\n", - "print (year_list[:-1:3])\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": { "kernelspec": { - "display_name": "Python [conda env:kwpython3]", + "display_name": "Python 3", "language": "python", - "name": "conda-env-kwpython3-py" + "name": "python3" }, "language_info": { "codemirror_mode": { @@ -4900,7 +4736,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.4" + "version": "3.7.3" } }, "nbformat": 4, diff --git a/requirements.txt b/requirements.txt index 15cd7d5..97ce013 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,3 +4,10 @@ matplotlib==3.* numpy pandas seaborn==0.9* +gensim==3.4* +nltk==3.4* +pymongo==3.7* +scikit-learn +scipy +beautifulsoup4 +textblob