-
Notifications
You must be signed in to change notification settings - Fork 9
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge branch 'main' of https://github.com/petermr/pygetpapers
- Loading branch information
Showing
1 changed file
with
344 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,344 @@ | ||
{ | ||
"nbformat": 4, | ||
"nbformat_minor": 0, | ||
"metadata": { | ||
"colab": { | ||
"name": "pygetpapers_module_demo.ipynb", | ||
"provenance": [], | ||
"authorship_tag": "ABX9TyPlY3RdaYSv5gb4fWxQWAG6", | ||
"include_colab_link": true | ||
}, | ||
"kernelspec": { | ||
"name": "python3", | ||
"display_name": "Python 3" | ||
}, | ||
"language_info": { | ||
"name": "python" | ||
} | ||
}, | ||
"cells": [ | ||
{ | ||
"cell_type": "markdown", | ||
"metadata": { | ||
"id": "view-in-github", | ||
"colab_type": "text" | ||
}, | ||
"source": [ | ||
"<a href=\"https://colab.research.google.com/github/petermr/pygetpapers/blob/main/pygetpapers_module_demo.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 1, | ||
"metadata": { | ||
"colab": { | ||
"base_uri": "https://localhost:8080/" | ||
}, | ||
"id": "wrvi5k_eWe6s", | ||
"outputId": "81af3788-3e88-41fe-a669-92c5ed511870" | ||
}, | ||
"outputs": [ | ||
{ | ||
"output_type": "stream", | ||
"name": "stdout", | ||
"text": [ | ||
"Collecting pygetpapers\n", | ||
" Downloading pygetpapers-1.1.6-py3-none-any.whl (40 kB)\n", | ||
"\u001b[?25l\r\u001b[K |████████▏ | 10 kB 18.1 MB/s eta 0:00:01\r\u001b[K |████████████████▎ | 20 kB 24.3 MB/s eta 0:00:01\r\u001b[K |████████████████████████▌ | 30 kB 29.8 MB/s eta 0:00:01\r\u001b[K |████████████████████████████████| 40 kB 4.4 MB/s \n", | ||
"\u001b[?25hCollecting dict2xml\n", | ||
" Downloading dict2xml-1.7.1.tar.gz (6.6 kB)\n", | ||
" Installing build dependencies ... \u001b[?25l\u001b[?25hdone\n", | ||
" Getting requirements to build wheel ... \u001b[?25l\u001b[?25hdone\n", | ||
" Preparing wheel metadata ... \u001b[?25l\u001b[?25hdone\n", | ||
"Collecting configargparse\n", | ||
" Downloading ConfigArgParse-1.5.3-py3-none-any.whl (20 kB)\n", | ||
"Requirement already satisfied: pandas in /usr/local/lib/python3.7/dist-packages (from pygetpapers) (1.3.5)\n", | ||
"Collecting xmltodict\n", | ||
" Downloading xmltodict-0.12.0-py2.py3-none-any.whl (9.2 kB)\n", | ||
"Requirement already satisfied: lxml in /usr/local/lib/python3.7/dist-packages (from pygetpapers) (4.2.6)\n", | ||
"Requirement already satisfied: tqdm in /usr/local/lib/python3.7/dist-packages (from pygetpapers) (4.63.0)\n", | ||
"Collecting arxiv\n", | ||
" Downloading arxiv-1.4.2-py3-none-any.whl (11 kB)\n", | ||
"Requirement already satisfied: requests in /usr/local/lib/python3.7/dist-packages (from pygetpapers) (2.23.0)\n", | ||
"Collecting coloredlogs\n", | ||
" Downloading coloredlogs-15.0.1-py2.py3-none-any.whl (46 kB)\n", | ||
"\u001b[K |████████████████████████████████| 46 kB 3.3 MB/s \n", | ||
"\u001b[?25hCollecting habanero\n", | ||
" Downloading habanero-1.0.0-py2.py3-none-any.whl (42 kB)\n", | ||
"\u001b[K |████████████████████████████████| 42 kB 1.0 MB/s \n", | ||
"\u001b[?25hCollecting feedparser\n", | ||
" Downloading feedparser-6.0.8-py3-none-any.whl (81 kB)\n", | ||
"\u001b[K |████████████████████████████████| 81 kB 8.8 MB/s \n", | ||
"\u001b[?25hCollecting humanfriendly>=9.1\n", | ||
" Downloading humanfriendly-10.0-py2.py3-none-any.whl (86 kB)\n", | ||
"\u001b[K |████████████████████████████████| 86 kB 5.5 MB/s \n", | ||
"\u001b[?25hCollecting sgmllib3k\n", | ||
" Downloading sgmllib3k-1.0.0.tar.gz (5.8 kB)\n", | ||
"Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /usr/local/lib/python3.7/dist-packages (from requests->pygetpapers) (1.24.3)\n", | ||
"Requirement already satisfied: chardet<4,>=3.0.2 in /usr/local/lib/python3.7/dist-packages (from requests->pygetpapers) (3.0.4)\n", | ||
"Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.7/dist-packages (from requests->pygetpapers) (2021.10.8)\n", | ||
"Requirement already satisfied: idna<3,>=2.5 in /usr/local/lib/python3.7/dist-packages (from requests->pygetpapers) (2.10)\n", | ||
"Requirement already satisfied: numpy>=1.17.3 in /usr/local/lib/python3.7/dist-packages (from pandas->pygetpapers) (1.21.5)\n", | ||
"Requirement already satisfied: python-dateutil>=2.7.3 in /usr/local/lib/python3.7/dist-packages (from pandas->pygetpapers) (2.8.2)\n", | ||
"Requirement already satisfied: pytz>=2017.3 in /usr/local/lib/python3.7/dist-packages (from pandas->pygetpapers) (2018.9)\n", | ||
"Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.7/dist-packages (from python-dateutil>=2.7.3->pandas->pygetpapers) (1.15.0)\n", | ||
"Building wheels for collected packages: dict2xml, sgmllib3k\n", | ||
" Building wheel for dict2xml (PEP 517) ... \u001b[?25l\u001b[?25hdone\n", | ||
" Created wheel for dict2xml: filename=dict2xml-1.7.1-py3-none-any.whl size=6930 sha256=d5bf25f687fc127db8df231a9b9d73733a89c8da50d2d334fe60ddcce0e30ed4\n", | ||
" Stored in directory: /root/.cache/pip/wheels/8e/ad/d7/b1cd889d18b22fa7d175a92006236862743f39cdefda238397\n", | ||
" Building wheel for sgmllib3k (setup.py) ... \u001b[?25l\u001b[?25hdone\n", | ||
" Created wheel for sgmllib3k: filename=sgmllib3k-1.0.0-py3-none-any.whl size=6066 sha256=4108bf1e8fb1815f486d778abc0d5f510d30b1849d40231f97f36c4c6d7068b7\n", | ||
" Stored in directory: /root/.cache/pip/wheels/73/ad/a4/0dff4a6ef231fc0dfa12ffbac2a36cebfdddfe059f50e019aa\n", | ||
"Successfully built dict2xml sgmllib3k\n", | ||
"Installing collected packages: sgmllib3k, humanfriendly, feedparser, xmltodict, habanero, dict2xml, configargparse, coloredlogs, arxiv, pygetpapers\n", | ||
"Successfully installed arxiv-1.4.2 coloredlogs-15.0.1 configargparse-1.5.3 dict2xml-1.7.1 feedparser-6.0.8 habanero-1.0.0 humanfriendly-10.0 pygetpapers-1.1.6 sgmllib3k-1.0.0 xmltodict-0.12.0\n" | ||
] | ||
} | ||
], | ||
"source": [ | ||
"!pip install pygetpapers" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"source": [ | ||
"from pygetpapers import Pygetpapers\n", | ||
"pygetpapers_call = Pygetpapers ()" | ||
], | ||
"metadata": { | ||
"id": "QfIfSwp8Wmzy" | ||
}, | ||
"execution_count": 3, | ||
"outputs": [] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"source": [ | ||
"Download papers from Europe PMC (EPMC)\n" | |
], | ||
"metadata": { | ||
"id": "vI3SxbhzXIli" | ||
} | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"source": [ | ||
"pygetpapers_call.run_command(query='lantana camara', limit = 30, output = 'lantana_camara_30', xml=True)" | ||
], | ||
"metadata": { | ||
"colab": { | ||
"base_uri": "https://localhost:8080/" | ||
}, | ||
"id": "cMTZrd-qWqrg", | ||
"outputId": "eff8c18b-b731-4b57-d9a6-ae30bcbdd037" | ||
}, | ||
"execution_count": 4, | ||
"outputs": [ | ||
{ | ||
"output_type": "stream", | ||
"name": "stderr", | ||
"text": [ | ||
"INFO: Total Hits are 1433\n", | ||
"1it [00:00, 174.18it/s]\n", | ||
"INFO: Saving XML files to /content/lantana_camara_30/*/fulltext.xml\n", | ||
"100%|██████████| 30/30 [00:20<00:00, 1.46it/s]\n" | ||
] | ||
} | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"source": [ | ||
"Update an existing corpus with a new set of papers" | |
], | ||
"metadata": { | ||
"id": "g1J96nK7XpcR" | ||
} | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"source": [ | ||
"pygetpapers_call.run_command(query='lantana', limit = 30, output = 'lantana_camara_30', update = True, xml=True)" | ||
], | ||
"metadata": { | ||
"colab": { | ||
"base_uri": "https://localhost:8080/" | ||
}, | ||
"id": "ezSYDUAjX7At", | ||
"outputId": "0faab190-9ded-4e46-e367-3378c069958d" | ||
}, | ||
"execution_count": 5, | ||
"outputs": [ | ||
{ | ||
"output_type": "stream", | ||
"name": "stderr", | ||
"text": [ | ||
"INFO: Please ensure that you are providing the same --api as the one in the corpus or you may get errors\n", | ||
"INFO: Total Hits are 2109\n", | ||
"1it [00:00, 141.40it/s]\n", | ||
"INFO: Saving XML files to /content/lantana_camara_30/*/fulltext.xml\n", | ||
"100%|██████████| 60/60 [00:35<00:00, 1.67it/s]\n" | ||
] | ||
} | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"source": [ | ||
"Download full text and metadata in additional formats (PDF, CSV, HTML)" | |
], | ||
"metadata": { | ||
"id": "uBRLAHD6YVjk" | ||
} | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"source": [ | ||
"pygetpapers_call.run_command(output = 'lantana_camara_30', restart = True, pdf=True, makecsv=True, makehtml=True)" | ||
], | ||
"metadata": { | ||
"colab": { | ||
"base_uri": "https://localhost:8080/" | ||
}, | ||
"id": "VtYQk-1GYhCB", | ||
"outputId": "ec7ed831-a676-454f-a985-10e6871e2779" | ||
}, | ||
"execution_count": 6, | ||
"outputs": [ | ||
{ | ||
"output_type": "stream", | ||
"name": "stderr", | ||
"text": [ | ||
" 2%|▏ | 1/60 [00:00<00:22, 2.58it/s]INFO: Wrote the pdf file for PMC8593682\n", | ||
" 3%|▎ | 2/60 [00:01<00:31, 1.87it/s]INFO: Wrote the pdf file for PMC8405894\n", | ||
" 5%|▌ | 3/60 [00:03<01:28, 1.55s/it]INFO: Wrote the pdf file for PMC8449179\n", | ||
" 7%|▋ | 4/60 [00:04<01:02, 1.11s/it]INFO: Wrote the pdf file for PMC8478869\n", | ||
" 8%|▊ | 5/60 [00:04<00:52, 1.05it/s]INFO: Wrote the pdf file for PMC8840049\n", | ||
" 10%|█ | 6/60 [00:06<01:10, 1.31s/it]INFO: Wrote the pdf file for PMC8734550\n", | ||
" 12%|█▏ | 7/60 [00:09<01:36, 1.82s/it]INFO: Wrote the pdf file for PMC8462158\n", | ||
" 13%|█▎ | 8/60 [00:12<01:54, 2.20s/it]INFO: Wrote the pdf file for PMC8348691\n", | ||
" 15%|█▌ | 9/60 [00:13<01:27, 1.72s/it]INFO: Wrote the pdf file for PMC8110560\n", | ||
" 17%|█▋ | 10/60 [00:15<01:34, 1.89s/it]INFO: Wrote the pdf file for PMC8623576\n", | ||
" 18%|█▊ | 11/60 [00:16<01:15, 1.55s/it]INFO: Wrote the pdf file for PMC8310452\n", | ||
" 20%|██ | 12/60 [00:17<00:59, 1.23s/it]INFO: Wrote the pdf file for PMC8839486\n", | ||
" 22%|██▏ | 13/60 [00:17<00:46, 1.01it/s]INFO: Wrote the pdf file for PMC8322784\n", | ||
" 23%|██▎ | 14/60 [00:17<00:37, 1.22it/s]INFO: Wrote the pdf file for PMC8396508\n", | ||
" 25%|██▌ | 15/60 [00:18<00:33, 1.36it/s]INFO: Wrote the pdf file for PMC8812253\n", | ||
" 27%|██▋ | 16/60 [00:18<00:28, 1.56it/s]INFO: Wrote the pdf file for PMC7915326\n", | ||
" 28%|██▊ | 17/60 [00:19<00:25, 1.66it/s]INFO: Wrote the pdf file for PMC8183526\n", | ||
" 30%|███ | 18/60 [00:19<00:24, 1.74it/s]INFO: Wrote the pdf file for PMC8879102\n", | ||
" 32%|███▏ | 19/60 [00:20<00:23, 1.74it/s]INFO: Wrote the pdf file for PMC8178325\n", | ||
" 33%|███▎ | 20/60 [00:21<00:24, 1.66it/s]INFO: Wrote the pdf file for PMC8848737\n", | ||
" 35%|███▌ | 21/60 [00:21<00:21, 1.78it/s]INFO: Wrote the pdf file for PMC8579071\n", | ||
" 37%|███▋ | 22/60 [00:23<00:32, 1.19it/s]INFO: Wrote the pdf file for PMC8258845\n", | ||
" 38%|███▊ | 23/60 [00:23<00:27, 1.36it/s]INFO: Wrote the pdf file for PMC8750514\n", | ||
" 40%|████ | 24/60 [00:24<00:24, 1.46it/s]INFO: Wrote the pdf file for PMC8698633\n", | ||
" 42%|████▏ | 25/60 [00:25<00:36, 1.04s/it]INFO: Wrote the pdf file for PMC8623234\n", | ||
" 43%|████▎ | 26/60 [00:26<00:29, 1.15it/s]INFO: Wrote the pdf file for PMC7522305\n", | ||
" 45%|████▌ | 27/60 [00:28<00:41, 1.25s/it]INFO: Wrote the pdf file for PMC7480948\n", | ||
" 47%|████▋ | 28/60 [00:29<00:33, 1.04s/it]INFO: Wrote the pdf file for PMC8878085\n", | ||
" 48%|████▊ | 29/60 [00:31<00:48, 1.55s/it]INFO: Wrote the pdf file for PMC8371560\n", | ||
" 50%|█████ | 30/60 [00:32<00:36, 1.22s/it]INFO: Wrote the pdf file for PMC8161263\n", | ||
" 52%|█████▏ | 31/60 [00:32<00:28, 1.03it/s]INFO: Wrote the pdf file for PMC8645380\n", | ||
" 53%|█████▎ | 32/60 [00:35<00:39, 1.42s/it]INFO: Wrote the pdf file for PMC8325027\n", | ||
" 55%|█████▌ | 33/60 [00:38<00:50, 1.87s/it]INFO: Wrote the pdf file for PMC8830149\n", | ||
" 58%|█████▊ | 35/60 [00:41<00:44, 1.80s/it]INFO: Wrote the pdf file for PMC7645447\n", | ||
" 60%|██████ | 36/60 [00:41<00:33, 1.40s/it]INFO: Wrote the pdf file for PMC8112658\n", | ||
" 62%|██████▏ | 37/60 [00:42<00:25, 1.11s/it]INFO: Wrote the pdf file for PMC7748606\n", | ||
" 65%|██████▌ | 39/60 [00:42<00:14, 1.48it/s]INFO: Wrote the pdf file for PMC7287757\n", | ||
" 67%|██████▋ | 40/60 [00:43<00:12, 1.64it/s]INFO: Wrote the pdf file for PMC7426850\n", | ||
" 70%|███████ | 42/60 [00:43<00:08, 2.17it/s]INFO: Wrote the pdf file for PMC7447938\n", | ||
" 72%|███████▏ | 43/60 [00:44<00:07, 2.21it/s]INFO: Wrote the pdf file for PMC6724078\n", | ||
" 73%|███████▎ | 44/60 [00:44<00:07, 2.23it/s]INFO: Wrote the pdf file for PMC7011350\n", | ||
" 75%|███████▌ | 45/60 [00:45<00:06, 2.19it/s]INFO: Wrote the pdf file for PMC7201468\n", | ||
" 77%|███████▋ | 46/60 [00:45<00:06, 2.27it/s]INFO: Wrote the pdf file for PMC7455955\n", | ||
" 78%|███████▊ | 47/60 [00:45<00:05, 2.37it/s]INFO: Wrote the pdf file for PMC6636566\n", | ||
" 80%|████████ | 48/60 [00:46<00:05, 2.33it/s]INFO: Wrote the pdf file for PMC5899389\n", | ||
" 82%|████████▏ | 49/60 [00:46<00:04, 2.28it/s]INFO: Wrote the pdf file for PMC6413047\n", | ||
" 83%|████████▎ | 50/60 [00:47<00:04, 2.40it/s]INFO: Wrote the pdf file for PMC6016949\n", | ||
" 87%|████████▋ | 52/60 [00:47<00:02, 2.67it/s]INFO: Wrote the pdf file for PMC4963993\n", | ||
" 88%|████████▊ | 53/60 [00:48<00:02, 2.63it/s]INFO: Wrote the pdf file for PMC5379525\n", | ||
" 90%|█████████ | 54/60 [00:48<00:02, 2.62it/s]INFO: Wrote the pdf file for PMC5637243\n", | ||
" 92%|█████████▏| 55/60 [00:49<00:01, 2.64it/s]INFO: Wrote the pdf file for PMC6272997\n", | ||
" 93%|█████████▎| 56/60 [00:49<00:01, 2.65it/s]INFO: Wrote the pdf file for PMC4153567\n", | ||
" 95%|█████████▌| 57/60 [00:49<00:01, 2.61it/s]INFO: Wrote the pdf file for PMC8879267\n", | ||
" 97%|█████████▋| 58/60 [00:50<00:00, 2.34it/s]INFO: Wrote the pdf file for PMC4208836\n", | ||
"100%|██████████| 60/60 [00:50<00:00, 1.18it/s]\n" | ||
] | ||
} | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"source": [ | ||
"Get only the number of hits for a query without downloading papers" | ||
], | ||
"metadata": { | ||
"id": "8tw3gPBtZdl3" | ||
} | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"source": [ | ||
"pygetpapers_call.run_command(query = 'lantana camara', noexecute=True) " | ||
], | ||
"metadata": { | ||
"colab": { | ||
"base_uri": "https://localhost:8080/" | ||
}, | ||
"id": "MIWtzACFZl1U", | ||
"outputId": "73e726d0-c951-4cef-d1c7-8fb054834048" | ||
}, | ||
"execution_count": 10, | ||
"outputs": [ | ||
{ | ||
"output_type": "stream", | ||
"name": "stderr", | ||
"text": [ | ||
"INFO: Total number of hits for the query are 1433\n" | ||
] | ||
} | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"source": [ | ||
"Query other repositories. `pygetpapers` supports:\n", | ||
"- crossref\n", | ||
"- arxiv\n", | ||
"- biorxiv\n", | ||
"- medrxiv" | |
], | ||
"metadata": { | ||
"id": "4O8ZPZwpaB7Y" | ||
} | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"source": [ | ||
"pygetpapers_call.run_command( limit = 30, output = 'biorxiv_30', api = 'biorxiv' ) " | ||
], | ||
"metadata": { | ||
"colab": { | ||
"base_uri": "https://localhost:8080/" | ||
}, | ||
"id": "D8lLawBKaYCU", | ||
"outputId": "6326ee54-364c-489b-8196-93e2fb92e678" | ||
}, | ||
"execution_count": 13, | ||
"outputs": [ | ||
{ | ||
"output_type": "stream", | ||
"name": "stderr", | ||
"text": [ | ||
"INFO: Making Request to rxiv\n", | ||
"WARNING: No more papers found\n", | ||
"INFO: Wrote metadata file for the query\n", | ||
"INFO: Writing metadata file for the papers at /content/lantana_camara_30/lantana_camara_30_bioarxiv/lantana_camara_30_rxvist/biorxiv_30\n", | ||
"100%|██████████| 30/30 [00:00<00:00, 3197.53it/s]\n" | ||
] | ||
} | ||
] | ||
} | ||
] | ||
} |