From 41777330eed7d0a48a12d5b18f8ef0ffbc23e384 Mon Sep 17 00:00:00 2001 From: harshad Date: Fri, 16 Oct 2020 11:05:35 -0500 Subject: [PATCH] Pushing notebook and output file. addresses #1 --- requirements.txt | 1 + .../xmlParsing-checkpoint.ipynb | 236 ++++++++++++++++++ src/notebooks/xmlParsing.ipynb | 236 ++++++++++++++++++ 3 files changed, 473 insertions(+) create mode 100644 src/notebooks/.ipynb_checkpoints/xmlParsing-checkpoint.ipynb create mode 100644 src/notebooks/xmlParsing.ipynb diff --git a/requirements.txt b/requirements.txt index d769fd8..38c16aa 100644 --- a/requirements.txt +++ b/requirements.txt @@ -27,6 +27,7 @@ jupyterlab==2.2.4 jupyterlab-server==1.2.0 kiwisolver==1.2.0 locket==0.2.0 +lxml==4.5.2 MarkupSafe==1.1.1 matplotlib==3.3.0 mistune==0.8.4 diff --git a/src/notebooks/.ipynb_checkpoints/xmlParsing-checkpoint.ipynb b/src/notebooks/.ipynb_checkpoints/xmlParsing-checkpoint.ipynb new file mode 100644 index 0000000..1fdff11 --- /dev/null +++ b/src/notebooks/.ipynb_checkpoints/xmlParsing-checkpoint.ipynb @@ -0,0 +1,236 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Import data and libraries" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# get XML file\n", + "# !wget -P ../../target -c \"https://ftp.ncbi.nlm.nih.gov/bioproject/bioproject.xml\"" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "#!tail -n 100 '../input/bioproject.xml'\n", + "#!head -n 100 '../input/bioproject.xml'" + ] + }, + { + "cell_type": "code", + "execution_count": 98, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 34 µs, sys: 70 µs, total: 104 µs\n", + "Wall time: 116 µs\n" + ] + } + ], + "source": [ + "%%time\n", + "import lxml.etree as ET\n", + "import pandas as pd\n", + "from datetime import datetime" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Using lxml\n", + "\n", + "https://lxml.de/api/lxml-module.html" + ] + }, + { + "cell_type": "code", + "execution_count": 99, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 8 µs, sys: 1e+03 ns, total: 9 µs\n", + "Wall time: 12.2 µs\n" + ] + } + ], + "source": [ + "%%time\n", + "\n", + "def findTagInfo(fileName):\n", + " count = 0\n", + " col = ['StudyId', 'Name', 'Title', 'Description', 'BiosampleId']\n", + " \n", + " fn = '../../target/biosampleDescriptionDF.tsv'\n", + " \n", + " for _, elem in ET.iterparse(fileName, events=('end',),\n", + " tag=('ArchiveID', 'ProjectDescr'),\n", + " remove_blank_text=True):\n", + " studyName = None\n", + " studyTitle = None\n", + " studyDesc = None\n", + " biosampleList = []\n", + " df = pd.DataFrame(columns = col)\n", + " \n", + " if elem.tag == 'ArchiveID':\n", + " studyId = elem.get('accession')\n", + " \n", + " elif elem.tag == 'ProjectDescr':\n", + " studyName = elem.findtext('Name')\n", + " studyTitle = elem.findtext('Title')\n", + " studyDesc = elem.findtext('Description')\n", + " locus = elem.findall('LocusTagPrefix')\n", + " for l in locus:\n", + " if l.get('biosample_id') is not None:\n", + " biosampleList.append('BIOSAMPLE:'+l.get('biosample_id'))\n", + "\n", + " else:\n", + " elem.clear()\n", + " \n", + " if None not in (studyId, studyName, studyTitle, studyDesc) and len(biosampleList)>0:\n", + " count += 1\n", + " for i,v in enumerate(biosampleList):\n", + " df = df.append(pd.DataFrame([[studyId, studyName, studyTitle, studyDesc, v]],columns = col))\n", + " \n", + " \n", + " if count == 1:\n", + " df.to_csv(fn , sep='\\t', index=False)\n", + " else:\n", + " df.to_csv(fn, sep='\\t', mode='a+', header=False, index=False)" + ] + }, + { + "cell_type": "code", + "execution_count": 100, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 1h 7min 57s, sys: 3min 50s, total: 1h 11min 47s\n", + "Wall time: 1h 18min 57s\n" + ] + } + ], + "source": [ + "%%time\n", + "\n", + "xml = '../input/bioproject.xml'\n", + "findTagInfo(xml)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "###################DEPRECATED############\n", + "\n", + "# Using standard python library xml.etree.ElementTree (Slow)\n", + "\n", + "'''%%time\n", + "xtree = et.parse('../../target/bioproject.xml')\n", + "xroot = xtree.getroot()\n", + "\n", + "def nodeParse(node, col):\n", + " \n", + " df = pd.DataFrame(columns = col)\n", + " biosampleIdList = []\n", + " studyName = None \n", + " studyTitle = None\n", + " studyDesc = None\n", + " \n", + " \n", + " for childNode in node:\n", + " #print(childNode.tag, childNode.attrib)\n", + " if childNode.tag == 'Name':\n", + " studyName = childNode.text\n", + " #print(studyName)\n", + " if childNode.tag == 'Title':\n", + " studyTitle = childNode.text\n", + " #print(studyTitle)\n", + " if childNode.tag == 'Description':\n", + " studyDesc = childNode.text\n", + " #print(studyDesc)\n", + " if childNode.tag == 'LocusTagPrefix':\n", + " #if 'assembly_id' in childNode.attrib:\n", + " #assemblyId = childNode.attrib['assembly_id']\n", + " #print(assemblyId)\n", + " \n", + " if 'biosample_id' in childNode.attrib:\n", + " biosampleIdList.append('BIOSAMPLE:'+ childNode.attrib['biosample_id'])\n", + " \n", + " for i,v in enumerate(biosampleIdList):\n", + " if None not in (studyName, studyTitle, studyDesc, v):\n", + " df = df.append(pd.DataFrame([[studyName, studyTitle, studyDesc, v]],columns = col))\n", + "\n", + " return df\n", + "\n", + "%timeit\n", + "count = 0\n", + "fn = '../../target/biosampleDescriptionDF.tsv'\n", + "dfCols = ['Name', 'Title', 'Description', 'BiosampleId']\n", + "dfMain = pd.DataFrame(columns = dfCols)\n", + "for n in xroot.iterparse('ProjectDescr'):\n", + " count += 1\n", + " dfMain = dfMain.append(nodeParse(n, dfCols))\n", + " if count % 1000 == 0:\n", + " print('Counter value: '+ str(count//1000)+ 'K - '+ datetime.now().strftime(\"%d/%m/%Y %H:%M:%S\"))\n", + " if count == 1000:\n", + " dfMain.to_csv(fn , sep='\\t', index=False)\n", + " elif count > 1000:\n", + " dfMain.to_csv(fn, sep='\\t', mode='a+', header=False, index=False)\n", + " \n", + " dfMain = pd.DataFrame(columns = dfCols)\n", + " #break;'''\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.9" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/src/notebooks/xmlParsing.ipynb b/src/notebooks/xmlParsing.ipynb new file mode 100644 index 0000000..1fdff11 --- /dev/null +++ b/src/notebooks/xmlParsing.ipynb @@ -0,0 +1,236 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Import data and libraries" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# get XML file\n", + "# !wget -P ../../target -c \"https://ftp.ncbi.nlm.nih.gov/bioproject/bioproject.xml\"" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "#!tail -n 100 '../input/bioproject.xml'\n", + "#!head -n 100 '../input/bioproject.xml'" + ] + }, + { + "cell_type": "code", + "execution_count": 98, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 34 µs, sys: 70 µs, total: 104 µs\n", + "Wall time: 116 µs\n" + ] + } + ], + "source": [ + "%%time\n", + "import lxml.etree as ET\n", + "import pandas as pd\n", + "from datetime import datetime" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Using lxml\n", + "\n", + "https://lxml.de/api/lxml-module.html" + ] + }, + { + "cell_type": "code", + "execution_count": 99, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 8 µs, sys: 1e+03 ns, total: 9 µs\n", + "Wall time: 12.2 µs\n" + ] + } + ], + "source": [ + "%%time\n", + "\n", + "def findTagInfo(fileName):\n", + " count = 0\n", + " col = ['StudyId', 'Name', 'Title', 'Description', 'BiosampleId']\n", + " \n", + " fn = '../../target/biosampleDescriptionDF.tsv'\n", + " \n", + " for _, elem in ET.iterparse(fileName, events=('end',),\n", + " tag=('ArchiveID', 'ProjectDescr'),\n", + " remove_blank_text=True):\n", + " studyName = None\n", + " studyTitle = None\n", + " studyDesc = None\n", + " biosampleList = []\n", + " df = pd.DataFrame(columns = col)\n", + " \n", + " if elem.tag == 'ArchiveID':\n", + " studyId = elem.get('accession')\n", + " \n", + " elif elem.tag == 'ProjectDescr':\n", + " studyName = elem.findtext('Name')\n", + " studyTitle = elem.findtext('Title')\n", + " studyDesc = elem.findtext('Description')\n", + " locus = elem.findall('LocusTagPrefix')\n", + " for l in locus:\n", + " if l.get('biosample_id') is not None:\n", + " biosampleList.append('BIOSAMPLE:'+l.get('biosample_id'))\n", + "\n", + " else:\n", + " elem.clear()\n", + " \n", + " if None not in (studyId, studyName, studyTitle, studyDesc) and len(biosampleList)>0:\n", + " count += 1\n", + " for i,v in enumerate(biosampleList):\n", + " df = df.append(pd.DataFrame([[studyId, studyName, studyTitle, studyDesc, v]],columns = col))\n", + " \n", + " \n", + " if count == 1:\n", + " df.to_csv(fn , sep='\\t', index=False)\n", + " else:\n", + " df.to_csv(fn, sep='\\t', mode='a+', header=False, index=False)" + ] + }, + { + "cell_type": "code", + "execution_count": 100, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 1h 7min 57s, sys: 3min 50s, total: 1h 11min 47s\n", + "Wall time: 1h 18min 57s\n" + ] + } + ], + "source": [ + "%%time\n", + "\n", + "xml = '../input/bioproject.xml'\n", + "findTagInfo(xml)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "###################DEPRECATED############\n", + "\n", + "# Using standard python library xml.etree.ElementTree (Slow)\n", + "\n", + "'''%%time\n", + "xtree = et.parse('../../target/bioproject.xml')\n", + "xroot = xtree.getroot()\n", + "\n", + "def nodeParse(node, col):\n", + " \n", + " df = pd.DataFrame(columns = col)\n", + " biosampleIdList = []\n", + " studyName = None \n", + " studyTitle = None\n", + " studyDesc = None\n", + " \n", + " \n", + " for childNode in node:\n", + " #print(childNode.tag, childNode.attrib)\n", + " if childNode.tag == 'Name':\n", + " studyName = childNode.text\n", + " #print(studyName)\n", + " if childNode.tag == 'Title':\n", + " studyTitle = childNode.text\n", + " #print(studyTitle)\n", + " if childNode.tag == 'Description':\n", + " studyDesc = childNode.text\n", + " #print(studyDesc)\n", + " if childNode.tag == 'LocusTagPrefix':\n", + " #if 'assembly_id' in childNode.attrib:\n", + " #assemblyId = childNode.attrib['assembly_id']\n", + " #print(assemblyId)\n", + " \n", + " if 'biosample_id' in childNode.attrib:\n", + " biosampleIdList.append('BIOSAMPLE:'+ childNode.attrib['biosample_id'])\n", + " \n", + " for i,v in enumerate(biosampleIdList):\n", + " if None not in (studyName, studyTitle, studyDesc, v):\n", + " df = df.append(pd.DataFrame([[studyName, studyTitle, studyDesc, v]],columns = col))\n", + "\n", + " return df\n", + "\n", + "%timeit\n", + "count = 0\n", + "fn = '../../target/biosampleDescriptionDF.tsv'\n", + "dfCols = ['Name', 'Title', 'Description', 'BiosampleId']\n", + "dfMain = pd.DataFrame(columns = dfCols)\n", + "for n in xroot.iterparse('ProjectDescr'):\n", + " count += 1\n", + " dfMain = dfMain.append(nodeParse(n, dfCols))\n", + " if count % 1000 == 0:\n", + " print('Counter value: '+ str(count//1000)+ 'K - '+ datetime.now().strftime(\"%d/%m/%Y %H:%M:%S\"))\n", + " if count == 1000:\n", + " dfMain.to_csv(fn , sep='\\t', index=False)\n", + " elif count > 1000:\n", + " dfMain.to_csv(fn, sep='\\t', mode='a+', header=False, index=False)\n", + " \n", + " dfMain = pd.DataFrame(columns = dfCols)\n", + " #break;'''\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.9" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +}