Merge pull request #33 from INCATools/xmlParsing_hhegde

Pushing notebook and output file. addresses #1
INCATools · Oct 16, 2020 · 36ac94e · 36ac94e
2 parents 2f830e5 + 4177733
commit 36ac94e
Show file tree

Hide file tree

Showing 3 changed files with 473 additions and 0 deletions.
diff --git a/requirements.txt b/requirements.txt
@@ -27,6 +27,7 @@ jupyterlab==2.2.4
 jupyterlab-server==1.2.0
 kiwisolver==1.2.0
 locket==0.2.0
+lxml==4.5.2
 MarkupSafe==1.1.1
 matplotlib==3.3.0
 mistune==0.8.4

diff --git a/src/notebooks/.ipynb_checkpoints/xmlParsing-checkpoint.ipynb b/src/notebooks/.ipynb_checkpoints/xmlParsing-checkpoint.ipynb
@@ -0,0 +1,236 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Import data and libraries"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# get XML file\n",
+    "# !wget -P ../../target -c \"https://ftp.ncbi.nlm.nih.gov/bioproject/bioproject.xml\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#!tail -n 100 '../input/bioproject.xml'\n",
+    "#!head -n 100 '../input/bioproject.xml'"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 98,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "CPU times: user 34 µs, sys: 70 µs, total: 104 µs\n",
+      "Wall time: 116 µs\n"
+     ]
+    }
+   ],
+   "source": [
+    "%%time\n",
+    "import lxml.etree as ET\n",
+    "import pandas as pd\n",
+    "from datetime import datetime"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Using lxml\n",
+    "\n",
+    "https://lxml.de/api/lxml-module.html"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 99,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "CPU times: user 8 µs, sys: 1e+03 ns, total: 9 µs\n",
+      "Wall time: 12.2 µs\n"
+     ]
+    }
+   ],
+   "source": [
+    "%%time\n",
+    "\n",
+    "def findTagInfo(fileName, outFile='../../target/biosampleDescriptionDF.tsv'):\n",
+    "    \"\"\"Stream a BioProject XML file and write study/biosample rows to a TSV.\n",
+    "\n",
+    "    Only the 'end' events of ArchiveID and ProjectDescr elements are visited.\n",
+    "    The 'accession' attribute of the most recent ArchiveID is kept as the\n",
+    "    study id and paired with the next ProjectDescr. One row is written per\n",
+    "    LocusTagPrefix child that carries a 'biosample_id' attribute, and only\n",
+    "    when the study id, Name, Title and Description are all present.\n",
+    "\n",
+    "    Parameters\n",
+    "    ----------\n",
+    "    fileName : str\n",
+    "        Path of the XML file to parse.\n",
+    "    outFile : str, optional\n",
+    "        Path of the tab-separated output file. It is overwritten on every\n",
+    "        call. Defaults to the location that used to be hard-coded.\n",
+    "\n",
+    "    Returns\n",
+    "    -------\n",
+    "    None\n",
+    "\n",
+    "    NOTE(review): the study id is not reset between projects, so this assumes\n",
+    "    every ProjectDescr is preceded by its own ArchiveID -- confirm against the\n",
+    "    BioProject schema.\n",
+    "    \"\"\"\n",
+    "    col = ['StudyId', 'Name', 'Title', 'Description', 'BiosampleId']\n",
+    "\n",
+    "    # Initialised up front so a ProjectDescr seen before any ArchiveID is\n",
+    "    # skipped by the None check below instead of raising UnboundLocalError.\n",
+    "    studyId = None\n",
+    "    headerWritten = False\n",
+    "\n",
+    "    for _, elem in ET.iterparse(fileName, events=('end',),\n",
+    "                                tag=('ArchiveID', 'ProjectDescr'),\n",
+    "                                remove_blank_text=True):\n",
+    "        if elem.tag == 'ArchiveID':\n",
+    "            studyId = elem.get('accession')\n",
+    "\n",
+    "        elif elem.tag == 'ProjectDescr':\n",
+    "            studyName = elem.findtext('Name')\n",
+    "            studyTitle = elem.findtext('Title')\n",
+    "            studyDesc = elem.findtext('Description')\n",
+    "            biosampleList = ['BIOSAMPLE:' + locus.get('biosample_id')\n",
+    "                             for locus in elem.findall('LocusTagPrefix')\n",
+    "                             if locus.get('biosample_id') is not None]\n",
+    "\n",
+    "            if None not in (studyId, studyName, studyTitle, studyDesc) and biosampleList:\n",
+    "                rows = [[studyId, studyName, studyTitle, studyDesc, biosampleId]\n",
+    "                        for biosampleId in biosampleList]\n",
+    "                # Write only when there is something to write. The first write\n",
+    "                # truncates the file and emits the header; later writes append.\n",
+    "                # Writing on every event used to overwrite the first study's\n",
+    "                # rows with an empty frame.\n",
+    "                pd.DataFrame(rows, columns=col).to_csv(\n",
+    "                    outFile, sep='\\t', index=False,\n",
+    "                    mode='a' if headerWritten else 'w',\n",
+    "                    header=not headerWritten)\n",
+    "                headerWritten = True\n",
+    "\n",
+    "        # Drop the element's children, text and attributes once it has been\n",
+    "        # read. The tag filter only yields the two tags handled above, so the\n",
+    "        # former else-branch clear() could never run.\n",
+    "        elem.clear()\n",
+    "\n",
+    "    if not headerWritten:\n",
+    "        # No study qualified: still leave a header-only file so stale output\n",
+    "        # from an earlier run is not mistaken for this run's result.\n",
+    "        pd.DataFrame(columns=col).to_csv(outFile, sep='\\t', index=False)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 100,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "CPU times: user 1h 7min 57s, sys: 3min 50s, total: 1h 11min 47s\n",
+      "Wall time: 1h 18min 57s\n"
+     ]
+    }
+   ],
+   "source": [
+    "%%time\n",
+    "\n",
+    "xml = '../input/bioproject.xml'\n",
+    "findTagInfo(xml)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "###################DEPRECATED############\n",
+    "\n",
+    "# Using standard python library xml.etree.ElementTree (Slow)\n",
+    "\n",
+    "'''%%time\n",
+    "xtree = et.parse('../../target/bioproject.xml')\n",
+    "xroot = xtree.getroot()\n",
+    "\n",
+    "def nodeParse(node, col):\n",
+    "  \n",
+    "    df = pd.DataFrame(columns = col)\n",
+    "    biosampleIdList = []\n",
+    "    studyName =  None \n",
+    "    studyTitle = None\n",
+    "    studyDesc = None\n",
+    "    \n",
+    "    \n",
+    "    for childNode in node:\n",
+    "        #print(childNode.tag, childNode.attrib)\n",
+    "        if childNode.tag == 'Name':\n",
+    "            studyName = childNode.text\n",
+    "            #print(studyName)\n",
+    "        if childNode.tag == 'Title':\n",
+    "            studyTitle = childNode.text\n",
+    "            #print(studyTitle)\n",
+    "        if childNode.tag == 'Description':\n",
+    "            studyDesc = childNode.text\n",
+    "            #print(studyDesc)\n",
+    "        if childNode.tag == 'LocusTagPrefix':\n",
+    "            #if 'assembly_id' in childNode.attrib:\n",
+    "                #assemblyId = childNode.attrib['assembly_id']\n",
+    "                #print(assemblyId)\n",
+    "                \n",
+    "            if 'biosample_id' in childNode.attrib:\n",
+    "                biosampleIdList.append('BIOSAMPLE:'+ childNode.attrib['biosample_id'])\n",
+    "                \n",
+    "                for i,v in enumerate(biosampleIdList):\n",
+    "                    if None not in (studyName, studyTitle, studyDesc, v):\n",
+    "                        df = df.append(pd.DataFrame([[studyName, studyTitle, studyDesc, v]],columns = col))\n",
+    "\n",
+    "    return df\n",
+    "\n",
+    "%timeit\n",
+    "count = 0\n",
+    "fn = '../../target/biosampleDescriptionDF.tsv'\n",
+    "dfCols = ['Name', 'Title', 'Description', 'BiosampleId']\n",
+    "dfMain = pd.DataFrame(columns = dfCols)\n",
+    "for n in xroot.iterparse('ProjectDescr'):\n",
+    "    count += 1\n",
+    "    dfMain = dfMain.append(nodeParse(n, dfCols))\n",
+    "    if count % 1000 == 0:\n",
+    "        print('Counter value: '+ str(count//1000)+ 'K - '+ datetime.now().strftime(\"%d/%m/%Y %H:%M:%S\"))\n",
+    "        if count == 1000:\n",
+    "            dfMain.to_csv(fn , sep='\\t', index=False)\n",
+    "        elif count > 1000:\n",
+    "            dfMain.to_csv(fn, sep='\\t', mode='a+', header=False, index=False)\n",
+    "            \n",
+    "        dfMain = pd.DataFrame(columns = dfCols)\n",
+    "        #break;'''\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.7.9"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}