Skip to content

Commit

Permalink
Merge pull request #33 from INCATools/xmlParsing_hhegde
Browse files Browse the repository at this point in the history
Pushing notebook and output file. addresses #1
  • Loading branch information
hrshdhgd authored Oct 16, 2020
2 parents 2f830e5 + 4177733 commit 36ac94e
Show file tree
Hide file tree
Showing 3 changed files with 473 additions and 0 deletions.
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ jupyterlab==2.2.4
jupyterlab-server==1.2.0
kiwisolver==1.2.0
locket==0.2.0
lxml==4.5.2
MarkupSafe==1.1.1
matplotlib==3.3.0
mistune==0.8.4
Expand Down
236 changes: 236 additions & 0 deletions src/notebooks/.ipynb_checkpoints/xmlParsing-checkpoint.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,236 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Import data and libraries"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# get XML file\n",
"# !wget -P ../../target -c \"https://ftp.ncbi.nlm.nih.gov/bioproject/bioproject.xml\""
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"#!tail -n 100 '../input/bioproject.xml'\n",
"#!head -n 100 '../input/bioproject.xml'"
]
},
{
"cell_type": "code",
"execution_count": 98,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 34 µs, sys: 70 µs, total: 104 µs\n",
"Wall time: 116 µs\n"
]
}
],
"source": [
"%%time\n",
"import lxml.etree as ET\n",
"import pandas as pd\n",
"from datetime import datetime"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Using lxml\n",
"\n",
"https://lxml.de/api/lxml-module.html"
]
},
{
"cell_type": "code",
"execution_count": 99,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 8 µs, sys: 1e+03 ns, total: 9 µs\n",
"Wall time: 12.2 µs\n"
]
}
],
"source": [
"%%time\n",
"\n",
"def findTagInfo(fileName):\n",
" count = 0\n",
" col = ['StudyId', 'Name', 'Title', 'Description', 'BiosampleId']\n",
" \n",
" fn = '../../target/biosampleDescriptionDF.tsv'\n",
" \n",
" for _, elem in ET.iterparse(fileName, events=('end',),\n",
" tag=('ArchiveID', 'ProjectDescr'),\n",
" remove_blank_text=True):\n",
" studyName = None\n",
" studyTitle = None\n",
" studyDesc = None\n",
" biosampleList = []\n",
" df = pd.DataFrame(columns = col)\n",
" \n",
" if elem.tag == 'ArchiveID':\n",
" studyId = elem.get('accession')\n",
" \n",
" elif elem.tag == 'ProjectDescr':\n",
" studyName = elem.findtext('Name')\n",
" studyTitle = elem.findtext('Title')\n",
" studyDesc = elem.findtext('Description')\n",
" locus = elem.findall('LocusTagPrefix')\n",
" for l in locus:\n",
" if l.get('biosample_id') is not None:\n",
" biosampleList.append('BIOSAMPLE:'+l.get('biosample_id'))\n",
"\n",
" else:\n",
" elem.clear()\n",
" \n",
" if None not in (studyId, studyName, studyTitle, studyDesc) and len(biosampleList)>0:\n",
" count += 1\n",
" for i,v in enumerate(biosampleList):\n",
" df = df.append(pd.DataFrame([[studyId, studyName, studyTitle, studyDesc, v]],columns = col))\n",
" \n",
" \n",
" if count == 1:\n",
" df.to_csv(fn , sep='\\t', index=False)\n",
" else:\n",
" df.to_csv(fn, sep='\\t', mode='a+', header=False, index=False)"
]
},
{
"cell_type": "code",
"execution_count": 100,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 1h 7min 57s, sys: 3min 50s, total: 1h 11min 47s\n",
"Wall time: 1h 18min 57s\n"
]
}
],
"source": [
"%%time\n",
"\n",
"xml = '../input/bioproject.xml'\n",
"findTagInfo(xml)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"###################DEPRECATED############\n",
"\n",
"# Using standard python library xml.etree.ElementTree (Slow)\n",
"\n",
"'''%%time\n",
"xtree = et.parse('../../target/bioproject.xml')\n",
"xroot = xtree.getroot()\n",
"\n",
"def nodeParse(node, col):\n",
" \n",
" df = pd.DataFrame(columns = col)\n",
" biosampleIdList = []\n",
" studyName = None \n",
" studyTitle = None\n",
" studyDesc = None\n",
" \n",
" \n",
" for childNode in node:\n",
" #print(childNode.tag, childNode.attrib)\n",
" if childNode.tag == 'Name':\n",
" studyName = childNode.text\n",
" #print(studyName)\n",
" if childNode.tag == 'Title':\n",
" studyTitle = childNode.text\n",
" #print(studyTitle)\n",
" if childNode.tag == 'Description':\n",
" studyDesc = childNode.text\n",
" #print(studyDesc)\n",
" if childNode.tag == 'LocusTagPrefix':\n",
" #if 'assembly_id' in childNode.attrib:\n",
" #assemblyId = childNode.attrib['assembly_id']\n",
" #print(assemblyId)\n",
" \n",
" if 'biosample_id' in childNode.attrib:\n",
" biosampleIdList.append('BIOSAMPLE:'+ childNode.attrib['biosample_id'])\n",
" \n",
" for i,v in enumerate(biosampleIdList):\n",
" if None not in (studyName, studyTitle, studyDesc, v):\n",
" df = df.append(pd.DataFrame([[studyName, studyTitle, studyDesc, v]],columns = col))\n",
"\n",
" return df\n",
"\n",
"%timeit\n",
"count = 0\n",
"fn = '../../target/biosampleDescriptionDF.tsv'\n",
"dfCols = ['Name', 'Title', 'Description', 'BiosampleId']\n",
"dfMain = pd.DataFrame(columns = dfCols)\n",
"for n in xroot.iterparse('ProjectDescr'):\n",
" count += 1\n",
" dfMain = dfMain.append(nodeParse(n, dfCols))\n",
" if count % 1000 == 0:\n",
" print('Counter value: '+ str(count//1000)+ 'K - '+ datetime.now().strftime(\"%d/%m/%Y %H:%M:%S\"))\n",
" if count == 1000:\n",
" dfMain.to_csv(fn , sep='\\t', index=False)\n",
" elif count > 1000:\n",
" dfMain.to_csv(fn, sep='\\t', mode='a+', header=False, index=False)\n",
" \n",
" dfMain = pd.DataFrame(columns = dfCols)\n",
" #break;'''\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.9"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
Loading

0 comments on commit 36ac94e

Please sign in to comment.