-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #33 from INCATools/xmlParsing_hhegde
Pushing notebook and output file. addresses #1
- Loading branch information
Showing
3 changed files
with
473 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
236 changes: 236 additions & 0 deletions
236
src/notebooks/.ipynb_checkpoints/xmlParsing-checkpoint.ipynb
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,236 @@ | ||
{ | ||
"cells": [ | ||
{ | ||
"cell_type": "markdown", | ||
"metadata": {}, | ||
"source": [ | ||
"## Import data and libraries" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# get XML file\n", | ||
"# !wget -P ../../target -c \"https://ftp.ncbi.nlm.nih.gov/bioproject/bioproject.xml\"" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 4, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"#!tail -n 100 '../input/bioproject.xml'\n", | ||
"#!head -n 100 '../input/bioproject.xml'" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 98, | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"name": "stdout", | ||
"output_type": "stream", | ||
"text": [ | ||
"CPU times: user 34 µs, sys: 70 µs, total: 104 µs\n", | ||
"Wall time: 116 µs\n" | ||
] | ||
} | ||
], | ||
"source": [ | ||
"%%time\n", | ||
"import lxml.etree as ET\n", | ||
"import pandas as pd\n", | ||
"from datetime import datetime" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"metadata": {}, | ||
"source": [ | ||
"# Using lxml\n", | ||
"\n", | ||
"https://lxml.de/api/lxml-module.html" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 99, | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"name": "stdout", | ||
"output_type": "stream", | ||
"text": [ | ||
"CPU times: user 8 µs, sys: 1e+03 ns, total: 9 µs\n", | ||
"Wall time: 12.2 µs\n" | ||
] | ||
} | ||
], | ||
"source": [ | ||
"%%time\n", | ||
"\n", | ||
"def findTagInfo(fileName):\n", | ||
" count = 0\n", | ||
" col = ['StudyId', 'Name', 'Title', 'Description', 'BiosampleId']\n", | ||
" \n", | ||
" fn = '../../target/biosampleDescriptionDF.tsv'\n", | ||
" \n", | ||
" for _, elem in ET.iterparse(fileName, events=('end',),\n", | ||
" tag=('ArchiveID', 'ProjectDescr'),\n", | ||
" remove_blank_text=True):\n", | ||
" studyName = None\n", | ||
" studyTitle = None\n", | ||
" studyDesc = None\n", | ||
" biosampleList = []\n", | ||
" df = pd.DataFrame(columns = col)\n", | ||
" \n", | ||
" if elem.tag == 'ArchiveID':\n", | ||
" studyId = elem.get('accession')\n", | ||
" \n", | ||
" elif elem.tag == 'ProjectDescr':\n", | ||
" studyName = elem.findtext('Name')\n", | ||
" studyTitle = elem.findtext('Title')\n", | ||
" studyDesc = elem.findtext('Description')\n", | ||
" locus = elem.findall('LocusTagPrefix')\n", | ||
" for l in locus:\n", | ||
" if l.get('biosample_id') is not None:\n", | ||
" biosampleList.append('BIOSAMPLE:'+l.get('biosample_id'))\n", | ||
"\n", | ||
" else:\n", | ||
" elem.clear()\n", | ||
" \n", | ||
" if None not in (studyId, studyName, studyTitle, studyDesc) and len(biosampleList)>0:\n", | ||
" count += 1\n", | ||
" for i,v in enumerate(biosampleList):\n", | ||
" df = df.append(pd.DataFrame([[studyId, studyName, studyTitle, studyDesc, v]],columns = col))\n", | ||
" \n", | ||
" \n", | ||
" if count == 1:\n", | ||
" df.to_csv(fn , sep='\\t', index=False)\n", | ||
" else:\n", | ||
" df.to_csv(fn, sep='\\t', mode='a+', header=False, index=False)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 100, | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"name": "stdout", | ||
"output_type": "stream", | ||
"text": [ | ||
"CPU times: user 1h 7min 57s, sys: 3min 50s, total: 1h 11min 47s\n", | ||
"Wall time: 1h 18min 57s\n" | ||
] | ||
} | ||
], | ||
"source": [ | ||
"%%time\n", | ||
"\n", | ||
"xml = '../input/bioproject.xml'\n", | ||
"findTagInfo(xml)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"###################DEPRECATED############\n", | ||
"\n", | ||
"# Using standard python library xml.etree.ElementTree (Slow)\n", | ||
"\n", | ||
"'''%%time\n", | ||
"xtree = et.parse('../../target/bioproject.xml')\n", | ||
"xroot = xtree.getroot()\n", | ||
"\n", | ||
"def nodeParse(node, col):\n", | ||
" \n", | ||
" df = pd.DataFrame(columns = col)\n", | ||
" biosampleIdList = []\n", | ||
" studyName = None \n", | ||
" studyTitle = None\n", | ||
" studyDesc = None\n", | ||
" \n", | ||
" \n", | ||
" for childNode in node:\n", | ||
" #print(childNode.tag, childNode.attrib)\n", | ||
" if childNode.tag == 'Name':\n", | ||
" studyName = childNode.text\n", | ||
" #print(studyName)\n", | ||
" if childNode.tag == 'Title':\n", | ||
" studyTitle = childNode.text\n", | ||
" #print(studyTitle)\n", | ||
" if childNode.tag == 'Description':\n", | ||
" studyDesc = childNode.text\n", | ||
" #print(studyDesc)\n", | ||
" if childNode.tag == 'LocusTagPrefix':\n", | ||
" #if 'assembly_id' in childNode.attrib:\n", | ||
" #assemblyId = childNode.attrib['assembly_id']\n", | ||
" #print(assemblyId)\n", | ||
" \n", | ||
" if 'biosample_id' in childNode.attrib:\n", | ||
" biosampleIdList.append('BIOSAMPLE:'+ childNode.attrib['biosample_id'])\n", | ||
" \n", | ||
" for i,v in enumerate(biosampleIdList):\n", | ||
" if None not in (studyName, studyTitle, studyDesc, v):\n", | ||
" df = df.append(pd.DataFrame([[studyName, studyTitle, studyDesc, v]],columns = col))\n", | ||
"\n", | ||
" return df\n", | ||
"\n", | ||
"%timeit\n", | ||
"count = 0\n", | ||
"fn = '../../target/biosampleDescriptionDF.tsv'\n", | ||
"dfCols = ['Name', 'Title', 'Description', 'BiosampleId']\n", | ||
"dfMain = pd.DataFrame(columns = dfCols)\n", | ||
"for n in xroot.iterparse('ProjectDescr'):\n", | ||
" count += 1\n", | ||
" dfMain = dfMain.append(nodeParse(n, dfCols))\n", | ||
" if count % 1000 == 0:\n", | ||
" print('Counter value: '+ str(count//1000)+ 'K - '+ datetime.now().strftime(\"%d/%m/%Y %H:%M:%S\"))\n", | ||
" if count == 1000:\n", | ||
" dfMain.to_csv(fn , sep='\\t', index=False)\n", | ||
" elif count > 1000:\n", | ||
" dfMain.to_csv(fn, sep='\\t', mode='a+', header=False, index=False)\n", | ||
" \n", | ||
" dfMain = pd.DataFrame(columns = dfCols)\n", | ||
" #break;'''\n" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [] | ||
} | ||
], | ||
"metadata": { | ||
"kernelspec": { | ||
"display_name": "Python 3", | ||
"language": "python", | ||
"name": "python3" | ||
}, | ||
"language_info": { | ||
"codemirror_mode": { | ||
"name": "ipython", | ||
"version": 3 | ||
}, | ||
"file_extension": ".py", | ||
"mimetype": "text/x-python", | ||
"name": "python", | ||
"nbconvert_exporter": "python", | ||
"pygments_lexer": "ipython3", | ||
"version": "3.7.9" | ||
} | ||
}, | ||
"nbformat": 4, | ||
"nbformat_minor": 4 | ||
} |
Oops, something went wrong.