Skip to content

Commit

Permalink
Merge pull request #153 from martinghunt/add_megares
Browse files Browse the repository at this point in the history
Add megares
  • Loading branch information
martinghunt authored Jan 17, 2017
2 parents 93bb5f7 + 1f6f349 commit 81bc46b
Show file tree
Hide file tree
Showing 18 changed files with 385 additions and 21 deletions.
2 changes: 2 additions & 0 deletions ariba/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,8 @@
'histogram',
'link',
'mapping',
'megares_data_finder',
'megares_zip_parser',
'mlst_profile',
'mlst_reporter',
'pubmlst_getter',
Expand Down
25 changes: 25 additions & 0 deletions ariba/common.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,14 @@
import os
import time
import sys
import subprocess
import urllib.request
import pyfastaq


class Error (Exception): pass  # raised by helpers in this module (syscall, download_file) on failure


def syscall(cmd, allow_fail=False, verbose=False, verbose_filehandle=sys.stdout, print_errors=True):
if verbose:
print('syscall:', cmd, flush=True, file=verbose_filehandle)
Expand Down Expand Up @@ -44,3 +50,22 @@ def cat_files(infiles, outfile):
pyfastaq.utils.close(f_in)

pyfastaq.utils.close(f_out)


def download_file(url, outfile, max_attempts=3, sleep_time=2, verbose=False):
    """Download url and save it as outfile.

    Sleeps sleep_time seconds before each of up to max_attempts attempts.
    Raises Error (chained to the last download exception) if every attempt
    fails.
    """
    if verbose:
        print('Downloading "', url, '" and saving as "', outfile, '" ...', end='', sep='', flush=True)

    last_exception = None

    for i in range(max_attempts):
        time.sleep(sleep_time)
        try:
            urllib.request.urlretrieve(url, filename=outfile)
        except Exception as e:
            # Narrowed from a bare 'except:' so KeyboardInterrupt/SystemExit
            # still abort; remember the failure so it can be reported.
            last_exception = e
            continue
        break
    else:
        # for/else: runs only if the loop never hit 'break', i.e. every attempt failed.
        raise Error('Error downloading: ' + url) from last_exception

    if verbose:
        print(' done', flush=True)

68 changes: 68 additions & 0 deletions ariba/megares_data_finder.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
import urllib.request
from bs4 import BeautifulSoup
from distutils.version import LooseVersion


class Error (Exception): pass  # raised when the megares download page cannot be fetched/parsed or a version is missing


class MegaresDataFinder:
    """Scrapes the megares download index page to find the URL of a release zip.

    run() returns the full download URL for the requested version, or for the
    newest available version when version is None.
    """

    def __init__(self, version=None):
        self.url_root = 'https://megares.meglab.org/download/'
        self.index_url = self.url_root + 'index.php'
        # None means "use the latest version found on the index page"
        self.version = version


    def _get_available_zips(self):
        """Return the raw HTML of the megares download index page.

        Raises Error if the page cannot be fetched."""
        try:
            response = urllib.request.urlopen(self.index_url)
            html_text = response.read()
        except Exception as e:
            # narrowed from a bare 'except:'; chain the original cause
            raise Error('Error getting megares download page ' + self.index_url) from e

        return html_text


    @classmethod
    def _zips_from_index_page_string(cls, html_text):
        """Parse the download page HTML; return {version: zip filename} for
        every link of the form megares_v<version>.zip."""
        try:
            soup = BeautifulSoup(html_text, 'html.parser')
        except Exception as e:
            raise Error('Error parsing contents of megares download page. Cannot continue') from e

        prefix = 'megares_v'
        suffix = '.zip'
        zips = {}

        for link in soup.find_all('a'):
            href = link.get('href')
            # link.get returns None for <a> tags with no href attribute
            if href is not None and href.startswith(prefix) and href.endswith(suffix):
                version = href[len(prefix):-len(suffix)]
                zips[version] = href

        return zips


    @staticmethod
    def _version_sort_key(version_string):
        """Sort key that orders dotted version strings numerically
        ('1.2' < '1.10'). Replaces distutils LooseVersion, which is
        deprecated and removed from the standard library in Python 3.12.
        Non-numeric components sort after numeric ones."""
        key = []
        for part in version_string.split('.'):
            if part.isdigit():
                key.append((0, int(part), ''))
            else:
                key.append((1, 0, part))
        return key


    @classmethod
    def _get_url_for_version(cls, zips, version=None):
        """Return the zip filename for the requested version, or for the
        newest version when version is None. Raises Error if the requested
        version is not in zips."""
        if version is None:
            versions = sorted(zips.keys(), key=cls._version_sort_key)
            return zips[versions[-1]]
        else:
            try:
                return zips[version]
            except KeyError:
                # narrowed from a bare 'except:' so only a missing version is reported this way
                versions = ', '.join(list(zips.keys()))
                raise Error('Error! version ' + version + ' of megares not found. Available versions: ' + versions) from None


    def run(self):
        """Return the full URL of the megares zip file to download."""
        print('Finding available megares versions from', self.index_url)
        html_text = self._get_available_zips()
        zips = MegaresDataFinder._zips_from_index_page_string(html_text)
        print('Found versions: ', ', '.join(list(zips.keys())))
        url = MegaresDataFinder._get_url_for_version(zips, version=self.version)
        return self.url_root + url


119 changes: 119 additions & 0 deletions ariba/megares_zip_parser.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,119 @@
import os
import sys
import csv
import zipfile
import shutil
import pyfastaq
from ariba import common

class Error (Exception): pass  # raised for mkdir/extract failures and unexpected zip or header contents


class MegaresZipParser:
    """Downloads a megares release zip and converts its contents into the
    FASTA + metadata TSV pair used by 'ariba prepareref'.

    Usage: MegaresZipParser(zip_url, outprefix).run() writes outprefix.fa and
    outprefix.tsv, removing all intermediate files on success.
    """

    def __init__(self, zip_url, outprefix):
        self.zip_url = zip_url
        self.outprefix = outprefix
        # where the raw zip is saved before extraction; removed by run()
        self.zip_file = self.outprefix + '.downloaded.zip'


    @classmethod
    def _extract_files(cls, zip_file, outdir):
        """Extract the annotations, database FASTA and header mappings members
        of zip_file into a newly created directory outdir.

        Returns {'annotations': member, 'fasta': member, 'header_mappings': member}.
        Raises Error if outdir cannot be made, or (after removing outdir) if
        any expected member is missing from the zip."""
        original_files = {'annotations': None, 'fasta': None, 'header_mappings': None}

        try:
            os.mkdir(outdir)
        except OSError as e:
            # narrowed from a bare 'except:' so unrelated errors are not masked
            raise Error('Error making directory ' + outdir) from e

        # 'with' guarantees the zip handle is closed (previously it leaked)
        with zipfile.ZipFile(zip_file) as zfile:
            for member in zfile.namelist():
                # members are identified by the naming convention of megares releases
                if '_annotations_' in member:
                    original_files['annotations'] = member
                elif '_database_' in member and member.endswith('.fasta'):
                    original_files['fasta'] = member
                elif '_header_mappings_' in member:
                    original_files['header_mappings'] = member
                else:
                    continue

                zfile.extract(member, path=outdir)

        if None in original_files.values():
            shutil.rmtree(outdir)
            raise Error('Error. Not all expected files found in downloaded megares zipfile. ' + str(original_files))

        return original_files


    @classmethod
    def _csv_to_dict(cls, infile, delimiter, expected_columns, key_column):
        """Load a delimited file with a header line into a dict.

        Returns {row[key_column]: {column: value for every other expected
        column}}. Raises Error if the header is not exactly expected_columns."""
        data = {}
        non_key_columns = expected_columns - {key_column}

        with open(infile) as f:
            reader = csv.DictReader(f, delimiter=delimiter)
            # fieldnames is None for an empty file; treat that as a bad header
            # instead of crashing on ','.join(None)
            found_columns = reader.fieldnames or []
            if set(expected_columns) != set(found_columns):
                raise Error('Unexpected header in annotations file. Expected columns: ' + ','.join(expected_columns) + ' but got: ' + ','.join(found_columns))

            for row in reader:
                data[row[key_column]] = {x: row[x] for x in non_key_columns}

        return data


    @classmethod
    def _load_annotations_file(cls, infile):
        """Load the megares annotations CSV; returns {header: {class, mechanism, group}}."""
        return MegaresZipParser._csv_to_dict(infile, ',', {'header', 'class', 'mechanism', 'group'}, 'header')


    @classmethod
    def _load_header_mappings_file(cls, infile):
        """Load the header mappings TSV; returns {MEGARes_Header: {source db, source headers}}."""
        return MegaresZipParser._csv_to_dict(infile, '\t', {'Source_Database', 'MEGARes_Header', 'Source_Headers(space_separated)'}, 'MEGARes_Header')


    @classmethod
    def _write_files(cls, outprefix, sequences, annotations, header_mappings):
        """Write outprefix.fa and outprefix.tsv from the sequences dict and
        the two metadata dicts.

        Sequence names are prefixed with their annotation group ('unknown'
        when a sequence has no annotations record). Missing metadata is
        reported on stderr but is not fatal.
        NOTE: mutates sequences in place (each sequence id is renamed)."""
        fasta = outprefix + '.fa'
        tsv = outprefix + '.tsv'
        fh_fasta = pyfastaq.utils.open_file_write(fasta)
        fh_tsv = pyfastaq.utils.open_file_write(tsv)

        for seq in sorted(sequences):
            final_column = []

            if seq in annotations:
                group = annotations[seq]['group']
                final_column.append('class:' + annotations[seq]['class'] + '; mechanism:' + annotations[seq]['mechanism'] + '; group:' + group)
            else:
                group = 'unknown'
                print('WARNING: sequence "', seq, '" has no record in annotations file', sep='', file=sys.stderr)

            if seq in header_mappings:
                final_column.append('Source_Database:' + header_mappings[seq]['Source_Database'] + '; Source_Headers:' + header_mappings[seq]['Source_Headers(space_separated)'])
            else:
                print('WARNING: sequence "', seq, '" has no record in header mappings file', sep='', file=sys.stderr)

            # columns 2-5 are fixed placeholder values; last column is the
            # collected metadata, or '.' when there is none
            if final_column:
                print(group + '.' + seq, '1', '0', '.', '.', '; '.join(final_column), sep='\t', file=fh_tsv)
            else:
                print(group + '.' + seq, '1', '0', '.', '.', '.', sep='\t', file=fh_tsv)

            sequences[seq].id = group + '.' + sequences[seq].id
            print(sequences[seq], file=fh_fasta)

        fh_fasta.close()
        fh_tsv.close()


    def run(self):
        """Download the zip, build outprefix.fa and outprefix.tsv, then remove
        the downloaded zip and the temporary extraction directory."""
        common.download_file(self.zip_url, self.zip_file, verbose=True)
        tmpdir = self.zip_file + '.tmp.extract'
        original_files = MegaresZipParser._extract_files(self.zip_file, tmpdir)
        annotation_data = MegaresZipParser._load_annotations_file(os.path.join(tmpdir, original_files['annotations']))
        header_data = MegaresZipParser._load_header_mappings_file(os.path.join(tmpdir, original_files['header_mappings']))
        sequences = {}
        pyfastaq.tasks.file_to_dict(os.path.join(tmpdir, original_files['fasta']), sequences)
        MegaresZipParser._write_files(self.outprefix, sequences, annotation_data, header_data)
        shutil.rmtree(tmpdir)
        os.unlink(self.zip_file)

38 changes: 19 additions & 19 deletions ariba/ref_genes_getter.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,15 +5,15 @@ class Error (Exception): pass
import shutil
import tarfile
import pyfastaq
import urllib.request
import time
import json
from ariba import common, card_record, vfdb_parser
from ariba import common, card_record, vfdb_parser, megares_data_finder, megares_zip_parser


allowed_ref_dbs = {
'argannot',
'card',
'megares',
'plasmidfinder',
'resfinder',
'srst2_argannot',
Expand All @@ -37,23 +37,9 @@ def __init__(self, ref_db, version=None, debug=False):
pyfastaq.sequences.genetic_code = self.genetic_code


def _download_file(self, url, outfile):
    # NOTE(review): this commit replaces callers of this method with
    # common.download_file (see _get_card_versions and the vfdb/argannot
    # getters); the retry loop below is the logic that moved there.
    # Downloads url to outfile, sleeping self.sleep_time seconds before each
    # of up to self.max_download_attempts attempts; raises Error if every
    # attempt fails.
    print('Downloading "', url, '" and saving as "', outfile, '" ...', end='', sep='', flush=True)
    for i in range(self.max_download_attempts):
        time.sleep(self.sleep_time)
        try:
            urllib.request.urlretrieve(url, filename=outfile)
        except:
            continue
        break
    else:
        # for/else: runs only when the loop never hit 'break', i.e. all attempts failed
        raise Error('Error downloading: ' + url)
    print(' done', flush=True)


def _get_card_versions(self, tmp_file):
print('Getting available CARD versions')
self._download_file('https://card.mcmaster.ca/download', tmp_file)
common.download_file('https://card.mcmaster.ca/download', tmp_file, max_attempts=self.max_download_attempts, sleep_time=self.sleep_time, verbose=True)
p = re.compile(r'''href="(/download/.*?broad.*?v([0-9]+\.[0-9]+\.[0-9]+)\.tar\.gz)"''')
versions = {}

Expand Down Expand Up @@ -269,7 +255,7 @@ def _get_from_argannot(self, outprefix):
raise Error('Error mkdir/chdir ' + tmpdir)

zipfile = 'arg-annot-database_doc.zip'
self._download_file('http://www.mediterranee-infection.com/arkotheque/client/ihumed/_depot_arko/articles/304/arg-annot-database_doc.zip', zipfile)
common.download_file('http://www.mediterranee-infection.com/arkotheque/client/ihumed/_depot_arko/articles/304/arg-annot-database_doc.zip', zipfile, max_attempts=self.max_download_attempts, sleep_time=self.sleep_time, verbose=True)
common.syscall('unzip ' + zipfile)
os.chdir(current_dir)
print('Extracted files.')
Expand Down Expand Up @@ -301,6 +287,20 @@ def _get_from_argannot(self, outprefix):
print(argannot_ref)


def _get_from_megares(self, outprefix):
    """Download the megares reference data and write outprefix.fa and
    outprefix.tsv, then print usage and citation information."""
    finder = megares_data_finder.MegaresDataFinder(version=self.version)
    url = finder.run()
    megares_zip_parser.MegaresZipParser(url, outprefix).run()
    final_fasta = outprefix + '.fa'
    final_tsv = outprefix + '.tsv'
    print('Finished. Final files are:', final_fasta, final_tsv, sep='\n\t', end='\n\n')
    print('You can use them with ARIBA like this:')
    print('ariba prepareref -f', final_fasta, '-m', final_tsv, 'output_directory\n')
    print('If you use this downloaded data, please cite:')
    print('"MEGARes: an antimicrobial database for high throughput sequencing", Lakin et al 2016, PMID: PMC5210519\n')


def _get_from_plasmidfinder(self, outprefix):
outprefix = os.path.abspath(outprefix)
final_fasta = outprefix + '.fa'
Expand Down Expand Up @@ -408,7 +408,7 @@ def _get_from_vfdb_common(self, outprefix, filename, info_text):
raise Error('Error mkdir ' + tmpdir)

zipfile = os.path.join(tmpdir, filename)
self._download_file('http://www.mgc.ac.cn/VFs/Down/' + filename, zipfile)
common.download_file('http://www.mgc.ac.cn/VFs/Down/' + filename, zipfile, max_attempts=self.max_download_attempts, sleep_time=self.sleep_time, verbose=True)
print('Extracting files ... ', end='', flush=True)
vparser = vfdb_parser.VfdbParser(zipfile, outprefix)
vparser.run()
Expand Down
Binary file not shown.
Binary file not shown.
3 changes: 3 additions & 0 deletions ariba/tests/data/megares_zip_parser_load_annotations.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
header,class,mechanism,group
Bla|OXA-1|JN123456|42-141|100|betalactams|Class_A_betalactamases|OXA,betalactams,Class A betalactamases,OXA
Foo|Bar-1|JN42|1-11|10|foobar|Class_foobar|Bar,Class,foobar,Bar
3 changes: 3 additions & 0 deletions ariba/tests/data/megares_zip_parser_load_header_mappings.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
Source_Database MEGARes_Header Source_Headers(space_separated)
SOURCE1 Bla|OXA-1|JN123456|42-141|100|betalactams|Class_A_betalactamases|OXA source header 1
SOURCE2 Foo|Bar-1|JN42|1-11|10|foobar|Class_foobar|Bar source header 2
6 changes: 6 additions & 0 deletions ariba/tests/data/megares_zip_parser_write_files.expect.fa
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
>OXA.Bla|OXA-1|JN123456|42-141|100|betalactams|Class_A_betalactamases|OXA
ATGACCGAAAGCAGCGAACGCGCGTGCACCTGA
>group1.Foo|Bar-1|JN42|1-11|10|foobar|Class_foobar|Bar
ATGTGCGCGCGCTGCGCGAGCAGCCGCGTGCTGGAATGA
>unknown.Only_in_fasta_file
ATGTGA
3 changes: 3 additions & 0 deletions ariba/tests/data/megares_zip_parser_write_files.expect.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
OXA.Bla|OXA-1|JN123456|42-141|100|betalactams|Class_A_betalactamases|OXA 1 0 . . class:betalactams; mechanism:Class A betalactamases; group:OXA; Source_Database:SOURCE1; Source_Headers:source header 1
group1.Foo|Bar-1|JN42|1-11|10|foobar|Class_foobar|Bar 1 0 . . class:Class foobar; mechanism:Bar; group:group1; Source_Database:SOURCE2; Source_Headers:source header 2
unknown.Only_in_fasta_file 1 0 . . .
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
header,class,mechanism,group
Bla|OXA-1|JN123456|42-141|100|betalactams|Class_A_betalactamases|OXA,betalactams,Class A betalactamases,OXA
Foo|Bar-1|JN42|1-11|10|foobar|Class_foobar|Bar,Class foobar,Bar,group1
only in annotations file,foo,bar,baz
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
>Bla|OXA-1|JN123456|42-141|100|betalactams|Class_A_betalactamases|OXA
ATGACCGAAAGCAGCGAACGCGCGTGCACCTGA
>Foo|Bar-1|JN42|1-11|10|foobar|Class_foobar|Bar
ATGTGCGCGCGCTGCGCGAGCAGCCGCGTGCTGGAATGA
>Only_in_fasta_file
ATGTGA
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
Source_Database MEGARes_Header Source_Headers(space_separated)
SOURCE1 Bla|OXA-1|JN123456|42-141|100|betalactams|Class_A_betalactamases|OXA source header 1
SOURCE2 Foo|Bar-1|JN42|1-11|10|foobar|Class_foobar|Bar source header 2
sourceX only in header mapping file source header X
36 changes: 36 additions & 0 deletions ariba/tests/megares_data_finder_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
import unittest
import os
from ariba import megares_data_finder

# Locate this copy of the ariba package, so the tests/data fixture directory
# can be found regardless of the current working directory.
modules_dir = os.path.dirname(os.path.abspath(megares_data_finder.__file__))
data_dir = os.path.join(modules_dir, 'tests', 'data')


class TestMegaresDataFinder(unittest.TestCase):
    """Unit tests for the pure helper methods of MegaresDataFinder."""

    def test_zips_from_index_page_string(self):
        '''test _zips_from_index_page_string'''
        page_html = r''''<!doctype html>
<html>
<head>
</head>
<ul>
<li><a href="megares_v1.01.zip">All Files</a></li>
<li><a href="foo.zip">All Files</a></li>
<li><a href="megares_v1.00.zip">All Files</a></li>
</html>'''

        got = megares_data_finder.MegaresDataFinder._zips_from_index_page_string(page_html)
        self.assertEqual({'1.00': 'megares_v1.00.zip', '1.01': 'megares_v1.01.zip'}, got)


    def test_get_url_for_version(self):
        '''test _get_url_for_version'''
        finder = megares_data_finder.MegaresDataFinder
        zips = {'1.00': 'megares_v1.00.zip', '1.01': 'megares_v1.01.zip'}
        # no version requested -> latest available
        self.assertEqual('megares_v1.01.zip', finder._get_url_for_version(zips))
        self.assertEqual('megares_v1.00.zip', finder._get_url_for_version(zips, version='1.00'))
        with self.assertRaises(megares_data_finder.Error):
            finder._get_url_for_version(zips, version='0.42')

Loading

0 comments on commit 81bc46b

Please sign in to comment.