-
Notifications
You must be signed in to change notification settings - Fork 55
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #153 from martinghunt/add_megares
Add megares
- Loading branch information
Showing
18 changed files
with
385 additions
and
21 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,68 @@ | ||
import urllib.request | ||
from bs4 import BeautifulSoup | ||
from distutils.version import LooseVersion | ||
|
||
|
||
class Error(Exception):
    '''Raised when the megares download page cannot be fetched or parsed,
    or when a requested version is not available.'''
|
||
|
||
class MegaresDataFinder:
    '''Works out the URL of a megares zip file to download.

    The megares download index page is fetched and scanned for links of
    the form megares_v<version>.zip. Either the requested version, or the
    highest version found, is turned into a full URL.
    '''

    def __init__(self, version=None):
        '''version: version string to look for (eg '1.01'), or None to
        use the highest version listed on the download page.'''
        self.url_root = 'https://megares.meglab.org/download/'
        self.index_url = self.url_root + 'index.php'
        self.version = version


    def _get_available_zips(self):
        '''Returns the raw contents of the download index page.

        Raises Error if the page cannot be retrieved.'''
        try:
            # The context manager makes sure the connection gets closed
            with urllib.request.urlopen(self.index_url) as response:
                return response.read()
        except Exception as err:
            raise Error('Error getting megares download page ' + self.index_url) from err


    @classmethod
    def _zips_from_index_page_string(cls, html_text):
        '''Returns a dict of version string -> zip file name, made from
        every link in html_text that looks like megares_v<version>.zip.

        Raises Error if html_text cannot be parsed.'''
        try:
            soup = BeautifulSoup(html_text, 'html.parser')
        except Exception as err:
            raise Error('Error parsing contents of megares download page. Cannot continue') from err

        prefix = 'megares_v'
        suffix = '.zip'
        zips = {}

        for link in soup.find_all('a'):
            href = link.get('href')
            # An <a> tag without an href attribute gives None: nothing to check
            if not href:
                continue

            if href.startswith(prefix) and href.endswith(suffix):
                zips[href[len(prefix):-len(suffix)]] = href

        return zips


    @staticmethod
    def _version_sort_key(version):
        '''Returns a key that sorts dot-separated version strings so that
        numeric components compare as numbers ('1.10' is after '1.9').

        Non-numeric components sort after numeric ones and compare as
        strings, so that mixed components never raise TypeError.'''
        key = []
        for part in version.split('.'):
            if part.isdecimal():
                key.append((0, int(part), ''))
            else:
                key.append((1, 0, part))
        return tuple(key)


    @classmethod
    def _get_url_for_version(cls, zips, version=None):
        '''Returns the zip file name from the dict zips (as made by
        _zips_from_index_page_string) for the given version. If version
        is None, the highest version in zips is used.

        Raises Error if zips is empty and no version was given, or if the
        requested version is not in zips.'''
        if version is None:
            if not zips:
                raise Error('Error! No megares versions found. Cannot continue')

            # Done locally so this method does not rely on distutils'
            # LooseVersion, which was deprecated by PEP 632
            ordered = sorted(zips, key=cls._version_sort_key)
            return zips[ordered[-1]]

        try:
            return zips[version]
        except KeyError as err:
            versions = ', '.join(zips)
            raise Error('Error! version ' + str(version) + ' of megares not found. Available versions: ' + versions) from err


    def run(self):
        '''Looks up the available versions online and returns the full
        URL of the zip file for self.version (or of the highest version
        when self.version is None).'''
        print('Finding available megares versions from', self.index_url)
        html_text = self._get_available_zips()
        zips = self._zips_from_index_page_string(html_text)
        print('Found versions: ', ', '.join(zips))
        url = self._get_url_for_version(zips, version=self.version)
        return self.url_root + url
|
||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,119 @@ | ||
import os | ||
import sys | ||
import csv | ||
import zipfile | ||
import shutil | ||
import pyfastaq | ||
from ariba import common | ||
|
||
class Error(Exception):
    '''Raised when a downloaded megares zip file cannot be extracted or
    its contents are not as expected.'''
|
||
|
||
class MegaresZipParser:
    '''Downloads a megares zip file and converts its contents into a
    FASTA file and a TSV metadata file named <outprefix>.fa and
    <outprefix>.tsv.'''

    def __init__(self, zip_url, outprefix):
        '''zip_url: URL of the zip file to download.
        outprefix: prefix of the names of all files that get written.'''
        self.zip_url = zip_url
        self.outprefix = outprefix
        self.zip_file = self.outprefix + '.downloaded.zip'


    @classmethod
    def _extract_files(cls, zip_file, outdir):
        '''Makes the new directory outdir and extracts into it the
        annotations, FASTA and header mappings files from zip_file.
        Members of the zip file are identified by their names.

        Returns a dict with keys 'annotations', 'fasta', 'header_mappings'
        and values the matching member names inside the zip file.

        Raises Error if outdir cannot be made, or if any one of the three
        files is missing from the zip file. outdir is removed again when
        anything goes wrong after it was made.'''
        original_files = {'annotations': None, 'fasta': None, 'header_mappings': None}

        try:
            os.mkdir(outdir)
        except OSError as err:
            raise Error('Error making directory ' + outdir) from err

        try:
            # Context manager so the zip file handle is always closed
            with zipfile.ZipFile(zip_file) as zfile:
                for member in zfile.namelist():
                    if '_annotations_' in member:
                        original_files['annotations'] = member
                    elif '_database_' in member and member.endswith('.fasta'):
                        original_files['fasta'] = member
                    elif '_header_mappings_' in member:
                        original_files['header_mappings'] = member
                    else:
                        continue

                    zfile.extract(member, path=outdir)
        except Exception:
            # We made outdir, so do not leave a partial extraction behind
            shutil.rmtree(outdir, ignore_errors=True)
            raise

        if None in original_files.values():
            shutil.rmtree(outdir)
            raise Error('Error. Not all expected files found in downloaded megares zipfile. ' + str(original_files))

        return original_files


    @classmethod
    def _csv_to_dict(cls, infile, delimiter, expected_columns, key_column):
        '''Loads a delimited file that has a header line into a dict.

        infile: name of file to read.
        delimiter: column separator.
        expected_columns: the exact column names the header must have
            (any iterable of strings; order does not matter).
        key_column: the column whose value becomes the key of each row.

        Returns a dict of key_column value -> dict of the other columns.
        If a key is repeated, the last row wins.

        Raises Error if the header does not match expected_columns.'''
        expected_columns = set(expected_columns)
        non_key_columns = expected_columns - {key_column}
        data = {}

        # newline='' is what the csv module asks for when given a file object
        with open(infile, newline='') as f:
            reader = csv.DictReader(f, delimiter=delimiter)
            # fieldnames is None for an empty file
            got_columns = reader.fieldnames or []
            if expected_columns != set(got_columns):
                raise Error('Unexpected header in file ' + str(infile) + '. Expected columns: ' + ','.join(sorted(expected_columns)) + ' but got: ' + ','.join(got_columns))

            for row in reader:
                data[row[key_column]] = {x: row[x] for x in non_key_columns}

        return data


    @classmethod
    def _load_annotations_file(cls, infile):
        '''Loads the comma-separated annotations file.
        Returns dict of header -> {'class': ..., 'mechanism': ..., 'group': ...}'''
        return cls._csv_to_dict(infile, ',', {'header', 'class', 'mechanism', 'group'}, 'header')


    @classmethod
    def _load_header_mappings_file(cls, infile):
        '''Loads the tab-separated header mappings file.
        Returns dict of MEGARes_Header -> dict of the other two columns.'''
        return cls._csv_to_dict(infile, '\t', {'Source_Database', 'MEGARes_Header', 'Source_Headers(space_separated)'}, 'MEGARes_Header')


    @classmethod
    def _describe_sequence(cls, seq, annotations, header_mappings):
        '''Returns tuple (group, description) for the sequence called seq.
        group is 'unknown' and/or description is '.' when there is no
        information. Prints a warning to stderr for each missing record.'''
        final_column = []

        if seq in annotations:
            group = annotations[seq]['group']
            final_column.append('class:' + annotations[seq]['class'] + '; mechanism:' + annotations[seq]['mechanism'] + '; group:' + group)
        else:
            group = 'unknown'
            print('WARNING: sequence "', seq, '" has no record in annotations file', sep='', file=sys.stderr)

        if seq in header_mappings:
            final_column.append('Source_Database:' + header_mappings[seq]['Source_Database'] + '; Source_Headers:' + header_mappings[seq]['Source_Headers(space_separated)'])
        else:
            print('WARNING: sequence "', seq, '" has no record in header mappings file', sep='', file=sys.stderr)

        return group, '; '.join(final_column) if final_column else '.'


    @classmethod
    def _write_files(cls, outprefix, sequences, annotations, header_mappings):
        '''Writes outprefix.fa and outprefix.tsv.

        sequences: dict of name -> sequence object. Each object must have
            an id attribute and print as a FASTA record.
            NOTE: the id of every sequence gets changed in place, by
            putting the group name and a dot in front of it.
        annotations: dict made by _load_annotations_file.
        header_mappings: dict made by _load_header_mappings_file.

        Sequences are written in sorted order of their names.'''
        fasta = outprefix + '.fa'
        tsv = outprefix + '.tsv'

        # try/finally so that no file handle is left open if writing fails
        fh_fasta = pyfastaq.utils.open_file_write(fasta)
        try:
            fh_tsv = pyfastaq.utils.open_file_write(tsv)
            try:
                for seq in sorted(sequences):
                    group, description = cls._describe_sequence(seq, annotations, header_mappings)
                    print(group + '.' + seq, '1', '0', '.', '.', description, sep='\t', file=fh_tsv)
                    sequences[seq].id = group + '.' + sequences[seq].id
                    print(sequences[seq], file=fh_fasta)
            finally:
                fh_tsv.close()
        finally:
            fh_fasta.close()


    def run(self):
        '''Downloads the zip file, writes the output FASTA and TSV files,
        then deletes the downloaded zip file and the temporary directory
        it was extracted into.'''
        common.download_file(self.zip_url, self.zip_file, verbose=True)
        tmpdir = self.zip_file + '.tmp.extract'
        original_files = self._extract_files(self.zip_file, tmpdir)

        # From here on tmpdir exists and is ours, so always tidy it up
        try:
            annotation_data = self._load_annotations_file(os.path.join(tmpdir, original_files['annotations']))
            header_data = self._load_header_mappings_file(os.path.join(tmpdir, original_files['header_mappings']))
            sequences = {}
            pyfastaq.tasks.file_to_dict(os.path.join(tmpdir, original_files['fasta']), sequences)
            self._write_files(self.outprefix, sequences, annotation_data, header_data)
        finally:
            shutil.rmtree(tmpdir)

        os.unlink(self.zip_file)
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Binary file not shown.
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
header,class,mechanism,group | ||
Bla|OXA-1|JN123456|42-141|100|betalactams|Class_A_betalactamases|OXA,betalactams,Class A betalactamases,OXA | ||
Foo|Bar-1|JN42|1-11|10|foobar|Class_foobar|Bar,Class,foobar,Bar |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
Source_Database MEGARes_Header Source_Headers(space_separated) | ||
SOURCE1 Bla|OXA-1|JN123456|42-141|100|betalactams|Class_A_betalactamases|OXA source header 1 | ||
SOURCE2 Foo|Bar-1|JN42|1-11|10|foobar|Class_foobar|Bar source header 2 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,6 @@ | ||
>OXA.Bla|OXA-1|JN123456|42-141|100|betalactams|Class_A_betalactamases|OXA | ||
ATGACCGAAAGCAGCGAACGCGCGTGCACCTGA | ||
>group1.Foo|Bar-1|JN42|1-11|10|foobar|Class_foobar|Bar | ||
ATGTGCGCGCGCTGCGCGAGCAGCCGCGTGCTGGAATGA | ||
>unknown.Only_in_fasta_file | ||
ATGTGA |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
OXA.Bla|OXA-1|JN123456|42-141|100|betalactams|Class_A_betalactamases|OXA 1 0 . . class:betalactams; mechanism:Class A betalactamases; group:OXA; Source_Database:SOURCE1; Source_Headers:source header 1 | ||
group1.Foo|Bar-1|JN42|1-11|10|foobar|Class_foobar|Bar 1 0 . . class:Class foobar; mechanism:Bar; group:group1; Source_Database:SOURCE2; Source_Headers:source header 2 | ||
unknown.Only_in_fasta_file 1 0 . . . |
4 changes: 4 additions & 0 deletions
4
ariba/tests/data/megares_zip_parser_write_files/megares_annotations_v1.01.csv
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,4 @@ | ||
header,class,mechanism,group | ||
Bla|OXA-1|JN123456|42-141|100|betalactams|Class_A_betalactamases|OXA,betalactams,Class A betalactamases,OXA | ||
Foo|Bar-1|JN42|1-11|10|foobar|Class_foobar|Bar,Class foobar,Bar,group1 | ||
only in annotations file,foo,bar,baz |
6 changes: 6 additions & 0 deletions
6
ariba/tests/data/megares_zip_parser_write_files/megares_database_v1.01.fasta
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,6 @@ | ||
>Bla|OXA-1|JN123456|42-141|100|betalactams|Class_A_betalactamases|OXA | ||
ATGACCGAAAGCAGCGAACGCGCGTGCACCTGA | ||
>Foo|Bar-1|JN42|1-11|10|foobar|Class_foobar|Bar | ||
ATGTGCGCGCGCTGCGCGAGCAGCCGCGTGCTGGAATGA | ||
>Only_in_fasta_file | ||
ATGTGA |
4 changes: 4 additions & 0 deletions
4
...a/tests/data/megares_zip_parser_write_files/megares_to_external_header_mappings_v1.01.tsv
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,4 @@ | ||
Source_Database MEGARes_Header Source_Headers(space_separated) | ||
SOURCE1 Bla|OXA-1|JN123456|42-141|100|betalactams|Class_A_betalactamases|OXA source header 1 | ||
SOURCE2 Foo|Bar-1|JN42|1-11|10|foobar|Class_foobar|Bar source header 2 | ||
sourceX only in header mapping file source header X |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,36 @@ | ||
import unittest | ||
import os | ||
from ariba import megares_data_finder | ||
|
||
# Test data lives in tests/data, next to the megares_data_finder module file
modules_dir, _ = os.path.split(os.path.abspath(megares_data_finder.__file__))
data_dir = os.path.join(modules_dir, 'tests', 'data')
|
||
|
||
class TestMegaresDataFinder(unittest.TestCase):
    '''Tests of the parts of MegaresDataFinder that need no network access.'''

    def test_zips_from_index_page_string(self):
        '''test _zips_from_index_page_string'''
        # foo.zip must be ignored because its name is not megares_v*.zip
        html_string = r'''<!doctype html>
<html>
<head>
</head>
<ul>
<li><a href="megares_v1.01.zip">All Files</a></li>
<li><a href="foo.zip">All Files</a></li>
<li><a href="megares_v1.00.zip">All Files</a></li>
</html>'''

        expected = {'1.00': 'megares_v1.00.zip', '1.01': 'megares_v1.01.zip'}
        got = megares_data_finder.MegaresDataFinder._zips_from_index_page_string(html_string)
        self.assertEqual(expected, got)


    def test_get_url_for_version(self):
        '''test _get_url_for_version'''
        zips = {'1.00': 'megares_v1.00.zip', '1.01': 'megares_v1.01.zip'}
        # No version given means the highest one is wanted
        self.assertEqual('megares_v1.01.zip', megares_data_finder.MegaresDataFinder._get_url_for_version(zips))
        self.assertEqual('megares_v1.00.zip', megares_data_finder.MegaresDataFinder._get_url_for_version(zips, version='1.00'))
        # The call raises, so there is no return value to compare against
        with self.assertRaises(megares_data_finder.Error):
            megares_data_finder.MegaresDataFinder._get_url_for_version(zips, version='0.42')
|
Oops, something went wrong.