-
Notifications
You must be signed in to change notification settings - Fork 8
/
validate.py
executable file
·208 lines (186 loc) · 8.58 KB
/
validate.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
#!/usr/bin/env python3
import collections
import csv
import json
import math
import sys
from bisect import bisect_right
from time import time
try:
import lxml.etree as etree
except ImportError:
import xml.etree.ElementTree as etree
import logging
import JSON_templates
from helpers import auto_open
logger = logging.getLogger("validator")
def load_mapping(path):
    """Load the reference-dataset mapping from a (possibly compressed) JSON file.

    :param path: path to the mapping.json file of the QfO reference dataset.
    :return: the decoded JSON object (a dict with at least a 'mapping' key).
    """
    with auto_open(path, 'rb') as fh:
        raw = fh.read()
    return json.loads(raw.decode('utf-8'))
def parse_orthoxml(fh, valid_ids, excluded_ids):
    """Validate an orthoxml-formatted prediction file.

    Streams through the document with :func:`etree.iterparse` and checks
    structural sanity (properly nested <orthologGroup>/<species> elements,
    <groups> only after at least one species) and that every <gene>
    cross-references a protein id of the reference dataset. Elements are
    cleared as soon as they are processed to keep memory usage flat.

    :param fh: binary file handle positioned at the start of the document.
    :param valid_ids: container supporting membership tests with the valid
        protein ids of the reference dataset.
    :param excluded_ids: container of ids that are tolerated but skipped
        (e.g. genes of excluded species).
    :return: True on success.
    :raises AssertionError: if the document structure is invalid or more
        than 50 unknown cross-references are found. Raised explicitly
        (not via ``assert``) so validation also works under ``python -O``;
        callers rely on catching AssertionError.
    """
    og_level = 0            # current <orthologGroup> nesting depth
    in_species = False
    nr_species_done = 0
    nr_excluded_genes = 0
    max_invalid_ids = 50    # budget of tolerated unknown cross-references

    logger.info("start mapping of orthoxml formatted input file")
    # xml.etree.ElementTree does not keep parent links in the element
    # nodes, so we maintain the ancestor chain ourselves (the equivalent
    # of ancestor-or-self) to be able to prune processed siblings.
    parentStack = []
    for event, elem in etree.iterparse(fh, events=('start', 'end')):
        if event == 'start':
            elemTag = elem.tag
            if elemTag.endswith('}orthologGroup'):
                if in_species:
                    raise AssertionError('<orthologGroup> element inside <species> element')
                og_level += 1
            elif elemTag.endswith('}groups'):
                if nr_species_done <= 0:
                    raise AssertionError('<groups> element before any <species> definition')
            elif elemTag.endswith('}species'):
                if in_species:
                    raise AssertionError('nested <species> elements are not allowed')
                in_species = True
            parentStack.append(elem)
        elif event == 'end':
            parentStack.pop()
            elemTag = elem.tag
            if elemTag.endswith('}orthologGroup'):
                og_level -= 1
                if og_level < 0:
                    raise AssertionError('unbalanced <orthologGroup> elements')
            elif elemTag.endswith('}species'):
                if not in_species:
                    raise AssertionError('unbalanced <species> elements')
                in_species = False
                nr_species_done += 1
            elif elemTag.endswith('}gene'):
                protId = elem.get('protId')
                # Element.get returns None for a missing attribute (it
                # never raises KeyError), so check explicitly instead of
                # relying on a try/except KeyError that can never fire.
                if protId is None:
                    raise AssertionError('<gene> elements must encode xref in "protId" attribute')
                if protId not in valid_ids:
                    if protId not in excluded_ids:
                        max_invalid_ids -= 1
                        logger.warning("\"{}\" is an invalid protein id for this reference dataset"
                                       .format(protId))
                        if max_invalid_ids < 0:
                            raise AssertionError(
                                'Too many invalid crossreferences found. Did you select the right reference dataset?')
                    else:
                        logger.debug("excluding protein \"{}\" from the benchmark analysis".format(protId))
                        nr_excluded_genes += 1
            # Keep memory flat: clear the finished element and drop its
            # already-processed siblings from the parent (lxml's
            # getprevious()/getparent() trick is unavailable in the
            # stdlib ElementTree, hence the explicit parent stack).
            elem.clear()
            if len(parentStack) > 0:
                eparent = parentStack[-1]
                while len(eparent) > 1:
                    del eparent[0]
    if in_species:
        raise AssertionError('unclosed <species> element at end of document')
    if og_level != 0:
        raise AssertionError('unclosed <orthologGroup> element at end of document')
    if nr_species_done <= 0:
        raise AssertionError('no <species> definitions found in document')
    if nr_excluded_genes > 0:
        logger.info("Excluded {} genes from the predictions (excluded species)".format(nr_excluded_genes))
    return True
def parse_tsv(fh, valid_ids, excluded_ids):
    """Validate a character-separated file of ortholog pair predictions.

    The csv dialect is sniffed from the first 2 KB of the file. The first
    two columns of every row must contain protein ids that are part of
    the reference dataset; ids of excluded species are tolerated but
    reported once at debug level.

    :param fh: text-mode, seekable file handle of the predictions.
    :param valid_ids: container of valid protein ids (membership tests).
    :param excluded_ids: container of tolerated-but-skipped protein ids.
    :raises AssertionError: if too many rows have fewer than 2 columns,
        more than 50 distinct unknown ids occur, or the file holds too
        few pairs to be analysed.
    """
    logger.info("trying to read as tsv file")
    max_errors = 5          # budget of tolerated short rows
    invalid_ids = set()
    reported_excluded = set()
    dialect = csv.Sniffer().sniff(fh.read(2048))
    fh.seek(0)
    csv_reader = csv.reader(fh, dialect)

    def check_if_valid_id(id_):
        # Report each unknown/excluded id only once; abort once more than
        # 50 distinct invalid cross-references have been seen.
        if id_ not in valid_ids:
            if id_ in excluded_ids:
                if id_ not in reported_excluded:
                    reported_excluded.add(id_)
                    logger.debug("protein \"{}\" is excluded from benchmarking (but part of reference proteomes set)."
                                 .format(id_))
            elif id_ not in invalid_ids:
                invalid_ids.add(id_)
                logger.warning("\"{}\" is an invalid protein id for this reference dataset"
                               .format(id_))
                if len(invalid_ids) > 50:
                    raise AssertionError(
                        'Too many invalid crossreferences found. Did you select the right reference dataset?')

    line_nr = 0  # stays 0 for an empty file -> "too few pairs" below
    for line_nr, row in enumerate(csv_reader):
        if len(row) < 2:
            logger.warning("skipping relation on line {} ({})"
                           .format(line_nr, row))
            max_errors -= 1
            if max_errors < 0:
                raise AssertionError("Too many lines with less than 2 elements")
            continue
        for z in row[:2]:
            check_if_valid_id(z)
    if line_nr < 100:
        raise AssertionError("Too few ortholog pairs to be analysed")
    if len(reported_excluded) > 0:
        logger.info("excluded {} genes from the predictions (excluded species)"
                    .format(len(reported_excluded)))
def identify_input_type_and_validate(fpath, valid_ids, excluded_ids):
    """Detect the input format (orthoxml vs tsv) and validate the file.

    The format is guessed from the first bytes of the file: anything
    starting with an xml declaration or an <ortho...> root element is
    treated as orthoxml, everything else as a tab-separated pairs file.

    :param fpath: path to the (possibly compressed) prediction file.
    :param valid_ids: container of valid protein ids.
    :param excluded_ids: container of tolerated-but-skipped protein ids.
    :return: True if the file validated successfully, False otherwise.
    """
    with auto_open(fpath, 'rb') as probe:
        magic = probe.read(20)
    if magic.startswith((b'<?xml', b'<ortho')):
        parse, mode, kind = parse_orthoxml, 'rb', 'orthoxml'
    else:
        parse, mode, kind = parse_tsv, 'rt', 'tab-separated'
    try:
        with auto_open(fpath, mode) as fh:
            parse(fh, valid_ids, excluded_ids)
    except AssertionError as e:
        logger.error('input file is not a valid {} file: {}'.format(kind, e))
        return False
    return True
def write_participant_dataset_file(outfn, participant_id, community, challenges, is_valid):
    """Write the OEB participant-dataset validation result as a JSON file.

    :param outfn: output filename for the validation json.
    :param participant_id: name of the participating tool.
    :param community: name or OEB permanent id of the benchmarking community.
    :param challenges: list of challenge ids, or a whitespace-separated string.
    :param is_valid: bool, outcome of the validation.
    """
    # accept a whitespace-separated string as well as a list of challenges
    if isinstance(challenges, str):
        challenges = challenges.split()
    data_id = community + ":" + participant_id + "_P"
    payload = JSON_templates.write_participant_dataset(
        data_id, community, challenges, participant_id, is_valid)
    with open(outfn, 'wt') as fout:
        json.dump(payload, fout, sort_keys=True, indent=4, separators=(',', ': '))
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(description="Extract Pairwise relations from uploaded data")
    parser.add_argument('mapping', help="Path to mapping.json of proper QfO dataset")
    parser.add_argument('input_rels', help="Path to input relation file. either tsv or orthoxml")
    parser.add_argument('-d', '--debug', action="store_true", help="Set logging to debug level")
    parser.add_argument('-c', '--com', required=True, help="Name or OEB permanent ID for the benchmarking community")
    parser.add_argument('--challenges_ids', default=[], help="List of benchmarks that will be run")
    parser.add_argument('-p', '--participant', required=True, help="Name of the tool")
    parser.add_argument('-o', '--out', required=True, help="Output filename for validation json")
    conf = parser.parse_args()

    # configure logging; -d/--debug raises the verbosity
    log_conf = {'level': logging.INFO, 'format': "%(asctime)-15s %(levelname)-7s: %(message)s"}
    if conf.debug:
        log_conf['level'] = logging.DEBUG
    logging.basicConfig(**log_conf)

    mapping_data = load_mapping(conf.mapping)
    # 'excluded_ids' is optional in the mapping file; convert the JSON
    # list to a frozenset for O(1) membership tests during validation.
    excluded_ids = frozenset(mapping_data.get('excluded_ids', []))
    is_valid = identify_input_type_and_validate(conf.input_rels, mapping_data['mapping'], excluded_ids)
    write_participant_dataset_file(conf.out, conf.participant, conf.com, conf.challenges_ids, is_valid)
    if not is_valid:
        # non-zero exit so workflow engines register the failed validation
        sys.exit("ERROR: Submitted data does not validate against any reference data! Please check "+conf.out )