diff --git a/augur/export.py b/augur/export.py index 56de6248b..b138d7b18 100644 --- a/augur/export.py +++ b/augur/export.py @@ -248,7 +248,7 @@ def construct_author_info_nexflu(metadata, tree, nodes): author_info = defaultdict(lambda: {"n": 0}) for strain, data in metadata.items(): if "authors" not in data: - print("Error - {} had no authors".format(n)) + print("Error - {} had no authors".format(strain)) continue if data["authors"] not in authorsInTree: continue diff --git a/augur/filter.py b/augur/filter.py index f8ab51ad1..85a99354e 100644 --- a/augur/filter.py +++ b/augur/filter.py @@ -37,6 +37,21 @@ def write_vcf(compressed, input_file, output_file, dropped_samps): except OSError: pass +def read_priority_scores(fname): + priorities = defaultdict(float) + if not os.path.isfile(fname): + print("ERROR: priority file %s doesn't exist"%fname) + return priorities + + with open(fname) as pfile: + for l in pfile: + f = l.strip().split() + try: + priorities[f[0]] = float(f[1]) + except: + print("ERROR: malformatted priority:",l) + + return priorities def run(args): @@ -66,43 +81,64 @@ def run(args): meta_dict, meta_columns = read_metadata(args.metadata) - ####Filtering steps + ##################################### + #Filtering steps + ##################################### + + # remove sequences without meta data tmp = [ ] - for s in seq_keep: - if s in meta_dict: - tmp.append(s) + for seq_name in seq_keep: + if seq_name in meta_dict: + tmp.append(seq_name) else: - print("No meta data for",s) + print("No meta data for %s, excluding from all further analysis."%seq_name) seq_keep = tmp + # remove strains explicitly excluded by name + # read list of strains to exclude from file and prune seq_keep if args.exclude and os.path.isfile(args.exclude): with open(args.exclude, 'r') as ifile: to_exclude = set([line.strip() for line in ifile if line[0]!=comment_char]) - seq_keep = [s for s in seq_keep if s not in to_exclude] + seq_keep = [seq_name for seq_name in seq_keep if seq_name not in to_exclude] + + # exclude strain my metadata field like 'host=camel' + if args.exclude_where: + for ex in args.exclude_where: + try: + col, val = ex.split("=") + except (ValueError,TypeError): + print("invalid exclude clause %s, should be of from property=value"%ex) + else: + seq_keep = [seq_name for seq_name in seq_keep + if meta_dict[seq_name].get(col,'unknown')!=val] - if is_vcf and args.min_length: #doesn't make sense for VCF, ignore. - print("WARNING: Cannot use min_length for VCF files. Ignoring...") - elif (not is_vcf) and args.min_length: - seq_keep = [s for s in seq_keep if len(seqs[s])>=args.min_length] + # filter by sequence length + if args.min_length: + if is_vcf: #doesn't make sense for VCF, ignore. + print("WARNING: Cannot use min_length for VCF files. Ignoring...") + else: + seq_keep = [s for s in seq_keep if len(seqs[s])>=args.min_length] + # filter by date if (args.min_date or args.max_date) and 'date' in meta_columns: dates = get_numerical_dates(meta_dict, fmt="%Y-%m-%d") if args.min_date: - seq_keep = [s for s in seq_keep if dates[s] and np.min(dates[s])>args.min_date] + seq_keep = [s for s in seq_keep if dates[s] and np.max(dates[s])>args.min_date] if args.max_date: seq_keep = [s for s in seq_keep if dates[s] and np.min(dates[s])