Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ADD: Script to allow metaphlan3 output generation #51

Open
wants to merge 3 commits into
base: master
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
213 changes: 213 additions & 0 deletions kreport2mpa3.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,213 @@
#!/usr/bin/env python
####################################################################
# kreport2mpa3.py is an update to the kreport2mpa.py script by Jennifer
# Lu. This update acts as means of allowing the conversion of kraken/
# bracken reports to metaphlan version 3+ outputs. The script has
# been tested with the output used for HUMAnN v3.1.1. The usage remains
# the same as the original kreport2mpa.py. Added lines have been commented
# for clarity of changes. An extra argument has been added to allow the
# generated mpa file to be used as an input to HUMAnN v3. For this,
# use the -hm or --humann_mode flag.
#
# Justification for changes: Metaphlan3 bugs list outputs now contain
# two additional columns -- taxid and additional species. From what I
# can see the taxid column has to be populated while the additional
# species can be empty. Other additions like the #mpa... header line
# was important for use with humann3.1.1.
#
# Damilola R Oresegun (DRO), damioresegun@gmail; [email protected]
# 03/08/2022
####################################################################
####################################################################
#kreport2mpa.py converts a Kraken-style report into mpa (MetaPhlAn) format
#Copyright (C) 2017-2020 Jennifer Lu, [email protected]

#This file is part of KrakenTools.
#KrakenTools is free software; you can redistribute it and/or modify
#it under the terms of the GNU General Public License as published by
#the Free Software Foundation; either version 3 of the license, or
#(at your option) any later version.

#This program is distributed in the hope that it will be useful,
#but WITHOUT ANY WARRANTY; without even the implied warranty of
#MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
#GNU General Public License for more details.

#You should have received a copy of the GNU General Public License
#along with this program; if not, see <http://www.gnu.org/licenses/>.

####################################################################
#Jennifer Lu, [email protected]
#11/06/2017
#Updated: 07/12/2020
#
#This program reads in a Kraken report file and generates
#an mpa-format (MetaPhlAn) style report. Each line represents
#a possible taxon classification. The first column lists the
#domain, kingdom, phyla, etc, leading up to each taxon.
#The levels are separated by the | delimiter, with the type of
#level specified before each name with a single letter and a double
#underscore (k__ for domain or kingdom, p__ for phylum, etc).
#The second column is the taxonomy ID of the taxon and the third
#column is the number of reads (or the percentage of reads, when
#--percentages is given) classified within that taxon's subtree.
#
#Input file:
# - Kraken report file generated from the kraken raw output file
#Input Parameters to Specify [OPTIONAL]:
# - header_line = prints a header line in mpa-report
# [Default: no header]
# - intermediate-ranks = includes non-traditional taxon levels
# (traditional levels: domain, kingdom, phylum, class, order,
# family, genus, species)
# [Default: no intermediate ranks]
#Output file format (tab-delimited)
# - Taxonomy tree levels |-delimited, with level type [d,k,p,c,o,f,g,s,x]
# - Taxonomy ID of the specified level
# - Number of reads within subtree of the specified level
#
#Methods
# - main
# - process_kraken_report
#
import os, sys, argparse

#process_kraken_report
#usage: parses a single line in the kraken report and extracts relevant information
#input: kraken report file with the following tab delimited lines
# - percent of total reads
# - number of reads (including at lower levels)
# - number of reads (only at this level)
# - taxonomy classification of level
# (U, D, P, C, O, F, G, S, -)
# - taxonomy ID (0 = unclassified, 1 = root, 2 = Bacteria,...etc)
# - spaces + name
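#returns (as a list, or an empty list for header lines):
# - classification/genome name (spaces replaced by underscores)
# - level number (number of leading spaces in the name, divided by 2)
# - level name (U, -, D, P, C, O, F, G, S)
# - reads classified at this level and below in the tree
# - percent of total reads
# - taxonomy ID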
def process_kraken_report(curr_str):
    """Parse a single tab-delimited line of a Kraken report.

    Args:
        curr_str: One raw line from the report file.

    Returns:
        An empty list when the line carries no integer read count in its
        second column (header lines, blank lines). Otherwise the list
        ``[name, level_num, level_type, all_reads, percents, taxid]`` where
        ``name`` has its leading spaces removed and remaining spaces replaced
        by underscores, ``level_num`` is the number of leading spaces in the
        name divided by two, ``level_type`` is the fourth column,
        ``all_reads`` is the second column as an int, ``percents`` is the
        first column as a float and ``taxid`` is the fifth column as a string.

    Raises:
        IndexError: if a data line has fewer than five columns.
        ValueError: if the first column of a data line is not a number.
    """
    split_str = curr_str.strip().split('\t')
    # A blank line splits into a single field, so indexing column 1 raises
    # IndexError; treat it like a header line instead of crashing.
    try:
        all_reads = int(split_str[1])
    except (IndexError, ValueError):
        return []
    percents = float(split_str[0])
    level_type = split_str[3]
    taxid = split_str[4]  # DRO: taxid is needed for the metaphlan3 taxid column
    # The depth in the taxonomy tree is encoded as leading spaces of the name
    raw_name = split_str[-1]
    stripped_name = raw_name.lstrip(' ')
    spaces = len(raw_name) - len(stripped_name)
    name = stripped_name.replace(' ', '_')
    # Floor division keeps the level an int on Python 3 as well; a float level
    # from an odd indent could never equal "previous level + 1" in the caller.
    level_num = spaces // 2
    return [name, level_num, level_type, all_reads, percents, taxid]

#Main method
def main():
    """Convert a Kraken report into an mpa (MetaPhlAn 3 style) report.

    Reads the command line, then walks the report given by -r/--report line
    by line and writes one output line per retained taxon to -o/--output:
    the |-delimited lineage, the taxid, and either the read count or the
    percentage. Unclassified ('U') lines are skipped, the first classified
    line only seeds the lineage and is not written, and non-traditional
    ranks are written only when --intermediate-ranks is given.
    """
    #Parse arguments
    parser = argparse.ArgumentParser()
    parser.add_argument('-r', '--report-file', '--report', required=True,
        dest='r_file', help='Input kraken report file for converting')
    parser.add_argument('-o', '--output', required=True,
        dest='o_file', help='Output mpa-report file name')
    parser.add_argument('--display-header', action='store_true',
        dest='add_header', default=False, required=False,
        help='Include header [Kraken report filename] in mpa-report file [default: no header]')
    parser.add_argument('--read_count', action='store_true',
        dest='use_reads', default=True, required=False,
        help='Use read count for output [default]')
    parser.add_argument('--percentages', action='store_false',
        dest='use_reads', default=True, required=False,
        help='Use percentages for output [instead of reads]')
    parser.add_argument('--intermediate-ranks', action='store_true',
        dest='x_include', default=False, required=False,
        help='Include non-traditional taxonomic ranks in output')
    parser.add_argument('--no-intermediate-ranks', action='store_false',
        dest='x_include', default=False, required=False,
        help='Do not include non-traditional taxonomic ranks in output [default]')
    parser.add_argument('-hm','--humann_mode', action='store_true',
        dest='HUMAnN_mode', default=False, required=False,
        help='Tune the output for HUMAnN v3 use. Adds some additional lines to the top of the output table')
    args = parser.parse_args()

    main_lvls = ['R','K','D','P','C','O','F','G','S']
    #Lineage (as prefixed level strings) of the most recently processed taxon
    curr_path = []
    prev_lvl_num = -1
    #Context managers close both files even if a malformed line raises
    with open(args.r_file, 'r') as r_file, open(args.o_file, 'w') as o_file:
        _write_mpa_header(o_file, args)
        #Read through report file
        for line in r_file:
            report_vals = process_kraken_report(line)
            #If header line, skip
            if len(report_vals) < 5:
                continue
            #Get relevant information from the line
            name, level_num, level_type, all_reads, percents, taxid = report_vals
            if level_type == 'U':
                continue
            #Create level name: both kingdom and domain are written as k__
            if level_type not in main_lvls:
                level_type = "x"
            elif level_type in ("K", "D"):
                level_type = "k"
            level_str = level_type.lower() + "__" + name
            if prev_lvl_num == -1:
                #First level: only seeds the lineage, nothing is written
                prev_lvl_num = level_num
                curr_path.append(level_str)
                continue
            #Move back up the lineage until the parent of this level is last
            while level_num != (prev_lvl_num + 1):
                prev_lvl_num -= 1
                curr_path.pop()
            #Intermediate ranks are written only when requested
            if level_type != "x" or args.x_include:
                #Ancestors followed by |, never the root and only the
                #intermediate ranks that were asked for
                ancestors = "".join(
                    string + "|" for string in curr_path
                    if (string[0] != "x" or args.x_include) and string[0] != "r")
                o_file.write(ancestors)
                #Final level, taxid (added by DRO), then reads or percentage
                value = all_reads if args.use_reads else percents
                o_file.write(level_str + "\t" + str(taxid) + "\t" + str(value) + "\n")
            #Update
            curr_path.append(level_str)
            prev_lvl_num = level_num


def _write_mpa_header(o_file, args):
    """Write the optional header lines of the mpa report to o_file.

    With --humann_mode the metaphlan3 bugs list preamble is written (database
    line, sample line and column names); with --display-header the
    '#Classification' line naming the input report is written between the
    database line and the sample line.
    """
    if args.HUMAnN_mode:
        o_file.write("#mpa_v31_CHOCOPhlAn_201901" + "\n")
    if args.add_header:
        o_file.write("#Classification\t" + os.path.basename(args.r_file) + "\n")
    if args.HUMAnN_mode:
        o_file.write("#SampleID\tMetaphlan_Analysis" + "\n")
        value_column = "read_count" if args.use_reads else "relative_abundance"
        o_file.write("#clade_name\tNCBI_tax_id\t" + value_column + "\tadditional_species" + "\n")

# Run the conversion only when executed as a script, not when imported
if __name__ == "__main__":
    main()