From 5ad56584966d23ab5e9a5eb2a0b2d7901a0bb1b8 Mon Sep 17 00:00:00 2001 From: alienzj Date: Tue, 13 Apr 2021 09:11:12 +0800 Subject: [PATCH] gzip support --- README.md | 1 + combine_kreports.py | 8 ++++++-- extract_kraken_reads.py | 25 +++++++++++++++++++------ 3 files changed, 26 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index 35ab6a6..60cefd6 100644 --- a/README.md +++ b/README.md @@ -55,6 +55,7 @@ taxonomy ID. Additional options are specified below. Optional: * `-o2, --output2 OUTPUT.FASTA.............`second output FASTA/Q file with extracted seqs (for paired reads) * `--fastq-output..........................`Instead of producing FASTA files, print FASTQ files (requires FASTQ input) +* `--gzip-output...........................`Write gzip-compressed output file(s) * `--exclude...............................`Instead of finding reads matching specified taxids, finds reads NOT matching specified taxids. * `-r, --report MYFILE.KREPORT.............`Kraken report file (required if specifying --include-children or --include-parents) * `--include-children......................`include reads classified at more specific levels than specified taxonomy ID levels. 
diff --git a/combine_kreports.py b/combine_kreports.py index 41d74ae..97bd6eb 100755 --- a/combine_kreports.py +++ b/combine_kreports.py @@ -54,6 +54,7 @@ # - main # - process_kraken_report #################################################################### +import gzip import os, sys, argparse import operator from time import gmtime @@ -197,8 +198,11 @@ def main(): sys.stdout.write("\r\t%i/%i samples processed" % (count_samples, num_samples)) sys.stdout.flush() id2files[count_samples] = r_file - #Open File - curr_file = open(r_file,'r') + #Open File + if r_file.endswith(".gz"): + curr_file = gzip.open(r_file,'rt') + else: + curr_file = open(r_file, 'r') for line in curr_file: report_vals = process_kraken_report(line) if len(report_vals) < 5: diff --git a/extract_kraken_reads.py b/extract_kraken_reads.py index d499fd7..8274d18 100755 --- a/extract_kraken_reads.py +++ b/extract_kraken_reads.py @@ -172,6 +172,8 @@ def main(): parser.add_argument('--fastq-output', dest='fastq_out', required=False, action='store_true',default=False, help='Print output FASTQ reads [requires input FASTQ, default: output is FASTA]') + parser.add_argument('--gzip-output', dest='gzip_output', required=False, + action='store_true', default=False, help='Write gzip-compressed output file(s) [default: uncompressed output]') parser.set_defaults(append=False) args=parser.parse_args() @@ -343,13 +345,24 @@ def main(): sys.stdout.flush() #Open output file if (args.append): - o_file = open(args.output_file, 'a') - if args.output_file2 != '': - o_file2 = open(args.output_file2, 'a') + if not args.gzip_output: + o_file = open(args.output_file, 'a') + if args.output_file2 != '': + o_file2 = open(args.output_file2, 'a') + else: #gzip append must use mode 'at'; 'wta' requests write and append together and raises ValueError + o_file = gzip.open(args.output_file, 'at') + if args.output_file2 != '': + o_file2 = gzip.open(args.output_file2, 'at') else: - o_file = open(args.output_file, 'w') - if args.output_file2 != '': - o_file2 = open(args.output_file2, 'w') + if not args.gzip_output: + o_file = open(args.output_file, 'w') + if args.output_file2 != '': + o_file2 = 
open(args.output_file2, 'w') + else: + o_file = gzip.open(args.output_file, 'wt') + if args.output_file2 != '': + o_file2 = gzip.open(args.output_file2, 'wt') + #Process SEQUENCE 1 file count_seqs = 0 count_output = 0