From 9e73a12d4b4fc5493cd3e6dade24ed7d3362ae0b Mon Sep 17 00:00:00 2001 From: Simon Ye Date: Tue, 11 Aug 2015 18:59:27 -0400 Subject: [PATCH] Fix off by one error in splitrim --- src/splitrim.d | 245 ++++++++++++++++++++++++------------------------- 1 file changed, 120 insertions(+), 125 deletions(-) diff --git a/src/splitrim.d b/src/splitrim.d index 2ef8f12..73efabf 100644 --- a/src/splitrim.d +++ b/src/splitrim.d @@ -1,11 +1,11 @@ -/* +/* Last working version: v1j3e5 BUGS: (1) readCounter in trimEntry() is not working (see first 2 fragments in: /home/traceyf/chienchi/working/largeTestset_noEmptyLines.fastq) - // TO DO: + // TO DO: (1) Read in up to 16 entries (64 lines) or more per thread, and THEN process, rather than one entry (4 lines) at a time. (2) Stagger the writes for each thread so they don't naturally overlap (3) Use lustre's "lctl" command to spread a FASTQ file across X inodes @@ -19,28 +19,28 @@ Fixed ASCII transcoding error where an extra element in the array was output along with the quality line 2013-03-07: - Added fixed-length read trimming (after quality trimming) with the "--recycle" option. + Added fixed-length read trimming (after quality trimming) with the "--recycle" option. 2013-03-19: Fixed an off-by-one error in the rawSeqNum count displayed in the stats file. - - - Example (50-bp read): + + + Example (50-bp read): Original read =================================N================ 50-bp Splitrimmed read (N's) ==================33=============|=======16======= 33-bp, 16-bp Splitrimmed read (Quals) ==============28============|==4=|=======16======= 28-bp, 4-bp, 16-bp - + ======== Option 1: minL=10 (keeps 2 frags) ==============28============ =======16======= 28-bp, 16-bp ======== (keeps 2 frags, 44-bp) - + ======== Option 2: fixL=10 ====10==== ====10==== 10-bp, 10-bp, 10-bp ======== ====10==== XXXXXX (keeps 3 frags, 30-bp) XXXXXXXX - - ======== + + ======== Option 3: fixL=10 --recycle ====10==== =======16======= 10-bp, 18-bp, 16-bp ======== ========18======== (keeps 3 frags, 44-bp) @@ -51,14 +51,14 @@ The trim START:STOP positions were moved from the end of the header, as in **** @HISEQ:148:C0F5FACXX:1:1307:2735:88518 1:N:0:CGATGT 1:30 - + and appended to the header name just before the first whitespace so that they will show up in SAM files, as in **** @HISEQ:148:C0F5FACXX:1:1307:2735:88518:1:30 1:N:0:CGATGT This will facilitate tracking of (1) the hit count in terms of the actual full-length reads, and (2) the average fraction of a read that does map. The latter would be calculated as follows: - + /////////////////////////////////////////////////////////////////////////////////////////////////// // // // Average Frac. of Read Mapping = { SUM_i=1|N [(# bases mapped)i/(# bases available)i] } / N // @@ -71,10 +71,10 @@ // (# bases available)i = no. of bases in i that passed the trimming step // // // /////////////////////////////////////////////////////////////////////////////////////////////////// - + 2013-04-04: - Reverted the positions of the trim START:STOP positions to the very end of the header line. - + Reverted the positions of the trim START:STOP positions to the very end of the header line. + All FASTQ reads are converted to the OLD Illumina format, where "/1" or "/2" is appended to the header line just before the first whitespace. Since only single-end reads are considered here, only "/1" is used. -> Uniqueness of each fragment derived from a single, full-length read, is enforced by prepending a "_" and @@ -83,7 +83,7 @@ @HISEQ:148:C0F5FACXX:1:1307:2735:88518:1:30 1:N:0:CGATGT to @HISEQ:148:C0F5FACXX:1:1307:2735:88518 1:N:0:CGATGT 1:30 to @HISEQ:148:C0F5FACXX:1:1307:2735:88518/1 1:N:0:CGATGT 1:30 - + But in a SAM file, the first whitespace-delimited header string is indistinguishable from @HISEQ:148:C0F5FACXX:1:1307:2735:88518/1 1:N:0:CGATGT 31:60 @@ -92,7 +92,7 @@ @HISEQ:148:C0F5FACXX:1:1307:2735:88518_0/1 1:N:0:CGATGT 1:30 @HISEQ:148:C0F5FACXX:1:1307:2735:88518_1/1 1:N:0:CGATGT 31:60 - + TO DO: ===== @@ -138,7 +138,7 @@ struct inputOptions { ushort userMinL; // min. length filter ushort userFixL; // fixed length filter bool recycle; // true or false - uint threads; + uint threads; ushort baseLength; // the non-zero contents of either userMinL or userFixL ubyte userMinQ; // min. quality fileter ushort userMaxN; // min. count filter @@ -230,29 +230,29 @@ void verifyFastqFormat(std.stdio.File IN, ref bool newIlluminaFormat) { if(line[0] == '@') { // valid header line auto m = match(line, ctr); // Old Illumina formats terminate their string header = m.captures[1]; // headers with either a /1 or /2 for - // paired-end reads, while single-end + // paired-end reads, while single-end // reads terminate with /1 if(header == "") { // NEW Illumina format newIlluminaFormat = true; } } else { // invalid - writeln("*FATAL*: Unrecognized FASTQ format!"); + writeln("*FATAL*: Unrecognized FASTQ format!"); exit(1); } - + } if(lineCount == 2) { // 2nd valid line better be DNA! foreach(base; line) { //writeln("Base: ", base); if(base !in stdNucleotides) { - writeln("*FATAL*: Unrecognized FASTQ format!"); + writeln("*FATAL*: Unrecognized FASTQ format!"); exit(1); } } } if((lineCount == 3) && (line[0] != '+')) { // 3rd valid line should start with "+" - writeln("*FATAL*: Unrecognized FASTQ format!"); + writeln("*FATAL*: Unrecognized FASTQ format!"); exit(1); } if(lineCount == 4) { // 4th line is the quality line @@ -278,7 +278,7 @@ bool calcStats(ref seqLenStats myStats, ref ulong[ushort] trimSeqLenHash) { //foreach(readLen; sort!((a,b) {return a > b; }) (trimSeqLenHash.keys)) { // writeln(readLen," => ", trimSeqLenHash[readLen]); //} - + //auto max = trimSeqLenArray[$-1]; //auto min = trimSeqLenArray[0]; @@ -287,18 +287,18 @@ bool calcStats(ref seqLenStats myStats, ref ulong[ushort] trimSeqLenHash) { // median = trimSeqLenArray[cast(ulong)(trimSeqLenArray.length/2)]; //} //else { - // median = (trimSeqLenArray[trimSeqLenArray.length/2] + // median = (trimSeqLenArray[trimSeqLenArray.length/2] // + trimSeqLenArray[trimSeqLenArray.length/2 - 1]) / 2; //} // Step1: Find mean ulong total1 = 0; ulong numElems = 0; - foreach(ushort len; trimSeqLenHash.keys) { + foreach(ushort len; trimSeqLenHash.keys) { total1 += len*trimSeqLenHash[len]; numElems += trimSeqLenHash[len]; } - + // EDIT //double mean1 = total1/trimSeqLenArray.length; double mean1 = total1/numElems; @@ -337,7 +337,7 @@ void processOptions1(ref inputOptions inOpts) { writeln("*FATAL*: Input FASTQ file \"",inOpts.inFastq,"\" does not exist!"); exit(0); } - + // Input file is a file? if(!inOpts.inFastq.isFile) { writeln("*FATAL*: Input file \"",inOpts.inFastq,"\" is not a file"); @@ -367,7 +367,7 @@ void processOptions1(ref inputOptions inOpts) { if(inOpts.verbose) writeln("successfully created."); } - + if(!inOpts.outPath.isDir) { writeln("*FATAL*: Output path \"",inOpts.outPath,"\" is not a directory."); exit(0); @@ -382,12 +382,12 @@ void processOptions1(ref inputOptions inOpts) { writeln("Please specify only one of \"--minL\" or \"--fixL\". Abort."); exit(0); } - + if(inOpts.threads < 1) { writeln("*FATAL*: Thread count [",inOpts.threads,"] must be >= 1. Abort."); exit(0); } - + } /**********************************************************************/ void processOptions2(inputOptions inOpts) { @@ -400,45 +400,45 @@ void processOptions2(inputOptions inOpts) { enforce((inOpts.sortLenAsc & inOpts.sortLenDesc) == false, "Cannot specify both --sortLenAsc and --sortLenDesc"); enforce(((inOpts.minAveQ > 0) && (inOpts.minAveQ <= qBest)), "Minimum average read quality is out of range"); if( - (inOpts.userMinL > 0) - && (inOpts.userFixL == 0) + (inOpts.userMinL > 0) + && (inOpts.userFixL == 0) && (inOpts.recycle) - ) + ) writeln("Note: Option \"--recycle\" is meaningless when \"--minL\" is specified. Use \"--fixL\" instead. Ignoring...\n"); } /**********************************************************************/ -void initFiles(ref inputOptions inOpts, ref std.stdio.File IN, - ref std.stdio.File OUT, ref std.stdio.File STATS, +void initFiles(ref inputOptions inOpts, ref std.stdio.File IN, + ref std.stdio.File OUT, ref std.stdio.File STATS, ref std.stdio.File HISTO) { debug writeln("outStatsFilename=",inOpts.outStatsFilename); debug writeln("outHistoFilename=",inOpts.outHistoFilename); - IN = std.stdio.File(inOpts.inFastq, "r"); + IN = std.stdio.File(inOpts.inFastq, "r"); OUT = std.stdio.File(inOpts.outFastq, "w"); STATS = std.stdio.File(inOpts.outStatsFilename, "w"); HISTO = std.stdio.File(inOpts.outHistoFilename, "w"); //GC = std.stdio.File(inOpts.outGCFilename, "w"); - + } /**********************************************************************/ void verifyAsciiEncoding(ref inputOptions inOpts, string q) { - + // swap encoding if wrong encoding bool wrongEncoding = false; QUAL: foreach(qual; q) { - + //write(qual,"=",cast(int)qual," - 64 =",cast(int)qual-64,"/ "); - + // True encoding is 33 if this is negative if((cast(int)qual - 64) < 0) { // It's 33, but given encoding is 64; swap - if (inOpts.asciiEncoding == 64) { + if (inOpts.asciiEncoding == 64) { debug writeln("Wrong encoding (64). Switching to 33!"); - wrongEncoding = true; + wrongEncoding = true; inOpts.asciiEncoding = 33; - break QUAL; + break QUAL; } else { debug writeln("Encoding is 33..."); @@ -458,10 +458,10 @@ void verifyAsciiEncoding(ref inputOptions inOpts, string q) { break QUAL; } } - + } - - if((inOpts.asciiEncoding == 33) && (inOpts.outEncoding == 64)) + + if((inOpts.asciiEncoding == 33) && (inOpts.outEncoding == 64)) inOpts.outOffset = (64-33); else if((inOpts.asciiEncoding == 64) && (inOpts.outEncoding == 33)) inOpts.outOffset = -(64-33); @@ -476,19 +476,19 @@ void writeHistoFile(std.stdio.File HISTO, in inputOptions inOpts) { if(inOpts.sortLenAsc) { // sort ascending foreach(len; sort!((a,b) { return b > a; }) (inOpts.trimSeqLenHash.keys)) - HISTO.writeln(len,"\t",inOpts.trimSeqLenHash[len]); + HISTO.writeln(len,"\t",inOpts.trimSeqLenHash[len]); return; } if(inOpts.sortLenDesc) { // sort descending foreach(len; sort!((a,b) { return a > b; }) (inOpts.trimSeqLenHash.keys)) - HISTO.writeln(len,"\t",inOpts.trimSeqLenHash[len]); + HISTO.writeln(len,"\t",inOpts.trimSeqLenHash[len]); return; } - + // unsorted - foreach(len, freq; inOpts.trimSeqLenHash) + foreach(len, freq; inOpts.trimSeqLenHash) HISTO.writeln(len,"\t",freq); } @@ -512,7 +512,7 @@ void writeStatsFile(std.stdio.File STATS, in inputOptions inOpts, STATS.write (" # of Bases:\t",inOpts.totalRawSeqLen,"\t",inOpts.totalTrimSeqLen,"\t("); STATS.writef ("%.2f", cast(double)inOpts.totalTrimSeqLen/inOpts.totalRawSeqLen*100); STATS.writeln(" %)"); - + STATS.write ("Mean Read Length:\t"); double rawMeanReadLen = cast(double) inOpts.totalRawSeqLen/inOpts.rawSeqNum; STATS.writef ("%.2f", rawMeanReadLen); @@ -544,7 +544,7 @@ void writeStatsFile(std.stdio.File STATS, in inputOptions inOpts, STATS.write( "Total bases:\t",totalTrimSeqLen,"\t("); STATS.writef( "%.2f", cast(float)totalTrimSeqLen/totalRawSeqLen*100); STATS.writeln( "%)"); - + STATS.write( "Mean Read Length:\t"); if(trimSeqNum > 0) { STATS.writef( "%.2f", myStats.trimAvg); @@ -554,7 +554,7 @@ void writeStatsFile(std.stdio.File STATS, in inputOptions inOpts, else { STATS.writeln("0.00"); } - + // Edit: STATS.writeln(); STATS.writeln("Total Trim Time (ms):\t", programTime); @@ -562,18 +562,18 @@ void writeStatsFile(std.stdio.File STATS, in inputOptions inOpts, /* Perl Equiv: - + if (@paired_files){ printf $fh (" Paired Reads #:\t\%d (%.2f %%)\n",$paired_seq_num, $paired_seq_num/$trimmed_num*100); printf $fh (" Paired total bases:\t\%d (%.2f %%)\n",$total_paired_bases,$total_paired_bases/$total_trimmed_seq_len*100); printf $fh (" Unpaired Reads #:\t\%d (%.2f %%)\n", $trimmed_num - $paired_seq_num, ($trimmed_num - $paired_seq_num)/$trimmed_num*100); printf $fh (" Unpaired total bases:\t\%d (%.2f %%)\n", $total_trimmed_seq_len - $total_paired_bases , ($total_trimmed_seq_len - $total_paired_bases)/$total_trimmed_seq_len*100); } - + printf $fh ("\nDiscarded reads #:\t\%d (%.2f %%)\n", $total_num - $trimmed_num , ($total_num - $trimmed_num)/$total_num*100); printf $fh ("Trimmed bases:\t\%d (%.2f %%)\n", $total_raw_seq_len - $total_trimmed_seq_len, ($total_raw_seq_len - $total_trimmed_seq_len)/$total_raw_seq_len*100); -*/ - +*/ + } /******************************************************************************/ //void splitFilter(in string oligo, in ushort[] quals, std.stdio.File OUT) { @@ -591,9 +591,9 @@ void splitFilter(in string oligo, in ushort[] quals) { while((quals[i] >= minQ) && (i < oligo.length)) { i++; } - + stopPos = i; - + // Is fragment of sufficient length to attempt to split on N's? if((stopPos-startPos) >= minL) { // Split on N's: Save those that are of sufficent length @@ -609,9 +609,9 @@ void splitFilter(in string oligo, in ushort[] quals) { */ /******************************************************************************/ void parseFASTQ(in ulong idx, in ulong offsetStart, ref inputOptions inOpts, in ulong stagger, std.stdio.File OUT) { //in string infilename, in ulong fileSize) { - + std.stdio.File FASTQ = std.stdio.File(inOpts.inFastq, "r"); - + // Determine offset of last byte to read in ulong chunkEnd = (idx == inOpts.partitionOffsets.length-1) ? (inOpts.inFastqFileSize) @@ -634,7 +634,7 @@ void parseFASTQ(in ulong idx, in ulong offsetStart, ref inputOptions inOpts, in ulong localRawSeqCounter = 0; string trimmedFqStack; - // Staggering the flushing of trimmed reads to disk so as to prevent direct + // Staggering the flushing of trimmed reads to disk so as to prevent direct // competition between threads for the single filehandle. do { h1 = FASTQ.readln().chomp(); @@ -643,31 +643,31 @@ void parseFASTQ(in ulong idx, in ulong offsetStart, ref inputOptions inOpts, in localTotalRawSeqLen += s.length; h2 = FASTQ.readln(); q = FASTQ.readln().chomp(); - + if(s.length < inOpts.baseLength) continue; - + trimEntry(inOpts, h1, s, h2, q, localTrimSeqNum, localTotalTrimSeqLen, localTrimSeqLenHash, localTrashedBases, trimmedFqStack); // <-------------------- HERE! - } while ((localRawSeqNum < stagger) && (FASTQ.tell() < chunkEnd)); - + } while ((localRawSeqNum < stagger) && (FASTQ.tell() < chunkEnd)); + writeln("Staggering at ",stagger," reads; processed ",localRawSeqNum," reads"); flushEntries(localRawSeqCounter, trimmedFqStack, OUT); - /* + /* FIX: A single emtpy line will hoze this entire program. Implement an additional FOR or WHILE loop to screen 4 valid, non-empty lines. */ while (FASTQ.tell() < chunkEnd) { //do { - + ///////////////////////////////// // FASTQ entry every 4 lines ///////////////////////////////// h1 = FASTQ.readln().chomp(); // 1of4: seq header //writeln("HEADER: \"", h1,"\""); - + s = FASTQ.readln().chomp(); // 2of4: sequence ++localRawSeqNum; localTotalRawSeqLen += s.length; // total length of all input sequences @@ -696,7 +696,7 @@ void parseFASTQ(in ulong idx, in ulong offsetStart, ref inputOptions inOpts, in writeln("IDX(",idx,") counted ",localRawSeqNum," reads"); - synchronized { + synchronized { inOpts.rawSeqNum += localRawSeqNum; // declare inOpts.totalRawSeqLen += localTotalRawSeqLen; // declare inOpts.trimSeqNum += localTrimSeqNum; // declare @@ -727,10 +727,10 @@ void flushEntries(ref ulong localRawSeqCounter, ref string trimmedFqStack, std.s } } /******************************************************************************/ -void trimEntry(inputOptions inOpts, - in string h1, in string s, - in string h2, in string q, - ref ulong trimSeqNum, ref ulong totalTrimSeqLen, +void trimEntry(inputOptions inOpts, + in string h1, in string s, + in string h2, in string q, + ref ulong trimSeqNum, ref ulong totalTrimSeqLen, ref ulong[ushort] trimSeqLenHash, ref ulong trashedBases, ref string trimmedFqStack) { @@ -744,19 +744,17 @@ void trimEntry(inputOptions inOpts, //string trimmedEntry; ulong fragCounter = 0; - + // writeln(s.length, ' ', q.length); while (i < s.length) { - - // Decode quality and check its validity - if((cast(ushort)q[i] - inOpts.asciiEncoding) >= inOpts.userMinQ) { + // Decode quality and check its validity + if((i < s.length - 1) && (cast(ushort)q[i] - inOpts.asciiEncoding) >= inOpts.userMinQ) { qualStart = i++; // LOOP: Decode quality and check its validity - while(((cast(int)q[i] - inOpts.asciiEncoding) >= inOpts.userMinQ) && (i < s.length)) { - i++; + while((cast(int)q[i] - inOpts.asciiEncoding) >= inOpts.userMinQ && (i < s.length - 1) && i++) { } - + qualStop = i; // Is fragment of sufficient length to attempt to split on N's? @@ -779,9 +777,9 @@ void trimEntry(inputOptions inOpts, //writeln("---------> OUTSIDE LOOP1"); - // Ensure ATGC-stretch is of sufficient length + // Ensure ATGC-stretch is of sufficient length if(fragLen >= inOpts.baseLength) { - + //writeln("---------> INSIDE LOOP1"); //////////////////////////// @@ -802,11 +800,11 @@ void trimEntry(inputOptions inOpts, // OPT: Fix-trim read if(inOpts.userFixL) { - + //writeln("---------> INSIDE fixL LOOP"); debug writeln("FRAG: ", s[nStart..nStop]); - + float div = cast(float) fragLen / inOpts.userFixL; // No. of cycles (real) ushort rounds = cast(ushort) div; // No. of cycles (int) @@ -845,7 +843,7 @@ void trimEntry(inputOptions inOpts, trimmedFqStack ~= "\n"; //writeln(); } - + ++fragCounter; //writeln("....... AFTER1: ",fragCounter, "......."); @@ -860,9 +858,9 @@ void trimEntry(inputOptions inOpts, //writeln("---------> OUTSIDE recycle LOOP"); - + /////////////////////////////////////////////////// - // OPTIONAL: Process last frag containing overhang + // OPTIONAL: Process last frag containing overhang /////////////////////////////////////////////////// ushort overhang = cast(ushort) ((div - cast(float)rounds) * cast(float)inOpts.userFixL); if(inOpts.recycle) { @@ -945,7 +943,7 @@ void trimEntry(inputOptions inOpts, totalTrimSeqLen += (fragLen); trimSeqLenHash[cast(ushort)(fragLen)]++; } - + } // if(fragLen >= inOpts.baseLength) } n++; @@ -970,7 +968,7 @@ ulong getFastqEntryOffset (std.stdio.File FASTQ, in ulong offset) { bool entry_found = false; bool validated_entry = false; ulong validated_offset = 0; - + // Max no. of lines to read equals 7. // LINE1: fragment of FASTQ header <--- absence of ^@ means invalid start site // LINE2: ^(ATGC)+$ <--- absence of ^@ means invalid start site @@ -979,11 +977,11 @@ ulong getFastqEntryOffset (std.stdio.File FASTQ, in ulong offset) { // LINE5: ^@(\S+)$ <--- entry_found = 1, entry_offset = FASTQ.tell() // LINE6: ^(ATGC)+$ <--- absence of ^@ means invalid start site // LINE7: ^+(.*)$ <--- validates entry start site above - - // If there happens to be two ^@ in a row, then the only valid situation + + // If there happens to be two ^@ in a row, then the only valid situation // this could happen in is if the first ^@ belongs to the quality encoding line. // That means the second ^@ may be the official start. - // + // // LINE1: fragment of FASTQ header <--- absence of ^@ means invalid start site // LINE2: ^(ATGC)+$ <--- absence of ^@ means invalid start site // LINE3: ^+(.*)$ <--- absence of ^@ means invalid start site @@ -1006,7 +1004,7 @@ ulong getFastqEntryOffset (std.stdio.File FASTQ, in ulong offset) { debug writeln("----> entry found at offset ",entry_offset); continue OUTER; } - + if(entry_found && buf[0] == '+') { validated_entry = true; validated_offset = entry_offset; @@ -1036,7 +1034,7 @@ void profileFastqFileOffsetVitals (std.stdio.File FASTQ, ref inputOptions inOpts // Estimate size of a single entry (4 lines) from the first entry char[] buf; ulong bufLen; - + inOpts.inFastqFileSize = getSize(inOpts.inFastq); foreach(i; 0..4) { @@ -1061,7 +1059,7 @@ void profileFastqFileOffsetVitals (std.stdio.File FASTQ, ref inputOptions inOpts writeln("# Entries/thread = ", inOpts.fastqEntriesPerThread," (approximate)"); } - // Given the average position of each potential FASTQ entry offset as a + // Given the average position of each potential FASTQ entry offset as a // starting point, we extract the exact position of the nearest FASTQ entry // and specify its end offset as well. foreach (t; 0..inOpts.threads) { @@ -1074,7 +1072,7 @@ void profileFastqFileOffsetVitals (std.stdio.File FASTQ, ref inputOptions inOpts = start + entriesPerThread*bufLen - 1 */ ulong seekStart = (inOpts.fastqEntriesPerThread)*(t)*(inOpts.fastqEntrySize); - ulong seekEnd = (t == inOpts.threads-1) + ulong seekEnd = (t == inOpts.threads-1) ? (inOpts.inFastqFileSize) : seekStart + (inOpts.fastqEntriesPerThread)*(inOpts.fastqEntrySize) - 1; @@ -1085,21 +1083,21 @@ void profileFastqFileOffsetVitals (std.stdio.File FASTQ, ref inputOptions inOpts inOpts.partitionOffsets ~= nextOffset; // Add next valid offset ++inOpts.effThreadCount; - + debug { writeln("----> chunk begins at byte offset ", nextOffset); writeln(); } - + } - // Output effective thread count (since FASTQ file might not be partitionable + // Output effective thread count (since FASTQ file might not be partitionable // into the provided number of threads writeln("Threads: ",inOpts.effThreadCount," (effective)\t",inOpts.threads," (requested)"); // Last entry will never be the total file size, since that means no entries // could follow. - debug { + debug { foreach(b, ulong borderStart; inOpts.partitionOffsets) { ulong chunkEnd = (b == inOpts.partitionOffsets.length-1) ? (inOpts.inFastqFileSize) @@ -1132,7 +1130,7 @@ ulong determineFlushLimit(in uint effThreadCount) { /******************************************************************************/ /******************************************************************************/ void main(string[] args) { - + StopWatch swProgram; swProgram.start(); @@ -1166,7 +1164,7 @@ void main(string[] args) { string progName = "splitrim"; string progVersion = "0.1j3e6"; bool showVersion = false; - + getopt( args, std.getopt.config.caseSensitive, @@ -1193,7 +1191,7 @@ void main(string[] args) { ); //////////////////////////////////////////// - // Retain all user input parameters + // Retain all user input parameters //////////////////////////////////////////// inOpts.progName = progName; //args[0]; inOpts.progVersion = progVersion; @@ -1201,7 +1199,7 @@ void main(string[] args) { inOpts.asciiEncoding = asciiEncoding; inOpts.outPath = outPath; // o_path inOpts.userMinL = userMinL; // opt_min_L - inOpts.userFixL = userFixL; // + inOpts.userFixL = userFixL; // inOpts.recycle = recycle; // inOpts.threads = threads; // inOpts.trashedBases = trashedBases; // No. of bases trashed from using --fixL without --recycle @@ -1221,11 +1219,11 @@ void main(string[] args) { if(showUsage || showHelp || (outPath.length == 0) || (inFastq.length == 0)) usage(); inOpts.inFastq = inFastq; // $_[1] inOpts.statsFilename = statsFilename; - - inOpts.baseLength = (inOpts.userMinL > 0 ) ? inOpts.userMinL + + inOpts.baseLength = (inOpts.userMinL > 0 ) ? inOpts.userMinL : (inOpts.userFixL > 0 ) ? inOpts.userFixL : (0); - + //////////////////////////////////////////// // Enforce some basic input requirements //////////////////////////////////////////// @@ -1248,7 +1246,7 @@ void main(string[] args) { inOpts.outHistoFilename = (histoFilename.length == 0) ? buildPath(inOpts.outPath, inOpts.inFastqBasename~"_splitrim.histo.txt") : buildPath(inOpts.outPath, histoFilename); - + // =========================== // FASTRIM EDITED (2012-11-09): // =========================== @@ -1294,7 +1292,7 @@ void main(string[] args) { auto workers = new TaskPool; foreach(i, ref offsetStart; workers.parallel(inOpts.partitionOffsets)) { parseFASTQ(i, offsetStart, inOpts, staggeredFlushCount[i], OUT); //inOpts.inFastq, inOpts.inFastqFileSize); - } + } workers.finish(); swTrim.stop(); writeln("GLOBAL READ COUNT = ", inOpts.rawSeqNum); @@ -1304,7 +1302,7 @@ void main(string[] args) { // Calc read statistics ///////////////////////////////// seqLenStats myStats; - + // EDIT: NEED TO UPDATE FOR trimSeqLenHash COMPATABILITY bool validSeqs = calcStats(myStats, inOpts.trimSeqLenHash); // returns true if calc is OK if(!validSeqs) { @@ -1326,7 +1324,7 @@ void main(string[] args) { writeHistoFile(HISTO, inOpts); swProgram.stop(); - + return; } /***************************************************************************************/ @@ -1337,7 +1335,7 @@ void usage() { writeln; writeln("Usage: ", inOpts.progName, " [INPUT] [OUTPUT] [OTHERS]"); write( - "\n", + "\n", "Input:\n", "==============\n", "REQUIRED:\n", @@ -1387,7 +1385,7 @@ void usage() { ); writeln; exit(0); - + } /******************************************************************************/ @@ -1397,7 +1395,7 @@ void usage() { // Case 4: Catch contiguous 3'-low-quality bases // Case 5: Catch both 5'- and 3'-low quality bases // Case 6: Catch one string of contiguous low-quality bases in the middle - // Case 7: Catch multiple strings of contiguous low-quality bases in the middle + // Case 7: Catch multiple strings of contiguous low-quality bases in the middle // ---------------------------------------------------------------------------- // Case 8: Catch a single, high quality, 5' N // Case 9: Catch contiguous, high quality, 5' N's @@ -1409,8 +1407,8 @@ void usage() { // Case 14: Catch contiguous, low-quality bases at the ends (5' and 3'), along // with contiguous, high-quality strings of N's in the middle // Case 15: Catch contiguous, high-quality N's at the ends (5' and 3'), along - // with contiguous, low-quality bases in the middle. - + // with contiguous, low-quality bases in the middle. + /* GLOBAL READ COUNT = 8966559 @@ -1423,20 +1421,17 @@ PROGRAM ELAPSED TIME: 173512 ms 2 1000000 255656 8966559 16 4 2 25000 170471 8966559 16 4 (used && with readCounter) - 4 25000 182330 8966559 16 4 (used && with readCounter) - + 4 25000 182330 8966559 16 4 (used && with readCounter) + 1 100000 162215 8966559 16 4 (used && with readCounter) 2 100000 173448 4 100000 193537 8 100000 290494 - + 4 100000 204606 (Lock-Free; hitting same file) 4 100000 206233 187 (L-F) 4 100000 199519 181 (L-F; FASTQ.ftell()) - + **larger thread count with lower flush values may increase thread wait times! */ - - -