Skip to content

Commit

Permalink
Introduce -wk option to deal with more divergent genome.
Browse files Browse the repository at this point in the history
  • Loading branch information
mourisl committed Jul 23, 2016
1 parent eba3f79 commit 144602f
Show file tree
Hide file tree
Showing 3 changed files with 21 additions and 4 deletions.
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ Rcorrector can also be applied to other type of sequencing data where the read c
-t INT: number of threads to use (default: 1)
-trim : allow trimming (default: false)
-maxcorK INT: the maximum number of correction within k-bp window (default: 4)
-wk FLOAT: the proportion of kmers that are used to estimate weak kmer count threshold, lower for more divergent genome (default: 0.95)
-ek INT: expected number of kmers; does not affect the correctness of program but affects the memory usage (default: 100000000)
-stdout: output the corrected reads to stdout (default: not used)
-verbose: output some correction information to stdout (default: not used)
Expand Down
15 changes: 12 additions & 3 deletions main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,7 @@ void PrintHelp()
//"\t-all: output all the reads including those unfixable (default: false)\n"
"\t-maxcor INT: the maximum number of correction every 100bp (default: 8)\n"
"\t-maxcorK INT: the maximum number of correction within k-bp window (default: 4)\n"
"\t-wk FLOAT: the proportion of kmers that are used to estimate weak kmer count threshold (default: 0.95)\n"
"\t-stdout: output the corrected sequences to stdout (default: not used)\n"
"\t-verbose: output some correction information to stdout (default: not used)\n"
) ;
Expand Down Expand Up @@ -130,6 +131,7 @@ int main( int argc, char *argv[] )
{
int i, j, k ;
int kmerLength = 23 ;
double errorRateKmerPortion ;
Reads reads ;
Reads pairedReads ;

Expand All @@ -155,6 +157,7 @@ int main( int argc, char *argv[] )
agressiveCorrection = false ;
MAX_FIX_PER_100 = 8 ;
MAX_FIX_PER_K = 4 ;
errorRateKmerPortion = 0.95 ;

summary.totalCorrections = 0 ;
summary.totalReads = 0 ;
Expand Down Expand Up @@ -218,6 +221,11 @@ int main( int argc, char *argv[] )
{
agressiveCorrection = true ;
}*/
else if ( !strcmp( "-wk", argv[i] ) )
{
errorRateKmerPortion = atof( argv[i + 1 ] ) ;
++i ;
}
else if ( !strcmp( "-stdout", argv[i] ) )
{
outputStdout = true ;
Expand Down Expand Up @@ -307,7 +315,7 @@ int main( int argc, char *argv[] )
rewind( fpJellyFishDump ) ;
k = 0 ;
const int rateSize = 100000 ;
double *rates = (double *)malloc( sizeof( double ) * rateSize ) ;
double *rates = (double *)malloc( sizeof( double ) * ( rateSize + 1 ) ) ;
rates[0] = 0 ;
while ( fscanf( fpJellyFishDump, "%s", buffer ) != EOF && k < rateSize )
{
Expand Down Expand Up @@ -343,11 +351,12 @@ int main( int argc, char *argv[] )
++k ;
}
qsort( rates, k, sizeof( rates[0] ), CompDouble ) ;
ERROR_RATE = rates[(int)( k * 0.95 )] ;
rates[k] = rates[k - 1] ;
ERROR_RATE = rates[(int)( k * errorRateKmerPortion )] ;
if ( ERROR_RATE == 0 || k < 100 )
ERROR_RATE = 0.01 ;
free( rates ) ;
fprintf( stderr, "Weak kmer threshold rate: %lf\n", ERROR_RATE ) ;
fprintf( stderr, "Weak kmer threshold rate: %lf (estimated from %.3lf/1 of the chosen kmers)\n", ERROR_RATE, errorRateKmerPortion ) ;
//exit ( 1 ) ;

// Get the bad quality screo
Expand Down
9 changes: 8 additions & 1 deletion run_rcorrector.pl
Original file line number Diff line number Diff line change
Expand Up @@ -24,9 +24,10 @@
"Other parameters:\n".
"\t-k kmer_length (<=32, default: 23)\n".
"\t-od output_file_directory (default: ./)\n".
"\t-t number of threads to use (default: 1)\n".
"\t-t number_of_threads (default: 1)\n".
#"\t-trim allow trimming (default: false)\n".
"\t-maxcorK INT: the maximum number of correction within k-bp window (default: 4)\n".
"\t-wk FLOAT: the proportion of kmers that are used to estimate weak kmer count threshold, lower for more divergent genome (default: 0.95)\n".
"\t-ek expected_number_of_kmers: does not affect the correctness of program but affect the memory usage (default: 100000000)\n".
"\t-stdout: output the corrected reads to stdout (default: not used)\n".
"\t-verbose: output some correction information to stdout (default: not used)\n".
Expand Down Expand Up @@ -191,6 +192,12 @@ sub AddJellyFishReadFile
push @rcorrectorArguments, $ARGV[$i + 1] ;
++$i ;
}
elsif ( $ARGV[$i] eq "-wk" )
{
push @rcorrectorArguments, $ARGV[$i] ;
push @rcorrectorArguments, $ARGV[$i + 1] ;
++$i ;
}
else
{
die "Unknown argument ".$ARGV[$i]."\n" ;
Expand Down

0 comments on commit 144602f

Please sign in to comment.