Skip to content

Commit

Permalink
Merge pull request #3665 from vgteam/long_read_giraffe_select_minimizers
Browse files Browse the repository at this point in the history
Enable capping number of minimizers and filtering of overlapping minimizers
  • Loading branch information
adamnovak authored May 23, 2022
2 parents 2ab8987 + 26f49a7 commit 0477507
Show file tree
Hide file tree
Showing 3 changed files with 74 additions and 5 deletions.
35 changes: 31 additions & 4 deletions src/minimizer_mapper.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2874,6 +2874,11 @@ std::vector<MinimizerMapper::Seed> MinimizerMapper::find_seeds(const std::vector
}
}

// bit vector length of read to check for overlaps
size_t num_minimizers = 0;
size_t read_len = aln.sequence().size();
std::vector<bool> read_bit_vector (read_len, false);

// Select the minimizers we use for seeds.
size_t rejected_count = 0;
std::vector<Seed> seeds;
Expand Down Expand Up @@ -2903,17 +2908,39 @@ std::vector<MinimizerMapper::Seed> MinimizerMapper::find_seeds(const std::vector
// of the selected minimizers is not high enough.
const Minimizer& minimizer = minimizers[i];

// minimizer information
size_t min_start_index = minimizer.forward_offset();
size_t min_len = minimizer.length;
bool overlapping = false;
if (this->exclude_overlapping_min) {
if (read_bit_vector[min_start_index] == true || read_bit_vector[min_start_index + min_len] == true) {
overlapping = true;
}
}

if (minimizer.hits == 0) {
// A minimizer with no hits can't go on.
took_last = false;
// We do not treat it as located for MAPQ capping purposes.
if (this->track_provenance) {
funnel.fail("any-hits", i);
}
} else if (minimizer.hits <= this->hit_cap ||
(run_hits <= this->hard_hit_cap && selected_score + minimizer.score <= target_score) ||
(took_last && i > start)) {

} else if ( // passes reads
((minimizer.hits <= this->hit_cap) ||
(run_hits <= this->hard_hit_cap && selected_score + minimizer.score <= target_score) ||
(took_last && i > start)) &&
(num_minimizers < this->max_unique_min) &&
(overlapping == false)
) {

// set minimizer overlap as a reads
num_minimizers += 1; // tracking number of minimizers selected
if (this->exclude_overlapping_min) {
for (size_t i = min_start_index; i < min_start_index + min_len; i++) {
read_bit_vector[i] = true;
}
}

// We should keep this minimizer instance because it is
// sufficiently rare, or we want it to make target_score, or it is
// the same sequence as the previous minimizer which we also took.
Expand Down
6 changes: 6 additions & 0 deletions src/minimizer_mapper.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,9 @@ class MinimizerMapper : public AlignerClient {
/// of total score
double minimizer_score_fraction = 0.9;

/// Maximum number of distinct minimizers to take
size_t max_unique_min = 500;

///Accept at least this many clusters
size_t min_extensions = 2;

Expand Down Expand Up @@ -146,6 +149,9 @@ class MinimizerMapper : public AlignerClient {
/// If set, log what the mapper is thinking in its mapping of each read.
bool show_work = false;

/// If set, exclude overlapping minimizers
bool exclude_overlapping_min = false;

////How many stdevs from fragment length distr mean do we cluster together?
double paired_distance_stdevs = 2.0;

Expand Down
38 changes: 37 additions & 1 deletion src/subcommand/giraffe_main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -350,12 +350,14 @@ void help_giraffe(char** argv) {
<< " -s, --cluster-score INT only extend clusters if they are within INT of the best score [50]" << endl
<< " -S, --pad-cluster-score INT also extend clusters within INT of above threshold to get a second-best cluster [0]" << endl
<< " -u, --cluster-coverage FLOAT only extend clusters if they are within FLOAT of the best read coverage [0.3]" << endl
<< " -U, --max-min INT use at most INT minimizers [500]" << endl
<< " -v, --extension-score INT only align extensions if their score is within INT of the best score [1]" << endl
<< " -w, --extension-set INT only align extension sets if their score is within INT of the best score [20]" << endl
<< " -O, --no-dp disable all gapped alignment" << endl
<< " -r, --rescue-attempts attempt up to INT rescues per read in a pair [15]" << endl
<< " -A, --rescue-algorithm NAME use algorithm NAME for rescue (none / dozeu / gssw / haplotypes) [dozeu]" << endl
<< " -L, --max-fragment-length INT assume that fragment lengths should be smaller than INT when estimating the fragment length distribution" << endl
<< " --exclude-overlapping-min exclude overlapping minimizers" << endl
<< " --fragment-mean FLOAT force the fragment length distribution to have this mean (requires --fragment-stdev)" << endl
<< " --fragment-stdev FLOAT force the fragment length distribution to have this standard deviation (requires --fragment-mean)" << endl
<< " --paired-distance-limit FLOAT cluster pairs of read using a distance limit FLOAT standard deviations greater than the mean [2.0]" << endl
Expand Down Expand Up @@ -387,6 +389,7 @@ int main_giraffe(int argc, char** argv) {
#define OPT_REF_PATHS 1010
#define OPT_SHOW_WORK 1011
#define OPT_NAMED_COORDINATES 1012
#define OPT_EXCLUDE_OVERLAPPING_MIN 1013


// initialize parameters with their default options
Expand All @@ -399,7 +402,10 @@ int main_giraffe(int argc, char** argv) {
Range<size_t> distance_limit = 200;
Range<size_t> hit_cap = 10, hard_hit_cap = 500;
Range<double> minimizer_score_fraction = 0.9;
Range<size_t> max_unique_min = 500;
bool show_progress = false;
// Should we exclude overlapping minimizers
bool exclude_overlapping_min = false;
// Should we try chaining or just give up if we can't find a full length gapless alignment?
bool do_dp = true;
// What GAM should we realign?
Expand Down Expand Up @@ -469,6 +475,7 @@ int main_giraffe(int argc, char** argv) {
.chain(hard_hit_cap)
.chain(minimizer_score_fraction)
.chain(rescue_attempts)
.chain(max_unique_min)
.chain(max_multimaps)
.chain(max_extensions)
.chain(max_alignments)
Expand Down Expand Up @@ -541,6 +548,8 @@ int main_giraffe(int argc, char** argv) {
{"cluster-score", required_argument, 0, 's'},
{"pad-cluster-score", required_argument, 0, 'S'},
{"cluster-coverage", required_argument, 0, 'u'},
{"max-min", required_argument, 0, 'U'},
{"exclude-overlapping-min", no_argument, 0, OPT_EXCLUDE_OVERLAPPING_MIN},
{"extension-score", required_argument, 0, 'v'},
{"extension-set", required_argument, 0, 'w'},
{"score-fraction", required_argument, 0, 'F'},
Expand All @@ -561,7 +570,7 @@ int main_giraffe(int argc, char** argv) {
};

int option_index = 0;
c = getopt_long (argc, argv, "hZ:x:g:H:m:s:d:pG:f:iM:N:R:o:Pnb:c:C:D:F:e:a:S:u:v:w:Ot:r:A:L:",
c = getopt_long (argc, argv, "hZ:x:g:H:m:s:d:pG:f:iM:N:R:o:Pnb:c:C:D:F:e:a:S:u:U:v:w:Ot:r:A:L:",
long_options, &option_index);


Expand Down Expand Up @@ -856,6 +865,18 @@ int main_giraffe(int argc, char** argv) {
cluster_coverage = score;
}
break;

case 'U':
{
auto maxmin = parse<Range<size_t>>(optarg);
if (maxmin <= 0) {
cerr << "error: [vg giraffe] Maximum unique minimizer (" << maxmin << ") must be a positive integer" << endl;
exit(1);
}
max_unique_min = maxmin;
}
break;

case 'v':
{
auto score = parse<Range<int>>(optarg);
Expand Down Expand Up @@ -907,6 +928,10 @@ int main_giraffe(int argc, char** argv) {
}
break;

case OPT_EXCLUDE_OVERLAPPING_MIN:
exclude_overlapping_min = true;
break;

case OPT_FRAGMENT_MEAN:
forced_mean = true;
fragment_mean = parse<double>(optarg);
Expand Down Expand Up @@ -1204,6 +1229,7 @@ int main_giraffe(int argc, char** argv) {
s << "-a" << max_alignments;
s << "-s" << cluster_score;
s << "-u" << cluster_coverage;
s << "-U" << max_unique_min;
s << "-w" << extension_set;
s << "-v" << extension_score;

Expand Down Expand Up @@ -1243,6 +1269,16 @@ int main_giraffe(int argc, char** argv) {
}
minimizer_mapper.minimizer_score_fraction = minimizer_score_fraction;

if (show_progress) {
cerr << "--max-min " << max_unique_min << endl;
}
minimizer_mapper.max_unique_min = max_unique_min;

if (show_progress) {
cerr << "--exclude-overlapping-min " << endl;
}
minimizer_mapper.exclude_overlapping_min = exclude_overlapping_min;

if (show_progress) {
cerr << "--max-extensions " << max_extensions << endl;
}
Expand Down

1 comment on commit 0477507

@adamnovak
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

vg CI tests complete for merge to master. View the full report here.

16 tests passed, 0 tests failed and 0 tests skipped in 11016 seconds

Please sign in to comment.