Skip to content

Commit

Permalink
Added data structures to store/extract the set of barcodes for k-mers…
Browse files Browse the repository at this point in the history
… in the Ref() and Graph() classes; Output to VCF the list of barcodes (BX) supporting ref and alt alleles; Coverage values moved to unsigned short; Updated version number to 1.1.0; update README
  • Loading branch information
gnarzisi committed Oct 18, 2019
1 parent 5afd428 commit f30682f
Show file tree
Hide file tree
Showing 18 changed files with 400 additions and 288 deletions.
16 changes: 14 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,16 @@ done
```
The previous command shows an exemplary submission of multiple parallel lancet jobs, one for each human chromosome, to the Sun Grid Engine queuing system.

### Linked-Reads analysis

The recommended command line options for 10x Genomics linked-reads analysis are:

~~~
lancet **--linked-reads** **--primary-alignment-only** --tumor T.bam --normal N.bam --ref ref.fa --reg chr1 --num-threads 8 > out.vcf
~~~

[LongRanger](https://support.10xgenomics.com/genome-exome/software/pipelines/latest/what-is-long-ranger) BAMs are directy supported, however, for improved accuarcy, we highly recommend to apply to the BAMs the [MarkDuplicates](https://broadinstitute.github.io/picard/command-line-overview.html#MarkDuplicates) program from [Picard Tools](https://broadinstitute.github.io/picard/), which marks PCR duplicates more accurately than LongRanger.

### Output

Lancet generates in output the list of variants in VCF format (v4.1). All variants (SNVs and indels either shared, specific to the tumor, or specific to the normal) are exported in output. Following VCF conventions, high quality variants are flagged as **PASS** in the FILTER column. For non-PASS variants the FILTER info reports the list of filters that are not satisfied by each variant.
Expand Down Expand Up @@ -143,7 +153,7 @@ The final graph (after compression) containing one single variant is depicted be
_____|\__,_|_| _|\___|\___|\__|
Program: lancet (micro-assembly somatic variant caller)
Version: 1.0.7, July 16 2018
Version: 1.1.1, October 18 2019
Contact: Giuseppe Narzisi <[email protected]>
Usage: lancet [options] --tumor <BAM file> --normal <BAM file> --ref <FASTA file> --reg <chr:start-end>
Expand All @@ -163,7 +173,7 @@ Optional
--min-base-qual, -C <int> : minimum base quality required to consider a base for SNV calling [default: 17]
--quality-range, -Q <char> : quality value range [default: !]
--min-map-qual, -b <int> : minimum read mapping quality in Phred-scale [default: 15]
--max-as-xs-diff, -Z <int> : maximum different between AS and XS alignments scores [default: 5]
--max-as-xs-diff, -Z <int> : maximum difference between AS and XS alignments scores [default: 5]
--tip-len, -l <int> : max tip length [default: 11]
--cov-thr, -c <int> : min coverage threshold used to select reference anchors from the De Bruijn graph [default: 5]
--cov-ratio, -x <float> : minimum coverage ratio used to remove nodes from the De Bruijn graph [default: 0.01]
Expand Down Expand Up @@ -197,7 +207,9 @@ Short Tandem Repeat parameters
--dist-from-str, -D <int> : distance (in bp) of variant from STR locus [default: 1]
Flags
--linked-reads, -J : linked-reads analysis mode
--primary-alignment-only, -I : only use primary alignments for variant calling
--XA-tag-filter, -O : skip reads with multiple hits listed in the XA tag (BWA only)
--active-region-off, -W : turn off active region module
--kmer-recovery, -R : turn on k-mer recovery (experimental)
--print-graph, -A : print graph (in .dot format) after every stage
Expand Down
228 changes: 129 additions & 99 deletions src/Graph.cc

Large diffs are not rendered by default.

6 changes: 6 additions & 0 deletions src/Graph.hh
Original file line number Diff line number Diff line change
Expand Up @@ -131,6 +131,9 @@ public:

VariantDB_t *vDB; // DB of variants
Filters * filters; // filter thresholds

unordered_map<Mer_t,set<string>> bx_table_tmr; // mer to barcode map for tumor
unordered_map<Mer_t,set<string>> bx_table_nml; // mer to barcode map for normal

Graph_t() : ref_m(NULL), is_ref_added(0), readCycles(0) {
clear(true);
Expand Down Expand Up @@ -205,6 +208,9 @@ public:
*/

//void loadReadsSFA(const string & filename);
void addBX(const string & bx, Mer_t & mer, int sample);
string getBXsetAt(int start, int end, string & seq, int sample);

void printAlignment(const string &ref_aln, const string &path_aln, Path_t * path);
void printVerticalAlignment(const string &ref_aln, const string &path_aln, Path_t * path, vector<cov_t> & covN, vector<cov_t> & covT, vector<cov_t> & refcovN, vector<cov_t> & refcovT);
void processPath(Path_t * path, Ref_t * ref, FILE * fp, bool printPathsToFile, int &complete, int &perfect, int &withsnps, int &withindel, int &withmix);
Expand Down
2 changes: 2 additions & 0 deletions src/Lancet.cc
Original file line number Diff line number Diff line change
Expand Up @@ -547,6 +547,7 @@ int rLancet(string tumor_bam, string normal_bam, string ref_fasta, string reg, s
//merge variant from all threads
cerr << "Merge variants" << endl;
VariantDB_t variantDB(LR_MODE); // variants DB
variantDB.setFilters(&filters);
for( i=0; i < NUM_THREADS; ++i ) {

tot_skip += assemblers[i]->num_skip;
Expand Down Expand Up @@ -931,6 +932,7 @@ int main(int argc, char** argv)
cerr << "Merge variants" << endl;
VariantDB_t variantDB(LR_MODE); // variants DB
variantDB.setCommandLine(COMMAND_LINE);
variantDB.setFilters(&filters);
for( i=0; i < NUM_THREADS; ++i ) {

tot_skip += assemblers[i]->num_skip;
Expand Down
2 changes: 1 addition & 1 deletion src/Lancet.hh
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@

#include "Microassembler.hh"

string VERSION = "1.0.7, July 16 2018";
string VERSION = "1.1.0, October 18 2019";
string COMMAND_LINE;

/**** configuration parameters ****/
Expand Down
9 changes: 5 additions & 4 deletions src/Microassembler.cc
Original file line number Diff line number Diff line change
Expand Up @@ -226,7 +226,7 @@ int Microassembler::processGraph(Graph_t & g, const string & refname, int minkme
g.countRefPath(out_prefix + ".paths.fa", refname, false);
//g.printFasta(prefix + "." + refname + ".nodes.fa");

if (PRINT_ALL) { g.printDot(out_prefix + ".final.c" + comp + ".dot",c); }
if (PRINT_ALL) { g.printDot(out_prefix + ".final.c" + comp + ".dot",c); }
}

if (rptInQry || cycleInGraph) { continue; }
Expand Down Expand Up @@ -455,7 +455,7 @@ bool Microassembler::extractReads(BamReader &reader, Graph_t &g, Ref_t *refinfo,
string rg = "";
string xt = "";
string xa = "";
string bx = ""; // 10x barcode
string bx = ""; // linked-read barcode
int hp = 0; // 10x Haplotype number of the molecule that generated the read

int nm = 0;
Expand Down Expand Up @@ -583,6 +583,7 @@ bool Microassembler::extractReads(BamReader &reader, Graph_t &g, Ref_t *refinfo,
bx = "";
al.GetTag("BX", bx); // get the BX barcode for the read
if(bx.empty()) { bx = "null"; }
//cerr << "BX=" << bx << endl;

hp = 0;
hp = extract_sam_tag("HP", al);
Expand Down Expand Up @@ -784,7 +785,7 @@ int Microassembler::processReads() {
++counter;
progress = floor(100*(double(counter)/(double)reftable->size()));
if (progress > old_progress) {
cerr << "Thread " << ID << " is " << progress << "\% done." << endl;
cerr << "Thread " << ID << " is " << progress << "\% done, with " << vDB.getNumVariants() << " variants collected so far." << endl;
old_progress = progress;
}

Expand Down Expand Up @@ -847,7 +848,7 @@ int Microassembler::processReads() {
welapsed = (wfinish.tv_sec - wstart.tv_sec);
welapsed += (wfinish.tv_nsec - wstart.tv_nsec) / 1000000000.0;
ofile << welapsed << "\t" << refinfo->refchr << ":" << refinfo->refstart << "-" << refinfo->refend << "\t" << numreads_g << endl;
*/
*/
}
//ofile.close();

Expand Down
2 changes: 1 addition & 1 deletion src/Microassembler.hh
Original file line number Diff line number Diff line change
Expand Up @@ -226,7 +226,7 @@ public:
bool extractReads(BamReader &reader, Graph_t &g, Ref_t *refinfo, BamRegion &region, int &readcnt, int code);
bool isActiveRegion(BamReader &reader, Ref_t *refinfo, BamRegion &region, int code);
int processReads();
void setFilters(Filters * fs) { filters = fs; }
void setFilters(Filters * fs) { filters = fs; vDB.setFilters(fs); }
void setID(int i) { ID = i; }
string retriveSampleName(SamHeader &header);
};
Expand Down
16 changes: 3 additions & 13 deletions src/Node.cc
Original file line number Diff line number Diff line change
Expand Up @@ -417,7 +417,7 @@ int Node_t::cntReadCode(char code)
}

// isstatusCnt
// return true if mor than 90% of the postion in the nodes are
// return true if more than 80% of the postions in the nodes are
// of type c (T or N)
//////////////////////////////////////////////////////////////
bool Node_t::isStatusCnt(char c) {
Expand Down Expand Up @@ -472,31 +472,21 @@ void Node_t::updateCovDistr(int cov, const string & qv, unsigned int strand, int
vector<cov_t> * cov_distr = NULL;
unordered_set<string> bxset;

if(sample == TMR) {
cov_distr = &cov_distr_tmr;
if(strand == FWD) { bxset = bxset_tmr_fwd; }
if(strand == REV) { bxset = bxset_tmr_rev; }
}
else if(sample == NML) {
cov_distr = &cov_distr_nml;
if(strand == FWD) { bxset = bxset_nml_fwd; }
if(strand == REV) { bxset = bxset_nml_rev; }
}
if(sample == TMR) { cov_distr = &cov_distr_tmr; }
else if(sample == NML) { cov_distr = &cov_distr_nml; }
else { cerr << "Error: unrecognized sample " << sample << endl; }

//string::const_iterator it=qv.begin();
for (unsigned int i = 0; i < cov_distr->size(); ++i) {

if(strand == FWD) {
((*cov_distr)[i]).fwd = cov;
((*cov_distr)[i]).bxset_fwd = bxset;
if(qv[i] >= MIN_QUAL) {
++(((*cov_distr)[i]).minqv_fwd);
}
}
else if(strand == REV) {
((*cov_distr)[i]).rev = cov;
((*cov_distr)[i]).bxset_rev = bxset;
if(qv[i] >= MIN_QUAL) {
++(((*cov_distr)[i]).minqv_rev);
}
Expand Down
23 changes: 23 additions & 0 deletions src/Node.hh
Original file line number Diff line number Diff line change
Expand Up @@ -120,6 +120,29 @@ public:
hpset_tmr.resize(3,0);
hpset_nml.resize(3,0);
}

~Node_t() { //destructor
//cerr << "Node_t " << nodeid_m << " destructor called" << endl;
bxset_tmr_fwd.clear(); unordered_set<string>().swap(bxset_tmr_fwd);
bxset_tmr_rev.clear(); unordered_set<string>().swap(bxset_tmr_rev);
bxset_nml_fwd.clear(); unordered_set<string>().swap(bxset_nml_fwd);
bxset_nml_rev.clear(); unordered_set<string>().swap(bxset_nml_rev);

reads_m.clear(); unordered_set<ReadId_t>().swap(reads_m);
edges_m.clear(); vector<Edge_t>().swap(edges_m);
cov_status.clear(); vector<char>().swap(cov_status);

hpset_nml.clear(); vector<int>().swap(hpset_nml);
hpset_tmr.clear(); vector<int>().swap(hpset_tmr);

cov_distr_tmr.clear(); vector<cov_t>().swap(cov_distr_tmr);
cov_distr_nml.clear(); vector<cov_t>().swap(cov_distr_nml);

mate1_name.clear(); vector<string>().swap(mate1_name);
mate2_name.clear(); vector<string>().swap(mate2_name);

readstarts_m.clear(); vector<ReadStart_t>().swap(readstarts_m);
}

friend ostream& operator<<(std::ostream& o, const Node_t & n) { return n.print(o); }
friend ostream & operator<<(std::ostream & o, const Node_t * n) { return n->print(o); }
Expand Down
107 changes: 57 additions & 50 deletions src/Ref.cc
Original file line number Diff line number Diff line change
Expand Up @@ -27,8 +27,8 @@
void Ref_t::init() {

// allocate memory for mertables
mertable_nml = new unordered_map<string,cov_t>();
mertable_tmr = new unordered_map<string,cov_t>();
mertable_nml = new unordered_map<string,cov_t>();
mertable_tmr = new unordered_map<string,cov_t>();

// allocate memory for coverage info
normal_coverage = new vector<cov_t>(); // normal k-mer coverage across the reference
Expand Down Expand Up @@ -70,51 +70,65 @@ bool Ref_t::hasMer(const string & cmer)
return mertable_nml->count(cmer);
}

// updated coverage for input mer
void Ref_t::updateBXset(const string & cmer,
unordered_set<string> & bxset_tmr_fwd,
unordered_set<string> & bxset_tmr_rev,
unordered_set<string> & bxset_nml_fwd,
unordered_set<string> & bxset_nml_rev,
unsigned int strand, int sample) {
// addBX
//////////////////////////////////////////////////////////////
void Ref_t::addBX(const string & bx, Mer_t & mer, int sample) {

indexMers();

unordered_map<Mer_t,set<string>> * map = NULL;
unordered_map<string,cov_t> * mertable = NULL;
unordered_set<string> bxset;

if(sample == TMR) {
mertable = mertable_tmr;
if(strand == FWD) { bxset = bxset_tmr_fwd; }
if(strand == REV) { bxset = bxset_tmr_rev; }
}
else if(sample == NML) {
mertable = mertable_nml;
if(strand == FWD) { bxset = bxset_nml_fwd; }
if(strand == REV) { bxset = bxset_nml_rev; }
}
else { cerr << "Error: unrecognized sample " << sample << endl; }
indexMers();

if(sample == TMR) { map = &bx_table_tmr; mertable = mertable_tmr; }
if(sample == NML) { map = &bx_table_nml; mertable = mertable_nml; }

assert(mertable != NULL);
//if (mertable == NULL) { cerr << "Error: null pointer to mer-table!" << endl; }

std::unordered_map<string,cov_t>::iterator it = mertable->find(cmer);
auto it = mertable->find(mer);
if (it != mertable->end()) {
if(strand == FWD) {
((*it).second).bxset_fwd = bxset;
}
else if(strand == REV) {
((*it).second).bxset_rev = bxset;
(*map)[mer].insert(bx);
}
}

// getBXsetAt
/////////////////////////////////////////////////////////////
string Ref_t::getBXsetAt(int start, int end, string & rseq, int sample) {

CanonicalMer_t cmer;
string result;
set<string> bxset;
unordered_map<Mer_t,set<string>> * map;

if(sample == TMR) { map = &bx_table_tmr; }
else if(sample == NML) { map = &bx_table_nml; }
else { cerr << "Error: unrecognized sample " << sample << endl; }

for (int i=start; i<=end; i++) {
cmer.set(rseq.substr(i,K));

auto it = map->find(cmer.mer_m);
if(it != map->end()) {
bxset.insert((it->second).begin(),(it->second).end());
}
}

for ( auto itv = bxset.begin(); itv != bxset.end(); ++itv ) {
if (next(itv) == bxset.end()) { result += *itv; }
else { result += *itv + ";"; }
}

if(result == "") { result = "."; }

//cerr << "BX set: " << result << endl;

return result;
}

// updated coverage for input mer
void Ref_t::updateCoverage(const string & cmer, int cov, unsigned int strand, int sample) {
indexMers();

unordered_map<string,cov_t> * mertable = NULL;
unordered_set<string> bxset;

if(sample == TMR) { mertable = mertable_tmr; }
else if(sample == NML) { mertable = mertable_nml; }
Expand Down Expand Up @@ -179,8 +193,6 @@ void Ref_t::computeCoverage(int sample) {
int cov_hp0 = ((*it).second).hp0;
int cov_hp1 = ((*it).second).hp1;
int cov_hp2 = ((*it).second).hp2;
unordered_set<string> bxset_fwd = ((*it).second).bxset_fwd;
unordered_set<string> bxset_rev = ((*it).second).bxset_rev;

/*
normal_coverage.at(i) = n_cov;
Expand All @@ -200,19 +212,15 @@ void Ref_t::computeCoverage(int sample) {
coverage->at(j).rev = cov_rev;
coverage->at(j).hp0 = cov_hp0;
coverage->at(j).hp1 = cov_hp1;
coverage->at(j).hp2 = cov_hp2;
coverage->at(j).bxset_fwd = bxset_fwd;
coverage->at(j).bxset_rev = bxset_rev;
coverage->at(j).hp2 = cov_hp2;
}
}
else {
coverage->at(i+K-1).fwd = cov_fwd;
coverage->at(i+K-1).rev = cov_rev;
coverage->at(i+K-1).hp0 = cov_hp0;
coverage->at(i+K-1).hp1 = cov_hp1;
coverage->at(i+K-1).hp2 = cov_hp2;
coverage->at(i+K-1).bxset_fwd = bxset_fwd;
coverage->at(i+K-1).bxset_rev = bxset_rev;
coverage->at(i+K-1).hp2 = cov_hp2;

//for (int l = 0; l < K-1; l++) {
//if(normal_coverage.at(i+l) < n_cov) { normal_coverage.at(i+l) = n_cov; }
Expand All @@ -225,22 +233,18 @@ void Ref_t::computeCoverage(int sample) {
else {
if(i==0) {
for (int j=i; j<K; ++j) {
coverage->at(j).fwd = 0;
coverage->at(j).fwd = 0;
coverage->at(j).rev = 0;
coverage->at(j).hp0 = 0;
coverage->at(j).hp1 = 0;
coverage->at(j).hp2 = 0;
coverage->at(j).bxset_fwd.clear();
coverage->at(j).bxset_rev.clear();
coverage->at(j).hp1 = 0;
coverage->at(j).hp2 = 0;
}
}
coverage->at(i+K-1).fwd = 0;
coverage->at(i+K-1).rev = 0;
coverage->at(i+K-1).hp0 = 0;
coverage->at(i+K-1).hp1 = 0;
coverage->at(i+K-1).hp2 = 0;
coverage->at(i+K-1).bxset_fwd.clear();
coverage->at(i+K-1).bxset_rev.clear();
}
}
}
Expand Down Expand Up @@ -354,10 +358,13 @@ void Ref_t::printKmerCoverage(int sample) {
// clear DT and free memory
void Ref_t::clear() {

if(mertable_nml != NULL) { mertable_nml->clear(); delete mertable_nml; mertable_nml = NULL; }
if(mertable_tmr != NULL) { mertable_tmr->clear(); delete mertable_tmr; mertable_tmr = NULL; }
if(normal_coverage != NULL) { normal_coverage->clear(); delete normal_coverage; normal_coverage = NULL; }
if(tumor_coverage != NULL) { tumor_coverage->clear(); delete tumor_coverage; tumor_coverage = NULL; }
if(mertable_nml != NULL) { mertable_nml->clear(); unordered_map<string,cov_t>().swap(*mertable_nml); delete mertable_nml; mertable_nml = NULL; }
if(mertable_tmr != NULL) { mertable_tmr->clear(); unordered_map<string,cov_t>().swap(*mertable_tmr); delete mertable_tmr; mertable_tmr = NULL; }
if(normal_coverage != NULL) { normal_coverage->clear(); vector<cov_t>().swap(*normal_coverage); delete normal_coverage; normal_coverage = NULL; }
if(tumor_coverage != NULL) { tumor_coverage->clear(); vector<cov_t>().swap(*tumor_coverage); delete tumor_coverage; tumor_coverage = NULL; }

bx_table_tmr.clear(); unordered_map<Mer_t,set<string>>().swap(bx_table_tmr);
bx_table_nml.clear(); unordered_map<Mer_t,set<string>>().swap(bx_table_nml);
}

// reset coverage to 0
Expand Down
Loading

0 comments on commit f30682f

Please sign in to comment.