logR

dellytools · Feb 7, 2024 · 33d1991 · 33d1991
1 parent 9ca3aab
commit 33d1991
Show file tree

Hide file tree

Showing 3 changed files with 56 additions and 33 deletions.
diff --git a/R/gcbias.R b/R/gcbias.R
@@ -3,7 +3,6 @@ library(reshape2)
 
 args = commandArgs(trailingOnly=TRUE)
 x = read.table(args[1], header=T)
-x = read.table("gc.table", header=T)
 x$gc = x$gcsum / (nrow(x)-1)
 x$fractionSample = x$fractionSample * 100
 x$fractionReference = x$fractionReference * 100

diff --git a/README.md b/README.md
@@ -46,11 +46,11 @@ Delly needs a sorted, indexed and duplicate marked bam file for every input samp
 An indexed reference genome is required to identify split-reads.
 Common workflows for germline and somatic SV calling are outlined below.
 
-`delly call -g hg19.fa input.bam > delly.vcf`
+`delly call -g hg38.fa input.bam > delly.vcf`
 
 You can also specify an output file in [BCF](http://samtools.github.io/bcftools/) format.
 
-`delly call -o delly.bcf -g hg19.fa input.bam`
+`delly call -o delly.bcf -g hg38.fa input.bam`
 
 `bcftools view delly.bcf > delly.vcf`
 
@@ -60,27 +60,27 @@ Example
 
 A small example is included for short-read, long-read and copy-number variant calling.
 
-`delly call -g example/ref.fa example/sr.bam > sr.vcf`
+`delly call -g example/ref.fa -o sr.bcf example/sr.bam`
 
-`delly lr -g example/ref.fa example/lr.bam > lr.vcf`
+`delly lr -g example/ref.fa -o lr.bcf example/lr.bam`
 
-`delly cnv -g example/ref.fa -m example/map.fa.gz example/sr.bam > cnv.vcf`
+`delly cnv -g example/ref.fa -m example/map.fa.gz -c out.cov.gz -o cnv.bcf example/sr.bam`
 
 
 Somatic SV calling
 ------------------
 
 * At least one tumor sample and a matched control sample are required for SV discovery
 
-`delly call -x hg19.excl -o t1.bcf -g hg19.fa tumor1.bam control1.bam`
+`delly call -x hg38.excl -o t1.bcf -g hg38.fa tumor1.bam control1.bam`
 
 * Somatic pre-filtering requires a tab-delimited sample description file where the first column is the sample id (as in the VCF/BCF file) and the second column is either tumor or control.
 
 `delly filter -f somatic -o t1.pre.bcf -s samples.tsv t1.bcf`
 
 * Genotype pre-filtered somatic sites across a larger panel of control samples to efficiently filter false postives and germline SVs. For performance reasons, this can be run in parallel for each sample of the control panel and you may want to combine multiple pre-filtered somatic site lists from multiple tumor samples.
 
-`delly call -g hg19.fa -v t1.pre.bcf -o geno.bcf -x hg19.excl tumor1.bam control1.bam ... controlN.bam`
+`delly call -g hg38.fa -v t1.pre.bcf -o geno.bcf -x hg38.excl tumor1.bam control1.bam ... controlN.bam`
 
 * Post-filter for somatic SVs using all control samples.
 
@@ -93,17 +93,17 @@ Germline SV calling
 
 * SV calling is done by sample for high-coverage genomes or in small batches for low-coverage genomes
 
-`delly call -g hg19.fa -o s1.bcf -x hg19.excl sample1.bam`
+`delly call -g hg38.fa -o s1.bcf -x hg38.excl sample1.bam`
 
 * Merge SV sites into a unified site list 
 
 `delly merge -o sites.bcf s1.bcf s2.bcf ... sN.bcf`
 
 * Genotype this merged SV site list across all samples. This can be run in parallel for each sample.
 
-`delly call -g hg19.fa -v sites.bcf -o s1.geno.bcf -x hg19.excl s1.bam`
+`delly call -g hg38.fa -v sites.bcf -o s1.geno.bcf -x hg38.excl s1.bam`
 
-`delly call -g hg19.fa -v sites.bcf -o sN.geno.bcf -x hg19.excl sN.bam`
+`delly call -g hg38.fa -v sites.bcf -o sN.geno.bcf -x hg38.excl sN.bam`
 
 * Merge all genotyped samples to get a single VCF/BCF using bcftools merge
 
@@ -119,9 +119,9 @@ Delly for long reads from PacBio or ONT
 
 Delly also supports long-reads for SV discovery.
 
-`delly lr -y ont -o delly.bcf -g hg19.fa input.bam`
+`delly lr -y ont -o delly.bcf -g hg38.fa input.bam`
 
-`delly lr -y pb -o delly.bcf -g hg19.fa input.bam`
+`delly lr -y pb -o delly.bcf -g hg38.fa input.bam`
 
 
 Read-depth profiles
@@ -133,23 +133,31 @@ You can generate read-depth profiles with delly. This requires a mappability map
 
 The command to count reads in 10kbp mappable windows and normalize the coverage is:
 
-`delly cnv -a -g hg19.fa -m hg19.map input.bam`
+`delly cnv -a -g hg38.fa -m hg38.map -c out.cov.gz -o out.bcf input.bam`
 
 The output file `out.cov.gz` can be plotted using [R](https://www.r-project.org/) to generate normalized copy-number profiles:
 
 `Rscript R/rd.R out.cov.gz`
 
+With `-s` you can output a statistics file with GC bias information.
+
+`delly cnv -g hg38.fa -m hg38.map -c out.cov.gz -o out.bcf -s stats.gz input.bam`
+
+`zcat stats.gz | grep "^GC" > gc.bias.tsv`
+
+`Rscript R/gcbias.R gc.bias.tsv`
+
 
 Copy-number segmentation
 ------------------------
 
 Read-depth profiles can also be segmented at the same time.
 
-`delly cnv -a -u -g hg19.fa -m hg19.map input.bam`
+`delly cnv -a -u -g hg38.fa -m hg38.map -c out.cov.gz -o out.bcf input.bam`
 
 The segmentation is in VCF format but you can extract a BED-like file using bcftools.
 
-`bcftools query -f "%CHROM\t%POS\t%INFO/END\t%ID[\t%RDCN]\n" cnv.bcf > segmentation.bed`
+`bcftools query -f "%CHROM\t%POS\t%INFO/END\t%ID[\t%RDCN]\n" out.bcf > segmentation.bed`
 
 Plotting:
 
@@ -162,15 +170,15 @@ Delly uses GC and mappability fragment correction to call CNVs. This requires a
 
 * Call CNVs for each sample and optionally refine breakpoints using delly SV calls
 
-`delly cnv -o c1.bcf -g hg19.fa -m hg19.map -l delly.sv.bcf input.bam`
+`delly cnv -o c1.bcf -g hg38.fa -m hg38.map -l delly.sv.bcf input.bam`
 
 * Merge CNVs into a unified site list
 
 `delly merge -e -p -o sites.bcf -m 1000 -n 100000 c1.bcf c2.bcf ... cN.bcf`
 
 * Genotype CNVs for each sample
 
-`delly cnv -u -v sites.bcf -g hg19.fa -m hg19.map -o geno1.bcf input.bam`
+`delly cnv -u -v sites.bcf -g hg38.fa -m hg38.map -o geno1.bcf input.bam`
 
 * Merge genotypes using [bcftools](https://github.com/samtools/bcftools)
 
@@ -192,11 +200,11 @@ Somatic copy-number alterations (SCNAs)
 
 * For somatic copy-number alterations, delly first segments the tumor genome (`-u` is required). Depending on the coverage, tumor purity and heterogeneity you can adapt parameters `-z`, `-t` and `-x` which control the sensitivity of SCNA detection.
 
-`delly cnv -u -z 10000 -o tumor.bcf -c tumor.cov.gz -g hg19.fa -m hg19.map tumor.bam`
+`delly cnv -u -z 10000 -o tumor.bcf -c tumor.cov.gz -g hg38.fa -m hg38.map tumor.bam`
 
 * Then these tumor SCNAs are genotyped in the control sample (`-u` is required).
 
-`delly cnv -u -v tumor.bcf -o control.bcf -g hg19.fa -m hg19.map control.bam`
+`delly cnv -u -v tumor.bcf -o control.bcf -g hg38.fa -m hg38.map control.bam`
 
 * The VCF IDs are matched between tumor and control. Thus, you can merge both files using [bcftools](https://github.com/samtools/bcftools).
 

diff --git a/src/coral.h b/src/coral.h
@@ -101,7 +101,7 @@ namespace torali
     if (!c.covfile.empty()) {
       dataOut.push(boost::iostreams::gzip_compressor());
       dataOut.push(boost::iostreams::file_sink(c.covfile.c_str(), std::ios_base::out | std::ios_base::binary));
-      dataOut << "chr\tstart\tend\t" << c.sampleName << "_mappable\t" << c.sampleName << "_counts\t" << c.sampleName << "_CN" << std::endl;
+      dataOut << "chr\tstart\tend\t" << c.sampleName << "_mappable\t" << c.sampleName << "_logR\t" << c.sampleName << "_CN" << std::endl;
     }
 
     // CNVs
@@ -299,10 +299,14 @@ namespace torali
 		    ++winlen;
 		    if (winlen == c.window_size) {
 		      obsexp /= (double) winlen;
-		      double count = ((double) covsum / obsexp ) * (double) c.window_size / (double) winlen;
+		      // Normalized counts: double count = ((double) covsum / obsexp ) * (double) c.window_size / (double) winlen;
 		      double cn = chrPloidy;
-		      if (expcov > 0) cn = (c.expectedCN * covsum / expcov - chrCtrlPloidy * (1 - c.purity)) / c.purity;
-		      if (!c.covfile.empty()) dataOut << std::string(hdr->target_name[refIndex]) << "\t" << start << "\t" << (pos + 1) << "\t" << winlen << "\t" << count << "\t" << cn << std::endl;
+		      double logR = 0;
+		      if (expcov > 0) {
+			cn = (c.expectedCN * covsum / expcov - chrCtrlPloidy * (1 - c.purity)) / c.purity;
+			logR = std::log2(covsum / expcov);
+		      }
+		      if (!c.covfile.empty()) dataOut << std::string(hdr->target_name[refIndex]) << "\t" << start << "\t" << (pos + 1) << "\t" << winlen << "\t" << logR << "\t" << cn << std::endl;
 		      // reset
 		      covsum = 0;
 		      expcov = 0;
@@ -355,10 +359,14 @@ namespace torali
 	      }
 	      if (winlen >= c.fracWindow * (it->second - it->first)) {
 		obsexp /= (double) winlen;
-		double count = ((double) covsum / obsexp ) * (double) (it->second - it->first) / (double) winlen;
+		// Normalized counts: double count = ((double) covsum / obsexp ) * (double) (it->second - it->first) / (double) winlen;
 		double cn = chrPloidy;
-		if (expcov > 0) cn = (c.expectedCN * covsum / expcov - chrCtrlPloidy * (1 - c.purity)) / c.purity;
-		if (!c.covfile.empty()) dataOut << std::string(hdr->target_name[refIndex]) << "\t" << it->first << "\t" << it->second << "\t" << winlen << "\t" << count << "\t" << cn << std::endl;
+		double logR = 0;
+		if (expcov > 0) {
+		  cn = (c.expectedCN * covsum / expcov - chrCtrlPloidy * (1 - c.purity)) / c.purity;
+		  logR = std::log2(covsum / expcov);
+		}
+		if (!c.covfile.empty()) dataOut << std::string(hdr->target_name[refIndex]) << "\t" << it->first << "\t" << it->second << "\t" << winlen << "\t" << logR << "\t" << cn << std::endl;
 	      } else {
 		if (!c.covfile.empty()) dataOut << std::string(hdr->target_name[refIndex]) << "\t" << it->first << "\t" << it->second << "\tNA\tNA\tNA" << std::endl;
 	      }
@@ -382,10 +390,14 @@ namespace torali
 	      ++winlen;
 	      if (winlen == c.window_size) {
 		obsexp /= (double) winlen;
-		double count = ((double) covsum / obsexp ) * (double) c.window_size / (double) winlen;
+		// Normalized counts: double count = ((double) covsum / obsexp ) * (double) c.window_size / (double) winlen;
 		double cn = chrPloidy;
-		if (expcov > 0) cn = (c.expectedCN * covsum / expcov - chrCtrlPloidy * (1 - c.purity)) / c.purity;
-		if (!c.covfile.empty()) dataOut << std::string(hdr->target_name[refIndex]) << "\t" << start << "\t" << (pos + 1) << "\t" << winlen << "\t" << count << "\t" << cn << std::endl;
+		double logR = 0;
+		if (expcov > 0) {
+		  cn = (c.expectedCN * covsum / expcov - chrCtrlPloidy * (1 - c.purity)) / c.purity;
+		  logR = std::log2(covsum / expcov);
+		}
+		if (!c.covfile.empty()) dataOut << std::string(hdr->target_name[refIndex]) << "\t" << start << "\t" << (pos + 1) << "\t" << winlen << "\t" << logR << "\t" << cn << std::endl;
 		// reset
 		covsum = 0;
 		expcov = 0;
@@ -430,10 +442,14 @@ namespace torali
 	      }
 	      if (winlen >= c.fracWindow * c.window_size) {
 		obsexp /= (double) winlen;
-		double count = ((double) covsum / obsexp ) * (double) c.window_size / (double) winlen;
+		// Normalized counts: double count = ((double) covsum / obsexp ) * (double) c.window_size / (double) winlen;
 		double cn = chrPloidy;
-		if (expcov > 0) cn = (c.expectedCN * covsum / expcov - chrCtrlPloidy * (1 - c.purity)) / c.purity;
-		if (!c.covfile.empty()) dataOut << std::string(hdr->target_name[refIndex]) << "\t" << start << "\t" << (start + c.window_size) << "\t" << winlen << "\t" << count << "\t" << cn << std::endl;
+		double logR = 0;
+		if (expcov > 0) {
+		  cn = (c.expectedCN * covsum / expcov - chrCtrlPloidy * (1 - c.purity)) / c.purity;
+		  logR = std::log2(covsum / expcov);
+		}
+		if (!c.covfile.empty()) dataOut << std::string(hdr->target_name[refIndex]) << "\t" << start << "\t" << (start + c.window_size) << "\t" << winlen << "\t" << logR << "\t" << cn << std::endl;
 	      }
 	    }
 	  }