Skip to content

Commit

Permalink
Update melt docker image to current latest build. (#592)
Browse files Browse the repository at this point in the history
Add scramble dockerfile and update wdl

Create /app dir

Add docker Rscript test

Delete backslash

Update scramble docker

Bump Part1 memory to 3gb

Add cores option to scramble R script

Fix wdls

Add bcftools to docker

Update docker; 15 cores

cores -> threads

Update scramble repo commit

Fix dockerfile

Update scramble docker

Try 2 cores in Part1

Update runtimes

Update ubuntu, htslib, bcftools versions

Add wget

wget install

Move wget install

install just bcftools and htslib

Clean up bcftools source

Delete backslash

Update docker
  • Loading branch information
VJalili authored and mwalker174 committed Aug 31, 2023
1 parent 4950ac6 commit fe51a9b
Show file tree
Hide file tree
Showing 7 changed files with 163 additions and 53 deletions.
69 changes: 69 additions & 0 deletions dockerfiles/scramble/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
# syntax=docker/dockerfile:1

# Source image: Ubuntu 22.04 LTS. Bump this tag (or pin a digest) to pick up
# OS security fixes rather than running a blanket `apt-get upgrade` below.
FROM ubuntu:22.04

# Set noninteractive mode for apt during the build only. Using ARG (not ENV)
# keeps DEBIAN_FRONTEND out of the final image's runtime environment.
ARG DEBIAN_FRONTEND=noninteractive

# apt-get update, install global requirements, and remove the cached package
# lists in the SAME layer -- a cleanup in a later RUN would not shrink the
# image. Recommends are kept because the R stack relies on them; packages are
# deliberately unpinned so rebuilds track Ubuntu 22.04 point updates.
RUN apt-get update && \
    apt-get install -y \
        autoconf \
        autogen \
        build-essential \
        curl \
        git \
        libbz2-dev \
        libcurl4-openssl-dev \
        liblzma-dev \
        libncurses5-dev \
        libnss-sss \
        libssl-dev \
        libxml2-dev \
        ncbi-blast+ \
        r-base \
        r-bioc-biostrings \
        r-bioc-rsamtools \
        r-cran-biocmanager \
        r-cran-devtools \
        r-cran-stringr \
        r-cran-optparse \
        wget \
        zlib1g-dev && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/*

# install global r requirements (HTTPS CRAN mirror; the build runs as root,
# so ~/.Rprofile is picked up by the Rscript invocations below)
RUN echo "r <- getOption('repos'); r['CRAN'] <- 'https://cran.us.r-project.org'; options(repos = r);" > ~/.Rprofile
RUN Rscript -e "library(devtools); install_github('mhahsler/rBLAST')"

# install bcftools and htslib: the bcftools release tarball bundles a matching
# htslib and the two projects share version numbers, so one version ARG covers
# both. The ARG name is kept as HTSLIB_VERSION so existing
# `--build-arg HTSLIB_VERSION=...` invocations keep working.
ARG HTSLIB_VERSION="1.18"
RUN mkdir -p /opt && cd /opt && \
    wget -q https://github.com/samtools/bcftools/releases/download/${HTSLIB_VERSION}/bcftools-${HTSLIB_VERSION}.tar.bz2 && \
    tar xjf bcftools-${HTSLIB_VERSION}.tar.bz2 && \
    cd bcftools-${HTSLIB_VERSION} && \
    ./configure --quiet && \
    make -s all && \
    make -s install install-htslib && \
    cd / && rm -r /opt/bcftools-${HTSLIB_VERSION} /opt/bcftools-${HTSLIB_VERSION}.tar.bz2

# Make the freshly installed libhts visible to the dynamic linker. The
# ${VAR:+...} form avoids a trailing ':' (which would implicitly add the
# current directory to the library search path) when LD_LIBRARY_PATH is unset.
ENV LD_LIBRARY_PATH=/usr/local/lib${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}

# install scramble at a pinned commit and expose cluster_identifier on PATH
RUN mkdir /app && cd /app \
    && git clone https://github.com/mwalker174/scramble-gatk-sv.git \
    && cd scramble-gatk-sv \
    && git checkout dc1afd7f825d188e8ad8caa12e998a591ad70a88 \
    && cd cluster_identifier/src \
    && make \
    && ln -s /app/scramble-gatk-sv/cluster_identifier/src/build/cluster_identifier /usr/local/bin

# smoke test; note that cluster_identifier has no help command currently
RUN Rscript --vanilla /app/scramble-gatk-sv/cluster_analysis/bin/SCRAMble.R --help

# define default command
CMD ["Rscript"]
2 changes: 1 addition & 1 deletion inputs/values/dockers.json
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
"linux_docker": "marketplace.gcr.io/google/ubuntu1804",
"manta_docker": "us.gcr.io/broad-dsde-methods/vjalili/manta:5994670",
"melt_docker": "us.gcr.io/talkowski-sv-gnomad/melt:3159ce1",
"scramble_docker": "us.gcr.io/broad-dsde-methods/tsharpe/scramble:1.0.2",
"scramble_docker": "us.gcr.io/broad-dsde-methods/markw/scramble:mw-scramble-048c95e",
"samtools_cloud_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/samtools-cloud:2023-07-28-v0.28.1-beta-e70dfbd7",
"sv_base_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/sv-base:2023-07-28-v0.28.1-beta-e70dfbd7",
"sv_base_mini_docker": "us.gcr.io/broad-dsde-methods/vjalili/sv-base-mini:5994670",
Expand Down
4 changes: 2 additions & 2 deletions inputs/values/dockers_azure.json
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
"genomes_in_the_cloud_docker": "vahid.azurecr.io/genomes-in-the-cloud:2.3.2-1510681135",
"linux_docker": "vahid.azurecr.io/google/ubuntu1804",
"manta_docker": "vahid.azurecr.io/vjalili/manta:5994670",
"melt_docker": "vahid.azurecr.io/melt:vj-4ff9de9f",
"melt_docker": "vahid.azurecr.io/melt:3159ce1",
"scramble_docker": "vahid.azurecr.io/tsharpe/scramble:1.0.2",
"samtools_cloud_docker": "vahid.azurecr.io/gatk-sv/samtools-cloud:2023-07-28-v0.28.1-beta-e70dfbd7",
"sv_base_docker": "vahid.azurecr.io/gatk-sv/sv-base:2023-07-28-v0.28.1-beta-e70dfbd7",
Expand All @@ -32,4 +32,4 @@
"sv_utils_docker": "vahid.azurecr.io/gatk-sv/sv-utils:2023-08-04-v0.28.1-beta-4959f62e",
"gq_recalibrator_docker": "vahid.azurecr.io/markw/gatk:mw-tb-form-sv-filter-training-data-899360a",
"str": "vahid.azurecr.io/gatk-sv/str:2023-05-23-v0.27.3-beta-e537bdd6"
}
}
6 changes: 4 additions & 2 deletions wdl/GATKSVPipelineSingleSample.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -134,7 +134,8 @@ workflow GATKSVPipelineSingleSample {
RuntimeAttr? runtime_attr_melt_coverage
RuntimeAttr? runtime_attr_melt_metrics
RuntimeAttr? runtime_attr_melt
RuntimeAttr? runtime_attr_scramble
RuntimeAttr? runtime_attr_scramble_part1
RuntimeAttr? runtime_attr_scramble_part2
RuntimeAttr? runtime_attr_pesr
RuntimeAttr? runtime_attr_wham
Expand Down Expand Up @@ -651,7 +652,8 @@ workflow GATKSVPipelineSingleSample {
runtime_attr_melt_coverage=runtime_attr_melt_coverage,
runtime_attr_melt_metrics=runtime_attr_melt_metrics,
runtime_attr_melt=runtime_attr_melt,
runtime_attr_scramble=runtime_attr_scramble,
runtime_attr_scramble_part1=runtime_attr_scramble_part1,
runtime_attr_scramble_part2=runtime_attr_scramble_part2,
runtime_attr_pesr=runtime_attr_pesr,
runtime_attr_wham=runtime_attr_wham
}
Expand Down
11 changes: 8 additions & 3 deletions wdl/GatherSampleEvidence.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,9 @@ workflow GatherSampleEvidence {
Float? total_reads
Int? pf_reads_improper_pairs

# Scramble inputs
Int? scramble_part2_threads

# Wham inputs
File wham_include_list_bed_file

Expand Down Expand Up @@ -95,7 +98,8 @@ workflow GatherSampleEvidence {
RuntimeAttr? runtime_attr_melt_coverage
RuntimeAttr? runtime_attr_melt_metrics
RuntimeAttr? runtime_attr_melt
RuntimeAttr? runtime_attr_scramble
RuntimeAttr? runtime_attr_scramble_part1
RuntimeAttr? runtime_attr_scramble_part2
RuntimeAttr? runtime_attr_pesr
RuntimeAttr? runtime_attr_wham
Expand Down Expand Up @@ -227,9 +231,10 @@ workflow GatherSampleEvidence {
bam_or_cram_index = reads_index_,
sample_name = sample_id,
reference_fasta = reference_fasta,
detect_deletions = false,
part2_threads = scramble_part2_threads,
scramble_docker = select_first([scramble_docker]),
runtime_attr_scramble = runtime_attr_scramble
runtime_attr_scramble_part1 = runtime_attr_scramble_part1,
runtime_attr_scramble_part2 = runtime_attr_scramble_part2
}
}
Expand Down
6 changes: 4 additions & 2 deletions wdl/GatherSampleEvidenceBatch.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,8 @@ workflow GatherSampleEvidenceBatch {
RuntimeAttr? runtime_attr_melt_coverage
RuntimeAttr? runtime_attr_melt_metrics
RuntimeAttr? runtime_attr_melt
RuntimeAttr? runtime_attr_scramble
RuntimeAttr? runtime_attr_scramble_part1
RuntimeAttr? runtime_attr_scramble_part2
RuntimeAttr? runtime_attr_pesr
RuntimeAttr? runtime_attr_wham
RuntimeAttr? runtime_attr_cat_metrics
Expand Down Expand Up @@ -147,7 +148,8 @@ workflow GatherSampleEvidenceBatch {
runtime_attr_melt_coverage = runtime_attr_melt_coverage,
runtime_attr_melt_metrics = runtime_attr_melt_metrics,
runtime_attr_melt = runtime_attr_melt,
runtime_attr_scramble = runtime_attr_scramble,
runtime_attr_scramble_part1 = runtime_attr_scramble_part1,
runtime_attr_scramble_part2 = runtime_attr_scramble_part2,
runtime_attr_pesr = runtime_attr_pesr,
runtime_attr_wham = runtime_attr_wham
}
Expand Down
118 changes: 75 additions & 43 deletions wdl/Scramble.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -14,9 +14,10 @@ workflow Scramble {
File bam_or_cram_index
String sample_name
File reference_fasta
Boolean detect_deletions
String scramble_docker
RuntimeAttr? runtime_attr_scramble
Int? part2_threads
RuntimeAttr? runtime_attr_scramble_part1
RuntimeAttr? runtime_attr_scramble_part2
}
parameter_meta {
Expand All @@ -27,41 +28,98 @@ workflow Scramble {
detect_deletions: "Run deletion detection as well as mobile element insertion."
}

call RunScramble {
call ScramblePart1 {
input:
bam_or_cram_file = bam_or_cram_file,
bam_or_cram_index = bam_or_cram_index,
sample_name = sample_name,
reference_fasta = reference_fasta,
detect_deletions = detect_deletions,
scramble_docker = scramble_docker,
runtime_attr_override = runtime_attr_scramble
runtime_attr_override = runtime_attr_scramble_part1
}
call ScramblePart2 {
input:
clusters_file = ScramblePart1.clusters_file,
sample_name = sample_name,
reference_fasta = reference_fasta,
threads = part2_threads,
scramble_docker = scramble_docker,
runtime_attr_override = runtime_attr_scramble_part2
}
output {
File vcf = RunScramble.vcf
File index = RunScramble.index
File vcf = ScramblePart2.vcf
File index = ScramblePart2.index
}
}

task RunScramble {
task ScramblePart1 {
input {
File bam_or_cram_file
File bam_or_cram_index
String sample_name
File reference_fasta
Boolean detect_deletions
String scramble_docker
RuntimeAttr? runtime_attr_override
File? NOT_A_FILE
}
Int mem_size_gb = if detect_deletions then 16 else 3
Int disk_size_gb = ceil(size(bam_or_cram_file,"GiB") + size(bam_or_cram_index,"GiB") + size(reference_fasta,"GiB") + 10)
Int disk_size_gb = ceil(size(bam_or_cram_file,"GiB") + size(reference_fasta,"GiB")*1.5 + 50)
RuntimeAttr default_attr = object {
cpu_cores: 2,
mem_gb: 3.0,
disk_gb: disk_size_gb,
boot_disk_gb: 10,
preemptible_tries: 3,
max_retries: 1
}
RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr])
output {
File clusters_file = "~{sample_name}.scramble_clusters.tsv.gz"
}
command <<<
set -euo pipefail

xDir=$PWD
clusterFile=$xDir/clusters
scrambleDir="/app/scramble-gatk-sv"
meiRef=$scrambleDir/cluster_analysis/resources/MEI_consensus_seqs.fa

# create a blast db from the reference
cat ~{reference_fasta} | makeblastdb -in - -parse_seqids -title ref -dbtype nucl -out ref

# Identify clusters of split reads
$scrambleDir/cluster_identifier/src/build/cluster_identifier ~{bam_or_cram_file} \
| gzip > ~{sample_name}.scramble_clusters.tsv.gz
>>>
runtime {
cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores])
memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB"
disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD"
bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb])
docker: scramble_docker
preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries])
maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries])
}
}
task ScramblePart2 {
input {
File clusters_file
String sample_name
File reference_fasta
String scramble_docker
Int threads = 7 # Number of threads, usually 1 less than cpu_cores
RuntimeAttr? runtime_attr_override
}
Int disk_size_gb = ceil(10*size(clusters_file,"GiB") + size(reference_fasta,"GiB") + 10)
RuntimeAttr default_attr = object {
cpu_cores: 1,
mem_gb: mem_size_gb,
cpu_cores: 8,
mem_gb: 8.0,
disk_gb: disk_size_gb,
boot_disk_gb: 10,
preemptible_tries: 3,
Expand All @@ -78,19 +136,18 @@ task RunScramble {

xDir=$PWD
clusterFile=$xDir/clusters
scrambleDir="/app"
scrambleDir="/app/scramble-gatk-sv"
meiRef=$scrambleDir/cluster_analysis/resources/MEI_consensus_seqs.fa

# create a blast db from the reference
cat ~{reference_fasta} | makeblastdb -in - -parse_seqids -title ref -dbtype nucl -out ref

# Identify clusters of split reads
$scrambleDir/cluster_identifier/src/build/cluster_identifier ~{bam_or_cram_file} > $clusterFile
gunzip -c ~{clusters_file} > $clusterFile

# Produce ${clusterFile}_MEIs.txt
Rscript --vanilla $scrambleDir/cluster_analysis/bin/SCRAMble.R --out-name $clusterFile \
--cluster-file $clusterFile --install-dir $scrambleDir/cluster_analysis/bin \
--mei-refs $meiRef --ref $xDir/ref --no-vcf --eval-meis
--mei-refs $meiRef --ref $xDir/ref --no-vcf --eval-meis --cores ~{threads}

# create a header for the output vcf
echo \
Expand Down Expand Up @@ -139,31 +196,6 @@ task RunScramble {
# transform the MEI descriptions into VCF lines
awk -f awkScript.awk ${clusterFile}_MEIs.txt >> tmp.vcf

# work on deletions, if requested
if [ ~{detect_deletions} == "true" ]
then
# split the file of clusters to keep memory bounded
# The awk script removes lines where field 4 (the left consensus) contains nothing but 'n's
# because the deletion hunter in Scramble barfs on these.
awk '{left=$4; gsub(/n/,"",left); if ( length(left) > 0 ) print}' $clusterFile | split -a3 -l1500 - xyzzy

# produce a xyzzy???_PredictedDeletions.txt file for each split
for fil in xyzzy???
do Rscript --vanilla $scrambleDir/cluster_analysis/bin/SCRAMble.R --out-name $xDir/$fil \
--cluster-file $xDir/$fil --install-dir $scrambleDir/cluster_analysis/bin \
--mei-refs $meiRef --ref $xDir/ref --no-vcf --eval-dels
done

# transform the *_PredictedDeletions.txt files into VCF lines, and add them to the body
awk \
'BEGIN{ FS=OFS="\t" }
{ if(FNR<2)next
Q= $11=="NA" ? ($15=="NA"?".":$15) : ($15=="NA"?$11:($11+$15)/2)
print $1,$2+1,".","N","<DEL>",Q=="."?".":int(Q),"PASS",\
"END=" $3+1 ";SVTYPE=DEL;SVLEN=" $5 ";STRANDS=+-;CHR2=" $1 ";ALGORITHMS=scramble",\
"GT","0/1" }' xyzzy???_PredictedDeletions.txt >> tmp.vcf
fi

# sort and index the output VCF
bcftools sort -Oz <tmp.vcf >"~{sample_name}.scramble.vcf.gz"
bcftools index -ft "~{sample_name}.scramble.vcf.gz"
Expand Down

0 comments on commit fe51a9b

Please sign in to comment.