Skip to content

Commit

Permalink
Optimize scramble
Browse files Browse the repository at this point in the history
Add cores to batch wdl

Update scramble git commit

Update scramble wdl

Fix dockerfile

Fix dockerfile again

Update docker

Clean up wdl a bit

Update scramble commit

Update docker; reduce ScramblePart1 mem to 2gb

Increase part2 mem to 15gib

Decrease part2 mem to 12gib

Optimize resources

Azure docker

Update scramble vcfs
  • Loading branch information
mwalker174 committed Sep 12, 2023
1 parent b157b03 commit 3fa8e18
Show file tree
Hide file tree
Showing 8 changed files with 482 additions and 364 deletions.
71 changes: 71 additions & 0 deletions dockerfiles/scramble/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
# syntax=docker/dockerfile:1
# Scramble mobile-element-insertion caller image (cluster_identifier + SCRAMble.R)
# plus bcftools/htslib, for the GATK-SV GatherSampleEvidence workflow.

# Source image. NOTE(review): pinned by tag only; pin by digest for fully
# reproducible builds.
FROM ubuntu:22.04

# Non-interactive apt during the build only. ARG (not ENV) keeps
# DEBIAN_FRONTEND out of the final image's runtime environment.
ARG DEBIAN_FRONTEND=noninteractive

# apt-get update and install global requirements.
# update/install/cleanup happen in ONE layer so the apt list cache never
# persists in an image layer (splitting the cleanup into a later RUN, as
# before, leaves the files in the earlier layer — hadolint DL3009).
# NOTE(review): `apt-get upgrade -y` is kept to preserve the previous image
# contents, but hadolint (DL3005) prefers bumping the base-image tag instead.
# NOTE(review): package versions are unpinned (DL3008); rebuilds track
# whatever Ubuntu publishes at build time.
RUN apt-get update && \
    apt-get upgrade -y && \
    apt-get install -y \
        autoconf \
        autogen \
        build-essential \
        curl \
        git \
        libbz2-dev \
        libcurl4-openssl-dev \
        liblzma-dev \
        libncurses5-dev \
        libnss-sss \
        libssl-dev \
        libxml2-dev \
        ncbi-blast+ \
        r-base \
        r-bioc-biostrings \
        r-bioc-rsamtools \
        r-cran-biocmanager \
        r-cran-devtools \
        r-cran-optparse \
        r-cran-stringr \
        wget \
        zlib1g-dev && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/*

# Global R configuration: set a default CRAN mirror (https — the previous
# http URL fetched packages over an unauthenticated channel) so non-interactive
# Rscript installs don't prompt for one.
RUN echo "r <- getOption('repos'); r['CRAN'] <- 'https://cran.us.r-project.org'; options(repos = r);" > ~/.Rprofile

# rBLAST is not on CRAN; install from GitHub.
# NOTE(review): the ref is unpinned, so this layer is not reproducible —
# consider pinning a commit or tag.
RUN Rscript -e "library(devtools); install_github('mhahsler/rBLAST')"

# Install bcftools and its bundled htslib.
# NOTE(review): the ARG name is kept for build-arg compatibility, but it is
# really the bcftools release version (bcftools releases track htslib's).
ARG HTSLIB_VERSION="1.18"
WORKDIR /opt
RUN wget -q https://github.com/samtools/bcftools/releases/download/${HTSLIB_VERSION}/bcftools-${HTSLIB_VERSION}.tar.bz2 && \
    tar xjf bcftools-${HTSLIB_VERSION}.tar.bz2 && \
    cd bcftools-${HTSLIB_VERSION} && \
    ./configure --quiet && \
    make -s all && \
    make -s install install-htslib && \
    cd / && rm -r /opt/bcftools-${HTSLIB_VERSION} /opt/bcftools-${HTSLIB_VERSION}.tar.bz2

# Let the dynamic loader find the freshly installed libhts.
# The :+ expansion avoids the trailing-colon (empty element = cwd on the
# loader search path) the previous unconditional ":$LD_LIBRARY_PATH" produced
# when the variable was unset.
ENV LD_LIBRARY_PATH=/usr/local/lib${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}

# Install scramble, pinned to a specific commit for reproducibility; build
# cluster_identifier and expose it on PATH.
ARG SCRAMBLE_COMMIT="f320d604ac030e4a7fa96b0663bcae02994c7d94"
WORKDIR /app
RUN git clone https://github.com/mwalker174/scramble-gatk-sv.git && \
    cd scramble-gatk-sv && \
    git checkout ${SCRAMBLE_COMMIT} && \
    cd cluster_identifier/src && \
    make && \
    ln -s /app/scramble-gatk-sv/cluster_identifier/src/build/cluster_identifier /usr/local/bin

# Build-time smoke tests: both entry points must at least start.
RUN Rscript --vanilla /app/scramble-gatk-sv/cluster_analysis/bin/SCRAMble.R --help
RUN /app/scramble-gatk-sv/cluster_identifier/src/build/cluster_identifier -v

# Restore the original default working directory ("/") so downstream
# workflow tasks that rely on it are unaffected by the WORKDIRs above.
WORKDIR /

# NOTE(review): the image runs as root (no USER directive); confirm whether
# the pipeline's execution backend requires that before adding a non-root user.

# Default command (exec form).
CMD ["Rscript"]
2 changes: 1 addition & 1 deletion inputs/values/dockers.json
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
"linux_docker": "marketplace.gcr.io/google/ubuntu1804",
"manta_docker": "us.gcr.io/broad-dsde-methods/vjalili/manta:5994670",
"melt_docker": "us.gcr.io/talkowski-sv-gnomad/melt:3159ce1",
"scramble_docker": "us.gcr.io/broad-dsde-methods/tsharpe/scramble:1.0.2",
"scramble_docker": "us.gcr.io/broad-dsde-methods/markw/scramble:mw-scramble-99af4c50",
"samtools_cloud_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/samtools-cloud:2023-07-28-v0.28.1-beta-e70dfbd7",
"sv_base_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/sv-base:2023-07-28-v0.28.1-beta-e70dfbd7",
"sv_base_mini_docker": "us.gcr.io/broad-dsde-methods/vjalili/sv-base-mini:5994670",
Expand Down
2 changes: 1 addition & 1 deletion inputs/values/dockers_azure.json
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
"linux_docker": "vahid.azurecr.io/google/ubuntu1804",
"manta_docker": "vahid.azurecr.io/vjalili/manta:5994670",
"melt_docker": "vahid.azurecr.io/melt:3159ce1",
"scramble_docker": "vahid.azurecr.io/tsharpe/scramble:1.0.2",
"scramble_docker": "vahid.azurecr.io/scramble:mw-scramble-99af4c50",
"samtools_cloud_docker": "vahid.azurecr.io/gatk-sv/samtools-cloud:2023-07-28-v0.28.1-beta-e70dfbd7",
"sv_base_docker": "vahid.azurecr.io/gatk-sv/sv-base:2023-07-28-v0.28.1-beta-e70dfbd7",
"sv_base_mini_docker": "vahid.azurecr.io/vjalili/sv-base-mini:5994670",
Expand Down
624 changes: 312 additions & 312 deletions inputs/values/ref_panel_1kg.json

Large diffs are not rendered by default.

6 changes: 4 additions & 2 deletions wdl/GATKSVPipelineSingleSample.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -134,7 +134,8 @@ workflow GATKSVPipelineSingleSample {
RuntimeAttr? runtime_attr_melt_coverage
RuntimeAttr? runtime_attr_melt_metrics
RuntimeAttr? runtime_attr_melt
RuntimeAttr? runtime_attr_scramble
RuntimeAttr? runtime_attr_scramble_part1
RuntimeAttr? runtime_attr_scramble_part2
RuntimeAttr? runtime_attr_pesr
RuntimeAttr? runtime_attr_wham
Expand Down Expand Up @@ -651,7 +652,8 @@ workflow GATKSVPipelineSingleSample {
runtime_attr_melt_coverage=runtime_attr_melt_coverage,
runtime_attr_melt_metrics=runtime_attr_melt_metrics,
runtime_attr_melt=runtime_attr_melt,
runtime_attr_scramble=runtime_attr_scramble,
runtime_attr_scramble_part1=runtime_attr_scramble_part1,
runtime_attr_scramble_part2=runtime_attr_scramble_part2,
runtime_attr_pesr=runtime_attr_pesr,
runtime_attr_wham=runtime_attr_wham
}
Expand Down
13 changes: 10 additions & 3 deletions wdl/GatherSampleEvidence.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,9 @@ workflow GatherSampleEvidence {
Float? total_reads
Int? pf_reads_improper_pairs

# Scramble inputs
Int? scramble_part2_threads

# Wham inputs
File wham_include_list_bed_file

Expand Down Expand Up @@ -95,7 +98,8 @@ workflow GatherSampleEvidence {
RuntimeAttr? runtime_attr_melt_coverage
RuntimeAttr? runtime_attr_melt_metrics
RuntimeAttr? runtime_attr_melt
RuntimeAttr? runtime_attr_scramble
RuntimeAttr? runtime_attr_scramble_part1
RuntimeAttr? runtime_attr_scramble_part2
RuntimeAttr? runtime_attr_pesr
RuntimeAttr? runtime_attr_wham
Expand Down Expand Up @@ -227,9 +231,12 @@ workflow GatherSampleEvidence {
bam_or_cram_index = reads_index_,
sample_name = sample_id,
reference_fasta = reference_fasta,
detect_deletions = false,
reference_index = reference_index,
regions_list = primary_contigs_list,
part2_threads = scramble_part2_threads,
scramble_docker = select_first([scramble_docker]),
runtime_attr_scramble = runtime_attr_scramble
runtime_attr_scramble_part1 = runtime_attr_scramble_part1,
runtime_attr_scramble_part2 = runtime_attr_scramble_part2
}
}
Expand Down
10 changes: 8 additions & 2 deletions wdl/GatherSampleEvidenceBatch.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,9 @@ workflow GatherSampleEvidenceBatch {
Array[Float]? total_reads
Array[Int]? pf_reads_improper_pairs

# Scramble inputs
Int? scramble_part2_threads

# Wham inputs
File wham_include_list_bed_file

Expand Down Expand Up @@ -80,7 +83,8 @@ workflow GatherSampleEvidenceBatch {
RuntimeAttr? runtime_attr_melt_coverage
RuntimeAttr? runtime_attr_melt_metrics
RuntimeAttr? runtime_attr_melt
RuntimeAttr? runtime_attr_scramble
RuntimeAttr? runtime_attr_scramble_part1
RuntimeAttr? runtime_attr_scramble_part2
RuntimeAttr? runtime_attr_pesr
RuntimeAttr? runtime_attr_wham
RuntimeAttr? runtime_attr_cat_metrics
Expand Down Expand Up @@ -124,6 +128,7 @@ workflow GatherSampleEvidenceBatch {
pct_chimeras = if defined(pct_chimeras) then select_first([pct_chimeras])[i] else NONE_FLOAT_,
total_reads = if defined(total_reads) then select_first([total_reads])[i] else NONE_FLOAT_,
pf_reads_improper_pairs = if defined(pf_reads_improper_pairs) then select_first([pf_reads_improper_pairs])[i] else NONE_INT_,
scramble_part2_threads=scramble_part2_threads,
wham_include_list_bed_file = wham_include_list_bed_file,
run_module_metrics = run_module_metrics_,
sv_pipeline_base_docker = sv_pipeline_base_docker,
Expand All @@ -147,7 +152,8 @@ workflow GatherSampleEvidenceBatch {
runtime_attr_melt_coverage = runtime_attr_melt_coverage,
runtime_attr_melt_metrics = runtime_attr_melt_metrics,
runtime_attr_melt = runtime_attr_melt,
runtime_attr_scramble = runtime_attr_scramble,
runtime_attr_scramble_part1 = runtime_attr_scramble_part1,
runtime_attr_scramble_part2 = runtime_attr_scramble_part2,
runtime_attr_pesr = runtime_attr_pesr,
runtime_attr_wham = runtime_attr_wham
}
Expand Down
118 changes: 75 additions & 43 deletions wdl/Scramble.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -14,9 +14,12 @@ workflow Scramble {
File bam_or_cram_index
String sample_name
File reference_fasta
Boolean detect_deletions
File reference_index
File regions_list
String scramble_docker
RuntimeAttr? runtime_attr_scramble
Int? part2_threads
RuntimeAttr? runtime_attr_scramble_part1
RuntimeAttr? runtime_attr_scramble_part2
}
parameter_meta {
Expand All @@ -27,41 +30,96 @@ workflow Scramble {
detect_deletions: "Run deletion detection as well as mobile element insertion."
}

call RunScramble {
call ScramblePart1 {
input:
bam_or_cram_file = bam_or_cram_file,
bam_or_cram_index = bam_or_cram_index,
sample_name = sample_name,
regions_list = regions_list,
reference_fasta = reference_fasta,
detect_deletions = detect_deletions,
reference_index = reference_index,
scramble_docker = scramble_docker,
runtime_attr_override = runtime_attr_scramble
runtime_attr_override = runtime_attr_scramble_part1
}
call ScramblePart2 {
input:
clusters_file = ScramblePart1.clusters_file,
sample_name = sample_name,
reference_fasta = reference_fasta,
threads = part2_threads,
scramble_docker = scramble_docker,
runtime_attr_override = runtime_attr_scramble_part2
}
output {
File vcf = RunScramble.vcf
File index = RunScramble.index
File vcf = ScramblePart2.vcf
File index = ScramblePart2.index
}
}

task RunScramble {
task ScramblePart1 {
input {
File bam_or_cram_file
File bam_or_cram_index
String sample_name
File regions_list
File reference_fasta
Boolean detect_deletions
File reference_index
String scramble_docker
RuntimeAttr? runtime_attr_override
File? NOT_A_FILE
}
Int mem_size_gb = if detect_deletions then 16 else 3
Int disk_size_gb = ceil(size(bam_or_cram_file,"GiB") + size(bam_or_cram_index,"GiB") + size(reference_fasta,"GiB") + 10)
Int disk_size_gb = ceil(size(bam_or_cram_file,"GiB") + size(reference_fasta,"GiB")*1.5 + 50)
RuntimeAttr default_attr = object {
cpu_cores: 1,
mem_gb: mem_size_gb,
cpu_cores: 1,
mem_gb: 2.0,
disk_gb: disk_size_gb,
boot_disk_gb: 10,
preemptible_tries: 3,
max_retries: 1
}
RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr])
output {
File clusters_file = "~{sample_name}.scramble_clusters.tsv.gz"
}
command <<<
set -euo pipefail

# Identify clusters of split reads
while read region; do
time /app/scramble-gatk-sv/cluster_identifier/src/build/cluster_identifier -l -r "${region}" -t ~{reference_fasta} ~{bam_or_cram_file} \
| gzip >> ~{sample_name}.scramble_clusters.tsv.gz
done < ~{regions_list}
>>>
runtime {
cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores])
memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB"
disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD"
bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb])
docker: scramble_docker
preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries])
maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries])
}
}
task ScramblePart2 {
input {
File clusters_file
String sample_name
File reference_fasta
String scramble_docker
Int threads = 7 # Number of threads
RuntimeAttr? runtime_attr_override
}
Int disk_size_gb = ceil(10*size(clusters_file,"GiB") + size(reference_fasta,"GiB") + 10)
RuntimeAttr default_attr = object {
cpu_cores: 8,
mem_gb: 12.0,
disk_gb: disk_size_gb,
boot_disk_gb: 10,
preemptible_tries: 3,
Expand All @@ -78,19 +136,18 @@ task RunScramble {

xDir=$PWD
clusterFile=$xDir/clusters
scrambleDir="/app"
scrambleDir="/app/scramble-gatk-sv"
meiRef=$scrambleDir/cluster_analysis/resources/MEI_consensus_seqs.fa

# create a blast db from the reference
cat ~{reference_fasta} | makeblastdb -in - -parse_seqids -title ref -dbtype nucl -out ref

# Identify clusters of split reads
$scrambleDir/cluster_identifier/src/build/cluster_identifier ~{bam_or_cram_file} > $clusterFile
gunzip -c ~{clusters_file} > $clusterFile

# Produce ${clusterFile}_MEIs.txt
Rscript --vanilla $scrambleDir/cluster_analysis/bin/SCRAMble.R --out-name $clusterFile \
--cluster-file $clusterFile --install-dir $scrambleDir/cluster_analysis/bin \
--mei-refs $meiRef --ref $xDir/ref --no-vcf --eval-meis
--mei-refs $meiRef --ref $xDir/ref --no-vcf --eval-meis --cores ~{threads}

# create a header for the output vcf
echo \
Expand Down Expand Up @@ -139,31 +196,6 @@ task RunScramble {
# transform the MEI descriptions into VCF lines
awk -f awkScript.awk ${clusterFile}_MEIs.txt >> tmp.vcf

# work on deletions, if requested
if [ ~{detect_deletions} == "true" ]
then
# split the file of clusters to keep memory bounded
# The awk script removes lines where field 4 (the left consensus) contains nothing but 'n's
# because the deletion hunter in Scramble barfs on these.
awk '{left=$4; gsub(/n/,"",left); if ( length(left) > 0 ) print}' $clusterFile | split -a3 -l1500 - xyzzy

# produce a xyzzy???_PredictedDeletions.txt file for each split
for fil in xyzzy???
do Rscript --vanilla $scrambleDir/cluster_analysis/bin/SCRAMble.R --out-name $xDir/$fil \
--cluster-file $xDir/$fil --install-dir $scrambleDir/cluster_analysis/bin \
--mei-refs $meiRef --ref $xDir/ref --no-vcf --eval-dels
done

# transform the *_PredictedDeletions.txt files into VCF lines, and add them to the body
awk \
'BEGIN{ FS=OFS="\t" }
{ if(FNR<2)next
Q= $11=="NA" ? ($15=="NA"?".":$15) : ($15=="NA"?$11:($11+$15)/2)
print $1,$2+1,".","N","<DEL>",Q=="."?".":int(Q),"PASS",\
"END=" $3+1 ";SVTYPE=DEL;SVLEN=" $5 ";STRANDS=+-;CHR2=" $1 ";ALGORITHMS=scramble",\
"GT","0/1" }' xyzzy???_PredictedDeletions.txt >> tmp.vcf
fi

# sort and index the output VCF
bcftools sort -Oz <tmp.vcf >"~{sample_name}.scramble.vcf.gz"
bcftools index -ft "~{sample_name}.scramble.vcf.gz"
Expand Down

0 comments on commit 3fa8e18

Please sign in to comment.