Skip to content

Commit

Permalink
v0.1.9: Remove usage of gatk-framework; replace with bcftools
Browse files Browse the repository at this point in the history
Removes usage of older GATK3 gatk-framework, replacing with
modern bcftools equivalents. gatk-framework is no longer distributed
with bcbio. bcbio/bcbio-nextgen#2412
  • Loading branch information
chapmanb committed Aug 3, 2018
1 parent a22b08a commit be398aa
Show file tree
Hide file tree
Showing 6 changed files with 28 additions and 80 deletions.
5 changes: 5 additions & 0 deletions HISTORY.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,8 @@
## 0.1.9 (3 August 2018)

- Remove usage of gatk-framework, which is no longer included in bcbio. Replace
GATK usage with bcftools merge and concat.

## 0.1.8 (18 May 2018)

- Fix tabix index error for variants present at position 1 of contigs.
Expand Down
2 changes: 1 addition & 1 deletion project.clj
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
(defproject bcbio.variation.recall "0.1.8"
(defproject bcbio.variation.recall "0.1.9"
:description "Parallel merging, squaring off and ensemble calling for genomic variants."
:url "https://github.com/chapmanb/bcbio.variation.recall"
:license {:name "MIT" :url "http://www.opensource.org/licenses/mit-license.html"}
Expand Down
18 changes: 0 additions & 18 deletions src/bcbio/variation/ensemble/prep.clj
Original file line number Diff line number Diff line change
Expand Up @@ -92,24 +92,6 @@
(fn [& args]
(keyword (first args))))

(defmethod create-union :gatk
^{:doc "Use GATK CombineVariants to merge multiple input files with sites-only output"}
[_ vcf-files ref-file region out-dir]
(let [out-file (str (io/file out-dir (str "union-" (region->safestr region) ".vcf.gz")))
variant-str (string/join " " (map #(str "--variant " (bgzip-index-vcf %)) vcf-files))]
(itx/with-temp-dir [tmp-dir (fs/parent out-file)]
(itx/run-cmd out-file
"gatk-framework -Xms250m -Xmx~{(gatk-mem vcf-files)} -XX:+UseSerialGC "
"-Djava.io.tmpdir=~{tmp-dir} "
"-T CombineVariants -R ~{ref-file} "
"-L ~{(region->samstr region)} --out ~{out-file} "
"--minimalVCF --sites_only "
"--genotypemergeoption UNSORTED "
"--suppressCommandLineHeader --setKey null "
"-U LENIENT_VCF_PROCESSING --logging_level ERROR "
"~{variant-str}"))
(bgzip-index-vcf out-file)))

(defmethod create-union :bcftools
^{:doc "Use bcftools isec and custom awk command to handle merge of multiple files"}
[_ vcf-files ref-file region out-dir]
Expand Down
4 changes: 2 additions & 2 deletions src/bcbio/variation/ensemble/realign.clj
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@
sample (eprep/region->safestr region) i)))]
(square/subset-sample-region x sample region out-file)))
vcf-files)
union-vcf (eprep/create-union :gatk prep-inputs ref-file region union-dir)]
union-vcf (eprep/create-union :bcftools prep-inputs ref-file region union-dir)]
(realign-and-call region union-vcf bam-file ref-file work-dir config)))

(defn by-region-multi
Expand All @@ -79,7 +79,7 @@
(map (fn [[sample s-vcfs]]
(by-region sample region (map second s-vcfs) (get bam-files sample)
ref-file dirs region-e-dir config))))]
(merge/region-merge :gatk final-vcfs region ref-file region-merge-dir out-file)))
(merge/region-merge :bcftools final-vcfs region ref-file out-file region-merge-dir)))

(defn ensemble-vcfs
"Combine VCF files with squaring off by recalling at uncalled variant positions."
Expand Down
53 changes: 8 additions & 45 deletions src/bcbio/variation/recall/merge.clj
Original file line number Diff line number Diff line change
Expand Up @@ -32,64 +32,27 @@
(fn [& args]
(keyword (first args))))

(defn- run-bcftools
"Run a bcftools merge on a (potential) subset of files in a temporary directory."
[i vcf-files region out-dir base-file]
(let [out-file (fsp/add-file-part base-file (format "%s-%s" (eprep/region->safestr region) i) out-dir)
input-list (str (fsp/file-root out-file) "inputs.txt")]
(if (= 1 (count vcf-files))
(itx/safe-copy (io/file (first vcf-files)) (io/file out-file))
(do
(when (itx/needs-run? out-file)
(spit input-list (string/join "\n" (map eprep/bgzip-index-vcf vcf-files))))
(itx/run-cmd out-file
"bcftools merge -O ~{(vcfutils/bcftools-out-type out-file)} "
"-r ~{(eprep/region->samstr region)} `cat ~{input-list}` "
"> ~{out-file}")))
(eprep/bgzip-index-vcf out-file)))

(defn move-vcf
"Move a VCF file, also handling move of tabix index if it exists."
[orig-file new-file]
(doseq [ext ["" ".tbi"]]
(when (fs/exists? (str orig-file ext))
(.renameTo (io/file (str orig-file ext)) (io/file (str new-file ext))))))

(defmethod region-merge :gatk
^{:doc "Merge VCFs in a region using GATK framework"}
[_ vcf-files region ref-file work-dir final-file]
(let [out-file (region-merge-outfile region work-dir final-file)
(defmethod region-merge :bcftools
^{:doc "Merge VCFs in a region using btools"}
[_ vcf-files region ref-file final-file work-dir]
(let [out-file (if work-dir (region-merge-outfile region work-dir final-file) final-file)
input-list (str (fsp/file-root out-file) "-combineinputs.list")]
(when (itx/needs-run? out-file)
(spit input-list (string/join "\n" (map eprep/bgzip-index-vcf vcf-files))))
(itx/with-temp-dir [tmp-dir (fs/parent out-file)]
(itx/run-cmd out-file
"gatk-framework -Xms250m -Xmx~{(eprep/gatk-mem vcf-files)} -XX:+UseSerialGC "
"-Djava.io.tmpdir=~{tmp-dir} "
"-T CombineVariants -R ~{ref-file} "
"-L ~{(eprep/region->samstr region)} --out ~{out-file} "
"--genotypemergeoption REQUIRE_UNIQUE --logging_level ERROR "
"--suppressCommandLineHeader --setKey null "
"-U LENIENT_VCF_PROCESSING -V ~{input-list}"))
"bcftools merge -O ~{(vcfutils/bcftools-out-type out-file)} "
"-r ~{(eprep/region->samstr region)} -l ~{input-list} "
"-o ~{out-file}"))
(eprep/bgzip-index-vcf out-file)))

(defmethod region-merge :bcftools
^{:doc "Merge VCFs within a region using bcftools."}
[_ vcf-files region ref-file work-dir final-file]
(let [group-size 5000
out-file (region-merge-outfile region work-dir final-file)]
(when (itx/needs-run? out-file)
(itx/with-temp-dir [tmp-dir (fs/parent out-file)]
(let [final-vcf (loop [work-files vcf-files
i 0]
(if (= 1 (count work-files))
(first work-files)
(let [merged (map-indexed (fn [j xs] (run-bcftools (+ i j) xs region tmp-dir out-file))
(partition-all group-size work-files))]
(recur merged (+ i (count merged))))))]
(move-vcf final-vcf out-file))))
out-file))

(defmethod region-merge :vcflib
^{:doc "Merge VCFs within a region using tabix and vcflib"}
[_ vcf-files region ref-file work-dir final-file]
Expand Down Expand Up @@ -151,7 +114,7 @@
[orig-vcf-files ref-file out-file config]
(prep-by-region (fn [vcf-files region out-dir]
(vcfutils/ensure-no-dup-samples vcf-files)
(region-merge :gatk vcf-files region ref-file out-dir out-file))
(region-merge :bcftools vcf-files region ref-file out-file out-dir))
orig-vcf-files ref-file out-file config))

(defn- usage [options-summary]
Expand Down
26 changes: 12 additions & 14 deletions src/bcbio/variation/recall/square.clj
Original file line number Diff line number Diff line change
Expand Up @@ -140,19 +140,17 @@
(eprep/bgzip-index-vcf out-file)))

(defn union-variants
"Use GATK CombineVariants to merge multiple input files"
[vcf-files ref-file region out-file]
(let [variant-str (string/join " " (map #(str "--variant " (eprep/bgzip-index-vcf %)) vcf-files))]
^{:doc "Merge single sample VCFs into combined VCF"}
[vcf-files region ref-file out-file]
(let [input-list (str (fsp/file-root out-file) "-combineinputs.list")]
(when (itx/needs-run? out-file)
(spit input-list (string/join "\n" (filter vcfutils/has-variants? (map eprep/bgzip-index-vcf vcf-files)))))
(itx/with-temp-dir [tmp-dir (fs/parent out-file)]
(itx/run-cmd out-file
"gatk-framework -Xms250m -Xmx~{(eprep/gatk-mem vcf-files)} -XX:+UseSerialGC "
"-Djava.io.tmpdir=~{tmp-dir} "
"-T CombineVariants -R ~{ref-file} "
"-L ~{(eprep/region->samstr region)} --out ~{out-file} "
"--genotypemergeoption UNSORTED "
"--suppressCommandLineHeader --setKey null "
"-U LENIENT_VCF_PROCESSING --logging_level ERROR "
"~{variant-str}"))
"bcftools concat --allow-overlaps "
"-O ~{(vcfutils/bcftools-out-type out-file)} "
"-r ~{(eprep/region->samstr region)} -f ~{input-list} "
"-o ~{out-file}"))
(eprep/bgzip-index-vcf out-file)))

(defn- sample-by-region
Expand All @@ -172,7 +170,7 @@
(intersect-variants (:region fnames) union-vcf ref-file (:existing fnames))
(unique-variants union-vcf (:region fnames) ref-file (:needcall fnames))
(recall-variants sample region (:needcall fnames) region-bam-file ref-file (:recall fnames) config)
(union-variants [(:recall fnames) (:existing fnames)] ref-file region out-file))))
(union-variants [(:recall fnames) (:existing fnames)] region ref-file out-file))))
out-file))

(defn- sample-by-region-prep
Expand Down Expand Up @@ -217,7 +215,7 @@
- For each sample, square off using `sample-by-region`
- Merge all variant files in the region together."
[vcf-files bam-files region ref-file dirs out-file config]
(let [union-vcf (eprep/create-union :gatk vcf-files ref-file region (:union dirs))
(let [union-vcf (eprep/create-union :bcftools vcf-files ref-file region (:union dirs))
config (assoc config :ploidy (get-existing-ploidy vcf-files region))
region-square-dir (fsp/safe-mkdir (io/file (:square dirs) (get region :chrom "nochrom")
(eprep/region->safestr region)))
Expand All @@ -232,7 +230,7 @@
(into [])
(sort-by first)
(map second))]
(merge/region-merge :gatk recall-vcfs region ref-file region-merge-dir out-file)))
(merge/region-merge :bcftools recall-vcfs region ref-file out-file region-merge-dir)))

(defn- sample-to-bam-map*
"Prepare a map of sample names to BAM files."
Expand Down

0 comments on commit be398aa

Please sign in to comment.