Merge pull request #2 from kids-first/feature/mb-add-vqsr

✨ add vqsr BIXU-3741
kids-first · Sep 9, 2024 · 06379b0 · 06379b0
2 parents c79fc24 + b9a5f4f
commit 06379b0
Show file tree

Hide file tree

Showing 9 changed files with 519 additions and 1 deletion.
diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md
@@ -0,0 +1,40 @@
+<!--Pull Request Template-->
+
+## Description
+
+Please include a summary of the change and which issue is fixed. Please also include relevant motivation and context. List any dependencies that are required for this change.
+
+Fixes # (issue)
+
+## Type of change
+
+Please delete options that are not relevant.
+
+- [ ] Bug fix (non-breaking change which fixes an issue)
+- [ ] New feature (non-breaking change which adds functionality)
+- [ ] Breaking change (fix or feature that would cause existing functionality to not work as expected)
+- [ ] This change requires a documentation update
+
+## How Has This Been Tested?
+
+Please describe the tests that you ran to verify your changes. Provide instructions so we can reproduce. Please also list any relevant details for your test configuration
+
+- [ ] Test A
+- [ ] Test B
+
+**Test Configuration**:
+* Environment:
+* Test files:
+
+## Checklist:
+
+- [ ] My code follows the style guidelines of this project
+- [ ] I have performed a self-review of my own code
+- [ ] I have commented my code, particularly in hard-to-understand areas
+- [ ] I have made corresponding changes to the documentation
+- [ ] My changes generate no new warnings
+- [ ] I have added tests that prove my fix is effective or that my feature works
+- [ ] New and existing unit tests pass locally with my changes
+- [ ] Any dependent changes have been merged and published in downstream modules
+- [ ] I have checked my code and corrected any misspellings
+- [ ] I have committed any related changes to the PR
diff --git a/.github/workflows/sync_pr_template.yml b/.github/workflows/sync_pr_template.yml
@@ -0,0 +1,41 @@
+on:
+  schedule:
+  - cron:  '57 0 1 * *'
+  push:
+    branches:
+    - main 
+    - master
+jobs:
+  sync:
+    runs-on: ubuntu-latest
+    steps:
+    - name: Checkout This Repo
+      uses: actions/checkout@v2
+      with:
+        path: main
+    - name: Checkout Template Repo
+      uses: actions/checkout@v2
+      with:
+        repository: d3b-center/d3b-bixu-template 
+        path: template
+    - name: Sync template file
+      run: |
+        cmp ./template/.github/PULL_REQUEST_TEMPLATE.md ./main/.github/PULL_REQUEST_TEMPLATE.md || cp ./template/.github/PULL_REQUEST_TEMPLATE.md ./main/.github/PULL_REQUEST_TEMPLATE.md
+    - name: Create Pull Request
+      uses: peter-evans/create-pull-request@v3
+      with:
+        path: ./main/
+        commit-message: sync with template repo
+        title: Sync PR Template with Template Repo
+        body: |
+          Automated changes by [create-pull-request](https://github.com/peter-evans/create-pull-request) GitHub action.
+
+          PULL_REQUEST_TEMPLATE.md is no longer in sync with the [bixu_template_repo](https://github.com/kids-first/kf-template-repo).
+          This PR sets this repository's PULL_REQUEST_TEMPLATE.md to the template found in the bixu_template_repo.
+
+          If this repository's PULL_REQUEST_TEMPLATE.md is preferred to the one in the bixu_template_repo,
+          please update the bixu_template_repo's template file to the newest standard and close this PR.
+        delete-branch: true
+        branch: gha-sync-pr-temp
+        branch-suffix: timestamp
+        reviewers: dmiller15,migbro,sickler-alex,wongjessica93,yuankunzhu
diff --git a/README.md b/README.md
@@ -1,9 +1,11 @@
 # Kids First-Sentieon Joint Cohort Calling
 
-This is a work in progress and under active development.
+This is a beta workflow. It is not currently used in production, but its output is more than serviceable.
 An efficient, fast, and cost effective workflow for joint calling cohorts up to ~4000 individuals on the CAVATICA platform.
 You will likely need to have installed [sbpack](https://pypi.org/project/sbpack/) to push the necessary apps into your project, and it is recommended to have the python packages listed in `python_pip_requirements.txt` installed.
 Both workflows have limited implementations of the [`GVCFtyper` algo](https://support.sentieon.com/manual/usages/general/#gvcftyper-algorithm) which is part of the [Sentieon driver library](https://support.sentieon.com/manual/usages/general/#driver-binary).
+Currently, VQSR is not part of this workfow, but is available to run after this one.
+Please see the [Kids First-Sentieon VQSR Equivalent Workflow](docs/KF_SENTIEON_VQSR.md) for more info on that.
 For any question/comments/support request, please email [email protected]
 <p align="center">
  <img src="https://github.com/d3b-center/d3b-research-workflows/raw/master/doc/kfdrc-logo-sm.png" alt="data service logo"/>

diff --git a/docs/KF_SENTIEON_VQSR.md b/docs/KF_SENTIEON_VQSR.md
@@ -0,0 +1,43 @@
+# Kids First-Sentieon VQSR Equivalent Workflow
+
+This workflow generally follows the guidelines outlined in the [Variant Quality Score Recalibration (VQSR)](https://gatk.broadinstitute.org/hc/en-us/articles/360035531612-Variant-Quality-Score-Recalibration-VQSR) and [Which training sets arguments should I use for running VQSR?](https://gatk.broadinstitute.org/hc/en-us/articles/4402736812443-Which-training-sets-arguments-should-I-use-for-running-VQSR).
+Many find VQSR useful for evaluating the quality of calls made.
+We use Sentieon tools to more efficiently implement an equivalent of the VQSR portion of our [Kids First DRC Joint Genotyping Workflow](https://github.com/kids-first/kf-jointgenotyping-workflow/tree/master) used in trio calling.
+It is to be run after the [Kids First-Sentieon Joint Cohort Calling](../README.md) Workflow.
+
+## Inputs
+### Required:
+ - `reference`: Indexed FASTA file reference. Should be the same one used to create the input gVCFs
+ - `input_vcfs`: Array of by-chromosome joint called VCFs. Workflow will merge before applying VQSR
+ - `sentieon_license`: Sentieon license server host and port in format `0.0.0.0:0000`. Is set by default by the workflow, but can be changed if circumstances require it
+ - `dbsnp_vcf`: Homo_sapiens_assembly38.dbsnp138.vcf # pulled by workflow by default
+ - `hapmap_resource_vcf`: hapmap_3.3.hg38.vcf.gz
+ - `mills_resource_vcf`: Mills_and_1000G_gold_standard.indels.hg38.vcf.gz
+ - `omni_resource_vcf`: 1000G_omni2.5.hg38.vcf.gz
+ - `one_thousand_genomes_resource_vcf`: 1000G_phase1.snps.high_confidence.hg38.vcf.gz
+ - `output_basename`: String to prepend to output
+### Optional
+ - `bcftools_cpu`: Default `8`. Number of cores to be used ot merge VCFs
+ - `output_type` Default `z`. Format of merged variants file
+ - `varcal_threads`: Default `1`. Sentieon documentation states for VarCal to be deterministic, it must be set 1, but will be much slower
+ `varcal_ram`: Default `16`. RAM in GB to providew to VarCal jobs. May need to increase depending on size of input
+ - `srand`: Default `42`. Determines the seed to use in the random number generation. You can set RANDOM_SEED to 0 and the software will use the random seed from your computer. In order to generate a deterministic result, you should use a non-zero RANDOM_SEED
+ - `snp_max_gaussians`: Default `6`. Integer value for max gaussians in SNP VariantRecalibration. If a dataset gives fewer variants
+      than the expected scale, the number of Gaussians for training should be turned down. Lowering the max-Gaussians forces the program
+      to group variants into a smaller number of clusters, which results in more variants per cluster
+ - `snp_tranche`: Default `[ 100.0, 99.95, 99.9, 99.8, 99.6, 99.5, 99.4, 99.3, 99.0, 98.0, 97.0, 90.0 ]`. Normalized quality threshold for each tranche; the TRANCH_THRESHOLD number is a number between 0 and 100
+ - `snp_annotation`: Default: `[ 'QD', 'MQRankSum', 'ReadPosRankSum', 'FS', 'MQ', 'SOR', 'DP' ]`. determine annotation that will be used during the indel recalibration
+ - `indel_max_gaussians`: Default `4`. Integer value for max gaussians in INDEL VariantRecalibration. If a dataset gives fewer
+      variants than the expected scale, the number of Gaussians for training should be turned down. Lowering the max-Gaussians forces
+      the program to group variants into a smaller number of clusters, which results in more variants per cluster.
+ - `indel_tranche`: Default `[ 100.0, 99.95, 99.9, 99.5, 99.0, 97.0, 96.0, 95.0, 94.0, 93.5, 93.0, 92.0, 91.0, 90.0 ]`. Normalized quality threshold for each tranche; the TRANCH_THRESHOLD number is a number between 0 and 100
+ - `indel_annotation`: Default: `[ 'FS', 'ReadPosRankSum', 'MQRankSum', 'QD', 'SOR', 'DP' ]`. determine annotation that will be used during indel recalibration
+
+## Outputs:
+ - `vqsr_vcf`: Merged VQSR VCF with tranch filters
+
+## Run tips
+ - The default 1TB storage per instance might be enough for up to a 1500 sample cohort size. To be safe, set this to at least 2TB if not more for larger cohorts in the task, documentation on this here: https://docs.sevenbridges.com/docs/set-execution-hints-at-task-level. An example would be to use the following:
+   - Instance type: `c5.2xlarge` # Must meet requirements of min threads set for any tool
+   - EBS storage: `2048` up to `4096`
+   - Number of parallel instances: `2`. Most users have an `80` max limit per account
diff --git a/tools/bcftools_concat.cwl b/tools/bcftools_concat.cwl
@@ -0,0 +1,41 @@
+cwlVersion: v1.2
+class: CommandLineTool
+doc: "Concatenate VCF file"
+label: bcftools_concat
+$namespaces:
+  sbg: https://sevenbridges.com
+
+requirements:
+- class: ShellCommandRequirement
+- class: ResourceRequirement
+  coresMin: $(inputs.threads)
+  ramMin: 16000
+- class: DockerRequirement
+  dockerPull: pgc-images.sbgenomics.com/d3b-bixu/bcftools:1.20
+- class: InlineJavascriptRequirement
+
+baseCommand: [bcftools, concat]
+arguments:
+- position: 3
+  shellQuote: false
+  valueFrom: >-
+    ${
+      if (inputs.output_type == "z"){
+        return "&& tabix --threads " + inputs.threads + " " + inputs.output;
+      }
+      else{
+        return "";
+      }
+    }
+
+inputs:
+  threads: { type: 'int?', default: 8,
+    inputBinding: { position: 1, prefix: "--threads"} }
+  output: { type: string, doc: "Output filename",
+    inputBinding: { position: 1, prefix: "--output"} }
+  output_type: { type: ['null', {type: enum, name: output_type, symbols: ["b", "u", "v", "z"] } ], doc: "b: compressed BCF, u: uncompressed BCF, z: compressed VCF, v: uncompressed VCF [v]",
+    inputBinding: { position: 1, prefix: "--output-type" } }
+  input_vcfs: { type: 'File[]', secondaryFiles: ['.tbi'],
+   inputBinding: { position: 2} }
+outputs:
+  merged_vcf: { type: File, secondaryFiles: ['.tbi'], outputBinding: { glob: '*.vcf.gz' } }
diff --git a/tools/sentieon_apply_varcal.cwl b/tools/sentieon_apply_varcal.cwl
@@ -0,0 +1,54 @@
+cwlVersion: v1.2
+class: CommandLineTool
+id: sentieon-apply-varcal
+label: Sentieon_ApplyVarCal
+doc: |-
+  The ApplyVarCal algorithm combines the output information from the VQSR with the original variant information
+
+requirements:
+- class: ShellCommandRequirement
+- class: ResourceRequirement
+  coresMin: $(Math.max(inputs.threads, 8))
+  ramMin: $(inputs.ram * 1000)
+- class: DockerRequirement
+  dockerPull: pgc-images.sbgenomics.com/hdchen/sentieon:202308.02_cavatica
+- class: EnvVarRequirement
+  envDef:
+  - envName: SENTIEON_LICENSE
+    envValue: $(inputs.sentieon_license)
+
+- class: InlineJavascriptRequirement
+
+arguments:
+- position: 0
+  shellQuote: false
+  valueFrom: >-
+    sentieon driver
+- position: 2
+  shellQuote: false
+  valueFrom: >-
+    --algo ApplyVarCal
+    --vqsr_model var_type=SNP,recal=$(inputs.snps_recalibration.path),tranches_file=$(inputs.snps_tranches.path),sensitivity=$(inputs.snp_sensitivity)
+    --vqsr_model var_type=INDEL,recal=$(inputs.indels_recalibration.path),tranches_file=$(inputs.indels_tranches.path),sensitivity=$(inputs.indel_sensitivity)
+- position: 3
+  shellQuote: false
+  valueFrom: >-
+    $(inputs.output_basename).VQSR.vcf.gz
+inputs:
+  sentieon_license: { type: string, doc: "Sentieon license server and port, in format 0.0.0.0:0000 " }
+  threads: { type: 'int?', doc: "number of computing threads that will be used by the software to run parallel processes. See srand doc for deterministic behavior",
+    default: 8, inputBinding: { position: 1, prefix: "-t" } }
+  ram: { type: 'int?', doc: "RAM in GB to make available to this task", default: 16 }
+  reference: { type: File, secondaryFiles: ['.fai'],  doc: "location of the reference FASTA file",
+    inputBinding: { position: 1, prefix: "--reference"} }
+  input_vcf: { type: File, secondaryFiles: ['.tbi'], inputBinding: { position: 2, prefix: "-v"} }
+  snps_recalibration: { type: File, secondaryFiles: ['.idx'], doc: "location of the SNP VCF file output from the VarCal algorithm" }
+  snps_tranches: { type: File, doc: "location of the SNP tranches file output from the VarCal algorithm" }
+  snp_sensitivity: { type: 'float?', default: 99.7, doc: "determine the SNP sensitivity to the available truth sites; only tranches with threshold larger than the sensitivity will be included in the recalibration"}
+  indels_recalibration: { type: File, secondaryFiles: ['.idx'], doc: "location of the INDEL VCF file output from the VarCal algorithm" }
+  indels_tranches: { type: File, doc: "location of the INDEL tranches file output from the VarCal algorithm" }
+  indel_sensitivity: { type: 'float?', default: 99.7, doc: "determine the INDEL sensitivity to the available truth sites; only tranches with threshold larger than the sensitivity will be included in the recalibration"}
+  output_basename: { type: 'string?', default: "snps"}
+
+outputs:
+  vqsr_vcf: { type: File, outputBinding: { glob: "*VQSR.vcf.gz"}, secondaryFiles: ['.tbi'] }
diff --git a/tools/sentieon_varcal_indels.cwl b/tools/sentieon_varcal_indels.cwl
@@ -0,0 +1,61 @@
+cwlVersion: v1.2
+class: CommandLineTool
+id: sentieon-varcal-indels
+label: Sentieon_VarCal_INDELs
+doc: |-
+  The VarCal algorithm calculates the Variant Quality Score Recalibration (VQSR). VQSR assigns a well-calibrated probability score to individual variant calls, to enable more accurate control in determining the most likely variants.
+
+requirements:
+- class: ShellCommandRequirement
+- class: ResourceRequirement
+  coresMin: $(Math.max(inputs.threads, 8))
+  ramMin: $(inputs.ram * 1000)
+- class: DockerRequirement
+  dockerPull: pgc-images.sbgenomics.com/hdchen/sentieon:202308.02_cavatica
+- class: EnvVarRequirement
+  envDef:
+  - envName: SENTIEON_LICENSE
+    envValue: $(inputs.sentieon_license)
+
+- class: InlineJavascriptRequirement
+
+arguments:
+- position: 0
+  shellQuote: false
+  valueFrom: >-
+    sentieon driver
+- position: 2
+  shellQuote: false
+  valueFrom: >-
+    --algo VarCal --tranches_file $(inputs.output_basename).indels.tranches --var_type INDEL
+    --resource $(inputs.mills_resource_vcf.path) --resource_param mills,known=false,training=true,truth=true,prior=12
+    --resource $(inputs.axiomPoly_resource_vcf.path) --resource_param axiomPoly,known=false,training=true,truth=false,prior=10
+    --resource $(inputs.dbsnp_resource_vcf.path) --resource_param known=true,training=false,truth=false,prior=2
+- position: 3
+  shellQuote: false
+  valueFrom: >-
+    $(inputs.output_basename).recal
+inputs:
+  sentieon_license: { type: string, doc: "Sentieon license server and port, in format 0.0.0.0:0000 " }
+  ram: { type: 'int?', doc: "RAM in GB to make available to this task", default: 16 }
+  threads: { type: 'int?', doc: "number of computing threads that will be used by the software to run parallel processes. See srand doc for deterministic behavior",
+    default: 1, inputBinding: { position: 1, prefix: "-t" } }
+  reference: { type: File, secondaryFiles: ['.fai'],  doc: "location of the reference FASTA file",
+    inputBinding: { position: 1, prefix: "--reference"} }
+  input_vcf: { type: File, secondaryFiles: ['.tbi'], inputBinding: { position: 2, prefix: "-v"} }
+  mills_resource_vcf: { type: File, secondaryFiles: ['.tbi'] }
+  axiomPoly_resource_vcf: { type: File, secondaryFiles: ['.tbi'] }
+  dbsnp_resource_vcf: { type: File, secondaryFiles: ['.idx'] }
+  max_gaussians: { type: 'int?', doc: "determines the maximum number of Gaussians that will be used for the positive recalibration model",
+    default: 4, inputBinding: { position: 2, prefix: "--max_gaussians"} }
+  tranche: { type: ['null', { type: array, items: float,  inputBinding: { prefix: "--tranche", separate: true }}], doc: "normalized quality threshold for each tranche; the TRANCH_THRESHOLD number is a number between 0 and 100. Multiple instances of the option are allowed that will create as many tranches as there are thresholds",
+    default: [ 100.0, 99.95, 99.9, 99.5, 99.0, 97.0, 96.0, 95.0, 94.0, 93.5, 93.0, 92.0, 91.0, 90.0 ], inputBinding: { position: 2 } }
+  annotation: { type: ['null', {type: array, items: string, inputBinding: { prefix: "--annotation", separate: true }}], doc: "determine annotation that will be used during the recalibration",
+    default: [ 'FS', 'ReadPosRankSum', 'MQRankSum', 'QD', 'SOR', 'DP' ], inputBinding: { position: 2 } }
+  srand: { type: 'int?', doc: "Determines the seed to use in the random number generation. You can set RANDOM_SEED to 0 and the software will use the random seed from your computer. In order to generate a deterministic result, you should use a non-zero RANDOM_SEED and set the NUMBER_THREADS to 1",
+    default: 42, inputBinding: {position: 2, prefix: "--srand"} }
+  output_basename: { type: 'string?', default: "indels"}
+
+outputs:
+  recal: { type: File, outputBinding: { glob: "*recal"}, secondaryFiles: ['.idx'] }
+  tranches: { type: File, outputBinding: { glob: "*.indels.tranches" } }