diff --git a/.github/actions/upload-gatk-test-results/action.yml b/.github/actions/upload-gatk-test-results/action.yml
index 3353b727126..e2c267e2c75 100644
--- a/.github/actions/upload-gatk-test-results/action.yml
+++ b/.github/actions/upload-gatk-test-results/action.yml
@@ -40,9 +40,10 @@ runs:
name: test-results-${{ inputs.is-docker == 'true' && 'docker-' || '' }}${{ matrix.Java }}-${{ matrix.testType }}
path: build/reports/tests
- - name: Upload to codecov
- run: bash <(curl -s https://raw.githubusercontent.com/broadinstitute/codecov-bash-uploader/main/codecov-verified.bash)
- shell: bash
+ # Disabling codecov because it is timing out and failing builds that otherwise succeed.
+ ## - name: Upload to codecov
+ ## run: bash <(curl -s https://raw.githubusercontent.com/broadinstitute/codecov-bash-uploader/main/codecov-verified.bash)
+ ## shell: bash
- name: Upload Reports
if: ${{ inputs.only-artifact != 'true' }}
@@ -91,4 +92,4 @@ runs:
run: |
pip install --user PyGithub;
python scripts/github_actions/Reporter.py ${{ steps.uploadreports.outputs.view_url }};
- shell: bash
\ No newline at end of file
+ shell: bash
diff --git a/.github/workflows/dependency_submission.yml b/.github/workflows/dependency_submission.yml
new file mode 100644
index 00000000000..3103492f769
--- /dev/null
+++ b/.github/workflows/dependency_submission.yml
@@ -0,0 +1,22 @@
+name: Dependency Submission
+
+on:
+ push:
+ branches: [ 'master' ]
+
+permissions:
+ contents: write
+
+jobs:
+ dependency-submission:
+ runs-on: ubuntu-latest
+ steps:
+ - name: Checkout sources
+ uses: actions/checkout@v4
+ - name: Setup Java
+ uses: actions/setup-java@v4
+ with:
+ distribution: 'temurin'
+ java-version: 17
+ - name: Generate and submit dependency graph
+ uses: gradle/actions/dependency-submission@v4
diff --git a/.github/workflows/gatk-tests.yml b/.github/workflows/gatk-tests.yml
index 76fa7a9d07e..15a550e859e 100644
--- a/.github/workflows/gatk-tests.yml
+++ b/.github/workflows/gatk-tests.yml
@@ -291,7 +291,7 @@ jobs:
runs-on: ubuntu-latest
strategy:
matrix:
- wdlTest: [ 'RUN_CNV_GERMLINE_COHORT_WDL', 'RUN_CNV_GERMLINE_CASE_WDL', 'RUN_CNV_SOMATIC_WDL', 'RUN_M2_WDL', 'RUN_CNN_WDL', 'RUN_VCF_SITE_LEVEL_FILTERING_WDL' ]
+ wdlTest: [ 'RUN_CNV_GERMLINE_COHORT_WDL', 'RUN_CNV_GERMLINE_CASE_WDL', 'RUN_CNV_SOMATIC_WDL', 'RUN_M2_WDL', 'RUN_VCF_SITE_LEVEL_FILTERING_WDL' ]
continue-on-error: true
name: WDL test ${{ matrix.wdlTest }} on cromwell
steps:
@@ -345,12 +345,6 @@ jobs:
echo "Running M2 WDL";
bash scripts/m2_cromwell_tests/run_m2_wdl.sh;
- - name: "CNN_WDL_TEST"
- if: ${{ matrix.wdlTest == 'RUN_CNN_WDL' }}
- run: |
- echo "Running CNN WDL";
- bash scripts/cnn_variant_cromwell_tests/run_cnn_variant_wdl.sh;
-
- name: "VCF_SITE_LEVEL_FILTERING_WDL_TEST"
if: ${{ matrix.wdlTest == 'RUN_VCF_SITE_LEVEL_FILTERING_WDL' }}
run: |
diff --git a/.gitignore b/.gitignore
index e95e5ab6094..a73810708a6 100644
--- a/.gitignore
+++ b/.gitignore
@@ -44,3 +44,4 @@ funcotator_tmp
#Test generated dot files
test*.dot
+.vscode/
diff --git a/Dockerfile b/Dockerfile
index 22f319086ad..65abb0cb939 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,16 +1,18 @@
-ARG BASE_DOCKER=broadinstitute/gatk:gatkbase-3.2.0
+ARG BASE_DOCKER=broadinstitute/gatk:gatkbase-3.3.1
# stage 1 for constructing the GATK zip
FROM ${BASE_DOCKER} AS gradleBuild
LABEL stage=gatkIntermediateBuildImage
ARG RELEASE=false
-RUN ls .
+
ADD . /gatk
WORKDIR /gatk
# Get an updated gcloud signing key, in case the one in the base image has expired
-RUN rm /etc/apt/sources.list.d/google-cloud-sdk.list && \
+#Download only resources required for the build, not for testing
+RUN ls . && \
+ rm /etc/apt/sources.list.d/google-cloud-sdk.list && \
apt update &&\
apt-key list && \
curl https://packages.cloud.google.com/apt/doc/apt-key.gpg | apt-key --keyring /usr/share/keyrings/cloud.google.gpg add - && \
@@ -19,16 +21,13 @@ RUN rm /etc/apt/sources.list.d/google-cloud-sdk.list && \
apt-get -y clean && \
apt-get -y autoclean && \
apt-get -y autoremove && \
- rm -rf /var/lib/apt/lists/*
-RUN git lfs install --force
-
-#Download only resources required for the build, not for testing
-RUN git lfs pull --include src/main/resources/large
-
-RUN export GRADLE_OPTS="-Xmx4048m -Dorg.gradle.daemon=false" && /gatk/gradlew clean collectBundleIntoDir shadowTestClassJar shadowTestJar -Drelease=$RELEASE
-RUN cp -r $( find /gatk/build -name "*bundle-files-collected" )/ /gatk/unzippedJar/
-RUN unzip -o -j $( find /gatk/unzippedJar -name "gatkPython*.zip" ) -d /gatk/unzippedJar/scripts
-RUN chmod -R a+rw /gatk/unzippedJar
+ rm -rf /var/lib/apt/lists/* && \
+ git lfs install --force && \
+ git lfs pull --include src/main/resources/large && \
+ export GRADLE_OPTS="-Xmx4048m -Dorg.gradle.daemon=false" && /gatk/gradlew clean collectBundleIntoDir shadowTestClassJar shadowTestJar -Drelease=$RELEASE && \
+ cp -r $( find /gatk/build -name "*bundle-files-collected" )/ /gatk/unzippedJar/ && \
+ unzip -o -j $( find /gatk/unzippedJar -name "gatkPython*.zip" ) -d /gatk/unzippedJar/scripts && \
+ chmod -R a+rw /gatk/unzippedJar
FROM ${BASE_DOCKER}
@@ -47,17 +46,17 @@ RUN chmod -R a+rw /gatk
COPY --from=gradleBuild /gatk/unzippedJar .
#Setup linked jars that may be needed for running gatk
-RUN ln -s $( find /gatk -name "gatk*local.jar" ) gatk.jar
-RUN ln -s $( find /gatk -name "gatk*local.jar" ) /root/gatk.jar
-RUN ln -s $( find /gatk -name "gatk*spark.jar" ) gatk-spark.jar
+RUN ln -s $( find /gatk -name "gatk*local.jar" ) gatk.jar && \
+ ln -s $( find /gatk -name "gatk*local.jar" ) /root/gatk.jar && \
+ ln -s $( find /gatk -name "gatk*spark.jar" ) gatk-spark.jar
WORKDIR /root
# Make sure we can see a help message
-RUN java -jar gatk.jar -h
-RUN mkdir /gatkCloneMountPoint
-RUN mkdir /jars
-RUN mkdir .gradle
+RUN java -jar gatk.jar -h && \
+ mkdir /gatkCloneMountPoint && \
+ mkdir /jars && \
+ mkdir .gradle
WORKDIR /gatk
@@ -80,16 +79,13 @@ RUN echo "source activate gatk" > /root/run_unit_tests.sh && \
echo "ln -s /gatkCloneMountPoint/build/ /gatkCloneMountPoint/scripts/docker/build" >> /root/run_unit_tests.sh && \
echo "cd /gatk/ && /gatkCloneMountPoint/gradlew -Dfile.encoding=UTF-8 -b /gatkCloneMountPoint/dockertest.gradle testOnPackagedReleaseJar jacocoTestReportOnPackagedReleaseJar -a -p /gatkCloneMountPoint" >> /root/run_unit_tests.sh
-WORKDIR /root
-RUN cp -r /root/run_unit_tests.sh /gatk
-RUN cp -r gatk.jar /gatk
-ENV CLASSPATH /gatk/gatk.jar:$CLASSPATH
+RUN cp -r /root/run_unit_tests.sh /gatk && \
+ cp -r /root/gatk.jar /gatk
+ENV CLASSPATH=/gatk/gatk.jar:$CLASSPATH PATH=$CONDA_PATH/envs/gatk/bin:$CONDA_PATH/bin:$PATH
# Start GATK Python environment
-WORKDIR /gatk
-ENV PATH $CONDA_PATH/envs/gatk/bin:$CONDA_PATH/bin:$PATH
-RUN conda env create -n gatk -f /gatk/gatkcondaenv.yml && \
+RUN conda env create -vv -n gatk -f /gatk/gatkcondaenv.yml && \
echo "source activate gatk" >> /gatk/gatkenv.rc && \
echo "source /gatk/gatk-completion.sh" >> /gatk/gatkenv.rc && \
conda clean -afy && \
diff --git a/README.md b/README.md
index 26e731c26db..b6300d9b6ee 100644
--- a/README.md
+++ b/README.md
@@ -48,6 +48,7 @@ releases of the toolkit.
* [How to contribute to GATK](#contribute)
* [Discussions](#discussions)
* [Authors](#authors)
+* [Citing GATK](#citing)
* [License](#license)
## Requirements
@@ -78,15 +79,31 @@ releases of the toolkit.
docker client, which can be found on the [docker website](https://www.docker.com/get-docker).
* Python Dependencies:
* GATK4 uses the [Conda](https://conda.io/docs/index.html) package manager to establish and manage the
- Python environment and dependencies required by GATK tools that have a Python dependency. This environment also
- includes the R dependencies used for plotting in some of the tools. The ```gatk``` environment
- requires hardware with AVX support for tools that depend on TensorFlow (e.g. CNNScoreVariant). The GATK Docker image
- comes with the ```gatk``` environment pre-configured.
- * At this time, the only supported platforms are 64-bit Linux distributions. The required Conda environment is not
- currently supported on OS X/macOS.
+ Python environment and dependencies required by Python-based GATK tools. This environment also
+ includes the R dependencies used for plotting in some of the tools. The GATK Docker image
+ comes with the ```gatk``` conda environment pre-configured and activated.
* To establish the environment when not using the Docker image, a conda environment must first be "created", and
then "activated":
- * First, make sure [Miniconda or Conda](https://conda.io/docs/index.html) is installed (Miniconda is sufficient).
+ * First, make sure [Miniconda or Conda](https://conda.io/docs/index.html) is installed. We recommend installing
+ ```Miniconda3-py310_23.10.0-1``` from [the miniconda download page](https://repo.anaconda.com/miniconda/), selecting the Linux or
+      macOS version of the installer as appropriate.
+ * This is the same version of ```miniconda``` used by the official GATK docker image.
+ * If you use a different version, you may run into issues.
+ * If you have an ARM-based Mac, you must select the `MacOSX-x86_64` installer, not the `MacOSX-arm64` installer,
+          and rely on macOS's built-in x86 emulation (Rosetta 2).
+ * Set up miniconda:
+ * Install miniconda to a location on your PATH such as ```/opt/miniconda```, and then restart your shell:
+ ```
+ bash Miniconda3-py310_23.10.0-1-[YOUR_OS].sh -p /opt/miniconda -b
+ ```
+ * Disable conda auto-updates, which can cause compatibility issues with GATK:
+ ```
+ conda config --set auto_update_conda false
+ ```
+ * Enable the (much) faster ```libmamba``` solver to greatly speed up creation of the conda environment:
+ ```
+ conda config --set solver libmamba
+ ```
* To "create" the conda environment:
* If running from a zip or tar distribution, run the command ```conda env create -f gatkcondaenv.yml``` to
create the ```gatk``` environment.
@@ -156,7 +173,9 @@ For more details on system packages, see the GATK [Base Dockerfile](scripts/dock
* This creates a zip archive in the `build/` directory with a name like `gatk-VERSION.zip` containing a complete standalone GATK distribution, including our launcher `gatk`, both the local and spark jars, and this README.
* You can also run GATK commands directly from the root of your git clone after running this command.
- * Note that you *must* have a full git clone in order to build GATK, including the git-lfs files in src/main/resources. The zipped source code alone is not buildable.
+ * Note that you *must* have a full git clone in order to build GATK, including the git-lfs files in `src/main/resources/large`. The zipped source code alone is not buildable.
+ * The large files under `src/main/resources/large/` are required to build GATK, since they are packaged inside the GATK jar and used by tools at runtime. These include things like ML models and native C/C++ libraries used for acceleration of certain tools.
+ * The large files under `src/test/resources/large/`, on the other hand, are only required by the test suite when running tests, and are not required to build GATK.
* **Other ways to build:**
* `./gradlew installDist`
@@ -671,5 +690,8 @@ Thank you for getting involved!
The authors list is maintained in the [AUTHORS](https://github.com/broadinstitute/gatk/edit/master/AUTHORS) file.
See also the [Contributors](https://github.com/broadinstitute/gatk/graphs/contributors) list at github.
+## Citing GATK
+If you use GATK in your research, please see [this article](https://gatk.broadinstitute.org/hc/en-us/articles/360035530852-How-should-I-cite-GATK-in-my-own-publications) for details on how to properly cite GATK.
+
## License
Licensed under the Apache 2.0 License. See the [LICENSE.txt](https://github.com/broadinstitute/gatk/blob/master/LICENSE.TXT) file.
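For readers following the conda setup added to the README section above, here is a minimal end-to-end sketch of the local (non-Docker) environment creation, assuming a Linux x86_64 machine; the installer filename and install prefix are the README's examples, not requirements:

```
# Consolidated from the README steps above (sketch only, Linux x86_64 assumed)
bash Miniconda3-py310_23.10.0-1-Linux-x86_64.sh -p /opt/miniconda -b   # install miniconda, then restart your shell
conda config --set auto_update_conda false    # conda self-updates can cause compatibility issues with GATK
conda config --set solver libmamba            # much faster environment solves
conda env create -f gatkcondaenv.yml          # creates the 'gatk' environment (zip/tar distribution)
source activate gatk                          # activate it, as the GATK Docker image does via gatkenv.rc
```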
diff --git a/build.gradle b/build.gradle
index d894796321a..184dec2632c 100644
--- a/build.gradle
+++ b/build.gradle
@@ -11,17 +11,19 @@ plugins {
id "application" // provides installDist
id 'maven-publish'
id 'signing'
- id "jacoco"
- id "de.undercouch.download" version "5.4.0" //used for downloading GSA lib
- id "com.github.johnrengelman.shadow" version "8.1.1" //used to build the shadow and sparkJars
- id "com.github.ben-manes.versions" version "0.12.0" //used for identifying dependencies that need updating
- id 'com.palantir.git-version' version '0.5.1' //version helper
- id 'org.sonatype.gradle.plugins.scan' version '2.6.1' // scans for security vulnerabilities in our dependencies
+// id "jacoco"
+ id "de.undercouch.download" version "5.6.0" //used for downloading GSA lib
+ id "com.gradleup.shadow" version "8.3.3" //used to build the shadow and sparkJars
+ id 'com.palantir.git-version' version '3.1.0' //version helper
+ id 'org.sonatype.gradle.plugins.scan' version '2.8.3' // scans for security vulnerabilities in our dependencies
}
import com.github.jengelman.gradle.plugins.shadow.tasks.ShadowJar
+import com.github.jengelman.gradle.plugins.shadow.transformers.AppendingTransformer
+
import java.time.format.DateTimeFormatter
+import java.util.stream.Collectors
application {
mainClass = "org.broadinstitute.hellbender.Main"
@@ -36,8 +38,8 @@ startScripts {
}
}
-task downloadGsaLibFile(type: Download) {
- src 'http://cran.r-project.org/src/contrib/gsalib_2.2.1.tar.gz'
+tasks.register('downloadGsaLibFile', Download) {
+ src 'https://cran.r-project.org/src/contrib/gsalib_2.2.1.tar.gz'
dest "src/main/resources/org/broadinstitute/hellbender/utils/R/gsalib.tar.gz"
overwrite false
}
@@ -57,20 +59,20 @@ repositories {
mavenLocal()
}
-final htsjdkVersion = System.getProperty('htsjdk.version','4.1.0')
-final picardVersion = System.getProperty('picard.version','3.1.1')
+final htsjdkVersion = System.getProperty('htsjdk.version','4.1.3')
+final picardVersion = System.getProperty('picard.version','3.3.0')
final barclayVersion = System.getProperty('barclay.version','5.0.0')
final sparkVersion = System.getProperty('spark.version', '3.5.0')
final hadoopVersion = System.getProperty('hadoop.version', '3.3.6')
final disqVersion = System.getProperty('disq.version','0.3.8')
-final genomicsdbVersion = System.getProperty('genomicsdb.version','1.5.2')
+final genomicsdbVersion = System.getProperty('genomicsdb.version','1.5.4')
final bigQueryVersion = System.getProperty('bigQuery.version', '2.35.0')
final bigQueryStorageVersion = System.getProperty('bigQueryStorage.version', '2.47.0')
final guavaVersion = System.getProperty('guava.version', '32.1.3-jre')
-final log4j2Version = System.getProperty('log4j2Version', '2.17.1')
-final testNGVersion = '7.0.0'
-
-final googleCloudNioDependency = 'com.google.cloud:google-cloud-nio:0.127.8'
+final log4j2Version = System.getProperty('log4j2Version', '2.24.1')
+final testNGVersion = System.getProperty('testNGVersion', '7.7.0')
+final googleCloudNioVersion = System.getProperty('googleCloudNioVersion','0.127.8')
+final gklVersion = System.getProperty('gklVersion', '0.8.11')
final baseJarName = 'gatk'
final secondaryBaseJarName = 'hellbender'
@@ -91,7 +93,7 @@ def checkForLFSStubFiles(targetFolder) {
}
def targetFiles = fileTree(dir: targetFolder)
return targetFiles.any() { f ->
- final byte[] actualBytes = readBytesFromFile(f, lfsStubFileHeader.length());
+ final byte[] actualBytes = readBytesFromFile(f, lfsStubFileHeader.length())
return new String(actualBytes, "UTF-8") == lfsStubFileHeader
}
}
@@ -104,7 +106,7 @@ def resolveLargeResourceStubFiles(largeResourcesFolder, buildPrerequisitesMessag
def retCode = gitLFSExecCommand.execute().waitFor()
if (retCode.intValue() != 0) {
throw new GradleException("Execution of \"$gitLFSExecCommand\" failed with exit code: $retCode. " +
- " git-lfs is required to build GATK but may not be installed. $buildPrerequisitesMessage");
+ " git-lfs is required to build GATK but may not be installed. $buildPrerequisitesMessage")
}
return retCode
} catch (IOException e) {
@@ -163,33 +165,18 @@ if (versionOverridden) {
println "Version number overridden as " + version
}
-configurations.all {
- resolutionStrategy {
- // the snapshot folder contains a dev version of guava, we don't want to use that.
- force 'com.google.guava:guava:' + guavaVersion
- // force the htsjdk version so we don't get a different one transitively
- force 'com.github.samtools:htsjdk:' + htsjdkVersion
- force 'com.google.protobuf:protobuf-java:3.23.4'
- // force testng dependency so we don't pick up a different version via GenomicsDB
- force 'org.testng:testng:' + testNGVersion
- force 'org.broadinstitute:barclay:' + barclayVersion
- force 'com.twitter:chill_2.12:0.10.0'
- force 'org.apache.commons:commons-math3:3.5'
-
- // make sure we don't pick up an incorrect version of the GATK variant of the google-nio library
- // via Picard, etc.
- force googleCloudNioDependency
-
- force 'com.esotericsoftware:kryo:4.0.0'
- }
+configurations.configureEach {
configurations*.exclude group: 'org.slf4j', module: 'slf4j-jdk14' //exclude this to prevent slf4j complaining about to many slf4j bindings
configurations*.exclude group: 'com.google.guava', module: 'guava-jdk5'
configurations*.exclude group: 'junit', module: 'junit'
+
+    //this is excluded and replaced below with a dependency on bcprov-jdk18on, which fixes known vulnerabilities
+ //configurations*.exclude group: 'org.bouncycastle', module: 'bcprov-jdk15on'
}
-tasks.withType(JavaCompile) {
- options.compilerArgs = ['-proc:none', '-Xlint:all', '-Werror', '-Xdiags:verbose']
- options.encoding = 'UTF-8'
+tasks.withType(JavaCompile).configureEach {
+ options.compilerArgs = ['-proc:none', '-Xlint:all', '-Werror', '-Xdiags:verbose']
+ options.encoding = 'UTF-8'
}
sourceSets {
@@ -219,13 +206,13 @@ configurations {
// exclude Hadoop and Spark dependencies, since they are provided when running with Spark
// (ref: http://unethicalblogger.com/2015/07/15/gradle-goodness-excluding-depends-from-shadow.html)
exclude group: 'org.apache.hadoop'
- exclude module: 'spark-core_2.12'
+ exclude module: 'spark-core_2.13'
exclude group: 'org.slf4j'
exclude module: 'jul-to-slf4j'
exclude module: 'javax.servlet'
exclude module: 'servlet-api'
exclude group: 'com.esotericsoftware.kryo'
- exclude module: 'spark-mllib_2.12.15'
+ exclude module: 'spark-mllib_2.13.15'
exclude group: 'org.scala-lang'
exclude module: 'kryo'
}
@@ -233,23 +220,33 @@ configurations {
dependencies {
- implementation ('org.freemarker:freemarker:2.3.30')
- implementation 'org.broadinstitute:barclay:' + barclayVersion
+ implementation 'org.freemarker:freemarker:2.3.30'
+ implementation ('org.broadinstitute:barclay'){
+ version {
+ strictly barclayVersion
+ }
+ }
// Library for configuration:
implementation 'org.aeonbits.owner:owner:1.0.9'
implementation 'com.github.broadinstitute:picard:' + picardVersion
externalSourceConfiguration 'com.github.broadinstitute:picard:' + picardVersion + ':sources'
- implementation ('org.genomicsdb:genomicsdb:' + genomicsdbVersion) {
- exclude module: 'log4j-api'
- exclude module: 'log4j-core'
- exclude module: 'htsjdk'
- exclude module: 'protobuf-java'
- }
+
+ implementation 'org.genomicsdb:genomicsdb:' + genomicsdbVersion
implementation 'com.opencsv:opencsv:3.4'
implementation 'com.google.guava:guava:' + guavaVersion
- implementation 'com.github.samtools:htsjdk:'+ htsjdkVersion
- implementation(googleCloudNioDependency)
+
+ implementation ('com.github.samtools:htsjdk'){
+ version {
+ strictly htsjdkVersion
+ }
+ }
+
+ implementation ('com.google.cloud:google-cloud-nio'){
+ version {
+ strictly googleCloudNioVersion
+ }
+ }
implementation 'com.google.cloud:google-cloud-bigquery:' + bigQueryVersion
implementation 'com.google.cloud:google-cloud-bigquerystorage:' + bigQueryStorageVersion
@@ -261,45 +258,49 @@ dependencies {
// should we want to)
implementation 'com.google.cloud.bigdataoss:gcs-connector:1.9.4-hadoop3'
- implementation 'org.apache.logging.log4j:log4j-api:' + log4j2Version
- implementation 'org.apache.logging.log4j:log4j-core:' + log4j2Version
+ implementation platform('org.apache.logging.log4j:log4j-bom:' + log4j2Version)
+ implementation 'org.apache.logging.log4j:log4j-api'
+ implementation 'org.apache.logging.log4j:log4j-core'
// include the apache commons-logging bridge that matches the log4j version we use so
// messages that originate with dependencies that use commons-logging (such as jexl)
// are routed to log4j
- implementation 'org.apache.logging.log4j:log4j-jcl:' + log4j2Version
+ implementation 'org.apache.logging.log4j:log4j-jcl'
+    // these two annotation dependencies are needed because log4j-core
+    // isn't meant to be included at compile time, so it doesn't bring in
+    // its own annotations
+    // https://github.com/apache/logging-log4j2/issues/3110
+ implementation 'biz.aQute.bnd:biz.aQute.bnd.annotation'
+ implementation 'org.osgi:org.osgi.annotation.bundle'
+
implementation 'org.apache.commons:commons-lang3:3.14.0'
- implementation 'org.apache.commons:commons-math3:3.6.1'
+ implementation('org.apache.commons:commons-math3'){
+ version {
+ strictly '3.5' // changing this breaks ModelSegmentsIntegrationTests, they're quite brittle
+ }
+ because "updating this breaks ModelSegmentsIntegrationTests, they're quite brittle"
+ }
implementation 'org.hipparchus:hipparchus-stat:2.0'
implementation 'org.apache.commons:commons-collections4:4.4'
implementation 'org.apache.commons:commons-vfs2:2.9.0'
- implementation 'org.apache.commons:commons-configuration2:2.9.0'
- constraints {
- implementation('org.apache.commons:commons-text') {
- version {
- strictly '1.10.0'
- }
- because 'previous versions have a nasty vulnerability: https://nvd.nist.gov/vuln/detail/CVE-2022-42889'
- }
- }
+ implementation 'org.apache.commons:commons-configuration2:2.10.1'
- implementation 'org.apache.httpcomponents:httpclient:4.5.12'
- implementation 'commons-beanutils:commons-beanutils:1.9.3'
- implementation 'commons-io:commons-io:2.5'
+ implementation 'org.apache.httpcomponents:httpclient:4.5.13'
+ implementation 'commons-beanutils:commons-beanutils:1.9.4'
+ implementation 'commons-io:commons-io:2.14.0'
implementation 'org.reflections:reflections:0.9.10'
implementation 'it.unimi.dsi:fastutil:7.0.13'
- implementation 'org.broadinstitute:hdf5-java-bindings:1.1.0-hdf5_2.11.0'
+ implementation 'org.broadinstitute:hdf5-java-bindings:1.2.0-hdf5_2.11.0'
implementation 'org.broadinstitute:gatk-native-bindings:1.0.0'
implementation 'org.ojalgo:ojalgo:44.0.0'
- implementation ('org.ojalgo:ojalgo-commons-math3:1.0.0') {
+ implementation('org.ojalgo:ojalgo-commons-math3:1.0.0'){
exclude group: 'org.apache.commons'
}
- // TODO: migrate to mllib_2.12.15?
- implementation ('org.apache.spark:spark-mllib_2.12:' + sparkVersion) {
+ implementation ('org.apache.spark:spark-mllib_2.13:' + sparkVersion) {
// JUL is used by Google Dataflow as the backend logger, so exclude jul-to-slf4j to avoid a loop
exclude module: 'jul-to-slf4j'
exclude module: 'javax.servlet'
@@ -310,44 +311,83 @@ dependencies {
implementation 'org.jgrapht:jgrapht-core:1.1.0'
implementation 'org.jgrapht:jgrapht-io:1.1.0'
- implementation('org.disq-bio:disq:' + disqVersion)
- implementation('org.apache.hadoop:hadoop-client:' + hadoopVersion) // should be a 'provided' dependency
- implementation('com.github.jsr203hadoop:jsr203hadoop:1.0.3')
+ implementation 'org.disq-bio:disq:' + disqVersion
+ implementation 'org.apache.hadoop:hadoop-client:' + hadoopVersion // should be a 'provided' dependency
+ implementation 'com.github.jsr203hadoop:jsr203hadoop:1.0.3'
- implementation('org.apache.orc:orc:1.6.5')
- implementation('de.javakaffee:kryo-serializers:0.45') {
- exclude module: 'kryo' // use Spark's version
+ implementation 'org.apache.orc:orc:1.6.5'
+ implementation 'de.javakaffee:kryo-serializers:0.45'
+ implementation ('com.esotericsoftware:kryo'){
+ version {
+ strictly '[4,5)' // we're not compatible with kryo 5+
+ }
}
// Dependency change for including MLLib
- implementation('org.objenesis:objenesis:1.2')
- testImplementation('org.objenesis:objenesis:2.1')
+ implementation 'org.objenesis:objenesis:1.2'
+ testImplementation 'org.objenesis:objenesis:2.1'
// Comment the next lines to disable native code proxies in Spark MLLib
- implementation('com.github.fommil.netlib:netlib-native_ref-osx-x86_64:1.1:natives')
- implementation('com.github.fommil.netlib:netlib-native_ref-linux-x86_64:1.1:natives')
- implementation('com.github.fommil.netlib:netlib-native_system-linux-x86_64:1.1:natives')
- implementation('com.github.fommil.netlib:netlib-native_system-osx-x86_64:1.1:natives')
+ implementation 'com.github.fommil.netlib:netlib-native_ref-osx-x86_64:1.1:natives'
+ implementation 'com.github.fommil.netlib:netlib-native_ref-linux-x86_64:1.1:natives'
+ implementation 'com.github.fommil.netlib:netlib-native_system-linux-x86_64:1.1:natives'
+ implementation 'com.github.fommil.netlib:netlib-native_system-osx-x86_64:1.1:natives'
- implementation('com.intel.gkl:gkl:0.8.11') {
- exclude module: 'htsjdk'
- }
+ implementation 'com.intel.gkl:gkl:' + gklVersion
implementation 'org.broadinstitute:gatk-bwamem-jni:1.0.4'
implementation 'org.broadinstitute:gatk-fermilite-jni:1.2.0'
- implementation 'org.broadinstitute:http-nio:1.1.0'
+ implementation 'org.broadinstitute:http-nio:1.1.1'
// Required for COSMIC Funcotator data source:
implementation 'org.xerial:sqlite-jdbc:3.44.1.0'
// natural sort
- implementation('net.grey-panther:natural-comparator:1.1')
- implementation('com.fasterxml.jackson.module:jackson-module-scala_2.12:2.9.8')
+ implementation 'net.grey-panther:natural-comparator:1.1'
+ implementation 'com.fasterxml.jackson.module:jackson-module-scala_2.13:2.9.8'
+
+ /********* Update transitive dependencies that have known vulnerabilities in this section *******/
+ constraints {
+ // all of these constraints are here to force upgrades from lower versions of these libraries which are included
+ // as transitive dependencies
+ // once the libraries that make use of these move forward we can remove these constraints
+
+ implementation 'com.google.protobuf:protobuf-java:3.25.5'
+ implementation 'dnsjava:dnsjava:3.6.0'
+ implementation 'org.apache.commons:commons-compress:1.26.0'
+ implementation 'org.apache.ivy:ivy:2.5.2'
+ implementation 'org.apache.commons:commons-text:1.10.0' because 'of https://nvd.nist.gov/vuln/detail/CVE-2022-42889'
+ implementation 'ch.qos.logback:logback-classic:1.4.14'
+ implementation 'ch.qos.logback:logback-core:1.4.14'
+ implementation 'org.apache.avro:avro:1.12.0'
+ implementation 'io.airlift:aircompressor:0.27'
+ implementation 'org.scala-lang:scala-library:2.13.14'
+ implementation 'com.nimbusds:nimbus-jose-jwt:9.41.2'
+ implementation 'org.codehaus.janino:janino:3.1.12'
+ implementation 'org.apache.zookeeper:zookeeper:3.9.2'
+ implementation 'org.jetbrains.kotlin:kotlin-stdlib:1.9.25'
+ implementation 'com.squareup.okio:okio:3.9.1'
+ implementation 'org.codehaus.jettison:jettison:1.5.4'
+ implementation 'org.xerial.snappy:snappy-java:1.1.10.4'
+ }
+
+ //use netty bom to enforce same netty version
+ //this upgrades all transitive netty dependencies without adding a direct dependency on netty
+ implementation platform('io.netty:netty-bom:4.1.114.Final')
+
+ implementation platform('org.eclipse.jetty:jetty-bom:9.4.56.v20240826')
+ /************************************************************************************************/
+
testUtilsImplementation sourceSets.main.output
testUtilsImplementation 'org.testng:testng:' + testNGVersion
testUtilsImplementation 'org.apache.hadoop:hadoop-minicluster:' + hadoopVersion
+    //this is a replacement for minicluster's transitive dependency bcprov-jdk15on:1.70.0,
+    // which is excluded for security purposes
+    //declaring it here makes it act as a direct dependency of ours, but we only rely on it transitively
+    testUtilsImplementation 'org.bouncycastle:bcprov-jdk18on:1.78.1'
+
testImplementation sourceSets.testUtils.output
@@ -437,9 +477,9 @@ run {
test {
// transform the list test configuration --add-opens (which must include both the runtime and test args) into
// command line argument format
- final testJVMAddOpens = new ArrayList<>();
- testJVMAddOpens.addAll(runtimeAddOpens);
- testJVMAddOpens.addAll(testAddOpens);
+ final testJVMAddOpens = new ArrayList<>()
+ testJVMAddOpens.addAll(runtimeAddOpens)
+ testJVMAddOpens.addAll(testAddOpens)
final testConfigurationJVMArgs = testJVMAddOpens.stream()
.flatMap(openSpec -> ['--add-opens', openSpec].stream())
.toList()
@@ -488,17 +528,17 @@ def createGatkSymlinks(destinationDir, archiveFile, suffix, baseJarName, seconda
logger.info("build for version:" + version)
group = 'org.broadinstitute'
-tasks.withType(Jar) {
+tasks.withType(Jar).configureEach {
// transform the list of --add-opens directives into manifest format, which requires only the source
// package (unlike the command line equivalent, in the manifest the "ALL-UNNAMED" target is implied
// and can't be included in the manifest entry syntax)
final manifestAddOpens = runtimeAddOpens.stream()
.map(o -> o.substring(0, (o.length() - "ALL-UNNAMED".length()) - 1))
- .collect(java.util.stream.Collectors.joining(' '))
+ .collect(Collectors.joining(' '))
manifest {
attributes 'Implementation-Title': 'The Genome Analysis Toolkit (GATK)',
'Implementation-Version': archiveVersion.get(),
- 'Toolkit-Short-Name' : 'GATK',
+ 'Toolkit-Short-Name': 'GATK',
'Main-Class': application.mainClass,
'Picard-Version': picardVersion,
'htsjdk-Version': htsjdkVersion,
@@ -509,10 +549,10 @@ tasks.withType(Jar) {
}
wrapper {
- gradleVersion = '8.2.1'
+ gradleVersion = '8.10.2'
}
-tasks.withType(ShadowJar) {
+tasks.withType(ShadowJar).configureEach {
from(project.sourceSets.main.output)
archiveBaseName = project.name + '-package'
mergeServiceFiles()
@@ -524,7 +564,7 @@ tasks.withType(ShadowJar) {
// Suggested by the akka devs to make sure that we do not get the spark configuration error.
// http://doc.akka.io/docs/akka/snapshot/general/configuration.html#When_using_JarJar__OneJar__Assembly_or_any_jar-bundler
- transform(com.github.jengelman.gradle.plugins.shadow.transformers.AppendingTransformer) {
+ transform(AppendingTransformer) {
resource = 'reference.conf'
}
}
@@ -543,9 +583,9 @@ shadowJar {
}
}
-task localJar{ dependsOn shadowJar }
+tasks.register('localJar') { dependsOn shadowJar }
-task sparkJar(type: ShadowJar) {
+tasks.register('sparkJar', ShadowJar) {
group = "Shadow"
description = "Create a combined jar of project and runtime dependencies that excludes provided spark dependencies"
configurations = [project.configurations.sparkConfiguration]
@@ -559,7 +599,7 @@ task sparkJar(type: ShadowJar) {
}
// A jar that only contains the test classes and resources (to be extracted for testing)
-task shadowTestClassJar(type: ShadowJar){
+tasks.register('shadowTestClassJar', ShadowJar) {
group = "Shadow"
from sourceSets.test.output
description = "Create a jar that packages the compiled test classes"
@@ -567,19 +607,19 @@ task shadowTestClassJar(type: ShadowJar){
}
// A minimal jar that only contains the extra dependencies needed for running the tests
-task shadowTestJar(type: ShadowJar){
+tasks.register('shadowTestJar', ShadowJar) {
dependsOn 'compileTestUtilsJava', 'processTestUtilsResources'
group = "Shadow"
description = " A minimal jar that only contains the extra dependencies needed for running the tests that arent packaged in the main shadow jar"
from {
- (project.configurations.testRuntimeClasspath - project.configurations.runtimeClasspath ).collect {
+ (project.configurations.testRuntimeClasspath - project.configurations.runtimeClasspath).collect {
it.isDirectory() ? it : it.getName().endsWith(".jar") ? zipTree(it) : it
}
}
archiveClassifier = "testDependencies"
}
-task collectBundleIntoDir(type: Copy) {
+tasks.register('collectBundleIntoDir', Copy) {
dependsOn shadowJar, sparkJar, 'condaEnvironmentDefinition', 'gatkTabComplete', 'gatkDoc'
doFirst {
@@ -605,11 +645,11 @@ task collectBundleIntoDir(type: Copy) {
from("scripts/sv", { into("scripts/sv") })
from("scripts/cnv_wdl/", { into("scripts/cnv_wdl") })
from("scripts/mutect2_wdl/", { into("scripts/mutect2_wdl") })
- from("scripts/dataproc-cluster-ui", { into("scripts/")})
+ from("scripts/dataproc-cluster-ui", { into("scripts/") })
into "$buildDir/bundle-files-collected"
}
-task bundle(type: Zip) {
+tasks.register('bundle', Zip) {
dependsOn collectBundleIntoDir
zip64 true
@@ -625,26 +665,32 @@ task bundle(type: Zip) {
}
}
-jacocoTestReport {
+//jacocoTestReport {
+// dependsOn test
+//
+// group = "Reporting"
+// description = "Generate Jacoco coverage reports after running tests."
+// getAdditionalSourceDirs().from(sourceSets.main.allJava.srcDirs)
+//
+// reports {
+// xml.required = true
+// html.required = true
+// }
+//}
+//}
+
+//This is a stub so that we don't have to change our GitHub Actions targets while jacoco is disabled
+tasks.register('jacocoTestReport') {
dependsOn test
-
- group = "Reporting"
- description = "Generate Jacoco coverage reports after running tests."
- getAdditionalSourceDirs().from(sourceSets.main.allJava.srcDirs)
-
- reports {
- xml.required = true
- html.required = true
- }
}
-task condaStandardEnvironmentDefinition(type: Copy) {
+tasks.register('condaStandardEnvironmentDefinition', Copy) {
from "scripts"
into buildDir
include gatkCondaTemplate
rename { file -> gatkCondaYML }
- expand(["condaEnvName":"gatk",
- "condaEnvDescription" : "Conda environment for GATK Python Tools"])
+ expand(["condaEnvName" : "gatk",
+ "condaEnvDescription": "Conda environment for GATK Python Tools"])
doLast {
logger.lifecycle("Created standard Conda environment yml file: $gatkCondaYML")
}
@@ -652,12 +698,12 @@ task condaStandardEnvironmentDefinition(type: Copy) {
// Create GATK conda environment yml file from the conda enc template
-task condaEnvironmentDefinition() {
+tasks.register('condaEnvironmentDefinition') {
dependsOn 'pythonPackageArchive', 'condaStandardEnvironmentDefinition'
}
// Create the Python package archive file
-task pythonPackageArchive(type: Zip) {
+tasks.register('pythonPackageArchive', Zip) {
inputs.dir "src/main/python/org/broadinstitute/hellbender/"
outputs.file pythonPackageArchiveName
doFirst {
@@ -680,29 +726,39 @@ task pythonPackageArchive(type: Zip) {
// NOTE: This CREATES a local conda environment; but does not *activate* it. The environment must
// be activated manually in the shell from which GATK will be run.
//
-task localDevCondaEnv(type: Exec) {
+tasks.register('localDevCondaEnv', Exec) {
+ dependsOn 'condaEnvironmentDefinition'
+ inputs.file("$buildDir/$pythonPackageArchiveName")
+ workingDir "$buildDir"
+ commandLine "conda", "env", "create", "--yes", "-f", gatkCondaYML
+}
+
+// update the conda environment without completely rebuilding
+// this may be faster
+tasks.register('localDevCondaUpdate', Exec) {
dependsOn 'condaEnvironmentDefinition'
inputs.file("$buildDir/$pythonPackageArchiveName")
workingDir "$buildDir"
- commandLine "conda", "env", "create", "--force", "-f", gatkCondaYML
+ commandLine "conda", "env", "update", "-f", gatkCondaYML
}
-task javadocJar(type: Jar, dependsOn: javadoc) {
+tasks.register('javadocJar', Jar) {
+ dependsOn javadoc
archiveClassifier = 'javadoc'
from "$docBuildDir/javadoc"
}
-task sourcesJar(type: Jar) {
+tasks.register('sourcesJar', Jar) {
from sourceSets.main.allSource
archiveClassifier = 'sources'
}
-task testUtilsJar(type: Jar){
+tasks.register('testUtilsJar', Jar) {
archiveBaseName = "$project.name-test-utils"
from sourceSets.testUtils.output
}
-tasks.withType(Javadoc) {
+tasks.withType(Javadoc).configureEach {
// do this for all javadoc tasks, including gatkDoc
options.addStringOption('Xdoclint:none')
options.addStringOption('encoding', 'UTF-8')
@@ -717,7 +773,7 @@ javadoc {
}
-task testUtilsJavadoc(type: Javadoc) {
+tasks.register('testUtilsJavadoc', Javadoc) {
// This is a hack to disable the java default javadoc lint until we fix the html formatting
// We only want to do this for the javadoc task, not gatkDoc
options.addStringOption('Xdoclint:none', '-quiet')
@@ -727,27 +783,29 @@ task testUtilsJavadoc(type: Javadoc) {
include '**/*.java'
}
-task testUtilsJavadocJar(type: Jar, dependsOn: testUtilsJavadoc){
+tasks.register('testUtilsJavadocJar', Jar) {
+ dependsOn testUtilsJavadoc
archiveBaseName = "$project.name-test-utils"
archiveClassifier = 'javadoc'
from "$docBuildDir/testUtilsJavadoc"
}
-task testUtilsSourcesJar(type: Jar){
+tasks.register('testUtilsSourcesJar', Jar) {
archiveBaseName = "$project.name-test-utils"
archiveClassifier = 'sources'
from sourceSets.testUtils.allSource
}
// Generate GATK Online Doc
-task gatkDoc(type: Javadoc, dependsOn: classes) {
+tasks.register('gatkDoc', Javadoc) {
+ dependsOn classes
final File gatkDocDir = new File("$docBuildDir/gatkdoc")
doFirst {
// make sure the output folder exists or we can create it
if (!gatkDocDir.exists() && !gatkDocDir.mkdirs()) {
throw new GradleException(String.format("Failure creating folder (%s) for GATK doc output in task (%s)",
gatkDocDir.getAbsolutePath(),
- it.name));
+ it.name))
}
copy {
from('src/main/resources/org/broadinstitute/hellbender/utils/helpTemplates')
@@ -772,7 +830,7 @@ task gatkDoc(type: Javadoc, dependsOn: classes) {
outputs.dir(gatkDocDir)
options.destinationDirectory(gatkDocDir)
- options.addStringOption("settings-dir", "src/main/resources/org/broadinstitute/hellbender/utils/helpTemplates");
+ options.addStringOption("settings-dir", "src/main/resources/org/broadinstitute/hellbender/utils/helpTemplates")
if (project.hasProperty('phpDoc')) {
// use -PphpDoc to generate .php file extensions, otherwise rely on default of .html
final String phpExtension = "php"
@@ -784,14 +842,15 @@ task gatkDoc(type: Javadoc, dependsOn: classes) {
}
// Generate GATK Bash Tab Completion File
-task gatkTabComplete(type: Javadoc, dependsOn: classes) {
+tasks.register('gatkTabComplete', Javadoc) {
+ dependsOn classes
final File tabCompletionDir = new File("$docBuildDir/tabCompletion")
doFirst {
// make sure the output folder exists or we can create it
if (!tabCompletionDir.exists() && !tabCompletionDir.mkdirs()) {
throw new GradleException(String.format("Failure creating folder (%s) for GATK tab completion output in task (%s)",
tabCompletionDir.getAbsolutePath(),
- it.name));
+ it.name))
}
}
// Include the Picard source jar, which contains various .R, .sh, .css, .html, .xml and .MF files and
@@ -837,13 +896,14 @@ task gatkTabComplete(type: Javadoc, dependsOn: classes) {
options.addStringOption("caller-post-arg-max-occurs", "1 1 1 1 1 1 1 1 1 1")
}
-def getWDLInputJSONTestFileNameFromWDLName(File wdlName) {
+static def getWDLInputJSONTestFileNameFromWDLName(File wdlName) {
String fileWithoutExt = wdlName.name.take(wdlName.name.lastIndexOf('.'))
return new File (wdlName.getParentFile(), fileWithoutExt + "Inputs.json").getAbsolutePath()
}
// Generate GATK Tool WDL
-task gatkWDLGen(type: Javadoc, dependsOn: classes) {
+tasks.register('gatkWDLGen', Javadoc) {
+ dependsOn classes
final File gatkWDLDir = new File("$docBuildDir/wdlGen")
outputs.dir(gatkWDLDir)
doFirst {
@@ -851,7 +911,7 @@ task gatkWDLGen(type: Javadoc, dependsOn: classes) {
if (!gatkWDLDir.exists() && !gatkWDLDir.mkdirs()) {
throw new GradleException(String.format("Failure creating folder (%s) for GATK WDL output in task (%s)",
gatkWDLDir.getAbsolutePath(),
- it.name));
+ it.name))
}
copy {
from('src/main/resources/org/broadinstitute/hellbender/utils/wdlTemplates/common.html')
@@ -873,7 +933,7 @@ task gatkWDLGen(type: Javadoc, dependsOn: classes) {
outputs.dir(gatkWDLDir)
options.destinationDirectory(gatkWDLDir)
- options.addStringOption("settings-dir", "src/main/resources/org/broadinstitute/hellbender/utils/wdlTemplates");
+ options.addStringOption("settings-dir", "src/main/resources/org/broadinstitute/hellbender/utils/wdlTemplates")
options.addStringOption("output-file-extension", "wdl")
options.addStringOption("index-file-extension", "html")
@@ -894,11 +954,11 @@ def execWDLValidation = { validateWDL ->
}
return retCode
} catch (IOException e) {
- throw new GradleException("An IOException occurred while attempting to execute the command $validateWDL.")
+ throw new GradleException("An IOException occurred while attempting to execute the command $validateWDL.", e)
}
}
-task gatkValidateScriptsWdl() {
+tasks.register('gatkValidateScriptsWdl') {
doFirst {
// running this task requires a local cromwell installation, with environment variables CROMWELL_JAR,
// WOMTOOL_JAR set to the jar locations
@@ -921,7 +981,8 @@ task gatkValidateScriptsWdl() {
}
}
-task gatkValidateGeneratedWdl(dependsOn: [gatkWDLGen, shadowJar]) {
+tasks.register('gatkValidateGeneratedWdl') {
+ dependsOn(gatkWDLGen, shadowJar)
doFirst {
// running this task requires a local cromwell installation, with environment variables CROMWELL_JAR,
// WOMTOOL_JAR set to the jar locations
@@ -1008,7 +1069,7 @@ signing {
def basePomConfiguration = {
packaging = 'jar'
description = 'Development on GATK 4'
- url = 'http://github.com/broadinstitute/gatk'
+ url = 'https://github.com/broadinstitute/gatk'
scm {
url = 'scm:git@github.com:broadinstitute/gatk.git'
@@ -1081,8 +1142,8 @@ publish {
}
}
-task installSpark{ dependsOn sparkJar }
-task installAll{ dependsOn installSpark, installDist }
+tasks.register('installSpark') { dependsOn sparkJar }
+tasks.register('installAll') { dependsOn installSpark, installDist }
installDist.dependsOn downloadGsaLibFile
downloadGsaLibFile.dependsOn sourcesJar
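Several dependency versions in the build script above are read from system properties (for example `htsjdk.version`, `picard.version`, `testNGVersion`, `googleCloudNioVersion`, `gklVersion`), and the packaging tasks are now registered lazily via `tasks.register`. A hedged usage sketch; the override value below is illustrative only:

```
# Build the local GATK jar, overriding one of the version properties read via System.getProperty(...)
./gradlew -Dhtsjdk.version=4.1.3 clean localJar

# Produce the standalone distribution zip (launcher, local and spark jars, README)
./gradlew bundle
```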
diff --git a/gradle.properties b/gradle.properties
new file mode 100644
index 00000000000..d7a34a5028f
--- /dev/null
+++ b/gradle.properties
@@ -0,0 +1 @@
+org.gradle.jvmargs=-Xmx2g
diff --git a/gradle/wrapper/gradle-wrapper.jar b/gradle/wrapper/gradle-wrapper.jar
index 249e5832f09..033e24c4cdf 100644
Binary files a/gradle/wrapper/gradle-wrapper.jar and b/gradle/wrapper/gradle-wrapper.jar differ
diff --git a/gradle/wrapper/gradle-wrapper.properties b/gradle/wrapper/gradle-wrapper.properties
index 84a0b92f9af..df97d72b8b9 100644
--- a/gradle/wrapper/gradle-wrapper.properties
+++ b/gradle/wrapper/gradle-wrapper.properties
@@ -1,5 +1,7 @@
distributionBase=GRADLE_USER_HOME
distributionPath=wrapper/dists
-distributionUrl=https\://services.gradle.org/distributions/gradle-8.2.1-bin.zip
+distributionUrl=https\://services.gradle.org/distributions/gradle-8.10.2-bin.zip
+networkTimeout=10000
+validateDistributionUrl=true
zipStoreBase=GRADLE_USER_HOME
zipStorePath=wrapper/dists
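The wrapper jar and properties above move the build to Gradle 8.10.2, matching the `gradleVersion` now set in the `wrapper` block of build.gradle. Wrapper files like these are normally regenerated rather than edited by hand, for example:

```
./gradlew wrapper --gradle-version 8.10.2
```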
diff --git a/gradlew b/gradlew
index a69d9cb6c20..fcb6fca147c 100755
--- a/gradlew
+++ b/gradlew
@@ -55,7 +55,7 @@
# Darwin, MinGW, and NonStop.
#
# (3) This script is generated from the Groovy template
-# https://github.com/gradle/gradle/blob/master/subprojects/plugins/src/main/resources/org/gradle/api/internal/plugins/unixStartScript.txt
+# https://github.com/gradle/gradle/blob/HEAD/subprojects/plugins/src/main/resources/org/gradle/api/internal/plugins/unixStartScript.txt
# within the Gradle project.
#
# You can find Gradle at https://github.com/gradle/gradle/.
@@ -80,13 +80,10 @@ do
esac
done
-APP_HOME=$( cd "${APP_HOME:-./}" && pwd -P ) || exit
-
-APP_NAME="Gradle"
+# This is normally unused
+# shellcheck disable=SC2034
APP_BASE_NAME=${0##*/}
-
-# Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script.
-DEFAULT_JVM_OPTS='"-Xmx64m" "-Xms64m"'
+APP_HOME=$( cd "${APP_HOME:-./}" && pwd -P ) || exit
# Use the maximum available, or set MAX_FD != -1 to use that value.
MAX_FD=maximum
@@ -133,22 +130,29 @@ location of your Java installation."
fi
else
JAVACMD=java
- which java >/dev/null 2>&1 || die "ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH.
+ if ! command -v java >/dev/null 2>&1
+ then
+ die "ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH.
Please set the JAVA_HOME variable in your environment to match the
location of your Java installation."
+ fi
fi
# Increase the maximum file descriptors if we can.
if ! "$cygwin" && ! "$darwin" && ! "$nonstop" ; then
case $MAX_FD in #(
max*)
+ # In POSIX sh, ulimit -H is undefined. That's why the result is checked to see if it worked.
+ # shellcheck disable=SC3045
MAX_FD=$( ulimit -H -n ) ||
warn "Could not query maximum file descriptor limit"
esac
case $MAX_FD in #(
'' | soft) :;; #(
*)
+ # In POSIX sh, ulimit -n is undefined. That's why the result is checked to see if it worked.
+ # shellcheck disable=SC3045
ulimit -n "$MAX_FD" ||
warn "Could not set maximum file descriptor limit to $MAX_FD"
esac
@@ -193,6 +197,10 @@ if "$cygwin" || "$msys" ; then
done
fi
+
+# Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script.
+DEFAULT_JVM_OPTS='"-Xmx64m" "-Xms64m"'
+
# Collect all arguments for the java command;
# * $DEFAULT_JVM_OPTS, $JAVA_OPTS, and $GRADLE_OPTS can contain fragments of
# shell script including quotes and variable substitutions, so put them in
diff --git a/scripts/cnn_variant_cromwell_tests/README.md b/scripts/cnn_variant_cromwell_tests/README.md
deleted file mode 100644
index c137747ab65..00000000000
--- a/scripts/cnn_variant_cromwell_tests/README.md
+++ /dev/null
@@ -1,7 +0,0 @@
-# CNN Variant Automated Tests for WDL
-
-**This directory is for GATK devs only**
-
-This directory contains scripts for running CNN Variant WDL tests in the automated build environment.
-
-Please note that this only tests whether the WDL will complete successfully.
diff --git a/scripts/cnn_variant_cromwell_tests/run_cnn_variant_wdl.sh b/scripts/cnn_variant_cromwell_tests/run_cnn_variant_wdl.sh
deleted file mode 100644
index 18a4d824c3d..00000000000
--- a/scripts/cnn_variant_cromwell_tests/run_cnn_variant_wdl.sh
+++ /dev/null
@@ -1,44 +0,0 @@
-#!/bin/bash -l
-set -e
-#cd in the directory of the script in order to use relative paths
-script_path=$( cd "$(dirname "${BASH_SOURCE}")" ; pwd -P )
-cd "$script_path"
-
-WORKING_DIR=/home/runner/work/gatk
-
-set -e
-echo "Building docker image for CNN WDL tests (skipping unit tests)..."
-
-#assume Dockerfile is in root
-echo "Building docker without running unit tests... ========="
-cd $WORKING_DIR/gatk
-
-# IMPORTANT: This code is duplicated in the cnv and M2 WDL test.
-if [ ! -z "$CI_PULL_REQUEST" ]; then
- HASH_TO_USE=FETCH_HEAD
- sudo bash build_docker.sh -e ${HASH_TO_USE} -s -u -d $PWD/temp_staging/ -t ${CI_PULL_REQUEST};
- echo "using fetch head:"$HASH_TO_USE
-else
- HASH_TO_USE=${CI_COMMIT}
- sudo bash build_docker.sh -e ${HASH_TO_USE} -s -u -d $PWD/temp_staging/;
- echo "using travis commit:"$HASH_TO_USE
-fi
-echo "Docker build done =========="
-
-cd $WORKING_DIR/gatk/scripts/
-sed -r "s/__GATK_DOCKER__/broadinstitute\/gatk\:$HASH_TO_USE/g" cnn_variant_wdl/jsons/cnn_score_variants_travis.json >$WORKING_DIR/cnn_score_variants_travis.json
-sed -r "s/__GATK_DOCKER__/broadinstitute\/gatk\:$HASH_TO_USE/g" cnn_variant_wdl/jsons/cnn_score_variants_travis_1d.json >$WORKING_DIR/cnn_score_variants_travis_1d.json
-sed -r "s/__GATK_DOCKER__/broadinstitute\/gatk\:$HASH_TO_USE/g" cnn_variant_wdl/jsons/cram2filtered_travis.json >$WORKING_DIR/cram2filtered_travis.json
-echo "JSON FILES (modified) ======="
-cat $WORKING_DIR/cnn_score_variants_travis.json
-cat $WORKING_DIR/cnn_score_variants_travis_1d.json
-cat $WORKING_DIR/cram2filtered_travis.json
-echo "=================="
-
-
-echo "Running CNN Score Variants WDL through cromwell"
-ln -fs $WORKING_DIR/gatk/scripts/cnn_variant_wdl/cnn_score_variants.wdl
-cd $WORKING_DIR/gatk/scripts/cnn_variant_wdl/
-java -jar $CROMWELL_JAR run cnn_score_variants.wdl -i $WORKING_DIR/cnn_score_variants_travis_1d.json
-java -jar $CROMWELL_JAR run cnn_score_variants.wdl -i $WORKING_DIR/cnn_score_variants_travis.json
-java -jar $CROMWELL_JAR run cram2filtered.wdl -i $WORKING_DIR/cram2filtered_travis.json
diff --git a/scripts/cnn_variant_wdl/README.md b/scripts/cnn_variant_wdl/README.md
deleted file mode 100644
index e3b19930d4b..00000000000
--- a/scripts/cnn_variant_wdl/README.md
+++ /dev/null
@@ -1,68 +0,0 @@
-# gatk4-cnn-variant-filter
-
-### Purpose :
-These workflows take advantage of GATK's CNN tool which uses a deep learning
-approach to filter variants based on Convolutional Neural Networks.
-
-Please read the following post to learn more about the CNN tool: [Deep Learning in GATK4](https://github.com/broadinstitute/gatk-docs/blob/3333b5aacfd3c48a87b60047395e1febc98c21f9/blog-2012-to-2019/2017-12-21-Deep_learning_in_GATK4.md).
-
-### cram2filtered.wdl
-This workflow takes an input CRAM/BAM to call variants with HaplotypeCaller
-then filters the calls with the CNNScoreVariant neural net tool using the filtering model specified.
-
-The site-level scores are added to the `INFO` field of the VCF. The architecture arguments,
-`info_key` and `tensor_type` arguments MUST be in agreement (e.g. 2D models must have
-`tensor_type` of `read_tensor` and `info_key` of `CNN_2D`, 1D models must have `tensor_type` of
-`reference` and `info_key` of `CNN_1D`). The `INFO` field key will be `CNN_1D` or `CNN_2D`
-depending on the neural net architecture used for inference. The architecture arguments
-specify pre-trained networks. New networks can be trained by the GATK tools: CNNVariantWriteTensors
-and CNNVariantTrain. The CRAM could be generated by the [single-sample pipeline](https://github.com/gatk-workflows/gatk4-data-processing/blob/master/processing-for-variant-discovery-gatk4.wdl).
-If you would like test to the workflow on a more representative example file, use the following
-CRAM file as input and change the scatter count from 4 to 200: gs://gatk-best-practices/cnn-h38/NA12878_NA12878_IntraRun_1_SM-G947Y_v1.cram.
-
-#### Requirements/expectations :
- - CRAM/BAM
- - BAM Index (if input is BAM)
-
-#### Output :
- - Filtered VCF and its index.
-
-### cram2model.wdl
-This **optional** workflow is for advanced users who would like to train a CNN model for filtering variants for specific use cases (e.g. custom panels, non-human, or non-Illumina sequencing).
-
-#### Requirements/expectations :
- - CRAM
- - Truth VCF and its index
- - Truth Confidence Interval Bed
-
-#### Output :
- - Model HD5
- - Model JSON
- - Model Plots PNG
-
-### run_happy.wdl
-This **optional** evaluation and plotting workflow runs a filtering model against truth data (e.g. [NIST Genomes in a Bottle](https://github.com/genome-in-a-bottle/giab_latest_release), [Synthic Diploid Truth Set](https://github.com/lh3/CHM-eval/releases) ) and plots the accuracy.
-
-#### Requirements/expectations :
- - File of VCF Files
- - Truth VCF and its index
- - Truth Confidence Interval Bed
-
-#### Output :
- - Evaluation summary
- - Plots
-
-### Important Notes :
-- Runtime parameters are optimized for Broad's Google Cloud Platform implementation.
-- For help running workflows on the Google Cloud Platform or locally please
-view the following tutorial [(How to) Execute Workflows from the gatk-workflows Git Organization](https://gatk.broadinstitute.org/hc/en-us/articles/360035530952).
-- Please visit the [User Guide](https://gatk.broadinstitute.org/hc/en-us/categories/360002310591) site for further documentation on our workflows and tools.
-
-### Contact Us :
-- The following material is provided by the Data Science Platforum group at the Broad Institute. Please direct any questions or concerns to one of our forum sites : [GATK](https://gatk.broadinstitute.org/hc/en-us/community/topics) or [Terra](https://support.terra.bio/hc/en-us/community/topics/360000500432).
-
-### LICENSING :
- This script is released under the GATK source code license (Apache 2.0) (see LICENSE in
- https://github.com/broadinstitute/gatk). Note however that the programs it calls may
- be subject to different licenses. Users are responsible for checking that they are
- authorized to run all programs before running this script.
diff --git a/scripts/cnn_variant_wdl/cnn_score_variants.wdl b/scripts/cnn_variant_wdl/cnn_score_variants.wdl
deleted file mode 100644
index 9b0abb4223f..00000000000
--- a/scripts/cnn_variant_wdl/cnn_score_variants.wdl
+++ /dev/null
@@ -1,113 +0,0 @@
-# The CNNScoreVariants tool annotates a VCF with scores from a Neural Net as part of a single-sample workflow.
-# The site-level scores are added to the INFO field of the VCF.
-# The architecture arguments, info_key and tensor type arguments MUST be in agreement
-# (e.g. 2D models must have tensor_type of read_tensor and info_key CNN_2D, 1D models have tensor_type reference and info_key CNN_1D)
-# The INFO field key will be "1D_CNN" or "2D_CNN" depending on the neural net architecture used for inference.
-# The architecture arguments specify pre-trained networks.
-# New networks can be trained by the GATK tools: CNNVariantWriteTensors and CNNVariantTrain
-# The bam file and index are only required by 2D CNNs which take a read-level tensor_type such as "read_tensor".
-# For 1D CNNs the tensor_type is typically "reference".
-# Parallelization over sites is controlled by the scatter_count variable.
-
-import "cnn_variant_common_tasks.wdl" as CNNTasks
-
-workflow CNNScoreVariantsWorkflow {
- File input_vcf # The VCF to annotate with scores
- File input_vcf_index
- File reference_fasta
- File reference_dict
- File reference_fasta_index
- Array[File] resources # List of VCF file names of resources of known SNPs and INDELs, (e.g. mills, gnomAD)
- Array[File] resources_index # List of VCF file indices of resources
- File? bam_file # Bam (or HaplotypeCaller-generated "bamout") file from which input_vcf was called, required by read-level architectures
- File? bam_file_index
- File? architecture_json # Neural Net configuration for CNNScoreVariants
- File? architecture_hd5 # Pre-Trained weights and architecture for CNNScoreVariants
- String? tensor_type # Keyword indicating the shape of the input tensor (e.g. read_tensor, reference)
- String info_key # The score key for the INFO field of the vcf (e.g. CNN_1D, CNN_2D)
- String snp_tranches # Filtering threshold(s) for SNPs in terms of sensitivity to overlapping known variants in resources
- String indel_tranches # Filtering threshold(s) for INDELs in terms of sensitivity to overlapping known variants in resources
- String? filter_tranches_extra # Additional arguments for filter variant tranches
- String output_prefix # Identifying string for this run which will be used to name output files (the gzipped VCF and, for the 2D CNN, bamout)
- Int? inference_batch_size # Batch size for python in CNNScoreVariants
- Int? transfer_batch_size # Batch size for java transfers to python in CNNScoreVariants
- Int? intra_op_threads # Tensorflow threading within nodes
- Int? inter_op_threads # Tensorflow threading between nodes
- File? gatk_override
- String gatk_docker
- File calling_intervals
- Int scatter_count
- Int? preemptible_attempts
- Int? cnn_task_mem_gb
- Int? cnn_task_cpu
- Int? mem_gb
-
- call CNNTasks.SplitIntervals {
- input:
- gatk_override = gatk_override,
- scatter_count = scatter_count,
- intervals = calling_intervals,
- ref_fasta = reference_fasta,
- ref_dict = reference_dict,
- ref_fai = reference_fasta_index,
- preemptible_attempts = preemptible_attempts,
- gatk_docker = gatk_docker
- }
-
- scatter (calling_interval in SplitIntervals.interval_files) {
-
- call CNNTasks.CNNScoreVariants {
- input:
- input_vcf = input_vcf,
- input_vcf_index = input_vcf_index,
- reference_fasta = reference_fasta,
- reference_dict = reference_dict,
- reference_fasta_index = reference_fasta_index,
- bam_file = bam_file,
- bam_file_index = bam_file_index,
- architecture_json = architecture_json,
- architecture_hd5 = architecture_hd5,
- tensor_type = tensor_type,
- inference_batch_size = inference_batch_size,
- transfer_batch_size = transfer_batch_size,
- intra_op_threads = intra_op_threads,
- inter_op_threads = inter_op_threads,
- output_prefix = output_prefix,
- interval_list = calling_interval,
- gatk_override = gatk_override,
- gatk_docker = gatk_docker,
- preemptible_attempts = preemptible_attempts,
- mem_gb = cnn_task_mem_gb,
- cpu = cnn_task_cpu
- }
- }
-
- call CNNTasks.MergeVCFs as MergeVCF_CNN {
- input:
- input_vcfs = CNNScoreVariants.cnn_annotated_vcf,
- output_prefix = output_prefix,
- preemptible_attempts = preemptible_attempts,
- gatk_override = gatk_override,
- gatk_docker = gatk_docker
- }
-
- call CNNTasks.FilterVariantTranches {
- input:
- input_vcf = MergeVCF_CNN.merged_vcf,
- input_vcf_index = MergeVCF_CNN.merged_vcf_index,
- resources = resources,
- resources_index = resources_index,
- output_prefix = output_prefix,
- snp_tranches = snp_tranches,
- indel_tranches = indel_tranches,
- info_key = info_key,
- extra_args = filter_tranches_extra,
- gatk_override = gatk_override,
- preemptible_attempts = preemptible_attempts,
- gatk_docker = gatk_docker
- }
-
- output {
- FilterVariantTranches.*
- }
-}
diff --git a/scripts/cnn_variant_wdl/cnn_variant_common_tasks.wdl b/scripts/cnn_variant_wdl/cnn_variant_common_tasks.wdl
deleted file mode 100644
index 5e4d468f36f..00000000000
--- a/scripts/cnn_variant_wdl/cnn_variant_common_tasks.wdl
+++ /dev/null
@@ -1,359 +0,0 @@
-task CNNScoreVariants {
- File input_vcf
- File input_vcf_index
- File reference_fasta
- File reference_dict
- File reference_fasta_index
- String output_prefix
- File? bam_file
- File? bam_file_index
- File? architecture_json
- File? architecture_hd5
- Int? inference_batch_size
- Int? transfer_batch_size
- Int? intra_op_threads
- Int? inter_op_threads
- String? tensor_type
-
- File interval_list
- File? gatk_override
-
- # Runtime parameters
- Int? mem_gb
- String gatk_docker
- Int? preemptible_attempts
- Int? disk_space_gb
- Int? cpu
-
- # You may have to change the following two parameter values depending on the task requirements
- Int default_ram_mb = 6000
- Int default_disk_space_gb = 100
-
- # Mem is in units of GB but our command and memory runtime values are in MB
- Int machine_mem = if defined(mem_gb) then mem_gb *1000 else default_ram_mb
- Int command_mem = machine_mem / 2
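As a worked example of the memory arithmetic above (the figures are illustrative, and reading the halved heap as headroom for the tool's Python/TensorFlow side process is an assumption, not stated in the original):

    # mem_gb unset -> machine_mem = 6000 MB, command_mem (Java heap) = 3000 MB
    # mem_gb = 8   -> machine_mem = 8000 MB, command_mem (Java heap) = 4000 MB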
-
-command <<<
-
- set -e
- export GATK_LOCAL_JAR=${default="/root/gatk.jar" gatk_override}
-
- gatk --java-options "-Xmx${command_mem}m" \
- CNNScoreVariants \
- ${"-I " + bam_file} \
- ${"--read-index " + bam_file_index} \
- -R ${reference_fasta} \
- -V ${input_vcf} \
- -O ${output_prefix}_cnn_annotated.vcf.gz \
- -L ${interval_list} \
- ${"--architecture " + architecture_json} \
- ${"--tensor-type " + tensor_type} \
- ${"--inference-batch-size " + inference_batch_size} \
- ${"--transfer-batch-size " + transfer_batch_size} \
- ${"--intra-op-threads " + intra_op_threads} \
- ${"--inter-op-threads " + inter_op_threads}
-
->>>
-
- runtime {
- docker: "${gatk_docker}"
- memory: machine_mem + " MB"
- disks: "local-disk " + select_first([disk_space_gb, default_disk_space_gb]) + " HDD"
- preemptible: select_first([preemptible_attempts, 3])
- cpu: select_first([cpu, 1])
- zones: "us-central1-b" # Needs to be a zone that guarantees CPUs with AVX see (https://cloud.google.com/compute/docs/regions-zones/)
- bootDiskSizeGb: "16"
- }
-
- output {
- Array[File] log = glob("gatkStreamingProcessJournal*")
- File cnn_annotated_vcf = "${output_prefix}_cnn_annotated.vcf.gz"
- File cnn_annotated_vcf_index = "${output_prefix}_cnn_annotated.vcf.gz.tbi"
- }
-}
-
-task RunHC4 {
- File input_bam
- File input_bam_index
- File reference_fasta
- File reference_dict
- File reference_fasta_index
- String output_prefix
- File interval_list
- String extra_args
- File? gatk_override
-
- # Runtime parameters
- Int? mem_gb
- String gatk_docker
- Int? preemptible_attempts
- Int disk_space_gb
- Int? cpu
-
- # You may have to change the following parameter value depending on the task requirements
- Int default_ram_mb = 8000
-
- # Mem is in units of GB but our command and memory runtime values are in MB
- Int machine_mem = if defined(mem_gb) then mem_gb *1000 else default_ram_mb
- Int command_mem = machine_mem - 1000
-
- command {
- set -e
- export GATK_LOCAL_JAR=${default="/root/gatk.jar" gatk_override}
-
- gatk --java-options "-Xmx${command_mem}m" \
- HaplotypeCaller \
- -R ${reference_fasta} \
- -I ${input_bam} \
- --read-index ${input_bam_index} \
- -O ${output_prefix}_hc4.vcf.gz \
- -L ${interval_list} \
- -bamout ${output_prefix}_bamout.bam \
- ${extra_args}
- }
-
- output {
- File bamout = "${output_prefix}_bamout.bam"
- File bamout_index = "${output_prefix}_bamout.bai"
- File raw_vcf = "${output_prefix}_hc4.vcf.gz"
- File raw_vcf_index = "${output_prefix}_hc4.vcf.gz.tbi"
- }
- runtime {
- docker: "${gatk_docker}"
- memory: machine_mem + " MB"
- # Note that the space before SSD and HDD should be included.
- disks: "local-disk " + sub(disk_space_gb, "\\..*", "") + " HDD"
- preemptible: select_first([preemptible_attempts, 3])
- cpu: select_first([cpu, 1])
- bootDiskSizeGb: "16"
- }
-}
-
-
-task FilterVariantTranches {
- File input_vcf
- File input_vcf_index
- Array[File] resources
- Array[File] resources_index
- String output_prefix
- String snp_tranches
- String indel_tranches
- String info_key
- String? extra_args
- File? gatk_override
-
- # Runtime parameters
- Int? mem_gb
- String gatk_docker
- Int? preemptible_attempts
- Int? disk_space_gb
- Int? cpu
-
- String output_vcf = "${output_prefix}_cnn_filtered.vcf.gz"
-
- # You may have to change the following two parameter values depending on the task requirements
- Int default_ram_mb = 16000
- # WARNING: In the workflow, you should calculate the disk space as an input to this task (disk_space_gb).
- Int default_disk_space_gb = 200
-
- # Mem is in units of GB but our command and memory runtime values are in MB
- Int machine_mem = if defined(mem_gb) then mem_gb *1000 else default_ram_mb
- Int command_mem = machine_mem - 1000
-
-command <<<
- set -e
- export GATK_LOCAL_JAR=${default="/root/gatk.jar" gatk_override}
-
- gatk --java-options "-Xmx${command_mem}m" \
- FilterVariantTranches \
- -V ${input_vcf} \
- --output ${output_vcf} \
- -resource ${sep=" -resource " resources} \
- -info-key ${info_key} \
- ${snp_tranches} \
- ${indel_tranches} \
- ${extra_args}
->>>
-
- runtime {
- docker: "${gatk_docker}"
- memory: machine_mem + " MB"
- # Note that the space before SSD and HDD should be included.
- disks: "local-disk " + select_first([disk_space_gb, default_disk_space_gb]) + " HDD"
- preemptible: select_first([preemptible_attempts, 3])
- cpu: select_first([cpu, 1])
- bootDiskSizeGb: "16"
- }
-
- output {
- File cnn_filtered_vcf = "${output_vcf}"
- File cnn_filtered_vcf_index = "${output_vcf}.tbi"
- }
-}
-
-task SplitIntervals {
- # inputs
- File? intervals
- File ref_fasta
- File ref_fai
- File ref_dict
- Int scatter_count
- String? split_intervals_extra_args
-
- File? gatk_override
-
- # runtime
- String gatk_docker
- Int? mem
- Int? preemptible_attempts
- Int? disk_space
- Int? cpu
-
- # Mem is in units of GB but our command and memory runtime values are in MB
- Int machine_mem = if defined(mem) then mem * 1000 else 3500
- Int command_mem = machine_mem - 500
-
- command {
- set -e
- export GATK_LOCAL_JAR=${default="/root/gatk.jar" gatk_override}
-
- gatk --java-options "-Xmx${command_mem}m" \
- SplitIntervals \
- -R ${ref_fasta} \
- ${"-L " + intervals} \
- -scatter ${scatter_count} \
- -O ./ \
- ${split_intervals_extra_args}
- }
-
- runtime {
- docker: "${gatk_docker}"
- memory: machine_mem + " MB"
- disks: "local-disk " + select_first([disk_space, 100]) + " HDD"
- preemptible: select_first([preemptible_attempts, 10])
- cpu: select_first([cpu, 1])
- bootDiskSizeGb: "16"
- }
-
- output {
- Array[File] interval_files = glob("*.interval_list")
- }
-}
-
-task MergeVCFs {
- # inputs
- Array[File] input_vcfs
- String output_prefix
-
- File? gatk_override
-
- # runtime
- String gatk_docker
- Int? mem
- Int? preemptible_attempts
- Int? disk_space_gb
- Int? cpu
-
- String output_vcf = "${output_prefix}_cnn_scored.vcf.gz"
-
- Int default_disk_space_gb = 100
- # Mem is in units of GB but our command and memory runtime values are in MB
- Int machine_mem = if defined(mem) then mem * 1000 else 3500
- Int command_mem = machine_mem - 1000
-
- command {
- set -e
- export GATK_LOCAL_JAR=${default="/root/gatk.jar" gatk_override}
- gatk --java-options "-Xmx${command_mem}m" MergeVcfs \
- -I ${sep=' -I ' input_vcfs} -O "${output_vcf}"
- }
-
- runtime {
- docker: "${gatk_docker}"
- memory: machine_mem + " MB"
- disks: "local-disk " + select_first([disk_space_gb, default_disk_space_gb]) + " HDD"
- preemptible: select_first([preemptible_attempts, 10])
- cpu: select_first([cpu, 1])
- bootDiskSizeGb: "16"
- }
-
- output {
- File merged_vcf = "${output_vcf}"
- File merged_vcf_index = "${output_vcf}.tbi"
- }
-}
-
-task CramToBam {
- File reference_fasta
- File reference_fasta_index
- File reference_dict
- File cram_file
- String output_prefix
-
- # Runtime parameters
- Int? mem_gb
- Int? preemptible_attempts
- Int disk_space_gb
- Int? cpu
-
- # You may have to change the following parameter value depending on the task requirements
- Int default_ram_mb = 16000
-
-
- # Mem is in units of GB but our command and memory runtime values are in MB
- Int machine_mem = if defined(mem_gb) then mem_gb *1000 else default_ram_mb
- Int command_mem = machine_mem - 1000
-
-command <<<
- ls -ltr ${cram_file} ${reference_fasta} &&
- echo "ls (1): complete" &&
- samtools view -h -T ${reference_fasta} ${cram_file} |
- samtools view -b -o ${output_prefix}.bam - &&
- echo "samtools view: complete" &&
- ls -ltr . &&
- echo "ls (2): complete" &&
- samtools index -b ${output_prefix}.bam &&
- echo "samtools index: complete" &&
- ls -ltr . &&
- echo "ls (3): complete" &&
- mv ${output_prefix}.bam.bai ${output_prefix}.bai &&
- echo "mv: complete" &&
- ls -ltr . &&
- echo "ls (4): complete"
- >>>
- runtime {
- docker: "broadinstitute/genomes-in-the-cloud:2.1.1"
- memory: machine_mem + " MB"
-
- # Note that the space before SSD and HDD should be included.
- disks: "local-disk " + disk_space_gb + " HDD"
- preemptible: select_first([preemptible_attempts, 3])
- cpu: select_first([cpu, 1])
- }
-
- output {
- File output_bam = "${output_prefix}.bam"
- File output_bam_index = "${output_prefix}.bai"
- }
-}
-
-task SamtoolsMergeBAMs {
- Array[File] input_bams
- String output_prefix
- Int disk_space_gb
- command {
- samtools merge ${output_prefix}_bamout.bam ${sep=' ' input_bams}
- samtools index ${output_prefix}_bamout.bam ${output_prefix}_bamout.bai
- }
-
- output {
- File bamout = "${output_prefix}_bamout.bam"
- File bamout_index = "${output_prefix}_bamout.bai"
- }
-
- runtime {
- docker: "broadinstitute/genomes-in-the-cloud:2.1.1"
- memory: "16 GB"
- disks: "local-disk " + disk_space_gb + " HDD"
- }
-}
diff --git a/scripts/cnn_variant_wdl/cram2filtered.wdl b/scripts/cnn_variant_wdl/cram2filtered.wdl
deleted file mode 100755
index ed07fd4543d..00000000000
--- a/scripts/cnn_variant_wdl/cram2filtered.wdl
+++ /dev/null
@@ -1,158 +0,0 @@
-# This workflow takes an input CRAM to call variants with HaplotypeCaller
-# Then filters the calls with the CNNVariant neural net tool
-# The site-level scores are added to the INFO field of the VCF.
- # The architecture, info_key, and tensor_type arguments MUST be in agreement
- # (e.g. 2D models must have tensor_type of read_tensor and info_key CNN_2D, 1D models have tensor_type reference and info_key CNN_1D)
- # The INFO field key will be "CNN_1D" or "CNN_2D" depending on the neural net architecture used for inference.
-# The architecture arguments specify pre-trained networks.
-# New networks can be trained by the GATK tools: CNNVariantWriteTensors and CNNVariantTrain
-# The CRAM could be generated by the single-sample pipeline
-# (https://github.com/gatk-workflows/gatk4-data-processing/blob/master/processing-for-variant-discovery-gatk4.wdl)
-# Also accepts a BAM as the input file in which case a BAM index is required as well.
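Since the workflow itself does not check the pairing described above, a caller could derive the score key from the tensor type rather than passing both independently; a hypothetical WDL sketch, not part of the original workflow:

    String chosen_tensor_type = select_first([tensor_type, "reference"])
    String derived_info_key = if chosen_tensor_type == "read_tensor" then "CNN_2D" else "CNN_1D"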
-
-import "cnn_variant_common_tasks.wdl" as CNNTasks
-
-workflow Cram2FilteredVcf {
- File input_file # Aligned CRAM file or aligned BAM file
- File? input_file_index # Index for an aligned BAM file if that is the input, unneeded if input is a CRAM
- File reference_fasta
- File reference_dict
- File reference_fasta_index
- Array[File] resources # List of VCF file names of resources of known SNPs and INDELs, (e.g. mills, gnomAD)
- Array[File] resources_index # List of VCF file indices of resources
- File? architecture_json # Neural Net configuration for CNNScoreVariants
- File? architecture_hd5 # Pre-Trained weights and architecture for CNNScoreVariants
- Int? inference_batch_size # Batch size for python in CNNScoreVariants
- Int? transfer_batch_size # Batch size for java transfers to python in CNNScoreVariants
- Int? intra_op_threads # Tensorflow threading within nodes
- Int? inter_op_threads # Tensorflow threading between nodes
- String output_prefix # Identifying string for this run which will be used to name all output files
- String? tensor_type # What kind of tensors the Neural Net expects (e.g. reference, read_tensor)
- String info_key # The score key for the info field of the vcf (e.g. CNN_1D, CNN_2D)
- String snp_tranches # Filtering threshold(s) for SNPs in terms of sensitivity to overlapping known variants in resources
- String indel_tranches # Filtering threshold(s) for INDELs in terms of sensitivity to overlapping known variants in resources
- File? gatk_override # GATK jar file to override the one included in gatk_docker
- String gatk_docker
- File calling_intervals
- Int scatter_count # Number of shards for parallelization of HaplotypeCaller and CNNScoreVariants
- String extra_args # Extra arguments for HaplotypeCaller
-
- # Runtime parameters
- Int? mem_gb
- Int? preemptible_attempts
- Float? disk_space_gb
- Int? cpu
-
- Int? increase_disk_size
- Int additional_disk = select_first([increase_disk_size, 20])
- Float ref_size = size(reference_fasta, "GB") + size(reference_fasta_index, "GB") + size(reference_dict, "GB")
-
- # Check whether the input is a CRAM: stripping a ".bam" suffix leaves a CRAM filename unchanged, so only then is the BAM conversion needed
- if (basename(input_file) == basename(input_file, ".bam")){
- call CNNTasks.CramToBam {
- input:
- reference_fasta = reference_fasta,
- reference_dict = reference_dict,
- reference_fasta_index = reference_fasta_index,
- cram_file = input_file,
- output_prefix = output_prefix,
- disk_space_gb = round(4*size(input_file, "GB") + ref_size + additional_disk),
- preemptible_attempts = preemptible_attempts
- }
- }
-
- call CNNTasks.SplitIntervals {
- input:
- gatk_override = gatk_override,
- scatter_count = scatter_count,
- intervals = calling_intervals,
- ref_fasta = reference_fasta,
- ref_dict = reference_dict,
- ref_fai = reference_fasta_index,
- gatk_docker = gatk_docker,
- disk_space = round(additional_disk + ref_size)
- }
-
- String input_bam = select_first([CramToBam.output_bam, input_file])
- Float bam_size = size(input_bam, "GB")
-
- scatter (calling_interval in SplitIntervals.interval_files) {
- call CNNTasks.RunHC4 {
- input:
- input_bam = input_bam,
- input_bam_index = select_first([CramToBam.output_bam_index, input_file_index]),
- reference_fasta = reference_fasta,
- reference_dict = reference_dict,
- reference_fasta_index = reference_fasta_index,
- output_prefix = output_prefix,
- interval_list = calling_interval,
- gatk_docker = gatk_docker,
- gatk_override = gatk_override,
- preemptible_attempts = preemptible_attempts,
- extra_args = extra_args,
- disk_space_gb = round(bam_size + ref_size + additional_disk)
- }
-
- call CNNTasks.CNNScoreVariants {
- input:
- input_vcf = RunHC4.raw_vcf,
- input_vcf_index = RunHC4.raw_vcf_index,
- bam_file = RunHC4.bamout,
- bam_file_index = RunHC4.bamout_index,
- architecture_json = architecture_json,
- architecture_hd5 = architecture_hd5,
- reference_fasta = reference_fasta,
- tensor_type = tensor_type,
- inference_batch_size = inference_batch_size,
- transfer_batch_size = transfer_batch_size,
- intra_op_threads = intra_op_threads,
- inter_op_threads = inter_op_threads,
- reference_dict = reference_dict,
- reference_fasta_index = reference_fasta_index,
- output_prefix = output_prefix,
- interval_list = calling_interval,
- gatk_override = gatk_override,
- gatk_docker = gatk_docker,
- preemptible_attempts = preemptible_attempts,
- mem_gb = mem_gb,
- disk_space_gb = round((bam_size/scatter_count) + ref_size + additional_disk)
- }
- }
-
- call CNNTasks.MergeVCFs as MergeVCF_HC4 {
- input:
- input_vcfs = CNNScoreVariants.cnn_annotated_vcf,
- output_prefix = output_prefix,
- gatk_override = gatk_override,
- preemptible_attempts = preemptible_attempts,
- gatk_docker = gatk_docker,
- disk_space_gb = additional_disk
- }
-
- call CNNTasks.FilterVariantTranches {
- input:
- input_vcf = MergeVCF_HC4.merged_vcf,
- input_vcf_index = MergeVCF_HC4.merged_vcf_index,
- resources = resources,
- resources_index = resources_index,
- output_prefix = output_prefix,
- snp_tranches = snp_tranches,
- indel_tranches = indel_tranches,
- info_key = info_key,
- gatk_override = gatk_override,
- preemptible_attempts = preemptible_attempts,
- gatk_docker = gatk_docker,
- disk_space_gb = additional_disk
- }
-
- call CNNTasks.SamtoolsMergeBAMs {
- input:
- input_bams = RunHC4.bamout,
- output_prefix = output_prefix,
- disk_space_gb = round(bam_size + ref_size + additional_disk)
- }
-
- output {
- FilterVariantTranches.*
- }
-}
diff --git a/scripts/cnn_variant_wdl/cram2model.wdl b/scripts/cnn_variant_wdl/cram2model.wdl
deleted file mode 100755
index 7c932e7058a..00000000000
--- a/scripts/cnn_variant_wdl/cram2model.wdl
+++ /dev/null
@@ -1,242 +0,0 @@
-# CRAM to trained CNNVariant Model
-
-import "cnn_variant_common_tasks.wdl" as CNNTasks
-
-workflow Cram2TrainedModel {
- File input_cram
- File reference_fasta
- File reference_dict
- File reference_fasta_index
- File truth_vcf
- File truth_vcf_index
- File truth_bed
- String output_prefix
- String tensor_type
- Int epochs
- File calling_intervals
- Int scatter_count
- String extra_args
-
- # Runtime parameters
- File? gatk_override
- String gatk_docker
- Int? mem_gb
- Int? preemptible_attempts
- Int? disk_space_gb
- Int? cpu
-
- Int? increase_disk_size
- Int additional_disk = select_first([increase_disk_size, 20])
- Float ref_size = size(reference_fasta, "GB") + size(reference_fasta_index, "GB") + size(reference_dict, "GB")
-
- call CNNTasks.CramToBam {
- input:
- reference_fasta = reference_fasta,
- reference_dict = reference_dict,
- reference_fasta_index = reference_fasta_index,
- cram_file = input_cram,
- output_prefix = output_prefix,
- disk_space_gb = disk_space_gb,
- preemptible_attempts = preemptible_attempts
- }
-
- call CNNTasks.SplitIntervals {
- input:
- scatter_count = scatter_count,
- intervals = calling_intervals,
- ref_fasta = reference_fasta,
- ref_dict = reference_dict,
- ref_fai = reference_fasta_index,
- gatk_docker = gatk_docker,
- gatk_override = gatk_override,
- preemptible_attempts = preemptible_attempts
- }
-
- Float bam_size = size(CramToBam.output_bam, "GB")
-
- scatter (calling_interval in SplitIntervals.interval_files) {
- call CNNTasks.RunHC4 {
- input:
- input_bam = CramToBam.output_bam,
- input_bam_index = CramToBam.output_bam_index,
- reference_fasta = reference_fasta,
- reference_dict = reference_dict,
- reference_fasta_index = reference_fasta_index,
- output_prefix = output_prefix,
- interval_list = calling_interval,
- gatk_docker = gatk_docker,
- gatk_override = gatk_override,
- preemptible_attempts = preemptible_attempts,
- extra_args = extra_args,
- disk_space_gb = round(bam_size + ref_size + additional_disk)
- }
-
- call WriteTensors {
- input:
- input_vcf = RunHC4.raw_vcf,
- input_vcf_index = RunHC4.raw_vcf_index,
- input_bam = RunHC4.bamout,
- input_bam_index = RunHC4.bamout_index,
- truth_vcf = truth_vcf,
- truth_vcf_index = truth_vcf_index,
- truth_bed = truth_bed,
- tensor_type = tensor_type,
- reference_fasta = reference_fasta,
- reference_dict = reference_dict,
- reference_fasta_index = reference_fasta_index,
- output_prefix = output_prefix,
- interval_list = calling_interval,
- gatk_docker = gatk_docker,
- gatk_override = gatk_override,
- preemptible_attempts = preemptible_attempts,
- disk_space_gb = disk_space_gb
- }
- }
-
- call CNNTasks.MergeVCFs as MergeVCF_HC4 {
- input:
- input_vcfs = RunHC4.raw_vcf,
- output_prefix = output_prefix,
- gatk_override = gatk_override,
- gatk_docker = gatk_docker,
- preemptible_attempts = preemptible_attempts,
- disk_space_gb = disk_space_gb
- }
-
- call CNNTasks.SamtoolsMergeBAMs {
- input:
- input_bams = RunHC4.bamout,
- output_prefix = output_prefix,
- disk_space_gb = round(bam_size + ref_size + additional_disk)
- }
-
- call TrainModel {
- input:
- tar_tensors = WriteTensors.tensors,
- output_prefix = output_prefix,
- tensor_type = tensor_type,
- gatk_docker = gatk_docker,
- gatk_override = gatk_override,
- disk_space_gb = disk_space_gb,
- epochs = epochs
- }
-
- output {
- MergeVCF_HC4.*
- SamtoolsMergeBAMs.*
- TrainModel.*
- }
-
-}
-
-task WriteTensors {
- File input_bam
- File input_bam_index
- File input_vcf
- File input_vcf_index
- File reference_fasta
- File reference_dict
- File reference_fasta_index
- File truth_vcf
- File truth_vcf_index
- File truth_bed
- String output_prefix
- String tensor_type
- File interval_list
-
- # Runtime parameters
- String gatk_docker
- File? gatk_override
- Int? mem_gb
- Int? preemptible_attempts
- Int? disk_space_gb
- Int? cpu
-
- Int default_ram_mb = 8000
-
- # Mem is in units of GB but our command and memory runtime values are in MB
- Int machine_mem = if defined(mem_gb) then mem_gb *1000 else default_ram_mb
- Int command_mem = machine_mem - 1000
- command {
- set -e
- export GATK_LOCAL_JAR=${default="/root/gatk.jar" gatk_override}
-
- mkdir "/root/tensors/"
-
- gatk --java-options "-Xmx${command_mem}m" \
- CNNVariantWriteTensors \
- -R ${reference_fasta} \
- -V ${input_vcf} \
- -truth-vcf ${truth_vcf} \
- -truth-bed ${truth_bed} \
- -tensor-type ${tensor_type} \
- -output-tensor-dir "/root/tensors/" \
- -bam-file ${input_bam}
-
- tar -czf "tensors.tar.gz" "/root/tensors/"
- }
-
- output {
- File tensors = "tensors.tar.gz"
- }
- runtime {
- docker: "${gatk_docker}"
- memory: machine_mem + " MB"
- disks: "local-disk " + disk_space_gb + " SSD"
- }
-
-}
-
-task TrainModel {
- Array[File] tar_tensors
- String output_prefix
- String tensor_type
- Int epochs
-
- # Runtime parameters
- String gatk_docker
- File? gatk_override
- Int? mem_gb
- Int? preemptible_attempts
- Int? disk_space_gb
- Int? cpu
-
- Int default_ram_mb = 8000
-
- # Mem is in units of GB but our command and memory runtime values are in MB
- Int machine_mem = if defined(mem_gb) then mem_gb *1000 else default_ram_mb
- Int command_mem = machine_mem - 1000
- command {
- set -e
- export GATK_LOCAL_JAR=${default="/root/gatk.jar" gatk_override}
-
- for tensors in ${sep=' ' tar_tensors} ; do
- tar -xzf $tensors
- done
-
- gatk --java-options "-Xmx${command_mem}m" \
- CNNVariantTrain \
- -input-tensor-dir "./tensors/" \
- -model-name ${output_prefix} \
- -image-dir "./" \
- -tensor-type ${tensor_type} \
- -epochs ${epochs}
- }
-
- output {
- File model_json = "${output_prefix}.json"
- File model_hd5 = "${output_prefix}.hd5"
- File roc_png = "per_class_roc_${output_prefix}.png"
- File training_png = "metric_history_${output_prefix}.png"
- }
-
- runtime {
- docker: "${gatk_docker}"
- #gpuType: "nvidia-tesla-k80" # This will require PAPI v2 and CUDA on VM
- #gpuCount: 1
- #zones: ["us-central1-c"]
- memory: machine_mem + " MB"
- disks: "local-disk 400 SSD"
- bootDiskSizeGb: "16"
- }
-}
\ No newline at end of file
diff --git a/scripts/cnn_variant_wdl/happy_plot.R b/scripts/cnn_variant_wdl/happy_plot.R
deleted file mode 100644
index ca12915b23d..00000000000
--- a/scripts/cnn_variant_wdl/happy_plot.R
+++ /dev/null
@@ -1,79 +0,0 @@
-library(dplyr)
-library(ggplot2)
-library(reshape2)
-
-# Multiple plot function
-#
-# ggplot objects can be passed in ..., or to plotlist (as a list of ggplot objects)
-# - cols: Number of columns in layout
-# - layout: A matrix specifying the layout. If present, 'cols' is ignored.
-#
-# If the layout is something like matrix(c(1,2,3,3), nrow=2, byrow=TRUE),
-# then plot 1 will go in the upper left, 2 will go in the upper right, and
-# 3 will go all the way across the bottom.
-#
-multiplot <- function(..., plotlist=NULL, file, cols=1, layout=NULL) {
- library(grid)
-
- # Make a list from the ... arguments and plotlist
- plots <- c(list(...), plotlist)
-
- numPlots = length(plots)
-
- # If layout is NULL, then use 'cols' to determine layout
- if (is.null(layout)) {
- # Make the panel
- # ncol: Number of columns of plots
- # nrow: Number of rows needed, calculated from # of cols
- layout <- matrix(seq(1, cols * ceiling(numPlots/cols)),
- ncol = cols, nrow = ceiling(numPlots/cols))
- }
-
- if (numPlots==1) {
- print(plots[[1]])
-
- } else {
- # Set up the page
- grid.newpage()
- pushViewport(viewport(layout = grid.layout(nrow(layout), ncol(layout))))
-
- # Make each plot, in the correct location
- for (i in 1:numPlots) {
- # Get the i,j matrix positions of the regions that contain this subplot
- matchidx <- as.data.frame(which(layout == i, arr.ind = TRUE))
-
- print(plots[[i]], vp = viewport(layout.pos.row = matchidx$row,
- layout.pos.col = matchidx$col))
- }
- }
-}
-
-round_digits <- -2
-files <- list.files(pattern = "summary\\.csv$")
-dlist <- lapply(files, read.csv)
-names <- lapply(files, function(x) gsub("happy_", "", gsub(".summary.csv", "", x)))
-dnamed <- mapply(cbind, dlist, "Name"=names, SIMPLIFY=F)
-merged <- Reduce(function(...) merge(..., all=T), dnamed)
-
-names(merged) <- c( "Type", "Filter", "Total", "True Positives", "False Negatives", "QTotal", "False Positives", "Unknown", "Genotype Error", "Recall", "Precision", "NA", "F1 Score", "T TiTv" , "Q TiTv" , "T Het Hom" , "Q Het Hom", "Name")
-melted <- melt(merged, id.vars=c("Name", "Filter", "Type"))
-
-metrics <- subset(melted, variable%in%c("Recall", "Precision", "F1 Score"))
-p1 <- ggplot(metrics, aes(x=Name, y=value, color=Filter)) +
- geom_point(stat="identity", position = position_jitter(w = 0.06, h = 0)) +
- geom_text(aes(label=ifelse(Filter=="PASS", round(value, 3), "")), color="black", size=2.5, hjust=-0.4, vjust=0.5) +
- geom_text(aes(label=ifelse(Filter!="PASS", round(value, 3), "")), color="darkgrey", size=2.5, hjust=1.6, vjust=0.5) +
- facet_grid( variable ~ Type, scales="free_y" ) +
- ylab("Metrics") +
- theme(axis.text.x=element_text(angle=30, hjust = 1))
-
-counts <- subset(melted, variable%in%c("True Positives", "False Negatives", "False Positives"))
-p2 <- ggplot(counts, aes(x=Name, y=value, color=Filter)) +
- geom_point(stat="identity", position = position_jitter(w = 0.06, h = 0)) +
- facet_grid( variable ~ Type, scales="free_y" ) +
- ylab("Counts") +
- geom_text(aes(label=ifelse(Filter=="PASS", round(value, round_digits), "")), color="black", size=2.5, hjust=-0.4, vjust=0.5) +
- geom_text(aes(label=ifelse(Filter!="PASS", round(value, round_digits), "")), color="darkgrey", size=2.5, hjust=1.6, vjust=0.5) +
- theme(axis.text.x=element_text(angle=30, hjust = 1))
-
-ggsave(plot=multiplot(p1, p2, cols=2), filename = 'metrics.png', width=4, height=3, units="in")
\ No newline at end of file
diff --git a/scripts/cnn_variant_wdl/jsons/cnn_score_variants_b37.json b/scripts/cnn_variant_wdl/jsons/cnn_score_variants_b37.json
deleted file mode 100755
index 1724eb39a67..00000000000
--- a/scripts/cnn_variant_wdl/jsons/cnn_score_variants_b37.json
+++ /dev/null
@@ -1,30 +0,0 @@
-{
- "CNNScoreVariantsWorkflow.calling_intervals": "gs://broad-references/hg19/v0/wgs_calling_regions.v1.chr20.interval_list",
- "CNNScoreVariantsWorkflow.gatk_docker": "broadinstitute/gatk",
- "CNNScoreVariantsWorkflow.input_vcf": "gs://broad-dsde-methods-sam/cnn-variant/vcfs/chr20_tiny_tf_python_gpu.vcf.gz",
- "CNNScoreVariantsWorkflow.input_vcf_index": "gs://broad-dsde-methods-sam/cnn-variant/vcfs/chr20_tiny_tf_python_gpu.vcf.gz.tbi",
- "CNNScoreVariantsWorkflow.output_prefix": "g94982_b37_chr20_tiny",
- "CNNScoreVariantsWorkflow.reference_fasta": "gs://broad-references/hg19/v0/Homo_sapiens_assembly19.fasta",
- "CNNScoreVariantsWorkflow.reference_dict": "gs://broad-references/hg19/v0/Homo_sapiens_assembly19.dict",
- "CNNScoreVariantsWorkflow.reference_fasta_index": "gs://broad-references/hg19/v0/Homo_sapiens_assembly19.fasta.fai",
- "CNNScoreVariantsWorkflow.resources" : [
- "gs://broad-references/hg19/v0/hapmap_3.3.b37.vcf.gz",
- "gs://broad-references/hg19/v0/1000G_phase1.snps.high_confidence.b37.vcf.gz",
- "gs://broad-references/hg19/v0/Mills_and_1000G_gold_standard.indels.b37.vcf.gz"
- ],
- "CNNScoreVariantsWorkflow.resources_index" : [
- "gs://broad-references/hg19/v0/hapmap_3.3.b37.vcf.gz.tbi",
- "gs://broad-references/hg19/v0/1000G_phase1.snps.high_confidence.b37.vcf.gz.tbi",
- "gs://broad-references/hg19/v0/Mills_and_1000G_gold_standard.indels.b37.vcf.gz.tbi"
- ],
- "CNNScoreVariantsWorkflow.inference_batch_size": "16",
- "CNNScoreVariantsWorkflow.transfer_batch_size": "32",
- "CNNScoreVariantsWorkflow.tensor_type": "reference",
- "CNNScoreVariantsWorkflow.info_key": "CNN_1D",
- "CNNScoreVariantsWorkflow.snp_tranches": " --snp-tranche 99.9 ",
- "CNNScoreVariantsWorkflow.indel_tranches": " --indel-tranche 99.5 ",
- "CNNScoreVariantsWorkflow.scatter_count": "2",
- "CNNScoreVariantsWorkflow.cnn_task_mem_gb": "8",
- "CNNScoreVariantsWorkflow.cnn_task_cpu": "2",
- "CNNScoreVariantsWorkflow.preemptible_attempts": "20"
-}
diff --git a/scripts/cnn_variant_wdl/jsons/cnn_score_variants_travis.json b/scripts/cnn_variant_wdl/jsons/cnn_score_variants_travis.json
deleted file mode 100755
index 83439b5143d..00000000000
--- a/scripts/cnn_variant_wdl/jsons/cnn_score_variants_travis.json
+++ /dev/null
@@ -1,29 +0,0 @@
-{
- "CNNScoreVariantsWorkflow.bam_file": "/home/runner/work/gatk/gatk/src/test/resources/large/VQSR/g94982_b37_chr20_1m_895_bamout.bam",
- "CNNScoreVariantsWorkflow.bam_file_index": "/home/runner/work/gatk/gatk/src/test/resources/large/VQSR/g94982_b37_chr20_1m_895_bamout.bai",
- "CNNScoreVariantsWorkflow.calling_intervals": "/home/runner/work/gatk/gatk/src/test/resources/large/VQSR/contig20_1m_10m.interval_list",
- "CNNScoreVariantsWorkflow.gatk_docker": "__GATK_DOCKER__",
- "CNNScoreVariantsWorkflow.input_vcf": "/home/runner/work/gatk/gatk/src/test/resources/large/VQSR/g94982_b37_chr20_1m_895.vcf.gz",
- "CNNScoreVariantsWorkflow.input_vcf_index": "/home/runner/work/gatk/gatk/src/test/resources/large/VQSR/g94982_b37_chr20_1m_895.vcf.gz.tbi",
- "CNNScoreVariantsWorkflow.output_prefix": "g94982_b37_chr20_1m_895",
- "CNNScoreVariantsWorkflow.reference_fasta": "/home/runner/work/gatk/gatk/src/test/resources/large/human_g1k_v37.20.21.fasta",
- "CNNScoreVariantsWorkflow.reference_dict": "/home/runner/work/gatk/gatk/src/test/resources/large/human_g1k_v37.20.21.dict",
- "CNNScoreVariantsWorkflow.reference_fasta_index": "/home/runner/work/gatk/gatk/src/test/resources/large/human_g1k_v37.20.21.fasta.fai",
- "CNNScoreVariantsWorkflow.resources": [
- "/home/runner/work/gatk/gatk/src/test/resources/large/VQSR/ALL.wgs.indels_mills_devine_hg19_leftAligned_collapsed_double_hit.sites.20.1M-10M.vcf",
- "/home/runner/work/gatk/gatk/src/test/resources/large/VQSR/Omni25_sites_1525_samples.b37.20.1M-10M.vcf"
- ],
- "CNNScoreVariantsWorkflow.resources_index": [
- "/home/runner/work/gatk/gatk/src/test/resources/large/VQSR/ALL.wgs.indels_mills_devine_hg19_leftAligned_collapsed_double_hit.sites.20.1M-10M.vcf.idx",
- "/home/runner/work/gatk/gatk/src/test/resources/large/VQSR/Omni25_sites_1525_samples.b37.20.1M-10M.vcf.idx"
- ],
- "CNNScoreVariantsWorkflow.inference_batch_size": "1",
- "CNNScoreVariantsWorkflow.transfer_batch_size": "2",
- "CNNScoreVariantsWorkflow.intra_op_threads": 0,
- "CNNScoreVariantsWorkflow.inter_op_threads": 0,
- "CNNScoreVariantsWorkflow.tensor_type": "read_tensor",
- "CNNScoreVariantsWorkflow.info_key": "CNN_2D",
- "CNNScoreVariantsWorkflow.snp_tranches": " --snp-tranche 99.0 ",
- "CNNScoreVariantsWorkflow.indel_tranches": " --indel-tranche 99.0 ",
- "CNNScoreVariantsWorkflow.scatter_count": "2"
-}
diff --git a/scripts/cnn_variant_wdl/jsons/cnn_score_variants_travis_1d.json b/scripts/cnn_variant_wdl/jsons/cnn_score_variants_travis_1d.json
deleted file mode 100755
index dc1ec66f4f5..00000000000
--- a/scripts/cnn_variant_wdl/jsons/cnn_score_variants_travis_1d.json
+++ /dev/null
@@ -1,27 +0,0 @@
-{
- "CNNScoreVariantsWorkflow.bam_file": "/home/runner/work/gatk/gatk/src/test/resources/large/VQSR/g94982_b37_chr20_1m_895_bamout.bam",
- "CNNScoreVariantsWorkflow.bam_file_index": "/home/runner/work/gatk/gatk/src/test/resources/large/VQSR/g94982_b37_chr20_1m_895_bamout.bai",
- "CNNScoreVariantsWorkflow.calling_intervals": "/home/runner/work/gatk/gatk/src/test/resources/large/VQSR/contig20_1m_10m.interval_list",
- "CNNScoreVariantsWorkflow.gatk_docker": "__GATK_DOCKER__",
- "CNNScoreVariantsWorkflow.input_vcf": "/home/runner/work/gatk/gatk/src/test/resources/large/VQSR/g94982_b37_chr20_1m_895.vcf.gz",
- "CNNScoreVariantsWorkflow.input_vcf_index": "/home/runner/work/gatk/gatk/src/test/resources/large/VQSR/g94982_b37_chr20_1m_895.vcf.gz.tbi",
- "CNNScoreVariantsWorkflow.output_prefix": "g94982_b37_chr20_1m_895",
- "CNNScoreVariantsWorkflow.reference_fasta": "/home/runner/work/gatk/gatk/src/test/resources/large/human_g1k_v37.20.21.fasta",
- "CNNScoreVariantsWorkflow.reference_dict": "/home/runner/work/gatk/gatk/src/test/resources/large/human_g1k_v37.20.21.dict",
- "CNNScoreVariantsWorkflow.reference_fasta_index": "/home/runner/work/gatk/gatk/src/test/resources/large/human_g1k_v37.20.21.fasta.fai",
- "CNNScoreVariantsWorkflow.resources": [
- "/home/runner/work/gatk/gatk/src/test/resources/large/VQSR/ALL.wgs.indels_mills_devine_hg19_leftAligned_collapsed_double_hit.sites.20.1M-10M.vcf",
- "/home/runner/work/gatk/gatk/src/test/resources/large/VQSR/Omni25_sites_1525_samples.b37.20.1M-10M.vcf"
- ],
- "CNNScoreVariantsWorkflow.resources_index": [
- "/home/runner/work/gatk/gatk/src/test/resources/large/VQSR/ALL.wgs.indels_mills_devine_hg19_leftAligned_collapsed_double_hit.sites.20.1M-10M.vcf.idx",
- "/home/runner/work/gatk/gatk/src/test/resources/large/VQSR/Omni25_sites_1525_samples.b37.20.1M-10M.vcf.idx"
- ],
- "CNNScoreVariantsWorkflow.inference_batch_size": "1",
- "CNNScoreVariantsWorkflow.transfer_batch_size": "2",
- "CNNScoreVariantsWorkflow.tensor_type": "reference",
- "CNNScoreVariantsWorkflow.info_key": "CNN_1D",
- "CNNScoreVariantsWorkflow.snp_tranches": " --snp-tranche 99.9 ",
- "CNNScoreVariantsWorkflow.indel_tranches": " --indel-tranche 99.5 ",
- "CNNScoreVariantsWorkflow.scatter_count": "2"
-}
diff --git a/scripts/cnn_variant_wdl/jsons/cram2filtered.json b/scripts/cnn_variant_wdl/jsons/cram2filtered.json
deleted file mode 100755
index 5272c4398fe..00000000000
--- a/scripts/cnn_variant_wdl/jsons/cram2filtered.json
+++ /dev/null
@@ -1,29 +0,0 @@
-{
- "Cram2FilteredVcf.input_file": "gs://gatk-test-data/wgs_cram/NA12878_20k_hg38/NA12878.cram",
- "Cram2FilteredVcf.reference_fasta": "gs://broad-references/hg38/v0/Homo_sapiens_assembly38.fasta",
- "Cram2FilteredVcf.reference_dict": "gs://broad-references/hg38/v0/Homo_sapiens_assembly38.dict",
- "Cram2FilteredVcf.reference_fasta_index": "gs://broad-references/hg38/v0/Homo_sapiens_assembly38.fasta.fai",
- "Cram2FilteredVcf.resources" : [
- "gs://broad-references/hg38/v0/hapmap_3.3.hg38.vcf.gz",
- "gs://broad-references/hg38/v0/1000G_phase1.snps.high_confidence.hg38.vcf.gz",
- "gs://broad-references/hg38/v0/Mills_and_1000G_gold_standard.indels.hg38.vcf.gz"
- ],
- "Cram2FilteredVcf.resources_index" : [
- "gs://broad-references/hg38/v0/hapmap_3.3.hg38.vcf.gz.tbi",
- "gs://broad-references/hg38/v0/1000G_phase1.snps.high_confidence.hg38.vcf.gz.tbi",
- "gs://broad-references/hg38/v0/Mills_and_1000G_gold_standard.indels.hg38.vcf.gz.tbi"
- ],
- "Cram2FilteredVcf.output_prefix": "hg38_20k_na12878",
- "Cram2FilteredVcf.info_key": "CNN_2D",
- "Cram2FilteredVcf.snp_tranches": " --snp-tranche 99.9 ",
- "Cram2FilteredVcf.indel_tranches": " --indel-tranche 99.5 ",
- "Cram2FilteredVcf.tensor_type":"read_tensor",
- "Cram2FilteredVcf.calling_intervals": "gs://broad-references/hg38/v0/wgs_calling_regions.hg38.interval_list",
- "Cram2FilteredVcf.gatk_docker": "broadinstitute/gatk",
- "Cram2FilteredVcf.preemptible_attempts": 10,
- "Cram2FilteredVcf.inference_batch_size": 8,
- "Cram2FilteredVcf.transfer_batch_size": 32,
- "Cram2FilteredVcf.mem_gb": 7,
- "Cram2FilteredVcf.extra_args": "-stand-call-conf 0 -A Coverage -A ChromosomeCounts -A BaseQuality -A FragmentLength -A MappingQuality -A ReadPosition ",
- "Cram2FilteredVcf.scatter_count": 4
-}
diff --git a/scripts/cnn_variant_wdl/jsons/cram2filtered_travis.json b/scripts/cnn_variant_wdl/jsons/cram2filtered_travis.json
deleted file mode 100755
index f7f07f50cd7..00000000000
--- a/scripts/cnn_variant_wdl/jsons/cram2filtered_travis.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
- "Cram2FilteredVcf.input_file": "/home/runner/work/gatk/gatk/src/test/resources/large/CEUTrio.HiSeq.WGS.b37.NA12878.20.21.cram",
- "Cram2FilteredVcf.reference_fasta": "/home/runner/work/gatk/gatk/src/test/resources/large/human_g1k_v37.20.21.fasta",
- "Cram2FilteredVcf.reference_dict": "/home/runner/work/gatk/gatk/src/test/resources/large/human_g1k_v37.20.21.dict",
- "Cram2FilteredVcf.reference_fasta_index": "/home/runner/work/gatk/gatk/src/test/resources/large/human_g1k_v37.20.21.fasta.fai",
- "Cram2FilteredVcf.resources" : ["/home/runner/work/gatk/gatk/src/test/resources/large/dbsnp_138.b37.20.21.vcf"],
- "Cram2FilteredVcf.resources_index" : ["/home/runner/work/gatk/gatk/src/test/resources/large/dbsnp_138.b37.20.21.vcf.idx"],
- "Cram2FilteredVcf.output_prefix": "na12878_b37_20_21",
- "Cram2FilteredVcf.info_key": "CNN_2D",
- "Cram2FilteredVcf.snp_tranches": " --snp-tranche 99.9 ",
- "Cram2FilteredVcf.indel_tranches": " --indel-tranche 99.5 ",
- "Cram2FilteredVcf.tensor_type":"read_tensor",
- "Cram2FilteredVcf.calling_intervals": "/home/runner/work/gatk/gatk/src/test/resources/large/VQSR/contig20_1m_10m.interval_list",
- "Cram2FilteredVcf.gatk_docker": "__GATK_DOCKER__",
- "Cram2FilteredVcf.preemptible_attempts": 0,
- "Cram2FilteredVcf.inference_batch_size": 2,
- "Cram2FilteredVcf.transfer_batch_size": 4,
- "Cram2FilteredVcf.intra_op_threads": 0,
- "Cram2FilteredVcf.inter_op_threads": 0,
- "Cram2FilteredVcf.mem_gb": 7,
- "Cram2FilteredVcf.extra_args": "-stand-call-conf 0 -A Coverage -A ChromosomeCounts -A BaseQuality -A FragmentLength -A MappingQuality -A ReadPosition ",
- "Cram2FilteredVcf.scatter_count": 3
-}
diff --git a/scripts/cnn_variant_wdl/jsons/cram2model.json b/scripts/cnn_variant_wdl/jsons/cram2model.json
deleted file mode 100755
index a728ecf5b97..00000000000
--- a/scripts/cnn_variant_wdl/jsons/cram2model.json
+++ /dev/null
@@ -1,20 +0,0 @@
-{
- "Cram2TrainedModel.input_cram": "gs://broad-dsde-methods-sam/cnn-variant/bams/NA12878_PLUMBING.cram",
- "Cram2TrainedModel.reference_fasta": "gs://broad-references/hg38/v0/Homo_sapiens_assembly38.fasta",
- "Cram2TrainedModel.reference_dict": "gs://broad-references/hg38/v0/Homo_sapiens_assembly38.dict",
- "Cram2TrainedModel.reference_fasta_index": "gs://broad-references/hg38/v0/Homo_sapiens_assembly38.fasta.fai",
- "Cram2TrainedModel.output_prefix": "plumbing_na12878",
- "Cram2TrainedModel.tensor_type": "read_tensor",
- "Cram2TrainedModel.truth_vcf": "gs://broad-dsde-methods-sam/cnn-variant/vcfs/nist_na12878_giab_hg38_sd_fix.vcf.gz",
- "Cram2TrainedModel.truth_vcf_index": "gs://broad-dsde-methods-sam/cnn-variant/vcfs/nist_na12878_giab_hg38_sd_fix.vcf.gz.tbi",
- "Cram2TrainedModel.truth_bed": "gs://broad-dsde-methods-sam/cnn-variant/beds/HG001_GRCh38_GIAB_highconf_CG-IllFB-IllGATKHC-Ion-10X-SOLID_CHROM1-X_v.3.3.2_highconf_nosomaticdel_noCENorHET7.bed",
- "Cram2TrainedModel.calling_intervals": "gs://broad-references/hg38/v0/wgs_calling_regions.hg38.interval_list",
- "Cram2TrainedModel.extra_args": "-stand-call-conf 0 -A Coverage -A ChromosomeCounts -A BaseQuality -A FragmentLength -A MappingQuality -A ReadPosition ",
- "Cram2TrainedModel.gatk_docker": "samfriedman/gatk:44dc3d18e0e204",
- "Cram2TrainedModel.gatk_override": "gs://broad-dsde-methods-sam/cnn-variant/jars/sf_gatk2.jar",
- "Cram2TrainedModel.preemptible_attempts": 0,
- "Cram2TrainedModel.disk_space_gb": 300,
- "Cram2TrainedModel.scatter_count": 2,
- "Cram2TrainedModel.epochs": 36,
- "Cram2TrainedModel.mem_gb": 7
-}
diff --git a/scripts/cnn_variant_wdl/jsons/run_happy.json b/scripts/cnn_variant_wdl/jsons/run_happy.json
deleted file mode 100644
index 491fc0571b7..00000000000
--- a/scripts/cnn_variant_wdl/jsons/run_happy.json
+++ /dev/null
@@ -1,17 +0,0 @@
-{
- "HappyWorkflow.vcf_files": [
- "gs://broad-dsde-methods/cromwell-execution-33/Cram2FilteredVcf/1f653862-42d9-470e-acdf-0f00e34263f1/call-FilterVariantTranches/nova_g947y_na12878_filtered.vcf.gz",
- "gs://broad-dsde-methods/cromwell-execution-33/Cram2FilteredVcf/e8031a29-d788-4901-bb77-4f4ba542a024/call-FilterVariantTranches/nova_g947n_na12878_filtered.vcf.gz"
- ],
- "HappyWorkflow.reference_fasta_index": "gs://broad-references/hg38/v0/Homo_sapiens_assembly38.fasta.fai",
- "HappyWorkflow.reference_fasta": "gs://broad-references/hg38/v0/Homo_sapiens_assembly38.fasta",
- "HappyWorkflow.reference_dict": "gs://broad-references/hg38/v0/Homo_sapiens_assembly38.dict",
- "HappyWorkflow.truth_bed": "gs://broad-dsde-methods-sam/cnn-variant/beds/chr20_conf_1m_10m.bed",
- "HappyWorkflow.truth_vcf": "gs://broad-dsde-methods-sam/cnn-variant/vcfs/nist_na12878_giab_hg38_sd_fix.vcf.gz",
- "HappyWorkflow.truth_vcf_index": "gs://broad-dsde-methods-sam/cnn-variant/vcfs/nist_na12878_giab_hg38_sd_fix.vcf.gz.tbi",
- "HappyWorkflow.rscript": "gs://broad-dsde-methods-sam/cnn-variant/scripts/happy_plot.R",
- "HappyWorkflow.disk_space": "100",
- "HappyWorkflow.cpu": "2",
- "HappyWorkflow.mem_gb": "8",
- "HappyWorkflow.preemptible_attempts": "10"
-}
\ No newline at end of file
diff --git a/scripts/cnn_variant_wdl/jsons/variant_classifier_plots_na12878_hg38.json b/scripts/cnn_variant_wdl/jsons/variant_classifier_plots_na12878_hg38.json
deleted file mode 100644
index 22e060d7fa2..00000000000
--- a/scripts/cnn_variant_wdl/jsons/variant_classifier_plots_na12878_hg38.json
+++ /dev/null
@@ -1,13 +0,0 @@
-{
- "VariantClassifierPlots.call_vcf": "gs://broad-dsde-methods/cromwell-execution-33/Cram2FilteredVcf/1f653862-42d9-470e-acdf-0f00e34263f1/call-FilterVariantTranches/nova_g947y_na12878_filtered.vcf.gz",
- "VariantClassifierPlots.call_vcf_index": "gs://broad-dsde-methods/cromwell-execution-33/Cram2FilteredVcf/1f653862-42d9-470e-acdf-0f00e34263f1/call-FilterVariantTranches/nova_g947y_na12878_filtered.vcf.gz.tbi",
- "VariantClassifierPlots.call_sample": "SM-G947Y",
- "VariantClassifierPlots.score_key": "CNN_2D",
- "VariantClassifierPlots.truth_vcf": "gs://broad-dsde-methods/cnn-variant-score/vcfs/nist_na12878_giab_hg38_sd_fix.vcf.gz",
- "VariantClassifierPlots.truth_vcf_index": "gs://broad-dsde-methods/cnn-variant-score/vcfs/nist_na12878_giab_hg38_sd_fix.vcf.gz.tbi",
- "VariantClassifierPlots.truth_sample": "HG001",
- "VariantClassifierPlots.intervals": "gs://broad-dsde-methods/cnn-variant-score/beds/HG001_NA12878_GRCh38_GIAB_highconf.interval_list",
- "VariantClassifierPlots.rscript": "gs://broad-dsde-methods/cnn-variant-score/scripts/vcf_analysis.R",
- "VariantClassifierPlots.gatk_docker": "broadinstitute/gatk",
- "VariantClassifierPlots.preemptible_attempts": "0"
-}
\ No newline at end of file
diff --git a/scripts/cnn_variant_wdl/run_happy.wdl b/scripts/cnn_variant_wdl/run_happy.wdl
deleted file mode 100644
index e7bcbf2c239..00000000000
--- a/scripts/cnn_variant_wdl/run_happy.wdl
+++ /dev/null
@@ -1,145 +0,0 @@
-# Run the hap.py VCF evaluation over input vcfs given a validated truth vcf and confidence region
-workflow HappyWorkflow {
- Array[File] vcf_files # VCF files to evaluate with hap.py
-
- File reference_fasta
- File reference_dict
- File reference_fasta_index
-
- File truth_vcf
- File truth_vcf_index
- File truth_bed
-
- File rscript
-
- Int? preemptible_attempts
- Int? disk_space
- Int? mem_gb
- Int? cpu
-
- call RunHappy {
- input:
- vcf_files = vcf_files,
- truth_vcf = truth_vcf,
- truth_vcf_index = truth_vcf_index,
- truth_bed = truth_bed,
- reference_fasta = reference_fasta,
- reference_dict = reference_dict,
- reference_fasta_index = reference_fasta_index,
- cpu = cpu,
- mem_gb = mem_gb,
- disk_space = disk_space,
- preemptible_attempts = preemptible_attempts
- }
-
- call RunHappyPlots{
- input:
- happy_outputs = RunHappy.happy_outputs,
- rscript = rscript,
- cpu = cpu,
- mem_gb = mem_gb,
- disk_space = disk_space,
- preemptible_attempts = preemptible_attempts
- }
-
- output {
- RunHappy.*
- RunHappyPlots.*
- }
-}
-
-task RunHappy {
- Array[File] vcf_files
-
- File reference_fasta
- File reference_dict
- File reference_fasta_index
-
- File truth_vcf
- File truth_vcf_index
- File truth_bed
-
- # Runtime parameters
- Int? mem_gb
- Int? preemptible_attempts
- Int? disk_space
- Int? cpu
- Boolean use_ssd = false
-
- # You may have to change the following two parameter values depending on the task requirements
- Int default_ram_mb = 16000
- Int default_disk_space_gb = 100
-
- # Mem is in units of GB but our command and memory runtime values are in MB
- Int machine_mem = if defined(mem_gb) then mem_gb *1000 else default_ram_mb
- Int command_mem = machine_mem - 1000
-
- command {
- for vcf_file in ${sep=" " vcf_files}; do
- vname=$(basename "$vcf_file")
- /opt/hap.py/bin/hap.py \
- ${truth_vcf} \
- "$vcf_file" \
- -f ${truth_bed} \
- -r ${reference_fasta} \
- -o ./happy_"$vname"
- done
- }
-
- output {
- Array[File] happy_outputs = glob("./happy_*")
- }
-
- runtime {
- docker: "pkrusche/hap.py"
-
- memory: machine_mem + " MB"
- disks: "local-disk " + select_first([disk_space, 100]) + if use_ssd then " SSD" else " HDD"
- preemptible: select_first([preemptible_attempts, 10])
- cpu: select_first([cpu, 1])
- }
-}
-
-task RunHappyPlots {
- Array[File] happy_outputs
- File rscript
-
- # Runtime parameters
- Int? mem_gb
- Int? preemptible_attempts
- Int? disk_space
- Int? cpu
- Boolean use_ssd = false
-
- # You may have to change the following two parameter values depending on the task requirements
- Int default_ram_mb = 16000
- Int default_disk_space_gb = 100
-
- # Mem is in units of GB but our command and memory runtime values are in MB
- Int machine_mem = if defined(mem_gb) then mem_gb *1000 else default_ram_mb
- Int command_mem = machine_mem - 1000
-
- command {
- for file in ${sep=" " happy_outputs}; do
- mv "$file" ./
- done
- find `pwd`
-
- Rscript ${rscript}
- }
-
- output {
- Array[File] plots = glob("*png")
- }
-
- runtime {
- docker: "rocker/tidyverse"
-
- memory: machine_mem + " MB"
- disks: "local-disk " + select_first([disk_space, 100]) + if use_ssd then " SSD" else " HDD"
- preemptible: select_first([preemptible_attempts, 10])
- cpu: select_first([cpu, 1])
- }
-}
-
-
diff --git a/scripts/cnn_variant_wdl/variant_classifier_plots.wdl b/scripts/cnn_variant_wdl/variant_classifier_plots.wdl
deleted file mode 100644
index 10d291c06d1..00000000000
--- a/scripts/cnn_variant_wdl/variant_classifier_plots.wdl
+++ /dev/null
@@ -1,272 +0,0 @@
-# Run VCF evaluation over input vcf given a validated truth vcf and confidence region
-workflow VariantClassifierPlots {
- File call_vcf # VCF to be evaluated
- File call_vcf_index # Index of VCF to be evaluated
- String? call_sample
- String score_key
-
- File? truth_vcf # Optional truth VCF. If provided, plot colors show true positives and
- File? truth_vcf_index # true negatives in green with false positives in red and false negatives in orange.
- String? truth_sample # Otherwise, plot colors show filtered variants in red and passing variants in green.
-
- File? intervals
-
- File rscript
-
- String gatk_docker
- File? gatk_override
-
- Int? preemptible_attempts
- Int? disk_space
- Int? mem_gb
- Int? cpu
-
- if(defined(truth_vcf)){
- call MakeTables {
- input:
- call_vcf = call_vcf,
- call_vcf_index = call_vcf_index,
- call_sample = call_sample,
- score_key = score_key,
- truth_vcf = truth_vcf,
- truth_vcf_index = truth_vcf_index,
- truth_sample = truth_sample,
- intervals = intervals,
- gatk_docker = gatk_docker,
- cpu = cpu,
- mem_gb = mem_gb,
- disk_space = disk_space,
- preemptible_attempts = preemptible_attempts
- }
-
- call MakePlots{
- input:
- rscript = rscript,
- call_table = MakeTables.call_table,
- truth_table = MakeTables.truth_table,
- score_key = score_key,
- cpu = cpu,
- mem_gb = mem_gb,
- disk_space = disk_space,
- preemptible_attempts = preemptible_attempts
- }
-
- output {
- MakeTables.*
- MakePlots.*
- }
- }
-
- if(!defined(truth_vcf)){
- call MakeTableNoTruth {
- input:
- call_vcf = call_vcf,
- call_vcf_index = call_vcf_index,
- call_sample = call_sample,
- score_key = score_key,
- gatk_docker = gatk_docker,
- cpu = cpu,
- mem_gb = mem_gb,
- disk_space = disk_space,
- preemptible_attempts = preemptible_attempts
- }
-
- call MakePlots as MakePlotsNoTruth {
- input:
- rscript = rscript,
- call_table = MakeTableNoTruth.call_table,
- score_key = score_key,
- cpu = cpu,
- mem_gb = mem_gb,
- disk_space = disk_space,
- preemptible_attempts = preemptible_attempts
- }
-
- output {
- MakeTableNoTruth.*
- MakePlotsNoTruth.*
- }
-
- }
-
-}
-
-task MakeTables {
- File call_vcf
- File call_vcf_index
- String? call_sample
- String score_key
-
- File? truth_vcf
- File? truth_vcf_index
- String? truth_sample
-
- File? intervals
-
- # Runtime parameters
- String gatk_docker
- File? gatk_override
- Int? mem_gb
- Int? preemptible_attempts
- Int? disk_space
- Int? cpu
-
- String call_table_name = basename(call_vcf) + ".table"
- String sd_fix_vcf = "call_sd_fix.vcf"
-
- # You may have to change the following two parameter values depending on the task requirements
- Int default_ram_mb = 16000
- # WARNING: In the workflow, you should calculate the disk space as an input to this task (disk_space_gb).
- Int default_disk_space_gb = 100
-
- # Mem is in units of GB but our command and memory runtime values are in MB
- Int machine_mem = if defined(mem_gb) then mem_gb *1000 else default_ram_mb
- Int command_mem = machine_mem - 1000
-
- command {
- set -e
- export GATK_LOCAL_JAR=${default="/root/gatk.jar" gatk_override}
-
- gatk --java-options "-Xmx${command_mem}m" \
- UpdateVcfSequenceDictionary \
- --INPUT=${call_vcf} \
- --OUTPUT=${sd_fix_vcf} \
- -SD=${truth_vcf}
-
- gatk IndexFeatureFile -F ${sd_fix_vcf}
-
- gatk --java-options "-Xmx${command_mem}m" \
- GenotypeConcordance \
- --CALL_VCF=${sd_fix_vcf} \
- ${"--CALL_SAMPLE=" + call_sample} \
- --TRUTH_VCF=${truth_vcf} \
- ${"--TRUTH_SAMPLE=" + truth_sample} \
- ${"--INTERVALS=" + intervals} \
- --OUTPUT_VCF=true \
- --IGNORE_FILTER_STATUS \
- -O=concordance
-
- gatk --java-options "-Xmx${command_mem}m" \
- VariantsToTable \
- -V ${sd_fix_vcf} \
- -F CHROM -F POS -F REF -F ALT -F FILTER -F ${score_key} \
- -F EVENTLENGTH -F AC -F MULTI-ALLELIC -F TRANSITION -F TYPE \
- --show-filtered \
- -O ${call_table_name}
-
- gatk --java-options "-Xmx${command_mem}m" \
- VariantsToTable \
- -V concordance.genotype_concordance.vcf.gz \
- -F CHROM -F POS -F REF -F ALT -F CONC_ST \
- -O truth.table
- }
-
- output {
- File call_table = "${call_table_name}"
- File truth_table = "truth.table"
- }
-
- runtime {
- docker: gatk_docker
- memory: machine_mem + " MB"
- # Note that the space before SSD and HDD should be included.
- disks: "local-disk " + default_disk_space_gb + " HDD"
- preemptible: select_first([preemptible_attempts, 3])
- cpu: select_first([cpu, 1])
- zones: "us-east4-a"
- bootDiskSizeGb: "16"
- }
-}
-
-task MakeTableNoTruth {
- File call_vcf
- File call_vcf_index
- String? call_sample
- String score_key
-
- # Runtime parameters
- String gatk_docker
- File? gatk_override
- Int? mem_gb
- Int? preemptible_attempts
- Int? disk_space
- Int? cpu
-
- String call_table_name = basename(call_vcf) + ".table"
-
- # You may have to change the following two parameter values depending on the task requirements
- Int default_ram_mb = 16000
- # WARNING: In the workflow, you should calculate the disk space as an input to this task (disk_space_gb).
- Int default_disk_space_gb = 100
-
- # Mem is in units of GB but our command and memory runtime values are in MB
- Int machine_mem = if defined(mem_gb) then mem_gb *1000 else default_ram_mb
- Int command_mem = machine_mem - 1000
-
- command {
- set -e
- export GATK_LOCAL_JAR=${default="/root/gatk.jar" gatk_override}
-
- gatk --java-options "-Xmx${command_mem}m" \
- VariantsToTable \
- -V ${call_vcf} \
- -F CHROM -F POS -F REF -F ALT -F FILTER -F ${score_key} \
- -F EVENTLENGTH -F AC -F MULTI-ALLELIC -F TRANSITION -F TYPE \
- --show-filtered \
- -O ${call_table_name}
- }
-
- output {
- File call_table = "${call_table_name}"
- }
-
- runtime {
- docker: gatk_docker
- memory: machine_mem + " MB"
- # Note that the space before SSD and HDD should be included.
- disks: "local-disk " + default_disk_space_gb + " HDD"
- preemptible: select_first([preemptible_attempts, 3])
- cpu: select_first([cpu, 1])
- zones: "us-east4-a"
- bootDiskSizeGb: "16"
- }
-}
-
-task MakePlots {
- File rscript
- File call_table
- File? truth_table
- String score_key
-
- # Runtime parameters
- Int? mem_gb
- Int? preemptible_attempts
- Int? disk_space
- Int? cpu
-
- # You may have to change the following two parameter values depending on the task requirements
- Int default_ram_mb = 16000
- # WARNING: In the workflow, you should calculate the disk space as an input to this task (disk_space_gb).
- Int default_disk_space_gb = 100
-
- # Mem is in units of GB but our command and memory runtime values are in MB
- Int machine_mem = if defined(mem_gb) then mem_gb *1000 else default_ram_mb
- Int command_mem = machine_mem - 1000
-
- command {
- Rscript ${rscript} ${call_table} ${truth_table} ${score_key}
- }
-
- output {
- Array[File] plots = glob("*png")
- }
-
- runtime {
- docker: "rocker/tidyverse"
-
- memory: machine_mem + " MB"
- disks: "local-disk " + select_first([disk_space, 100]) + " HDD"
- preemptible: select_first([preemptible_attempts, 10])
- cpu: select_first([cpu, 1])
- }
-}
diff --git a/scripts/cnn_variant_wdl/vcf_analysis.R b/scripts/cnn_variant_wdl/vcf_analysis.R
deleted file mode 100644
index 0794906da74..00000000000
--- a/scripts/cnn_variant_wdl/vcf_analysis.R
+++ /dev/null
@@ -1,215 +0,0 @@
-#!/usr/bin/env Rscript
-
-library(tidyr)
-library(dplyr)
-library(ggplot2)
-library(reshape2)
-
-args = commandArgs(trailingOnly=TRUE)
-if (length(args) != 3) {
- stop("We need 3 arguments: call_vcf_table concordance_vcf_table score_key")
-}
-
-print("try to load VCF table.")
-d <- read.table(args[1], header=TRUE)
-print("try to load VCF Truth table.")
-dt <- read.table(args[2], header=TRUE)
-score_key <- args[3]
-score_label <- paste(score_key, " LOD Score")
-plot_title <- gsub( ".vcf.gz.table", "", basename(args[1]))
-num_bins <- 50
-bin_by_quantile <- FALSE
-
-get_proportion <- function(d, num_bins, column_to_sum, quality_column) {
- x <- rowsum(column_to_sum, quality_column, na.rm =T)
- idx <- row.names(x)
-
- for (i in 1:num_bins) {
- qsum <- sum(quality_column==as.numeric(idx[i]))
- if (!is.na(x[i]) && qsum>0) {
- x[i] <- x[i] / qsum
- }
- }
- return(x[quality_column])
-}
-
-print("try to merge.")
-d <- merge(d, dt, by=c("CHROM", "POS", "REF", "ALT"))
-d$TP <- as.numeric(d$CONC_ST!="FP,TN" & d$CONC_ST!="FP" & d$CONC_ST!="EMPTY")
-d$True_Positive <- d$CONC_ST!="FP,TN" & d$CONC_ST!="FP" & d$CONC_ST!="EMPTY"
-d$Unfiltered <- d$FILTER == "PASS" | d$FILTER == "."
-d$SNP <- d$EVENTLENGTH == 0
-d$ONE <- 1
-x <- rowsum(d$ONE, d$EVENTLENGTH)
-d$EVENTLENGTH_SUM <- x[as.factor(d$EVENTLENGTH)]
-d$Variant_Type <- paste(d$TYPE, as.factor(d$EVENTLENGTH<0))
-d$Truth_Status <- ifelse(d$True_Positive & d$Unfiltered, "True Positive", ifelse(d$True_Positive & !d$Unfiltered, "False Negative", ifelse(!d$True_Positive & d$Unfiltered, "False Positive", "True Negative")))
-statusColor <- c("True Positive" = "springgreen3", "True Negative" = "aquamarine4", "False Positive" = "red", "False Negative" = "orange")
-
-# All variant plots
-print("Make all variant plots.")
-
-# Plot histogram of scores separately for SNPs and INDELs.
-p1 <- ggplot(d, aes(get(score_key), color=SNP, fill=SNP)) +
- scale_fill_discrete(name="Variant\nType", breaks=c("TRUE", "FALSE"), labels=c("SNPs", "INDELs")) +
- geom_density(alpha=0.55) +
- ggtitle(plot_title) +
- xlab(score_label) +
- guides(color=FALSE)
-
-# Violin plot of scores stratified by event length, including all insertions and deletions.
-p2 <- ggplot(d, aes(x=EVENTLENGTH, y=get(score_key), group=EVENTLENGTH, color=Truth_Status, shape=Variant_Type)) +
- scale_color_manual(values=statusColor) +
- scale_shape_discrete(name='', breaks=c("INDEL TRUE", "INDEL FALSE", "SNP FALSE"), labels=c("Deletion", "Insertion", "SNP")) +
- geom_jitter(height = 0, width = 0.1, alpha=0.6) +
- ggtitle(plot_title) +
- ylab(score_label) +
- xlab("Event Length: - Deletions, 0 SNPs, + Insertions")
-
-# Violin plot of scores stratified by event length, insertions and deletions smaller than 20 base pairs.
-p3 <- ggplot(d, aes(x=EVENTLENGTH, y=get(score_key), group=EVENTLENGTH, color=Truth_Status)) + xlim(-20, 20) +
- scale_color_manual(values=statusColor) +
- geom_jitter(height = 0, width = 0.1, alpha=0.4) +
- geom_violin(color="grey", alpha=0) +
- geom_text(aes(x=EVENTLENGTH, y=14, label=EVENTLENGTH_SUM), color="grey30", size=2, angle=60) +
- ggtitle(plot_title) +
- ylab(score_label) +
- xlab("Event Length: - Deletions, 0 SNPs, + Insertions")
-
-# Violin plot of scores stratified by event length, insertions and deletions smaller than 10 base pairs.
-p4 <- ggplot(d, aes(x=EVENTLENGTH, y=get(score_key), group=EVENTLENGTH, color=Truth_Status)) + xlim(-10, 10) +
- scale_color_manual(values=statusColor) +
- geom_jitter(height = 0, width = 0.2, alpha=0.4) +
- geom_violin(color="grey", alpha=0) +
- geom_text(aes(x=EVENTLENGTH, y=14, label=EVENTLENGTH_SUM), color="grey30", size=3, angle=30) +
- ylab(score_label) +
- xlab("Event Length: - Deletions, 0 SNPs, + Insertions")
-
-# Violin plot of scores stratified by event length, insertions and deletions smaller than 5 base pairs.
-p5 <- ggplot(d, aes(x=EVENTLENGTH, y=get(score_key), group=EVENTLENGTH, color=Truth_Status)) +
- scale_color_manual(values=statusColor) + xlim(-5, 5) +
- geom_jitter(height = 0, width = 0.35, alpha=0.4) +
- geom_violin(color="grey", alpha=0.0) +
- geom_text(aes(x=EVENTLENGTH, y=14, label=EVENTLENGTH_SUM), color="grey30", size=4, angle=30) +
- ggtitle(plot_title) +
- ylab(score_label) +
- xlab("Event Length: - Deletions, 0 SNPs, + Insertions")
-
-
-# SNP specific plots
-print("Make SNP plots.")
-snps <- subset(d, EVENTLENGTH == 0)
-my_breaks <- ifelse(bin_by_quantile, quantile(snps[[score_key]], probs = seq(0, 1, 1.0/num_bins), na.rm=T), num_bins)
-snps$QUALITY_BIN <- cut(snps[[score_key]], breaks=my_breaks, include.lowest=T, labels=F)
-snps$QUALITY_BIN_RANGE <- cut(snps[[score_key]], breaks=my_breaks, include.lowest=T)
-mine <- lapply(strsplit(sapply(levels(snps$QUALITY_BIN_RANGE), function(x) substr(x, 2, nchar(x)-1)), ","), as.numeric)
-df <- data.frame(matrix(unlist(mine), nrow=num_bins, byrow=T))
-q_means <- rowMeans(df)
-snps$QUALITY_LOD <- q_means[snps$QUALITY_BIN]
-snps$TPR_PREDICTION <- exp(snps$QUALITY_LOD) / (1 + exp(snps$QUALITY_LOD) )
-
-x <- rowsum(snps$ONE, snps$QUALITY_BIN)
-snps$BIN_SUM <- x[snps$QUALITY_BIN]
-snps$TRANSVERSION <- as.numeric( abs(snps$TRANSITION)==0 )
-snps$TPR <- get_proportion(snps, num_bins, snps$TP, snps$QUALITY_BIN)
-ti <- get_proportion(snps, num_bins, snps$TRANSITION, snps$QUALITY_BIN)
-tv <- get_proportion(snps, num_bins, snps$TRANSVERSION, snps$QUALITY_BIN)
-snps$TI_TV <- ti/tv
-
-# Plot transition transversion ratios as a function of score bins
-p6 <- ggplot(snps, aes(x=get(score_key), y=TI_TV, group=QUALITY_BIN, color=Truth_Status, shape=TRANSITION==1)) +
- scale_color_manual(values=statusColor) +
- scale_shape_discrete(name='', breaks=c("TRUE", "FALSE"), labels=c("Transition", "Transversion")) +
- geom_point() +
- geom_line(color="grey") +
- ggtitle("Transition Transversion Ratio per score bin") +
- xlab(score_label) +
- ylim(0, 4)
-
-# SNP calibration plot
-p7 <- ggplot(snps, aes(x=TPR_PREDICTION, y=TPR, group=QUALITY_BIN, color=Truth_Status)) +
- scale_color_manual(values=statusColor) +
- geom_jitter(height = 0.01, width = 0.01, alpha=0.4) +
- ggtitle(paste("SNP Calibration", plot_title)) +
- ylim(0, 1) + xlim(0, 1)
-
-
-# INDEL specific plots
-print("Make INDEL plots.")
-indels <- subset(d, EVENTLENGTH != 0)
-my_breaks <- ifelse(bin_by_quantile, quantile(indels[[score_key]], probs = seq(0, 1, 1.0/num_bins), na.rm=T), num_bins)
-indels$QUALITY_BIN <- cut(indels[[score_key]], breaks=my_breaks, include.lowest=T, labels=F)
-indels$QUALITY_BIN_RANGE <- cut(indels[[score_key]], breaks=my_breaks, include.lowest=T)
-mine <- lapply(strsplit(sapply(levels(indels$QUALITY_BIN_RANGE), function(x) substr(x, 2, nchar(x)-1)), ","), as.numeric)
-df <- data.frame(matrix(unlist(mine), nrow=num_bins, byrow=T))
-q_means <- rowMeans(df)
-indels$QUALITY_LOD <- q_means[indels$QUALITY_BIN]
-indels$TPR_PREDICTION <- exp(indels$QUALITY_LOD) / (1 + exp(indels$QUALITY_LOD))
-x <- rowsum(indels$ONE, indels$QUALITY_BIN)
-indels$BIN_SUM <- x[indels$QUALITY_BIN]
-indels$TPR <- get_proportion(indels, num_bins, indels$TP, indels$QUALITY_BIN)
-indels$ONEBP <- as.numeric(abs(indels$EVENTLENGTH)==1)
-indels$PROPORTION_ONEBP <- get_proportion(indels, num_bins, indels$ONEBP, indels$QUALITY_BIN)
-
-# Plot proportion of each socre bin that are 1 base pair Insertion or deletion
-p8 <- ggplot(indels, aes(x=get(score_key), y=PROPORTION_ONEBP, group=QUALITY_BIN, color=Truth_Status, shape=EVENTLENGTH<0)) +
- scale_color_manual(values=statusColor) +
- scale_shape_discrete(name='', breaks=c("TRUE", "FALSE"), labels=c("Deletion", "Insertion")) +
- geom_jitter(height = 0.005, width = 0.0, alpha=0.6) +
- geom_line(color="grey") +
- ggtitle("Proportion of 1bp INDELs per score bin") +
- xlab(score_label)
-
-# INDEL calibration plot
-p9 <- ggplot(indels, aes(x=TPR_PREDICTION, y=TPR, group=QUALITY_BIN, color=Truth_Status)) +
- scale_color_manual(values=statusColor) +
- geom_jitter(height = 0.01, width = 0.01, alpha=0.4) +
- ggtitle(paste("INDEL Calibration", plot_title)) +
- ylim(0, 1) + xlim(0, 1)
-
-# Multiple plot function
-#
-# ggplot objects can be passed in ..., or to plotlist (as a list of ggplot objects)
-# - cols: Number of columns in layout
-# - layout: A matrix specifying the layout. If present, 'cols' is ignored.
-#
-# If the layout is something like matrix(c(1,2,3,3), nrow=2, byrow=TRUE),
-# then plot 1 will go in the upper left, 2 will go in the upper right, and
-# 3 will go all the way across the bottom.
-#
-multiplot <- function(..., plotlist=NULL, file, cols=1, layout=NULL) {
- library(grid)
-
- # Make a list from the ... arguments and plotlist
- plots <- c(list(...), plotlist)
-
- numPlots = length(plots)
-
- # If layout is NULL, then use 'cols' to determine layout
- if (is.null(layout)) {
- # Make the panel
- # ncol: Number of columns of plots
- # nrow: Number of rows needed, calculated from # of cols
- layout <- matrix(seq(1, cols * ceiling(numPlots/cols)),
- ncol = cols, nrow = ceiling(numPlots/cols))
- }
-
- if (numPlots==1) {
- print(plots[[1]])
-
- } else {
- # Set up the page
- grid.newpage()
- pushViewport(viewport(layout = grid.layout(nrow(layout), ncol(layout))))
-
- # Make each plot, in the correct location
- for (i in 1:numPlots) {
- # Get the i,j matrix positions of the regions that contain this subplot
- matchidx <- as.data.frame(which(layout == i, arr.ind = TRUE))
-
- print(plots[[i]], vp = viewport(layout.pos.row = matchidx$row,
- layout.pos.col = matchidx$col))
- }
- }
-}
-ggsave(plot=multiplot(p1,p2,p3,p4,p5,p6,p7,p8,p9, cols=2), filename = paste(plot_title, "_plots.png", sep=""), width=16, height=22)
diff --git a/scripts/cnn_variant_wdl/vcf_analysis_no_truth.R b/scripts/cnn_variant_wdl/vcf_analysis_no_truth.R
deleted file mode 100644
index c5dc8cc1039..00000000000
--- a/scripts/cnn_variant_wdl/vcf_analysis_no_truth.R
+++ /dev/null
@@ -1,193 +0,0 @@
-#!/usr/bin/env Rscript
-
-library(tidyr)
-library(dplyr)
-library(ggplot2)
-library(reshape2)
-
-# ./gatk VariantsToTable -V /dsde/data/deep/vqsr/vcfs/illumina_na12878_platinum_scored_chr2.vcf.gz -F CHROM -F POS -F REF -F ALT -F FILTER -F G947_SITE_LABELLED_RRAB -F EVENTLENGTH -F AC -F MULTI-ALLELIC -F TRANSITION -F TYPE -O ~/Documents/illumin_chr2.table
-#d <- read.table("illumin_chr2.table", header=TRUE)
-#score_key <- "G947_SITE_LABELLED_RRAB"
-#d <- read.table("g94982_chr20.table", header=TRUE)
-#score_key <- "CNN_2D"
-#d <- read.table("new_gnomad_22.table", header=TRUE)
-#score_key <- "CNN_1D"
-
-args = commandArgs(trailingOnly=TRUE)
-if (length(args) != 2) {
- stop("We need 2 arguments: call_vcf_table score_key")
-}
-
-print("try to load VCF table.")
-d <- read.table(args[1], header=TRUE)
-score_key <- args[2]
-score_label <- paste(score_key, " LOD Score")
-plot_title <- gsub(".vcf.gz.table", "", basename(args[1]))
-num_bins <- 50
-bin_by_quantile <- FALSE
-
-get_proportion <- function(d, num_bins, column_to_sum, quality_column) {
- x <- rowsum(column_to_sum, quality_column, na.rm =T)
- idx <- row.names(x)
-
- for (i in 1:num_bins) {
- qsum <- sum(quality_column==as.numeric(idx[i]))
- if (!is.na(x[i]) && qsum>0) {
- x[i] <- x[i] / qsum
- }
- }
- return(x[quality_column])
-}
-
-d$SNP <- d$EVENTLENGTH == 0
-d$ONE <- 1
-x <- rowsum(d$ONE, d$EVENTLENGTH)
-d$EVENTLENGTH_SUM <- x[as.factor(d$EVENTLENGTH)]
-d$Unfiltered <- d$FILTER == "PASS" | d$FILTER == "."
-d$Variant_Type <- paste(d$TYPE, as.factor(d$EVENTLENGTH<0))
-
-
-# All variant plots
-print("Make all variant plots.")
-p1 <- ggplot(d, aes(get(score_key), color=SNP, fill=SNP)) +
- scale_fill_discrete(name="Variant\nType", breaks=c("TRUE", "FALSE"), labels=c("SNPs", "INDELs")) +
- geom_density(alpha=0.55) +
- ggtitle(plot_title) +
- xlab(score_label) +
- guides(color=FALSE)
-
-p2 <- ggplot(d, aes(x=EVENTLENGTH, y=get(score_key), group=EVENTLENGTH, color=Unfiltered, shape=Variant_Type)) +
- scale_color_discrete(name='', breaks=c("TRUE", "FALSE"), labels=c("Passed", "Filtered")) +
- scale_shape_discrete(name='', breaks=c("INDEL TRUE", "INDEL FALSE", "SNP FALSE"), labels=c("Deletion", "Insertion", "SNP")) +
- geom_jitter(height = 0, width = 0.2, alpha=0.6) +
- ggtitle(plot_title) +
- ylab(score_label) +
- xlab("Event Length: - Deletions, 0 SNPs, + Insertions")
-
-p3 <- ggplot(d, aes(x=EVENTLENGTH, y=get(score_key), group=EVENTLENGTH, color=Unfiltered)) + xlim(-20, 20) +
- scale_color_discrete(name='', breaks=c("TRUE", "FALSE"), labels=c("Passed", "Filtered")) +
- geom_jitter(height = 0, width = 0.15, alpha=0.4) +
- geom_violin(color="grey", alpha=0) +
- geom_text(aes(x=EVENTLENGTH, y=14, label=EVENTLENGTH_SUM), color="grey30", size=2, angle=60) +
- ggtitle(plot_title) +
- ylab(score_label) +
- xlab("Event Length: - Deletions, 0 SNPs, + Insertions")
-
-p4 <- ggplot(d, aes(x=EVENTLENGTH, y=get(score_key), group=EVENTLENGTH, color=Unfiltered)) + xlim(-10, 10) +
- scale_color_discrete(name='', breaks=c("TRUE", "FALSE"), labels=c("Passed", "Filtered")) +
- geom_jitter(height = 0, width = 0.2, alpha=0.4) +
- geom_violin(color="grey", alpha=0) +
- geom_text(aes(x=EVENTLENGTH, y=14, label=EVENTLENGTH_SUM), color="grey30", size=3, angle=30) +
- ggtitle(plot_title) +
- ylab(score_label) +
- xlab("Event Length: - Deletions, 0 SNPs, + Insertions")
-
-p5 <- ggplot(d, aes(x=EVENTLENGTH, y=get(score_key), group=EVENTLENGTH, color=Unfiltered)) +
- scale_color_discrete(name='', breaks=c("TRUE", "FALSE"), labels=c("Passed", "Filtered")) +
- xlim(-5, 5) +
- geom_jitter(height = 0, width = 0.35, alpha=0.4) +
- geom_violin(color="grey", alpha=0.0) +
- geom_text(aes(x=EVENTLENGTH, y=14, label=EVENTLENGTH_SUM), color="grey30", size=4, angle=30) +
- ggtitle(plot_title) +
- ylab(score_label) +
- xlab("Event Length: - Deletions, 0 SNPs, + Insertions")
-
-
-# SNP specific plots
-print("Make SNP plots.")
-snps <- subset(d, EVENTLENGTH == 0)
-my_breaks <- ifelse(bin_by_quantile, quantile(snps[[score_key]], probs = seq(0, 1, 1.0/num_bins), na.rm=T), num_bins)
-snps$QUALITY_BIN <- cut(snps[[score_key]], breaks=my_breaks, include.lowest=T, labels=F)
-snps$QUALITY_BIN_RANGE <- cut(snps[[score_key]], breaks=my_breaks, include.lowest=T)
-mine <- lapply(strsplit(sapply(levels(snps$QUALITY_BIN_RANGE), function(x) substr(x, 2, nchar(x)-1)), ","), as.numeric)
-df <- data.frame(matrix(unlist(mine), nrow=num_bins, byrow=T))
-q_means <- rowMeans(df)
-snps$QUALITY_LOD <- q_means[snps$QUALITY_BIN]
-
-x <- rowsum(snps$ONE, snps$QUALITY_BIN)
-snps$BIN_SUM <- x[snps$QUALITY_BIN]
-snps$TRANSVERSION <- as.numeric(abs(snps$TRANSITION)==0)
-ti <- get_proportion(snps, num_bins, snps$TRANSITION, snps$QUALITY_BIN)
-tv <- get_proportion(snps, num_bins, snps$TRANSVERSION, snps$QUALITY_BIN)
-snps$TI_TV <- ti/tv
-
-p6 <- ggplot(snps, aes(x=get(score_key), y=TI_TV, group=QUALITY_BIN, color=Unfiltered, shape=TRANSITION==1)) +
- scale_color_discrete(name='', breaks=c("TRUE", "FALSE"), labels=c("Passed", "Filtered")) +
- scale_shape_discrete(name='', breaks=c("TRUE", "FALSE"), labels=c("Transition", "Transversion")) +
- geom_point() +
- geom_line(color="grey") +
- xlab(score_label) +
- ggtitle("Transition Transversion Ratio per score bin") +
- ylim(0, 4)
-
-
-# INDEL specific plots
-print("Make INDEL plots.")
-indels <- subset(d, EVENTLENGTH != 0)
-my_breaks <- ifelse(bin_by_quantile, quantile(indels[[score_key]], probs = seq(0, 1, 1.0/num_bins), na.rm=T), num_bins)
-indels$QUALITY_BIN <- cut(indels[[score_key]], breaks=my_breaks, include.lowest=T, labels=F)
-indels$QUALITY_BIN_RANGE <- cut(indels[[score_key]], breaks=my_breaks, include.lowest=T)
-mine <- lapply(strsplit(sapply(levels(indels$QUALITY_BIN_RANGE), function(x) substr(x, 2, nchar(x)-1)), ","), as.numeric)
-df <- data.frame(matrix(unlist(mine), nrow=num_bins, byrow=T))
-q_means <- rowMeans(df)
-indels$QUALITY_LOD <- q_means[indels$QUALITY_BIN]
-x <- rowsum(indels$ONE, indels$QUALITY_BIN)
-indels$BIN_SUM <- x[indels$QUALITY_BIN]
-indels$ONEBP <- as.numeric(abs(indels$EVENTLENGTH)==1)
-indels$PROPORTION_ONEBP <- get_proportion(indels, num_bins, indels$ONEBP, indels$QUALITY_BIN)
-
-p7 <- ggplot(indels, aes(x=get(score_key), y=PROPORTION_ONEBP, group=QUALITY_BIN, color=Unfiltered, shape=EVENTLENGTH<0)) +
- scale_color_discrete(name='', breaks=c("TRUE", "FALSE"), labels=c("Passed", "Filtered")) +
- scale_shape_discrete(name='', breaks=c("TRUE", "FALSE"), labels=c("Deletion", "Insertion")) +
- geom_jitter(height = 0.005, width = 0.0, alpha=0.6) +
- geom_line(color="grey") +
- ggtitle("Proportion of 1bp INDELs per score bin") +
- xlab(score_label)
-
-# Multiple plot function
-#
-# ggplot objects can be passed in ..., or to plotlist (as a list of ggplot objects)
-# - cols: Number of columns in layout
-# - layout: A matrix specifying the layout. If present, 'cols' is ignored.
-#
-# If the layout is something like matrix(c(1,2,3,3), nrow=2, byrow=TRUE),
-# then plot 1 will go in the upper left, 2 will go in the upper right, and
-# 3 will go all the way across the bottom.
-#
-multiplot <- function(..., plotlist=NULL, file, cols=1, layout=NULL) {
- library(grid)
-
- # Make a list from the ... arguments and plotlist
- plots <- c(list(...), plotlist)
-
- numPlots = length(plots)
-
- # If layout is NULL, then use 'cols' to determine layout
- if (is.null(layout)) {
- # Make the panel
- # ncol: Number of columns of plots
- # nrow: Number of rows needed, calculated from # of cols
- layout <- matrix(seq(1, cols * ceiling(numPlots/cols)),
- ncol = cols, nrow = ceiling(numPlots/cols))
- }
-
- if (numPlots==1) {
- print(plots[[1]])
-
- } else {
- # Set up the page
- grid.newpage()
- pushViewport(viewport(layout = grid.layout(nrow(layout), ncol(layout))))
-
- # Make each plot, in the correct location
- for (i in 1:numPlots) {
- # Get the i,j matrix positions of the regions that contain this subplot
- matchidx <- as.data.frame(which(layout == i, arr.ind = TRUE))
-
- print(plots[[i]], vp = viewport(layout.pos.row = matchidx$row,
- layout.pos.col = matchidx$col))
- }
- }
-}
-ggsave(plot=multiplot(p1,p2,p3,p4,p5,p6,p7, cols=2), filename = paste(plot_title, "_plots.png", sep=""), width=16, height=20)
-
diff --git a/scripts/cnv_cromwell_tests/germline/cnv_germline_case_scattered_workflow.json b/scripts/cnv_cromwell_tests/germline/cnv_germline_case_scattered_workflow.json
index aeb0bb8796d..ab85fe78e83 100644
--- a/scripts/cnv_cromwell_tests/germline/cnv_germline_case_scattered_workflow.json
+++ b/scripts/cnv_cromwell_tests/germline/cnv_germline_case_scattered_workflow.json
@@ -17,8 +17,8 @@
"CNVGermlineCaseScatteredWorkflow.gcnv_max_training_epochs": 1,
"CNVGermlineCaseScatteredWorkflow.gcnv_min_training_epochs": 1,
"CNVGermlineCaseScatteredWorkflow.gcnv_model_tars": [
- "/home/runner/work/gatk/gatk/src/test/resources/large/cnv_germline_workflows_test_files/wes-do-gc-gcnv-model-0.tar.gz",
- "/home/runner/work/gatk/gatk/src/test/resources/large/cnv_germline_workflows_test_files/wes-do-gc-gcnv-model-1.tar.gz"],
+ "/home/runner/work/gatk/gatk/src/test/resources/large/cnv_germline_workflows_test_files/wes-do-gc-gcnv-model-shard-0.tar.gz",
+ "/home/runner/work/gatk/gatk/src/test/resources/large/cnv_germline_workflows_test_files/wes-do-gc-gcnv-model-shard-1.tar.gz"],
"CNVGermlineCaseScatteredWorkflow.gcnv_num_thermal_advi_iters": 1,
"CNVGermlineCaseScatteredWorkflow.intervals": "/home/runner/work/gatk/gatk/src/test/resources/large/cnv_germline_workflows_test_files/ice_targets_chr20xy.interval_list",
"CNVGermlineCaseScatteredWorkflow.filtered_intervals": "/home/runner/work/gatk/gatk/src/test/resources/large/cnv_germline_workflows_test_files/ice_targets_chr20xy.preprocessed.filtered.interval_list",
diff --git a/scripts/docker/dockertest.gradle b/scripts/docker/dockertest.gradle
index 692bc945db9..de860d6736d 100644
--- a/scripts/docker/dockertest.gradle
+++ b/scripts/docker/dockertest.gradle
@@ -8,7 +8,7 @@ buildscript {
plugins {
id "java" // set up default java compile and test tasks
- id "jacoco"
+// id "jacoco"
}
repositories {
@@ -113,9 +113,9 @@ def getJVMArgs(runtimeAddOpens, testAddOpens) {
test {
jvmArgs = getJVMArgs(runtimeAddOpens, testAddOpens)
- jacoco {
- jvmArgs = getJVMArgs(runtimeAddOpens, testAddOpens)
- }
+// jacoco {
+// jvmArgs = getJVMArgs(runtimeAddOpens, testAddOpens)
+// }
}
task testOnPackagedReleaseJar(type: Test){
@@ -153,22 +153,25 @@ task testOnPackagedReleaseJar(type: Test){
// Task intended to collect coverage data from testOnPackagedReleaseJar executed inside the docker image
// the classpath for these tests is set at execution time for testOnPackagedReleaseJar
-task jacocoTestReportOnPackagedReleaseJar(type: JacocoReport) {
- String sourceFiles = "$System.env.SOURCE_DIR"
- String testClassesUnpacked = "$System.env.CP_DIR"
-
+//task jacocoTestReportOnPackagedReleaseJar(type: JacocoReport) {
+// String sourceFiles = "$System.env.SOURCE_DIR"
+// String testClassesUnpacked = "$System.env.CP_DIR"
+//
+// dependsOn testOnPackagedReleaseJar
+// executionData testOnPackagedReleaseJar
+// additionalSourceDirs.setFrom(sourceSets.main.allJava.srcDirs)
+//
+// sourceDirectories.setFrom(sourceFiles)
+// classDirectories.setFrom(testClassesUnpacked)
+//
+// group = "Reporting"
+// description = "Generate Jacoco coverage reports after running tests inside the docker image."
+//
+// reports {
+// xml.required = true
+// html.required = true
+// }
+//}
+task jacocoTestReportOnPackagedReleaseJar {
dependsOn testOnPackagedReleaseJar
- executionData testOnPackagedReleaseJar
- additionalSourceDirs.setFrom(sourceSets.main.allJava.srcDirs)
-
- sourceDirectories.setFrom(sourceFiles)
- classDirectories.setFrom(testClassesUnpacked)
-
- group = "Reporting"
- description = "Generate Jacoco coverage reports after running tests inside the docker image."
-
- reports {
- xml.required = true
- html.required = true
- }
}
diff --git a/scripts/docker/gatkbase/Dockerfile b/scripts/docker/gatkbase/Dockerfile
index 201ff3c6237..7f405744c85 100644
--- a/scripts/docker/gatkbase/Dockerfile
+++ b/scripts/docker/gatkbase/Dockerfile
@@ -3,10 +3,14 @@
# NOTE: If you update the ubuntu version make sure to update the samtools/bcftools/bedtools versions in the README
FROM ubuntu:22.04
+# Set environment variables.
# Avoid interactive prompts during apt installs/upgrades
-ENV DEBIAN_FRONTEND noninteractive
+ENV DEBIAN_FRONTEND="noninteractive" HOME="/root" JAVA_LIBRARY_PATH="/usr/lib/jni" DOWNLOAD_DIR="/downloads" CONDA_URL="https://repo.anaconda.com/miniconda/Miniconda3-py310_23.10.0-1-Linux-x86_64.sh" CONDA_SHA256="c7a34df472feb69805b64df6e8db58363c5ccab41cd3b40b07e3e6dfb924359a" CONDA_PATH="/opt/miniconda" PATH="/opt/miniconda/bin:$PATH"
+
+# Define working directory.
+WORKDIR /root
-#### Basic image utilities
+#### Basic image utilities, google cloud support, and miniconda
RUN apt update && \
apt full-upgrade -y && \
apt install -y --no-install-recommends \
@@ -26,18 +30,16 @@ RUN apt update && \
git \
gpg-agent \
build-essential \
+ libblas-dev \
openjdk-17-jdk \
vim \
software-properties-common && \
apt -y clean && \
apt -y autoclean && \
apt -y autoremove && \
- rm -rf /var/lib/apt/lists/*
-
-RUN java -version
-
-#### Specific for google cloud support
-RUN echo "deb [signed-by=/usr/share/keyrings/cloud.google.gpg] http://packages.cloud.google.com/apt cloud-sdk main" \
+ rm -rf /var/lib/apt/lists/* && \
+ java -version && \
+ echo "deb [signed-by=/usr/share/keyrings/cloud.google.gpg] http://packages.cloud.google.com/apt cloud-sdk main" \
| tee -a /etc/apt/sources.list.d/google-cloud-sdk.list && curl https://packages.cloud.google.com/apt/doc/apt-key.gpg \
| apt-key --keyring /usr/share/keyrings/cloud.google.gpg add - && \
apt update -y && \
@@ -49,26 +51,8 @@ RUN echo "deb [signed-by=/usr/share/keyrings/cloud.google.gpg] http://packages.c
# Remove the anthos cli tool and related files since they are very large and we / anyone using the docker are unlikely to use them
# Remove the bundled python because we have python installed separately
rm -rf /usr/lib/google-cloud-sdk/bin/anthoscli /usr/lib/google-cloud-sdk/platform/anthoscli_licenses /usr/lib/google-cloud-sdk/platform/bundledpythonunix && \
- find / -wholename "*__pycache__/*.pyc" -exec rm {} +
-
-# Set environment variables.
-ENV HOME /root
-
-# Define working directory.
-WORKDIR /root
-
-# Define default command.
-CMD ["bash"]
-
-ENV JAVA_LIBRARY_PATH /usr/lib/jni
-
-# Install miniconda
-ENV DOWNLOAD_DIR /downloads
-ENV CONDA_URL https://repo.anaconda.com/miniconda/Miniconda3-py310_23.10.0-1-Linux-x86_64.sh
-ENV CONDA_SHA256 "c7a34df472feb69805b64df6e8db58363c5ccab41cd3b40b07e3e6dfb924359a"
-ENV CONDA_PATH /opt/miniconda
-ENV PATH $CONDA_PATH/bin:$PATH
-RUN mkdir $DOWNLOAD_DIR && \
+ find / -wholename "*__pycache__/*.pyc" -exec rm {} + && \
+ mkdir $DOWNLOAD_DIR && \
wget -nv -O $DOWNLOAD_DIR/miniconda.sh $CONDA_URL && \
test "$(sha256sum $DOWNLOAD_DIR/miniconda.sh | awk -v FS=' ' -v ORS='' '{print $1}')" = "$CONDA_SHA256" && \
bash $DOWNLOAD_DIR/miniconda.sh -p $CONDA_PATH -b && \
@@ -77,3 +61,5 @@ RUN mkdir $DOWNLOAD_DIR && \
conda config --set auto_update_conda false && \
conda config --set solver libmamba && \
rm -rf /root/.cache/pip
+
+CMD ["bash"]
\ No newline at end of file
diff --git a/scripts/gatkcondaenv.yml.template b/scripts/gatkcondaenv.yml.template
index 75284b829e8..51d00ce99e8 100644
--- a/scripts/gatkcondaenv.yml.template
+++ b/scripts/gatkcondaenv.yml.template
@@ -15,53 +15,43 @@
name: $condaEnvName
channels:
# if channels other than conda-forge are added and the channel order is changed (note that conda channel_priority is currently set to flexible),
-# verify that key dependencies are installed from the correct channel and compiled against MKL
+# verify that key dependencies are installed from the correct channel
- conda-forge
-- defaults
+
dependencies:
# core python dependencies
-- conda-forge::python=3.6.10 # do not update
-- conda-forge::pip=21.3.1
-- conda-forge::mkl=2019.5 # MKL typically provides dramatic performance increases for theano, tensorflow, and other key dependencies
-- conda-forge::mkl-service=2.3.0
-- conda-forge::joblib=1.1.1 # must pin joblib - versions after 1.1.1 no longer support python 3.6
-- conda-forge::numpy=1.17.5 # do not update, this will break scipy=1.0.0
- # verify that numpy is compiled against MKL (e.g., by checking *_mkl_info using numpy.show_config())
- # and that it is used in tensorflow, theano, and other key dependencies
-- conda-forge::theano=1.0.4 # it is unlikely that new versions of theano will be released
- # verify that this is using numpy compiled against MKL (e.g., by the presence of -lmkl_rt in theano.config.blas.ldflags)
-- defaults::tensorflow=1.15.0 # update only if absolutely necessary, as this may cause conflicts with other core dependencies
- # verify that this is using numpy compiled against MKL (e.g., by checking tensorflow.pywrap_tensorflow.IsMklEnabled())
-- conda-forge::scipy=1.0.0 # do not update, this will break a scipy.misc.logsumexp import (deprecated in scipy=1.0.0) in pymc3=3.1
-- conda-forge::pymc3=3.1 # do not update, this will break gcnvkernel
-- conda-forge::h5py=2.10.0 # required by keras 2.2.4
-- conda-forge::keras=2.2.4 # updated from pip-installed 2.2.0, which caused various conflicts/clobbers of conda-installed packages
- # conda-installed 2.2.4 appears to be the most recent version with a consistent API and without conflicts/clobbers
- # if you wish to update, note that versions of conda-forge::keras after 2.2.5
- # undesirably set the environment variable KERAS_BACKEND = theano by default
-- defaults::intel-openmp=2019.4
-- conda-forge::scikit-learn=0.23.1
-- conda-forge::matplotlib=3.2.1
-- conda-forge::pandas=1.0.3
-- conda-forge::typing_extensions=4.1.1 # see https://github.com/broadinstitute/gatk/issues/7800 and linked PRs
-- conda-forge::dill=0.3.4 # used for pickling lambdas in TrainVariantAnnotationsModel
+- conda-forge::python=3.10.13 # do not update without good reason
+- conda-forge:pip=23.3.1
+- conda-forge:blas=1.0=mkl # our official environment uses MKL versions of various packages; if other versions are desired, users should edit this YML accordingly
+- conda-forge::numpy=1.26.2
+- conda-forge::pymc=5.10.1
+- conda-forge::pytensor=2.18.3
+- conda-forge::scipy=1.11.4
+- conda-forge::h5py=3.10.0
+- conda-forge::pytorch=2.1.0=*mkl*100
+- conda-forge::pytorch-lightning=2.4.0 # supports Pytorch >= 2.1 and <= 2.4, used by NVScoreVariants
+- conda-forge::scikit-learn=1.3.2
+- conda-forge::matplotlib=3.8.2
+- conda-forge::pandas=2.1.3
+- conda-forge::tqdm=4.66.1
+- conda-forge::dill=0.3.7 # used for pickling lambdas in TrainVariantAnnotationsModel
+- conda-forge::biopython=1.84 # used by NVScoreVariants
# core R dependencies; these should only be used for plotting and do not take precedence over core python dependencies!
-- r-base=3.6.2
-- r-data.table=1.12.8
-- r-dplyr=0.8.5
-- r-getopt=1.20.3
-- r-ggplot2=3.3.0
-- r-gplots=3.0.3
-- r-gsalib=2.1
-- r-optparse=1.6.4
-- r-backports=1.1.10
+- r-base=4.3.1
+- r-data.table=1.14.8
+- r-dplyr=1.1.3
+- r-getopt=1.20.4
+- r-ggplot2=3.4.4
+- r-gplots=3.1.3
+- r-gsalib=2.2.1
+- r-optparse=1.7.3
+- r-backports=1.4.1
# other python dependencies; these should be removed after functionality is moved into Java code
-- biopython=1.76
-- pyvcf=0.6.8
-- bioconda::pysam=0.15.3 # using older conda-installed versions may result in libcrypto / openssl bugs
+- bioconda::pysam=0.22.0
+- conda-forge::pyvcf=0.6.8
# pip installs should be avoided, as pip may not respect the dependencies found by the conda solver
- pip:
diff --git a/scripts/mutect2_wdl/mutect2.wdl b/scripts/mutect2_wdl/mutect2.wdl
index 5d93a688cca..18f7004419f 100755
--- a/scripts/mutect2_wdl/mutect2.wdl
+++ b/scripts/mutect2_wdl/mutect2.wdl
@@ -314,6 +314,8 @@ workflow Mutect2 {
File? maf_segments = CalculateContamination.maf_segments
File? read_orientation_model_params = LearnReadOrientationModel.artifact_prior_table
File? m3_dataset = Concatenate.concatenated
+ File permutect_contigs_table = select_first(M2.permutect_contigs_table)
+ File permutect_read_groups_table = select_first(M2.permutect_read_groups_table)
}
}
@@ -442,22 +444,22 @@ task M2 {
touch bamout.bam
touch f1r2.tar.gz
touch dataset.txt
- echo "" > normal_name.txt
-
- gatk --java-options "-Xmx~{command_mem}m" GetSampleName -R ~{ref_fasta} -I ~{tumor_reads} -O tumor_name.txt -encode \
- ~{"--gcs-project-for-requester-pays " + gcs_project_for_requester_pays}
- tumor_command_line="-I ~{tumor_reads} -tumor `cat tumor_name.txt`"
+ touch contigs.table
+ touch read-groups.table
if [[ ! -z "~{normal_reads}" ]]; then
- gatk --java-options "-Xmx~{command_mem}m" GetSampleName -R ~{ref_fasta} -I ~{normal_reads} -O normal_name.txt -encode \
+ gatk --java-options "-Xmx~{command_mem}m" GetSampleName -R ~{ref_fasta} -I ~{normal_reads} -O normal_names.txt -encode \
~{"--gcs-project-for-requester-pays " + gcs_project_for_requester_pays}
- normal_command_line="-I ~{normal_reads} -normal `cat normal_name.txt`"
+ # add "-normal " to the start of each line and " " to the end, then remove newlines
+ # to get -normal sample1 -normal sample2 etc
+ normal_sample_line=`awk '{ print "-normal", $0 }' normal_names.txt | tr '\n' ' '`
fi
gatk --java-options "-Xmx~{command_mem}m" Mutect2 \
-R ~{ref_fasta} \
- $tumor_command_line \
- $normal_command_line \
+ -I ~{tumor_reads} \
+ ~{"-I " + normal_reads} \
+ $normal_sample_line \
~{"--germline-resource " + gnomad} \
~{"-pon " + pon} \
~{"-L " + intervals} \
@@ -478,7 +480,7 @@ task M2 {
# If the variants for contamination and the intervals for this scatter don't intersect, GetPileupSummaries
# throws an error. However, there is nothing wrong with an empty intersection for our purposes; it simply doesn't
- # contribute to the merged pileup summaries that we create downstream. We implement this by with array outputs.
+ # contribute to the merged pileup summaries that we create downstream. We implement this via array outputs.
# If the tool errors, no table is created and the glob yields an empty array.
set +e
@@ -513,13 +515,13 @@ task M2 {
File unfiltered_vcf = "~{output_vcf}"
File unfiltered_vcf_idx = "~{output_vcf_idx}"
File output_bamOut = "bamout.bam"
- String tumor_sample = read_string("tumor_name.txt")
- String normal_sample = read_string("normal_name.txt")
File stats = "~{output_stats}"
File f1r2_counts = "f1r2.tar.gz"
Array[File] tumor_pileups = glob("*tumor-pileups.table")
Array[File] normal_pileups = glob("*normal-pileups.table")
File m3_dataset = "dataset.txt"
+ File permutect_contigs_table = "contigs.table"
+ File permutect_read_groups_table = "read-groups.table"
}
}
diff --git a/src/main/java/org/broadinstitute/hellbender/cmdline/CommandLineProgram.java b/src/main/java/org/broadinstitute/hellbender/cmdline/CommandLineProgram.java
index 8835707c556..bc7e7eb9f3e 100644
--- a/src/main/java/org/broadinstitute/hellbender/cmdline/CommandLineProgram.java
+++ b/src/main/java/org/broadinstitute/hellbender/cmdline/CommandLineProgram.java
@@ -28,6 +28,7 @@
import java.io.IOException;
import java.net.InetAddress;
import java.nio.file.*;
+import java.nio.file.attribute.PosixFilePermission;
import java.text.DecimalFormat;
import java.time.Duration;
import java.time.ZonedDateTime;
@@ -167,6 +168,10 @@ public Object instanceMainPostParseArgs() {
final Path p = tmpDir.toPath();
try {
p.getFileSystem().provider().checkAccess(p, AccessMode.READ, AccessMode.WRITE);
+
+ // Warn if there's anything that prevents execution in the tmp dir because some tools need that
+ tryToWriteAnExecutableFileAndWarnOnFailure(p);
+
System.setProperty("java.io.tmpdir", IOUtils.getAbsolutePathWithoutFileProtocol(p));
} catch (final AccessDeniedException | NoSuchFileException e) {
// TODO: it may be that the program does not need a tmp dir
@@ -494,4 +499,49 @@ public final CommandLineParser getCommandLineParser() {
protected interface AutoCloseableNoCheckedExceptions extends AutoCloseable{
@Override void close();
}
+
+ private void tryToWriteAnExecutableFileAndWarnOnFailure(final Path p) {
+ Path tempFilePath = null;
+ try {
+ // This test relies on the file system supporting posix file permissions
+ if(p.getFileSystem().supportedFileAttributeViews().contains("posix")) {
+ // Write an empty file to the tempdir
+ tempFilePath = Files.createTempFile(p, "gatk_exec_test", null);
+ // Add execute permissions
+ final Set<PosixFilePermission> executePermissions = EnumSet.of(
+ PosixFilePermission.OWNER_EXECUTE,
+ PosixFilePermission.GROUP_EXECUTE,
+ PosixFilePermission.OTHERS_EXECUTE
+ );
+ final Set<PosixFilePermission> newPermissions = Files.getPosixFilePermissions(tempFilePath);
+ newPermissions.addAll(executePermissions);
+
+ Files.setPosixFilePermissions(tempFilePath, newPermissions);
+ if(!Files.isExecutable(tempFilePath)) {
+ logger.warn(
+ "User has permissions to create executable files within the configured temporary directory, " +
+ "but cannot execute those files. It is possible the directory has been mounted using the " +
+ "'noexec' flag. This can cause issues for some GATK tools. You can specify a different " +
+ "directory using --tmp-dir"
+ );
+ }
+ }
+ } catch(Exception e) {
+ logger.warn(
+ "Cannot create executable files within the configured temporary directory. It is possible " +
+ "this user does not have the proper permissions to execute files within this directory. " +
+ "This can cause issues for some GATK tools. You can specify a different directory using " +
+ "--tmp-dir"
+ );
+ logger.debug(e);
+ } finally {
+ // Make sure we clean up the test file
+ try {
+ Files.deleteIfExists(tempFilePath);
+ } catch(Exception e) {
+ logger.warn("Failed to delete temp file for testing temp dir", e);
+ }
+ }
+
+ }
}
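For reviewers who want to probe a temp directory outside of GATK, here is a minimal standalone sketch of the same executable-file check added above. It assumes only the JDK; the class and method names are illustrative and are not part of this change.

```java
import java.nio.file.*;
import java.nio.file.attribute.PosixFilePermission;
import java.util.EnumSet;
import java.util.Set;

// Illustrative only: reports whether files created in a directory can be made executable,
// which typically fails on filesystems mounted with the 'noexec' flag.
public class TmpDirExecCheck {
    public static void main(String[] args) throws Exception {
        final Path dir = Paths.get(args.length > 0 ? args[0] : System.getProperty("java.io.tmpdir"));
        final Path probe = Files.createTempFile(dir, "exec_probe", null);
        try {
            if (probe.getFileSystem().supportedFileAttributeViews().contains("posix")) {
                final Set<PosixFilePermission> perms = Files.getPosixFilePermissions(probe);
                perms.addAll(EnumSet.of(
                        PosixFilePermission.OWNER_EXECUTE,
                        PosixFilePermission.GROUP_EXECUTE,
                        PosixFilePermission.OTHERS_EXECUTE));
                Files.setPosixFilePermissions(probe, perms);
                System.out.println(dir + (Files.isExecutable(probe)
                        ? " allows executable files"
                        : " does not allow executable files (possibly mounted noexec)"));
            } else {
                System.out.println("POSIX permissions not supported; check skipped");
            }
        } finally {
            // Clean up the probe file regardless of the outcome
            Files.deleteIfExists(probe);
        }
    }
}
```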
diff --git a/src/main/java/org/broadinstitute/hellbender/cmdline/DeprecatedToolsRegistry.java b/src/main/java/org/broadinstitute/hellbender/cmdline/DeprecatedToolsRegistry.java
index 2d9f7da9099..1d3f00147fb 100644
--- a/src/main/java/org/broadinstitute/hellbender/cmdline/DeprecatedToolsRegistry.java
+++ b/src/main/java/org/broadinstitute/hellbender/cmdline/DeprecatedToolsRegistry.java
@@ -22,6 +22,14 @@ public class DeprecatedToolsRegistry {
// Indicate version in which the tool disappeared, and recommended replacement in parentheses if applicable
deprecatedTools.put("IndelRealigner", Pair.of("4.0.0.0", "Please use GATK3 to run this tool"));
deprecatedTools.put("RealignerTargetCreator", Pair.of("4.0.0.0", "Please use GATK3 to run this tool"));
+ deprecatedTools.put("CNNScoreVariants", Pair.of("4.6.1.0",
+ "Please use the replacement tool NVScoreVariants instead, which produces virtually identical results"));
+ deprecatedTools.put("CNNVariantTrain", Pair.of("4.6.1.0",
+ "Please use a version of GATK prior to 4.6.1.0 to run this tool, " +
+ "or wait for the forthcoming Pytorch-based training tool for NVScoreVariants to be released"));
+ deprecatedTools.put("CNNVariantWriteTensors", Pair.of("4.6.1.0",
+ "Please use a version of GATK prior to 4.6.1.0 to run this tool, " +
+ "or wait for the forthcoming Pytorch-based training tool for NVScoreVariants to be released"));
}
/**
diff --git a/src/main/java/org/broadinstitute/hellbender/cmdline/ReadFilterArgumentDefinitions.java b/src/main/java/org/broadinstitute/hellbender/cmdline/ReadFilterArgumentDefinitions.java
index 9b8d016e14d..d873e545e62 100644
--- a/src/main/java/org/broadinstitute/hellbender/cmdline/ReadFilterArgumentDefinitions.java
+++ b/src/main/java/org/broadinstitute/hellbender/cmdline/ReadFilterArgumentDefinitions.java
@@ -53,10 +53,8 @@ private ReadFilterArgumentDefinitions(){}
public static final String KEEP_INTERVAL_NAME = "keep-intervals";
- public static final String SOFT_CLIPPED_RATIO_THRESHOLD = "soft-clipped-ratio-threshold";
- public static final String SOFT_CLIPPED_LEADING_TRAILING_RATIO_THRESHOLD = "soft-clipped-leading-trailing-ratio";
-
- public static final String INVERT_SOFT_CLIP_RATIO_FILTER = "invert-soft-clip-ratio-filter";
+ public static final String SOFT_CLIPPED_RATIO_THRESHOLD = "max-soft-clipped-ratio";
+ public static final String SOFT_CLIPPED_LEADING_TRAILING_RATIO_THRESHOLD = "max-soft-clipped-leading-trailing-ratio";
public static final String READ_FILTER_TAG = "read-filter-tag";
public static final String READ_FILTER_TAG_COMP = "read-filter-tag-comp";
diff --git a/src/main/java/org/broadinstitute/hellbender/cmdline/StandardArgumentDefinitions.java b/src/main/java/org/broadinstitute/hellbender/cmdline/StandardArgumentDefinitions.java
index 1c8596eb91b..ad1bb00a10a 100644
--- a/src/main/java/org/broadinstitute/hellbender/cmdline/StandardArgumentDefinitions.java
+++ b/src/main/java/org/broadinstitute/hellbender/cmdline/StandardArgumentDefinitions.java
@@ -46,7 +46,7 @@ private StandardArgumentDefinitions(){}
public static final String INVALIDATE_PREVIOUS_FILTERS_LONG_NAME = "invalidate-previous-filters";
public static final String SORT_ORDER_LONG_NAME = "sort-order";
public static final String FLOW_ORDER_FOR_ANNOTATIONS = "flow-order-for-annotations";
-
+ public static final String VARIANT_OUTPUT_INTERVAL_FILTERING_MODE_LONG_NAME = "variant-output-filtering";
public static final String INPUT_SHORT_NAME = "I";
public static final String OUTPUT_SHORT_NAME = "O";
diff --git a/src/main/java/org/broadinstitute/hellbender/engine/GATKTool.java b/src/main/java/org/broadinstitute/hellbender/engine/GATKTool.java
index fc15bcd9425..f4d529ebf4b 100644
--- a/src/main/java/org/broadinstitute/hellbender/engine/GATKTool.java
+++ b/src/main/java/org/broadinstitute/hellbender/engine/GATKTool.java
@@ -16,8 +16,10 @@
import java.util.*;
import java.util.stream.Stream;
+
import org.broadinstitute.barclay.argparser.Argument;
import org.broadinstitute.barclay.argparser.ArgumentCollection;
+import org.broadinstitute.barclay.argparser.CommandLineException;
import org.broadinstitute.barclay.argparser.CommandLinePluginDescriptor;
import org.broadinstitute.hellbender.cmdline.CommandLineProgram;
import org.broadinstitute.hellbender.cmdline.GATKPlugin.GATKAnnotationPluginDescriptor;
@@ -45,6 +47,7 @@
import org.broadinstitute.hellbender.utils.reference.ReferenceUtils;
import org.broadinstitute.hellbender.utils.variant.GATKVariantContextUtils;
import org.broadinstitute.hellbender.utils.variant.writers.ShardingVCFWriter;
+import org.broadinstitute.hellbender.utils.variant.writers.IntervalFilteringVcfWriter;
/**
* Base class for all GATK tools. Tool authors that want to write a "GATK" tool but not use one of
@@ -417,6 +420,13 @@ public int getDefaultCloudIndexPrefetchBufferSize() {
*/
public String getProgressMeterRecordLabel() { return ProgressMeter.DEFAULT_RECORD_LABEL; }
+ /**
+ * @return the default mode, which performs no filtering. Override to change how variants are filtered against the intervals for your tool.
+ */
+ public IntervalFilteringVcfWriter.Mode getVariantOutputFilteringMode(){
+ return IntervalFilteringVcfWriter.Mode.ANYWHERE;
+ }
+
protected List<SimpleInterval> transformTraversalIntervals(final List<SimpleInterval> getIntervals, final SAMSequenceDictionary sequenceDictionary) {
return getIntervals;
}
@@ -600,7 +610,7 @@ public boolean requiresIntervals() {
/**
* Does this tool want to disable the progress meter? If so, override here to return true
- *
+ *
* @return true if this tools wants to disable progress meter output, otherwise false
*/
public boolean disableProgressMeter() {
@@ -727,12 +737,16 @@ protected void onStartup() {
initializeIntervals(); // Must be initialized after reference, reads and features, since intervals currently require a sequence dictionary from another data source
- if ( seqValidationArguments.performSequenceDictionaryValidation()) {
+ if (seqValidationArguments.performSequenceDictionaryValidation()) {
validateSequenceDictionaries();
}
checkToolRequirements();
+ if ((getVariantOutputFilteringMode() != IntervalFilteringVcfWriter.Mode.ANYWHERE ) && userIntervals == null){
+ throw new CommandLineException.MissingArgument("-L or -XL", "Intervals are required if --" + StandardArgumentDefinitions.VARIANT_OUTPUT_INTERVAL_FILTERING_MODE_LONG_NAME + " was specified or if the tool uses interval filtering.");
+ }
+
initializeProgressMeter(getProgressMeterRecordLabel());
}
@@ -911,20 +925,27 @@ public VariantContextWriter createVCFWriter(final Path outPath) {
if (outputSitesOnlyVCFs) {
options.add(Options.DO_NOT_WRITE_GENOTYPES);
}
-
+ final VariantContextWriter unfilteredWriter;
if (maxVariantsPerShard > 0) {
- return new ShardingVCFWriter(
+ unfilteredWriter = new ShardingVCFWriter(
outPath,
maxVariantsPerShard,
sequenceDictionary,
createOutputVariantMD5,
- options.toArray(new Options[options.size()]));
+ options.toArray(new Options[0]));
+ } else {
+ unfilteredWriter = GATKVariantContextUtils.createVCFWriter(
+ outPath,
+ sequenceDictionary,
+ createOutputVariantMD5,
+ options.toArray(new Options[0]));
}
- return GATKVariantContextUtils.createVCFWriter(
- outPath,
- sequenceDictionary,
- createOutputVariantMD5,
- options.toArray(new Options[options.size()]));
+
+ return getVariantOutputFilteringMode() == IntervalFilteringVcfWriter.Mode.ANYWHERE ?
+ unfilteredWriter :
+ new IntervalFilteringVcfWriter(unfilteredWriter,
+ intervalArgumentCollection.getIntervals(getBestAvailableSequenceDictionary()),
+ getVariantOutputFilteringMode());
}
/**
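The new getVariantOutputFilteringMode() hook, together with the --variant-output-filtering argument added in VariantWalkerBase below, lets a tool restrict emitted variants to the traversal intervals. The following is a hedged sketch of how a tool might opt in: the walker name is hypothetical, and since the only Mode constant named in this diff is ANYWHERE, the stricter constant used below is an assumption about IntervalFilteringVcfWriter.Mode rather than a confirmed value.

```java
import htsjdk.variant.variantcontext.VariantContext;
import org.broadinstitute.hellbender.engine.FeatureContext;
import org.broadinstitute.hellbender.engine.ReadsContext;
import org.broadinstitute.hellbender.engine.ReferenceContext;
import org.broadinstitute.hellbender.engine.VariantWalker;
import org.broadinstitute.hellbender.utils.variant.writers.IntervalFilteringVcfWriter;

// Hypothetical walker that forces interval-restricted variant output by default.
public final class ExampleIntervalRestrictedWalker extends VariantWalker {

    @Override
    public IntervalFilteringVcfWriter.Mode getVariantOutputFilteringMode() {
        // Assumed enum constant; only Mode.ANYWHERE is named in this diff.
        return IntervalFilteringVcfWriter.Mode.OVERLAPS;
    }

    @Override
    public void apply(final VariantContext variant, final ReadsContext readsContext,
                      final ReferenceContext referenceContext, final FeatureContext featureContext) {
        // With a non-ANYWHERE mode, createVCFWriter(...) wraps its writer in an
        // IntervalFilteringVcfWriter, and onStartup() enforces that -L/-XL was supplied.
    }
}
```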
diff --git a/src/main/java/org/broadinstitute/hellbender/engine/MultiVariantWalkerGroupedByOverlap.java b/src/main/java/org/broadinstitute/hellbender/engine/MultiVariantWalkerGroupedByOverlap.java
new file mode 100644
index 00000000000..9578671163e
--- /dev/null
+++ b/src/main/java/org/broadinstitute/hellbender/engine/MultiVariantWalkerGroupedByOverlap.java
@@ -0,0 +1,206 @@
+package org.broadinstitute.hellbender.engine;
+
+import htsjdk.samtools.util.Locatable;
+import htsjdk.samtools.util.OverlapDetector;
+import htsjdk.variant.variantcontext.Allele;
+import htsjdk.variant.variantcontext.VariantContext;
+import org.broadinstitute.barclay.argparser.Advanced;
+import org.broadinstitute.barclay.argparser.Argument;
+import org.broadinstitute.hellbender.exceptions.GATKException;
+import org.broadinstitute.hellbender.utils.SimpleInterval;
+import org.broadinstitute.hellbender.utils.Utils;
+
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.stream.Collectors;
+
+
+/**
+ * A MultiVariantWalker that walks over multiple variant context sources in reference order and emits to client tools
+ * groups of all input variant contexts that overlap one another. This is intended to mimic GATK3 traversal behavior for
+ * some tools.
+ *
+ * As such, the argument '--ignore-variants-starting-outside-interval' has been provided to mimic GATK3's behavior of
+ * only presenting variants that start inside the requested interval, regardless of whether there is a spanning variant.
+ *
+ * Client tools must implement apply(List<VariantContext> variantContexts, ReferenceContext referenceContext, List<ReadsContext> readsContexts)
+ */
+public abstract class MultiVariantWalkerGroupedByOverlap extends MultiVariantWalker {
+ private List<VariantContext> currentVariants = new ArrayList<>();
+ private String lastCurrentVariantContig;
+ private int lastCurrentVariantEnd;
+ private List<ReadsContext> currentReadsContexts = new ArrayList<>();
+ private OverlapDetector<SimpleInterval> overlapDetector;
+
+ public static final String IGNORE_VARIANTS_THAT_START_OUTSIDE_INTERVAL = "ignore-variants-starting-outside-interval";
+
+ public static final String COMBINE_VARIANTS_DISTANCE = "combine-variants-distance";
+
+ public static final String MAX_COMBINED_DISTANCE = "max-distance";
+
+ public static final String REFERENCE_WINDOW_PADDING = "ref-padding";
+
+ /**
+ * this option has no effect unless intervals are specified.
+ *
+ * This exists to mimic GATK3 interval traversal patterns
+ */
+ @Advanced
+ @Argument(fullName = IGNORE_VARIANTS_THAT_START_OUTSIDE_INTERVAL,
+ doc = "Restrict variant output to sites that start within provided intervals (only applies when an interval is specified)",
+ optional = true)
+ protected boolean ignoreIntervalsOutsideStart = false;
+
+ @Advanced
+ @Argument(fullName = COMBINE_VARIANTS_DISTANCE, doc = "Maximum distance for variants to be grouped together", optional = true)
+ protected int distanceToCombineVariants = defaultDistanceToGroupVariants();
+
+ @Advanced
+ @Argument(fullName = MAX_COMBINED_DISTANCE, doc = "Maximum total span of a group of combined variants", optional = true)
+ protected int maxCombinedDistance = defaultMaxGroupedSpan();
+
+ @Advanced
+ @Argument(fullName = REFERENCE_WINDOW_PADDING, doc = "Number of bases on either side to expand spanning reference window", optional = true)
+ protected int referenceWindowPadding = defaultReferenceWindowPadding();
+
+ @Advanced
+ @Argument(fullName = "ignore-reference-blocks", shortName = "ignore-ref-blocks", optional = true)
+ protected boolean ignoreReferenceBlocks = false;
+
+ // override to group variants that start nearby but not at the same locus
+ protected int defaultDistanceToGroupVariants() {
+ return 10000;
+ }
+
+ // override to change reference padding
+ protected int defaultReferenceWindowPadding() {
+ return 1;
+ }
+
+ // override to cap the size span of combined variants
+ protected int defaultMaxGroupedSpan() {
+ return Integer.MAX_VALUE;
+ }
+
+ @Override
+ public boolean requiresReference() {
+ return true;
+ }
+
+ /**
+ * This method keeps track of all the variants it is passed and will feed all the variants that start at the same
+ * site to the reduce method.
+ *
+ * {@inheritDoc}
+ */
+ @Override
+ public final void apply(VariantContext variant, ReadsContext readsContext, ReferenceContext referenceContext, FeatureContext featureContext) {
+
+ // Filtering out variants that start outside of the specified intervals
+ if (ignoreIntervalsOutsideStart && !isWithinInterval(new SimpleInterval(variant.getContig(), variant.getStart(), variant.getStart()))) {
+ return;
+ }
+
+ if (ignoreReferenceBlocks && variant.getAlternateAlleles().size() ==1 && variant.getAlternateAllele(0).equals(Allele.NON_REF_ALLELE)) {
+ return;
+ }
+
+ // Collecting all the reads that start at a particular base into one.
+ if (currentVariants.isEmpty()) {
+ lastCurrentVariantContig = variant.getContig();
+ } else if (!currentVariants.get(0).contigsMatch(variant)
+ || variant.getStart() > lastCurrentVariantEnd) {
+ // Emptying any sites which should emit a new VC since the last one
+ apply(new ArrayList<>(currentVariants), currentReadsContexts);
+ currentVariants.clear();
+ currentReadsContexts.clear();
+ }
+
+ currentVariants.add(variant);
+ currentReadsContexts.add(readsContext);
+ if (variant.getEnd() > lastCurrentVariantEnd || !lastCurrentVariantContig.equals(variant.getContig())) {
+ lastCurrentVariantEnd = variant.getEnd();
+ lastCurrentVariantContig = variant.getContig();
+ }
+ }
+
+ /**
+ * This method must be implemented by tool authors.
+ *
+ * This is the primary traversal for any MultiVariantWalkerGroupedByOverlap walkers. Will traverse over input variant contexts
+ * and call #apply() exactly once for each group of overlapping variant contexts. All overlapping variants
+ * across source files will be grouped and passed as a list of VariantContext objects.
+ * @param variantContexts VariantContexts from driving variants that overlap one another
+ * NOTE: This will never be empty
+ * @param referenceContext ReferenceContext object covering the reference of the longest spanning VariantContext
+ * @param readsContexts
+ */
+ public abstract void apply(final List<VariantContext> variantContexts, final ReferenceContext referenceContext, final List<ReadsContext> readsContexts);
+
+ public void apply(List<VariantContext> variantContexts, final List<ReadsContext> readsContexts) {
+ apply(variantContexts, makeSpanningReferenceContext(variantContexts, referenceWindowPadding), readsContexts);
+ }
+
+ /**
+ * Helper method that ensures the reference context it returns is adequate to span the length of all the accumulated
+ * VariantContexts. It assumes that all variant contexts in currentVariants have the same contig.
+ */
+ private ReferenceContext makeSpanningReferenceContext(final List<VariantContext> variantContexts, final int referenceWindowPadding) {
+ Utils.nonEmpty(variantContexts, "Must have at least one current variant context");
+ final List<String> contigs = variantContexts.stream().map(VariantContext::getContig).distinct().collect(Collectors.toList());
+ Utils.validate(contigs.size() == 1, "variant contexts should all have the same contig");
+ final int minStart = variantContexts.stream().mapToInt(VariantContext::getStart).min().getAsInt();
+ final int maxEnd = variantContexts.stream().mapToInt(VariantContext::getEnd).max().getAsInt();
+ final SimpleInterval combinedInterval = new SimpleInterval(contigs.get(0), minStart, maxEnd);
+
+ final ReferenceContext combinedReferenceContext = new ReferenceContext(reference, combinedInterval);
+ combinedReferenceContext.setWindow(referenceWindowPadding,referenceWindowPadding);
+ return combinedReferenceContext;
+ }
+
+ /**
+ * {@inheritDoc}
+ *
+ * Implementation of multi-variant grouped on start traversal.
+ *
+ * NOTE: You should only override {@link #traverse()} if you are writing a new walker base class in the
+ * engine package that extends this class. It is not meant to be overridden by tools outside of the
+ * engine package.
+ */
+ @Override
+ public void traverse() {
+ beforeTraverse();
+ super.traverse();
+ afterTraverse();
+ }
+
+ /**
+ * Kept private so that tool authors cannot override it. Tool authors should override {@link #onTraversalStart} instead.
+ */
+ private void beforeTraverse() {
+ overlapDetector = hasUserSuppliedIntervals() ? OverlapDetector.create(intervalArgumentCollection.getIntervals(getBestAvailableSequenceDictionary())) : null;
+ }
+
+ /**
+ * @param loc locatable to query
+ * @return true if the query loc overlaps any user-supplied interval, or true if no intervals were specified
+ */
+ protected final boolean isWithinInterval(Locatable loc) {
+ return (overlapDetector==null || overlapDetector.overlapsAny(loc));
+ }
+
+ /**
+ * Clear accumulated reads before {@link #onTraversalSuccess()} is accessed
+ */
+ private void afterTraverse() {
+ // Clearing the accumulator
+ if (currentVariants.isEmpty()) {
+ logger.warn("Error: The requested interval contained no data in source VCF files");
+
+ } else {
+ apply(currentVariants, currentReadsContexts);
+ }
+ }
+}
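A minimal sketch of a client tool built on the new walker, showing the three-argument apply contract described in the javadoc above. The class name and log message are illustrative only; they are not part of this change.

```java
import java.util.List;

import htsjdk.variant.variantcontext.VariantContext;
import org.broadinstitute.hellbender.engine.MultiVariantWalkerGroupedByOverlap;
import org.broadinstitute.hellbender.engine.ReadsContext;
import org.broadinstitute.hellbender.engine.ReferenceContext;

// Hypothetical example tool: logs the span of each group of overlapping variants.
public final class ExampleOverlapGroupWalker extends MultiVariantWalkerGroupedByOverlap {

    @Override
    public void apply(final List<VariantContext> variantContexts,
                      final ReferenceContext referenceContext,
                      final List<ReadsContext> readsContexts) {
        // The walker guarantees variantContexts is non-empty and all entries share a contig
        final int start = variantContexts.get(0).getStart();
        final int end = variantContexts.stream().mapToInt(VariantContext::getEnd).max().getAsInt();
        logger.info(String.format("%s:%d-%d contains %d overlapping variant(s)",
                variantContexts.get(0).getContig(), start, end, variantContexts.size()));
    }
}
```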
diff --git a/src/main/java/org/broadinstitute/hellbender/engine/VariantLocusWalker.java b/src/main/java/org/broadinstitute/hellbender/engine/VariantLocusWalker.java
index e59aac64532..0e729576172 100644
--- a/src/main/java/org/broadinstitute/hellbender/engine/VariantLocusWalker.java
+++ b/src/main/java/org/broadinstitute/hellbender/engine/VariantLocusWalker.java
@@ -132,11 +132,17 @@ public void traverse() {
.forEachOrdered(variant -> {
final SimpleInterval variantInterval = new SimpleInterval(variant);
- apply(variant,
- Collections.singletonList(variant),
- new ReadsContext(reads, variantInterval, readFilter),
- new ReferenceContext(reference, variantInterval),
- new FeatureContext(features, variantInterval));
+
+ try{
+ apply(variant,
+ Collections.singletonList(variant),
+ new ReadsContext(reads, variantInterval, readFilter),
+ new ReferenceContext(reference, variantInterval),
+ new FeatureContext(features, variantInterval));
+ } catch (final IllegalStateException e) {
+ throw new GATKException("Exception thrown at " + variant.getContig() + ":" + variant.getStart()
+ + " " + variant.toString(), e);
+ }
progressMeter.update(variantInterval);
});
@@ -158,11 +164,16 @@ public void traverse() {
postTransformer)
.collect(Collectors.toList());
if (!filteredVariants.isEmpty()) {
- apply(locus,
- filteredVariants,
- new ReadsContext(reads, locus, readFilter),
- new ReferenceContext(reference, locus),
- new FeatureContext(features, locus));
+ try {
+ apply(locus,
+ filteredVariants,
+ new ReadsContext(reads, locus, readFilter),
+ new ReferenceContext(reference, locus),
+ new FeatureContext(features, locus));
+ } catch (final IllegalStateException e) {
+ throw new GATKException("Exception thrown at first variant start " + filteredVariants.get(0).getContig() + ":" + filteredVariants.get(0).getStart()
+ + " " + filteredVariants.get(0).toString(), e);
+ }
progressMeter.update(locus);
}
diff --git a/src/main/java/org/broadinstitute/hellbender/engine/VariantWalkerBase.java b/src/main/java/org/broadinstitute/hellbender/engine/VariantWalkerBase.java
index 08ae9ade223..a80760ee37c 100644
--- a/src/main/java/org/broadinstitute/hellbender/engine/VariantWalkerBase.java
+++ b/src/main/java/org/broadinstitute/hellbender/engine/VariantWalkerBase.java
@@ -3,12 +3,16 @@
import htsjdk.samtools.SAMSequenceDictionary;
import htsjdk.variant.variantcontext.VariantContext;
import htsjdk.variant.vcf.VCFHeader;
+import org.broadinstitute.barclay.argparser.Advanced;
+import org.broadinstitute.barclay.argparser.Argument;
+import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions;
import org.broadinstitute.hellbender.engine.filters.CountingVariantFilter;
import org.broadinstitute.hellbender.engine.filters.VariantFilter;
import org.broadinstitute.hellbender.engine.filters.VariantFilterLibrary;
import org.broadinstitute.hellbender.tools.genomicsdb.GenomicsDBOptions;
import org.broadinstitute.hellbender.transformers.VariantTransformer;
import org.broadinstitute.hellbender.utils.IndexUtils;
+import org.broadinstitute.hellbender.utils.variant.writers.IntervalFilteringVcfWriter;
import java.util.Spliterator;
import java.util.stream.Stream;
@@ -33,6 +37,11 @@ public abstract class VariantWalkerBase extends WalkerBase {
* queries on the driving variants).
*/
public static final int DEFAULT_DRIVING_VARIANTS_LOOKAHEAD_BASES = 100_000;
+ @Argument(fullName = StandardArgumentDefinitions.VARIANT_OUTPUT_INTERVAL_FILTERING_MODE_LONG_NAME,
+ doc = "Restrict the output variants to ones that match the specified intervals according to the specified matching mode.",
+ optional = true)
+ @Advanced
+ public IntervalFilteringVcfWriter.Mode userOutputVariantIntervalFilteringMode = null;
//Various options for reading from a GenomicsDB
protected GenomicsDBOptions genomicsDBOptions;
@@ -103,6 +112,16 @@ public SAMSequenceDictionary getBestAvailableSequenceDictionary() {
*/
public abstract VCFHeader getHeaderForVariants();
+ @Override
+ public IntervalFilteringVcfWriter.Mode getVariantOutputFilteringMode() {
+ if (userOutputVariantIntervalFilteringMode != null) {
+ return userOutputVariantIntervalFilteringMode;
+ } else {
+ // Use whatever is the default provided by GATKTool
+ return super.getVariantOutputFilteringMode();
+ }
+ }
+
/**
* Return the primary sequence dictionary to be used for the driving variants for this tool. The value returned
* will usually have been prepared in {@link #initializeDrivingVariants}
diff --git a/src/main/java/org/broadinstitute/hellbender/engine/filters/SoftClippedReadFilter.java b/src/main/java/org/broadinstitute/hellbender/engine/filters/SoftClippedReadFilter.java
index fa0421255fa..bc63df8e711 100644
--- a/src/main/java/org/broadinstitute/hellbender/engine/filters/SoftClippedReadFilter.java
+++ b/src/main/java/org/broadinstitute/hellbender/engine/filters/SoftClippedReadFilter.java
@@ -25,20 +25,13 @@ public final class SoftClippedReadFilter extends ReadFilter {
static final long serialVersionUID = 1L;
private final Logger logger = LogManager.getLogger(this.getClass());
- @VisibleForTesting
- @Argument(fullName = ReadFilterArgumentDefinitions.INVERT_SOFT_CLIP_RATIO_FILTER,
- doc = "Inverts the results from this filter, causing all variants that would pass to fail and visa-versa.",
- optional = true
- )
- boolean doInvertFilter = false;
-
@VisibleForTesting
@Argument(fullName = ReadFilterArgumentDefinitions.SOFT_CLIPPED_RATIO_THRESHOLD,
doc = "Threshold ratio of soft clipped bases (anywhere in the cigar string) to total bases in read for read to be filtered.",
optional = true,
mutex = { ReadFilterArgumentDefinitions.SOFT_CLIPPED_LEADING_TRAILING_RATIO_THRESHOLD }
)
- Double minimumSoftClippedRatio = null;
+ Double maximumSoftClippedRatio = null;
@VisibleForTesting
@Argument(fullName = ReadFilterArgumentDefinitions.SOFT_CLIPPED_LEADING_TRAILING_RATIO_THRESHOLD,
@@ -46,7 +39,7 @@ public final class SoftClippedReadFilter extends ReadFilter {
optional = true,
mutex = {ReadFilterArgumentDefinitions.SOFT_CLIPPED_RATIO_THRESHOLD}
)
- Double minimumLeadingTrailingSoftClippedRatio = null;
+ Double maximumLeadingTrailingSoftClippedRatio = null;
// Command line parser requires a no-arg constructor
public SoftClippedReadFilter() {}
@@ -61,15 +54,15 @@ private boolean testMinSoftClippedRatio(final GATKRead read) {
totalLength += element.getLength();
}
- final double softClipRatio = ((double)numSoftClippedBases / (double)totalLength);
+ final double softClipRatio = totalLength != 0 ? ((double)numSoftClippedBases / (double)totalLength) : 0.0;
- return softClipRatio > minimumSoftClippedRatio;
+ return softClipRatio <= maximumSoftClippedRatio;
}
private boolean testMinLeadingTrailingSoftClippedRatio(final GATKRead read) {
if ( read.getCigarElements().size() < 1 ) {
- return false;
+ return true; // NOTE: in this edge case the read should pass this filter, as there are no cigar elements to have edge soft-clipping.
}
// Get the index of the last cigar element:
@@ -90,12 +83,13 @@ private boolean testMinLeadingTrailingSoftClippedRatio(final GATKRead read) {
.sum();
// Calculate the ratio:
- final double softClipRatio = ((double)numLeadingTrailingSoftClippedBases / (double)totalLength);
+ final double softClipRatio = totalLength != 0 ? ((double)numLeadingTrailingSoftClippedBases / (double)totalLength) : 0.0;
- return softClipRatio > minimumLeadingTrailingSoftClippedRatio;
+ return softClipRatio <= maximumLeadingTrailingSoftClippedRatio;
}
@Override
+ // NOTE: for read filters we always return true if the read passes the filter, and false if it doesn't.
public boolean test(final GATKRead read) {
final boolean result;
@@ -103,11 +97,11 @@ public boolean test(final GATKRead read) {
// NOTE: Since we have mutex'd the args for the clipping ratios, we only need to see if they
// have been specified. If they have, that's the filter logic we're using.
// If we specified the clipping ratio, we use the min sequence length test:
- if ( minimumSoftClippedRatio != null ) {
+ if ( maximumSoftClippedRatio != null ) {
result = testMinSoftClippedRatio(read);
}
// If we specified the leading/trailing clipping ratio, we use the min sequence length test:
- else if ( minimumLeadingTrailingSoftClippedRatio != null ) {
+ else if ( maximumLeadingTrailingSoftClippedRatio != null ) {
result = testMinLeadingTrailingSoftClippedRatio(read);
}
else {
@@ -118,10 +112,6 @@ else if ( minimumLeadingTrailingSoftClippedRatio != null ) {
);
}
- // Check for if we want to invert our results:
- if ( doInvertFilter ) {
- return !result;
- }
return result;
}
}
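To make the inverted semantics explicit: under the renamed max-* arguments a read now passes while its soft-clipped fraction is at or below the threshold, and zero-length reads pass rather than dividing by zero. Below is a self-contained sketch of that test; the class, method, and variable names are illustrative and not part of the filter itself.

```java
// Illustrative sketch of the ratio test semantics after this change.
public final class SoftClipRatioExample {

    static boolean passesMaxSoftClipRatio(final int softClippedBases, final int totalBases, final double maxRatio) {
        // Guard against empty reads, mirroring the totalLength != 0 check added above
        final double ratio = totalBases != 0 ? (double) softClippedBases / totalBases : 0.0;
        // Previously the filter kept reads whose ratio exceeded a minimum; now it keeps reads at or below a maximum
        return ratio <= maxRatio;
    }

    public static void main(String[] args) {
        System.out.println(passesMaxSoftClipRatio(10, 100, 0.2)); // true: 10% clipped <= 20%
        System.out.println(passesMaxSoftClipRatio(30, 100, 0.2)); // false: 30% clipped > 20%
        System.out.println(passesMaxSoftClipRatio(0, 0, 0.2));    // true: empty read passes
    }
}
```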
diff --git a/src/main/java/org/broadinstitute/hellbender/engine/spark/RangePartitionCoalescer.java b/src/main/java/org/broadinstitute/hellbender/engine/spark/RangePartitionCoalescer.java
index fc1105c7d14..a3691154367 100644
--- a/src/main/java/org/broadinstitute/hellbender/engine/spark/RangePartitionCoalescer.java
+++ b/src/main/java/org/broadinstitute/hellbender/engine/spark/RangePartitionCoalescer.java
@@ -4,9 +4,9 @@
import org.apache.spark.rdd.PartitionCoalescer;
import org.apache.spark.rdd.PartitionGroup;
import org.apache.spark.rdd.RDD;
-import scala.collection.JavaConversions;
import scala.collection.Seq;
-
+import scala.jdk.javaapi.CollectionConverters;
+import java.io.Serial;
import java.io.Serializable;
import java.util.Arrays;
import java.util.List;
@@ -14,8 +14,9 @@
/**
* A {@link PartitionCoalescer} that allows a range of partitions to be coalesced into groups.
*/
-class RangePartitionCoalescer implements PartitionCoalescer, Serializable, scala.Serializable {
+class RangePartitionCoalescer implements PartitionCoalescer, Serializable {
+ @Serial
private static final long serialVersionUID = 1L;
private List<Integer> maxEndPartitionIndexes;
@@ -45,7 +46,7 @@ public PartitionGroup[] coalesce(int maxPartitions, RDD<?> parent) {
PartitionGroup group = new PartitionGroup(preferredLocation);
List<Partition> partitionsInGroup =
partitions.subList(i, maxEndPartitionIndexes.get(i) + 1);
- group.partitions().append(JavaConversions.asScalaBuffer(partitionsInGroup));
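+ // scala.collection.JavaConversions was removed in Scala 2.13; scala.jdk.javaapi.CollectionConverters is the supported replacement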
+ group.partitions().addAll(CollectionConverters.asScala(partitionsInGroup).toList());
groups[i] = group;
}
return groups;
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/CRAMIssue8768Detector.java b/src/main/java/org/broadinstitute/hellbender/tools/CRAMIssue8768Detector.java
new file mode 100644
index 00000000000..9bda68f1dab
--- /dev/null
+++ b/src/main/java/org/broadinstitute/hellbender/tools/CRAMIssue8768Detector.java
@@ -0,0 +1,144 @@
+package org.broadinstitute.hellbender.tools;
+
+import org.broadinstitute.barclay.argparser.*;
+import org.broadinstitute.hellbender.tools.filediagnostics.CRAMIssue8768Analyzer;
+import org.broadinstitute.hellbender.cmdline.CommandLineProgram;
+import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions;
+import org.broadinstitute.hellbender.engine.GATKPath;
+import picard.cmdline.programgroups.OtherProgramGroup;
+
+/**
+ * A diagnostic tool that analyzes a CRAM file to look for possible base corruption caused by
+ * GATK issue 8768.
+ *
+ * <p>This issue affects GATK versions 4.3.0.0 through 4.5.0.0, and is fixed in GATK 4.6.0.0.
+ *
+ * <p>This issue also affects Picard versions 2.27.3 through 3.1.1, and is fixed in Picard 3.2.0.
+ *
+ * <p>The bug is triggered when writing a CRAM file using one of the affected GATK/Picard versions,
+ * and both of the following conditions are met:
+ *
+ * <ul>
+ *     <li>At least one read is mapped to the very first base of a reference contig</li>
+ *     <li>The file contains more than one CRAM container (10,000 reads) with reads mapped to that same reference contig</li>
+ * </ul>
+ *
+ * <p>When both of these conditions are met, the resulting CRAM file may have corrupt containers containing reads
+ * with an incorrect sequence.
+ *
+ * <p>This tool writes a report to an output text file indicating whether the CRAM file appears to have read base corruption caused by issue 8768,
+ * and listing the affected containers. By default, the output report will have a summary of the average mismatch rate for all suspected bad containers
+ * and a few presumed good containers in order to determine if there is a large difference in the base mismatch rate.
+ *
+ * <p>Optionally, a TSV file with the same information as the textual report, but in tabular form, can be written
+ * using the "--output-tsv" argument.
+ *
+ * <p>To analyze the base mismatch rate for ALL containers, use the "verbose" option.
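+ *
+ * <p>Illustrative invocation (file names are placeholders; argument names are those declared by this tool):
+ * <pre>
+ *   gatk CRAMIssue8768Detector \
+ *       -I input.cram \
+ *       -R reference.fasta \
+ *       -O issue8768_report.txt \
+ *       --output-tsv issue8768_report.tsv
+ * </pre>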
+ */
+@ExperimentalFeature
+@WorkflowProperties
+@CommandLineProgramProperties(
+ summary = "Analyze a CRAM file to check for base corruption caused by GATK issue 8768",
+ oneLineSummary = "Analyze a CRAM file to check for base corruption caused by GATK issue 8768",
+ programGroup = OtherProgramGroup.class
+)
+public class CRAMIssue8768Detector extends CommandLineProgram {
+ // default average mismatch rate threshold above which we consider the file to be corrupt
+ private static final double DEFAULT_MISMATCH_RATE_THRESHOLD = 0.05;
+
+ @Argument(fullName = StandardArgumentDefinitions.INPUT_LONG_NAME,
+ shortName = StandardArgumentDefinitions.INPUT_SHORT_NAME,
+ doc = "Input path of CRAM file to analyze",
+ common = true)
+ @WorkflowInput
+ public GATKPath inputPath;
+
+ @Argument(fullName = StandardArgumentDefinitions.OUTPUT_LONG_NAME,
+ shortName = StandardArgumentDefinitions.OUTPUT_SHORT_NAME,
+ doc = "Output diagnostics text file",
+ common = true)
+ @WorkflowOutput
+ public GATKPath textOutputPath;
+
+ public static final String OUTPUT_TSV__ARG_NAME = "output-tsv";
+ @Argument(fullName = OUTPUT_TSV__ARG_NAME,
+ shortName = OUTPUT_TSV__ARG_NAME,
+ doc = "Output diagnostics tsv file",
+ optional = true)
+ @WorkflowOutput
+ public GATKPath tsvOutputPath;
+
+ @Argument(fullName = StandardArgumentDefinitions.REFERENCE_LONG_NAME,
+ shortName = StandardArgumentDefinitions.REFERENCE_SHORT_NAME,
+ doc = "Reference for the CRAM file",
+ common = true)
+ @WorkflowInput
+ public GATKPath referencePath;
+
+ public static final String MISMATCH_RATE_THRESHOLD_ARG_NAME = "mismatch-rate-threshold";
+ @Argument(fullName = MISMATCH_RATE_THRESHOLD_ARG_NAME,
+ shortName = MISMATCH_RATE_THRESHOLD_ARG_NAME,
+ doc = "Mismatch rate threshold above which we consider the file to be corrupt",
+ optional = true)
+ public double mismatchRateThreshold = DEFAULT_MISMATCH_RATE_THRESHOLD;
+
+ public static final String VERBOSE_ARG_NAME = "verbose";
+ @Argument(fullName = VERBOSE_ARG_NAME,
+ shortName= VERBOSE_ARG_NAME,
+ doc="Calculate and print the mismatch rate for all containers",
+ optional=true)
+ public boolean verbose = false;
+
+ public static final String ECHO_ARG_NAME = "echo-to-stdout";
+ @Argument(fullName = ECHO_ARG_NAME,
+ shortName= ECHO_ARG_NAME,
+ doc="Echo text output to stdout",
+ optional=true)
+ public boolean echoToStdout = false;
+
+ private CRAMIssue8768Analyzer cramAnalyzer;
+
+ @Override
+ protected Object doWork() {
+ cramAnalyzer = new CRAMIssue8768Analyzer(
+ inputPath,
+ textOutputPath,
+ tsvOutputPath,
+ referencePath,
+ mismatchRateThreshold,
+ verbose,
+ echoToStdout);
+ cramAnalyzer.doAnalysis();
+ return cramAnalyzer.getRetCode();
+ }
+
+ @Override
+ protected void onShutdown() {
+ if ( cramAnalyzer != null ) {
+ try {
+ cramAnalyzer.close();
+ } catch (Exception e) {
+ throw new RuntimeException(e);
+ }
+ }
+ }
+}
+
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/FlowBasedArgumentCollection.java b/src/main/java/org/broadinstitute/hellbender/tools/FlowBasedArgumentCollection.java
index 3fa41b46a04..695d1385e4c 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/FlowBasedArgumentCollection.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/FlowBasedArgumentCollection.java
@@ -10,7 +10,6 @@ public class FlowBasedArgumentCollection implements Serializable {
private static final long serialVersionUID = 0;
public static final String FLOW_USE_T0_TAG = "flow-use-t0-tag";
- public static final String PROBABILITY_RATIO_THRESHOLD_LONG_NAME = "flow-probability-threshold";
public static final String REMOVE_LONGER_THAN_ONE_INDELS_LONG_NAME = "flow-remove-non-single-base-pair-indels";
public static final String REMOVE_ONE_TO_ZERO_PROBS_LONG_NAME = "flow-remove-one-zero-probs";
public static final String NUMBER_OF_POSSIBLE_PROBS_LONG_NAME = "flow-quantization-bins";
@@ -27,8 +26,7 @@ public class FlowBasedArgumentCollection implements Serializable {
- private static final double DEFAULT_RATIO_THRESHOLD = 0.003;
- private static final double DEFAULT_FILLING_VALUE = 0.001;
+ public static final double DEFAULT_FILLING_VALUE = 0.001;
private static final boolean DEFAULT_REMOVE_LONGER_INDELS = false;
private static final boolean DEFAULT_REMOVE_ONE_TO_ZERO = false;
private static final boolean DEFAULT_SYMMETRIC_INDELS = false;
@@ -45,10 +43,6 @@ public class FlowBasedArgumentCollection implements Serializable {
@Argument(fullName = FLOW_USE_T0_TAG, doc = "Use t0 tag if exists in the read to create flow matrix", optional = true)
public boolean useT0Tag = DEFAULT_FLOW_USE_T0_TAG;
- @Advanced
- @Argument(fullName = PROBABILITY_RATIO_THRESHOLD_LONG_NAME, doc = "Lowest probability ratio to be used as an option", optional = true)
- public double probabilityRatioThreshold = DEFAULT_RATIO_THRESHOLD;
-
@Advanced
@Argument(fullName = REMOVE_LONGER_THAN_ONE_INDELS_LONG_NAME, doc = "Should the probabilities of more than 1 indel be used", optional = true)
public boolean removeLongerThanOneIndels = DEFAULT_REMOVE_LONGER_INDELS;
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/GetSampleName.java b/src/main/java/org/broadinstitute/hellbender/tools/GetSampleName.java
index f66556f539d..6aa20a9dc5c 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/GetSampleName.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/GetSampleName.java
@@ -86,18 +86,16 @@ public void onTraversalStart() {
}
final List<String> sampleNames = getHeaderForReads().getReadGroups().stream().map(s -> s.getSample()).distinct().collect(Collectors.toList());
- if (sampleNames.size() > 1) {
- throw new UserException.BadInput("The given input bam has more than one unique sample name: " + StringUtils.join(sampleNames, ", "));
- }
if (sampleNames.size() == 0) {
throw new UserException.BadInput("The given bam input has no sample names.");
}
try (final OutputStreamWriter fileWriter = new OutputStreamWriter(outputSampleNameFile.getOutputStream())) {
- final String rawSample = sampleNames.get(0);
- final String outputSample = urlEncode ? IOUtils.urlEncode(rawSample) : rawSample;
- fileWriter.write(outputSample);
+ final String outputSamplesOnSeparateLines = sampleNames.stream()
+ .map(rawSample -> urlEncode ? IOUtils.urlEncode(rawSample) : rawSample)
+ .collect(Collectors.joining("\n"));
+ fileWriter.write(outputSamplesOnSeparateLines);
} catch (final IOException ioe) {
throw new UserException(String.format("Could not write to output file %s.", outputSampleNameFile), ioe);
}
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/PrintFileDiagnostics.java b/src/main/java/org/broadinstitute/hellbender/tools/PrintFileDiagnostics.java
index 76a0410c777..83703d9f46d 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/PrintFileDiagnostics.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/PrintFileDiagnostics.java
@@ -8,8 +8,6 @@
import org.broadinstitute.hellbender.tools.filediagnostics.HTSAnalyzerFactory;
import picard.cmdline.programgroups.OtherProgramGroup;
-import java.io.File;
-
/**
* A diagnostic tool that prints meta information about a GATK input file.
*
@@ -43,8 +41,8 @@ public class PrintFileDiagnostics extends CommandLineProgram {
doc = "Outut file for diagnostics (must be a local file)",
optional = false,
common = true)
- @WorkflowInput
- public File outputFile;
+ @WorkflowOutput
+ public GATKPath outputPath;
@Argument(shortName="count-limit",
fullName="count-limit",
@@ -56,7 +54,7 @@ public class PrintFileDiagnostics extends CommandLineProgram {
@Override
protected void onStartup() {
super.onStartup();
- htsAnalyzer = HTSAnalyzerFactory.getFileAnalyzer(inputPath, outputFile, countLimit);
+ htsAnalyzer = HTSAnalyzerFactory.getFileAnalyzer(inputPath, outputPath, countLimit);
}
@Override
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/DetermineGermlineContigPloidy.java b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/DetermineGermlineContigPloidy.java
index fcb93e202b9..bb2f7f1f0e9 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/DetermineGermlineContigPloidy.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/DetermineGermlineContigPloidy.java
@@ -67,12 +67,12 @@
* <p>OpenMP and MKL parallelism can be controlled by setting the OMP_NUM_THREADS and MKL_NUM_THREADS
* environment variables, respectively.
*
- * <p>Advanced users may wish to set the THEANO_FLAGS environment variable to override the GATK theano
+ * <p>Advanced users may wish to set the PYTENSOR_FLAGS environment variable to override the GATK PyTensor
* configuration. For example, by running
- * THEANO_FLAGS="base_compiledir=PATH/TO/BASE_COMPILEDIR" gatk DetermineGermlineContigPloidy ..., users can specify
- * the theano compilation directory (which is set to $HOME/.theano by default). See theano documentation
- * at
- * https://theano-pymc.readthedocs.io/en/latest/library/config.html.
+ * PYTENSOR_FLAGS="base_compiledir=PATH/TO/BASE_COMPILEDIR" gatk DetermineGermlineContigPloidy ..., users can specify
+ * the PyTensor compilation directory (which is set to $HOME/.pytensor by default). See PyTensor documentation
+ * at
+ * https://pytensor.readthedocs.io/en/latest/library/config.html.
* <p>OpenMP and MKL parallelism can be controlled by setting the OMP_NUM_THREADS and MKL_NUM_THREADS
* environment variables, respectively.
*
- * <p>Advanced users may wish to set the THEANO_FLAGS environment variable to override the GATK theano
+ * <p>Advanced users may wish to set the PYTENSOR_FLAGS environment variable to override the GATK PyTensor
* configuration. For example, by running
- * THEANO_FLAGS="base_compiledir=PATH/TO/BASE_COMPILEDIR" gatk GermlineCNVCaller ..., users can specify
- * the theano compilation directory (which is set to $HOME/.theano by default). See theano documentation
- * at
- * https://theano-pymc.readthedocs.io/en/latest/library/config.html.
+ * PYTENSOR_FLAGS="base_compiledir=PATH/TO/BASE_COMPILEDIR" gatk DetermineGermlineContigPloidy ..., users can specify
+ * the PyTensor compilation directory (which is set to $HOME/.pytensor by default). See PyTensor documentation
+ * at
+ * https://pytensor.readthedocs.io/en/latest/library/config.html.
*
*
* <p>Resource usage
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/PostprocessGermlineCNVCalls.java b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/PostprocessGermlineCNVCalls.java
index fa68bb4f34d..b9ac8561e59 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/PostprocessGermlineCNVCalls.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/PostprocessGermlineCNVCalls.java
@@ -89,12 +89,12 @@
* the python environment is already set up. Otherwise, the environment must be created and activated as described in the
* main GATK README.md file.
*
- * <p>Advanced users may wish to set the THEANO_FLAGS environment variable to override the GATK theano
+ * <p>Advanced users may wish to set the PYTENSOR_FLAGS environment variable to override the GATK PyTensor
* configuration. For example, by running
- * THEANO_FLAGS="base_compiledir=PATH/TO/BASE_COMPILEDIR" gatk PostprocessGermlineCNVCalls ..., users can specify
- * the theano compilation directory (which is set to $HOME/.theano by default). See theano documentation
- * at
- * https://theano-pymc.readthedocs.io/en/latest/library/config.html.
+ * PYTENSOR_FLAGS="base_compiledir=PATH/TO/BASE_COMPILEDIR" gatk DetermineGermlineContigPloidy ..., users can specify
+ * the PyTensor compilation directory (which is set to $HOME/.pytensor by default). See PyTensor documentation
+ * at
+ * https://pytensor.readthedocs.io/en/latest/library/config.html.
*
*
* <p>Required inputs:
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/plotting/PlotDenoisedCopyRatios.java b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/plotting/PlotDenoisedCopyRatios.java
index 33672cac84f..215b03154fa 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/plotting/PlotDenoisedCopyRatios.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/plotting/PlotDenoisedCopyRatios.java
@@ -204,8 +204,7 @@ private void writeDenoisingPlots(final String sampleName,
//this runs the R statement "source("CNVPlottingLibrary.R")" before the main script runs
executor.addScript(new Resource(PlottingUtils.CNV_PLOTTING_R_LIBRARY, PlotDenoisedCopyRatios.class));
executor.addScript(new Resource(PLOT_DENOISED_COPY_RATIOS_R_SCRIPT, PlotDenoisedCopyRatios.class));
- //--args is needed for Rscript to recognize other arguments properly
- executor.addArgs("--args",
+ executor.addArgs(
"--sample_name=" + sampleName,
"--standardized_copy_ratios_file=" + CopyNumberArgumentValidationUtils.getCanonicalPath(inputStandardizedCopyRatiosFile),
"--denoised_copy_ratios_file=" + CopyNumberArgumentValidationUtils.getCanonicalPath(inputDenoisedCopyRatiosFile),
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/plotting/PlotModeledSegments.java b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/plotting/PlotModeledSegments.java
index 89a211cfdd9..424c15880b2 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/plotting/PlotModeledSegments.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/plotting/PlotModeledSegments.java
@@ -285,8 +285,7 @@ private void writeModeledSegmentsPlot(final String sampleName,
//this runs the R statement "source("CNVPlottingLibrary.R")" before the main script runs
executor.addScript(new Resource(PlottingUtils.CNV_PLOTTING_R_LIBRARY, PlotModeledSegments.class));
executor.addScript(new Resource(PLOT_MODELED_SEGMENTS_R_SCRIPT, PlotModeledSegments.class));
- //--args is needed for Rscript to recognize other arguments properly
- executor.addArgs("--args",
+ executor.addArgs(
"--sample_name=" + sampleName,
"--denoised_copy_ratios_file=" + (inputDenoisedCopyRatiosFile == null ? null : CopyNumberArgumentValidationUtils.getCanonicalPath(inputDenoisedCopyRatiosFile)),
"--allelic_counts_file=" + (inputAllelicCountsFile == null ? null : CopyNumberArgumentValidationUtils.getCanonicalPath(inputAllelicCountsFile)),
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/filediagnostics/BAIAnalyzer.java b/src/main/java/org/broadinstitute/hellbender/tools/filediagnostics/BAIAnalyzer.java
index 1de153092ae..c088ff7abcd 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/filediagnostics/BAIAnalyzer.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/filediagnostics/BAIAnalyzer.java
@@ -11,16 +11,17 @@
*/
public class BAIAnalyzer extends HTSAnalyzer {
- public BAIAnalyzer(final GATKPath inputPath, final File outputFile) {
- super(inputPath, outputFile);
+ public BAIAnalyzer(final GATKPath inputPath, final GATKPath outputPath) {
+ super(inputPath, outputPath);
}
/**
* Run the analyzer for the file.
*/
protected void doAnalysis() {
- System.out.println(String.format("\nOutput written to %s\n", outputFile));
- BAMIndexer.createAndWriteIndex(inputPath.toPath().toFile(), outputFile, true);
+ System.out.println(String.format("\nOutput written to %s\n", outputPath));
+ // note this method is not nio aware
+ BAMIndexer.createAndWriteIndex(inputPath.toPath().toFile(), new File(outputPath.getRawInputString()), true);
}
@Override
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/filediagnostics/CRAIAnalyzer.java b/src/main/java/org/broadinstitute/hellbender/tools/filediagnostics/CRAIAnalyzer.java
index dcbe8109938..9fb12ae9d35 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/filediagnostics/CRAIAnalyzer.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/filediagnostics/CRAIAnalyzer.java
@@ -5,23 +5,22 @@
import htsjdk.samtools.util.RuntimeIOException;
import org.broadinstitute.hellbender.engine.GATKPath;
-import java.io.File;
-import java.io.FileOutputStream;
-import java.io.IOException;
-import java.io.InputStream;
+import java.io.*;
+import java.nio.file.Files;
/**
* Analyzer for CRAM (.crai) index files.
*/
public class CRAIAnalyzer extends HTSAnalyzer {
- final FileOutputStream fos;
+ final OutputStream fos;
- public CRAIAnalyzer(final GATKPath inputPath, final File outputFile) {
- super(inputPath, outputFile);
+ public CRAIAnalyzer(final GATKPath inputPath, final GATKPath outputPath) {
+ super(inputPath, outputPath);
try {
- fos = new FileOutputStream(outputFile);
+ fos = Files.newOutputStream(outputPath.toPath());
} catch (final IOException e) {
+
throw new RuntimeIOException(e);
}
}
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/filediagnostics/CRAMAnalyzer.java b/src/main/java/org/broadinstitute/hellbender/tools/filediagnostics/CRAMAnalyzer.java
index 76933277e99..a6d473fe10a 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/filediagnostics/CRAMAnalyzer.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/filediagnostics/CRAMAnalyzer.java
@@ -11,14 +11,11 @@
import htsjdk.samtools.util.RuntimeIOException;
import org.broadinstitute.hellbender.engine.GATKPath;
-import java.io.File;
-import java.io.FileOutputStream;
-import java.io.IOException;
-import java.io.InputStream;
+import java.io.*;
import java.nio.ByteBuffer;
import java.nio.ByteOrder;
+import java.nio.file.Files;
import java.util.Base64;
-import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.Map;
@@ -36,13 +33,13 @@ public class CRAMAnalyzer extends HTSAnalyzer {
long coreBlocksDataSize = 0L;
long recordCount = 0;
final long countLimit;
- final FileOutputStream fos;
+ final OutputStream fos;
- public CRAMAnalyzer(final GATKPath inputPathName, final File outputFile, final long countLimit) {
- super(inputPathName, outputFile);
+ public CRAMAnalyzer(final GATKPath inputPathName, final GATKPath outputPath, final long countLimit) {
+ super(inputPathName, outputPath);
this.countLimit = countLimit;
try {
- fos = new FileOutputStream(outputFile);
+ fos = Files.newOutputStream(outputPath.toPath());
} catch (final IOException e) {
throw new RuntimeIOException(e);
}
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/filediagnostics/CRAMIssue8768Analyzer.java b/src/main/java/org/broadinstitute/hellbender/tools/filediagnostics/CRAMIssue8768Analyzer.java
new file mode 100644
index 00000000000..366666a9eab
--- /dev/null
+++ b/src/main/java/org/broadinstitute/hellbender/tools/filediagnostics/CRAMIssue8768Analyzer.java
@@ -0,0 +1,435 @@
+package org.broadinstitute.hellbender.tools.filediagnostics;
+
+import htsjdk.samtools.SAMFileHeader;
+import htsjdk.samtools.SAMRecord;
+import htsjdk.samtools.SAMSequenceDictionary;
+import htsjdk.samtools.ValidationStringency;
+import htsjdk.samtools.cram.build.CRAMReferenceRegion;
+import htsjdk.samtools.cram.build.CramIO;
+import htsjdk.samtools.cram.ref.ReferenceContext;
+import htsjdk.samtools.cram.ref.ReferenceSource;
+import htsjdk.samtools.cram.structure.*;
+import htsjdk.samtools.seekablestream.SeekablePathStream;
+import htsjdk.samtools.util.RuntimeIOException;
+import htsjdk.samtools.util.SequenceUtil;
+import htsjdk.samtools.util.Tuple;
+import org.broadinstitute.hellbender.engine.GATKPath;
+import org.broadinstitute.hellbender.utils.tsv.DataLine;
+import org.broadinstitute.hellbender.utils.tsv.TableColumnCollection;
+import org.broadinstitute.hellbender.utils.tsv.TableWriter;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.OutputStream;
+import java.nio.file.Files;
+import java.util.*;
+
+/**
+ * A diagnostic class that analyzes a CRAM input file to look for conditions that trigger issue 8768.
+ */
+public class CRAMIssue8768Analyzer extends HTSAnalyzer {
+ private final ReferenceSource referenceSource;
+ private final OutputStream outputStream;
+ private final CompressorCache compressorCache = new CompressorCache();
+ private final boolean verbose;
+ private final boolean echoToConsole;
+ private final double mismatchRateThreshold;
+ private final GATKPath tsvOutputPath;
+ private int retCode = 0;
+
+ private SAMFileHeader samHeader = null;
+
+ // everything we need to record for a bad contig
+ private record BadContig(
+ String contigName,
+ List<ContainerStats> badContainers
+ ) { }
+
+ // everything we need to record for a (good or bad) container
+ private record ContainerStats(
+ int containerOrdinal, // container ordinal # within the contig
+ boolean isBad, // true if this container is bad
+ AlignmentContext alignmentContext, // reference ID, alignment start, alignment span
+ long misMatchCount, // count of mismatched bases
+ double misMatchRate // rate of mismatched bases (mismatches/total bases)
+ ) { }
+
+ public CRAMIssue8768Analyzer(
+ final GATKPath inputPath,
+ final GATKPath textOutputPath,
+ final GATKPath tsvOutputPath,
+ final GATKPath referencePath,
+ final double mismatchRateThreshold,
+ final boolean verbose,
+ final boolean echoToConsole) {
+ super(inputPath, textOutputPath);
+ this.verbose = verbose;
+ this.echoToConsole = echoToConsole;
+ this.tsvOutputPath = tsvOutputPath;
+ this.mismatchRateThreshold = mismatchRateThreshold;
+
+ referenceSource = new ReferenceSource(referencePath.toPath());
+ try {
+ outputStream = Files.newOutputStream(this.outputPath.toPath());
+ } catch (final IOException e) {
+ throw new RuntimeIOException(e);
+ }
+ }
+
+ @Override
+ public void close() throws IOException {
+ if (outputStream != null) {
+ outputStream.close();
+ }
+ }
+
+ public int getRetCode() {
+ return retCode;
+ }
+ protected void emitln(final String s) {
+ try {
+ outputStream.write(s.getBytes());
+ outputStream.write('\n');
+ } catch (IOException e) {
+ throw new RuntimeException(e);
+ }
+ }
+
+ /**
+ * Some background:
+ *
+ * Each CRAM container has an associated htsjdk AlignmentContext that models if/how the contents of the
+ * container relates to the reference. An AlignmentContext includes a ReferenceContext, and depending on the
+ * type of the ReferenceContext, possibly an alignment start and alignment span. There are 3 possible types of
+ * ReferenceContexts:
+ *
+ * 1. SINGLE_REF: The container contains only reads that are mapped to a single reference contig, in which case
+ * the referenceID for that ReferenceContext is the contig ID for the associated contig. This is the most
+ * common case, and is the only type of ReferenceContext for which the alignment start and span are meaningful.
+ * Call isMappedSingleRef() to determine if the ReferenceContext is SINGLE_REF.
+ *
+ * Note that it is an error (the code throws) to attempt to query a ReferenceContext for its contig ID if
+ * the ReferenceContext is not SINGLE_REF.
+ *
+ * 2. MULTI_REF: The container contains reads that are mapped to more than one reference contig. This is an
+ * optimization used primarily when there aren't enough reads mapped to a single reference contig to justify
+ * putting the reads into a separate container. Reads in these containers are not reference compressed, and
+ * AlignmentContexts for MULTI_REF containers have no meaningful start/span values. Call isMappedMultiRef() to
+ * determine if the ReferenceContext is MULTI_REF.
+ *
+ * 3. UNMAPPED_UNPLACED: The container contains only unmapped unplaced reads. start/span are irrelevant. Call
+ * isUnmappedUnplaced() to determine if the ReferenceContext is UNMAPPED_UNPLACED.
+ */
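+ // Illustrative sketch (not used directly by this tool): dispatching on the three ReferenceContext kinds described above:
+ //   final ReferenceContext refContext = container.getAlignmentContext().getReferenceContext();
+ //   if (refContext.isMappedSingleRef())      { /* single contig; alignment start/span are meaningful */ }
+ //   else if (refContext.isMappedMultiRef())  { /* reads from multiple contigs; ignore start/span */ }
+ //   else                                     { /* unmapped/unplaced; ignore start/span */ }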
+ public void doAnalysis() {
+ final Map<String, BadContig> badContigs = new LinkedHashMap<>(); // contig name, BadContig
+ final List<ContainerStats> goodContainers = new ArrayList<>(); // good containers, for all contigs
+ int containerOrdinalForContig = 0;
+ final int NUMBER_OF_GOOD_CONTAINERS_PER_CONTIG_TO_REPORT = 4;
+ int nGoodContainersReportedForContig = 0;
+
+ // these are HTSJDK CRAM container alignment contexts, not the GATK kind you're thinking of
+ AlignmentContext previousAlignmentContext = null;
+
+ try (final SeekablePathStream seekableStream = new SeekablePathStream(this.inputPath.toPath())) {
+ List<ContainerStats> badContainersForContig = new ArrayList<>();
+ final CramHeader cramHeader = analyzeCRAMHeader(seekableStream);
+ samHeader = Container.readSAMFileHeaderContainer(
+ cramHeader.getCRAMVersion(),
+ seekableStream,
+ inputPath.getRawInputString());
+
+ // iterate through the containers looking for ones with alignment spans that trigger the issue
+ for (boolean isEOF = false; !isEOF;) {
+ final Container container = new Container(
+ cramHeader.getCRAMVersion(),
+ seekableStream,
+ seekableStream.position());
+ containerOrdinalForContig++;
+
+ // reject CRAMs with properties that clearly indicate they were not written by GATK/Picard
+ if (isForeignCRAM(container)) {
+ return;
+ }
+
+ if (previousAlignmentContext == null) {
+ // first container in the whole file can't be bad
+ recordContainerStats(goodContainers, false, container, containerOrdinalForContig);
+ nGoodContainersReportedForContig++;
+ } else if (!previousAlignmentContext.getReferenceContext().equals(
+ container.getAlignmentContext().getReferenceContext())) {
+ // this is the first container for a new reference context; emit any bad containers accumulated
+ // for the previous reference context/contig, and reset state for the next one
+ if (badContainersForContig.size() > 0) {
+ recordContigStats(badContigs, badContainersForContig, previousAlignmentContext);
+ badContainersForContig = new ArrayList<>();
+ }
+ containerOrdinalForContig = 1;
+ // the first container for a reference context is never bad, so always add it to the good list
+ recordContainerStats(goodContainers, false, container, containerOrdinalForContig);
+ nGoodContainersReportedForContig = 1;
+ } else if (previousAlignmentContext.getReferenceContext().isMappedSingleRef() &&
+ (previousAlignmentContext.getAlignmentStart() == 1)) {
+ // we're on the same reference context as the previous container, and the previous container
+ // was mapped to position 1, so if this container is mapped, it's a candidate for bad, whether
+ // it starts at position 1 (the rare case where there is more than one bad container) or not
+ // (the common case where this is the one bad container for this contig)
+ recordContainerStats(badContainersForContig, true, container, containerOrdinalForContig);
+ } else {
+ // we're on the same reference context as the previous container, and the previous container
+ // was NOT mapped to position 1, so this container is not bad - add it to the list of good
+ // containers
+ if (verbose || nGoodContainersReportedForContig < NUMBER_OF_GOOD_CONTAINERS_PER_CONTIG_TO_REPORT) {
+ recordContainerStats(goodContainers, false, container, containerOrdinalForContig);
+ nGoodContainersReportedForContig++;
+ }
+ }
+
+ previousAlignmentContext = new AlignmentContext(
+ container.getAlignmentContext().getReferenceContext(),
+ container.getAlignmentContext().getAlignmentStart(),
+ container.getAlignmentContext().getAlignmentSpan());
+ isEOF = container.isEOF();
+ }
+ }
+ catch (IOException e) {
+ throw new RuntimeIOException(e);
+ }
+
+ retCode = printTextResults(badContigs, goodContainers);
+ if (tsvOutputPath != null) {
+ printTSVResults(badContigs, goodContainers, tsvOutputPath);
+ }
+ }
+
+ /**
+ * Display metadata for a CRAM file header.
+ */
+ private CramHeader analyzeCRAMHeader(InputStream is) {
+ final CramHeader cramHeader = CramIO.readCramHeader(is);
+ emitln("CRAM File Name: " + inputPath.toPath().getFileName());
+ emitln("CRAM Version: " + cramHeader.getCRAMVersion().toString());
+ emitln("CRAM ID Contents: " + String.format("%s", Base64.getEncoder().encodeToString(cramHeader.getId())));
+ return cramHeader;
+ }
+
+ // reject any inputs that have containers that are reference-less; have multiple slices per container;
+ // or have slices with an embedded reference, since these indicate that the file was not written by GATK/Picard.
+ // it is in theory possible that the file could have been written by some other client of htsjdk (e.g., the
+ // htsjdk tests can write such a file), but analyzing such files is beyond the scope of this tool
+ private boolean isForeignCRAM(final Container container) {
+ final List<Slice> slices = container.getSlices();
+ if (slices.size() > 1 ) {
+ emitln("Multi-slice container detected. This file was not written by GATK or Picard.");
+ return true;
+ } else if (container.getAlignmentContext().getReferenceContext().isMappedSingleRef() &&
+ !container.getCompressionHeader().isReferenceRequired()) {
+ emitln("Reference-less container detected. This file was not written by GATK or Picard.");
+ return true;
+ }
+ for (final Slice slice : slices) {
+ if (slice.getEmbeddedReferenceContentID() != Slice.EMBEDDED_REFERENCE_ABSENT_CONTENT_ID) {
+ emitln(String.format("Embedded reference block (ID %d) detected. This file was not written by GATK or Picard.",
+ slice.getEmbeddedReferenceContentID()));
+ return true;
+ }
+ }
+ return false;
+ }
+
+ private void recordContainerStats(
+ final List<ContainerStats> targetList,
+ final boolean isBad,
+ final Container container,
+ final int containerOrdinal) {
+ // don't even try to compute stats for unmapped-unplaced containers or multi-ref containers
+ if (container.getAlignmentContext().getReferenceContext().isMappedSingleRef()) {
+ final Tuple<Long, Double> containerStats = analyzeContainerBaseMismatches(container);
+ targetList.add(new ContainerStats(
+ containerOrdinal,
+ isBad,
+ container.getAlignmentContext(),
+ containerStats.a, // mismatches
+ containerStats.b)); // mismatchPercent
+ }
+ }
+
+ private void recordContigStats(
+ final Map<String, BadContig> badContigs,
+ final List<ContainerStats> badContainers,
+ final AlignmentContext previousAlignmentContext) {
+ if (null != badContigs.putIfAbsent(
+ previousAlignmentContext.getReferenceContext().toString(),
+ new BadContig(
+ previousAlignmentContext.getReferenceContext().toString(),
+ badContainers))) {
+ throw new IllegalStateException(
+ String.format(
+ "Attempt to add a bad contig (%s) more than once",
+ previousAlignmentContext.getReferenceContext().toString()));
+ }
+ }
+
+ /**
+ * Analyze base mismatches for a single CRAM container against the reference.
+ * @return a tuple of (mismatch count, mismatch rate) for the container
+ */
+ private Tuple<Long, Double> analyzeContainerBaseMismatches(final Container container) {
+ final SAMSequenceDictionary sequenceDictionary = samHeader.getSequenceDictionary();
+ final List<SAMRecord> actualSAMRecords = container.getSAMRecords(
+ ValidationStringency.LENIENT,
+ new CRAMReferenceRegion(referenceSource, samHeader.getSequenceDictionary()),
+ compressorCache,
+ samHeader);
+
+ final CRAMReferenceRegion localReferenceRegion = new CRAMReferenceRegion(referenceSource, sequenceDictionary);
+ // SequenceUtil.countMismatches wants the full contig's reference bases
+ localReferenceRegion.fetchReferenceBases(container.getAlignmentContext().getReferenceContext().getReferenceContextID());
+ final byte[] referenceBases = localReferenceRegion.getCurrentReferenceBases();
+
+ long totalBases = 0;
+ long misMatches = 0;
+ for (SAMRecord samRec : actualSAMRecords) {
+ totalBases += (long) samRec.getReadLength();
+ misMatches += (long) SequenceUtil.countMismatches(samRec, referenceBases);
+ }
+ return new Tuple<>(misMatches, misMatches / (double) totalBases);
+ }
+
+ private int printTextResults(final Map<String, BadContig> badContigs, final List<ContainerStats> goodContainers) {
+ int retCode;
+ if (badContigs.isEmpty()) {
+ final String NO_CORRUPT_CONTAINERS = "\n**********************NO CORRUPT CONTAINERS DETECTED**********************";
+ emitln(NO_CORRUPT_CONTAINERS);
+ // always emit the results summary to console
+ System.out.println(NO_CORRUPT_CONTAINERS);
+ retCode = 0;
+ } else {
+ final String POSSIBLE_CORRUPT_CONTAINERS = "\n**********************!!!!!Possible CORRUPT CONTAINERS DETECTED!!!!!**********************:\n";
+ emitln(POSSIBLE_CORRUPT_CONTAINERS);
+ // always emit the results summary to console
+ System.out.println(POSSIBLE_CORRUPT_CONTAINERS);
+ retCode = 1;
+ }
+
+ // before we print out the containers, print out the stats for both the good and the bad containers
+ final int totalGoodContainers = goodContainers.size();
+ final double sumOfGoodMismatchRates = goodContainers.stream().mapToDouble(c -> c.misMatchRate).sum();
+ final double averageGoodMismatchRate = sumOfGoodMismatchRates / totalGoodContainers;
+ final String avgGoodMismatchStr = String.format("Average mismatch rate for presumed good containers: %f", averageGoodMismatchRate);
+ emitln(avgGoodMismatchStr);
+ if (echoToConsole) {
+ System.out.println(avgGoodMismatchStr);
+ }
+
+ if (!badContigs.isEmpty()) {
+ final int totalBadContainers = badContigs.values().stream().mapToInt(bc -> bc.badContainers().size()).sum();
+ final double sumOfBadMismatchRates = badContigs.values().stream().mapToDouble(
+ bc -> bc.badContainers().stream().mapToDouble(c -> c.misMatchRate).sum()).sum();
+ final double averageBadMismatchRate = sumOfBadMismatchRates / totalBadContainers;
+ final String avgBadMismatchStr = String.format("Average mismatch rate for suspected bad containers: %f", averageBadMismatchRate);
+ emitln(avgBadMismatchStr);
+ if (echoToConsole) {
+ System.out.println(avgBadMismatchStr);
+ }
+
+ if (averageBadMismatchRate > mismatchRateThreshold) {
+ final String exceedThresholdStr = String.format(
+ "The average base mismatch rate of %f for suspected bad containers exceeds the threshold rate of %1.2f, and indicates this file may be corrupt.",
+ averageBadMismatchRate,
+ mismatchRateThreshold);
+ emitln(exceedThresholdStr);
+ if (echoToConsole) {
+ System.out.println(exceedThresholdStr);
+ }
+ }
+
+ // now emit the list of corrupt containers for each bad contig
+ emitln("\nSuspected CORRUPT Containers:");
+ for (final Map.Entry<String, BadContig> entry : badContigs.entrySet()) {
+ for (final ContainerStats badContainer : entry.getValue().badContainers()) {
+ final String badStatStr = String.format(" Ordinal: %d (%s) Mismatch Rate/Count: %f/%d",
+ badContainer.containerOrdinal,
+ badContainer.alignmentContext.toString(),
+ badContainer.misMatchRate,
+ badContainer.misMatchCount);
+ emitln(badStatStr);
+ if (echoToConsole) {
+ System.out.println(badStatStr);
+ }
+ }
+ }
+ }
+
+ emitln("\nPresumed GOOD Containers:");
+ int lastContig = ReferenceContext.UNINITIALIZED_REFERENCE_ID;
+ for (final ContainerStats goodContainer : goodContainers) {
+ if (lastContig != ReferenceContext.UNINITIALIZED_REFERENCE_ID &&
+ lastContig != goodContainer.alignmentContext.getReferenceContext().getReferenceContextID()) {
+ emitln("");
+ if (echoToConsole) {
+ System.out.println("");
+ }
+ }
+ lastContig = goodContainer.alignmentContext.getReferenceContext().getReferenceContextID();
+ final String goodDetailStr = String.format(" Ordinal: %d (%s) Mismatch Rate/Count: %f/%d",
+ goodContainer.containerOrdinal,
+ goodContainer.alignmentContext.toString(),
+ goodContainer.misMatchRate,
+ goodContainer.misMatchCount);
+ emitln(goodDetailStr);
+ if (echoToConsole) {
+ System.out.println(goodDetailStr);
+ }
+ }
+ return retCode;
+ }
+
+ // write the results out to a machine-readable tsv file
+ private void printTSVResults(
+ final Map<String, BadContig> badContigs,
+ final List<ContainerStats> goodContainers,
+ final GATKPath tsvOutputPath) {
+ // File name, contig name, container ordinal, good or bad, mismatch rate
+ final TableColumnCollection columnNames = new TableColumnCollection(
+ "file_name", // file name
+ "contig_name", // contig name
+ "container_ordinal", // container ordinal
+ "container_is_bad", // good or bad, 1 or 0
+ "mismatch_rate", // mismatch rate (double)
+ "alignment_start", // alignment start (int)
+ "alignment_span" // alignment span (int)
+ );
+
+ try (final TableWriter<ContainerStats> tsvWriter = new TableWriter<>(tsvOutputPath.toPath(), columnNames) {
+ @Override
+ protected void composeLine(final ContainerStats containerStats, final DataLine dataLine) {
+ dataLine.set("file_name", inputPath.toPath().getFileName().toString())
+ .set("contig_name", samHeader.getSequenceDictionary().getSequence(containerStats.alignmentContext.getReferenceContext().getReferenceContextID()).getSequenceName())
+ .set("container_ordinal", containerStats.containerOrdinal)
+ .set("container_is_bad", containerStats.isBad ? 1 : 0)
+ .set("mismatch_rate", containerStats.misMatchRate)
+ .set("alignment_start", containerStats.alignmentContext.getAlignmentStart())
+ .set("alignment_span", containerStats.alignmentContext.getAlignmentSpan());
+ }
+ })
+ {
+ tsvWriter.writeHeaderIfApplies();
+ if (badContigs.isEmpty()) {
+ tsvWriter.writeComment("No bad containers detected");
+ } else {
+ tsvWriter.writeComment("Bad containers:");
+ for (final Map.Entry<String, BadContig> entry : badContigs.entrySet()) {
+ tsvWriter.writeAllRecords(entry.getValue().badContainers());
+ }
+ }
+ if (goodContainers.isEmpty()) {
+ tsvWriter.writeComment("No good mapped containers detected");
+ } else {
+ tsvWriter.writeComment("Good containers:");
+ tsvWriter.writeAllRecords(goodContainers);
+ }
+ } catch (IOException e) {
+ throw new RuntimeException(e);
+ }
+ }
+}
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/filediagnostics/HTSAnalyzer.java b/src/main/java/org/broadinstitute/hellbender/tools/filediagnostics/HTSAnalyzer.java
index 8339dfaccc4..8a0145b9307 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/filediagnostics/HTSAnalyzer.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/filediagnostics/HTSAnalyzer.java
@@ -4,7 +4,6 @@
import org.broadinstitute.hellbender.engine.GATKPath;
import java.io.Closeable;
-import java.io.File;
import java.io.IOException;
/**
@@ -13,11 +12,11 @@
public abstract class HTSAnalyzer implements Closeable {
protected GATKPath inputPath;
- protected File outputFile;
+ protected GATKPath outputPath;
- public HTSAnalyzer(final GATKPath filePath, final File outputFile) {
+ public HTSAnalyzer(final GATKPath filePath, final GATKPath outputPath) {
this.inputPath = filePath;
- this.outputFile = outputFile;
+ this.outputPath = outputPath;
}
/**
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/filediagnostics/HTSAnalyzerFactory.java b/src/main/java/org/broadinstitute/hellbender/tools/filediagnostics/HTSAnalyzerFactory.java
index d58438a7b61..b987a72cb37 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/filediagnostics/HTSAnalyzerFactory.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/filediagnostics/HTSAnalyzerFactory.java
@@ -3,21 +3,19 @@
import htsjdk.samtools.util.FileExtensions;
import org.broadinstitute.hellbender.engine.GATKPath;
-import java.io.File;
-
/**
* Class for creating an analyzer based on an alignment file type.
*/
public class HTSAnalyzerFactory {
- public static HTSAnalyzer getFileAnalyzer(final GATKPath inputPath, final File outputFile, final long countLimit) {
+ public static HTSAnalyzer getFileAnalyzer(final GATKPath inputPath, final GATKPath outputPath, final long countLimit) {
System.out.println(inputPath.getRawInputString());
if (inputPath.isCram()) {
- return new CRAMAnalyzer(inputPath, outputFile, countLimit);
+ return new CRAMAnalyzer(inputPath, outputPath, countLimit);
} else if (inputPath.hasExtension(FileExtensions.CRAM_INDEX)) {
- return new CRAIAnalyzer(inputPath, outputFile);
+ return new CRAIAnalyzer(inputPath, outputPath);
} else if (inputPath.hasExtension(FileExtensions.BAI_INDEX)) {
- return new BAIAnalyzer(inputPath, outputFile);
+ return new BAIAnalyzer(inputPath, outputPath);
} else {
throw new RuntimeException("Unsupported diagnostic file type: " + inputPath.getRawInputString());
}
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/funcotator/BaseFuncotatorArgumentCollection.java b/src/main/java/org/broadinstitute/hellbender/tools/funcotator/BaseFuncotatorArgumentCollection.java
index afe0b5d6676..cf283d44008 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/funcotator/BaseFuncotatorArgumentCollection.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/funcotator/BaseFuncotatorArgumentCollection.java
@@ -80,6 +80,14 @@ public abstract class BaseFuncotatorArgumentCollection implements Serializable {
)
public TranscriptSelectionMode transcriptSelectionMode = FuncotatorArgumentDefinitions.TRANSCRIPT_SELECTION_MODE_DEFAULT_VALUE;
+ @Advanced
+ @Argument(
+ fullName = FuncotatorArgumentDefinitions.PREFER_MANE_TRANSCRIPT_MODE,
+ optional = true,
+ doc = "If this flag is set, Funcotator will prefer 'MANE_Plus_Clinical' followed by 'MANE_select' transcripts (including those not tagged 'basic') if one is present for a given variant. If neither tag is present it use the default behavior (only base transcripts)."
+ )
+ public boolean MANETranscriptMode = false;
+
@Argument(
fullName = FuncotatorArgumentDefinitions.TRANSCRIPT_LIST_LONG_NAME,
optional = true,
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/funcotator/FuncotateSegments.java b/src/main/java/org/broadinstitute/hellbender/tools/funcotator/FuncotateSegments.java
index 15f4e1dc41b..beba85c3d54 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/funcotator/FuncotateSegments.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/funcotator/FuncotateSegments.java
@@ -147,7 +147,8 @@ public void onTraversalStart() {
new FlankSettings(0,0),
true,
funcotatorArgs.minNumBasesForValidSegment,
- funcotatorArgs.spliceSiteWindow
+ funcotatorArgs.spliceSiteWindow,
+ funcotatorArgs.MANETranscriptMode
).stream()
.filter(DataSourceFuncotationFactory::isSupportingSegmentFuncotation)
.collect(Collectors.toList());
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/funcotator/Funcotator.java b/src/main/java/org/broadinstitute/hellbender/tools/funcotator/Funcotator.java
index 12843950a48..e8a74603fc1 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/funcotator/Funcotator.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/funcotator/Funcotator.java
@@ -794,7 +794,8 @@ public void onTraversalStart() {
new FlankSettings(funcotatorArgs.fivePrimeFlankSize, funcotatorArgs.threePrimeFlankSize),
false,
funcotatorArgs.minNumBasesForValidSegment,
- funcotatorArgs.spliceSiteWindow
+ funcotatorArgs.spliceSiteWindow,
+ funcotatorArgs.MANETranscriptMode
);
logger.info("Initializing Funcotator Engine...");
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/funcotator/FuncotatorArgumentDefinitions.java b/src/main/java/org/broadinstitute/hellbender/tools/funcotator/FuncotatorArgumentDefinitions.java
index 09ec1e3ada8..bf04ea3aa9e 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/funcotator/FuncotatorArgumentDefinitions.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/funcotator/FuncotatorArgumentDefinitions.java
@@ -36,6 +36,8 @@ public class FuncotatorArgumentDefinitions {
public static final String TRANSCRIPT_SELECTION_MODE_LONG_NAME = "transcript-selection-mode";
public static final TranscriptSelectionMode TRANSCRIPT_SELECTION_MODE_DEFAULT_VALUE = TranscriptSelectionMode.CANONICAL;
+ public static final String PREFER_MANE_TRANSCRIPT_MODE = "prefer-mane-transcripts";
+
/**
* Do not give this a static default value or the integration tests will get hosed.
*/
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/funcotator/FuncotatorEngine.java b/src/main/java/org/broadinstitute/hellbender/tools/funcotator/FuncotatorEngine.java
index c34b942b148..73a256a794c 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/funcotator/FuncotatorEngine.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/funcotator/FuncotatorEngine.java
@@ -528,7 +528,7 @@ else if ( funcotatorArgs.referenceVersion.equals(BaseFuncotatorArgumentCollectio
}
// Record whether we need to revert the contigs back to B37 after annotation:
- if (FuncotatorUtils.isSequenceDictionaryUsingB37Reference(sequenceDictionaryForDrivingVariants) && mustConvertInputContigsToHg19) {
+ if (mustConvertInputContigsToHg19 && FuncotatorUtils.isSequenceDictionaryUsingB37Reference(sequenceDictionaryForDrivingVariants)) {
this.mustRevertVariantContigsFromHg19ToB37 = true;
}
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/funcotator/dataSources/DataSourceUtils.java b/src/main/java/org/broadinstitute/hellbender/tools/funcotator/dataSources/DataSourceUtils.java
index 01791264a01..fcc39124ca8 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/funcotator/dataSources/DataSourceUtils.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/funcotator/dataSources/DataSourceUtils.java
@@ -329,6 +329,7 @@ private static boolean isValidDirectory(final Path p) {
* ignored for those that don't.
* @param minBasesForValidSegment The minimum number of bases for a segment to be considered valid.
* @param spliceSiteWindowSize The number of bases on either side of a splice site for a variant to be a {@link org.broadinstitute.hellbender.tools.funcotator.dataSources.gencode.GencodeFuncotation.VariantClassification#SPLICE_SITE} variant.
+ * @param preferMANETranscriptsWhereApplicable If this is set, in {@link GencodeFuncotationFactory}, we will only emit MANE transcripts if any are available for a given variant; otherwise it behaves as normal.
* @return A {@link List} of {@link DataSourceFuncotationFactory} given the data source metadata, overrides, and transcript reporting priority information.
*/
public static List<DataSourceFuncotationFactory> createDataSourceFuncotationFactoriesForDataSources(final Map<Path, Properties> dataSourceMetaData,
@@ -340,7 +341,8 @@ public static List createDataSourceFuncotationFact
final FlankSettings flankSettings,
final boolean doAttemptSegmentFuncotationForTranscriptDatasources,
final int minBasesForValidSegment,
- final int spliceSiteWindowSize) {
+ final int spliceSiteWindowSize,
+ final boolean preferMANETranscriptsWhereApplicable) {
Utils.nonNull(dataSourceMetaData);
Utils.nonNull(annotationOverridesMap);
Utils.nonNull(transcriptSelectionMode);
@@ -379,7 +381,7 @@ public static List createDataSourceFuncotationFact
case GENCODE:
featureInput = createAndRegisterFeatureInputs(path, properties, gatkToolInstance, lookaheadFeatureCachingInBp, GencodeGtfFeature.class, false);
funcotationFactory = DataSourceUtils.createGencodeDataSource(path, properties, annotationOverridesMap, transcriptSelectionMode,
- userTranscriptIdSet, featureInput, flankSettings, doAttemptSegmentFuncotationForTranscriptDatasources, minBasesForValidSegment, spliceSiteWindowSize);
+ userTranscriptIdSet, featureInput, flankSettings, doAttemptSegmentFuncotationForTranscriptDatasources, minBasesForValidSegment, spliceSiteWindowSize, preferMANETranscriptsWhereApplicable);
break;
case VCF:
featureInput = createAndRegisterFeatureInputs(path, properties, gatkToolInstance, lookaheadFeatureCachingInBp, VariantContext.class, false);
@@ -596,7 +598,8 @@ private static GencodeFuncotationFactory createGencodeDataSource(final Path data
final FlankSettings flankSettings,
final boolean isSegmentFuncotationEnabled,
final int minBasesForValidSegment,
- final int spliceSiteWindowSize) {
+ final int spliceSiteWindowSize,
+ final boolean onlyUseMANETranscriptsWhenApplicable) {
Utils.nonNull(dataSourceFile);
Utils.nonNull(dataSourceProperties);
Utils.nonNull(annotationOverridesMap);
@@ -626,7 +629,8 @@ private static GencodeFuncotationFactory createGencodeDataSource(final Path data
ncbiBuildVersion,
isSegmentFuncotationEnabled,
minBasesForValidSegment,
- spliceSiteWindowSize
+ spliceSiteWindowSize,
+ onlyUseMANETranscriptsWhenApplicable
);
}
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/funcotator/dataSources/gencode/GencodeFuncotationFactory.java b/src/main/java/org/broadinstitute/hellbender/tools/funcotator/dataSources/gencode/GencodeFuncotationFactory.java
index ec65f35c2b0..0565ca6ce4c 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/funcotator/dataSources/gencode/GencodeFuncotationFactory.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/funcotator/dataSources/gencode/GencodeFuncotationFactory.java
@@ -242,6 +242,11 @@ public class GencodeFuncotationFactory extends DataSourceFuncotationFactory {
*/
private boolean isSegmentFuncotationEnabled;
+ /**
+ * If this is true, only MANE transcripts will be used for funcotation creation when at least one is present.
+ */
+ private boolean preferMANETranscripts;
+
//==================================================================================================================
// Constructors:
@@ -354,7 +359,7 @@ public GencodeFuncotationFactory(final Path gencodeTranscriptFastaFilePath,
this(gencodeTranscriptFastaFilePath, version, name, transcriptSelectionMode, userRequestedTranscripts,
annotationOverrides, mainFeatureInput, flankSettings, isDataSourceB37, ncbiBuildVersion,
- isSegmentFuncotationEnabled, minBasesForValidSegment, FuncotatorUtils.DEFAULT_SPLICE_SITE_WINDOW_SIZE);
+ isSegmentFuncotationEnabled, minBasesForValidSegment, FuncotatorUtils.DEFAULT_SPLICE_SITE_WINDOW_SIZE, false);
}
/**
@@ -385,7 +390,8 @@ public GencodeFuncotationFactory(final Path gencodeTranscriptFastaFilePath,
final String ncbiBuildVersion,
final boolean isSegmentFuncotationEnabled,
final int minBasesForValidSegment,
- final int spliceSiteWindowSize) {
+ final int spliceSiteWindowSize,
+ final boolean preferMANETranscriptsWhereApplicable) {
super(mainFeatureInput, minBasesForValidSegment);
@@ -429,6 +435,8 @@ public GencodeFuncotationFactory(final Path gencodeTranscriptFastaFilePath,
// Initialize overrides / defaults:
initializeAnnotationOverrides( annotationOverrides );
+
+ this.preferMANETranscripts = preferMANETranscriptsWhereApplicable;
}
private Path localizeGencodeTranscriptFastaFile( final Path gencodeTranscriptFastaFilePath ) {
@@ -622,6 +630,28 @@ private static List convertFeaturesToGencodeGtfGeneFeatur
.collect(Collectors.toList());
}
+ /**
+ * If any MANE_Plus_Clinical transcripts are available, return only those; otherwise, if any MANE_Select transcripts are available, return only those; otherwise fall back to the 'basic' transcripts.
+ * @param transcripts list of Gencode transcripts to filter
+ * @return the filtered list of transcripts
+ */
+ @VisibleForTesting
+ static List<GencodeGtfTranscriptFeature> retreiveMANESelectModeTranscriptsCriteria(final List<GencodeGtfTranscriptFeature> transcripts) {
+ final List<GencodeGtfTranscriptFeature> plusClinical = transcripts.stream()
+ .filter(g -> hasTag(g, MANE_PLUS_CLINICAL)).toList();
+ if (plusClinical.size() > 0) {
+ return plusClinical;
+ }
+
+ final List<GencodeGtfTranscriptFeature> maneSelectTranscripts = transcripts.stream()
+ .filter(g -> hasTag(g, MANE_SELECT)).toList();
+
+ if (maneSelectTranscripts.size() > 0) {
+ return maneSelectTranscripts;
+ }
+
+ return transcripts.stream().filter(GencodeFuncotationFactory::isBasic).collect(Collectors.toList());
+ }
/**
* {@inheritDoc}
@@ -853,16 +883,21 @@ static boolean isVariantInCodingRegion(final GencodeFuncotation.VariantClassific
*/
private List createFuncotationsHelper(final VariantContext variant, final Allele altAllele, final GencodeGtfGeneFeature gtfFeature, final ReferenceContext reference) {
- final List<GencodeGtfTranscriptFeature> transcriptList;
+ List<GencodeGtfTranscriptFeature> transcriptList;
// Only get basic transcripts if we're using data from Gencode:
if ( gtfFeature.getGtfSourceFileType().equals(GencodeGtfCodec.GTF_FILE_TYPE_STRING) ) {
- transcriptList = retrieveBasicTranscripts(gtfFeature);
- }
- else {
+ if (preferMANETranscripts) {
+ // Filter out the non-MANE_Select/MANE_Plus_Clinical transcripts if we're only using MANE transcripts:
+ transcriptList = retreiveMANESelectModeTranscriptsCriteria(gtfFeature.getTranscripts());
+ } else {
+ transcriptList = retrieveBasicTranscripts(gtfFeature);
+ }
+ } else {
transcriptList = gtfFeature.getTranscripts();
}
+
return createFuncotationsHelper(variant, altAllele, reference, transcriptList);
}
@@ -979,9 +1014,14 @@ static final GencodeFuncotation createDefaultFuncotationsOnProblemVariant( final
private static boolean isBasic(final GencodeGtfTranscriptFeature transcript) {
// Check if this transcript has the `basic` tag:
+ return hasTag(transcript, GencodeGTFFieldConstants.FeatureTag.BASIC);
+ }
+
+ private static boolean hasTag(final GencodeGtfTranscriptFeature transcript, final GencodeGTFFieldConstants.FeatureTag tag) {
+ // Check if this transcript has the given tag:
return transcript.getOptionalFields().stream()
.filter( f -> f.getName().equals("tag") )
- .filter( f -> f.getValue().equals(GencodeGTFFieldConstants.FeatureTag.BASIC.toString()) )
+ .filter( f -> f.getValue().equals(tag.toString()) )
.count() > 0;
}
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/spark/pathseq/PSBuildReferenceTaxonomyUtils.java b/src/main/java/org/broadinstitute/hellbender/tools/spark/pathseq/PSBuildReferenceTaxonomyUtils.java
index eb4a7687080..43e57b6fd78 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/spark/pathseq/PSBuildReferenceTaxonomyUtils.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/spark/pathseq/PSBuildReferenceTaxonomyUtils.java
@@ -313,13 +313,13 @@ public static BufferedReader getBufferedReaderTarGz(final String tarPath, final
try {
InputStream result = null;
final TarArchiveInputStream tarStream = new TarArchiveInputStream(new GZIPInputStream(new FileInputStream(tarPath)));
- TarArchiveEntry entry = tarStream.getNextTarEntry();
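+ // getNextTarEntry() is deprecated in newer commons-compress releases; getNextEntry() now returns the typed TarArchiveEntry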
+ TarArchiveEntry entry = tarStream.getNextEntry();
while (entry != null) {
if (entry.getName().equals(fileName)) {
result = tarStream;
break;
}
- entry = tarStream.getNextTarEntry();
+ entry = tarStream.getNextEntry();
}
if (result == null) {
throw new UserException.BadInput("Could not find file " + fileName + " in tarball " + tarPath);
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/StructuralVariationDiscoveryPipelineSpark.java b/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/StructuralVariationDiscoveryPipelineSpark.java
index 1ac964daeac..716e256d620 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/StructuralVariationDiscoveryPipelineSpark.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/StructuralVariationDiscoveryPipelineSpark.java
@@ -41,9 +41,10 @@
import org.broadinstitute.hellbender.utils.io.IOUtils;
import org.broadinstitute.hellbender.utils.read.GATKRead;
import org.broadinstitute.hellbender.utils.read.SAMRecordToGATKReadAdapter;
-import scala.Serializable;
import java.io.IOException;
+import java.io.Serial;
+import java.io.Serializable;
import java.nio.file.Paths;
import java.util.List;
import java.util.Set;
@@ -364,6 +365,7 @@ private static List processEvidenceTargetLinks(List
+ public static final BiMap<String, ComplexVariantSubtype> COMPLEX_VARIANT_SUBTYPE_MAP = HashBiMap.create(Map.ofEntries(
+ entry("delINV", ComplexVariantSubtype.delINV),
+ entry("INVdel", ComplexVariantSubtype.INVdel),
+ entry("dupINV", ComplexVariantSubtype.dupINV),
+ entry("INVdup", ComplexVariantSubtype.INVdup),
+ entry("delINVdel", ComplexVariantSubtype.delINVdel),
+ entry("dupINVdup", ComplexVariantSubtype.dupINVdup),
+ entry("delINVdup", ComplexVariantSubtype.delINVdup),
+ entry("dupINVdel", ComplexVariantSubtype.dupINVdel),
+ entry("piDUP_FR", ComplexVariantSubtype.piDUP_FR),
+ entry("piDUP_RF", ComplexVariantSubtype.piDUP_RF),
+ entry("dDUP", ComplexVariantSubtype.dDUP),
+ entry("dDUP_iDEL", ComplexVariantSubtype.dDUP_iDEL),
+ entry("INS_iDEL", ComplexVariantSubtype.INS_iDEL),
+ entry("CTX_PP/QQ", ComplexVariantSubtype.CTX_PP_QQ),
+ entry("CTX_PQ/QP", ComplexVariantSubtype.CTX_PQ_QP),
+ entry("CTX_INV", ComplexVariantSubtype.CTX_INV)
+ ));
+
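The bidirectional map is what lets the VCF string form of a subtype (which may contain characters such as '/' that are illegal in Java identifiers) round-trip to and from the enum constant; both directions are used later in SVCallRecordUtils. A minimal sketch of the pattern with a hypothetical enum standing in for ComplexVariantSubtype:

```java
import com.google.common.collect.BiMap;
import com.google.common.collect.HashBiMap;
import java.util.Map;

public class BiMapSketch {
    enum Subtype { CTX_PP_QQ, CTX_PQ_QP }   // hypothetical stand-in for ComplexVariantSubtype

    static final BiMap<String, Subtype> SUBTYPE_MAP = HashBiMap.create(Map.of(
            "CTX_PP/QQ", Subtype.CTX_PP_QQ,   // the VCF string contains '/', which an enum name cannot
            "CTX_PQ/QP", Subtype.CTX_PQ_QP));

    public static void main(String[] args) {
        // Forward lookup when parsing a VCF record ...
        final Subtype parsed = SUBTYPE_MAP.get("CTX_PP/QQ");
        // ... and inverse lookup when writing the record back out.
        final String written = SUBTYPE_MAP.inverse().get(parsed);
        System.out.println(parsed + " <-> " + written);
    }
}
```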
// not defined in output vcf header but used in internal id that is currently output in the ID column
public static final String INTERVAL_VARIANT_ID_FIELD_SEPARATOR = "_";
public static final String DUP_TAN_CONTRACTION_INTERNAL_ID_START_STRING = "DEL-DUPLICATION-TANDEM-CONTRACTION";
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/sv/SVCallRecord.java b/src/main/java/org/broadinstitute/hellbender/tools/sv/SVCallRecord.java
index 0f95878b389..3f3258d6161 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/sv/SVCallRecord.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/sv/SVCallRecord.java
@@ -10,6 +10,7 @@
import htsjdk.variant.vcf.VCFConstants;
import org.apache.commons.lang3.tuple.Pair;
import org.broadinstitute.hellbender.tools.spark.sv.utils.GATKSVVCFConstants;
+import org.broadinstitute.hellbender.tools.walkers.sv.SVSegment;
import org.broadinstitute.hellbender.utils.IntervalUtils;
import org.broadinstitute.hellbender.utils.SimpleInterval;
import org.broadinstitute.hellbender.utils.Utils;
@@ -34,7 +35,8 @@ public class SVCallRecord implements SVLocatable {
GATKSVVCFConstants.END2_ATTRIBUTE,
GATKSVVCFConstants.STRANDS_ATTRIBUTE,
GATKSVVCFConstants.SVTYPE,
- GATKSVVCFConstants.CPX_TYPE
+ GATKSVVCFConstants.CPX_TYPE,
+ GATKSVVCFConstants.CPX_INTERVALS
);
private final String id;
@@ -57,6 +59,7 @@ public class SVCallRecord implements SVLocatable {
// CPX related fields
private final GATKSVVCFConstants.ComplexVariantSubtype cpxSubtype;
+ private final List cpxIntervals;
public SVCallRecord(final String id,
final String contigA,
@@ -67,6 +70,7 @@ public SVCallRecord(final String id,
final Boolean strandB,
final GATKSVVCFConstants.StructuralVariantAnnotationType type,
final GATKSVVCFConstants.ComplexVariantSubtype cpxSubtype,
+ final List cpxIntervals,
final Integer length,
final List algorithms,
final List alleles,
@@ -75,7 +79,7 @@ public SVCallRecord(final String id,
final Set filters,
final Double log10PError,
final SAMSequenceDictionary dictionary) {
- this(id, contigA, positionA, strandA, contigB, positionB, strandB, type, cpxSubtype, length, algorithms, alleles, genotypes, attributes, filters, log10PError);
+ this(id, contigA, positionA, strandA, contigB, positionB, strandB, type, cpxSubtype, cpxIntervals, length, algorithms, alleles, genotypes, attributes, filters, log10PError);
validateCoordinates(dictionary);
}
@@ -88,6 +92,7 @@ protected SVCallRecord(final String id,
final Boolean strandB,
final GATKSVVCFConstants.StructuralVariantAnnotationType type,
final GATKSVVCFConstants.ComplexVariantSubtype cpxSubtype,
+ final List cpxIntervals,
final Integer length,
final List algorithms,
final List alleles,
@@ -100,6 +105,7 @@ protected SVCallRecord(final String id,
Utils.nonNull(genotypes);
Utils.nonNull(attributes);
Utils.nonNull(filters);
+ Utils.nonNull(cpxIntervals);
this.id = Utils.nonNull(id);
this.contigA = contigA;
this.positionA = positionA;
@@ -107,6 +113,7 @@ protected SVCallRecord(final String id,
this.positionB = positionB;
this.type = Utils.nonNull(type);
this.cpxSubtype = cpxSubtype;
+ this.cpxIntervals = canonicalizeComplexEventList(cpxIntervals);
this.algorithms = Collections.unmodifiableList(algorithms);
this.alleles = Collections.unmodifiableList(alleles);
this.altAlleles = alleles.stream().filter(allele -> !allele.isNoCall() && !allele.isReference()).collect(Collectors.toList());
@@ -135,11 +142,26 @@ private void validateCoordinates(final SAMSequenceDictionary dictionary) {
// CPX types may have position B precede A, such as dispersed duplications where A is the insertion point and
// B references the source sequence.
if (type != GATKSVVCFConstants.StructuralVariantAnnotationType.CPX) {
- Utils.validateArg(IntervalUtils.compareLocatables(getPositionAInterval(), getPositionBInterval(), dictionary) <= 0,
- "End coordinate cannot precede start");
+ if (IntervalUtils.compareLocatables(getPositionAInterval(), getPositionBInterval(), dictionary) > 0) {
+ throw new IllegalArgumentException("End precedes start in variant " + id);
+ }
+ }
+ for (final ComplexEventInterval interval : cpxIntervals) {
+ Utils.nonNull(interval);
+ validatePosition(interval.getContig(), interval.getStart(), dictionary);
+ validatePosition(interval.getContig(), interval.getEnd(), dictionary);
}
}
+ /**
+ * Sorts complex intervals list so that they can be efficiently compared across records.
+ * @param intervals complex intervals
+ * @return canonicalized list
+ */
+ private static List canonicalizeComplexEventList(final List intervals) {
+ return intervals.stream().sorted(Comparator.comparing(ComplexEventInterval::encode)).collect(Collectors.toList());
+ }
+
private static void validatePosition(final String contig, final int position, final SAMSequenceDictionary dictionary) {
final SAMSequenceRecord seq = dictionary.getSequence(contig);
Utils.validateArg(seq != null, "Contig " + contig + " not found in dictionary");
@@ -148,7 +170,7 @@ private static void validatePosition(final String contig, final int position, fi
private static Map validateAttributes(final Map attributes) {
for (final String key : INVALID_ATTRIBUTES) {
- Utils.validateArg(!attributes.containsKey(key), "Attempted to create record with invalid key: " + key);
+ Utils.validateArg(!attributes.containsKey(key), "Attempted to create record with reserved key: " + key);
}
return attributes;
}
@@ -180,6 +202,7 @@ private static Integer inferLength(final GATKSVVCFConstants.StructuralVariantAnn
|| type == GATKSVVCFConstants.StructuralVariantAnnotationType.CTX) && inputLength != null) {
throw new IllegalArgumentException("Input length should be null for type " + type.name() + " but found " + inputLength);
}
+ // TODO complex subtypes should be checked and handled properly, but for now we just pass through SVLEN
return inputLength;
}
}
@@ -384,7 +407,45 @@ public Double getLog10PError() {
return log10PError;
}
- public GATKSVVCFConstants.ComplexVariantSubtype getCpxSubtype() {
- return cpxSubtype;
+ public List getComplexEventIntervals() {
+ return cpxIntervals;
+ }
+
+ public static final class ComplexEventInterval extends SVSegment {
+
+ public ComplexEventInterval(final GATKSVVCFConstants.StructuralVariantAnnotationType intervalType,
+ final SimpleInterval interval) {
+ super(intervalType, interval);
+ }
+
+ public static ComplexEventInterval decode(final String str, final SAMSequenceDictionary dictionary) {
+ Utils.nonNull(str);
+ final String[] tokens = str.split("_", 2);
+ if (tokens.length < 2) {
+ throw new IllegalArgumentException("Expected complex interval with format \"SVTYPE_chr:pos-end\" but found \"" + str + "\"");
+ }
+ final SimpleInterval interval = new SimpleInterval(tokens[1]);
+ if (!IntervalUtils.intervalIsOnDictionaryContig(interval, dictionary)) {
+ throw new IllegalArgumentException("Invalid CPX interval: " + interval);
+ }
+ return new ComplexEventInterval(GATKSVVCFConstants.StructuralVariantAnnotationType.valueOf(tokens[0]), interval);
+ }
+
+ public String encode() {
+ return getIntervalSVType().name() + "_" + getInterval().toString();
+ }
+
+ @Override
+ public boolean equals(Object o) {
+ if (this == o) return true;
+ if (o == null || getClass() != o.getClass()) return false;
+ ComplexEventInterval that = (ComplexEventInterval) o;
+ return getIntervalSVType() == that.getIntervalSVType() && Objects.equals(getInterval(), that.getInterval());
+ }
+
+ @Override
+ public int hashCode() {
+ return Objects.hash(getIntervalSVType(), getInterval());
+ }
}
}
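CPX_INTERVALS entries are serialized as `SVTYPE_chr:pos-end` tokens, and the new `ComplexEventInterval` helper round-trips that format. A small usage sketch, assuming only the constructors and methods shown in this diff; the contig length is made up:

```java
import java.util.Collections;
import htsjdk.samtools.SAMSequenceDictionary;
import htsjdk.samtools.SAMSequenceRecord;
import org.broadinstitute.hellbender.tools.spark.sv.utils.GATKSVVCFConstants;
import org.broadinstitute.hellbender.tools.sv.SVCallRecord;
import org.broadinstitute.hellbender.utils.SimpleInterval;

public class ComplexIntervalRoundTrip {
    public static void main(String[] args) {
        // Minimal dictionary with a single (made-up) contig length for validation.
        final SAMSequenceDictionary dict =
                new SAMSequenceDictionary(Collections.singletonList(new SAMSequenceRecord("chr1", 1_000_000)));

        final SVCallRecord.ComplexEventInterval cpx = new SVCallRecord.ComplexEventInterval(
                GATKSVVCFConstants.StructuralVariantAnnotationType.DEL,
                new SimpleInterval("chr1", 1000, 2000));

        final String encoded = cpx.encode();                                   // "DEL_chr1:1000-2000"
        final SVCallRecord.ComplexEventInterval decoded =
                SVCallRecord.ComplexEventInterval.decode(encoded, dict);
        System.out.println(encoded + " round-trips: " + cpx.equals(decoded));  // true
    }
}
```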
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/sv/SVCallRecordUtils.java b/src/main/java/org/broadinstitute/hellbender/tools/sv/SVCallRecordUtils.java
index 92c7f22835a..cf31d654727 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/sv/SVCallRecordUtils.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/sv/SVCallRecordUtils.java
@@ -24,8 +24,7 @@ public final class SVCallRecordUtils {
private static final Set VALID_TYPES = new HashSet<>(Arrays.asList(GATKSVVCFConstants.StructuralVariantAnnotationType.values()).stream()
.map(GATKSVVCFConstants.StructuralVariantAnnotationType::name).collect(Collectors.toList()));
- private static final Set VALID_CPX_SUBTYPES = new HashSet<>(Arrays.asList(GATKSVVCFConstants.ComplexVariantSubtype.values()).stream()
- .map(GATKSVVCFConstants.ComplexVariantSubtype::name).collect(Collectors.toList()));
+ private static final Set VALID_CPX_SUBTYPES = GATKSVVCFConstants.COMPLEX_VARIANT_SUBTYPE_MAP.keySet();
/**
* Create a builder for a variant from an {@link SVCallRecord} for VCF interoperability
@@ -34,34 +33,18 @@ public final class SVCallRecordUtils {
*/
public static VariantContextBuilder getVariantBuilder(final SVCallRecord record) {
Utils.nonNull(record);
- final int end;
final GATKSVVCFConstants.StructuralVariantAnnotationType type = record.getType();
- final GATKSVVCFConstants.ComplexVariantSubtype cpxType = record.getComplexSubtype();
- final boolean isDispersedDup = cpxType == GATKSVVCFConstants.ComplexVariantSubtype.dDUP
- || cpxType == GATKSVVCFConstants.ComplexVariantSubtype.dDUP_iDEL;
- if (type == GATKSVVCFConstants.StructuralVariantAnnotationType.INS
- || type == GATKSVVCFConstants.StructuralVariantAnnotationType.BND
- || type == GATKSVVCFConstants.StructuralVariantAnnotationType.CTX
- || isDispersedDup) {
- end = record.getPositionA();
- } else {
- end = record.getPositionB();
- }
+ final int end;
final Integer end2;
final String chr2;
if (type == GATKSVVCFConstants.StructuralVariantAnnotationType.BND
- || type == GATKSVVCFConstants.StructuralVariantAnnotationType.CTX) {
+ || type == GATKSVVCFConstants.StructuralVariantAnnotationType.CTX) {
+ // TODO this may need to be modified in the future to handle complex translocations
+ end = record.getPositionA();
end2 = record.getPositionB();
chr2 = record.getContigB();
- } else if (type == GATKSVVCFConstants.StructuralVariantAnnotationType.CPX) {
- if (isDispersedDup) {
- end2 = record.getPositionB();
- chr2 = record.getContigB();
- } else {
- end2 = null;
- chr2 = null;
- }
} else {
+ end = record.getPositionB();
end2 = null;
chr2 = null;
}
@@ -90,14 +73,21 @@ public static VariantContextBuilder getVariantBuilder(final SVCallRecord record)
builder.attribute(GATKSVVCFConstants.END2_ATTRIBUTE, end2);
builder.attribute(GATKSVVCFConstants.CONTIG2_ATTRIBUTE, chr2);
}
+ final GATKSVVCFConstants.ComplexVariantSubtype cpxType = record.getComplexSubtype();
if (cpxType != null) {
- builder.attribute(GATKSVVCFConstants.CPX_TYPE, record.getComplexSubtype().toString());
+ builder.attribute(GATKSVVCFConstants.CPX_TYPE, getComplexSubtypeString(cpxType));
+ }
+ final List cpxIntervals = record.getComplexEventIntervals();
+ if (!cpxIntervals.isEmpty()) {
+ builder.attribute(GATKSVVCFConstants.CPX_INTERVALS, cpxIntervals.stream().map(SVCallRecord.ComplexEventInterval::encode).collect(Collectors.toList()));
}
builder.attribute(GATKSVVCFConstants.SVLEN, record.getLength());
if ((svtype == GATKSVVCFConstants.StructuralVariantAnnotationType.BND
|| svtype == GATKSVVCFConstants.StructuralVariantAnnotationType.INV
- || svtype == GATKSVVCFConstants.StructuralVariantAnnotationType.INS)
+ || svtype == GATKSVVCFConstants.StructuralVariantAnnotationType.INS
+ || svtype == GATKSVVCFConstants.StructuralVariantAnnotationType.CPX
+ || svtype == GATKSVVCFConstants.StructuralVariantAnnotationType.CTX)
&& record.getStrandA() != null && record.getStrandB() != null) {
builder.attribute(GATKSVVCFConstants.STRANDS_ATTRIBUTE, getStrandString(record));
}
@@ -183,12 +173,12 @@ public static GenotypesContext populateGenotypesForMissingSamplesWithAlleles(fin
*/
public static SVCallRecord copyCallWithNewGenotypes(final SVCallRecord record, final GenotypesContext genotypes) {
return new SVCallRecord(record.getId(), record.getContigA(), record.getPositionA(), record.getStrandA(), record.getContigB(),
- record.getPositionB(), record.getStrandB(), record.getType(), record.getComplexSubtype(), record.getLength(), record.getAlgorithms(), record.getAlleles(),
+ record.getPositionB(), record.getStrandB(), record.getType(), record.getComplexSubtype(), record.getComplexEventIntervals(), record.getLength(), record.getAlgorithms(), record.getAlleles(),
genotypes, record.getAttributes(), record.getFilters(), record.getLog10PError());
}
public static SVCallRecord copyCallWithNewAttributes(final SVCallRecord record, final Map attr) {
return new SVCallRecord(record.getId(), record.getContigA(), record.getPositionA(), record.getStrandA(), record.getContigB(),
- record.getPositionB(), record.getStrandB(), record.getType(), record.getComplexSubtype(), record.getLength(), record.getAlgorithms(), record.getAlleles(),
+ record.getPositionB(), record.getStrandB(), record.getType(), record.getComplexSubtype(), record.getComplexEventIntervals(), record.getLength(), record.getAlgorithms(), record.getAlleles(),
record.getGenotypes(), attr, record.getFilters(), record.getLog10PError());
}
@@ -300,20 +290,19 @@ public static Stream convertInversionsToBreakends(final SVCallReco
}
Utils.validateArg(record.isIntrachromosomal(), "Inversion " + record.getId() + " is not intrachromosomal");
final SVCallRecord positiveBreakend = new SVCallRecord(record.getId(), record.getContigA(),
- record.getPositionA(), true, record.getContigB(), record.getPositionB(), true, GATKSVVCFConstants.StructuralVariantAnnotationType.BND, null,null,
+ record.getPositionA(), true, record.getContigB(), record.getPositionB(), true, GATKSVVCFConstants.StructuralVariantAnnotationType.BND, null, record.getComplexEventIntervals(), null,
record.getAlgorithms(), record.getAlleles(), record.getGenotypes(), record.getAttributes(), record.getFilters(), record.getLog10PError(), dictionary);
final SVCallRecord negativeBreakend = new SVCallRecord(record.getId(), record.getContigA(),
- record.getPositionA(), false, record.getContigB(), record.getPositionB(), false, GATKSVVCFConstants.StructuralVariantAnnotationType.BND, null,null,
+ record.getPositionA(), false, record.getContigB(), record.getPositionB(), false, GATKSVVCFConstants.StructuralVariantAnnotationType.BND, null, record.getComplexEventIntervals(), null,
record.getAlgorithms(), record.getAlleles(), record.getGenotypes(), record.getAttributes(), record.getFilters(), record.getLog10PError(), dictionary);
return Stream.of(positiveBreakend, negativeBreakend);
}
/**
* Creates a new {@link SVCallRecord} from the given {@link VariantContext}, keeping any variant fields.
- * @see SVCallRecordUtils#create(VariantContext, boolean)
*/
- public static SVCallRecord create(final VariantContext variant) {
- return create(variant, true);
+ public static SVCallRecord create(final VariantContext variant, final SAMSequenceDictionary dictionary) {
+ return create(variant, true, dictionary);
}
/**
@@ -322,15 +311,15 @@ public static SVCallRecord create(final VariantContext variant) {
* @param keepVariantAttributes retain variant attribute fields
* @return converted record
*/
- public static SVCallRecord create(final VariantContext variant, boolean keepVariantAttributes) {
+ public static SVCallRecord create(final VariantContext variant, boolean keepVariantAttributes, final SAMSequenceDictionary dictionary) {
Utils.nonNull(variant);
final String id = variant.getID();
final String contigA = variant.getContig();
final int positionA = variant.getStart();
final GATKSVVCFConstants.StructuralVariantAnnotationType type = inferStructuralVariantType(variant);
- final GATKSVVCFConstants.ComplexVariantSubtype cpxSubtype =
- type == GATKSVVCFConstants.StructuralVariantAnnotationType.CPX ? getComplexSubtype(variant) : null;
+ final GATKSVVCFConstants.ComplexVariantSubtype cpxSubtype = getComplexSubtype(variant);
+ final List cpxIntervals = parseComplexIntervals(variant.getAttributeAsStringList(GATKSVVCFConstants.CPX_INTERVALS, null), dictionary);
final List algorithms = getAlgorithms(variant);
final String strands;
@@ -368,21 +357,11 @@ public static SVCallRecord create(final VariantContext variant, boolean keepVari
|| type == GATKSVVCFConstants.StructuralVariantAnnotationType.CTX) {
if (!(hasContig2 && hasEnd2)) {
throw new UserException.BadInput("Attributes " + GATKSVVCFConstants.END2_ATTRIBUTE +
- " and " + GATKSVVCFConstants.CONTIG2_ATTRIBUTE + " are required for BND records (variant " +
- variant.getID() + ").");
+ " and " + GATKSVVCFConstants.CONTIG2_ATTRIBUTE + " are required for BND and CTX records " +
+ "(variant " + variant.getID() + ").");
}
contigB = variant.getAttributeAsString(GATKSVVCFConstants.CONTIG2_ATTRIBUTE, null);
positionB = variant.getAttributeAsInt(GATKSVVCFConstants.END2_ATTRIBUTE, 0);
- } else if (type == GATKSVVCFConstants.StructuralVariantAnnotationType.CPX) {
- // If CHR2/END2 are defined, use them
- if (hasContig2 && hasEnd2) {
- contigB = variant.getAttributeAsString(GATKSVVCFConstants.CONTIG2_ATTRIBUTE, null);
- positionB = variant.getAttributeAsInt(GATKSVVCFConstants.END2_ATTRIBUTE, 0);
- } else {
- // Otherwise treat like any other variant
- contigB = contigA;
- positionB = variant.getEnd();
- }
} else {
contigB = contigA;
// Force reset of END coordinate
@@ -395,8 +374,13 @@ public static SVCallRecord create(final VariantContext variant, boolean keepVari
final Double log10PError = variant.hasLog10PError() ? variant.getLog10PError() : null;
final Map sanitizedAttributes = sanitizeAttributes(attributes);
- return new SVCallRecord(id, contigA, positionA, strand1, contigB, positionB, strand2, type, cpxSubtype, length, algorithms,
- variant.getAlleles(), variant.getGenotypes(), sanitizedAttributes, variant.getFilters(), log10PError);
+ return new SVCallRecord(id, contigA, positionA, strand1, contigB, positionB, strand2, type, cpxSubtype,
+ cpxIntervals, length, algorithms, variant.getAlleles(), variant.getGenotypes(), sanitizedAttributes,
+ variant.getFilters(), log10PError);
+ }
+
+ private static List parseComplexIntervals(final List intervals, final SAMSequenceDictionary dictionary) {
+ return intervals.stream().map(i -> SVCallRecord.ComplexEventInterval.decode(i, dictionary)).toList();
}
private static Map sanitizeAttributes(final Map attributes) {
@@ -426,15 +410,19 @@ public static List getAlgorithms(final VariantContext variant) {
public static GATKSVVCFConstants.ComplexVariantSubtype getComplexSubtype(final VariantContext variant) {
Utils.nonNull(variant);
- final String subtypeString = variant.getAttributeAsString(GATKSVVCFConstants.CPX_TYPE, null);
+ String subtypeString = variant.getAttributeAsString(GATKSVVCFConstants.CPX_TYPE, null);
if (subtypeString == null) {
return null;
}
- if (!VALID_CPX_SUBTYPES.contains(subtypeString)) {
+ if (!GATKSVVCFConstants.COMPLEX_VARIANT_SUBTYPE_MAP.containsKey(subtypeString)) {
throw new IllegalArgumentException("Invalid CPX subtype: " + subtypeString + ", valid values are: " +
String.join(", ", VALID_CPX_SUBTYPES));
}
- return GATKSVVCFConstants.ComplexVariantSubtype.valueOf(subtypeString);
+ return GATKSVVCFConstants.COMPLEX_VARIANT_SUBTYPE_MAP.get(subtypeString);
+ }
+
+ public static String getComplexSubtypeString(final GATKSVVCFConstants.ComplexVariantSubtype subtype) {
+ return GATKSVVCFConstants.COMPLEX_VARIANT_SUBTYPE_MAP.inverse().get(subtype);
}
private static String getStrands(final VariantContext variant, final GATKSVVCFConstants.StructuralVariantAnnotationType type) {
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/sv/cluster/CanonicalSVCollapser.java b/src/main/java/org/broadinstitute/hellbender/tools/sv/cluster/CanonicalSVCollapser.java
index 12d93a3baa2..39228617d26 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/sv/cluster/CanonicalSVCollapser.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/sv/cluster/CanonicalSVCollapser.java
@@ -192,8 +192,8 @@ public SVCallRecord collapse(final SVClusterEngine.OutputCluster cluster) {
final Double quality = collapseQuality(items);
return new SVCallRecord(representative.getId(), representative.getContigA(), start, strandA, representative.getContigB(),
- end, strandB, type, representative.getComplexSubtype(), length, algorithms, alleles, genotypes, attributes,
- filters, quality, dictionary);
+ end, strandB, type, representative.getComplexSubtype(), representative.getComplexEventIntervals(),
+ length, algorithms, alleles, genotypes, attributes, filters, quality, dictionary);
}
protected List collapseAlleles(final List altAlleles, final Allele refAllele) {
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/sv/cluster/CanonicalSVLinkage.java b/src/main/java/org/broadinstitute/hellbender/tools/sv/cluster/CanonicalSVLinkage.java
index e800e0283d9..8d5b381ee81 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/sv/cluster/CanonicalSVLinkage.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/sv/cluster/CanonicalSVLinkage.java
@@ -9,6 +9,9 @@
import org.broadinstitute.hellbender.utils.SimpleInterval;
import org.broadinstitute.hellbender.utils.Utils;
+import java.util.Iterator;
+import java.util.List;
+
/**
*
Main class for SV clustering. Two items are clustered together if they:
*
@@ -42,7 +45,7 @@ public class CanonicalSVLinkage extends SVClusterLinkage
public static final double DEFAULT_RECIPROCAL_OVERLAP_DEPTH_ONLY = 0.8;
public static final double DEFAULT_SIZE_SIMILARITY_DEPTH_ONLY = 0;
- public static final int DEFAULT_WINDOW_DEPTH_ONLY = 0;
+ public static final int DEFAULT_WINDOW_DEPTH_ONLY = 10000000;
public static final double DEFAULT_SAMPLE_OVERLAP_DEPTH_ONLY = 0;
public static final double DEFAULT_RECIPROCAL_OVERLAP_MIXED = 0.8;
@@ -154,6 +157,13 @@ private static boolean clusterTogetherWithParams(final SVCallRecord a, final SVC
return false;
}
+ // If complex, test complex intervals
+ if (a.getType() == GATKSVVCFConstants.StructuralVariantAnnotationType.CPX
+ && b.getType() == GATKSVVCFConstants.StructuralVariantAnnotationType.CPX &&
+ !testComplexIntervals(a, b, params.getReciprocalOverlap(), params.getSizeSimilarity(), params.getWindow(), dictionary)) {
+ return false;
+ }
+
// Reciprocal overlap and size similarity
// Check bypassed if both are inter-chromosomal
final Boolean hasReciprocalOverlapAndSizeSimilarity;
@@ -182,6 +192,40 @@ private static boolean clusterTogetherWithParams(final SVCallRecord a, final SVC
}
}
+ /**
+ * Performs overlap testing on each pair of complex intervals in two records, requiring each pair to be
+ * sufficiently similar by reciprocal overlap, size similarity, and breakend proximity.
+ */
+ private static boolean testComplexIntervals(final SVCallRecord a, final SVCallRecord b, final double overlapThreshold,
+ final double sizeSimilarityThreshold, final int window,
+ final SAMSequenceDictionary dictionary) {
+ final List intervalsA = a.getComplexEventIntervals();
+ final List intervalsB = b.getComplexEventIntervals();
+ if (intervalsA.size() != intervalsB.size()) {
+ return false;
+ }
+ final Iterator iterA = intervalsA.iterator();
+ final Iterator iterB = intervalsB.iterator();
+ for (int i = 0; i < intervalsA.size(); i++) {
+ final SVCallRecord.ComplexEventInterval cpxIntervalA = iterA.next();
+ final SVCallRecord.ComplexEventInterval cpxIntervalB = iterB.next();
+ if (cpxIntervalA.getIntervalSVType() != cpxIntervalB.getIntervalSVType()) {
+ return false;
+ }
+ final SimpleInterval intervalA = cpxIntervalA.getInterval();
+ final SimpleInterval intervalB = cpxIntervalB.getInterval();
+ if (!(IntervalUtils.isReciprocalOverlap(intervalA, intervalB, overlapThreshold)
+ && testSizeSimilarity(intervalA.getLengthOnReference(), intervalB.getLengthOnReference(), sizeSimilarityThreshold)
+ && testBreakendProximity(new SimpleInterval(intervalA.getContig(), intervalA.getStart(), intervalA.getStart()),
+ new SimpleInterval(intervalA.getContig(), intervalA.getEnd(), intervalA.getEnd()),
+ new SimpleInterval(intervalB.getContig(), intervalB.getStart(), intervalB.getStart()),
+ new SimpleInterval(intervalB.getContig(), intervalB.getEnd(), intervalB.getEnd()), window, dictionary))) {
+ return false;
+ }
+ }
+ return true;
+ }
+
private static boolean testReciprocalOverlap(final SVCallRecord a, final SVCallRecord b, final double threshold) {
final SimpleInterval intervalA = new SimpleInterval(a.getContigA(), a.getPositionA(), a.getPositionA() + getLength(a, INSERTION_ASSUMED_LENGTH_FOR_OVERLAP) - 1);
final SimpleInterval intervalB = new SimpleInterval(b.getContigA(), b.getPositionA(), b.getPositionA() + getLength(b, INSERTION_ASSUMED_LENGTH_FOR_OVERLAP) - 1);
@@ -189,28 +233,36 @@ private static boolean testReciprocalOverlap(final SVCallRecord a, final SVCallR
}
private static boolean testSizeSimilarity(final SVCallRecord a, final SVCallRecord b, final double threshold) {
- final int sizeSimilarityLengthA = getLength(a, INSERTION_ASSUMED_LENGTH_FOR_SIZE_SIMILARITY);
- final int sizeSimilarityLengthB = getLength(b, INSERTION_ASSUMED_LENGTH_FOR_SIZE_SIMILARITY);
- return Math.min(sizeSimilarityLengthA, sizeSimilarityLengthB) / (double) Math.max(sizeSimilarityLengthA, sizeSimilarityLengthB) >= threshold;
+ return testSizeSimilarity(getLength(a, INSERTION_ASSUMED_LENGTH_FOR_SIZE_SIMILARITY),
+ getLength(b, INSERTION_ASSUMED_LENGTH_FOR_SIZE_SIMILARITY), threshold);
+ }
+
+ private static boolean testSizeSimilarity(final int lengthA, final int lengthB, final double threshold) {
+ return Math.min(lengthA, lengthB) / (double) Math.max(lengthA, lengthB) >= threshold;
}
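The primitive overload makes the size-similarity test reusable for individual complex intervals: the ratio `min(lenA, lenB) / max(lenA, lenB)` must reach the threshold. A worked sketch with made-up lengths:

```java
public class SizeSimilaritySketch {
    // Same ratio test as the primitive overload above.
    static boolean testSizeSimilarity(final int lengthA, final int lengthB, final double threshold) {
        return Math.min(lengthA, lengthB) / (double) Math.max(lengthA, lengthB) >= threshold;
    }

    public static void main(String[] args) {
        // Lengths 800 and 1000 give a ratio of 0.8, so the pair passes a 0.5 threshold but fails 0.9.
        System.out.println(testSizeSimilarity(800, 1000, 0.5));  // true
        System.out.println(testSizeSimilarity(800, 1000, 0.9));  // false
    }
}
```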
private static boolean testBreakendProximity(final SVCallRecord a, final SVCallRecord b, final int window,
final SAMSequenceDictionary dictionary) {
- final SimpleInterval intervalA1 = a.getPositionAInterval().expandWithinContig(window, dictionary);
- final SimpleInterval intervalA2 = a.getPositionBInterval().expandWithinContig(window, dictionary);
- if (intervalA1 == null) {
- logger.warn("Invalid start position " + a.getPositionA() + " in record " + a.getId() +
+ return testBreakendProximity(a.getPositionAInterval(), a.getPositionBInterval(),
+ b.getPositionAInterval(), b.getPositionBInterval(), window, dictionary);
+ }
+
+ private static boolean testBreakendProximity(final SimpleInterval intervalA1, final SimpleInterval intervalA2,
+ final SimpleInterval intervalB1, final SimpleInterval intervalB2,
+ final int window, final SAMSequenceDictionary dictionary) {
+ final SimpleInterval intervalA1Padded = intervalA1.expandWithinContig(window, dictionary);
+ final SimpleInterval intervalA2Padded = intervalA2.expandWithinContig(window, dictionary);
+ if (intervalA1Padded == null) {
+ logger.warn("Invalid start position " + intervalA1.getContig() + ":" + intervalA1.getStart() +
" - record will not be matched");
return false;
}
- if (intervalA2 == null) {
- logger.warn("Invalid end position " + a.getPositionB() + " in record " + a.getId() +
+ if (intervalA2Padded == null) {
+ logger.warn("Invalid end position " + intervalA2.getContig() + ":" + intervalA2.getStart() +
" - record will not be matched");
return false;
}
- final SimpleInterval intervalB1 = b.getPositionAInterval();
- final SimpleInterval intervalB2 = b.getPositionBInterval();
- return intervalA1.overlaps(intervalB1) && intervalA2.overlaps(intervalB2);
+ return intervalA1Padded.overlaps(intervalB1) && intervalA2Padded.overlaps(intervalB2);
}
/**
@@ -218,7 +270,8 @@ private static boolean testBreakendProximity(final SVCallRecord a, final SVCallR
*/
private static int getLength(final SVCallRecord record, final int missingInsertionLength) {
Utils.validate(record.isIntrachromosomal(), "Record must be intra-chromosomal");
- if (record.getType() == GATKSVVCFConstants.StructuralVariantAnnotationType.INS) {
+ if (record.getType() == GATKSVVCFConstants.StructuralVariantAnnotationType.INS ||
+ record.getType() == GATKSVVCFConstants.StructuralVariantAnnotationType.CPX) {
return record.getLength() == null ? missingInsertionLength : Math.max(record.getLength(), 1);
} else if (record.getType() == GATKSVVCFConstants.StructuralVariantAnnotationType.BND) {
return record.getPositionB() - record.getPositionA() + 1;
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/sv/cluster/ClusteringParameters.java b/src/main/java/org/broadinstitute/hellbender/tools/sv/cluster/ClusteringParameters.java
index 2365c5041e8..fe665c96c72 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/sv/cluster/ClusteringParameters.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/sv/cluster/ClusteringParameters.java
@@ -17,6 +17,7 @@ public class ClusteringParameters {
// if true, both reciprocal overlap and window criteria must be met
// if false, reciprocal overlap and/or window criteria must be met
+ // NOTE this is currently set to true in all tools but is being kept for possible future use
private final boolean requiresOverlapAndProximity;
// returns true if two given records are the correct type of pair for this parameter set
@@ -57,7 +58,7 @@ public boolean isValidPair(final SVCallRecord a, final SVCallRecord b) {
}
public static ClusteringParameters createDepthParameters(final double reciprocalOverlap, final double sizeSimilarity, final int window, final double sampleOverlap) {
- return new ClusteringParameters(reciprocalOverlap, sizeSimilarity, window, sampleOverlap, false, (a,b) -> a.isDepthOnly() && b.isDepthOnly());
+ return new ClusteringParameters(reciprocalOverlap, sizeSimilarity, window, sampleOverlap, true, (a,b) -> a.isDepthOnly() && b.isDepthOnly());
}
public static ClusteringParameters createMixedParameters(final double reciprocalOverlap, final double sizeSimilarity, final int window, final double sampleOverlap) {
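With `requiresOverlapAndProximity` now set to true for depth-only calls, a depth-only pair must satisfy both the reciprocal-overlap criterion and the (now 10 Mbp) window. A call-site sketch using the defaults defined in CanonicalSVLinkage; the wrapper class here is hypothetical:

```java
import org.broadinstitute.hellbender.tools.sv.cluster.CanonicalSVLinkage;
import org.broadinstitute.hellbender.tools.sv.cluster.ClusteringParameters;

public class DepthParamsSketch {
    public static void main(String[] args) {
        // Depth-only clustering now requires overlap AND proximity, with a 10 Mbp window by default.
        final ClusteringParameters depthParams = ClusteringParameters.createDepthParameters(
                CanonicalSVLinkage.DEFAULT_RECIPROCAL_OVERLAP_DEPTH_ONLY,  // 0.8
                CanonicalSVLinkage.DEFAULT_SIZE_SIMILARITY_DEPTH_ONLY,     // 0
                CanonicalSVLinkage.DEFAULT_WINDOW_DEPTH_ONLY,              // 10,000,000 after this change
                CanonicalSVLinkage.DEFAULT_SAMPLE_OVERLAP_DEPTH_ONLY);     // 0
        System.out.println("depth-only window = " + depthParams.getWindow());
    }
}
```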
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/GenotypeGVCFs.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/GenotypeGVCFs.java
index cec6dbec129..a8740aaf678 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/GenotypeGVCFs.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/GenotypeGVCFs.java
@@ -29,13 +29,28 @@
import org.broadinstitute.hellbender.tools.walkers.annotator.VariantAnnotatorEngine;
import org.broadinstitute.hellbender.tools.walkers.genotyper.GenotypeCalculationArgumentCollection;
import org.broadinstitute.hellbender.tools.walkers.mutect.M2ArgumentCollection;
-import org.broadinstitute.hellbender.utils.*;
+import org.broadinstitute.hellbender.utils.GenomeLoc;
+import org.broadinstitute.hellbender.utils.GenomeLocParser;
+import org.broadinstitute.hellbender.utils.GenomeLocSortedSet;
+import org.broadinstitute.hellbender.utils.IntervalMergingRule;
+import org.broadinstitute.hellbender.utils.IntervalSetRule;
+import org.broadinstitute.hellbender.utils.IntervalUtils;
+import org.broadinstitute.hellbender.utils.SimpleInterval;
import org.broadinstitute.hellbender.utils.variant.GATKVariantContextUtils;
import org.broadinstitute.hellbender.tools.walkers.annotator.allelespecific.ReducibleAnnotation;
-
-import java.util.*;
+import org.broadinstitute.hellbender.utils.variant.writers.IntervalFilteringVcfWriter;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.Collections;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
import java.util.stream.Collectors;
+import static org.broadinstitute.hellbender.utils.variant.writers.IntervalFilteringVcfWriter.Mode.STARTS_IN;
+
/**
* Perform joint genotyping on one or more samples pre-called with HaplotypeCaller
*
@@ -114,7 +129,7 @@ public final class GenotypeGVCFs extends VariantLocusWalker {
/**
* Import all data between specified intervals. Improves performance using large lists of intervals, as in exome
* sequencing, especially if GVCF data only exists for specified intervals. Use with
- * --only-output-calls-starting-in-intervals if input GVCFs contain calls outside the specified intervals.
+ * --{@value StandardArgumentDefinitions#VARIANT_OUTPUT_INTERVAL_FILTERING_MODE_LONG_NAME} if input GVCFs contain calls outside the specified intervals.
*/
@Argument(fullName = GenomicsDBImport.MERGE_INPUT_INTERVALS_LONG_NAME,
shortName = GenomicsDBImport.MERGE_INPUT_INTERVALS_LONG_NAME,
@@ -158,13 +173,14 @@ public final class GenotypeGVCFs extends VariantLocusWalker {
/**
* This option can only be activated if intervals are specified.
*/
+ @DeprecatedFeature
@Advanced
@Argument(fullName= ONLY_OUTPUT_CALLS_STARTING_IN_INTERVALS_FULL_NAME,
- doc="Restrict variant output to sites that start within provided intervals",
- optional=true)
+ doc="Restrict variant output to sites that start within provided intervals, equivalent to '--"+StandardArgumentDefinitions.VARIANT_OUTPUT_INTERVAL_FILTERING_MODE_LONG_NAME+" STARTS_IN'",
+ optional=true,
+ mutex = {StandardArgumentDefinitions.VARIANT_OUTPUT_INTERVAL_FILTERING_MODE_LONG_NAME})
private boolean onlyOutputCallsStartingInIntervals = false;
-
@Argument(fullName = FORCE_OUTPUT_INTERVALS_NAME,
suppressFileExpansion = true, doc = "sites at which to output genotypes even if non-variant in samples", optional = true)
protected final List forceOutputIntervalStrings = new ArrayList<>();
@@ -186,9 +202,6 @@ public final class GenotypeGVCFs extends VariantLocusWalker {
private VariantContextWriter vcfWriter;
- /** these are used when {@link #onlyOutputCallsStartingInIntervals) is true */
- private List intervals;
-
private OverlapDetector forceOutputIntervals;
private boolean forceOutputIntervalsPresent;
@@ -250,6 +263,11 @@ public void onTraversalStart() {
logger.warn("Note that the Mutect2 reference confidence mode is in BETA -- the likelihoods model and output format are subject to change in subsequent versions.");
}
+ if (onlyOutputCallsStartingInIntervals) {
+ logger.warn("The --" + ONLY_OUTPUT_CALLS_STARTING_IN_INTERVALS_FULL_NAME + " option is deprecated. Please use '--" + StandardArgumentDefinitions.VARIANT_OUTPUT_INTERVAL_FILTERING_MODE_LONG_NAME + " STARTS_IN' for an equivalent filtering.");
+ this.userOutputVariantIntervalFilteringMode = STARTS_IN;
+ }
+
forceOutputIntervalsPresent = !forceOutputIntervalStrings.isEmpty();
if (includeNonVariants && forceOutputIntervalsPresent ) {
@@ -269,23 +287,14 @@ public void onTraversalStart() {
final VCFHeader inputVCFHeader = getHeaderForVariants();
- if(onlyOutputCallsStartingInIntervals) {
- if( !hasUserSuppliedIntervals()) {
- throw new CommandLineException.MissingArgument("-L or -XL", "Intervals are required if --" + ONLY_OUTPUT_CALLS_STARTING_IN_INTERVALS_FULL_NAME + " was specified.");
- }
- }
-
- intervals = hasUserSuppliedIntervals() ? intervalArgumentCollection.getIntervals(getBestAvailableSequenceDictionary()) :
- Collections.emptyList();
-
- final Collection variantAnnotations = makeVariantAnnotations();
+ final Collection variantAnnotations = makeVariantAnnotations();
final Set annotationsToKeep = getAnnotationsToKeep();
annotationEngine = new VariantAnnotatorEngine(variantAnnotations, dbsnp.dbsnp, Collections.emptyList(), false, keepCombined, annotationsToKeep);
merger = new ReferenceConfidenceVariantContextMerger(annotationEngine, getHeaderForVariants(), somaticInput, false, true);
//methods that cannot be called in engine bc its protected
- Set defaultToolVCFHeaderLines = getDefaultToolVCFHeaderLines();
+ final Set defaultToolVCFHeaderLines = getDefaultToolVCFHeaderLines();
vcfWriter = createVCFWriter(outputFile);
//create engine object
@@ -294,7 +303,6 @@ public void onTraversalStart() {
//call initialize method in engine class that creates VCFWriter object and writes a header to it
vcfWriter = gvcfEngine.setupVCFWriter(defaultToolVCFHeaderLines, keepCombined, dbsnp, vcfWriter);
-
}
private Set getAnnotationsToKeep() {
@@ -316,9 +324,7 @@ public void apply(final Locatable loc, List variants, ReadsConte
final VariantContext regenotypedVC = gvcfEngine.callRegion(loc, variants, ref, features, merger, somaticInput, tlodThreshold, afTolerance, forceOutput);
if (regenotypedVC != null) {
- final SimpleInterval variantStart = new SimpleInterval(regenotypedVC.getContig(), regenotypedVC.getStart(), regenotypedVC.getStart());
- if ((forceOutput || !GATKVariantContextUtils.isSpanningDeletionOnly(regenotypedVC)) &&
- (!onlyOutputCallsStartingInIntervals || intervals.stream().anyMatch(interval -> interval.contains (variantStart)))) {
+ if ((forceOutput || !GATKVariantContextUtils.isSpanningDeletionOnly(regenotypedVC))) {
vcfWriter.add(regenotypedVC);
}
}
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/GenotypeGVCFsEngine.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/GenotypeGVCFsEngine.java
index 312a90872e7..9036ecdd44d 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/GenotypeGVCFsEngine.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/GenotypeGVCFsEngine.java
@@ -150,8 +150,9 @@ private VariantContext regenotypeVC(final VariantContext originalVC, final Refer
final VariantContext result;
+ // only re-genotype polymorphic sites
if ( originalVC.isVariant() && originalVC.getAttributeAsInt(VCFConstants.DEPTH_KEY,0) > 0 ) {
- // only re-genotype polymorphic sites
+ // note that the calculateGenotypes method also calculates the QUAL score
final VariantContext regenotypedVC = calculateGenotypes(originalVC, includeNonVariants);
if (regenotypedVC == null) {
return null;
@@ -186,7 +187,7 @@ private VariantContext regenotypeVC(final VariantContext originalVC, final Refer
//don't count sites with no depth and no confidence towards things like AN and InbreedingCoeff
vcBuilder.genotypes(assignNoCallsAnnotationExcludedGenotypes(result.getGenotypes()));
VariantContext annotated = annotationEngine.annotateContext(vcBuilder.make(), features, ref, null, a -> true);
- return new VariantContextBuilder(annotated).genotypes(cleanupGenotypeAnnotations(result, false, keepSB)).make();
+ return new VariantContextBuilder(annotated).genotypes(cleanupGenotypeAnnotations(annotated, false, keepSB)).make();
} else if (includeNonVariants) {
// For monomorphic sites we need to make sure e.g. the hom ref genotypes are created and only then are passed to the annotation engine.
VariantContext preannotated = new VariantContextBuilder(result).genotypes(cleanupGenotypeAnnotations(result, true, false)).make();
@@ -461,24 +462,29 @@ static List cleanupGenotypeAnnotations(final VariantContext vc, final
attrs.put(GATKVCFConstants.HAPLOTYPE_CALLER_PHASING_GT_KEY, GenotypeGVCFs.PHASED_HOM_VAR_STRING);
}
- // create AD if it's not there
- if ( !oldGT.hasAD() && vc.isVariant() ) {
+ // create AD if it's not there, but only if there's data
+ if ( !oldGT.hasAD() && vc.isVariant() && depth > 0) {
final int[] AD = new int[vc.getNAlleles()];
AD[0] = depth;
builder.AD(AD);
}
if ( createRefGTs ) {
- // move the GQ to RGQ
- if (oldGT.hasGQ()) {
+ //keep 0 depth samples and 0 GQ samples as no-call
+ if (depth > 0 && oldGT.hasGQ()) {
+ if (oldGT.getGQ() > 0) {
+ final List refAlleles = Collections.nCopies(oldGT.getPloidy(), vc.getReference());
+ builder.alleles(refAlleles);
+ } else {
+ builder.alleles(Collections.nCopies(oldGT.getPloidy(),Allele.NO_CALL));
+ }
+
+ // move the GQ to RGQ
builder.noGQ();
attrs.put(GATKVCFConstants.REFERENCE_GENOTYPE_QUALITY, oldGT.getGQ());
- }
-
- //keep 0 depth samples and 0 GQ samples as no-call
- if (depth > 0 && oldGT.hasGQ() && oldGT.getGQ() > 0) {
- final List refAlleles = Collections.nCopies(oldGT.getPloidy(), vc.getReference());
- builder.alleles(refAlleles);
+ } else {
+ builder.alleles(Collections.nCopies(oldGT.getPloidy(),Allele.NO_CALL));
+ builder.noGQ().noDP();
}
// also, the PLs are technically no longer usable
@@ -494,8 +500,8 @@ static List cleanupGenotypeAnnotations(final VariantContext vc, final
* Does this genotype look like it has no reads and should be excluded from annotations?
*/
static boolean excludeFromAnnotations(Genotype oldGT) {
- return oldGT.isHomRef() && !oldGT.hasPL()
- && ((oldGT.hasDP() && oldGT.getDP() == 0) || !oldGT.hasDP())
+ return (oldGT.isHomRef() || oldGT.isNoCall())
+ && (!oldGT.hasDP() || oldGT.getDP() == 0)
&& oldGT.hasGQ() && oldGT.getGQ() == 0;
}
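The widened predicate now also covers no-call genotypes and drops the PL check. Below is a self-contained sketch of the same condition, using htsjdk's GenotypeBuilder to construct a test genotype; the class and sample name are made up:

```java
import java.util.Collections;
import htsjdk.variant.variantcontext.Allele;
import htsjdk.variant.variantcontext.Genotype;
import htsjdk.variant.variantcontext.GenotypeBuilder;

public class ExcludeFromAnnotationsSketch {
    // Mirrors the updated condition: hom-ref or no-call, no depth (or depth 0), and GQ == 0.
    static boolean excludeFromAnnotations(final Genotype g) {
        return (g.isHomRef() || g.isNoCall())
                && (!g.hasDP() || g.getDP() == 0)
                && g.hasGQ() && g.getGQ() == 0;
    }

    public static void main(String[] args) {
        final Genotype noCall = new GenotypeBuilder("sample1",
                Collections.nCopies(2, Allele.NO_CALL)).DP(0).GQ(0).make();
        System.out.println(excludeFromAnnotations(noCall));  // true: no reads and GQ 0
    }
}
```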
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/ReferenceConfidenceVariantContextMerger.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/ReferenceConfidenceVariantContextMerger.java
index f46758fc39e..9484da25a9a 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/ReferenceConfidenceVariantContextMerger.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/ReferenceConfidenceVariantContextMerger.java
@@ -439,7 +439,8 @@ protected static void removeStaleAttributesAfterMerge(final Map
attributes.remove(GATKVCFConstants.MLE_ALLELE_COUNT_KEY);
attributes.remove(GATKVCFConstants.MLE_ALLELE_FREQUENCY_KEY);
attributes.remove(VCFConstants.END_KEY);
- attributes.remove(GATKVCFConstants.EVENT_COUNT_IN_HAPLOTYPE_KEY); //median doesn't make sense here so drop it; used for ClusteredEventFilter, which doesn't apply to MT
+ attributes.remove(GATKVCFConstants.EVENT_COUNT_IN_HAPLOTYPE_KEY);
+ attributes.remove(GATKVCFConstants.EVENT_COUNT_IN_REGION_KEY); //median doesn't make sense here so drop it; used for ClusteredEventFilter, which doesn't apply to MT
}
/**
@@ -579,6 +580,7 @@ private GenotypesContext mergeRefConfidenceGenotypes(final VariantContext vc,
final int ploidy = g.getPloidy();
final GenotypeBuilder genotypeBuilder = new GenotypeBuilder(g);
if (!doSomaticMerge) {
+ //do attribute subsetting
if (g.hasPL() || g.hasAD()) {
int[] perSampleIndexesOfRelevantAlleles = AlleleSubsettingUtils.getIndexesOfRelevantAllelesForGVCF(remappedAlleles, targetAlleles, vc.getStart(), g, false);
if (g.hasPL()) {
@@ -590,8 +592,10 @@ private GenotypesContext mergeRefConfidenceGenotypes(final VariantContext vc,
if (g.hasAD()) {
genotypeBuilder.AD(AlleleSubsettingUtils.generateAD(g.getAD(), perSampleIndexesOfRelevantAlleles));
}
+
+ }
//clean up low confidence hom refs for better annotations later
- } else if (GenotypeGVCFsEngine.excludeFromAnnotations(g)) {
+ if (GenotypeGVCFsEngine.excludeFromAnnotations(g)) {
genotypeBuilder.alleles(Collections.nCopies(ploidy, Allele.NO_CALL));
}
}
@@ -658,7 +662,7 @@ private GenotypesContext mergeRefConfidenceGenotypes(final VariantContext vc,
GATKVariantContextUtils.makeGenotypeCall(g.getPloidy(),
genotypeBuilder, assignmentMethod,
g.hasLikelihoods() ? g.getLikelihoods().getAsVector() : null,
- targetAlleles, new GenotypeBuilder(g.getSampleName(), originalGTAlleles).make(), null);
+ targetAlleles, new GenotypeBuilder(g).alleles(originalGTAlleles).make(), null);
mergedGenotypes.add(genotypeBuilder.make());
}
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/AnnotationUtils.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/AnnotationUtils.java
index 1d9949b219e..f979b38dc9c 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/AnnotationUtils.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/AnnotationUtils.java
@@ -4,13 +4,20 @@
import htsjdk.variant.variantcontext.Genotype;
import htsjdk.variant.variantcontext.VariantContext;
import org.apache.commons.lang3.StringUtils;
+import htsjdk.variant.vcf.VCFHeaderLineCount;
+import htsjdk.variant.vcf.VCFInfoHeaderLine;
+import htsjdk.variant.vcf.VCFConstants;
import org.broadinstitute.hellbender.tools.walkers.annotator.allelespecific.*;
import org.broadinstitute.hellbender.utils.genotyper.AlleleLikelihoods;
import org.broadinstitute.hellbender.utils.read.GATKRead;
+import org.broadinstitute.hellbender.utils.variant.GATKVCFHeaderLines;
+import org.broadinstitute.hellbender.utils.variant.GATKVCFConstants;
import java.util.*;
public final class AnnotationUtils {
+
+ public static final String ALLELE_SPECIFIC_ANNOTATION_KEY_PREFIX = "AS_";
public static final String ALLELE_SPECIFIC_RAW_DELIM = "|";
public static final String ALLELE_SPECIFIC_REDUCED_DELIM = ",";
public static final String ALLELE_SPECIFIC_SPLIT_REGEX = "\\|"; //String.split takes a regex, so we need to escape the pipe
@@ -74,16 +81,23 @@ public static List decodeAnyASList( final String somethingList) {
* @param annotation the annotation to be tested
* @return true if the annotation is expected to have values per-allele
*/
- public static boolean isAlleleSpecific(final InfoFieldAnnotation annotation) {
+ public static boolean isAlleleSpecific(final VariantAnnotation annotation) {
return annotation instanceof AlleleSpecificAnnotation;
}
+ public static boolean isAlleleSpecificGatkKey(final String annotationKey) {
+ final VCFInfoHeaderLine header = GATKVCFHeaderLines.getInfoLine(annotationKey);
+ return header.getCountType().equals(VCFHeaderLineCount.A) ||
+ header.getCountType().equals(VCFHeaderLineCount.R) ||
+ annotationKey.startsWith(ALLELE_SPECIFIC_ANNOTATION_KEY_PREFIX);
+ }
+
/**
- * Handles all the Java and htsjdk parsing shenanigans
- * @param rawDataString should not have surrounding brackets
+ * Handles all the Java and htsjdk parsing shenanigans from getAttributeAsString
+ * @param rawDataString may have surrounding brackets, with raw delimiter
* @return
*/
- public static List getAlleleLengthListOfString(String rawDataString) {
+ public static List getAlleleLengthListOfStringFromRawData(String rawDataString) {
if (rawDataString == null) {
return Collections.emptyList();
}
@@ -93,6 +107,21 @@ public static List getAlleleLengthListOfString(String rawDataString) {
return Arrays.asList(rawDataString.split(ALLELE_SPECIFIC_SPLIT_REGEX, -1)); //-1 to keep empty data
}
+ /**
+ * Handles all the Java and htsjdk parsing shenanigans from getAttributeAsString
+ * @param dataString may have surrounding brackets, with reduced delimiter
+ * @return
+ */
+ public static List getAlleleLengthListOfString(String dataString) {
+ if (dataString == null) {
+ return Collections.emptyList();
+ }
+ if (dataString.startsWith("[")) {
+ dataString = dataString.substring(1, dataString.length() - 1).replaceAll("\\s", "");
+ }
+ return Arrays.asList(dataString.split(ALLELE_SPECIFIC_REDUCED_DELIM, -1)); //-1 to keep empty data
+ }
+
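The rename separates the raw parser (entries joined with `|`) from a new finalized parser (entries joined with `,`); both strip optional surrounding brackets. A minimal sketch of the two delimiters, independent of the annotation classes:

```java
import java.util.Arrays;
import java.util.List;

public class AlleleSpecificDelimiterSketch {
    public static void main(String[] args) {
        // Raw allele-specific data uses '|' between per-allele entries ...
        final String raw = "[10|20|0]";
        final List<String> rawValues =
                Arrays.asList(raw.substring(1, raw.length() - 1).split("\\|", -1));
        // ... while finalized annotations use ',' between entries.
        final String reduced = "10,20,0";
        final List<String> reducedValues = Arrays.asList(reduced.split(",", -1));
        System.out.println(rawValues + " " + reducedValues);  // [10, 20, 0] [10, 20, 0]
    }
}
```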
static public String generateMissingDataWarning(final VariantContext vc, final Genotype g, final AlleleLikelihoods likelihoods) {
final StringBuilder outString = new StringBuilder("Annotation will not be calculated at position " + vc.getContig() + ":" + vc.getStart() +
" and possibly subsequent");
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/AssemblyComplexity.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/AssemblyComplexity.java
index 7045f4678ec..8ea5771b78e 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/AssemblyComplexity.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/AssemblyComplexity.java
@@ -10,6 +10,7 @@
import org.broadinstitute.barclay.help.DocumentedFeature;
import org.broadinstitute.hellbender.engine.FeatureContext;
import org.broadinstitute.hellbender.engine.ReferenceContext;
+import org.broadinstitute.hellbender.tools.walkers.annotator.allelespecific.AlleleSpecificAnnotation;
import org.broadinstitute.hellbender.utils.MathUtils;
import org.broadinstitute.hellbender.utils.genotyper.AlleleLikelihoods;
import org.broadinstitute.hellbender.utils.haplotype.Event;
@@ -27,7 +28,7 @@
*/
@DocumentedFeature(groupName= HelpConstants.DOC_CAT_ANNOTATORS, groupSummary=HelpConstants.DOC_CAT_ANNOTATORS_SUMMARY,
summary="Describe the complexity of an assembly region")
-public class AssemblyComplexity implements JumboInfoAnnotation {
+public class AssemblyComplexity implements JumboInfoAnnotation, AlleleSpecificAnnotation {
@Argument(fullName = "assembly-complexity-reference-mode",
doc="If enabled will treat the reference as the basis for assembly complexity as opposed to estimated germline haplotypes",
@@ -189,5 +190,4 @@ private static int uniqueVariants(final Haplotype hap1, final Haplotype hap2, fi
private static int editDistance(final Haplotype hap1, final Haplotype hap2, final int excludedPosition) {
return uniqueVariants(hap1, hap2, excludedPosition) + uniqueVariants(hap2, hap1, excludedPosition);
}
-
}
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/QualByDepth.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/QualByDepth.java
index 8646a63e21d..ab9d8fc6b71 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/QualByDepth.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/QualByDepth.java
@@ -7,6 +7,7 @@
import htsjdk.variant.variantcontext.VariantContext;
import org.broadinstitute.barclay.help.DocumentedFeature;
import org.broadinstitute.hellbender.engine.ReferenceContext;
+import org.broadinstitute.hellbender.exceptions.GATKException;
import org.broadinstitute.hellbender.utils.MathUtils;
import org.broadinstitute.hellbender.utils.Utils;
import org.broadinstitute.hellbender.utils.genotyper.AlleleLikelihoods;
@@ -57,7 +58,7 @@ public Map annotate(final ReferenceContext ref,
final VariantContext vc,
final AlleleLikelihoods likelihoods) {
Utils.nonNull(vc);
- if ( !vc.hasLog10PError() ) {
+ if ( !(vc.hasLog10PError() || vc.hasAttribute(GATKVCFConstants.RAW_QUAL_APPROX_KEY)) ) {
return Collections.emptyMap();
}
@@ -72,7 +73,16 @@ public Map annotate(final ReferenceContext ref,
return Collections.emptyMap();
}
- final double qual = -10.0 * vc.getLog10PError();
+ final double qual;
+ if (vc.hasLog10PError()) {
+ qual = -10.0 * vc.getLog10PError();
+ } else {
+ try {
+ qual = vc.getAttributeAsInt(GATKVCFConstants.RAW_QUAL_APPROX_KEY, 0);
+ } catch (NumberFormatException e) {
+ throw new GATKException("Error at: " + vc.getContig() + ":" + vc.getStart() + " when parsing " + GATKVCFConstants.RAW_QUAL_APPROX_KEY + ": " + e.getMessage());
+ }
+ }
double QD = qual / depth;
// Hack: see note in the fixTooHighQD method below
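When QUAL (Log10PError) is absent, the annotation now falls back to the RAW_QUAL_APPROX (`QUALapprox`) attribute before dividing by depth. A small sketch of that fallback with made-up numbers; the helper is hypothetical:

```java
public class QualByDepthFallbackSketch {
    // log10PError stands in for vc.getLog10PError(); rawQualApprox for the QUALapprox attribute.
    static double qualByDepth(final Double log10PError, final Integer rawQualApprox, final int depth) {
        final double qual = (log10PError != null) ? -10.0 * log10PError : rawQualApprox;
        return qual / depth;
    }

    public static void main(String[] args) {
        System.out.println(qualByDepth(-3.0, null, 10));   // 3.0, derived from QUAL
        System.out.println(qualByDepth(null, 30, 10));     // 3.0, derived from the QUALapprox fallback
    }
}
```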
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/VariantAnnotator.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/VariantAnnotator.java
index f29676286e0..fcac0673b89 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/VariantAnnotator.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/VariantAnnotator.java
@@ -91,8 +91,8 @@
* --expression foo.FILTER
*
*
- *
Caveat
- *
This tool outputs no annotations by default, all annotations/groups must be specified explicitly.
+ *
Caveats
+ *
This tool outputs no annotations by default, all annotations/groups must be specified explicitly. This tool accepts VCF format files only. Using GVCF files as input may result in unexpected behavior.
*
*
Special note on RankSumTestAnnotations
*
RankSumAnnotations produced by this tool are not the same as those produced by the HaplotypeCaller. Without the
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/VariantAnnotatorEngine.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/VariantAnnotatorEngine.java
index e83be35f50f..930a6a51033 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/VariantAnnotatorEngine.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/VariantAnnotatorEngine.java
@@ -145,6 +145,10 @@ public List getInfoAnnotations() {
return Collections.unmodifiableList(infoAnnotations);
}
+ public List getJumboInfoAnnotations() {
+ return Collections.unmodifiableList(jumboInfoAnnotations);
+ }
+
/**
*
* @param infoAnnotationClassName the name of the Java class, NOT the annotation VCF key
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/allelespecific/AS_QualByDepth.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/allelespecific/AS_QualByDepth.java
index f8bb3b3cea9..d0e15fa5ac0 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/allelespecific/AS_QualByDepth.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/allelespecific/AS_QualByDepth.java
@@ -5,8 +5,6 @@
import htsjdk.variant.variantcontext.GenotypesContext;
import htsjdk.variant.variantcontext.VariantContext;
import htsjdk.variant.vcf.VCFCompoundHeaderLine;
-import htsjdk.variant.vcf.VCFHeaderLine;
-import htsjdk.variant.vcf.VCFInfoHeaderLine;
import org.apache.commons.lang3.StringUtils;
import org.broadinstitute.barclay.help.DocumentedFeature;
import org.broadinstitute.hellbender.engine.ReferenceContext;
@@ -78,7 +76,9 @@ public List getRawDescriptions() {
public Map annotate(final ReferenceContext ref,
final VariantContext vc,
final AlleleLikelihoods likelihoods ) {
- return Collections.emptyMap();
+ // first vc is used for the annotation and the second vc here is used just to get the alleles, so in this case we can pass the same vc for both
+ Map annotation = finalizeRawData(vc, vc);
+ return (annotation == null ? Collections.emptyMap() : Collections.singletonMap(getKeyNames().get(0), annotation.get(getKeyNames().get(0))));
}
/**
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/allelespecific/AS_StrandBiasTest.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/allelespecific/AS_StrandBiasTest.java
index d8466d0d997..b521f157d28 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/allelespecific/AS_StrandBiasTest.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/allelespecific/AS_StrandBiasTest.java
@@ -153,7 +153,7 @@ public Map finalizeRawData(final VariantContext vc, final Varia
}
protected void parseRawDataString(ReducibleAnnotationData> myData) {
- List values = AnnotationUtils.getAlleleLengthListOfString(myData.getRawData());
+ List values = AnnotationUtils.getAlleleLengthListOfStringFromRawData(myData.getRawData());
if (values.size() != myData.getAlleles().size()) {
throw new IllegalStateException("Number of alleles and number of allele-specific entries do not match. " +
"Allele-specific annotations should have an entry for each allele including the reference.");
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/flow/FlowAnnotatorBase.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/flow/FlowAnnotatorBase.java
index acf1c74bf9c..11890925065 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/flow/FlowAnnotatorBase.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/flow/FlowAnnotatorBase.java
@@ -34,7 +34,6 @@
* are accumulated as well.
*/
public abstract class FlowAnnotatorBase implements InfoFieldAnnotation {
- private final static Logger logger = LogManager.getLogger(FlowAnnotatorBase.class);
protected final OneShotLogger flowMissingOneShotLogger = new OneShotLogger(FlowAnnotatorBase.class);
@@ -202,7 +201,7 @@ protected void variantType(final VariantContext vc, final LocalContext localCont
if (isSpecial(alleles.get(i))){
continue;
}
- if ((localContext.hmerIndelLength.get(i)==null) || (localContext.hmerIndelLength.get(i)==0)){
+ if ((localContext.hmerIndelLength==null) || (localContext.hmerIndelLength.get(i)==null) || (localContext.hmerIndelLength.get(i)==0)){
isHmer=false;
}
}
@@ -250,7 +249,12 @@ protected void isHmerIndel(final VariantContext vc, final LocalContext localCont
// get byte before and after
final byte before = getReferenceNucleotide(localContext, vc.getStart() - 1);
final byte[] after = getReferenceHmerPlus(localContext, vc.getEnd() + 1, MOTIF_SIZE);
-
+ if (after.length==0){
+            flowMissingOneShotLogger.warn("Failed to get hmer from reference, probably because the " +
+                    "variant is very close to the end of the chromosome; isHmerIndel and RightMotif annotations will not be calculated. " +
+                    "Variant position: " + vc.getContig() + ":" + (vc.getEnd() + 1));
+ return;
+ }
// build two haplotypes. add byte before and after
final byte[] refHap = buildHaplotype(before, ref.getBases(), after);
final byte[] altHap = buildHaplotype(before, alt.getBases(), after);
@@ -338,6 +342,12 @@ protected void getLeftMotif(final VariantContext vc, final LocalContext localCon
}
String motif = getRefMotif(localContext, vc.getStart() - MOTIF_SIZE, MOTIF_SIZE);
+ if (motif.length() != MOTIF_SIZE){
+ flowMissingOneShotLogger.warn("Failed to get motif from reference, probably because the variant is very close to the " +
+ "start of the chromosome. LeftMotif annotation will not be calculated. " +
+ "Variant position: " + vc.getContig() + ":" + vc.getStart());
+ return;
+ }
if ( a.length() != refLength ) {
motif = motif.substring(1) + vc.getReference().getBaseString().substring(0, 1);
}
@@ -350,8 +360,13 @@ protected void getLeftMotif(final VariantContext vc, final LocalContext localCon
protected void getRightMotif(final VariantContext vc, final LocalContext localContext) {
final int refLength = vc.getReference().length();
- final String motif = getRefMotif(localContext, vc.getStart() + refLength, MOTIF_SIZE);
-
+ String motif = getRefMotif(localContext, vc.getStart() + refLength, MOTIF_SIZE);
+ if (motif.length() != MOTIF_SIZE){
+ flowMissingOneShotLogger.warn("Failed to get motif from reference, probably because " +
+ "the variant is close to the end of the chromosome. RightMotif annotation will not be calculated. " +
+ "Variant position: " + vc.getContig() + ":" + vc.getStart());
+ return;
+ }
// fill empty entries (non indel alelles)
for ( int i = 0 ; i < localContext.rightMotif.size() ; i++ ) {
if ( localContext.rightMotif.get(i) == null ) {
@@ -366,6 +381,11 @@ protected void gcContent(final VariantContext vc, final LocalContext localContex
final int begin = vc.getStart() - (GC_CONTENT_SIZE / 2);
final String seq = getRefMotif(localContext, begin + 1, GC_CONTENT_SIZE);
+ if ( seq.length() != GC_CONTENT_SIZE ) {
+ flowMissingOneShotLogger.warn("gcContent will not be calculated at position " + vc.getContig() + ":" + vc.getStart() +
+ " because the variant is too close to the edge of the chromosome");
+ return;
+ }
int gcCount = 0;
for ( byte b : seq.getBytes() ) {
if ( b == 'G' || b == 'C' ) {
@@ -424,11 +444,11 @@ protected void cycleSkip(final VariantContext vc, final LocalContext localContex
localContext.attributes.put(GATKVCFConstants.FLOW_CYCLESKIP_STATUS, css);
}
- // get a single nucleoid from reference
+    // get a single nucleotide from reference
private byte getReferenceNucleotide(final LocalContext localContext, final int start) {
final int index = start - localContext.ref.getWindow().getStart();
final byte[] bases = localContext.ref.getBases();
- Utils.validIndex(index, bases.length);
+        Utils.validIndex(index, bases.length); // do not catch: if the location here is outside of the reference, there is a real problem!
return bases[index];
}
@@ -436,7 +456,13 @@ private byte getReferenceNucleotide(final LocalContext localContext, final int s
private byte[] getReferenceHmerPlus(final LocalContext localContext, final int start, final int additional) {
int index = start - localContext.ref.getWindow().getStart();
final byte[] bases = localContext.ref.getBases();
- Utils.validIndex(index, bases.length);
+ try {
+ Utils.validIndex(index, bases.length);
+ } catch (IllegalArgumentException e) {
+ flowMissingOneShotLogger.warn("Failed to get hmer from reference, too close to the edge. " +
+ "Position: " + localContext.ref.getContig() + ":" + index);
+ return new byte[0];
+ }
// get hmer
final StringBuilder sb = new StringBuilder();
@@ -458,8 +484,12 @@ private String getRefMotif(final LocalContext localContext, final int start, fin
final byte[] bases = localContext.ref.getBases();
final int startIndex = start - localContext.ref.getWindow().getStart();
final int endIndex = startIndex + length;
- Utils.validIndex(startIndex, bases.length);
- Utils.validIndex(endIndex-1, bases.length);
+ try {
+ Utils.validIndex(startIndex, bases.length);
+ Utils.validIndex(endIndex-1, bases.length);
+ } catch (IllegalArgumentException e) {
+ return "";
+ }
return new String(Arrays.copyOfRange(bases, startIndex, endIndex));
}
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/contamination/GetPileupSummaries.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/contamination/GetPileupSummaries.java
index 9c4f09d1bcd..9d6c40a691c 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/contamination/GetPileupSummaries.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/contamination/GetPileupSummaries.java
@@ -9,6 +9,7 @@
import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions;
import org.broadinstitute.hellbender.cmdline.programgroups.CoverageAnalysisProgramGroup;
import org.broadinstitute.hellbender.engine.*;
+import org.broadinstitute.hellbender.engine.filters.MappingQualityReadFilter;
import org.broadinstitute.hellbender.engine.filters.ReadFilter;
import org.broadinstitute.hellbender.engine.filters.ReadFilterLibrary;
import org.broadinstitute.hellbender.engine.filters.WellformedReadFilter;
@@ -100,6 +101,13 @@
* file that have AF of 0.01 or more.
*
*
+ *
+ * Finally, for those using mappers other than bwa mem or dragen-os: the {@code --minimum-mapping-quality} threshold is
+ * set to 50 by default, which limits the reads the tool considers when generating pileups. Certain mappers are known to
+ * assign scores below this threshold even for unique mappings. If you observe empty results in your
+ * summary file, please adjust the {@code --minimum-mapping-quality} parameter according to your input files.
+ *
+ *
*/
@CommandLineProgramProperties(
summary = "Tabulates pileup metrics for inferring contamination",
@@ -112,8 +120,7 @@ public class GetPileupSummaries extends LocusWalker {
public static final String MIN_SITE_AF_LONG_NAME = "minimum-population-allele-frequency";
public static final String MAX_SITE_AF_SHORT_NAME = "max-af";
public static final String MIN_SITE_AF_SHORT_NAME = "min-af";
- public static final String MIN_MAPPING_QUALITY_LONG_NAME = "min-mapping-quality";
- public static final String MIN_MAPPING_QUALITY_SHORT_NAME = "mmq";
+
private static final double DEFAULT_MIN_POPULATION_AF = 0.01;
private static final double DEFAULT_MAX_POPULATION_AF = 0.2;
@@ -137,9 +144,6 @@ public class GetPileupSummaries extends LocusWalker {
doc = "Maximum population allele frequency of sites to consider.", optional = true)
private double maxPopulationAlleleFrequency = DEFAULT_MAX_POPULATION_AF;
- @Argument(fullName = MIN_MAPPING_QUALITY_LONG_NAME, shortName = MIN_MAPPING_QUALITY_SHORT_NAME, doc = "Minimum read mapping quality", optional = true)
- private int minMappingQuality = DEFAULT_MINIMUM_MAPPING_QUALITY;
-
private boolean sawVariantsWithoutAlleleFrequency = false;
private boolean sawVariantsWithAlleleFrequency = false;
@@ -168,6 +172,7 @@ public boolean requiresFeatures() {
@Override
     public List<ReadFilter> getDefaultReadFilters() {
         final List<ReadFilter> filters = new ArrayList<>();
+ filters.add(new MappingQualityReadFilter(DEFAULT_MINIMUM_MAPPING_QUALITY));
filters.add(ReadFilterLibrary.MAPPING_QUALITY_AVAILABLE);
filters.add(ReadFilterLibrary.MAPPING_QUALITY_NOT_ZERO);
filters.add(ReadFilterLibrary.MAPPED);
@@ -208,8 +213,7 @@ public void apply(AlignmentContext alignmentContext, ReferenceContext referenceC
final VariantContext vc = vcs.get(0);
if ( vc.isBiallelic() && vc.isSNP() && alleleFrequencyInRange(vc) ) {
- final ReadPileup pileup = alignmentContext.getBasePileup()
- .makeFilteredPileup(pe -> pe.getRead().getMappingQuality() >= minMappingQuality);
+ final ReadPileup pileup = alignmentContext.getBasePileup();
try {
writer.writeRecord(new PileupSummary(vc, pileup));
} catch (final IOException ex) {
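The minimum-mapping-quality cutoff is now applied as a default read filter rather than as an ad-hoc pileup filter, so the threshold becomes a regular, user-adjustable filter argument as the documentation above describes. Below is a minimal sketch of that pattern, assuming the standard MappingQualityReadFilter constructor; DefaultFilterSketch and defaultFilters are hypothetical names used only for illustration.

    import org.broadinstitute.hellbender.engine.filters.MappingQualityReadFilter;
    import org.broadinstitute.hellbender.engine.filters.ReadFilter;
    import java.util.ArrayList;
    import java.util.List;

    // Sketch of the default-read-filter pattern used above: the threshold supplied here is only a
    // default and, as the new documentation notes, can be overridden at runtime.
    public class DefaultFilterSketch {
        static List<ReadFilter> defaultFilters(int defaultMinMappingQuality) {
            final List<ReadFilter> filters = new ArrayList<>();
            filters.add(new MappingQualityReadFilter(defaultMinMappingQuality));
            return filters;
        }
    }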
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/conversion/GtfInfo.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/conversion/GtfInfo.java
new file mode 100644
index 00000000000..d5b4b0c995a
--- /dev/null
+++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/conversion/GtfInfo.java
@@ -0,0 +1,66 @@
+package org.broadinstitute.hellbender.tools.walkers.conversion;
+
+import htsjdk.samtools.util.Interval;
+
+/**
+ * A class that represents information extracted from a feature in a GTF file.
+ * The {@code GtfInfo} object encapsulates details about a specific gene or transcript,
+ * including its type, gene name, and interval.
+ *
+ * The {@code GtfInfo.Type} enum specifies whether the feature is a
+ * {@code GENE} or a {@code TRANSCRIPT}.
+ *
+ * The interval specifies the feature's contig (chromosome), start position,
+ * and end position, providing the precise location of the gene or transcript
+ * on the genome.
+ *
+ * Example usage:
+ *
+ *     Interval interval = new Interval("chr1", 1000, 2000);
+ *     GtfInfo gtfInfo = new GtfInfo(interval, GtfInfo.Type.GENE, "MAPK1");
+ *
+ * Convert Gencode GTF files to BED format with options for gene- and transcript-level processing.
+ * This tool allows for the extraction of gene and transcript information from Gencode GTF files and
+ * outputs the data in BED format.
+ *
+ * The conversion process sorts entries by karyotype, allows selection of either gene- or
+ * transcript-level data, and offers an option to only use basic transcripts. It ensures that the
+ * BED output is sorted and formatted correctly for subsequent use.
+ * Note that it has been tested for both human and mouse Gencode GTFs.
+ *
+ * Usage examples
+ *
+ * Example commands to run GtfToBed for typical scenarios:
+ *
+ * (i) Convert GTF to BED with gene level data
+ *
+ * This mode extracts and converts gene data from the input GTF file to BED format:
+ */
+
+@CommandLineProgramProperties(
+ summary = "Converts Gencode GTF files to Bed file format with each row of bed file being either a gene or a transcript.",
+ oneLineSummary = "Gencode GTF to BED",
+ programGroup = ShortVariantDiscoveryProgramGroup.class
+)
+
+@DocumentedFeature
+@WorkflowProperties
+public class GtfToBed extends FeatureWalker<GencodeGtfFeature> {
+ public static final String SORT_BY_TRANSCRIPT_LONG_NAME = "sort-by-transcript";
+ public static final String USE_BASIC_TRANSCRIPT_LONG_NAME = "use-basic-transcript";
+ public static final String INPUT_LONG_NAME = "gtf-path";
+ protected final Logger logger = LogManager.getLogger(this.getClass());
+
+ @Argument(fullName = INPUT_LONG_NAME, doc = "Path to Gencode GTF file")
+ public GATKPath inputFile;
+
+ @Argument(fullName = StandardArgumentDefinitions.OUTPUT_LONG_NAME , doc = "Output BED file")
+ public GATKPath outputFile;
+
+ @Argument(fullName = SORT_BY_TRANSCRIPT_LONG_NAME, doc = "Make each row of BED file sorted by transcript", optional = true)
+ public boolean sortByTranscript = false;
+
+ @Argument(fullName = USE_BASIC_TRANSCRIPT_LONG_NAME, doc = "Only use basic transcripts")
+ public boolean sortByBasic = false;
+
+ //stores either gene or transcript ID and summary information about the feature
+    private final Map<String, GtfInfo> featureInfoMap = new HashMap<>();
+
+ //Sequence Dictionary
+ private SAMSequenceDictionary sequenceDictionary = null;
+
+ @Override
+    protected boolean isAcceptableFeatureType(Class<? extends Feature> featureType) {
+ return featureType.isAssignableFrom(GencodeGtfFeature.class);
+ }
+
+ // runs per line of gtf file
+ @Override
+ public void apply(GencodeGtfFeature feature, ReadsContext readsContext, ReferenceContext referenceContext, FeatureContext featureContext) {
+ // list of all features of the gene
+        List<GencodeGtfFeature> geneFeatures = feature.getAllFeatures();
+
+ // process each gtf feature in the list of gene features
+ for (GencodeGtfFeature gtfFeature : geneFeatures) {
+ // the basic tag is in optional fields
+            List<GencodeGtfFeature.OptionalField<?>> optionalFields = getOptionalFields(gtfFeature);
+
+ // if the gtf feature is a Gene
+ if (gtfFeature.getFeatureType() == GencodeGtfFeature.FeatureType.GENE) {
+ processGeneFeature(gtfFeature);
+ }
+
+ // check if the gtf feature is a transcript. If user only wants basic transcripts check that it has the basic tag
+ else if (gtfFeature.getFeatureType() == GencodeGtfFeature.FeatureType.TRANSCRIPT) {
+ if (sortByBasic) {
+                    for (GencodeGtfFeature.OptionalField<?> field : optionalFields) {
+ if ("basic".equals(field.getValue())) {
+ processTranscriptFeature(gtfFeature);
+ }
+ }
+ } else {
+ processTranscriptFeature(gtfFeature);
+ }
+ }
+ }
+ }
+
+ // gets the tag out of the list of optional fields
+    private List<GencodeGtfFeature.OptionalField<?>> getOptionalFields(GencodeGtfFeature gtfFeature) {
+        List<GencodeGtfFeature.OptionalField<?>> optionalFields = null;
+ try {
+ optionalFields = gtfFeature.getOptionalField("tag");
+ } catch (Exception e) {
+ logger.error("Could not retrieve optional fields: ", e);
+ }
+ return optionalFields;
+ }
+
+ // stores the gene ID and Interval info in hashmap
+ private void processGeneFeature(GencodeGtfFeature gtfFeature) {
+ final int geneStart = gtfFeature.getStart();
+ final int geneEnd = gtfFeature.getEnd();
+ final Interval interval = new Interval(gtfFeature.getContig(), geneStart, geneEnd);
+
+ // put the interval, type as gene, and the name of gene
+ final GtfInfo gtfInfo = new GtfInfo(interval, GtfInfo.Type.GENE, gtfFeature.getGeneName());
+
+ // store in hashmap with key as geneId
+ featureInfoMap.put(gtfFeature.getGeneId(), gtfInfo);
+ }
+
+ // stores the transcript ID and Interval info in hashmap
+ private void processTranscriptFeature(GencodeGtfFeature gtfFeature) {
+ //get interval and put the interval, type as transcript, and the name of the gene it's in
+ final Interval interval = new Interval(gtfFeature.getContig(), gtfFeature.getStart(), gtfFeature.getEnd());
+ final GtfInfo gtfInfo = new GtfInfo(interval, GtfInfo.Type.TRANSCRIPT, gtfFeature.getGeneName());
+
+ //store in hashmap with key as transcriptId
+ featureInfoMap.put(gtfFeature.getTranscriptId(), gtfInfo);
+
+ //update start/end of corresponding gene if needed
+ updateGeneStart(gtfFeature);
+ updateGeneEnd(gtfFeature);
+ }
+
+ // update the gene interval start position based on the transcript
+ private void updateGeneStart(GencodeGtfFeature gtfFeature) {
+ // get the start value of the gene
+ int geneStart = featureInfoMap.get(gtfFeature.getGeneId()).getStart();
+
+ // if the transcript start is less than the gene start
+ if (gtfFeature.getStart() < geneStart) {
+ // set the gene start to be the transcript start
+ geneStart = gtfFeature.getStart();
+ updateGeneInterval(gtfFeature, geneStart, featureInfoMap.get(gtfFeature.getGeneId()).getEnd());
+ }
+ }
+
+ // update the gene interval end position based on the transcript
+ private void updateGeneEnd(GencodeGtfFeature gtfFeature) {
+ // get the end value of the gene
+ int geneEnd = featureInfoMap.get(gtfFeature.getGeneId()).getEnd();
+
+ // if the transcript end is greater than the gene end
+ if (gtfFeature.getEnd() > geneEnd) {
+ // set the gene end to be the transcript end
+ geneEnd = gtfFeature.getEnd();
+ updateGeneInterval(gtfFeature, featureInfoMap.get(gtfFeature.getGeneId()).getStart(), geneEnd);
+ }
+ }
+
+ // updates an interval of the gene if it needs to be changed
+ private void updateGeneInterval(GencodeGtfFeature gtfFeature, int geneStart, int geneEnd) {
+ Interval geneInterval = new Interval(gtfFeature.getContig(), geneStart, geneEnd);
+ GtfInfo gtfGeneInfo = new GtfInfo(geneInterval, GtfInfo.Type.GENE, gtfFeature.getGeneName());
+ featureInfoMap.put(gtfFeature.getGeneId(), gtfGeneInfo);
+ }
+
+ @Override
+ public void onTraversalStart() {
+ sequenceDictionary = getBestAvailableSequenceDictionary();
+ if(sequenceDictionary == null){
+ throw new UserException("Sequence Dictionary must be specified (" + StandardArgumentDefinitions.SEQUENCE_DICTIONARY_NAME + ").");
+ }
+ }
+
+ // runs immediately after it has gone through each line of gtf (apply method)
+ @Override
+ public Object onTraversalSuccess() {
+ // create linked hash map to store sorted values of idToInfo
+        LinkedHashMap<String, GtfInfo> karyotypeIdToInfo = getSortedMap(sequenceDictionary);
+
+ // if user wants to sort by transcript only use transcripts else only use genes
+ GtfInfo.Type selectedType = sortByTranscript ? GtfInfo.Type.TRANSCRIPT : GtfInfo.Type.GENE;
+ writeToBed(selectedType, karyotypeIdToInfo);
+
+ return null;
+ }
+
+ //Compare GtfInfo objects positionally by contig and start position. If transcripts have the same contig and start, compare by TranscriptId
+    public static class GtfInfoComparator implements Comparator<Map.Entry<String, GtfInfo>> {
+
+ private final SAMSequenceDictionary dictionary;
+
+ GtfInfoComparator(SAMSequenceDictionary dictionary) {
+ this.dictionary = dictionary;
+ }
+
+ // compare two entries of a map where key = geneId or transcriptId and value = gtfInfo object
+ @Override
+        public int compare(Map.Entry<String, GtfInfo> e1, Map.Entry<String, GtfInfo> e2) {
+ final Interval e1Interval = e1.getValue().getInterval();
+ final Interval e2Interval = e2.getValue().getInterval();
+
+ Utils.nonNull(dictionary.getSequence(e1Interval.getContig()), "could not get sequence for " + e1Interval.getContig());
+ Utils.nonNull(dictionary.getSequence(e2Interval.getContig()), "could not get sequence for " + e2Interval.getContig());
+
+ //compare by contig, then start, then by key
+ return Comparator
+                    .comparingInt((Map.Entry<String, GtfInfo> e) ->
+ dictionary.getSequence(e.getValue().getInterval().getContig()).getSequenceIndex())
+ .thenComparingInt(e -> e.getValue().getInterval().getStart())
+ .thenComparing(Map.Entry::getKey)
+ .compare(e1,e2);
+
+
+ }
+ }
+
+ // sorts the map containing the features based on contig and start position
+    private LinkedHashMap<String, GtfInfo> getSortedMap(SAMSequenceDictionary sequenceDictionary) {
+ // create a list that has the keys and values of idToInfo and sort the list using GtfInfoComparator
+        List<Map.Entry<String, GtfInfo>> entries = new ArrayList<>(featureInfoMap.entrySet());
+ entries.sort(new GtfInfoComparator(sequenceDictionary));
+
+ // put each (sorted) entry in the list into a linked hashmap
+        LinkedHashMap<String, GtfInfo> karyotypeIdToInfo = new LinkedHashMap<>();
+        for (Map.Entry<String, GtfInfo> entry : entries) {
+ karyotypeIdToInfo.put(entry.getKey(), entry.getValue());
+ }
+
+ return karyotypeIdToInfo;
+ }
+
+ // writes to bed file
+    private void writeToBed(GtfInfo.Type type, Map<String, GtfInfo> sortedMap) {
+ try (final OutputStream writer = Files.newOutputStream(outputFile.toPath())) {
+            for (Map.Entry<String, GtfInfo> entry : sortedMap.entrySet()) {
+ if (entry.getValue().getType() == type) {
+ String line = formatBedLine(entry, type);
+ writer.write((line + System.lineSeparator()).getBytes());
+ }
+ }
+ } catch (IOException e) {
+ throw new GATKException("Error writing to BED file", e);
+ }
+ }
+
+ // formats each line of the bed file depending on whether user has selected gene or transcript
+    private String formatBedLine(Map.Entry<String, GtfInfo> entry, GtfInfo.Type type) {
+ GtfInfo info = entry.getValue();
+ String line = info.getInterval().getContig() + "\t" +
+ info.getInterval().getStart() + "\t" +
+ info.getInterval().getEnd() + "\t" +
+ info.getGeneName();
+
+ if (type == GtfInfo.Type.TRANSCRIPT) {
+ line += "," + entry.getKey();
+ }
+
+ return line;
+ }
+
+ @Override
+ public GATKPath getDrivingFeaturePath() {
+ return inputFile;
+ }
+}
\ No newline at end of file
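For reference, writeToBed and formatBedLine above emit one tab-separated row per gene or transcript: contig, start, end, and gene name, with transcript rows additionally appending ",<transcript id>". A minimal illustrative sketch of that row format follows; GtfToBedLineSketch, bedLine, and the ENST00000001 id are hypothetical and not part of the tool.

    import htsjdk.samtools.util.Interval;

    // Sketch of the BED rows GtfToBed emits, mirroring the formatBedLine logic above.
    public class GtfToBedLineSketch {
        static String bedLine(Interval interval, String geneName, String transcriptIdOrNull) {
            String line = interval.getContig() + "\t" + interval.getStart() + "\t"
                    + interval.getEnd() + "\t" + geneName;
            return transcriptIdOrNull == null ? line : line + "," + transcriptIdOrNull;
        }

        public static void main(String[] args) {
            Interval interval = new Interval("chr1", 1000, 2000);
            System.out.println(bedLine(interval, "MAPK1", null));           // gene row: chr1  1000  2000  MAPK1
            System.out.println(bedLine(interval, "MAPK1", "ENST00000001")); // transcript row: chr1  1000  2000  MAPK1,ENST00000001
        }
    }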
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/fasta/FastaAlternateReferenceMaker.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/fasta/FastaAlternateReferenceMaker.java
index 2c3b38f0a9d..0406ad861d5 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/fasta/FastaAlternateReferenceMaker.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/fasta/FastaAlternateReferenceMaker.java
@@ -41,6 +41,7 @@
 *
 * If there are multiple variants that start at a site, it chooses one of them randomly.
 * When there are overlapping indels (but with different start positions) only the first will be chosen.
 * This tool works only for SNPs and for simple indels (but not for things like complex substitutions).
+ * This tool works only with VCF files. Using GVCF files as input may result in unexpected behavior.
 *
 * Input
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/featuremapping/AddFlowSNVQuality.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/featuremapping/AddFlowSNVQuality.java
new file mode 100644
index 00000000000..83ac040011c
--- /dev/null
+++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/featuremapping/AddFlowSNVQuality.java
@@ -0,0 +1,626 @@
+package org.broadinstitute.hellbender.tools.walkers.featuremapping;
+
+import com.google.common.annotations.VisibleForTesting;
+import htsjdk.samtools.SAMFileHeader;
+import htsjdk.samtools.SAMUtils;
+import org.apache.commons.lang3.StringUtils;
+import org.broadinstitute.barclay.argparser.*;
+import org.broadinstitute.barclay.help.DocumentedFeature;
+import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions;
+import org.broadinstitute.hellbender.cmdline.programgroups.FlowBasedProgramGroup;
+import org.broadinstitute.hellbender.engine.FeatureContext;
+import org.broadinstitute.hellbender.engine.GATKPath;
+import org.broadinstitute.hellbender.engine.ReadWalker;
+import org.broadinstitute.hellbender.engine.ReferenceContext;
+import org.broadinstitute.hellbender.exceptions.GATKException;
+import org.broadinstitute.hellbender.tools.FlowBasedArgumentCollection;
+import org.broadinstitute.hellbender.tools.walkers.groundtruth.SeriesStats;
+import org.broadinstitute.hellbender.utils.read.*;
+
+import java.io.IOException;
+import java.io.PrintWriter;
+import java.util.*;
+
+@CommandLineProgramProperties(
+ summary = "This program converts the flow qualities that Ultima Genomics CRAM reports to more conventional base qualities. " +
+ "Specifically, the generated quality will report the probability that a specific base is a sequencing error mismatch, " +
+ "while auxilary tags qa, qt, qg, qc report specific probability that a specific base X is a A->X error. " +
+ "Since mismatch error in flow-based chemistries can only occur as a result of several indel errors, " +
+ "we implemented several strategies to estimate the probability of a mismatch which can be specified" +
+ "using the svnq-mode parameter: " +
+ "Legacy - the quality value from flow matrix is used. " +
+ "Optimistic - assuming that the probability of the indel errors are p1 and p2, then snvq=p1*p2 - assuming they always coincide. " +
+ "Pessimistic - snvq=(1-p1)*(1-p2) - assuming they never coincide. " +
+ "Geometric - snvq=sqrt(Optimistic*Pessimistic) - i.e. the geometric mean of the optimistic and Pessimistic modes. " +
+ "The Geometric is set as the default mode",
+ oneLineSummary = "Add SNV Quality to the flow-based CRAM",
+ programGroup = FlowBasedProgramGroup.class
+)
+
+@DocumentedFeature
+@ExperimentalFeature
+public final class AddFlowSNVQuality extends ReadWalker {
+
+ @Argument(fullName = StandardArgumentDefinitions.OUTPUT_LONG_NAME,
+ shortName = StandardArgumentDefinitions.OUTPUT_SHORT_NAME,
+ doc = "File to which reads should be written")
+ @WorkflowOutput(optionalCompanions={StandardArgumentDefinitions.OUTPUT_INDEX_COMPANION})
+ public GATKPath output = null;
+ private SAMFileGATKReadWriter outputWriter;
+
+ @ArgumentCollection
+ public FlowBasedArgumentCollection fbargs = new FlowBasedArgumentCollection();
+
+ @ArgumentCollection
+ public AddFlowSNVQualityArgumentCollection aqArgs = new AddFlowSNVQualityArgumentCollection();
+
+ public static final char PHRED_ASCII_BASE = '!';
+
+ public static final int ERROR_PROB_BAND_1LESS = 0;
+ public static final int ERROR_PROB_BAND_KEY = 1;
+ public static final int ERROR_PROB_BAND_1MORE = 2;
+ public static final int ERROR_PROB_BANDS = 3;
+
+ public double minLikelihoodProbRate = 1e-6;
+ public int maxQualityScore = 60;
+
+ // locals
+ private SeriesStats inputQualStats = new SeriesStats();
+ private SeriesStats outputBQStats = new SeriesStats();
+ private SeriesStats outputQAltStats = new SeriesStats();
+ private SeriesStats outputQCalledStats = new SeriesStats();
+ private SeriesStats outputSumPStats = new SeriesStats();
+
+    // private class to hold the base probabilities and SNVQ probabilities for a read
+ class ReadProbs {
+ double[] baseProbs;
+ double[][] snvqProbs; // length of first dimension is flow order length
+ }
+
+ @Override
+ public void onTraversalStart() {
+ super.onTraversalStart();
+ outputWriter = createSAMWriter(output, true);
+ }
+
+ @Override
+ public void closeTool() {
+ super.closeTool();
+ if ( outputWriter != null ) {
+ outputWriter.close();
+ }
+
+ try {
+ if ( aqArgs.debugCollectStatsInto != null )
+ printStats(aqArgs.debugCollectStatsInto);
+ } catch (IOException e) {
+ throw new GATKException("", e);
+ }
+ }
+
+ @Override
+ public void apply(final GATKRead read, final ReferenceContext referenceContext, final FeatureContext featureContext) {
+
+ // include supplementary alignments?
+ if ( read.isSupplementaryAlignment() && !aqArgs.keepSupplementaryAlignments ) {
+ return;
+ }
+
+ // include qc-failed reads?
+ if ( read.failsVendorQualityCheck() && !aqArgs.includeQcFailedReads ) {
+ return;
+ }
+
+ // collect input stats
+ if ( aqArgs.debugCollectStatsInto != null ) {
+ collectInputStats(read);
+ }
+
+ // add SNVQ attributes
+ addBaseQuality(read, getHeaderForReads(), aqArgs.maxPhredScore, fbargs);
+
+ // collect output stats
+ if ( aqArgs.debugCollectStatsInto != null ) {
+ collectOutputStats(read);
+ if ( aqArgs.debugReadName.size() != 0 && aqArgs.debugReadName.contains(read.getName()) ) {
+ dumpOutputRead(read);
+ }
+ }
+
+ // write read to output
+ outputWriter.addRead(read);
+ }
+
+ private void collectInputStats(GATKRead read) {
+ for ( byte q : read.getBaseQualitiesNoCopy() ) {
+ inputQualStats.add(q);
+ }
+ }
+
+ private void collectOutputStats(GATKRead read) {
+ if ( aqArgs.outputQualityAttribute != null ) {
+ if (read.hasAttribute(aqArgs.outputQualityAttribute)) {
+ for (byte q : read.getAttributeAsString(aqArgs.outputQualityAttribute).getBytes()) {
+ outputBQStats.add(SAMUtils.fastqToPhred((char) q));
+ }
+ }
+ } else {
+ for (byte q : read.getBaseQualitiesNoCopy()) {
+ outputBQStats.add(q);
+ }
+ }
+ final FlowBasedReadUtils.ReadGroupInfo rgInfo = FlowBasedReadUtils.getReadGroupInfo(getHeaderForReads(), read);
+ final byte[] bases = read.getBasesNoCopy();
+ final double[] sumP = new double[bases.length];
+ for ( int i = 0 ; i < 4 ; i++ ) {
+ byte altBase = rgInfo.flowOrder.getBytes()[i];
+ String attrValue = read.getAttributeAsString(attrNameForNonCalledBase(altBase));
+ int ofs = 0;
+ for ( byte q : attrValue.getBytes() ) {
+ if ( bases[ofs] != altBase ) {
+ outputQAltStats.add(SAMUtils.fastqToPhred((char)q));
+ } else {
+ outputQCalledStats.add(SAMUtils.fastqToPhred((char)q));
+ }
+ sumP[ofs] += Math.pow(10.0, SAMUtils.fastqToPhred((char)q) / -10.0);
+ ofs++;
+
+ }
+ }
+ for ( double p : sumP ) {
+ outputSumPStats.add(p);
+ }
+ }
+
+ // dump read as a csv for easier analysis
+ private void dumpOutputRead(GATKRead read) {
+
+ try {
+ // open file
+ final String fname = aqArgs.debugCollectStatsInto + "." + read.getName() + ".csv";
+ logger.info("dumping read into: " + fname);
+ final PrintWriter pw = new PrintWriter(fname);
+
+ // write header
+ final StringBuilder hdr = new StringBuilder();
+ hdr.append("pos,base,qual,tp,t0,bq");
+ final FlowBasedReadUtils.ReadGroupInfo rgInfo = FlowBasedReadUtils.getReadGroupInfo(getHeaderForReads(), read);
+ for (int i = 0; i < 4; i++) {
+ hdr.append(",");
+ hdr.append(attrNameForNonCalledBase(rgInfo.flowOrder.charAt(i)));
+ }
+ hdr.append(",qCalled");
+ pw.println(hdr);
+
+ // access data
+ final byte[] bases = read.getBasesNoCopy();
+ final byte[] quals = read.getBaseQualitiesNoCopy();
+ final byte[] tp = read.getAttributeAsByteArray(FlowBasedRead.FLOW_MATRIX_TAG_NAME);
+ final byte[] t0 = read.getAttributeAsByteArray(FlowBasedRead.FLOW_MATRIX_T0_TAG_NAME);
+ final byte[] bq = (aqArgs.outputQualityAttribute != null)
+ ? read.getAttributeAsString(aqArgs.outputQualityAttribute).getBytes()
+ : null;
+ final byte[][] qX = new byte[4][];
+ for (int i = 0; i < 4; i++) {
+ qX[i] = read.getAttributeAsString(attrNameForNonCalledBase(rgInfo.flowOrder.charAt(i))).getBytes();
+ }
+
+ // write lines
+            List<String> line = new LinkedList<>();
+ for (int pos = 0; pos < bases.length; pos++) {
+ line.clear();
+
+ // position
+ line.add(Integer.toString(pos));
+
+ // base, qual
+ line.add(Character.toString(bases[pos]));
+ line.add(Integer.toString(quals[pos]));
+
+ // tp,t0,bq
+ line.add(Integer.toString(tp[pos]));
+ line.add(Integer.toString(SAMUtils.fastqToPhred((char)t0[pos])));
+ if ( bq != null ) {
+ line.add(Integer.toString(SAMUtils.fastqToPhred((char) bq[pos])));
+ }
+
+ // qX
+ int calledIndex = -1;
+ for (int i = 0; i < 4; i++) {
+ line.add(Integer.toString(SAMUtils.fastqToPhred((char)qX[i][pos])));
+ if ( bases[pos] == rgInfo.flowOrder.charAt(i) ) {
+ calledIndex = i;
+ }
+ }
+
+ // qCalled
+ if ( calledIndex >= 0 ) {
+ line.add(Integer.toString(SAMUtils.fastqToPhred((char)qX[calledIndex][pos])));
+ } else {
+ line.add("-1");
+ }
+
+ // write the line
+ pw.println(StringUtils.join(line, ','));
+ }
+
+ // close file
+ pw.close();
+ } catch (IOException e) {
+ throw new GATKException("", e);
+ }
+ }
+
+ private void printStats(final String fname) throws IOException {
+
+ inputQualStats.csvWrite(fname + ".inputQual.csv");
+ outputBQStats.csvWrite(fname + ".outputBQ.csv");
+ outputQAltStats.csvWrite(fname + ".outputQAlt.csv");
+ outputQCalledStats.csvWrite(fname + ".outputQCalled.csv");
+ outputSumPStats.csvWrite(fname + ".outputSumP.csv");
+ }
+
+ static public String attrNameForNonCalledBase(byte nonCalledBase) {
+ return attrNameForNonCalledBase((char)nonCalledBase);
+ }
+
+ static public String attrNameForNonCalledBase(char nonCalledBase) {
+ return "q" + Character.toLowerCase(nonCalledBase);
+ }
+
+ public void addBaseQuality(final GATKRead read, final SAMFileHeader hdr, double maxPhredScore, FlowBasedArgumentCollection fbargs) {
+
+ // take in phred score limit
+ if ( !Double.isNaN(maxPhredScore) ) {
+ maxQualityScore = (int)maxPhredScore;
+ minLikelihoodProbRate = Math.pow(10, -maxPhredScore / 10.0);
+ }
+
+ // convert to a flow base read
+ final FlowBasedReadUtils.ReadGroupInfo rgInfo = FlowBasedReadUtils.getReadGroupInfo(hdr, read);
+ final FlowBasedRead fbRead = new FlowBasedRead(read, rgInfo.flowOrder, rgInfo.maxClass, fbargs);
+ final int flowOrderLength = FlowBasedReadUtils.calcFlowOrderLength(rgInfo.flowOrder);
+
+ // generate base and snvq probabilities for the read
+ final ReadProbs readProbs = generateFlowReadBaseAndSNVQErrorProbabilities(fbRead, flowOrderLength, rgInfo.flowOrder.getBytes());
+
+ // install in read
+ if ( aqArgs.outputQualityAttribute != null ) {
+ read.setAttribute(aqArgs.outputQualityAttribute, new String(convertErrorProbToFastq(readProbs.baseProbs)));
+ } else {
+ read.setBaseQualities(convertErrorProbToPhred(readProbs.baseProbs));
+ }
+ for ( int i = 0 ; i < flowOrderLength ; i++ ) {
+ final String name = AddFlowSNVQuality.attrNameForNonCalledBase(rgInfo.flowOrder.charAt(i));
+ read.setAttribute(name, new String(convertErrorProbToFastq(readProbs.snvqProbs[i])));
+ }
+ }
+
+ // Not using SamUtils function since normally an error probability can not be zero.
+ // still, this method is called to convert base quality as well as snvq, which is computed.
+ // the following check is a safety, in case snvq produces a zero.
+ private char[] convertErrorProbToFastq(double[] errorProb) {
+
+ byte[] phred = convertErrorProbToPhred(errorProb);
+ return SAMUtils.phredToFastq(phred).toCharArray();
+ }
+
+ // Not using SamUtils function since normally an error probability can not be zero.
+ // still, this method is called to convert base quality as well as snvq, which is computed.
+ // the following check is a safety, in case snvq produces a zero.
+ private byte[] convertErrorProbToPhred(double[] errorProb) {
+
+ final byte[] phred = new byte[errorProb.length];
+ for ( int i = 0 ; i < errorProb.length ; i++ ) {
+
+ if ( errorProb[i] == 0 ) {
+ phred[i] = (byte)maxQualityScore;
+ } else {
+ final double p = errorProb[i];
+ phred[i] = (byte)Math.round(-10 * Math.log10(p));
+ }
+ }
+ return phred;
+ }
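A quick numeric illustration of the conversion above: an error probability of 0.001 maps to Phred 30, and a probability of exactly zero is capped at the maximum quality score. A standalone sketch follows; PhredSketch and toPhred are hypothetical names, and 60 stands in for the default maxQualityScore.

    // Sketch of the error-probability -> Phred conversion used above, with the same zero-probability guard.
    public class PhredSketch {
        static byte toPhred(double errorProb, int maxQualityScore) {
            if (errorProb == 0) {
                return (byte) maxQualityScore;                 // a computed snvq of exactly zero is capped
            }
            return (byte) Math.round(-10 * Math.log10(errorProb));
        }

        public static void main(String[] args) {
            System.out.println(toPhred(0.001, 60)); // 30
            System.out.println(toPhred(0.0, 60));   // 60 (capped)
        }
    }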
+
+ /**
+     * generate base and snvq probabilities for a read.
+ *
+ * @param fbRead a flow based read
+ * @param flowOrderLength number of bases in flow order (essentially number of valid base values)
+     * @param flowOrder the flow order itself (which can be the size of flowOrderLength or a repeat of it)
+ *
+ * @return an instance of a private class containing the base probabilities as well as the snvq probabilities
+ */
+ private ReadProbs generateFlowReadBaseAndSNVQErrorProbabilities(final FlowBasedRead fbRead, final int flowOrderLength, byte[] flowOrder) {
+
+ /**
+ * access key and error probabilities
+ * for a description of the flow probabilities see {@link FlowBasedRead#flowMatrix}
+ */
+ final int[] key = fbRead.getKey();
+ final double[][] errorProbBands = extractErrorProbBands(fbRead, minLikelihoodProbRate);
+
+ // allocate returned prob arrays
+ final double[] baseProbs = new double[fbRead.getBasesNoCopy().length];
+ final double[][] snvqProbs = new double[flowOrderLength][];
+ for ( int i = 0 ; i < snvqProbs.length ; i++ ) {
+ snvqProbs[i] = new double[baseProbs.length];
+ }
+
+ // loop over hmers via flow key
+ int base = 0;
+        Map<Byte, Double> allBaseProb0 = new LinkedHashMap<>();
+        Map<Byte, Double> allBaseProb1 = new LinkedHashMap<>();
+
+ for ( int flow = 0 ; flow < key.length ; flow++ ) {
+ if ( key[flow] != 0 ) {
+
+ // establish initial stat
+ allBaseProb0.clear();
+ allBaseProb1.clear();
+ int flow_i = (flow % flowOrderLength);
+
+ // establish hmer quality
+ final int hmerLength = key[flow];
+ final double[] hmerBaseErrorProbs = generateHmerBaseErrorProbabilities(key, errorProbBands, flow, flowOrderLength, flowOrder, allBaseProb0, allBaseProb1);
+
+ // install value in first byte of the hmer
+ baseProbs[base++] = hmerBaseErrorProbs[0]; // first base, or only base in case of a single base hmer
+ for ( int i = 0 ; i < flowOrderLength ; i++ ) {
+ if ( allBaseProb0.containsKey(flowOrder[i]) ) {
+ snvqProbs[i][base - 1] = allBaseProb0.get(flowOrder[i]);
+ } else if ( i != flow_i ) {
+ snvqProbs[i][base - 1] = minLikelihoodProbRate;
+ }
+ }
+
+ // for hmers longer than 1
+ if ( hmerLength > 1 ) {
+
+ // skip all but last (leave with zero error probability)
+ base += (hmerLength - 2);
+
+ // fill last base from computed error probability
+ baseProbs[base++] = hmerBaseErrorProbs[1]; // last base, if hmer is longer than 1
+
+ for ( int i = 0 ; i < flowOrderLength ; i++ ) {
+ if ( allBaseProb1.containsKey(flowOrder[i]) ) {
+ final double p = allBaseProb1.get(flowOrder[i]);
+ for ( int j = 0 ; j < hmerLength - 1 ; j++ ) {
+ snvqProbs[i][base - 1 - j] = (j == 0) ? p : minLikelihoodProbRate; // all but last get the min prob
+ }
+ } else if ( i != flow_i ) {
+ for ( int j = 0 ; j < hmerLength - 1 ; j++ ) {
+ snvqProbs[i][base - 1 - j] = minLikelihoodProbRate;
+ }
+ }
+ }
+ }
+
+ // override result for the last base with the original hmer error probability
+ if ( base == baseProbs.length ) {
+ baseProbs[base - 1] = errorProbBands[ERROR_PROB_BAND_KEY][flow];
+ }
+ }
+ }
+
+ // adjust probability of called bases so that sum will be 1, also enforce min prob
+ final byte[] bases = fbRead.getBasesNoCopy();
+ for ( int ofs = 0 ; ofs < bases.length ; ofs++ ) {
+
+ // go through alt bases and accumulate p, find out index of called bases (in flow order)
+ final byte calledBase = bases[ofs];
+ double altP = 0;
+ int calledIndex = -1;
+ for (int i = 0; i < flowOrderLength; i++) {
+ if ( calledBase != flowOrder[i] ) {
+ snvqProbs[i][ofs] = Math.max(minLikelihoodProbRate, snvqProbs[i][ofs]);
+ altP += snvqProbs[i][ofs];
+ } else {
+ calledIndex = i;
+ }
+ }
+            if ( calledIndex < 0 ) {
+                throw new GATKException(String.format("failed to locate called base %c in flow order %s", (char)calledBase, new String(flowOrder)));
+ }
+
+ // install probability in called base slot
+ snvqProbs[calledIndex][ofs] = Math.max(0, 1 - altP);
+
+ // at this point, bq becomes trivial (?)
+ baseProbs[ofs] = 1 - snvqProbs[calledIndex][ofs];
+ }
+
+ // build return value
+ ReadProbs readProbs = new ReadProbs();
+ readProbs.baseProbs = baseProbs;
+ readProbs.snvqProbs = snvqProbs;
+ return readProbs;
+ }
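The closing loop above renormalizes each position so the called base receives max(0, 1 - sum of the alternative-base probabilities), and the reported base-quality error probability is the complement of that value. A tiny worked example with hypothetical numbers follows; SnvqNormalizationSketch is an illustrative name only.

    // Hypothetical numbers only: three alt-base error probabilities at one position.
    public class SnvqNormalizationSketch {
        public static void main(String[] args) {
            double[] alt = {0.001, 1e-6, 1e-6};          // q for the three non-called bases
            double altP = alt[0] + alt[1] + alt[2];      // 0.001002
            double called = Math.max(0, 1 - altP);       // probability assigned to the called base
            double bq = 1 - called;                      // base-quality error probability (== altP here)
            System.out.printf("called=%.6f bq=%.6f%n", called, bq); // called=0.998998 bq=0.001002
        }
    }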
+
+ // extract error probability bands. middle (1) band is the key prob.
+ // lower (0) and high (2) are corresponding to -1 and +1 in hmer lengths
+ private static double[][] extractErrorProbBands(final FlowBasedRead flowRead, final double minValue) {
+
+ // access key
+ final int[] key = flowRead.getKey();
+
+ // allocate result
+ double[][] result = new double[ERROR_PROB_BANDS][];
+ for ( int i = 0 ; i < result.length ; i++ ) {
+ result[i] = new double[key.length];
+ }
+
+ for ( int i = 0 ; i < key.length ; i++ ) {
+
+ // extract key probability
+ result[ERROR_PROB_BAND_KEY][i] = Math.max(flowRead.getProb(i, key[i]), minValue);
+
+ // -1
+ if ( key[i] > 0 ) {
+ result[ERROR_PROB_BAND_1LESS][i] = Math.max(flowRead.getProb(i, key[i] - 1), minValue);
+ } else {
+ result[ERROR_PROB_BAND_1LESS][i] = minValue;
+ }
+
+ // +1
+ if ( key[i] < flowRead.getMaxHmer() ) {
+ result[ERROR_PROB_BAND_1MORE][i] = Math.max(flowRead.getProb(i, key[i] + 1), minValue);
+ } else {
+ result[ERROR_PROB_BAND_1MORE][i] = minValue;
+ }
+ }
+
+ return result;
+ }
+
+ @VisibleForTesting
+ protected double[] generateHmerBaseErrorProbabilities(final int[] key, final double[][] errorProbBands, final int flow,
+ final int flowOrderLength, byte[] flowOrder,
+                                                           Map<Byte, Double> allBaseProb0, Map<Byte, Double> allBaseProb1) {
+
+ // result is left/right error probabilities
+ final double[] errorProbs = new double[2];
+ final int hmerLength = key[flow];
+
+ errorProbs[0] = generateSidedHmerBaseErrorProbability(key, errorProbBands, flow, -1, flowOrderLength, flowOrder, allBaseProb0);
+ if ( hmerLength != 1 ) {
+ errorProbs[1] = generateSidedHmerBaseErrorProbability(key, errorProbBands, flow, 1, flowOrderLength, flowOrder, allBaseProb1);
+ }
+
+ return errorProbs;
+ }
+
+ private double generateSidedHmerBaseErrorProbability(final int[] key, final double[][] errorProbBands, final int flow, final int sideIncr,
+                                                          final int flowOrderLength, final byte[] flowOrder, final Map<Byte, Double> allBaseProb) {
+
+ // create a key slice of the area around the flow/hmer.
+ final int minIndex = Math.max(flow - (flowOrderLength - 1), 0);
+ final int maxIndex = Math.min(flow + (flowOrderLength - 1), key.length - 1);
+ final int[] slice = Arrays.copyOfRange(key, minIndex, maxIndex + 1);
+ final int hmerLength = key[flow];
+
+ // walk the flows towards the side until (and including) the first non-zero key
+ // on hmers of length 1 we walk both sides
+ final class SliceInfo {
+ int[] slice;
+ byte altByte;
+ int sideFlow;
+ }
+        final List<SliceInfo> slices = new LinkedList<>();
+ final int[] incrs = (hmerLength != 1)
+ ? new int[] { sideIncr }
+ : new int[] { sideIncr, -sideIncr};
+ for (int incr : incrs) {
+ for (int sideFlow = flow + incr; sideFlow >= 0 && sideFlow < key.length; sideFlow += incr) {
+
+            // side flow cannot overflow the slice
+ if ( sideFlow < minIndex || sideFlow > maxIndex ) {
+ break;
+ }
+
+            // create an alternative key slice by incrementing sideFlow and decrementing flow
+ final int[] altSlice = Arrays.copyOf(slice, slice.length);
+ altSlice[sideFlow - minIndex] += 1;
+ altSlice[flow - minIndex] -= 1;
+ if ( sliceIsValidForConsideration(altSlice, flowOrderLength) ) {
+ SliceInfo si = new SliceInfo();
+ si.slice = altSlice;
+ si.altByte = flowOrder[sideFlow % flowOrderLength];
+ si.sideFlow = sideFlow;
+ slices.add(si);
+ }
+
+ // is the sideFlow (the first encountered) non-zero? if so, break
+ if (key[sideFlow] != 0) {
+ break;
+ }
+ }
+ }
+
+ // at this point, we have a list of valid slices. figure out the error probability for each of them
+ // and compute the base quality
+ final double keyP = sliceProbs(slice, minIndex, key, errorProbBands, flow, flow)[0];
+ double sumP = keyP;
+ for ( final SliceInfo si : slices ) {
+ final double[] sliceP = sliceProbs(si.slice, minIndex, key, errorProbBands, flow, si.sideFlow);
+ if ( allBaseProb != null ) {
+ allBaseProb.put(si.altByte, getSnvq(sliceP[0], sliceP[1], sliceP[2], aqArgs.snvMode));
+ }
+ sumP += sliceP[0];
+ }
+ final double ep = 1 - (keyP / sumP);
+
+ return ep;
+ }
+
+ static double getSnvq(final double sliceP, final double p1, final double p2, AddFlowSNVQualityArgumentCollection.SnvqModeEnum snvMode) {
+ if ( snvMode == AddFlowSNVQualityArgumentCollection.SnvqModeEnum.Legacy ) {
+ return sliceP;
+ } else if ( snvMode == AddFlowSNVQualityArgumentCollection.SnvqModeEnum.Optimistic ) {
+ return (p1 * p2);
+ } else if ( snvMode == AddFlowSNVQualityArgumentCollection.SnvqModeEnum.Pessimistic ) {
+ return (1 - (1 - p1) * (1 - p2));
+ } else if ( snvMode == AddFlowSNVQualityArgumentCollection.SnvqModeEnum.Geometric ) {
+ return Math.sqrt((p1 * p2) * (1 - (1 - p1) * (1 - p2)));
+ } else {
+ throw new GATKException("unknown snvqMode: " + snvMode);
+ }
+ }
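For concreteness, here is what the modes above evaluate to for one hypothetical pair of indel error probabilities (Legacy simply passes through the slice probability, so it is omitted). SnvqModesSketch and the chosen p1/p2 values are illustrative only.

    public class SnvqModesSketch {
        public static void main(String[] args) {
            double p1 = 0.01, p2 = 0.02;                               // hypothetical indel error probabilities
            double optimistic  = p1 * p2;                              // 2.0e-4 (errors always coincide)
            double pessimistic = 1 - (1 - p1) * (1 - p2);              // 0.0298 (errors never coincide)
            double geometric   = Math.sqrt(optimistic * pessimistic);  // ~2.44e-3 (geometric mean of the two)
            System.out.println(optimistic + " " + pessimistic + " " + geometric);
        }
    }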
+
+ // compute probability for a slice
+ private static double[] sliceProbs(final int[] slice, final int minIndex, final int[] key, final double[][] errorProbBands,
+ final int flow, final int sideFlow) {
+
+ double accumulatedP = 1.0;
+ double p1 = 0.0;
+ double p2 = 0.0;
+ int key_i = minIndex;
+ for ( int i = 0 ; i < slice.length ; i++, key_i++ ) {
+ final int hmer = key[key_i];
+ final int band;
+ if ( slice[i] == (hmer - 1) ) {
+ band = ERROR_PROB_BAND_1LESS;
+ } else if ( slice[i] == (hmer + 1) ) {
+ band = ERROR_PROB_BAND_1MORE;
+ } else if ( slice[i] == hmer ){
+ band = ERROR_PROB_BAND_KEY;
+ } else {
+ throw new GATKException("slice[i] and hmer are too far apart: " + slice[i] + " " + hmer);
+ }
+ final double p = errorProbBands[band][key_i];
+ accumulatedP *= p;
+
+ // collect p1/p2 (flow and sideFlow probs)
+ if ( key_i == flow ) {
+ p1 = p;
+ }
+ if ( key_i == sideFlow ) {
+ p2 = p;
+ }
+ }
+
+ return new double[] {accumulatedP, p1, p2};
+ }
+
+ static boolean sliceIsValidForConsideration(final int[] slice, final int flowOrderLength) {
+
+ // look for strings of consecutive zeros in length of flowOrderLength - 1
+ int consecutiveZeros = 0;
+ for ( int key : slice ) {
+ if ( key != 0 ) {
+ consecutiveZeros = 0;
+ } else {
+ consecutiveZeros++;
+ if ( consecutiveZeros >= (flowOrderLength - 1) ) {
+ return false;
+ }
+ }
+ }
+
+ // if here, not found -> valid
+ return true;
+ }
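In other words, a candidate slice is rejected as soon as it contains flowOrderLength - 1 or more consecutive zero flows. A standalone sketch of the same check follows, assuming a flow order length of 4; SliceValiditySketch and the sample slices are hypothetical.

    public class SliceValiditySketch {
        static boolean valid(int[] slice, int flowOrderLength) {
            int consecutiveZeros = 0;
            for (int key : slice) {
                if (key != 0) {
                    consecutiveZeros = 0;
                } else if (++consecutiveZeros >= flowOrderLength - 1) {
                    return false;   // too many consecutive zero flows
                }
            }
            return true;
        }

        public static void main(String[] args) {
            System.out.println(valid(new int[]{1, 0, 0, 2, 1}, 4)); // true  (only 2 consecutive zeros)
            System.out.println(valid(new int[]{1, 0, 0, 0, 2}, 4)); // false (3 consecutive zeros)
        }
    }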
+}
+
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/featuremapping/AddFlowSNVQualityArgumentCollection.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/featuremapping/AddFlowSNVQualityArgumentCollection.java
new file mode 100644
index 00000000000..4c20b14e59c
--- /dev/null
+++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/featuremapping/AddFlowSNVQualityArgumentCollection.java
@@ -0,0 +1,70 @@
+package org.broadinstitute.hellbender.tools.walkers.featuremapping;
+
+import org.broadinstitute.barclay.argparser.Advanced;
+import org.broadinstitute.barclay.argparser.Argument;
+import org.broadinstitute.barclay.argparser.Hidden;
+
+import java.io.Serializable;
+import java.util.List;
+
+/**
+ * Set of arguments for the {@link AddFlowSNVQuality}
+ */
+public class AddFlowSNVQualityArgumentCollection implements Serializable{
+ private static final long serialVersionUID = 1L;
+ public static final String MAX_PHRED_SCORE_FULL_NAME = "max-phred-score";
+ public static final String KEEP_SUPPLEMENTARY_ALIGNMENTS_FULL_NAME = "keep-supplementary-alignments";
+ public static final String INCLUDE_QC_FAILED_READ_FULL_NAME = "include-qc-failed-read";
+ public static final String SNVQ_MODE_FULL_NAME = "snvq-mode";
+ public static final String OUTPUT_QUALITY_ATTRIBUTE_FULL_NAME = "output-quality-attribute";
+ public static final String DEBUG_READ_NAME_FULL_NAME = "debug-read-name";
+ public static final String DEBUG_COLLECT_STATS_INTO_FULL_NAME = "debug-collect-stats-into";
+
+ public enum SnvqModeEnum {
+ Legacy,
+ Optimistic,
+ Pessimistic,
+ Geometric
+ };
+
+ /**
+     * maximum value for reported phred scores
+ **/
+ @Argument(fullName = MAX_PHRED_SCORE_FULL_NAME, doc = "Limit value for phred scores", optional = true)
+ public double maxPhredScore = Double.NaN;
+
+ /**
+ * keep supplementary alignments?
+ **/
+    @Argument(fullName = KEEP_SUPPLEMENTARY_ALIGNMENTS_FULL_NAME, doc = "keep supplementary alignments?", optional = true)
+ public boolean keepSupplementaryAlignments = true;
+
+ @Advanced
+ @Argument(fullName= INCLUDE_QC_FAILED_READ_FULL_NAME, doc = "include reads with QC failed flag", optional = true)
+ public boolean includeQcFailedReads = true;
+
+ /**
+ * snvq computation mode
+ */
+ @Argument(fullName = SNVQ_MODE_FULL_NAME, doc = "snvq calculation mode.", optional = true)
+ public SnvqModeEnum snvMode = SnvqModeEnum.Geometric;
+
+ /**
+ * By default this tool overwrites the QUAL field with the new qualities. Setting this argument saves the original qualities in the specified SAM tag.
+ */
+ @Argument(fullName = OUTPUT_QUALITY_ATTRIBUTE_FULL_NAME, doc = "alternate SAM tag to put original quality scores instead of overwriting the QUAL field. If not used, QUAL will be overwritten.", optional = true)
+ public String outputQualityAttribute = null;
+
+ /**
+ * debug read names?
+ **/
+ @Hidden
+ @Argument(fullName = DEBUG_READ_NAME_FULL_NAME, doc = "Read names of reads to output details of as part of debugging. ", optional = true)
+    public List<String> debugReadName = null;
+
+ @Advanced
+ @Hidden
+ @Argument(fullName= DEBUG_COLLECT_STATS_INTO_FULL_NAME, doc = "Statistics about the reads will be output to given filename.", optional = true)
+ public String debugCollectStatsInto = null;
+}
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/featuremapping/FlowFeatureMapper.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/featuremapping/FlowFeatureMapper.java
index 2014ce3030e..19783e1984f 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/featuremapping/FlowFeatureMapper.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/featuremapping/FlowFeatureMapper.java
@@ -337,7 +337,7 @@ public VCFHeader makeVCFHeader(final SAMSequenceDictionary sequenceDictionary, f
headerInfo.add(new VCFInfoHeaderLine(VCF_MAPQ, 1, VCFHeaderLineType.Integer, "Read mapqe"));
headerInfo.add(new VCFInfoHeaderLine(VCF_CIGAR, 1, VCFHeaderLineType.String, "Read CIGAR"));
headerInfo.add(new VCFInfoHeaderLine(VCF_READ_COUNT, 1, VCFHeaderLineType.Integer, "Number of reads containing this location"));
- headerInfo.add(new VCFInfoHeaderLine(VCF_FILTERED_COUNT, 1, VCFHeaderLineType.Integer, "Number of reads containing this location that agree with reference according to fitler"));
+ headerInfo.add(new VCFInfoHeaderLine(VCF_FILTERED_COUNT, 1, VCFHeaderLineType.Integer, "Number of reads containing this location that pass the adjacent base filter"));
headerInfo.add(new VCFInfoHeaderLine(VCF_FC1, 1, VCFHeaderLineType.Integer, "Number of M bases different on read from references"));
headerInfo.add(new VCFInfoHeaderLine(VCF_FC2, 1, VCFHeaderLineType.Integer, "Number of features before score threshold filter"));
headerInfo.add(new VCFInfoHeaderLine(VCF_LENGTH, 1, VCFHeaderLineType.Integer, "Read length"));
@@ -475,7 +475,8 @@ private void enrichFeature(final MappedFeature fr) {
for ( ReadContext rc : readQueue ) {
if ( rc.read.contains(loc) ) {
fr.readCount++;
- if ( mapper.noFeatureButFilterAt(rc.read, rc.referenceContext, fr.start) == FeatureMapper.FilterStatus.Filtered ) {
+ FeatureMapper.FilterStatus fs = mapper.noFeatureButFilterAt(rc.read, rc.referenceContext, fr.start);
+ if ( (fs == FeatureMapper.FilterStatus.Filtered) || (fs == FeatureMapper.FilterStatus.NoFeatureAndFiltered) ) {
fr.filteredCount++;
}
}
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/featuremapping/SNVMapper.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/featuremapping/SNVMapper.java
index 3f23cb5a6e6..e67f5ee570e 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/featuremapping/SNVMapper.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/featuremapping/SNVMapper.java
@@ -277,40 +277,37 @@ public FilterStatus noFeatureButFilterAt(GATKRead read, ReferenceContext referen
readOfs += delta;
refOfs += delta;
- if ( bases[readOfs] == ref[refOfs] ) {
-
- // check that this is really a SNV (must be surrounded by identical ref)
- boolean surrounded = true;
- for ( int i = 0 ; i < surroundBefore && surrounded ; i++ ) {
- final int bIndex = readOfs-1-i;
- final int rIndex = refOfs-1-i;
- if ( bIndex < 0 || bIndex >= bases.length || rIndex < 0 || rIndex >= ref.length ) {
- surrounded = false;
- continue;
- }
- if ( bases[bIndex] != ref[rIndex] ) {
- surrounded = false;
- }
+ final boolean noFeature = bases[readOfs] == ref[refOfs];
+
+ // check that this is really a SNV (must be surrounded by identical ref)
+ boolean surrounded = true;
+ for ( int i = 0 ; i < surroundBefore && surrounded ; i++ ) {
+ final int bIndex = readOfs-1-i;
+ final int rIndex = refOfs-1-i;
+ if ( bIndex < 0 || bIndex >= bases.length || rIndex < 0 || rIndex >= ref.length ) {
+ surrounded = false;
+ continue;
}
- for (int i = 0; i < surroundAfter && surrounded ; i++ ) {
- final int bIndex = readOfs+1+i;
- final int rIndex = refOfs+1+i;
- if ( bIndex < 0 || bIndex >= bases.length || rIndex < 0 || rIndex >= ref.length ) {
- surrounded = false;
- continue;
- }
- if ( bases[bIndex] != ref[rIndex] ) {
- surrounded = false;
- }
+ if ( bases[bIndex] != ref[rIndex] ) {
+ surrounded = false;
}
- if ( !surrounded ) {
+ }
+ for (int i = 0; i < surroundAfter && surrounded ; i++ ) {
+ final int bIndex = readOfs+1+i;
+ final int rIndex = refOfs+1+i;
+ if ( bIndex < 0 || bIndex >= bases.length || rIndex < 0 || rIndex >= ref.length ) {
+ surrounded = false;
continue;
}
+ if ( bases[bIndex] != ref[rIndex] ) {
+ surrounded = false;
+ }
+ }
+ if ( !surrounded ) {
+ continue;
+ }
- // this is it! no feature but filtered in
- return FilterStatus.NoFeatureAndFiltered;
- } else
- return FilterStatus.Filtered;
+ return noFeature ? FilterStatus.NoFeatureAndFiltered : FilterStatus.Filtered;
} else {
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/filters/VariantFiltration.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/filters/VariantFiltration.java
index dd6a2e93838..a2642f20150 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/filters/VariantFiltration.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/filters/VariantFiltration.java
@@ -114,6 +114,7 @@ public final class VariantFiltration extends VariantWalker {
public static final String CLUSTER_WINDOW_SIZE_LONG_NAME = "cluster-window-size";
public static final String MASK_EXTENSION_LONG_NAME = "mask-extension";
public static final String MASK_NAME_LONG_NAME = "mask-name";
+ public static final String MASK_DESCRIPTION_LONG_NAME = "mask-description";
public static final String FILTER_NOT_IN_MASK_LONG_NAME = "filter-not-in-mask";
public static final String MISSING_VAL_LONG_NAME = "missing-values-evaluate-as-failing";
public static final String INVERT_LONG_NAME = "invert-filter-expression";
@@ -238,6 +239,14 @@ public final class VariantFiltration extends VariantWalker {
@Argument(fullName=ALLELE_SPECIFIC_LONG_NAME, optional=true, doc="Set mask at the allele level. This option is not compatible with clustering.")
public boolean applyForAllele = false;
+ /**
+ * If a mask interval list is provided, then set the description of the filter in the VCF header to this String.
+ * Note that if spaces are needed, then the entire description should be enclosed in quotes. Also note that if
+ * --filter-not-in-mask is used, the description should be adapted to reflect the reverse logic.
+ */
+ @Argument(fullName=MASK_DESCRIPTION_LONG_NAME, optional=true, doc="Description to add to the FILTER field in VCF header for the mask filter.")
+ public String maskDescription;
+
// JEXL expressions for the filters
private List filterExps;
private List genotypeFilterExps;
@@ -305,7 +314,9 @@ private void initializeVcfWriter() {
}
if ( mask != null ) {
- if (filterRecordsNotInMask) {
+ if (maskDescription != null) {
+ hInfo.add(new VCFFilterHeaderLine(maskName, maskDescription));
+ } else if (filterRecordsNotInMask) {
hInfo.add(new VCFFilterHeaderLine(maskName, "Doesn't overlap a user-input mask"));
} else {
hInfo.add(new VCFFilterHeaderLine(maskName, "Overlaps a user-input mask"));
@@ -331,6 +342,9 @@ public void onTraversalStart() {
if (filterRecordsNotInMask && mask == null) {
throw new CommandLineException.BadArgumentValue(FILTER_NOT_IN_MASK_LONG_NAME, "argument not allowed if mask argument is not provided");
}
+ if (maskDescription != null && mask == null) {
+ throw new CommandLineException.BadArgumentValue(MASK_DESCRIPTION_LONG_NAME, "argument not allowed if mask argument is not provided");
+ }
filterExps = VariantContextUtils.initializeMatchExps(filterNames, filterExpressions);
genotypeFilterExps = VariantContextUtils.initializeMatchExps(genotypeFilterNames, genotypeFilterExpressions);
howToTreatMissingValues = failMissingValues ? JexlMissingValueTreatment.TREAT_AS_MATCH : JexlMissingValueTreatment.TREAT_AS_MISMATCH;
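With the new --mask-description argument, the FILTER header line for the mask uses the supplied description instead of the built-in text. Below is a rough sketch of the header line it produces, using htsjdk's VCFFilterHeaderLine; the mask name and description shown are hypothetical.

    import htsjdk.variant.vcf.VCFFilterHeaderLine;

    // Sketch of the FILTER header line written when a custom mask description is supplied.
    public class MaskDescriptionSketch {
        public static void main(String[] args) {
            VCFFilterHeaderLine line = new VCFFilterHeaderLine("RepeatMask", "Overlaps the provided repeat-region mask");
            // prints roughly: FILTER=<ID=RepeatMask,Description="Overlaps the provided repeat-region mask">
            System.out.println(line);
        }
    }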
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/genotyper/AlleleSubsettingUtils.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/genotyper/AlleleSubsettingUtils.java
index 86ef7018f2d..aede6c33fc2 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/genotyper/AlleleSubsettingUtils.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/genotyper/AlleleSubsettingUtils.java
@@ -139,7 +139,7 @@ public static GenotypesContext subsetAlleles(final GenotypesContext originalGs,
}
// restrict AD to the new allele subset
- if(g.hasAD()) {
+ if(g.hasAD() && gb.makeWithShallowCopy().hasAD()) {
final int[] newAD = getNewAlleleBasedReadCountAnnotation(allelesToKeep, allelePermutation, g.getAD());
gb.AD(newAD);
// if we have recalculated AD and the original genotype had AF but was then removed, then recalculate AF based on AD counts
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/genotyper/GenotypingEngine.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/genotyper/GenotypingEngine.java
index 1846fbe806d..ad40e18251f 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/genotyper/GenotypingEngine.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/genotyper/GenotypingEngine.java
@@ -395,7 +395,11 @@ boolean isVcCoveredByDeletion(final VariantContext vc) {
*/
protected final boolean cannotBeGenotyped(final VariantContext vc) {
if (vc.getNAlleles() <= GenotypeLikelihoods.MAX_DIPLOID_ALT_ALLELES_THAT_CAN_BE_GENOTYPED
- && vc.getGenotypes().stream().anyMatch(Genotype::hasLikelihoods)) { //likelihoods may be missing when reading from GenomicsDB if there are more alts that GDB args allow
+            // likelihoods may be missing when reading from GenomicsDB if there are more alts than GDB args allow
+ // so ensure all genotypes (outside of 0/0 and ./.) have likelihoods
+ && vc.getGenotypes().stream().filter( g -> !(g.isNoCall() || g.isHomRef()) ).allMatch(Genotype::hasLikelihoods)
+            // if all genotypes are no-calls or hom-refs, only keep the site if at least one sample has likelihoods
+ && vc.getGenotypes().stream().anyMatch(Genotype::hasLikelihoods)) {
return false;
}
// protect against too many alternate alleles that we can't even run AF on:
@@ -403,8 +407,8 @@ protected final boolean cannotBeGenotyped(final VariantContext vc) {
logger.warn("Attempting to genotype more than " + GenotypeLikelihoods.MAX_DIPLOID_ALT_ALLELES_THAT_CAN_BE_GENOTYPED +
" alleles. Site will be skipped at location " + vc.getContig() + ":" + vc.getStart());
return true;
- }else {
- logger.warn("No genotype contained sufficient data to recalculate site and allele qualities. Site will be skipped at location "
+ } else {
+ logger.warn("Some genotypes contained insufficient data to recalculate site and allele qualities. Site will be skipped at location "
+ vc.getContig() + ":" + vc.getStart());
return true;
}
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/gnarlyGenotyper/GnarlyGenotyper.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/gnarlyGenotyper/GnarlyGenotyper.java
index b6f192cb735..850c529d1a9 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/gnarlyGenotyper/GnarlyGenotyper.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/gnarlyGenotyper/GnarlyGenotyper.java
@@ -38,6 +38,7 @@
import org.broadinstitute.hellbender.utils.variant.GATKVCFHeaderLines;
import org.broadinstitute.hellbender.utils.variant.GATKVariantContextUtils;
import org.broadinstitute.hellbender.utils.variant.writers.GVCFWriter;
+import org.broadinstitute.hellbender.utils.variant.writers.IntervalFilteringVcfWriter;
import org.reflections.Reflections;
import java.util.*;
@@ -111,10 +112,12 @@ public final class GnarlyGenotyper extends VariantWalker {
/**
* This option can only be activated if intervals are specified.
*/
+ @DeprecatedFeature
@Advanced
- @Argument(fullName = GenotypeGVCFs.ONLY_OUTPUT_CALLS_STARTING_IN_INTERVALS_FULL_NAME,
- doc="Restrict variant output to sites that start within provided intervals",
- optional=true)
+ @Argument(fullName= GenotypeGVCFs.ONLY_OUTPUT_CALLS_STARTING_IN_INTERVALS_FULL_NAME,
+ doc="Restrict variant output to sites that start within provided intervals, equivalent to '--"+StandardArgumentDefinitions.VARIANT_OUTPUT_INTERVAL_FILTERING_MODE_LONG_NAME+" STARTS_IN'",
+ optional=true,
+ mutex = {StandardArgumentDefinitions.VARIANT_OUTPUT_INTERVAL_FILTERING_MODE_LONG_NAME})
private boolean onlyOutputCallsStartingInIntervals = false;
@Argument(fullName = GenomicsDBImport.MERGE_INPUT_INTERVALS_LONG_NAME,
@@ -145,9 +148,6 @@ public final class GnarlyGenotyper extends VariantWalker {
private final RMSMappingQuality mqCalculator = RMSMappingQuality.getInstance();
private final Set> allAlleleSpecificAnnotations = new HashSet<>();
- /** these are used when {@link #onlyOutputCallsStartingInIntervals) is true */
- private List intervals;
-
@Override
public boolean requiresReference() {
return true;
@@ -180,15 +180,13 @@ protected GenomicsDBOptions getGenomicsDBOptions() {
@Override
public void onTraversalStart() {
- final VCFHeader inputVCFHeader = getHeaderForVariants();
- if(onlyOutputCallsStartingInIntervals) {
- if( !intervalArgumentCollection.intervalsSpecified()) {
- throw new CommandLineException.MissingArgument("-L or -XL", "Intervals are required if --" + GenotypeGVCFs.ONLY_OUTPUT_CALLS_STARTING_IN_INTERVALS_FULL_NAME + " was specified.");
- }
+ if (onlyOutputCallsStartingInIntervals) {
+ logger.warn("The --" + GenotypeGVCFs.ONLY_OUTPUT_CALLS_STARTING_IN_INTERVALS_FULL_NAME + " option is deprecated. Please use '--" + StandardArgumentDefinitions.VARIANT_OUTPUT_INTERVAL_FILTERING_MODE_LONG_NAME + " STARTS_IN' for an equivalent filtering.");
+ this.userOutputVariantIntervalFilteringMode = IntervalFilteringVcfWriter.Mode.STARTS_IN;
}
- intervals = intervalArgumentCollection.intervalsSpecified() ? intervalArgumentCollection.getIntervals(getBestAvailableSequenceDictionary()) :
- Collections.emptyList();
+
+ final VCFHeader inputVCFHeader = getHeaderForVariants();
final SampleList samples = new IndexedSampleList(inputVCFHeader.getGenotypeSamples());
@@ -260,11 +258,10 @@ private void setupVCFWriter(VCFHeader inputVCFHeader, SampleList samples) {
@SuppressWarnings({"unchecked", "rawtypes"})
@Override
public void apply(VariantContext variant, ReadsContext reads, ReferenceContext ref, FeatureContext features) {
- SimpleInterval variantStart = new SimpleInterval(variant.getContig(), variant.getStart(), variant.getStart());
//return early if there's no non-symbolic ALT since GDB already did the merging
if ( !variant.isVariant() || !GATKVariantContextUtils.isProperlyPolymorphic(variant)
- || variant.getAttributeAsInt(VCFConstants.DEPTH_KEY,0) == 0
- || (onlyOutputCallsStartingInIntervals && !intervals.stream().anyMatch(interval -> interval.contains(variantStart)))) {
+ || variant.getAttributeAsInt(VCFConstants.DEPTH_KEY,0) == 0 )
+ {
if (keepAllSites) {
VariantContextBuilder builder = new VariantContextBuilder(mqCalculator.finalizeRawMQ(variant)); //don't fill in QUAL here because there's no alt data
builder.filter(GATKVCFConstants.LOW_QUAL_FILTER_NAME);
@@ -291,7 +288,7 @@ public void apply(VariantContext variant, ReadsContext reads, ReferenceContext r
finalizedVC = genotyperEngine.finalizeGenotype(variant);
}
//could return null if the variant didn't pass the genotyping arg calling/emission threshold
- if (finalizedVC != null && (!onlyOutputCallsStartingInIntervals || intervals.stream().anyMatch(interval -> interval.contains(variantStart)))) {
+ if (finalizedVC != null) {
vcfWriter.add(finalizedVC);
}
}
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/gnarlyGenotyper/GnarlyGenotyperEngine.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/gnarlyGenotyper/GnarlyGenotyperEngine.java
index 1874238913c..9b6c5030497 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/gnarlyGenotyper/GnarlyGenotyperEngine.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/gnarlyGenotyper/GnarlyGenotyperEngine.java
@@ -393,14 +393,8 @@ protected GenotypesContext iterateOnGenotypes(final VariantContext vc, final Lis
protected void makeGenotypeCall(final Genotype g, final GenotypeBuilder gb,
final double[] genotypeLikelihoods,
final List allelesToUse) {
- if ( genotypeLikelihoods == null || !GATKVariantContextUtils.isInformative(genotypeLikelihoods) ) {
- //gb.alleles(GATKVariantContextUtils.noCallAlleles(g.getAlleles().size())).noGQ();
- GATKVariantContextUtils.makeGenotypeCall(g.getPloidy(), gb, GenotypeAssignmentMethod.SET_TO_NO_CALL,
- genotypeLikelihoods, allelesToUse, null);
- } else {
- GATKVariantContextUtils.makeGenotypeCall(g.getPloidy(), gb, GenotypeAssignmentMethod.USE_PLS_TO_ASSIGN,
- genotypeLikelihoods, allelesToUse, null);
- }
+ GATKVariantContextUtils.makeGenotypeCall(g.getPloidy(), gb, GenotypeAssignmentMethod.USE_PLS_TO_ASSIGN,
+ genotypeLikelihoods, allelesToUse, g, null);
}
/**
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/groundtruth/AddFlowBaseQuality.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/groundtruth/AddFlowBaseQuality.java
index a0f804ffc11..bd8cf62e851 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/groundtruth/AddFlowBaseQuality.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/groundtruth/AddFlowBaseQuality.java
@@ -239,6 +239,25 @@ private static double[][] extractErrorProbBands(final FlowBasedRead flowRead, fi
return result;
}
+ /**
+ * The following functions estimate the error probability for an hmer. Specifically, two error
+ * probability values are generated: one for the first base of the hmer and another for the
+ * rest of its bases.
+ *
+ * The computation itself is performed in a subsequent function: generateSidedHmerBaseErrorProbability
+ * It iterates over the possible valid combinations of errors and sums them up.
+ *
+ * @param key - key (hmer length) in flow space
+ * @param errorProbBands - for each flow (position in the key) three error probabilities are provided:
+ * [0] - for the hmer being one base shorter
+ * [1] - for the hmer to be at its length
+ * [2] - for the hmer to be one base longer
+ * @param flow - the flow (index) for which to generate the probabilities (0 <= flow < key.length)
+ * @param flowOrderLength - the cycle length of the flow order (usually 4)
+ * @return an array of two probabilities:
+ * [0] - probability for the first base of the hmer
+ * [1] - probability for the rest of the bases of the hmer
+ */
@VisibleForTesting
protected static double[] generateHmerBaseErrorProbabilities(final int[] key, final double[][] errorProbBands, final int flow, final int flowOrderLength) {
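As a hedged illustration (names and usage are hypothetical, not from this tool): the pair returned above would typically be spread over an hmer so that value [0] applies to its first base and value [1] to every remaining base.

    import java.util.Arrays;

    final class HmerProbSketch {
        // spread {firstBaseProb, restProb} over an hmer of the given length
        static double[] perBaseErrorProbs(final int hmerLength, final double[] hmerProbs) {
            final double[] out = new double[hmerLength];
            out[0] = hmerProbs[0];                          // first base of the hmer
            Arrays.fill(out, 1, hmerLength, hmerProbs[1]);  // rest of the bases
            return out;
        }
    }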
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/groundtruth/GroundTruthReadsBuilder.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/groundtruth/GroundTruthReadsBuilder.java
index 7b32156c40a..74434c89cbe 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/groundtruth/GroundTruthReadsBuilder.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/groundtruth/GroundTruthReadsBuilder.java
@@ -858,9 +858,9 @@ else if ( hasQ || hasZ ) {
cols.put("ReadName", read.getName());
// haplotypes and reference scores
- cols.put("PaternalHaplotypeScore", paternal.score);
- cols.put("MaternalHaplotypeScore", maternal.score);
- cols.put("RefHaplotypeScore", refScore);
+ cols.put("PaternalHaplotypeScore", String.format("%.6f", paternal.score));
+ cols.put("MaternalHaplotypeScore", String.format("%.6f", maternal.score));
+ cols.put("RefHaplotypeScore", String.format("%.6f", refScore));
// build haplotype keys
final FlowBasedReadUtils.ReadGroupInfo rgInfo = FlowBasedReadUtils.getReadGroupInfo(getHeaderForReads(), read);
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/groundtruth/GroundTruthScorer.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/groundtruth/GroundTruthScorer.java
index f3191fbaadb..3cee7b801bb 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/groundtruth/GroundTruthScorer.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/groundtruth/GroundTruthScorer.java
@@ -157,6 +157,7 @@ public class GroundTruthScorer extends ReadWalker {
private static final int BASE_VALUE_MAX = FlowBasedRead.DEFAULT_FLOW_ORDER.length() - 1;
private static final double NORMALIZED_SCORE_THRESHOLD_DEFAULT = -0.1;
+ private static final double DEFAULT_RATIO_THRESHOLD = 0.003;
/*
Private accumulator class for counting false/true observations (hence Boolean).
@@ -502,7 +503,7 @@ public void closeTool() {
// write reports
if ( reportFilePath != null ) {
final GATKReport report = new GATKReport(
- BooleanAccumulator.newReportTable(qualReport, "qual", fbargs.probabilityRatioThreshold, omitZerosFromReport),
+ BooleanAccumulator.newReportTable(qualReport, "qual", DEFAULT_RATIO_THRESHOLD, omitZerosFromReport),
BooleanAccumulator.newReportTable(qualReport, "qual", "hmer", omitZerosFromReport),
BooleanAccumulator.newReportTable(qualReport, "qual", "hmer", "deviation", "base", omitZerosFromReport),
PercentileReport.newReportTable(percentileReports, qualityPercentiles)
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/groundtruth/SeriesStats.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/groundtruth/SeriesStats.java
index 151e0ae4867..2ba594b7058 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/groundtruth/SeriesStats.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/groundtruth/SeriesStats.java
@@ -1,7 +1,11 @@
package org.broadinstitute.hellbender.tools.walkers.groundtruth;
-import org.apache.commons.collections.map.LazySortedMap;
+import org.apache.logging.log4j.LogManager;
+import org.apache.logging.log4j.Logger;
+import java.io.IOException;
+import java.io.PrintWriter;
+import java.util.LinkedHashMap;
import java.util.Map;
import java.util.SortedMap;
import java.util.TreeMap;
@@ -9,6 +13,8 @@
public class SeriesStats {
+ private static final Logger logger = LogManager.getLogger(SeriesStats.class);
+
// local state
private double last = Double.NaN;
private int count = 0;
@@ -16,8 +22,30 @@ public class SeriesStats {
private double min = Double.NaN;
private double max = Double.NaN;
private SortedMap bins = new TreeMap<>();
+ private int intCount = 0;
+ private Map auxBins = new LinkedHashMap<>();
+
+ public void csvWrite(final String path) throws IOException {
+ logger.info("Writing SeriesStats " + toDigest() + " into " + path);
+ PrintWriter pw = new PrintWriter(path);
+ pw.println("value,count");
+ boolean intKeys = isIntKeys();
+ for (Map.Entry entry : bins.entrySet() ) {
+ if ( intKeys ) {
+ pw.println(String.format("%d,%d", entry.getKey().intValue(), entry.getValue().get()));
+ } else {
+ pw.println(String.format("%f,%d", entry.getKey(), entry.getValue().get()));
+ }
+ }
+ pw.close();
+ }
- void add(double v) {
+ public void add(int v) {
+ add((double)v);
+ intCount++;
+ }
+
+ public void add(double v) {
// save in simple values
last = v;
@@ -31,10 +59,11 @@ void add(double v) {
count++;
// save in bins
- if ( bins.containsKey(v) ) {
- bins.get(v).incrementAndGet();
+ final Double key = v;
+ if ( bins.containsKey(key) ) {
+ bins.get(key).incrementAndGet();
} else {
- bins.put(v, new AtomicInteger(1));
+ bins.put(key, new AtomicInteger(1));
}
}
@@ -109,4 +138,23 @@ public double getStd() {
return Math.sqrt(variance);
}
+ public Map getBins() {
+ return this.bins;
+ }
+
+ public Map getAuxBins() {
+ return this.auxBins;
+ }
+
+ public String toDigest() {
+ if ( isIntKeys() ) {
+ return String.format("count=%d, min=%d, max=%d, median=%d, bin.count=%d", getCount(), (int)getMin(), (int)getMax(), (int)getMedian(), getBins().size());
+ } else {
+ return String.format("count=%d, min=%f, max=%f, median=%f, bin.count=%d", getCount(), getMin(), getMax(), getMedian(), getBins().size());
+ }
+ }
+
+ private boolean isIntKeys() {
+ return (count == intCount);
+ }
}
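A brief usage sketch for the new public surface of SeriesStats (the output path is hypothetical; assumes the caller lives in the same package):

    import java.io.IOException;

    final class SeriesStatsUsageSketch {
        static void demo() throws IOException {
            final SeriesStats stats = new SeriesStats();
            for (final int v : new int[]{1, 2, 2, 5}) {
                stats.add(v);                         // int adds keep the digest in integer form
            }
            System.out.println(stats.toDigest());     // e.g. count=4, min=1, max=5, median=2, bin.count=3
            stats.csvWrite("/tmp/series-stats.csv");  // one "value,count" row per bin
        }
    }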
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/AlleleFiltering.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/AlleleFiltering.java
index ebc0f61d3f6..3a350228128 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/AlleleFiltering.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/AlleleFiltering.java
@@ -29,6 +29,8 @@
import java.util.stream.Collectors;
import java.util.stream.IntStream;
+import com.google.common.annotations.VisibleForTesting;
+
/**
* Filtering haplotypes that contribute weak alleles to the genotyping.
*
@@ -278,7 +280,8 @@ private AlleleLikelihoods subsetHaplotypesByAlleles(final A
* @param sorThreshold only variants with SOR above threshold will be considered
* @return list of alleles that can be removed
*/
- private List identifyBadAlleles(final List collectedRPLs, final List collectedSORs,
+ @VisibleForTesting
+ List identifyBadAlleles(final List collectedRPLs, final List collectedSORs,
final List alleles,
final double qualThreshold,
final double sorThreshold) {
@@ -303,9 +306,11 @@ private List identifyBadAlleles(final List collectedRPLs, final
//we then add alleles with high SOR. Note that amongh all allleles with the SOR higher than the SOR_THRESHOLD
//we will first filter the one with the lowest QUAL.
logger.debug(() -> String.format("SHA:: Have %d candidates with low QUAL", rplCount));
- for (int i = sorIndices.length-1 ; (i >= 0) && (collectedSORs.get(sorIndices[i])>SOR_THRESHOLD) ; i--) {
- if (!result.contains(alleles.get(sorIndices[i]))) {
- result.add(alleles.get(sorIndices[i]));
+ for (int i = sorIndices.length-1 ; (i >= 0) ; i--) {
+ if (collectedSORs.get(sorIndices[i])>SOR_THRESHOLD){
+ if (!result.contains(alleles.get(sorIndices[i]))) {
+ result.add(alleles.get(sorIndices[i]));
+ }
}
}
logger.debug(() -> String.format("SHA:: Have %d candidates with high SOR", result.size() - rplCount));
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/AlleleFilteringHC.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/AlleleFilteringHC.java
index 2788cc21575..010a9e21398 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/AlleleFilteringHC.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/AlleleFilteringHC.java
@@ -11,7 +11,10 @@
import org.broadinstitute.hellbender.utils.read.GATKRead;
import java.io.OutputStreamWriter;
+import java.util.ArrayList;
import java.util.Arrays;
+import java.util.Collections;
+import java.util.List;
/**
* Filtering haplotypes that contribute weak alleles to the genotyping. This is a version that determines if allele is weak using
@@ -57,14 +60,16 @@ int getAlleleLikelihoodVsInverse(final AlleleLikelihoods allel
final GenotypingLikelihoods genotypingLikelihoods = genotypesModel.calculateLikelihoods(alleleList,
genotypingData, null, 0, null);
- AFCalculationResult af = afCalc.fastCalculateDiploidBasedOnGLs(genotypingLikelihoods, genotypingEngine.getPloidyModel().totalPloidy());
- final double log10Confidence = af.log10ProbOnlyRefAlleleExists();
- final double phredScaledConfidence = (10.0 * log10Confidence) + 0.0;
- final int[] asPL = genotypingLikelihoods.sampleLikelihoods(0).getAsPLs();
-
- logger.debug(() -> String.format("GAL:: %s: %d %d %d", allele.toString(), asPL[0], asPL[1], asPL[2]));
- return Math.min(asPL[1]-asPL[0], asPL[2]-asPL[0]);
+ List perSamplePLs = new ArrayList<>();
+ for (int i = 0; i < genotypingLikelihoods.numberOfSamples(); i++) {
+ final int[] pls = genotypingLikelihoods.sampleLikelihoods(i).getAsPLs();
+ perSamplePLs.add(Math.min(pls[1] - pls[0], pls[2] - pls[0]));
+ final int finalI = i;
+ logger.debug(() -> String.format("GAL (%s):: %s: %d %d %d",
+ genotypingLikelihoods.getSample(finalI), allele.toString(), pls[0], pls[1], pls[2]));
+ }
+ return Collections.min(perSamplePLs);
}
}
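A worked toy example of the new per-sample aggregation (PL values are hypothetical):

    import java.util.ArrayList;
    import java.util.Collections;
    import java.util.List;

    final class PerSamplePlSketch {
        static int scoreAcrossSamples(final List<int[]> perSamplePls) {
            final List<Integer> scores = new ArrayList<>();
            for (final int[] pls : perSamplePls) {
                scores.add(Math.min(pls[1] - pls[0], pls[2] - pls[0]));  // per-sample score, as above
            }
            return Collections.min(scores);  // aggregate across samples, matching the change above
        }

        public static void main(String[] args) {
            // sample 1: PL = {0, 40, 60} -> 40; sample 2: PL = {0, 10, 25} -> 10; overall -> 10
            System.out.println(scoreAcrossSamples(List.of(new int[]{0, 40, 60}, new int[]{0, 10, 25})));
        }
    }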
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/FlowBasedAlignmentLikelihoodEngine.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/FlowBasedAlignmentLikelihoodEngine.java
index 1d7eecc0fe8..fa172768608 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/FlowBasedAlignmentLikelihoodEngine.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/FlowBasedAlignmentLikelihoodEngine.java
@@ -133,12 +133,12 @@ public AlleleLikelihoods computeReadLikelihoods(final List<
@Override
public ToDoubleFunction log10MinTrueLikelihood(final double expectedErrorRate, final boolean capLikelihoods) {
final double log10ErrorRate = Math.log10(expectedErrorRate);
- final double catastrophicErrorRate = fbargs.fillingValue;
- final double log10catastrophicErrorRate = Math.log10(fbargs.fillingValue);
+ final double largeEventErrorRate = Math.max(fbargs.fillingValue, 0.000001); // error rate for non-hmer/snv errors that are not seq. errors.
+ final double log10catastrophicErrorRate = Math.log10(largeEventErrorRate);
return read -> {
final double maxErrorsForRead = capLikelihoods ? Math.max(MAX_ERRORS_FOR_READ_CAP, Math.ceil(read.getLength() * expectedErrorRate)) : Math.ceil(read.getLength() * expectedErrorRate);
- final double maxCatastrophicErrorsForRead = capLikelihoods ? Math.max(MAX_CATASTROPHIC_ERRORS_FOR_READ_CAP, Math.ceil(read.getLength() * fbargs.fillingValue)) :
- Math.ceil(read.getLength() * fbargs.fillingValue);
+ final double maxCatastrophicErrorsForRead = capLikelihoods ? Math.max(MAX_CATASTROPHIC_ERRORS_FOR_READ_CAP, Math.ceil(read.getLength() * largeEventErrorRate)) :
+ Math.ceil(read.getLength() * largeEventErrorRate);
return maxErrorsForRead * log10ErrorRate + maxCatastrophicErrorsForRead * log10catastrophicErrorRate;
};
}
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/FlowBasedHMMEngine.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/FlowBasedHMMEngine.java
index 700659297c9..27c5f0eb875 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/FlowBasedHMMEngine.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/FlowBasedHMMEngine.java
@@ -132,12 +132,13 @@ public AlleleLikelihoods computeReadLikelihoods(final List<
@Override
public ToDoubleFunction log10MinTrueLikelihood(final double expectedErrorRate, final boolean capLikelihoods) {
final double log10ErrorRate = Math.log10(expectedErrorRate);
- final double catastrophicErrorRate = Math.log10(fbargs.fillingValue);
+ final double largeEventErrorRate = 0.001; // error rate for non-hmer/snv errors that are not seq. errors.
+ final double log10catastrophicErrorRate = Math.log10(largeEventErrorRate);
return read -> {
final double maxErrorsForRead = Math.max(3.0, Math.ceil(read.getLength() * expectedErrorRate));
- final double maxCatastrophicErrorsForRead = Math.max(2.0, Math.ceil(read.getLength() * fbargs.fillingValue));
- return maxErrorsForRead * log10ErrorRate + maxCatastrophicErrorsForRead*catastrophicErrorRate;
+ final double maxCatastrophicErrorsForRead = Math.max(2.0, Math.ceil(read.getLength() * largeEventErrorRate));
+ return maxErrorsForRead * log10ErrorRate + maxCatastrophicErrorsForRead*log10catastrophicErrorRate;
};
}
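A worked arithmetic example of the likelihood floor computed above (read length and rates are hypothetical):

    final class LikelihoodFloorSketch {
        public static void main(String[] args) {
            final int readLength = 100;
            final double expectedErrorRate = 0.01;
            final double largeEventErrorRate = 0.001;

            final double maxErrors = Math.max(3.0, Math.ceil(readLength * expectedErrorRate));          // 3.0
            final double maxCatastrophic = Math.max(2.0, Math.ceil(readLength * largeEventErrorRate));  // 2.0
            final double floor = maxErrors * Math.log10(expectedErrorRate)
                    + maxCatastrophic * Math.log10(largeEventErrorRate);
            System.out.println(floor);  // 3*(-2) + 2*(-3) = -12.0
        }
    }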
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/mutect/FeaturizedReadSets.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/mutect/FeaturizedReadSets.java
index 6a4d3f5c06a..c166b061784 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/mutect/FeaturizedReadSets.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/mutect/FeaturizedReadSets.java
@@ -11,6 +11,8 @@
import org.broadinstitute.hellbender.utils.Utils;
import org.broadinstitute.hellbender.utils.genotyper.AlleleLikelihoods;
import org.broadinstitute.hellbender.utils.haplotype.Haplotype;
+import org.broadinstitute.hellbender.utils.locusiterator.AlignmentStateMachine;
+import org.broadinstitute.hellbender.utils.pileup.PileupElement;
import org.broadinstitute.hellbender.utils.read.AlignmentUtils;
import org.broadinstitute.hellbender.utils.read.Fragment;
import org.broadinstitute.hellbender.utils.read.GATKRead;
@@ -20,6 +22,7 @@
import java.util.*;
import java.util.stream.Collectors;
+import java.util.stream.IntStream;
/**
* For each sample and for each allele a list feature vectors of supporting reads
@@ -33,6 +36,11 @@ public class FeaturizedReadSets {
public static final int DEFAULT_BASE_QUALITY = 25;
private static final SmithWatermanAligner aligner = SmithWatermanAligner.getAligner(SmithWatermanAligner.Implementation.JAVA);
+ private static final int FEATURES_PER_RANGE = 5;
+ private static final List RANGES = List.of(5, 10, 25, 50);
+ public static final int NUM_RANGED_FEATURES = FEATURES_PER_RANGE * RANGES.size();
+ private static final int VERY_BAD_QUAL_THRESHOLD = 10;
+ private static final int BAD_QUAL_THRESHOLD = 20;
private FeaturizedReadSets() { }
@@ -42,8 +50,9 @@ public static List>> getReadVectors(final VariantContext vc,
final AlleleLikelihoods haplotypeLikelihoods,
final int refDownsample,
final int altDownsample,
- final M2ArgumentCollection.Mutect3DatasetMode mutect3DatasetMode) {
- return getReadVectors(vc, samples, likelihoods, haplotypeLikelihoods, refDownsample, altDownsample, Collections.emptyMap(), mutect3DatasetMode);
+ final M2ArgumentCollection.Mutect3DatasetMode mutect3DatasetMode,
+ final Map readGroupIndices) {
+ return getReadVectors(vc, samples, likelihoods, haplotypeLikelihoods, refDownsample, altDownsample, Collections.emptyMap(), mutect3DatasetMode, readGroupIndices);
}
// returns Lists (in allele order) of lists of read vectors supporting each allele
@@ -54,7 +63,8 @@ public static List>> getReadVectors(final VariantContext vc,
final int refDownsample,
final int altDownsample,
final Map altDownsampleMap,
- final M2ArgumentCollection.Mutect3DatasetMode mutect3DatasetMode) {
+ final M2ArgumentCollection.Mutect3DatasetMode mutect3DatasetMode,
+ final Map readGroupIndices) {
final Map> readsByAllele = likelihoods.alleles().stream()
.collect(Collectors.toMap(a -> a, a -> new ArrayList<>()));
@@ -77,24 +87,26 @@ public static List>> getReadVectors(final VariantContext vc,
.forEach(ba -> ba.evidence.getReads().forEach(read -> bestHaplotypes.put(read, ba.allele)));
return vc.getAlleles().stream()
- .map(allele -> readsByAllele.get(allele).stream().map(read -> featurize(read, vc, bestHaplotypes, mutect3DatasetMode)).collect(Collectors.toList()))
+ .map(allele -> readsByAllele.get(allele).stream().map(read -> featurize(read, vc, bestHaplotypes, mutect3DatasetMode, readGroupIndices)).collect(Collectors.toList()))
.collect(Collectors.toList());
}
private static List featurize(final GATKRead read, final VariantContext vc,
final Map bestHaplotypes,
- final M2ArgumentCollection.Mutect3DatasetMode mutect3DatasetMode) {
+ final M2ArgumentCollection.Mutect3DatasetMode mutect3DatasetMode,
+ final Map readGroupIndices) {
final List result = new ArrayList<>();
+ result.add(readGroupIndices.get(read.getReadGroup())); // this is read group metadata rather than part of the tensor
result.add(read.getMappingQuality());
result.add(BaseQuality.getBaseQuality(read, vc).orElse(DEFAULT_BASE_QUALITY));
result.add(read.isFirstOfPair() ? 1 : 0);
result.add(read.isReverseStrand() ? 1 : 0);
// distances from ends of read
- final int readPosition = ReadPosition.getPosition(read, vc).orElse(0);
- result.add(readPosition);
- result.add(read.getLength() - readPosition);
+ final int readPositionOfVariantStart = ReadPosition.getPosition(read, vc).orElse(0);
+ result.add(readPositionOfVariantStart);
+ result.add(read.getLength() - readPositionOfVariantStart);
result.add(Math.abs(read.getFragmentLength()));
@@ -123,17 +135,67 @@ private static List featurize(final GATKRead read, final VariantContext
vc.getContig(), vc.getStart()));
result.add(3);
result.add(2);
+
+ for (int n = 0; n < NUM_RANGED_FEATURES; n++) {
+ result.add(0);
+ }
} else {
- final SmithWatermanAlignment readToHaplotypeAlignment = aligner.align(haplotype.getBases(), read.getBases(), SmithWatermanAlignmentConstants.ALIGNMENT_TO_BEST_HAPLOTYPE_SW_PARAMETERS, SWOverhangStrategy.SOFTCLIP);
+ byte[] haplotypeBases = haplotype.getBases();
+ final SmithWatermanAlignment readToHaplotypeAlignment = aligner.align(haplotypeBases, read.getBases(), SmithWatermanAlignmentConstants.ALIGNMENT_TO_BEST_HAPLOTYPE_SW_PARAMETERS, SWOverhangStrategy.SOFTCLIP);
final GATKRead copy = read.copy();
copy.setCigar(readToHaplotypeAlignment.getCigar());
- final int mismatchCount = AlignmentUtils.getMismatchCount(copy, haplotype.getBases(), readToHaplotypeAlignment.getAlignmentOffset()).numMismatches;
+ final int mismatchCount = AlignmentUtils.getMismatchCount(copy, haplotypeBases, readToHaplotypeAlignment.getAlignmentOffset()).numMismatches;
result.add(mismatchCount);
final long indelsVsBestHaplotype = readToHaplotypeAlignment.getCigar().getCigarElements().stream().filter(el -> el.getOperator().isIndel()).count();
result.add((int) indelsVsBestHaplotype);
+
+ final int readStartInHaplotype = readToHaplotypeAlignment.getAlignmentOffset();
+ final AlignmentStateMachine asm = new AlignmentStateMachine(copy);
+ asm.stepForwardOnGenome();
+ final List rangedFeatures = RANGES.stream().map(range -> new int[FEATURES_PER_RANGE]).toList();
+
+ while (!asm.isRightEdge()) {
+ final PileupElement pe = asm.makePileupElement();
+ final int distanceFromVariant = Math.abs(asm.getReadOffset() - readPositionOfVariantStart);
+
+ // pick which array's features we are accumulating into. If the ranges are 5, 10, 25, 50 and the distance is, say, 8, then the '<= 10' range is relevant
+ final OptionalInt relevantRange = IntStream.range(0, RANGES.size()).filter(n -> distanceFromVariant <= RANGES.get(n)).findFirst();
+ if (relevantRange.isPresent()) {
+ final int[] featuresToAddTo = rangedFeatures.get(relevantRange.getAsInt());
+ if (pe.isAfterInsertion()) {
+ featuresToAddTo[0]++;
+ }
+
+ if (pe.isDeletion()) {
+ featuresToAddTo[1]++;
+ } else {
+ final byte base = pe.getBase();
+ final byte qual = pe.getQual();
+ final byte haplotypeBase = haplotypeBases[asm.getGenomeOffset() + readStartInHaplotype];
+
+ if (base != haplotypeBase) {
+ featuresToAddTo[2]++;
+ }
+
+ if (qual < VERY_BAD_QUAL_THRESHOLD) {
+ featuresToAddTo[3]++;
+ } else if (qual < BAD_QUAL_THRESHOLD) {
+ featuresToAddTo[4]++;
+ }
+ }
+ }
+ asm.stepForwardOnGenome();
+ }
+
+ for (final int[] featuresToAdd : rangedFeatures) {
+ for (final int val : featuresToAdd) {
+ result.add(val);
+ }
+ }
}
- Utils.validate(result.size() == mutect3DatasetMode.getNumReadFeatures(), "Wrong number of features");
+ // the +1 is for the read group index that comes before the features
+ Utils.validate(result.size() == mutect3DatasetMode.getNumReadFeatures() + 1, "Wrong number of features");
return result;
}
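A small sketch of the bucket selection behind the new ranged features (distances are hypothetical; the ranges mirror the constant above):

    import java.util.List;
    import java.util.OptionalInt;
    import java.util.stream.IntStream;

    final class RangeBucketSketch {
        private static final List<Integer> RANGES = List.of(5, 10, 25, 50);

        static OptionalInt bucketFor(final int distanceFromVariant) {
            // first range whose bound is >= the distance; empty when the base is farther than 50
            return IntStream.range(0, RANGES.size())
                    .filter(n -> distanceFromVariant <= RANGES.get(n))
                    .findFirst();
        }

        public static void main(String[] args) {
            System.out.println(bucketFor(8));   // OptionalInt[1] -> the "<= 10" bucket
            System.out.println(bucketFor(60));  // OptionalInt.empty -> base is not counted
        }
    }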
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/mutect/M2ArgumentCollection.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/mutect/M2ArgumentCollection.java
index 0087441ae1a..a3e04a2f02a 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/mutect/M2ArgumentCollection.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/mutect/M2ArgumentCollection.java
@@ -6,8 +6,13 @@
import org.broadinstitute.barclay.argparser.Argument;
import org.broadinstitute.barclay.argparser.ArgumentCollection;
import org.broadinstitute.barclay.argparser.DeprecatedFeature;
+import org.broadinstitute.hellbender.cmdline.GATKPlugin.DefaultGATKVariantAnnotationArgumentCollection;
import org.broadinstitute.hellbender.cmdline.ReadFilterArgumentDefinitions;
+import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions;
import org.broadinstitute.hellbender.engine.FeatureInput;
+import org.broadinstitute.hellbender.engine.spark.AssemblyRegionArgumentCollection;
+import org.broadinstitute.hellbender.tools.walkers.genotyper.GenotypeAssignmentMethod;
+import org.broadinstitute.hellbender.tools.walkers.genotyper.GenotypeCalculationArgumentCollection;
import org.broadinstitute.hellbender.tools.walkers.haplotypecaller.FlowBasedAlignmentArgumentCollection;
import org.broadinstitute.hellbender.tools.walkers.haplotypecaller.*;
import org.broadinstitute.hellbender.tools.walkers.haplotypecaller.readthreading.ReadThreadingAssembler;
@@ -95,18 +100,6 @@ protected ReadThreadingAssemblerArgumentCollection getReadThreadingAssemblerArgu
return new MutectReadThreadingAssemblerArgumentCollection();
}
- @Override
- public ReadThreadingAssembler createReadThreadingAssembler(){
- if(mitochondria ) {
- assemblerArgs.recoverAllDanglingBranches = true;
- if (assemblerArgs.pruningLogOddsThreshold == ReadThreadingAssemblerArgumentCollection.DEFAULT_PRUNING_LOG_ODDS_THRESHOLD) {
- assemblerArgs.pruningLogOddsThreshold = DEFAULT_MITO_PRUNING_LOG_ODDS_THRESHOLD;
- }
- }
-
- return super.createReadThreadingAssembler();
- }
-
@ArgumentCollection
public CollectF1R2CountsArgumentCollection f1r2Args = new CollectF1R2CountsArgumentCollection();
@@ -164,8 +157,7 @@ public ReadThreadingAssembler createReadThreadingAssembler(){
public double getDefaultAlleleFrequency() {
return afOfAllelesNotInGermlineResource >= 0 ? afOfAllelesNotInGermlineResource :
- (mitochondria ? DEFAULT_AF_FOR_MITO_CALLING:
- (normalSamples.isEmpty() ? DEFAULT_AF_FOR_TUMOR_ONLY_CALLING : DEFAULT_AF_FOR_TUMOR_NORMAL_CALLING));
+ (normalSamples.isEmpty() ? DEFAULT_AF_FOR_TUMOR_ONLY_CALLING : DEFAULT_AF_FOR_TUMOR_NORMAL_CALLING);
}
/**
@@ -176,6 +168,20 @@ public double getDefaultAlleleFrequency() {
@Argument(fullName = MITOCHONDRIA_MODE_LONG_NAME, optional = true, doc="Mitochondria mode sets emission and initial LODs to 0.")
public Boolean mitochondria = false;
+ /**
+ * List of arguments to be set when a user specifies mitochondria mode. Each argument will be displayed in the help message.
+ */
+ public String[] getMitochondriaModeNameValuePairs() {
+ return new String[]{
+ DEFAULT_AF_LONG_NAME, String.valueOf(DEFAULT_AF_FOR_MITO_CALLING),
+ EMISSION_LOD_LONG_NAME, String.valueOf(DEFAULT_MITO_EMISSION_LOD),
+ INITIAL_TUMOR_LOG_10_ODDS_LONG_NAME, String.valueOf(DEFAULT_MITO_INITIAL_LOG_10_ODDS),
+ ReadThreadingAssemblerArgumentCollection.RECOVER_ALL_DANGLING_BRANCHES_LONG_NAME, "true",
+ ReadThreadingAssemblerArgumentCollection.PRUNING_LOD_THRESHOLD_LONG_NAME, String.valueOf(DEFAULT_MITO_PRUNING_LOG_ODDS_THRESHOLD),
+ StandardArgumentDefinitions.ANNOTATION_LONG_NAME, "OriginalAlignment"
+ };
+ }
+
/**
* If true, collect Mutect3 data for learning; otherwise collect data for generating calls with a pre-trained model
*/
@@ -211,8 +217,8 @@ public double getDefaultAlleleFrequency() {
public Mutect3DatasetMode mutect3DatasetMode = Mutect3DatasetMode.ILLUMINA;
public enum Mutect3DatasetMode {
- ILLUMINA(11),
- ULTIMA(11);
+ ILLUMINA(11 + FeaturizedReadSets.NUM_RANGED_FEATURES),
+ ULTIMA(11 + FeaturizedReadSets.NUM_RANGED_FEATURES);
final private int numReadFeatures;
@@ -229,6 +235,10 @@ public int getNumReadFeatures() {
* VCF of known calls for a sample used for generating a Mutect3 training dataset. Unfiltered variants (PASS or empty FILTER field)
* contained in this VCF are considered good; other variants (i.e. filtered in this VCF or absent from it) are considered errors.
* If this VCF is not given the dataset is generated with an weak-labelling strategy based on allele fractions.
+ *
+ * Although the normal use of this input is in generating training data, it can also be used when generating test data
+ * for making Permutect calls. In this case, the test data is labeled with truth from the VCF, Permutect makes filtered calls as
+ * usual, and Permutect uses the labels to analyze the quality of its results.
*/
@Argument(fullName= MUTECT3_TRAINING_TRUTH_LONG_NAME, doc="VCF file of known variants for labeling Mutect3 training data", optional = true)
public FeatureInput mutect3TrainingTruth;
@@ -246,7 +256,7 @@ public double getEmissionLogOdds() {
if (emitReferenceConfidence != ReferenceConfidenceMode.NONE) {
return MathUtils.log10ToLog(DEFAULT_GVCF_LOG_10_ODDS);
}
- return MathUtils.log10ToLog(mitochondria && emissionLog10Odds == DEFAULT_EMISSION_LOG_10_ODDS ? DEFAULT_MITO_EMISSION_LOD : emissionLog10Odds);
+ return MathUtils.log10ToLog(emissionLog10Odds);
}
/**
@@ -259,7 +269,7 @@ public double getInitialLogOdds() {
if (emitReferenceConfidence != ReferenceConfidenceMode.NONE) {
return MathUtils.log10ToLog(DEFAULT_GVCF_LOG_10_ODDS);
}
- return MathUtils.log10ToLog(mitochondria && initialLog10Odds == DEFAULT_INITIAL_LOG_10_ODDS ? DEFAULT_MITO_INITIAL_LOG_10_ODDS : initialLog10Odds);
+ return MathUtils.log10ToLog(initialLog10Odds);
}
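A sketch of the alternating name/value convention returned by getMitochondriaModeNameValuePairs (the flag names below are placeholders, not the real constants):

    final class NameValuePairSketch {
        public static void main(String[] args) {
            // alternating argument-name / argument-value entries, as in the array above
            final String[] pairs = {"some-af-flag", "4e-3", "some-emission-flag", "0"};
            for (int i = 0; i < pairs.length; i += 2) {
                System.out.printf("--%s %s%n", pairs[i], pairs[i + 1]);
            }
        }
    }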
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/mutect/Mutect2.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/mutect/Mutect2.java
index f9c3733acca..cbe36375e03 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/mutect/Mutect2.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/mutect/Mutect2.java
@@ -1,14 +1,17 @@
package org.broadinstitute.hellbender.tools.walkers.mutect;
+import htsjdk.samtools.SAMSequenceDictionary;
import htsjdk.variant.variantcontext.writer.VariantContextWriter;
import org.broadinstitute.barclay.argparser.Argument;
import org.broadinstitute.barclay.argparser.ArgumentCollection;
import org.broadinstitute.barclay.argparser.CommandLineException;
import org.broadinstitute.barclay.argparser.CommandLineProgramProperties;
import org.broadinstitute.barclay.help.DocumentedFeature;
+import org.broadinstitute.hellbender.cmdline.GATKPlugin.GATKReadFilterPluginDescriptor;
import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions;
import org.broadinstitute.hellbender.cmdline.programgroups.ShortVariantDiscoveryProgramGroup;
import org.broadinstitute.hellbender.engine.*;
+import org.broadinstitute.hellbender.engine.filters.MappingQualityReadFilter;
import org.broadinstitute.hellbender.engine.filters.ReadFilter;
import org.broadinstitute.hellbender.exceptions.UserException;
import org.broadinstitute.hellbender.tools.walkers.annotator.*;
@@ -132,9 +135,9 @@
*
*
*
(iii) Mitochondrial mode
- *
Mutect2 automatically sets parameters appropriately for calling on mitochondria with the --mitochondria flag.
+ *
Mutect2 automatically sets parameters appropriately for calling on mitochondria with the --mitochondria-mode flag.
* Specifically, the mode sets --initial-tumor-lod to 0, --tumor-lod-to-emit to 0, --af-of-alleles-not-in-resource to
- * 4e-3, and the advanced parameter --pruning-lod-threshold to -4.
+ * 4e-3, and the advanced parameter --pruning-lod-threshold to -4*ln(10).
*
*
* gatk Mutect2 \
@@ -262,7 +265,8 @@ public boolean shouldTrackPileupsForAssemblyRegions() {
@Override
public void onTraversalStart() {
VariantAnnotatorEngine annotatorEngine = new VariantAnnotatorEngine(makeVariantAnnotations(), null, Collections.emptyList(), false, false);
- m2Engine = new Mutect2Engine(MTAC, assemblyRegionArgs, createOutputBamIndex, createOutputBamMD5, getHeaderForReads(), referenceArguments.getReferenceSpecifier(), annotatorEngine);
+ m2Engine = new Mutect2Engine(MTAC, assemblyRegionArgs, createOutputBamIndex, createOutputBamMD5, getHeaderForReads(),
+ getBestAvailableSequenceDictionary(), referenceArguments.getReferenceSpecifier(), annotatorEngine);
vcfWriter = createVCFWriter(outputVCF);
if (m2Engine.emitReferenceConfidence()) {
logger.warn("Note that the Mutect2 reference confidence mode is in BETA -- the likelihoods model and output format are subject to change in subsequent versions.");
@@ -281,17 +285,6 @@ public void onTraversalStart() {
m2Engine.writeHeader(vcfWriter, getDefaultToolVCFHeaderLines());
}
- @Override
- public Collection makeVariantAnnotations(){
- final Collection annotations = super.makeVariantAnnotations();
-
- if (MTAC.mitochondria) {
- annotations.add(new OriginalAlignment());
- }
-
- return annotations;
- }
-
@Override
public Object onTraversalSuccess() {
m2Engine.writeExtraOutputs(new File(outputVCF + DEFAULT_STATS_EXTENSION));
@@ -320,6 +313,12 @@ public void closeTool() {
*/
@Override
protected String[] customCommandLineValidation() {
+ if (MTAC.mitochondria) {
+ ModeArgumentUtils.setArgValues(
+ getCommandLineParser(),
+ MTAC.getMitochondriaModeNameValuePairs(),
+ M2ArgumentCollection.MITOCHONDRIA_MODE_LONG_NAME);
+ }
if (MTAC.flowMode != M2ArgumentCollection.FlowMode.NONE) {
ModeArgumentUtils.setArgValues(
getCommandLineParser(),
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/mutect/Mutect2Engine.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/mutect/Mutect2Engine.java
index 1d2096f3c2b..c831a4169e0 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/mutect/Mutect2Engine.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/mutect/Mutect2Engine.java
@@ -1,6 +1,7 @@
package org.broadinstitute.hellbender.tools.walkers.mutect;
import htsjdk.samtools.SAMFileHeader;
+import htsjdk.samtools.SAMSequenceDictionary;
import htsjdk.samtools.util.Locatable;
import htsjdk.variant.variantcontext.Allele;
import htsjdk.variant.variantcontext.Genotype;
@@ -72,7 +73,7 @@
public final class Mutect2Engine implements AssemblyRegionEvaluator, AutoCloseable {
private static final List STANDARD_MUTECT_INFO_FIELDS = Arrays.asList(GATKVCFConstants.NORMAL_LOG_10_ODDS_KEY, GATKVCFConstants.TUMOR_LOG_10_ODDS_KEY, GATKVCFConstants.NORMAL_ARTIFACT_LOG_10_ODDS_KEY,
- GATKVCFConstants.EVENT_COUNT_IN_HAPLOTYPE_KEY, GATKVCFConstants.IN_PON_KEY, GATKVCFConstants.POPULATION_AF_KEY,
+ GATKVCFConstants.EVENT_COUNT_IN_REGION_KEY, GATKVCFConstants.EVENT_COUNT_IN_HAPLOTYPE_KEY, GATKVCFConstants.IN_PON_KEY, GATKVCFConstants.POPULATION_AF_KEY,
GATKVCFConstants.GERMLINE_QUAL_KEY, GATKVCFConstants.CONTAMINATION_QUAL_KEY, GATKVCFConstants.SEQUENCING_QUAL_KEY,
GATKVCFConstants.POLYMERASE_SLIPPAGE_QUAL_KEY, GATKVCFConstants.READ_ORIENTATION_QUAL_KEY,
GATKVCFConstants.STRAND_QUAL_KEY, GATKVCFConstants.ORIGINAL_CONTIG_MISMATCH_KEY, GATKVCFConstants.N_COUNT_KEY, GATKVCFConstants.AS_UNIQUE_ALT_READ_SET_COUNT_KEY);
@@ -94,6 +95,7 @@ public final class Mutect2Engine implements AssemblyRegionEvaluator, AutoCloseab
final private M2ArgumentCollection MTAC;
private SAMFileHeader header;
+ private SAMSequenceDictionary sequenceDictionary;
private final int minCallableDepth;
public static final String CALLABLE_SITES_NAME = "callable";
@@ -136,9 +138,12 @@ public final class Mutect2Engine implements AssemblyRegionEvaluator, AutoCloseab
* @param referenceSpec reference specifier for the reference
* @param annotatorEngine annotator engine built with desired annotations
*/
- public Mutect2Engine(final M2ArgumentCollection MTAC, AssemblyRegionArgumentCollection assemblyRegionArgs, final boolean createBamOutIndex, final boolean createBamOutMD5, final SAMFileHeader header, final GATKPath referenceSpec, final VariantAnnotatorEngine annotatorEngine) {
+ public Mutect2Engine(final M2ArgumentCollection MTAC, AssemblyRegionArgumentCollection assemblyRegionArgs,
+ final boolean createBamOutIndex, final boolean createBamOutMD5, final SAMFileHeader header,
+ final SAMSequenceDictionary sequenceDictionary, final GATKPath referenceSpec, final VariantAnnotatorEngine annotatorEngine) {
this.MTAC = Utils.nonNull(MTAC);
this.header = Utils.nonNull(header);
+ this.sequenceDictionary = sequenceDictionary;
minCallableDepth = MTAC.callableDepth;
referenceReader = ReferenceUtils.createReferenceReader(Utils.nonNull(referenceSpec));
aligner = SmithWatermanAligner.getAligner(MTAC.smithWatermanImplementation);
@@ -162,7 +167,7 @@ public Mutect2Engine(final M2ArgumentCollection MTAC, AssemblyRegionArgumentColl
annotationEngine = Utils.nonNull(annotatorEngine);
assemblyEngine = MTAC.createReadThreadingAssembler();
likelihoodCalculationEngine = AssemblyBasedCallerUtils.createLikelihoodCalculationEngine(MTAC.likelihoodArgs, MTAC.fbargs, true, MTAC.likelihoodArgs.likelihoodEngineImplementation);
- genotypingEngine = new SomaticGenotypingEngine(MTAC, normalSamples, annotationEngine);
+ genotypingEngine = new SomaticGenotypingEngine(MTAC, normalSamples, annotationEngine, header, sequenceDictionary);
haplotypeBAMWriter = AssemblyBasedCallerUtils.createBamWriter(MTAC, createBamOutIndex, createBamOutMD5, header);
trimmer = new AssemblyRegionTrimmer(assemblyRegionArgs, header.getSequenceDictionary());
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/mutect/Mutect3DatasetEngine.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/mutect/Mutect3DatasetEngine.java
index 942c2de15b2..ad708c8dac8 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/mutect/Mutect3DatasetEngine.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/mutect/Mutect3DatasetEngine.java
@@ -1,5 +1,9 @@
package org.broadinstitute.hellbender.tools.walkers.mutect;
+import htsjdk.samtools.SAMFileHeader;
+import htsjdk.samtools.SAMReadGroupRecord;
+import htsjdk.samtools.SAMSequenceDictionary;
+import htsjdk.samtools.SAMSequenceRecord;
import htsjdk.variant.variantcontext.Allele;
import htsjdk.variant.variantcontext.Genotype;
import htsjdk.variant.variantcontext.VariantContext;
@@ -43,6 +47,10 @@ private enum Label {
ARTIFACT, VARIANT, UNLABELED, IGNORE
}
+ private final SAMSequenceDictionary sequenceDictionary;
+
+ private final Map readGroupIndices = new HashMap<>();
+
// number of additional variant features for assembly complexity, reference context
private static final int NUM_EXTRA_FEATURES = 9;
@@ -65,6 +73,8 @@ private enum Label {
private static final int MIN_REF = 5;
private final PrintWriter printWriter;
+ private final PrintWriter contigPrintWriter;
+ private final PrintWriter readGroupPrintWriter;
// number of nonartifact data to keep for each artifact datum
private final int nonArtifactPerArtifact;
@@ -79,9 +89,15 @@ private enum Label {
private final EnumMap> unmatchedArtifactAltCounts;
- public Mutect3DatasetEngine(final File datasetFile, final boolean trainingMode, final int maxRefCount, final int maxAltCount, final int nonArtifactPerArtifact, final Set normalSamples) {
+ public Mutect3DatasetEngine(final File datasetFile, final boolean trainingMode, final int maxRefCount,
+ final int maxAltCount, final int nonArtifactPerArtifact, final Set normalSamples,
+ final SAMFileHeader header, final SAMSequenceDictionary sequenceDictionary) {
try {
printWriter = new PrintWriter(new FileWriter(Utils.nonNull(datasetFile)));
+ final File contigTableFile = datasetFile.toPath().resolveSibling("contigs.table").toFile();
+ final File readGroupTableFile = datasetFile.toPath().resolveSibling("read-groups.table").toFile();
+ contigPrintWriter = new PrintWriter(new FileWriter(contigTableFile));
+ readGroupPrintWriter = new PrintWriter(new FileWriter(readGroupTableFile));
} catch (IOException ex) {
throw new UserException.BadInput("Could not create dataset file writer");
}
@@ -92,6 +108,12 @@ public Mutect3DatasetEngine(final File datasetFile, final boolean trainingMode,
this.maxRefCount = maxRefCount;
this.maxAltCount = maxAltCount;
+ this.sequenceDictionary = sequenceDictionary;
+ final List readGroups = header.getReadGroups();
+ for (int n = 0; n < readGroups.size(); n++) {
+ readGroupIndices.put(readGroups.get(n).getReadGroupId(), n);
+ }
+
unmatchedArtifactAltCounts = new EnumMap<>(VariantType.class);
for (final VariantType type : VariantType.values()) {
unmatchedArtifactAltCounts.put(type, new ArrayBlockingQueue<>(CAPACITY));
@@ -106,7 +128,7 @@ public void addData(final ReferenceContext ref, final VariantContext vc, Optiona
final M2ArgumentCollection.Mutect3DatasetMode mutect3DatasetMode) {
final String refBases = ReferenceBases.annotate(ref, vc);
final String refAllele = vc.getReference().getBaseString();
- final String contig = vc.getContig();
+ final int contigIndex = sequenceDictionary.getSequenceIndex(vc.getContig());
final int position = vc.getStart();
final Set tumorSamples = likelihoods.samples().stream().filter(sample -> !normalSamples.contains(sample)).collect(Collectors.toSet());
final int numAlt = vc.getNAlleles() - 1;
@@ -153,7 +175,7 @@ public void addData(final ReferenceContext ref, final VariantContext vc, Optiona
final int diff = altAlleleString.length() - refAllele.length();
final VariantType type = diff == 0 ? VariantType.SNV : ( diff > 0 ? VariantType.INSERTION : VariantType.DELETION);
- if (trainingMode) {
+ if (trainingMode) { // training mode -- collecting tensors to train the Permutect artifact model
final ArrayBlockingQueue unmatchedQueue = unmatchedArtifactAltCounts.get(type);
final boolean likelySeqError = tumorLods[n] < TLOD_THRESHOLD;
@@ -182,8 +204,13 @@ public void addData(final ReferenceContext ref, final VariantContext vc, Optiona
} else {
labels.add(Label.IGNORE);
}
- } else {
- labels.add(Label.UNLABELED);
+ } else { // not training mode -- we are generating tensors in order to apply the Permutect artifact model to a callset
+ if (truthVCs.isPresent()) {
+ // here, for the purposes of test data, both sequencing errors and technical artifacts get the "ARTIFACT" label
+ labels.add(truthAlleles.contains(remappedAltAlelle) ? Label.VARIANT : Label.ARTIFACT);
+ } else {
+ labels.add(Label.UNLABELED);
+ }
}
}
@@ -199,9 +226,9 @@ public void addData(final ReferenceContext ref, final VariantContext vc, Optiona
// TODO: for now we don't really need normal reads
// note that the following use the VC's allele order, not necessarily the likelihoods' allele order
final List>> normalReadVectorsByAllele = FeaturizedReadSets.getReadVectors(vc, normalSamples,
- likelihoods, logFragmentLikelihoods, maxRefCount, maxAltCount, mutect3DatasetMode);
+ likelihoods, logFragmentLikelihoods, maxRefCount, maxAltCount, mutect3DatasetMode, readGroupIndices);
final List>> tumorReadVectorsByAllele = FeaturizedReadSets.getReadVectors(vc, tumorSamples,
- likelihoods, logFragmentLikelihoods, maxRefCount, maxAltCount, altDownsampleMap, mutect3DatasetMode);
+ likelihoods, logFragmentLikelihoods, maxRefCount, maxAltCount, altDownsampleMap, mutect3DatasetMode, readGroupIndices);
// ref and alt reads have already been downsampled by the read featurizer
final List> tumorRefReads = tumorReadVectorsByAllele.get(0);
@@ -222,7 +249,7 @@ public void addData(final ReferenceContext ref, final VariantContext vc, Optiona
final List> normalAltReads = normalReadVectorsByAllele.get(n+1);
printWriter.println(labels.get(n).toString());
- printWriter.printf("%s:%d,%s->%s%n", contig, position, refAllele, altAllele);
+ printWriter.printf("%d:%d,%s->%s%n", contigIndex, position, refAllele, altAllele);
printWriter.println(refBases);
printWriter.println(numberString(variantFeatureVector, "%.2f", " "));
//printWriter.printf("%d %d %d %d%n", tumorRefReads.size(), tumorAltReads.size(), normalRefReads.size(), normalAltReads.size());
@@ -322,5 +349,16 @@ private int[] sumADsOverSamples(final VariantContext vc, final Set sampl
@Override
public void close() {
printWriter.close();
+
+ for (final SAMSequenceRecord contigRecord : sequenceDictionary.getSequences()) {
+ contigPrintWriter.println(String.format("%s\t%d", contigRecord.getContig(), contigRecord.getSequenceIndex()));
+ }
+
+ for (final Map.Entry entry : readGroupIndices.entrySet()) {
+ readGroupPrintWriter.println(String.format("%s\t%d", entry.getKey(), entry.getValue()));
+ }
+
+ contigPrintWriter.close();
+ readGroupPrintWriter.close();
}
}
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/mutect/SomaticGenotypingEngine.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/mutect/SomaticGenotypingEngine.java
index e4504398998..55bef4dc806 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/mutect/SomaticGenotypingEngine.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/mutect/SomaticGenotypingEngine.java
@@ -4,10 +4,12 @@
import com.google.common.collect.ImmutableMap;
import com.google.common.primitives.Doubles;
import htsjdk.samtools.SAMFileHeader;
+import htsjdk.samtools.SAMSequenceDictionary;
import htsjdk.samtools.util.Locatable;
import htsjdk.variant.variantcontext.*;
import htsjdk.variant.vcf.VCFConstants;
import org.apache.commons.collections4.ListUtils;
+import org.apache.commons.lang3.mutable.MutableInt;
import org.apache.commons.math3.linear.Array2DRowRealMatrix;
import org.apache.commons.math3.linear.DefaultRealMatrixChangingVisitor;
import org.apache.commons.math3.linear.RealMatrix;
@@ -51,7 +53,9 @@ public class SomaticGenotypingEngine implements AutoCloseable {
private final double refPseudocount = 1;
private final double altPseudocount;
- public SomaticGenotypingEngine(final M2ArgumentCollection MTAC, final Set normalSamples, final VariantAnnotatorEngine annotationEngine) {
+ public SomaticGenotypingEngine(final M2ArgumentCollection MTAC, final Set normalSamples,
+ final VariantAnnotatorEngine annotationEngine,
+ final SAMFileHeader header, final SAMSequenceDictionary sequenceDictionary) {
this.MTAC = MTAC;
altPseudocount = MTAC.minAF == 0.0 ? 1 : 1 - Math.log(2)/Math.log(MTAC.minAF);
@@ -61,7 +65,7 @@ public SomaticGenotypingEngine(final M2ArgumentCollection MTAC, final Set logFragmentLikelihoods = logReadLikelihoods.groupEvidence(MTAC.independentMates ? read -> read : GATKRead::getName, Fragment::createAndAvoidFailure);
+ final Set potentialSomaticEventsInRegion = new HashSet<>();
for( final int loc : eventStarts ) {
final List eventsAtThisLoc = AssemblyBasedCallerUtils.getVariantsFromActiveHaplotypes(loc, haplotypes, false);
- VariantContext mergedVC = AssemblyBasedCallerUtils.makeMergedVariantContext(eventsAtThisLoc);
- if( mergedVC == null ) {
+ VariantContext merged = AssemblyBasedCallerUtils.makeMergedVariantContext(eventsAtThisLoc);
+ if( merged == null ) {
continue;
}
+ final VariantContext mergedVC = emitRefConf ? ReferenceConfidenceUtils.addNonRefSymbolicAllele(merged) : merged;
// converting haplotype likelihoods to allele likelihoods
final Map> alleleMapper = AssemblyBasedCallerUtils.createAlleleMapper(mergedVC, loc, haplotypes, true);
@@ -127,7 +133,6 @@ public CalledHaplotypes callMutations(
logLikelihoods.retainEvidence(variantCallingRelevantFragmentOverlap::overlaps);
if (emitRefConf) {
- mergedVC = ReferenceConfidenceUtils.addNonRefSymbolicAllele(mergedVC);
logLikelihoods.addNonReferenceAllele(Allele.NON_REF_ALLELE);
}
final List> tumorMatrices = IntStream.range(0, logLikelihoods.numberOfSamples())
@@ -152,13 +157,21 @@ public CalledHaplotypes callMutations(
.filter(allele -> forcedAlleles.contains(allele) || tumorLogOdds.getAlt(allele) > MTAC.getEmissionLogOdds())
.collect(Collectors.toList());
- final long somaticAltCount = tumorAltAlleles.stream()
+ final List allelesToGenotype = tumorAltAlleles.stream()
.filter(allele -> forcedAlleles.contains(allele) || !hasNormal || MTAC.genotypeGermlineSites || normalLogOdds.getAlt(allele) > MathUtils.log10ToLog(MTAC.normalLog10Odds))
- .count();
+ .toList();
+
+ // record somatic alleles for later use in the Event Count annotation
+ // note that in tumor-only calling it does not attempt to detect germline events
+ mergedVC.getAlternateAlleles().stream()
+ .filter(allele -> tumorLogOdds.getAlt(allele) > MTAC.getEmissionLogOdds())
+ .filter(allele -> !hasNormal || normalLogOdds.getAlt(allele) > MathUtils.log10ToLog(MTAC.normalLog10Odds))
+ .map(allele -> new Event(mergedVC.getContig(), mergedVC.getStart(), mergedVC.getReference(), allele))
+ .forEach(potentialSomaticEventsInRegion::add);
// if every alt allele is germline, skip this variant. However, if some alt alleles are germline and others
// are not we emit them all so that the filtering engine can see them
- if (somaticAltCount == 0) {
+ if (allelesToGenotype.isEmpty()) {
continue;
}
@@ -222,8 +235,41 @@ public CalledHaplotypes callMutations(
final List outputCalls = AssemblyBasedCallerUtils.phaseCalls(returnCalls, calledHaplotypes);
final int eventCount = outputCalls.size();
+
+ // calculate the number of somatic events in the best haplotype of each called variant
+ final Map haplotypeSupportCounts = logReadLikelihoods.alleles().stream()
+ .collect(Collectors.toMap(hap -> hap, label -> new MutableInt(0)));
+ logReadLikelihoods.bestAllelesBreakingTies()
+ .forEach(bestHaplotype -> haplotypeSupportCounts.get(bestHaplotype.allele).increment());
+
+ final Map> haplotypesByEvent= new HashMap<>();
+ for (final Haplotype haplotype : logReadLikelihoods.alleles()) {
+ for (final Event event : haplotype.getEventMap().getEvents()) {
+ haplotypesByEvent.computeIfAbsent(event, e -> new ArrayList<>()).add(haplotype);
+ }
+ }
+ final Map> eventCountAnnotations = new HashMap<>();
+ for (final VariantContext outputCall : outputCalls) {
+ for (final Allele allele : outputCall.getAlternateAlleles()) {
+ // note: this creates the minimal representation behind the scenes
+ final Event event = new Event(outputCall.getContig(), outputCall.getStart(), outputCall.getReference(), allele);
+ // haplotypesByEvent contains every *assembled* event, including events injected into the original assembly,
+ // but there are some modes where we genotype events that were never in an assembly graph, in which case
+ // this annotation is irrelevant
+ if (haplotypesByEvent.containsKey(event)) {
+ final Haplotype bestHaplotype = haplotypesByEvent.get(event).stream()
+ .sorted(Comparator.comparingInt(h -> haplotypeSupportCounts.getOrDefault(h, new MutableInt(0)).intValue()).reversed())
+ .findFirst().get();
+
+ eventCountAnnotations.computeIfAbsent(outputCall, vc -> new ArrayList<>())
+ .add((int) bestHaplotype.getEventMap().getEvents().stream().filter(potentialSomaticEventsInRegion::contains).count());
+ }
+ }
+ }
final List outputCallsWithEventCountAnnotation = outputCalls.stream()
- .map(vc -> new VariantContextBuilder(vc).attribute(GATKVCFConstants.EVENT_COUNT_IN_HAPLOTYPE_KEY, eventCount).make())
+ .map(vc -> new VariantContextBuilder(vc)
+ .attribute(GATKVCFConstants.EVENT_COUNT_IN_HAPLOTYPE_KEY, eventCountAnnotations.get(vc))
+ .attribute(GATKVCFConstants.EVENT_COUNT_IN_REGION_KEY, potentialSomaticEventsInRegion.size()).make())
.collect(Collectors.toList());
return new CalledHaplotypes(outputCallsWithEventCountAnnotation, calledHaplotypes);
}
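A simplified sketch of the new per-allele event count, with plain strings standing in for Haplotype and Event (all data hypothetical):

    import java.util.Comparator;
    import java.util.List;
    import java.util.Map;
    import java.util.Set;

    final class EventCountSketch {
        public static void main(String[] args) {
            final Map<String, Integer> support = Map.of("hap1", 7, "hap2", 3);  // reads backing each haplotype
            final Map<String, List<String>> eventsOnHap =
                    Map.of("hap1", List.of("e1", "e2"), "hap2", List.of("e1"));
            final Set<String> somaticEventsInRegion = Set.of("e1", "e2");

            final String calledEvent = "e1";
            // best-supported haplotype that carries the called event
            final String bestHap = eventsOnHap.entrySet().stream()
                    .filter(e -> e.getValue().contains(calledEvent))
                    .max(Comparator.comparingInt(e -> support.get(e.getKey())))
                    .map(Map.Entry::getKey)
                    .orElseThrow();
            // that haplotype's events which are potentially somatic
            final long ecnt = eventsOnHap.get(bestHap).stream().filter(somaticEventsInRegion::contains).count();
            System.out.println(bestHap + " -> " + ecnt);  // hap1 -> 2
        }
    }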
@@ -361,9 +407,7 @@ private Optional getForNormal(final Supplier supplier) {
private static Map getNegativeLogPopulationAFAnnotation(List germlineResourceVariants,
final List allAlleles,
final double afOfAllelesNotInGermlineResource) {
- final Optional germlineVC = germlineResourceVariants.isEmpty() ? Optional.empty()
- : Optional.of(germlineResourceVariants.get(0)); // assume only one VC per site
- final double[] populationAlleleFrequencies = getGermlineAltAlleleFrequencies(allAlleles, germlineVC, afOfAllelesNotInGermlineResource);
+ final double[] populationAlleleFrequencies = getGermlineAltAlleleFrequencies(allAlleles, germlineResourceVariants, afOfAllelesNotInGermlineResource);
return ImmutableMap.of(GATKVCFConstants.POPULATION_AF_KEY, MathUtils.applyToArray(populationAlleleFrequencies, x -> - Math.log10(x)));
}
@@ -373,27 +417,35 @@ private static Map getNegativeLogPopulationAFAnnotation(List allAlleles, final Optional germlineVC, final double afOfAllelesNotInGermlineResource) {
+ static double[] getGermlineAltAlleleFrequencies(final List allAlleles, final List germlineVCs, final double afOfAllelesNotInGermlineResource) {
Utils.validateArg(!allAlleles.isEmpty(), "allAlleles are empty -- there is not even a reference allele.");
- if (germlineVC.isPresent()) {
- if (! germlineVC.get().hasAttribute(VCFConstants.ALLELE_FREQUENCY_KEY)) {
- logger.warn("Germline resource variant at " + germlineVC.get().getContig() + ":" + germlineVC.get().getStart() +" missing AF attribute");
- return Doubles.toArray(Collections.nCopies(allAlleles.size() - 1, afOfAllelesNotInGermlineResource));
+ final Map<Allele, Double> alleleFrequencies = new HashMap<>();
+ allAlleles.forEach(a -> alleleFrequencies.put(a, afOfAllelesNotInGermlineResource)); // initialize everything to the default
+
+ // look through every germline resource variant context at this locus and fill in the AFs
+ for (final VariantContext germlineVC : germlineVCs) {
+ if (! germlineVC.hasAttribute(VCFConstants.ALLELE_FREQUENCY_KEY)) {
+ logger.warn("Germline resource variant at " + germlineVC.getContig() + ":" + germlineVC.getStart() +" missing AF attribute");
+ }
+
+ List<OptionalInt> germlineIndices = GATKVariantContextUtils.alleleIndices(allAlleles, germlineVC.getAlleles());
+ final List<Double> germlineAltAFs = Mutect2Engine.getAttributeAsDoubleList(germlineVC, VCFConstants.ALLELE_FREQUENCY_KEY, afOfAllelesNotInGermlineResource);
+
+ if (germlineAltAFs.size() == (germlineVC.getNAlleles() - 1)) { // skip VCs with a bad AF field that got parsed as a wrong-length list
+ for (int alleleIndex = 1; alleleIndex < allAlleles.size(); alleleIndex++) { // start at 1 to skip the reference, which doesn't have an AF annotation
+ final Allele allele = allAlleles.get(alleleIndex);
+ // note the -1 since germlineAltAFs do not include ref
+ germlineIndices.get(alleleIndex).ifPresent(germlineIndex -> alleleFrequencies.put(allele, germlineAltAFs.get(germlineIndex - 1)));
+ }
}
- List<OptionalInt> germlineIndices = GATKVariantContextUtils.alleleIndices(allAlleles, germlineVC.get().getAlleles());
- final List<Double> germlineAltAFs = Mutect2Engine.getAttributeAsDoubleList(germlineVC.get(), VCFConstants.ALLELE_FREQUENCY_KEY, afOfAllelesNotInGermlineResource);
-
- return germlineIndices.stream().skip(1) // skip the reference allele
- .mapToDouble(idx -> idx.isPresent() ? germlineAltAFs.get(idx.getAsInt() - 1) : afOfAllelesNotInGermlineResource) // note the -1 since germlineAltAFs do not include ref
- .toArray();
- } else {
- return Doubles.toArray(Collections.nCopies(allAlleles.size() - 1, afOfAllelesNotInGermlineResource));
}
+
+ return allAlleles.stream().skip(1).mapToDouble(alleleFrequencies::get).toArray(); // skip the reference allele
}
/**
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/mutect/filtering/ClusteredEventsFilter.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/mutect/filtering/ClusteredEventsFilter.java
index b7212562bb2..88cb589ef29 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/mutect/filtering/ClusteredEventsFilter.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/mutect/filtering/ClusteredEventsFilter.java
@@ -8,9 +8,11 @@
public class ClusteredEventsFilter extends HardFilter {
private final int maxEventsInRegion;
+ private final int maxEventsInHaplotype;
- public ClusteredEventsFilter(final int maxEventsInRegion) {
+ public ClusteredEventsFilter(final int maxEventsInRegion, final int maxEventsInHaplotype) {
this.maxEventsInRegion = maxEventsInRegion;
+ this.maxEventsInHaplotype = maxEventsInHaplotype;
}
@Override
@@ -18,8 +20,9 @@ public ClusteredEventsFilter(final int maxEventsInRegion) {
@Override
public boolean isArtifact(final VariantContext vc, final Mutect2FilteringEngine filteringEngine) {
- final Integer eventCount = vc.getAttributeAsInt(GATKVCFConstants.EVENT_COUNT_IN_HAPLOTYPE_KEY, -1);
- return eventCount > maxEventsInRegion;
+ final List<Integer> haplotypeEventCounts = vc.getAttributeAsIntList(GATKVCFConstants.EVENT_COUNT_IN_HAPLOTYPE_KEY, 0);
+ final int regionEventCounts = vc.getAttributeAsInt(GATKVCFConstants.EVENT_COUNT_IN_REGION_KEY, 0);
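+ // filter the site if any alt allele's best haplotype, or the assembly region as a whole, carries more events than allowed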
+ return haplotypeEventCounts.stream().mapToInt(n -> n).max().getAsInt() > maxEventsInHaplotype || regionEventCounts > maxEventsInRegion;
}
@Override
@@ -28,5 +31,5 @@ public String filterName() {
}
@Override
- protected List<String> requiredInfoAnnotations() { return Collections.singletonList(GATKVCFConstants.EVENT_COUNT_IN_HAPLOTYPE_KEY); }
+ protected List<String> requiredInfoAnnotations() { return List.of(GATKVCFConstants.EVENT_COUNT_IN_REGION_KEY, GATKVCFConstants.EVENT_COUNT_IN_HAPLOTYPE_KEY); }
}
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/mutect/filtering/FilteredHaplotypeFilter.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/mutect/filtering/FilteredHaplotypeFilter.java
index 9a6375931ae..b725ee2587e 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/mutect/filtering/FilteredHaplotypeFilter.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/mutect/filtering/FilteredHaplotypeFilter.java
@@ -13,6 +13,7 @@
import java.util.*;
public class FilteredHaplotypeFilter extends Mutect2VariantFilter {
+ private static final double GERMLINE_PROBABILITY_TO_IGNORE_NORMAL_ARTIFACT = 0.25;
private final double maxIntraHaplotypeDistance;
// for each pgt + pid phasing string, a list of loci-error probability pairs
@@ -54,10 +55,21 @@ public double calculateErrorProbability(final VariantContext vc, final Mutect2Fi
@Override
protected void accumulateDataForLearning(final VariantContext vc, final ErrorProbabilities errorProbabilities, final Mutect2FilteringEngine filteringEngine) {
- // we record the maximum non-sequencing artifact that is not this filter itself
- final double artifactProbability = errorProbabilities.getProbabilitiesByFilter().entrySet().stream()
- .filter(e -> e.getKey().errorType() != ErrorType.SEQUENCING)
- .filter(e -> !e.getKey().filterName().equals(filterName()))
+ // we record the maximum non-sequencing, non-germline artifact probability that is not from this filter itself
+ final Map<Mutect2Filter, List<Double>> probabilitiesByFilter = errorProbabilities.getProbabilitiesByFilter();
+
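+ // probability that the site is actually a germline variant, taken as the maximum over all alt alleles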
+ final double germlineProbability = probabilitiesByFilter.entrySet().stream()
+ .filter(e -> e.getKey().filterName().equals(GATKVCFConstants.GERMLINE_RISK_FILTER_NAME))
+ .flatMap(e -> e.getValue().stream()) // the value is a list of double, we need the max of all the lists
+ .max(Double::compareTo).orElse(0.0);
+
+ // the normal artifact filter often lights up when there's a non-artifactual germline event, which we don't want here
+ final boolean ignoreNormalArtifact = germlineProbability > GERMLINE_PROBABILITY_TO_IGNORE_NORMAL_ARTIFACT;
+
+ final double artifactProbability = probabilitiesByFilter.entrySet().stream()
+ .filter(e -> e.getKey().errorType() != ErrorType.NON_SOMATIC)
+ .filter(e -> !(ignoreNormalArtifact && e.getKey().filterName().equals(GATKVCFConstants.ARTIFACT_IN_NORMAL_FILTER_NAME)))
+ .filter(e -> !e.getKey().filterName().equals(filterName())) // exclude the haplotype filter itself, which would be circular
.flatMap(e -> e.getValue().stream()) // the value is a list of double, we need the max of all the lists
.max(Double::compareTo).orElse(0.0);
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/mutect/filtering/M2FiltersArgumentCollection.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/mutect/filtering/M2FiltersArgumentCollection.java
index cec046e4026..26218b5008d 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/mutect/filtering/M2FiltersArgumentCollection.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/mutect/filtering/M2FiltersArgumentCollection.java
@@ -55,6 +55,7 @@ public class M2FiltersArgumentCollection {
* Hard filter thresholds
*/
public static final String MAX_EVENTS_IN_REGION_LONG_NAME = "max-events-in-region";
+ public static final String MAX_EVENTS_IN_HAPLOTYPE_LONG_NAME = "max-events-in-haplotype";
public static final String MAX_ALT_ALLELE_COUNT_LONG_NAME = "max-alt-allele-count";
public static final String UNIQUE_ALT_READ_COUNT_LONG_NAME = "unique-alt-read-count";
public static final String MIN_MEDIAN_MAPPING_QUALITY_LONG_NAME = "min-median-mapping-quality";
@@ -65,7 +66,8 @@ public class M2FiltersArgumentCollection {
public static final String MIN_READS_ON_EACH_STRAND_LONG_NAME = "min-reads-per-strand";
public static final String MIN_AF_LONG_NAME = "min-allele-fraction";
- private static final int DEFAULT_MAX_EVENTS_IN_REGION = 2;
+ private static final int DEFAULT_MAX_EVENTS_IN_REGION = 3;
+ private static final int DEFAULT_MAX_EVENTS_IN_HAPLOTYPE = 2;
private static final int DEFAULT_MAX_ALT_ALLELES = 1;
private static final int DEFAULT_MIN_UNIQUE_ALT_READS = 0;
private static final int DEFAULT_MIN_MEDIAN_MAPPING_QUALITY = 30;
@@ -77,9 +79,12 @@ public class M2FiltersArgumentCollection {
private static final int DEFAULT_MIN_READS_ON_EACH_STRAND = 0;
private static final double DEFAULT_MIN_AF = 0;
- @Argument(fullName = MAX_EVENTS_IN_REGION_LONG_NAME, optional = true, doc = "Maximum events in a single assembly region. Filter all variants if exceeded.")
+ @Argument(fullName = MAX_EVENTS_IN_REGION_LONG_NAME, optional = true, doc = "Maximum number of non-germline events in a single assembly region. Filter all variants if exceeded.")
public int maxEventsInRegion = DEFAULT_MAX_EVENTS_IN_REGION;
+ @Argument(fullName = MAX_EVENTS_IN_HAPLOTYPE_LONG_NAME, optional = true, doc = "Maximum number of non-germline events in a variant allele's best haplotype.")
+ public int maxEventsInHaplotype = DEFAULT_MAX_EVENTS_IN_HAPLOTYPE;
+
@Argument(fullName = MAX_ALT_ALLELE_COUNT_LONG_NAME, optional = true, doc = "Maximum alt alleles per site.")
public int numAltAllelesThreshold = DEFAULT_MAX_ALT_ALLELES;
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/mutect/filtering/Mutect2FilteringEngine.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/mutect/filtering/Mutect2FilteringEngine.java
index 159f67df68d..77d2055587c 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/mutect/filtering/Mutect2FilteringEngine.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/mutect/filtering/Mutect2FilteringEngine.java
@@ -312,7 +312,7 @@ private void buildFiltersList(final M2FiltersArgumentCollection MTFAC) {
}
if (!MTFAC.mitochondria && !MTFAC.microbial) {
- filters.add(new ClusteredEventsFilter(MTFAC.maxEventsInRegion));
+ filters.add(new ClusteredEventsFilter(MTFAC.maxEventsInRegion, MTFAC.maxEventsInHaplotype));
filters.add(new MultiallelicFilter(MTFAC.numAltAllelesThreshold));
filters.add(new FragmentLengthFilter(MTFAC.maxMedianFragmentLengthDifference));
filters.add(new PolymeraseSlippageFilter(MTFAC.minSlippageLength, MTFAC.slippageRate));
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/JointGermlineCNVSegmentation.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/JointGermlineCNVSegmentation.java
index 81138035bb1..e447d2c1086 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/JointGermlineCNVSegmentation.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/JointGermlineCNVSegmentation.java
@@ -646,11 +646,11 @@ public void closeTool(){
* @param minQuality drop events with quality lower than this
* @return a new record or null
*/
- public static SVCallRecord createDepthOnlyFromGCNVWithOriginalGenotypes(final VariantContext variant,
- final double minQuality,
- final Set<String> allosomalContigs,
- final int refAutosomalCopyNumber,
- final SampleDB sampleDB) {
+ public SVCallRecord createDepthOnlyFromGCNVWithOriginalGenotypes(final VariantContext variant,
+ final double minQuality,
+ final Set<String> allosomalContigs,
+ final int refAutosomalCopyNumber,
+ final SampleDB sampleDB) {
Utils.nonNull(variant);
if (variant.getGenotypes().size() == 1) {
//only cluster good variants
@@ -672,7 +672,7 @@ public static SVCallRecord createDepthOnlyFromGCNVWithOriginalGenotypes(final Va
.collect(Collectors.toList());
svBuilder.genotypes(genotypesWithECN);
- final SVCallRecord baseRecord = SVCallRecordUtils.create(svBuilder.make(), true);
+ final SVCallRecord baseRecord = SVCallRecordUtils.create(svBuilder.make(), true, dictionary);
final List<Genotype> nonRefGenotypes = baseRecord.getGenotypes().stream()
.filter(g -> !(g.isHomRef() || (g.isNoCall() && !g.hasExtendedAttribute(GATKSVVCFConstants.COPY_NUMBER_FORMAT))))
.collect(Collectors.toList());
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVAnnotateEngine.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVAnnotateEngine.java
index 0e840e4652d..d49a351c39f 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVAnnotateEngine.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVAnnotateEngine.java
@@ -9,10 +9,10 @@
import org.broadinstitute.hellbender.exceptions.UserException;
import org.broadinstitute.hellbender.tools.spark.sv.utils.GATKSVVCFConstants;
import org.broadinstitute.hellbender.tools.spark.sv.utils.SVUtils;
+import org.broadinstitute.hellbender.tools.sv.SVCallRecordUtils;
import org.broadinstitute.hellbender.utils.SVInterval;
import org.broadinstitute.hellbender.utils.SVIntervalTree;
import org.broadinstitute.hellbender.utils.SimpleInterval;
-import org.broadinstitute.hellbender.utils.Utils;
import org.broadinstitute.hellbender.utils.codecs.gtf.GencodeGtfFeature;
import org.broadinstitute.hellbender.utils.codecs.gtf.GencodeGtfTranscriptFeature;
import org.broadinstitute.hellbender.utils.variant.GATKSVVariantContextUtils;
@@ -57,36 +57,6 @@ public class SVAnnotateEngine {
GATKSVVCFConstants.ComplexVariantSubtype.delINVdup,
GATKSVVCFConstants.ComplexVariantSubtype.dDUP_iDEL);
- // Mini class to package SV type and interval into one object
- @VisibleForTesting
- protected static final class SVSegment {
- private final GATKSVVCFConstants.StructuralVariantAnnotationType intervalSVType;
- private final SimpleInterval interval;
- protected SVSegment(final GATKSVVCFConstants.StructuralVariantAnnotationType svType, final SimpleInterval interval) {
- this.intervalSVType = svType;
- this.interval = interval;
- }
- public GATKSVVCFConstants.StructuralVariantAnnotationType getIntervalSVType() {
- return intervalSVType;
- }
- public SimpleInterval getInterval() {
- return interval;
- }
-
- @Override
- public boolean equals(Object o) {
- if (this == o) return true;
- if (o == null || getClass() != o.getClass()) return false;
- final SVSegment svSegment = (SVSegment) o;
- return intervalSVType == svSegment.intervalSVType && interval.equals(svSegment.interval);
- }
-
- @Override
- public int hashCode() {
- return Objects.hash(intervalSVType, interval);
- }
- }
-
// Container class for all SVIntervalTree trees created from the GTF
@VisibleForTesting
public static final class GTFIntervalTreesContainer {
@@ -862,12 +832,7 @@ protected static boolean includesDispersedDuplication(final GATKSVVCFConstants.C
protected Map<String, Object> annotateStructuralVariant(final VariantContext variant) {
final Map<String, Set<String>> variantConsequenceDict = new HashMap<>();
final GATKSVVCFConstants.StructuralVariantAnnotationType overallSVType = getSVType(variant);
- final String complexTypeString = variant.getAttributeAsString(GATKSVVCFConstants.CPX_TYPE, null);
- GATKSVVCFConstants.ComplexVariantSubtype complexType = null;
- if (complexTypeString != null) {
- // replace / in CTX_PP/QQ and CTX_PQ/QP with _ to match ComplexVariantSubtype constants which cannot contain slashes
- complexType = GATKSVVCFConstants.ComplexVariantSubtype.valueOf(complexTypeString.replace("/", "_"));
- }
+ final GATKSVVCFConstants.ComplexVariantSubtype complexType = SVCallRecordUtils.getComplexSubtype(variant);
final boolean includesDispersedDuplication = includesDispersedDuplication(complexType, COMPLEX_SUBTYPES_WITH_DISPERSED_DUP);
final List<SVSegment> svSegmentsForGeneOverlaps = getSVSegments(variant, overallSVType, maxBreakendLen, complexType);
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCluster.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCluster.java
index 4e483652d3c..791befb79fb 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCluster.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCluster.java
@@ -378,7 +378,7 @@ public void closeTool() {
@Override
public void apply(final VariantContext variant, final ReadsContext readsContext,
final ReferenceContext referenceContext, final FeatureContext featureContext) {
- final SVCallRecord call = SVCallRecordUtils.create(variant);
+ final SVCallRecord call = SVCallRecordUtils.create(variant, dictionary);
final SVCallRecord filteredCall;
if (fastMode && call.getType() != GATKSVVCFConstants.StructuralVariantAnnotationType.CNV) {
// Strip out non-carrier genotypes to save memory and compute
@@ -447,9 +447,9 @@ public VariantContext buildVariantContext(final SVCallRecord call) {
// Build new variant
final SVCallRecord finalCall = new SVCallRecord(newId, call.getContigA(), call.getPositionA(), call.getStrandA(),
- call.getContigB(), call.getPositionB(), call.getStrandB(), call.getType(), call.getComplexSubtype(), call.getLength(),
- call.getAlgorithms(), call.getAlleles(), filledGenotypes, call.getAttributes(), call.getFilters(),
- call.getLog10PError(), dictionary);
+ call.getContigB(), call.getPositionB(), call.getStrandB(), call.getType(), call.getComplexSubtype(),
+ call.getComplexEventIntervals(), call.getLength(), call.getAlgorithms(), call.getAlleles(), filledGenotypes,
+ call.getAttributes(), call.getFilters(), call.getLog10PError(), dictionary);
final VariantContextBuilder builder = SVCallRecordUtils.getVariantBuilder(finalCall);
if (omitMembers) {
builder.rmAttribute(GATKSVVCFConstants.CLUSTER_MEMBER_IDS_KEY);
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVConcordance.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVConcordance.java
index d53c3a22b0b..56c389539c6 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVConcordance.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVConcordance.java
@@ -180,7 +180,7 @@ public void apply(final TruthVersusEval truthVersusEval, final ReadsContext read
}
private void add(final VariantContext variant, final boolean isTruth) {
- SVCallRecord record = SVCallRecordUtils.create(variant);
+ SVCallRecord record = SVCallRecordUtils.create(variant, dictionary);
if (!record.getContigA().equals(currentContig)) {
flushClusters(true);
currentContig = record.getContigA();
@@ -199,8 +199,8 @@ protected SVCallRecord minimizeTruthFootprint(final SVCallRecord item) {
final List<Genotype> genotypes = item.getGenotypes().stream().map(SVConcordance::stripTruthGenotype).collect(Collectors.toList());
return new SVCallRecord(item.getId(), item.getContigA(), item.getPositionA(),
item.getStrandA(), item.getContigB(), item.getPositionB(), item.getStrandB(), item.getType(),
- item.getCpxSubtype(), item.getLength(), item.getAlgorithms(), item.getAlleles(), genotypes,
- item.getAttributes(), item.getFilters(), item.getLog10PError(), dictionary);
+ item.getComplexSubtype(), item.getComplexEventIntervals(), item.getLength(), item.getAlgorithms(),
+ item.getAlleles(), genotypes, item.getAttributes(), item.getFilters(), item.getLog10PError(), dictionary);
}
private static Genotype stripTruthGenotype(final Genotype genotype) {
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVSegment.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVSegment.java
new file mode 100644
index 00000000000..29f4d5464d0
--- /dev/null
+++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVSegment.java
@@ -0,0 +1,56 @@
+package org.broadinstitute.hellbender.tools.walkers.sv;
+
+import htsjdk.samtools.util.Locatable;
+import org.broadinstitute.hellbender.tools.spark.sv.utils.GATKSVVCFConstants;
+import org.broadinstitute.hellbender.utils.SimpleInterval;
+import org.broadinstitute.hellbender.utils.Utils;
+
+import java.util.Objects;
+
+// Mini class to package SV type and interval into one object
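+// Implements Locatable so a segment can be queried by contig, start, and end like any other interval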
+public class SVSegment implements Locatable {
+ protected final GATKSVVCFConstants.StructuralVariantAnnotationType intervalSVType;
+ protected final SimpleInterval interval;
+
+ public SVSegment(final GATKSVVCFConstants.StructuralVariantAnnotationType svType, final SimpleInterval interval) {
+ Utils.nonNull(interval);
+ this.intervalSVType = svType;
+ this.interval = interval;
+ }
+
+ public GATKSVVCFConstants.StructuralVariantAnnotationType getIntervalSVType() {
+ return intervalSVType;
+ }
+
+ @Override
+ public String getContig() {
+ return interval.getContig();
+ }
+
+ @Override
+ public int getStart() {
+ return interval.getStart();
+ }
+
+ @Override
+ public int getEnd() {
+ return interval.getEnd();
+ }
+
+ public SimpleInterval getInterval() {
+ return interval;
+ }
+
+ @Override
+ public boolean equals(Object o) {
+ if (this == o) return true;
+ if (o == null || getClass() != o.getClass()) return false;
+ final SVSegment svSegment = (SVSegment) o;
+ return intervalSVType == svSegment.intervalSVType && interval.equals(svSegment.interval);
+ }
+
+ @Override
+ public int hashCode() {
+ return Objects.hash(intervalSVType, interval);
+ }
+}
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/variantrecalling/HaplotypeBasedVariantRecaller.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/variantrecalling/HaplotypeBasedVariantRecaller.java
index 6e02a782a15..33108ec5968 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/variantrecalling/HaplotypeBasedVariantRecaller.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/variantrecalling/HaplotypeBasedVariantRecaller.java
@@ -135,11 +135,11 @@ public void traverse() {
}
// get reads overlapping haplotypes
- final Map> readsByReader = readsReader.getReads(haplotypeSpan, vcLoc);
+ final Map> readsByReader = readsReader.getReads(haplotypeSpan, vcLoc);
final List<VariantContext> variants = new LinkedList<>(Arrays.asList(vc));
if ( logger.isDebugEnabled() ) {
int readCount = 0;
- for ( Collection reads : readsByReader.values() )
+ for ( Collection reads : readsByReader.values() )
readCount += reads.size();
logger.debug(String.format("vcLoc %s, haplotypeSpan: %s, %d haplotypes, %d reads",
vcLoc.toString(), haplotypeSpan.toString(), processedHaplotypes.size(), readCount, variants.size()));
@@ -150,16 +150,16 @@ public void traverse() {
final List