Skip to content

Commit

Permalink
Support fasta.gz in GATKSparkTool (broadinstitute#5290)
Browse files: browse the repository at this point in the history
* fasta.gz references can now be used in GATKSparkTools
* fixes broadinstitute#5258
  • Loading branch information
tomwhite authored and EdwardDixon committed Nov 9, 2018
1 parent eccb5b1 commit da1bb69
Show file tree
Hide file tree
Showing 6 changed files with 18 additions and 0 deletions.
Original file line number | Diff line number | Diff line change
Expand Up @@ -3,6 +3,7 @@
import htsjdk.samtools.SAMFileHeader;
import htsjdk.samtools.SAMSequenceDictionary;
import htsjdk.samtools.reference.ReferenceSequenceFileFactory;
import htsjdk.samtools.util.GZIIndex;
import htsjdk.samtools.util.IOUtil;
import htsjdk.variant.vcf.VCFHeaderLine;
import org.apache.spark.api.java.JavaRDD;
Expand Down Expand Up @@ -588,6 +589,7 @@ protected static String addReferenceFilesForSpark(JavaSparkContext ctx, String r
Path referencePath = IOUtils.getPath(referenceFile);
Path indexPath = ReferenceSequenceFileFactory.getFastaIndexFileName(referencePath);
Path dictPath = ReferenceSequenceFileFactory.getDefaultDictionaryForReferenceSequence(referencePath);
Path gziPath = GZIIndex.resolveIndexNameForBgzipFile(referencePath);

ctx.addFile(referenceFile);
if (Files.exists(indexPath)) {
Expand All @@ -596,6 +598,9 @@ protected static String addReferenceFilesForSpark(JavaSparkContext ctx, String r
if (Files.exists(dictPath)) {
ctx.addFile(dictPath.toUri().toString());
}
if (Files.exists(gziPath)) {
ctx.addFile(gziPath.toUri().toString());
}

return referencePath.getFileName().toString();
}
Expand Down
2 changes: 2 additions & 0 deletions src/test/java/org/broadinstitute/hellbender/GATKBaseTest.java
Original file line number | Diff line number | Diff line change
Expand Up @@ -49,6 +49,8 @@ public abstract class GATKBaseTest extends BaseTest {
// All of chromosomes 20 and 21 from the b37 reference
public static final String b37_reference_20_21 = largeFileTestDir + "human_g1k_v37.20.21.fasta";

// Gzip-compressed variant of the b37 chromosome 20/21 reference FASTA (path ends in ".fasta.gz").
// NOTE(review): presumably block-gzipped (bgzip) with .fai/.gzi companion indexes alongside it — confirm.
public static final String b37_reference_20_21_gz = largeFileTestDir + "human_g1k_v37.20.21.fasta.gz";

public static final String b37_2bit_reference_20_21 = largeFileTestDir + "human_g1k_v37.20.21.2bit";

public static final String b37_reference_20_21_img = largeFileTestDir + "human_g1k_v37.20.21.fasta.img";
Expand Down
Original file line number | Diff line number | Diff line change
Expand Up @@ -63,6 +63,7 @@ public Object[][] createBQSRTestData() {
final String localResources = getResourceDir();

final String GRCh37Ref_chr2021 = b37_reference_20_21;
final String GRCh37Ref_chr2021_gz = b37_reference_20_21_gz;
final String hiSeqBam_chr20 = localResources + WGS_B37_CH20_1M_1M1K_BAM;
final String hiSeqBam_1read = localResources + "overlappingRead.bam";
final String dbSNPb37_chr20 = localResources + DBSNP_138_B37_CH20_1M_1M1K_VCF;
Expand All @@ -87,6 +88,7 @@ public Object[][] createBQSRTestData() {
// local input/computation/reference
{new BQSRTest(GRCh37Ref_chr2021, hiSeqBam_1read, dbSNPb37_chr2021, "-indels --enable-baq", getResourceDir() + BQSRTestData.EXPECTED_WGS_B37_CH20_1READ_RECAL)},
{new BQSRTest(GRCh37Ref_chr2021, hiSeqBam_chr20, dbSNPb37_chr20, "-indels --enable-baq", getResourceDir() + BQSRTestData.EXPECTED_WGS_B37_CH20_1M_1M1K_RECAL)},
{new BQSRTest(GRCh37Ref_chr2021_gz, hiSeqBam_chr20, dbSNPb37_chr20, "-indels --enable-baq", getResourceDir() + BQSRTestData.EXPECTED_WGS_B37_CH20_1M_1M1K_RECAL)},
{new BQSRTest(GRCh37Ref_chr2021, hiSeqBam_chr20, dbSNPb37_chr20, "", getResourceDir() + BQSRTestData.EXPECTED_WGS_B37_CH20_1M_1M1K_NOINDEL_NOBAQ_RECAL)},
{new BQSRTest(GRCh37Ref_chr2021, hiSeqBam_chr20, dbSNPb37_chr20, "-indels --enable-baq --indels-context-size 4", getResourceDir() + BQSRTestData.EXPECTED_WGS_B37_CH20_1M_1M1K_INDELS_CONTEXT_SIZE_4_RECAL)},
{new BQSRTest(GRCh37Ref_chr2021, hiSeqBam_chr20, dbSNPb37_chr20, "-indels --enable-baq --low-quality-tail 5", getResourceDir() + BQSRTestData.EXPECTED_WGS_B37_CH20_1M_1M1K_LOW_QUALITY_TAIL_5_RECAL)},
Expand Down
3 changes: 3 additions & 0 deletions src/test/resources/large/human_g1k_v37.20.21.fasta.gz
Git LFS file not shown
3 changes: 3 additions & 0 deletions src/test/resources/large/human_g1k_v37.20.21.fasta.gz.fai
Git LFS file not shown
3 changes: 3 additions & 0 deletions src/test/resources/large/human_g1k_v37.20.21.fasta.gz.gzi
Git LFS file not shown

0 comments on commit da1bb69

Please sign in to comment.