diff --git a/.github/workflows/unittests.yml b/.github/workflows/unittests.yml index 0aa340e..040bd1c 100644 --- a/.github/workflows/unittests.yml +++ b/.github/workflows/unittests.yml @@ -58,6 +58,8 @@ jobs: tabix gff_file.gff.gz bgzip -c bed_file.bed > bed_file.bed.gz tabix bed_file.bed.gz + bgzip -c vcf_file.vcf > vcf_file.vcf.gz + tabix vcf_file.vcf.gz - name: Run tests run: dub -q test -b=unittest-cov diff --git a/source/dhtslib/vcf/reader.d b/source/dhtslib/vcf/reader.d index 3ce9ac1..a4570fc 100644 --- a/source/dhtslib/vcf/reader.d +++ b/source/dhtslib/vcf/reader.d @@ -1,23 +1,37 @@ module dhtslib.vcf.reader; import std.string: fromStringz, toStringz; +import std.utf: toUTFz; +import std.traits : ReturnType; import dhtslib.vcf.header; import dhtslib.vcf.record; +import dhtslib.tabix; +import dhtslib.coordinates; import htslib.vcf; import htslib.hts_log; +import htslib.kstring; alias BCFReader = VCFReader; +auto VCFReader(string fn, int MAX_UNPACK = BCF_UN_ALL) +{ + return VCFReaderImpl!(CoordSystem.zbc, false)(fn, MAX_UNPACK); +} + +auto VCFReader(CoordSystem cs)(TabixIndexedFile tbxFile, ChromCoordinates!cs region, int MAX_UNPACK = BCF_UN_ALL) +{ + return VCFReaderImpl!(cs,true)(tbxFile, region, MAX_UNPACK); +} + /** Basic support for reading VCF, BCF files */ -struct VCFReader +struct VCFReaderImpl(CoordSystem cs, bool isTabixFile) { // htslib data structures vcfFile *fp; /// htsFile //bcf_hdr_t *hdr; /// header VCFHeader *vcfhdr; /// header wrapper -- no copies bcf1_t* b; /// record for use in iterator, will be recycled - int MAX_UNPACK; /// see htslib.vcf private static int refct; @@ -27,24 +41,54 @@ struct VCFReader this.refct++; } @disable this(); - /// read existing VCF file - /// MAX_UNPACK: setting alternate value could speed reading - this(string fn, int MAX_UNPACK = BCF_UN_ALL) - { - import htslib.hts : hts_set_threads; + static if(isTabixFile){ + + TabixIndexedFile tbx; /// For tabix use + ReturnType!(this.initializeTabixRange) tbxRange; /// For tabix use - if (fn == "") throw new Exception("Empty filename passed to VCFReader constructor"); - this.fp = vcf_open(toStringz(fn), "r"c.ptr); - if (!this.fp) throw new Exception("Could not hts_open file"); - - hts_set_threads(this.fp, 1); // extra decoding thread + /// TabixIndexedFile and coordinates ctor + this(TabixIndexedFile tbxFile, ChromCoordinates!cs region, int MAX_UNPACK = BCF_UN_ALL) + { + this.tbx = tbxFile; + this.tbxRange = initializeTabixRange(region); + this.tbxRange.empty; + /// read the header from TabixIndexedFile + bcf_hdr_t * hdrPtr = bcf_hdr_init(toUTFz!(char *)("r")); + auto err = bcf_hdr_parse(hdrPtr, toUTFz!(char *)(this.tbx.header)); + this.vcfhdr = new VCFHeader(hdrPtr); + + this.b = bcf_init1(); + this.b.max_unpack = MAX_UNPACK; + this.MAX_UNPACK = MAX_UNPACK; + } + + /// copy the TabixIndexedFile.region range + auto initializeTabixRange(ChromCoordinates!cs region) + { + return this.tbx.region(region); + } + }else{ + /// read existing VCF file + /// MAX_UNPACK: setting alternate value could speed reading + this(string fn, int MAX_UNPACK = BCF_UN_ALL) + { + import htslib.hts : hts_set_threads; - this.vcfhdr = new VCFHeader( bcf_hdr_read(this.fp)); + assert(!isTabixFile); + if (fn == "") throw new Exception("Empty filename passed to VCFReader constructor"); + this.fp = vcf_open(toStringz(fn), "r"c.ptr); + if (!this.fp) throw new Exception("Could not hts_open file"); + + hts_set_threads(this.fp, 1); // extra decoding thread - this.b = bcf_init1(); - this.b.max_unpack = MAX_UNPACK; - this.MAX_UNPACK = MAX_UNPACK; + this.vcfhdr = new VCFHeader( bcf_hdr_read(this.fp)); + + this.b = bcf_init1(); + this.b.max_unpack = MAX_UNPACK; + this.MAX_UNPACK = MAX_UNPACK; + } } + /// dtor ~this() { @@ -53,8 +97,10 @@ struct VCFReader // block file close and bcf1_t free() with reference counting // to allow VCFReader to implement Range interface if(!this.refct) { - const ret = vcf_close(this.fp); - if (ret != 0) hts_log_error(__FUNCTION__, "couldn't close VCF after reading"); + static if(!isTabixFile){ + const ret = vcf_close(this.fp); + if (ret != 0) hts_log_error(__FUNCTION__, "couldn't close VCF after reading"); + } // Deallocate header //bcf_hdr_destroy(this.hdr); @@ -72,7 +118,7 @@ struct VCFReader { return this.vcfhdr; } - + /** VCF version, e.g. VCFv4.2 */ @property string vcfVersion() { return cast(string) fromStringz( bcf_hdr_get_version(this.vcfhdr.hdr) ).idup; } @@ -117,7 +163,15 @@ struct VCFReader // int bcf_read(htsFile *fp, const(bcf_hdr_t) *h, bcf1_t *v); // documentation claims returns -1 on critical errors, 0 otherwise // however it looks like -1 is EOF and -2 is critical errors? - immutable success = bcf_read(this.fp, this.vcfhdr.hdr, this.b); + static if(isTabixFile){ + if(this.tbxRange.empty) return true; + auto str = this.tbxRange.front; + kstring_t kstr; + kputs(toUTFz!(char *)(str), &kstr); + immutable success = vcf_parse(&kstr, this.vcfhdr.hdr, this.b); + this.tbxRange.popFront; + }else + immutable success = bcf_read(this.fp, this.vcfhdr.hdr, this.b); if (success >= 0) return false; else if (success == -1) { // EOF @@ -149,4 +203,59 @@ struct VCFReader // its copy return VCFRecord(this.vcfhdr, bcf_dup(this.b), this.MAX_UNPACK); } +} + +debug(dhtslib_unittest) unittest +{ + import std.stdio; + import htslib.hts_log; + import std.algorithm : map, count; + import std.array : array; + import std.path : buildPath, dirName; + import std.math : approxEqual; + hts_set_log_level(htsLogLevel.HTS_LOG_INFO); + hts_log_info(__FUNCTION__, "Testing VCFReader (Tabix)"); + hts_log_info(__FUNCTION__, "Loading test file"); + + auto vcf = VCFReader(buildPath(dirName(dirName(dirName(dirName(__FILE__)))),"htslib","test","tabix","vcf_file.vcf")); + assert(vcf.count == 14); + vcf = VCFReader(buildPath(dirName(dirName(dirName(dirName(__FILE__)))),"htslib","test","tabix","vcf_file.vcf")); + vcf.empty; + VCFRecord rec = vcf.front; + assert(rec.chrom == "1"); + assert(rec.pos == 3000149); + assert(rec.refAllele == "C"); + assert(rec.altAllelesAsArray == ["T"]); + assert(rec.allelesAsArray == ["C","T"]); + assert(approxEqual(rec.qual,59.2)); + assert(rec.filter == "PASS"); + // assert(rec. == ["C","T"]); +} + +debug(dhtslib_unittest) unittest +{ + import std.stdio; + import htslib.hts_log; + import std.algorithm : map, count; + import std.array : array; + import std.path : buildPath, dirName; + import std.math : approxEqual; + hts_set_log_level(htsLogLevel.HTS_LOG_INFO); + hts_log_info(__FUNCTION__, "Testing VCFReader"); + hts_log_info(__FUNCTION__, "Loading test file"); + auto fn = buildPath(dirName(dirName(dirName(dirName(__FILE__)))),"htslib","test","tabix","vcf_file.vcf.gz"); + auto tbx = TabixIndexedFile(fn); + auto vcf = VCFReader(tbx, OBHO("1:3000151-3062916")); + assert(vcf.count == 3); + vcf = VCFReader(tbx, OBHO("1:3000151-3062916")); + vcf.empty; + VCFRecord rec = vcf.front; + assert(rec.chrom == "1"); + assert(rec.pos == 3000150); + assert(rec.refAllele == "C"); + assert(rec.altAllelesAsArray == ["T"]); + assert(rec.allelesAsArray == ["C","T"]); + assert(approxEqual(rec.qual, 59.2)); + assert(rec.filter == "PASS"); + // assert(rec. == ["C","T"]); } \ No newline at end of file