Skip to content

Commit

Permalink
VCFReader can now accept a TabixIndexedFile and a region to get filte…
Browse files Browse the repository at this point in the history
…red vcf records, fixes #77, fixes #76, already fixed: fixes #68, fixes #73
  • Loading branch information
charlesgregory committed Jun 8, 2021
1 parent bb0f95b commit 7c1e89a
Show file tree
Hide file tree
Showing 2 changed files with 131 additions and 20 deletions.
2 changes: 2 additions & 0 deletions .github/workflows/unittests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,8 @@ jobs:
tabix gff_file.gff.gz
bgzip -c bed_file.bed > bed_file.bed.gz
tabix bed_file.bed.gz
bgzip -c vcf_file.vcf > vcf_file.vcf.gz
tabix vcf_file.vcf.gz
- name: Run tests
run: dub -q test -b=unittest-cov
Expand Down
149 changes: 129 additions & 20 deletions source/dhtslib/vcf/reader.d
Original file line number Diff line number Diff line change
@@ -1,23 +1,37 @@
module dhtslib.vcf.reader;

import std.string: fromStringz, toStringz;
import std.utf: toUTFz;
import std.traits : ReturnType;

import dhtslib.vcf.header;
import dhtslib.vcf.record;
import dhtslib.tabix;
import dhtslib.coordinates;
import htslib.vcf;
import htslib.hts_log;
import htslib.kstring;

alias BCFReader = VCFReader;

auto VCFReader(string fn, int MAX_UNPACK = BCF_UN_ALL)
{
return VCFReaderImpl!(CoordSystem.zbc, false)(fn, MAX_UNPACK);
}

auto VCFReader(CoordSystem cs)(TabixIndexedFile tbxFile, ChromCoordinates!cs region, int MAX_UNPACK = BCF_UN_ALL)
{
return VCFReaderImpl!(cs,true)(tbxFile, region, MAX_UNPACK);
}

/** Basic support for reading VCF, BCF files */
struct VCFReader
struct VCFReaderImpl(CoordSystem cs, bool isTabixFile)
{
// htslib data structures
vcfFile *fp; /// htsFile
//bcf_hdr_t *hdr; /// header
VCFHeader *vcfhdr; /// header wrapper -- no copies
bcf1_t* b; /// record for use in iterator, will be recycled

int MAX_UNPACK; /// see htslib.vcf

private static int refct;
Expand All @@ -27,24 +41,54 @@ struct VCFReader
this.refct++;
}
@disable this();
/// read existing VCF file
/// MAX_UNPACK: setting alternate value could speed reading
this(string fn, int MAX_UNPACK = BCF_UN_ALL)
{
import htslib.hts : hts_set_threads;
static if(isTabixFile){

TabixIndexedFile tbx; /// For tabix use
ReturnType!(this.initializeTabixRange) tbxRange; /// For tabix use

if (fn == "") throw new Exception("Empty filename passed to VCFReader constructor");
this.fp = vcf_open(toStringz(fn), "r"c.ptr);
if (!this.fp) throw new Exception("Could not hts_open file");

hts_set_threads(this.fp, 1); // extra decoding thread
/// TabixIndexedFile and coordinates ctor
this(TabixIndexedFile tbxFile, ChromCoordinates!cs region, int MAX_UNPACK = BCF_UN_ALL)
{
this.tbx = tbxFile;
this.tbxRange = initializeTabixRange(region);
this.tbxRange.empty;
/// read the header from TabixIndexedFile
bcf_hdr_t * hdrPtr = bcf_hdr_init(toUTFz!(char *)("r"));
auto err = bcf_hdr_parse(hdrPtr, toUTFz!(char *)(this.tbx.header));
this.vcfhdr = new VCFHeader(hdrPtr);

this.b = bcf_init1();
this.b.max_unpack = MAX_UNPACK;
this.MAX_UNPACK = MAX_UNPACK;
}

/// copy the TabixIndexedFile.region range
auto initializeTabixRange(ChromCoordinates!cs region)
{
return this.tbx.region(region);
}
}else{
/// read existing VCF file
/// MAX_UNPACK: setting alternate value could speed reading
this(string fn, int MAX_UNPACK = BCF_UN_ALL)
{
import htslib.hts : hts_set_threads;

this.vcfhdr = new VCFHeader( bcf_hdr_read(this.fp));
assert(!isTabixFile);
if (fn == "") throw new Exception("Empty filename passed to VCFReader constructor");
this.fp = vcf_open(toStringz(fn), "r"c.ptr);
if (!this.fp) throw new Exception("Could not hts_open file");

hts_set_threads(this.fp, 1); // extra decoding thread

this.b = bcf_init1();
this.b.max_unpack = MAX_UNPACK;
this.MAX_UNPACK = MAX_UNPACK;
this.vcfhdr = new VCFHeader( bcf_hdr_read(this.fp));

this.b = bcf_init1();
this.b.max_unpack = MAX_UNPACK;
this.MAX_UNPACK = MAX_UNPACK;
}
}

/// dtor
~this()
{
Expand All @@ -53,8 +97,10 @@ struct VCFReader
// block file close and bcf1_t free() with reference counting
// to allow VCFReader to implement Range interface
if(!this.refct) {
const ret = vcf_close(this.fp);
if (ret != 0) hts_log_error(__FUNCTION__, "couldn't close VCF after reading");
static if(!isTabixFile){
const ret = vcf_close(this.fp);
if (ret != 0) hts_log_error(__FUNCTION__, "couldn't close VCF after reading");
}

// Deallocate header
//bcf_hdr_destroy(this.hdr);
Expand All @@ -72,7 +118,7 @@ struct VCFReader
{
return this.vcfhdr;
}

/** VCF version, e.g. VCFv4.2 */
@property string vcfVersion() { return cast(string) fromStringz( bcf_hdr_get_version(this.vcfhdr.hdr) ).idup; }

Expand Down Expand Up @@ -117,7 +163,15 @@ struct VCFReader
// int bcf_read(htsFile *fp, const(bcf_hdr_t) *h, bcf1_t *v);
// documentation claims returns -1 on critical errors, 0 otherwise
// however it looks like -1 is EOF and -2 is critical errors?
immutable success = bcf_read(this.fp, this.vcfhdr.hdr, this.b);
static if(isTabixFile){
if(this.tbxRange.empty) return true;
auto str = this.tbxRange.front;
kstring_t kstr;
kputs(toUTFz!(char *)(str), &kstr);
immutable success = vcf_parse(&kstr, this.vcfhdr.hdr, this.b);
this.tbxRange.popFront;
}else
immutable success = bcf_read(this.fp, this.vcfhdr.hdr, this.b);
if (success >= 0) return false;
else if (success == -1) {
// EOF
Expand Down Expand Up @@ -149,4 +203,59 @@ struct VCFReader
// its copy
return VCFRecord(this.vcfhdr, bcf_dup(this.b), this.MAX_UNPACK);
}
}

debug(dhtslib_unittest) unittest
{
import std.stdio;
import htslib.hts_log;
import std.algorithm : map, count;
import std.array : array;
import std.path : buildPath, dirName;
import std.math : approxEqual;
hts_set_log_level(htsLogLevel.HTS_LOG_INFO);
hts_log_info(__FUNCTION__, "Testing VCFReader (Tabix)");
hts_log_info(__FUNCTION__, "Loading test file");

auto vcf = VCFReader(buildPath(dirName(dirName(dirName(dirName(__FILE__)))),"htslib","test","tabix","vcf_file.vcf"));
assert(vcf.count == 14);
vcf = VCFReader(buildPath(dirName(dirName(dirName(dirName(__FILE__)))),"htslib","test","tabix","vcf_file.vcf"));
vcf.empty;
VCFRecord rec = vcf.front;
assert(rec.chrom == "1");
assert(rec.pos == 3000149);
assert(rec.refAllele == "C");
assert(rec.altAllelesAsArray == ["T"]);
assert(rec.allelesAsArray == ["C","T"]);
assert(approxEqual(rec.qual,59.2));
assert(rec.filter == "PASS");
// assert(rec. == ["C","T"]);
}

debug(dhtslib_unittest) unittest
{
import std.stdio;
import htslib.hts_log;
import std.algorithm : map, count;
import std.array : array;
import std.path : buildPath, dirName;
import std.math : approxEqual;
hts_set_log_level(htsLogLevel.HTS_LOG_INFO);
hts_log_info(__FUNCTION__, "Testing VCFReader");
hts_log_info(__FUNCTION__, "Loading test file");
auto fn = buildPath(dirName(dirName(dirName(dirName(__FILE__)))),"htslib","test","tabix","vcf_file.vcf.gz");
auto tbx = TabixIndexedFile(fn);
auto vcf = VCFReader(tbx, OBHO("1:3000151-3062916"));
assert(vcf.count == 3);
vcf = VCFReader(tbx, OBHO("1:3000151-3062916"));
vcf.empty;
VCFRecord rec = vcf.front;
assert(rec.chrom == "1");
assert(rec.pos == 3000150);
assert(rec.refAllele == "C");
assert(rec.altAllelesAsArray == ["T"]);
assert(rec.allelesAsArray == ["C","T"]);
assert(approxEqual(rec.qual, 59.2));
assert(rec.filter == "PASS");
// assert(rec. == ["C","T"]);
}

0 comments on commit 7c1e89a

Please sign in to comment.