diff --git a/source/dhtslib/vcf/header.d b/source/dhtslib/vcf/header.d index 302f0dc..ea1dbcb 100644 --- a/source/dhtslib/vcf/header.d +++ b/source/dhtslib/vcf/header.d @@ -1,20 +1,275 @@ +/** + +This module provides structs that encapsulate VCFHeader and HeaderRecord + +`VCFHeader` encapsulates and owns a `bcf_hdr_t*`, +and provides convenience functions to read and write header records. + +`HeaderRecord` provides an easy way to coonstruct new header records and +convert them to bcf_hrec_t * for use by the htslib API. + +*/ + module dhtslib.vcf.header; +import std.datetime; import std.string: fromStringz, toStringz; +import std.format: format; +import std.traits : isArray, isIntegral, isSomeString; +import std.conv: to, ConvException; +import std.algorithm : map; +import std.array : array; +import std.utf : toUTFz; +import dhtslib.vcf; import htslib.vcf; +import htslib.hts_log; + + +/// Struct for easy setting and getting of bcf_hrec_t values for VCFheader +struct HeaderRecord +{ + /// HeaderRecordType type i.e INFO, contig, FORMAT + HeaderRecordType recType = HeaderRecordType.None; + + /// string of HeaderRecordType type i.e INFO, contig, FORMAT ? + /// or could be ##source=program + /// ====== + string key; + + /// mostly empty except for + /// this ##source=program + /// ======= + string value; + + /// number kv pairs + int nkeys; -/** Wrapper around bcf_hdr_t + /// kv pair keys + string[] keys; - Q: Why do we have VCFHeader, but not SAMHeader? - A: Most (all)? of the SAM (bam1_t) manipulation functions do not require a ptr to the header, - whereas almost all of the VCF (bcf1_t) manipulation functions do. Therefore, we track bcf_hdr_t* - inside each VCFRecord; this is wrapped by VCFHeader for future convenience (for example, - now we have @property nsamples; may move the tag reader and writer functions here?) + /// kv pair values + string[] vals; - In order to avoid double free()'ing an instance bcf_hdr_t, - this wrapper will be the authoritative holder of of bcf_hdr_t ptrs, - it shall be passed around by reference, and copies are disabled. + /// HDR IDX value + int idx = -1; + + /// HDR Length value A, R, G, ., FIXED + HeaderLengths lenthType = HeaderLengths.None; + + /// if HDR Length value is FIXED + /// this is the number + int length = -1; + + /// HDR Length value INT, FLOAT, STRING + HeaderTypes valueType = HeaderTypes.None; + + invariant + { + assert(this.keys.length == this.vals.length); + } + /// ctor from a bcf_hrec_t + this(bcf_hrec_t * rec){ + + /// Set the easy stuff + this.recType = cast(HeaderRecordType) rec.type; + this.key = fromStringz(rec.key).dup; + this.value = fromStringz(rec.value).dup; + this.nkeys = rec.nkeys; + + /// get the kv pairs + /// special logic for Number and Type + for(auto i=0; i < rec.nkeys; i++){ + keys ~= fromStringz(rec.keys[i]).dup; + vals ~= fromStringz(rec.vals[i]).dup; + if(keys[i] == "Number") + { + switch(vals[i]){ + case "A": + this.lenthType = HeaderLengths.OnePerAltAllele; + break; + case "G": + this.lenthType = HeaderLengths.OnePerGenotype; + break; + case "R": + this.lenthType = HeaderLengths.OnePerAllele; + break; + case ".": + this.lenthType = HeaderLengths.Variable; + break; + default: + this.lenthType = HeaderLengths.Fixed; + this.length = vals[i].to!int; + break; + } + } + if(keys[i] == "Type") + { + switch(vals[i]){ + case "Flag": + this.valueType = HeaderTypes.Flag; + break; + case "Integer": + this.valueType = HeaderTypes.Integer; + break; + case "Float": + this.valueType = HeaderTypes.Float; + break; + case "Character": + this.valueType = HeaderTypes.Character; + break; + case "String": + this.valueType = HeaderTypes.String; + break; + default: + throw new Exception(vals[i]~" is not a know BCF Header Type"); + } + } + if(keys[i] == "IDX"){ + this.nkeys--; + this.idx = this.vals[$-1].to!int; + this.keys = this.keys[0..$-1]; + this.vals = this.vals[0..$-1]; + } + + } + } + + /// set Record Type i.e INFO, FORMAT ... + void setHeaderRecordType(HeaderRecordType line) + { + this.recType = line; + this.key = HeaderRecordTypeStrings[line]; + } + + /// get Record Type i.e INFO, FORMAT ... + HeaderRecordType getHeaderRecordType() + { + return this.recType; + } + + /// set Value Type length with integer + void setLength(T)(T number) + if(isIntegral!T) + { + this.lenthType = HeaderLengths.Fixed; + this["Number"] = number.to!string; + } + + /// set Value Type length i.e A, R, G, . + void setLength(HeaderLengths number) + { + this.lenthType = number; + this["Number"] = HeaderLengthsStrings[number]; + } + + /// get Value Type length + string getLength() + { + return this["Number"]; + } + + /// set Value Type i.e Integer, String, Float + void setValueType(HeaderTypes type) + { + this.valueType = type; + this["Type"] = HeaderTypesStrings[type]; + } + + /// get Value Type i.e Integer, String, Float + HeaderTypes getValueType() + { + return this.valueType; + } + + /// set ID field + void setID(string id) + { + this["ID"] = id; + } + + /// get ID field + string getID() + { + return this["ID"]; + } + + /// set Description field + void setDescription(string des) + { + this["Description"] = des; + } + + /// get Description field + string getDescription() + { + return this["Description"]; + } + + /// get a value from the KV pairs + /// if key isn't present thows exception + ref auto opIndex(string index) + { + foreach (i, string key; keys) + { + if(key == index){ + return vals[i]; + } + } + throw new Exception("Key " ~ index ~" not found"); + } + + /// set a value from the KV pairs + /// if key isn't present a new KV pair is + /// added + void opIndexAssign(string value, string index) + { + foreach (i, string key; keys) + { + if(key == index){ + vals[i] = value; + return; + } + } + this.nkeys++; + keys~=index; + vals~=value; + } + + /// convert to bcf_hrec_t for use with htslib functions + bcf_hrec_t * convert(bcf_hdr_t * hdr) + { + if(this.recType == HeaderRecordType.Info || this.recType == HeaderRecordType.Format){ + assert(this.valueType != HeaderTypes.None); + assert(this.lenthType != HeaderLengths.None); + } + + auto str = this.toString; + int parsed; + auto rec = bcf_hdr_parse_line(hdr, toUTFz!(char *)(str), &parsed); + rec.type = this.recType; + return rec; + } + + /// print a string representation of the header record + string toString() + { + string ret = "##" ~ this.key ~ "=" ~ this.value; + if(this.nkeys > 0){ + ret ~= "<"; + for(auto i =0; i < this.nkeys - 1; i++) + { + ret ~= this.keys[i] ~ "=" ~ this.vals[i] ~ ", "; + } + ret ~= this.keys[$-1] ~ "=" ~ this.vals[$-1] ~ ">"; + } + + return ret; + } +} + +/** VCFHeader encapsulates `bcf_hdr_t*` + and provides convenience wrappers to manipulate the header metadata/records. */ struct VCFHeader { @@ -35,6 +290,11 @@ struct VCFHeader assert(this.hdr != null); } + /// copy this header + auto dup(){ + return VCFHeader(bcf_hdr_dup(this.hdr)); + } + /// List of contigs in the header @property string[] sequences() { @@ -61,4 +321,427 @@ struct VCFHeader pragma(inline, true) @property int nsamples() { return bcf_hdr_nsamples(this.hdr); } + int getSampleId(string sam){ + auto ret = bcf_hdr_id2int(this.hdr, HeaderDictTypes.Sample, toUTFz!(char *)(sam)); + if(ret == -1) hts_log_error(__FUNCTION__, "Couldn't find sample in header: " ~ sam); + return ret; + } + + // TODO + /// copy header lines from a template without overwiting existing lines + void copyHeaderLines(bcf_hdr_t *other) + { + assert(this.hdr != null); + assert(0); + // bcf_hdr_t *bcf_hdr_merge(bcf_hdr_t *dst, const(bcf_hdr_t) *src); + } + + /// Add sample to this VCF + /// * int bcf_hdr_add_sample(bcf_hdr_t *hdr, const(char) *sample); + int addSample(string name) + in { assert(name != ""); } + do + { + assert(this.hdr != null); + + bcf_hdr_add_sample(this.hdr, toStringz(name)); + + // AARRRRGGGHHHH + // https://github.com/samtools/htslib/issues/767 + bcf_hdr_sync(this.hdr); + + return 0; + } + + /** VCF version, e.g. VCFv4.2 */ + @property string vcfVersion() { return fromStringz( bcf_hdr_get_version(this.hdr) ).idup; } + + /// Add a new header line + int addHeaderLineKV(string key, string value) + { + // TODO check that key is not Info, FILTER, FORMAT (or contig?) + string line = format("##%s=%s", key, value); + + auto ret = bcf_hdr_append(this.hdr, toStringz(line)); + if(ret < 0) + hts_log_error(__FUNCTION__, "Couldn't add header line with key=%s and value =%s".format(key, value)); + auto notAdded = bcf_hdr_sync(this.hdr); + if(notAdded < 0) + hts_log_error(__FUNCTION__, "Couldn't add header line with key=%s and value =%s".format(key, value)); + return ret; + } + + /// Add a new header line -- must be formatted ##key=value + int addHeaderLineRaw(string line) + { + assert(this.hdr != null); + // int bcf_hdr_append(bcf_hdr_t *h, const(char) *line); + const auto ret = bcf_hdr_append(this.hdr, toStringz(line)); + bcf_hdr_sync(this.hdr); + return ret; + } + + /// Add a new header line using HeaderRecord + int addHeaderRecord(HeaderRecord rec) + { + assert(this.hdr != null); + auto ret = bcf_hdr_add_hrec(this.hdr, rec.convert(this.hdr)); + if(ret < 0) + hts_log_error(__FUNCTION__, "Couldn't add HeaderRecord"); + auto notAdded = bcf_hdr_sync(this.hdr); + if(notAdded != 0) + hts_log_error(__FUNCTION__, "Couldn't add HeaderRecord"); + return ret; + } + + /// Remove all header lines of a particular type + void removeHeaderLines(HeaderRecordType linetype) + { + bcf_hdr_remove(this.hdr, linetype, null); + bcf_hdr_sync(this.hdr); + } + + /// Remove a header line of a particular type with the key + void removeHeaderLines(HeaderRecordType linetype, string key) + { + bcf_hdr_remove(this.hdr, linetype, toStringz(key)); + bcf_hdr_sync(this.hdr); + } + + /// get a header record via ID field + HeaderRecord getHeaderRecord(HeaderRecordType linetype, string id) + { + return this.getHeaderRecord(linetype, "ID", id); + } + + /// get a header record via a string value pair + HeaderRecord getHeaderRecord(HeaderRecordType linetype, string key, string value) + { + auto rec = bcf_hdr_get_hrec(this.hdr, linetype, toUTFz!(const(char) *)(key),toUTFz!(const(char) *)(value), null); + if(!rec) throw new Exception("Record could not be found"); + auto ret = HeaderRecord(rec); + // bcf_hrec_destroy(rec); + return ret; + } + + /// Add a filedate= headerline, which is not called out specifically in the spec, + /// but appears in the spec's example files. We could consider allowing a param here. + int addFiledate() + { + return addHeaderLineKV("filedate", (cast(Date) Clock.currTime()).toISOString ); + } + + /** Add INFO (§1.2.2) or FORMAT (§1.2.4) tag + + The INFO tag describes row-specific keys used in the INFO column; + The FORMAT tag describes sample-specific keys used in the last, and optional, genotype column. + + Template parameter: string; must be INFO or FORMAT + + The first four parameters are required; NUMBER and TYPE have specific allowable values. + source and version are optional, but recommended (for INFO only). + + * id: ID tag + * number: NUMBER tag; here a string because it can also take special values {A,R,G,.} (see §1.2.2) + * type: Integer, Float, Flag, Character, and String + * description: Text description; will be double quoted + * source: Annotation source (eg dbSNP) + * version: Annotation version (eg 142) + */ + + void addHeaderLine(HeaderRecordType lineType, T)(string id, T number, HeaderTypes type, + string description="", + string source="", + string _version="") + if((isIntegral!T || is(T == HeaderLengths)) && lineType != HeaderRecordType.None ) + { + HeaderRecord rec; + rec.setHeaderRecordType = lineType; + rec.setID(id); + rec.setLength(number); + rec.setValueType(type); + static if(lineType == HeaderRecordType.Info || lineType == HeaderRecordType.Filter || lineType == HeaderRecordType.FORMAT){ + if(description == ""){ + throw new Exception("description cannot be empty for " ~ HeaderRecordTypeStrings[lineType]); + } + } + rec.setDescription(description); + if(source != "") + rec["source"] = "\"%s\"".format(source); + if(_version != "") + rec["version"] = "\"%s\"".format(_version); + + this.addHeaderRecord(rec); + } + + /** Add FILTER tag (§1.2.3) */ + void addHeaderLine(HeaderRecordType lineType)(string id, string description) + if(lineType == HeaderRecordType.Filter) + { + HeaderRecord rec; + rec.setHeaderRecordType = lineType; + rec.setID(id); + rec.setDescription("\"%s\"".format(description)); + + this.addHeaderRecord(rec); + } + + /** Add FILTER tag (§1.2.3) */ + deprecated void addFilter(string id, string description) + { + addHeaderLine!(HeaderRecordType.Filter)(id, description); + } + + /// string representation of header + string toString(){ + import htslib.kstring; + kstring_t s; + + const int ret = bcf_hdr_format(this.hdr, 0, &s); + if (ret) + { + hts_log_error(__FUNCTION__, + format("bcf_hdr_format returned nonzero (%d) (likely EINVAL, invalid bcf_hdr_t struct?)", ret)); + return "[VCFHeader bcf_hdr_format parse_error]"; + } + + return cast(string) s.s[0 .. s.l]; + } +} + +/// +debug(dhtslib_unittest) +unittest +{ + import std.exception: assertThrown; + import std.stdio: writeln, writefln; + + hts_set_log_level(htsLogLevel.HTS_LOG_TRACE); + + + auto hdr = VCFHeader(bcf_hdr_init("w\0"c.ptr)); + + hdr.addHeaderLineRaw("##INFO="); + hdr.addHeaderLineKV("INFO", ""); + // ##INFO= + hdr.addHeaderLine!(HeaderRecordType.Info)("AF", HeaderLengths.OnePerAltAllele, HeaderTypes.Integer, "Number of Samples With Data"); + hdr.addHeaderLineRaw("##contig="); // @suppress(dscanner.style.long_line) + hdr.addHeaderLineRaw("##FILTER="); + + + // Exercise header + assert(hdr.nsamples == 0); + hdr.addSample("NA12878"); + assert(hdr.nsamples == 1); + assert(hdr.vcfVersion == "VCFv4.2"); +} + +/// +debug(dhtslib_unittest) +unittest +{ + import std.exception: assertThrown; + import std.stdio: writeln, writefln; + + hts_set_log_level(htsLogLevel.HTS_LOG_TRACE); + + + auto hdr = VCFHeader(bcf_hdr_init("w\0"c.ptr)); + + hdr.addHeaderLineRaw("##INFO="); + hdr.addHeaderLineKV("INFO", ""); + + auto rec = hdr.getHeaderRecord(HeaderRecordType.Info,"ID","NS"); + assert(rec.recType == HeaderRecordType.Info); + assert(rec.key == "INFO"); + assert(rec.nkeys == 4); + assert(rec.keys == ["ID", "Number", "Type", "Description"]); + assert(rec.vals == ["NS", "1", "Integer", "\"Number of Samples With Data\""]); + assert(rec["ID"] == "NS"); + + assert(rec.idx == 1); + + writeln(rec.toString); + + + rec = HeaderRecord(rec.convert(hdr.hdr)); + + assert(rec.recType == HeaderRecordType.Info); + assert(rec.key == "INFO"); + assert(rec.nkeys == 4); + assert(rec.keys == ["ID", "Number", "Type", "Description"]); + assert(rec.vals == ["NS", "1", "Integer", "\"Number of Samples With Data\""]); + assert(rec["ID"] == "NS"); + // assert(rec["IDX"] == "1"); + // assert(rec.idx == 1); + + rec = hdr.getHeaderRecord(HeaderRecordType.Info,"ID","NS"); + + assert(rec.recType == HeaderRecordType.Info); + assert(rec.getLength == "1"); + assert(rec.getValueType == HeaderTypes.Integer); + + rec.idx = -1; + + rec["ID"] = "NS2"; + + hdr.addHeaderRecord(rec); + auto hdr2 = hdr.dup; + // writeln(hdr2.toString); + + rec = hdr2.getHeaderRecord(HeaderRecordType.Info,"ID","NS2"); + assert(rec.recType == HeaderRecordType.Info); + assert(rec.key == "INFO"); + assert(rec.nkeys == 4); + assert(rec.keys == ["ID", "Number", "Type", "Description"]); + assert(rec.vals == ["NS2", "1", "Integer", "\"Number of Samples With Data\""]); + assert(rec["ID"] == "NS2"); + + assert(rec.idx == 3); + + rec = HeaderRecord.init; + rec.setHeaderRecordType(HeaderRecordType.Generic); + rec.key = "source"; + rec.value = "hello"; + hdr.addHeaderRecord(rec); + + rec = hdr.getHeaderRecord(HeaderRecordType.Generic,"source","hello"); + assert(rec.recType == HeaderRecordType.Generic); + assert(rec.key == "source"); + assert(rec.value == "hello"); + assert(rec.nkeys == 0); + + hdr.addHeaderLine!(HeaderRecordType.Filter)("nonsense","filter"); + + rec = hdr.getHeaderRecord(HeaderRecordType.Filter,"ID","nonsense"); + assert(rec.recType == HeaderRecordType.Filter); + assert(rec.key == "FILTER"); + assert(rec.value == ""); + assert(rec.getID == "nonsense"); + assert(rec.idx == 4); + + hdr.removeHeaderLines(HeaderRecordType.Filter); + + auto expected = "##fileformat=VCFv4.2\n" ~ + "##INFO=\n"~ + "##INFO=\n"~ + "##INFO=\n"~ + "##source=hello\n"~ + "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n"; + assert(hdr.toString == expected); + + rec = rec.init; + rec.setHeaderRecordType(HeaderRecordType.Contig); + rec.setID("test"); + rec["length"] = "5"; + + hdr.addHeaderRecord(rec); + + assert(hdr.sequences == ["test"]); + hdr.removeHeaderLines(HeaderRecordType.Generic, "source"); + hdr.addFilter("test","test"); + expected = "##fileformat=VCFv4.2\n" ~ + "##INFO=\n"~ + "##INFO=\n"~ + "##INFO=\n"~ + "##contig=\n"~ + "##FILTER=\n"~ + "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n"; + assert(hdr.toString == expected); + rec = hdr.getHeaderRecord(HeaderRecordType.Filter,"test"); + assert(rec.getDescription() == "\"test\""); + + rec = HeaderRecord.init; + rec.setHeaderRecordType(HeaderRecordType.Info); + rec.setID("test"); + rec.setLength(HeaderLengths.OnePerGenotype); + rec.setValueType(HeaderTypes.Integer); + hdr.addHeaderRecord(rec); + + rec = hdr.getHeaderRecord(HeaderRecordType.Info,"test"); + assert(rec.recType == HeaderRecordType.Info); + assert(rec.getLength == "G"); + assert(rec.getID == "test"); + assert(rec.getValueType == HeaderTypes.Integer); + + rec = HeaderRecord.init; + rec.setHeaderRecordType(HeaderRecordType.Info); + rec.setID("test2"); + rec.setLength(HeaderLengths.OnePerAllele); + rec.setValueType(HeaderTypes.Integer); + hdr.addHeaderRecord(rec); + + rec = hdr.getHeaderRecord(HeaderRecordType.Info,"test2"); + assert(rec.recType == HeaderRecordType.Info); + assert(rec.getLength == "R"); + assert(rec.getID == "test2"); + assert(rec.getValueType == HeaderTypes.Integer); + + rec = HeaderRecord.init; + rec.setHeaderRecordType(HeaderRecordType.Info); + rec.setID("test3"); + rec.setLength(HeaderLengths.Variable); + rec.setValueType(HeaderTypes.Integer); + hdr.addHeaderRecord(rec); + + rec = hdr.getHeaderRecord(HeaderRecordType.Info,"test3"); + assert(rec.recType == HeaderRecordType.Info); + assert(rec.getLength == "."); + assert(rec.getID == "test3"); + assert(rec.getValueType == HeaderTypes.Integer); + + rec = HeaderRecord.init; + rec.setHeaderRecordType(HeaderRecordType.Info); + rec.setID("test4"); + rec.setLength(1); + rec.setValueType(HeaderTypes.Flag); + hdr.addHeaderRecord(rec); + + rec = hdr.getHeaderRecord(HeaderRecordType.Info,"test4"); + assert(rec.recType == HeaderRecordType.Info); + assert(rec.getID == "test4"); + assert(rec.getValueType == HeaderTypes.Flag); + + rec = HeaderRecord.init; + rec.setHeaderRecordType(HeaderRecordType.Info); + rec.setID("test5"); + rec.setLength(1); + rec.setValueType(HeaderTypes.Character); + hdr.addHeaderRecord(rec); + + rec = hdr.getHeaderRecord(HeaderRecordType.Info,"test5"); + assert(rec.recType == HeaderRecordType.Info); + assert(rec.getLength == "1"); + assert(rec.getID == "test5"); + assert(rec.getValueType == HeaderTypes.Character); + + rec = HeaderRecord.init; + rec.setHeaderRecordType(HeaderRecordType.Info); + rec.setID("test6"); + rec.setLength(HeaderLengths.Variable); + rec.setValueType(HeaderTypes.String); + hdr.addHeaderRecord(rec); + + rec = hdr.getHeaderRecord(HeaderRecordType.Info,"test6"); + assert(rec.recType == HeaderRecordType.Info); + assert(rec.getLength == "."); + assert(rec.getID == "test6"); + assert(rec.getValueType == HeaderTypes.String); + + expected = "##fileformat=VCFv4.2\n" ~ + "##INFO=\n"~ + "##INFO=\n"~ + "##INFO=\n"~ + "##contig=\n"~ + "##FILTER=\n"~ + "##INFO=\n"~ + "##INFO=\n"~ + "##INFO=\n"~ + "##INFO=\n"~ + "##INFO=\n"~ + "##INFO=\n"~ + "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n"; + writeln(hdr.toString); + assert(hdr.toString == expected); + } \ No newline at end of file diff --git a/source/dhtslib/vcf/package.d b/source/dhtslib/vcf/package.d index 8380f34..46833ee 100644 --- a/source/dhtslib/vcf/package.d +++ b/source/dhtslib/vcf/package.d @@ -3,4 +3,139 @@ module dhtslib.vcf; public import dhtslib.vcf.record; public import dhtslib.vcf.reader; public import dhtslib.vcf.header; -public import dhtslib.vcf.writer; \ No newline at end of file +public import dhtslib.vcf.writer; + +import std.meta : AliasSeq; +import htslib.vcf; + +/// Represents the classification of a headerline +/// +/// ##INFO=<...> +/// ==== +/// +/// Replacement for htslib BCF_HL_* +enum HeaderRecordType +{ + None = -1, + Filter = BCF_HL_FLT, /// header line: FILTER + Info = BCF_HL_INFO,/// header line: INFO + Format = BCF_HL_FMT, /// header line: FORMAT + Contig = BCF_HL_CTG, /// header line: contig + Struct = BCF_HL_STR, /// header line: structured header line TAG= + Generic = BCF_HL_GEN, /// header line: generic header line +} + +/// Strings for HeaderRecordType +static immutable HeaderRecordTypeStrings = ["FILTER","INFO","FORMAT","contig", "struct", "generic"]; + +/// Represents the classification of a headerline +/// +/// ##INFO= +/// ======= +/// +/// Replacement for htslib BCF_HT_* +enum HeaderTypes +{ + None = -1, + Flag = BCF_HT_FLAG, /// header type: FLAG// header type + Integer = BCF_HT_INT, /// header type: INTEGER + Float = BCF_HT_REAL, /// header type: REAL, + String = BCF_HT_STR, /// header type: STRING + Character = 4, + Long = BCF_HT_LONG, // BCF_HT_INT, but for int64_t values; VCF only! +} + +/// Strings for HeaderTypes +/// +/// doesn't work as needs compile time +/// enum HeaderTypesStrings = __traits(allMembers, HeaderTypes); +/// +/// works but includes "None" which is throwing off indexing +/// enum HeaderTypesStrings = [__traits(allMembers, HeaderTypes)]; +enum HeaderTypesStrings = [__traits(allMembers, HeaderTypes)][1..$]; + + +/// Represents the classification of a headerline +/// +/// ##INFO= +/// = +/// +/// if FIXED +/// ##INFO= +/// = +/// +/// Replacement for htslib BCF_VL_* +enum HeaderLengths +{ + None = -1, + Fixed = BCF_VL_FIXED, /// variable length: fixed length + Variable = BCF_VL_VAR, /// variable length: variable + OnePerAltAllele = BCF_VL_A, /// variable length: one field per alt allele + OnePerGenotype = BCF_VL_G, /// variable length: one field per genotype + OnePerAllele = BCF_VL_R, /// variable length: one field per allele including ref +} + +/// Strings for HDR_LENGTH +static immutable HeaderLengthsStrings = ["FIXED",".","A","G","R"]; + +/// Used to index into bcf_hdr_t's id field of type bcf_idpair_t*[3] +/// +/// i.e as used from VCFRecord where this.vcfheader is a VCFHeader: +/// this.vcfheader.hdr.id[HeaderDictTypes.Id] +/// +/// Replacement for htslib BCF_DT_* +enum HeaderDictTypes +{ + Id = BCF_DT_ID, /// dictionary type: ID + Contig = BCF_DT_CTG, /// dictionary type: Contig + Sample = BCF_DT_SAMPLE, /// dictionary type: SAMPLE +} + +/// Used by InfoField (bcf_info_t) and FormatField (bcf_fmt_t) +/// to identify the underlying htslib/bcf1_t info and format data +/// type and size. This data is stored in ubyte arrays. +/// +/// +/// Replacement for htslib BCF_BT_* +enum RecordType +{ + Null = 0, /// null + Int8 = BCF_BT_INT8, /// int8 + Int16 = BCF_BT_INT16, /// int16 + Int32 = BCF_BT_INT32, /// int32 + Int64 = BCF_BT_INT64, /// Unofficial, for internal use only per htslib headers + Float = BCF_BT_FLOAT, /// float (32?) + Char = BCF_BT_CHAR /// char (8 bit) +} + +/// Byte sizes for RecordType +static immutable RecordTypeSizes = [0, byte.sizeof, short.sizeof, int.sizeof, long.sizeof, float.sizeof, 0, char.sizeof]; +alias RecordTypeToDType = AliasSeq!(null, byte, short, int, long, float, null, string); + +/// Replacement for htslib VCF_* +enum VariantType +{ + Ref = VCF_REF, /// ref (e.g. in a gVCF) + Snp = VCF_SNP, /// SNP + Mnp = VCF_MNP, /// MNP + Indel = VCF_INDEL, /// INDEL + Other = VCF_OTHER, /// other (e.g. SV) + Breakend = VCF_BND, /// breakend + Overlap = VCF_OVERLAP, /// overlapping deletion, ALT=* +} + +/// Levels identifiers for unpacking the underlying variable length +/// data in the bcf1_t. Values are inclusive +/// i.e UnpackLevel.AltAllele unpacks all data before and including the ALT allele +/// Replacement for htslib BCF_UN_* +enum UnpackLevel +{ + None = 0, + AltAllele = BCF_UN_STR, // up to ALT inclusive + Filter = BCF_UN_FLT, // up to Filter + Info = BCF_UN_INFO, // up to Info + SharedFields = BCF_UN_SHR, // all shared information + Format = BCF_UN_FMT, // unpack format and each sample + IndividualFields = BCF_UN_IND, // a synonym of UNPACK.FMT + All = BCF_UN_ALL, // everything +} diff --git a/source/dhtslib/vcf/reader.d b/source/dhtslib/vcf/reader.d index 160f2c9..ab4f918 100644 --- a/source/dhtslib/vcf/reader.d +++ b/source/dhtslib/vcf/reader.d @@ -4,8 +4,7 @@ import std.string: fromStringz, toStringz; import std.utf: toUTFz; import std.traits : ReturnType; -import dhtslib.vcf.header; -import dhtslib.vcf.record; +import dhtslib.vcf; import dhtslib.tabix; import dhtslib.coordinates; import htslib.vcf; @@ -14,12 +13,12 @@ import htslib.kstring; alias BCFReader = VCFReader; -auto VCFReader(string fn, int MAX_UNPACK = BCF_UN_ALL) +auto VCFReader(string fn, UnpackLevel MAX_UNPACK = UnpackLevel.None) { return VCFReaderImpl!(CoordSystem.zbc, false)(fn, MAX_UNPACK); } -auto VCFReader(CoordSystem cs)(TabixIndexedFile tbxFile, string chrom, Interval!cs coords, int MAX_UNPACK = BCF_UN_ALL) +auto VCFReader(CoordSystem cs)(TabixIndexedFile tbxFile, string chrom, Interval!cs coords, UnpackLevel MAX_UNPACK = UnpackLevel.None) { return VCFReaderImpl!(cs,true)(tbxFile, chrom, coords, MAX_UNPACK); } @@ -32,10 +31,13 @@ struct VCFReaderImpl(CoordSystem cs, bool isTabixFile) //bcf_hdr_t *hdr; /// header VCFHeader *vcfhdr; /// header wrapper -- no copies bcf1_t* b; /// record for use in iterator, will be recycled - int MAX_UNPACK; /// see htslib.vcf + UnpackLevel MAX_UNPACK; /// see htslib.vcf private static int refct; + private bool initialized; + private int success; + this(this) { this.refct++; @@ -47,7 +49,7 @@ struct VCFReaderImpl(CoordSystem cs, bool isTabixFile) ReturnType!(this.initializeTabixRange) tbxRange; /// For tabix use /// TabixIndexedFile and coordinates ctor - this(TabixIndexedFile tbxFile, string chrom, Interval!cs coords, int MAX_UNPACK = BCF_UN_ALL) + this(TabixIndexedFile tbxFile, string chrom, Interval!cs coords, UnpackLevel MAX_UNPACK = UnpackLevel.None) { this.tbx = tbxFile; this.tbxRange = initializeTabixRange(chrom, coords); @@ -70,7 +72,7 @@ struct VCFReaderImpl(CoordSystem cs, bool isTabixFile) }else{ /// read existing VCF file /// MAX_UNPACK: setting alternate value could speed reading - this(string fn, int MAX_UNPACK = BCF_UN_ALL) + this(string fn, UnpackLevel MAX_UNPACK = UnpackLevel.None) { import htslib.hts : hts_set_threads; @@ -119,86 +121,52 @@ struct VCFReaderImpl(CoordSystem cs, bool isTabixFile) return this.vcfhdr; } - /** VCF version, e.g. VCFv4.2 */ - @property string vcfVersion() { return cast(string) fromStringz( bcf_hdr_get_version(this.vcfhdr.hdr) ).idup; } - - /++ - bcf_hrec_t *bcf_hdr_get_hrec(const(bcf_hdr_t) *hdr, int type, const(char) *key, const(char) *value, - const(char) *str_class); - +/ - // TODO: check key for "", fail if empty - // TODO: check value, str_class for "" and replace with NULL - // TODO: Memory leak. We are never freeing the bcf_hrec_t, but, it escapes as pointer inside key/value string - // TODO: handle structured and general lines - string[string] getTagByKV(string tagType, T)(string key, string value, string str_class) - if((tagType == "FILTER" || tagType == "INFO" || tagType == "FORMAT" || tagType == "contig") && - (isIntegral!T || isSomeString!T)) - { - // hlt : header line type - static if (tagType == "FILTER") const int hlt = BCF_HL_FLT; - else static if (tagType == "INFO") const int hlt = BCF_HL_INFO; // @suppress(dscanner.suspicious.label_var_same_name) - else static if (tagType == "FORMAT") const int hlt= BCF_HL_FMT; // @suppress(dscanner.suspicious.label_var_same_name) - else static if (tagType == "contig") const int hlt= BCF_HL_CTG; // @suppress(dscanner.suspicious.label_var_same_name) - else assert(0); - - bcf_hrec_t *hrec = bcf_hdr_get_hrec(this.vcfhdr.hdr, hlt, toStringz(key), - toStringz(value), - toStringz(str_class)); - - const int nkeys = hrec.nkeys; - string[string] kv; - - foreach(int i; 0 .. nkeys) { - string k = cast(string) fromStringz(hrec.keys[i]); - string v = cast(string) fromStringz(hrec.vals[i]); - kv[k] = v; - } - - return kv; - } - /// InputRange interface: iterate over all records @property bool empty() { - // int bcf_read(htsFile *fp, const(bcf_hdr_t) *h, bcf1_t *v); - // documentation claims returns -1 on critical errors, 0 otherwise - // however it looks like -1 is EOF and -2 is critical errors? static if(isTabixFile){ if(this.tbxRange.empty) return true; - auto str = this.tbxRange.front; - kstring_t kstr; - kputs(toUTFz!(char *)(str), &kstr); - immutable success = vcf_parse(&kstr, this.vcfhdr.hdr, this.b); - this.tbxRange.popFront; - }else - immutable success = bcf_read(this.fp, this.vcfhdr.hdr, this.b); - if (success >= 0) return false; - else if (success == -1) { - // EOF - // see htslib my comments https://github.com/samtools/htslib/issues/246 - // and commit 9845bc9a947350d0f34e6ce69e79ab81b6339bd2 - return true; } - else { - hts_log_error(__FUNCTION__, "*** CRITICAL ERROR bcf_read < -1"); - // TODO: check b->errcode + if (!this.initialized) { + this.popFront(); + this.initialized = true; + } + if (success >= 0) + return false; + else if (success == -1) + return true; + else + { + static if(isTabixFile) + hts_log_error(__FUNCTION__, "*** ERROR vcf_parse < -1"); + else + hts_log_error(__FUNCTION__, "*** ERROR bcf_read < -1"); return true; } + } /// ditto void popFront() { - // noop? - // free this.b ? - //bam_destroy1(this.b); + // int bcf_read(htsFile *fp, const(bcf_hdr_t) *h, bcf1_t *v); + // documentation claims returns -1 on critical errors, 0 otherwise + // however it looks like -1 is EOF and -2 is critical errors? + static if(isTabixFile){ + if (this.initialized) this.tbxRange.popFront; + auto str = this.tbxRange.front; + kstring_t kstr; + kputs(toUTFz!(char *)(str), &kstr); + this.success = vcf_parse(&kstr, this.vcfhdr.hdr, this.b); + + }else + this.success = bcf_read(this.fp, this.vcfhdr.hdr, this.b); - // TODO: clear (not destroy) the bcf1_t for reuse? } /// ditto VCFRecord front() { // note that VCFRecord's constructor will be responsible for - // * unpacking and + // * UnpackLeveling and // * destroying // its copy return VCFRecord(this.vcfhdr, bcf_dup(this.b), this.MAX_UNPACK); @@ -229,6 +197,17 @@ debug(dhtslib_unittest) unittest assert(rec.allelesAsArray == ["C","T"]); assert(approxEqual(rec.qual,59.2)); assert(rec.filter == "PASS"); + + vcf.popFront; + rec = vcf.front; + + assert(rec.chrom == "1"); + assert(rec.pos == 3000150); + assert(rec.refAllele == "C"); + assert(rec.altAllelesAsArray == ["T"]); + assert(rec.allelesAsArray == ["C","T"]); + assert(approxEqual(rec.qual,59.2)); + assert(rec.filter == "PASS"); // assert(rec. == ["C","T"]); } @@ -258,5 +237,76 @@ debug(dhtslib_unittest) unittest assert(rec.allelesAsArray == ["C","T"]); assert(approxEqual(rec.qual, 59.2)); assert(rec.filter == "PASS"); + + vcf.popFront; + rec = vcf.front; + + assert(rec.chrom == "1"); + assert(rec.pos == 3062914); + assert(rec.refAllele == "GTTT"); + assert(rec.altAllelesAsArray == ["G"]); + assert(rec.allelesAsArray == ["GTTT","G"]); + assert(approxEqual(rec.qual,12.9)); + assert(rec.filter == "q10"); // assert(rec. == ["C","T"]); +} + +debug(dhtslib_unittest) unittest +{ + import std.stdio; + import htslib.hts_log; + import std.algorithm : map, count; + import std.array : array; + import std.path : buildPath, dirName; + import std.math : approxEqual; + hts_set_log_level(htsLogLevel.HTS_LOG_INFO); + hts_log_info(__FUNCTION__, "Testing bcf1_t Unpacking"); + hts_log_info(__FUNCTION__, "Loading test file"); + auto fn = buildPath(dirName(dirName(dirName(dirName(__FILE__)))),"htslib","test","tabix","vcf_file.vcf.gz"); + auto tbx = TabixIndexedFile(fn); + auto reg = getIntervalFromString("1:3000151-3062916"); + + auto vcf = VCFReader(tbx, reg.contig, reg.interval); + assert(vcf.count == 3); + vcf = VCFReader(tbx, reg.contig, reg.interval, UnpackLevel.None); + vcf.empty; + VCFRecord rec = vcf.front; + + assert(rec.chrom == "1"); + assert(rec.pos == 3000150); + + assert(approxEqual(rec.qual, 59.2)); + assert(rec.line.unpacked == UnpackLevel.None); + + assert(rec.id == "."); + assert(rec.line.unpacked == UnpackLevel.AltAllele); + + assert(rec.refAllele == "C"); + assert(rec.line.unpacked == UnpackLevel.AltAllele); + + assert(rec.altAllelesAsArray == ["T"]); + assert(rec.line.unpacked == UnpackLevel.AltAllele); + + assert(rec.filter == "PASS"); + assert(rec.line.unpacked == (UnpackLevel.Filter | UnpackLevel.AltAllele)); + + assert(rec.getInfos["AN"].to!int == 4); + assert(rec.line.unpacked == UnpackLevel.SharedFields); + + assert(rec.getFormats["DP"].to!int.array == [[32], [32]]); + assert(rec.line.unpacked == UnpackLevel.All); + + /// only one extra test for pulling only format fields + /// + /// bcf_unpack automatically promotes UnpackLevel.Filter to + /// (UnpackLevel.Filter | UnpackLevel.AltAllele) + /// + /// bcf_unpack automatically promotes UnpackLevel.Info to + /// (UnpackLevel.Info | UnpackLevel.SharedFields) + + /// since front calls bcf_dup, we get a fresh unpacked record + rec = vcf.front; + assert(rec.getFormats["DP"].to!int.array == [[32], [32]]); + assert(rec.line.unpacked == UnpackLevel.Format); + } \ No newline at end of file diff --git a/source/dhtslib/vcf/record.d b/source/dhtslib/vcf/record.d index be7ceaa..e7355ac 100644 --- a/source/dhtslib/vcf/record.d +++ b/source/dhtslib/vcf/record.d @@ -14,18 +14,322 @@ import core.vararg; import std.conv: to, ConvException; import std.format: format; -import std.range: ElementType; +import std.range; import std.string: fromStringz, toStringz; +import std.utf : toUTFz; +import std.algorithm : map; +import std.array : array; import std.traits; +import std.meta : staticIndexOf; +import dhtslib.vcf; import dhtslib.coordinates; -import dhtslib.vcf.header; import htslib.hts_log; import htslib.kstring; import htslib.vcf; alias BCFRecord = VCFRecord; +/// Struct to aid in conversion of VCF info data +/// into D types +struct InfoField +{ + /// String identifier of info field + string key; + /// BCF TYPE indicator + RecordType type; + /// number of data elements + int len; + /// reference of info field data + ubyte[] data; + + /// VCFRecord refct + int * refct; + + /// ctor from VCFRecord + this(string key, bcf_info_t * info, int * refct) + { + this.refct = refct; + (*this.refct)++; + this(key, info); + } + + /// postblit + this(this){ + if(this.refct) (*this.refct)++; + } + + /// dtor + ~this() + { + if(this.refct) (*this.refct)--; + } + + /// string, info field ctor + this(string key, bcf_info_t * info) + { + /// get type + this.type = cast(RecordType)(info.type & 0b1111); + /// get len + this.len = info.len; + /// copy data + this.data = info.vptr[0.. info.vptr_len]; + /// double check our lengths + debug{ + final switch(this.type){ + case RecordType.Int8: + assert(data.length == this.len * byte.sizeof); + break; + case RecordType.Int16: + assert(data.length == this.len * short.sizeof); + break; + case RecordType.Int32: + assert(data.length == this.len * int.sizeof); + break; + case RecordType.Int64: + assert(data.length == this.len * long.sizeof); + break; + case RecordType.Float: + assert(data.length == this.len * float.sizeof); + break; + case RecordType.Char: + assert(data.length == this.len * char.sizeof); + break; + case RecordType.Null: + assert(data.length == 0); + } + } + } + + /// convert to a D type array if int, float, or bool type array + T[] to(T: T[])() + if(isIntegral!T || is(T == float) || isBoolean!T) + { + static if(isIntegral!T){ + /// if we select the correct type just slice and return + if(this.type == cast(RecordType) staticIndexOf!(T, RecordTypeToDType)) + return (cast(T *)this.data.ptr)[0 .. T.sizeof * this.len]; + /// if we select type that is too small log error and return + if(RecordTypeSizes[this.type] > T.sizeof) + { + hts_log_error(__FUNCTION__, "Cannot convert %s to %s".format(type, T.stringof)); + return []; + } + T[] ret; + /// if we select type that is bigger slice, convert and return + switch(this.type){ + case RecordType.Int8: + ret = ((cast(byte *)this.data.ptr)[0 .. this.len]).to!(T[]); + break; + case RecordType.Int16: + ret = ((cast(short *)this.data.ptr)[0 .. short.sizeof * this.len]).to!(T[]); + break; + case RecordType.Int32: + ret = ((cast(int *)this.data.ptr)[0 .. int.sizeof * this.len]).to!(T[]); + break; + case RecordType.Int64: + ret = ((cast(long *)this.data.ptr)[0 .. long.sizeof * this.len]).to!(T[]); + break; + default: + hts_log_error(__FUNCTION__, "Cannot convert %s to %s".format(this.type, T.stringof)); + return []; + } + return ret; + }else{ + /// if we select type the wrong type error and return + if(!(this.type == cast(RecordType) staticIndexOf!(T, RecordTypeToDType))) + { + hts_log_error(__FUNCTION__, "Cannot convert %s to %s".format(type, T.stringof)); + return []; + } + return (cast(T *)this.data.ptr)[0 .. T.sizeof * this.len]; + } + } + + /// convert to a D type if int, float, or bool type + T to(T)() + if(isIntegral!T || is(T == float) || isBoolean!T) + { + /// if bool return true + static if(isBoolean!T) + return true; + else{ + /// if info field has > 1 element error and return + if(this.len != 1){ + hts_log_error(__FUNCTION__, "This info field has a length of %d not 1".format(this.len)); + return T.init; + } + static if(isIntegral!T){ + /// if we select type that is too small log error and return + if(RecordTypeSizes[this.type] > T.sizeof) + { + hts_log_error(__FUNCTION__, "Cannot convert %s to %s".format(type, T.stringof)); + return T.init; + } + /// just gonna use the array impl and grab index 0 + return this.to!(T[])[0]; + }else{ + /// if we select type the wrong type error and return + if(!(this.type == cast(RecordType) staticIndexOf!(T, RecordTypeToDType))) + { + hts_log_error(__FUNCTION__, "Cannot convert %s to %s".format(type, T.stringof)); + return T.init; + } + /// just gonna use the array impl and grab index 0 + return this.to!(T[])[0]; + } + } + } + + /// convert to a string + T to(T)() + if(isSomeString!T) + { + /// if we select type the wrong type error and return + if(!(this.type == RecordType.Char)) + { + hts_log_error(__FUNCTION__, "Cannot convert %s to %s".format(type, T.stringof)); + return T.init; + } + /// otherwise slice and dup + return cast(T)(cast(char *)this.data.ptr)[0 .. this.len].dup; + } +} + +/// Struct to aid in conversion of VCF format data +/// into D types +struct FormatField +{ + /// String identifier of info field + string key; + /// BCF TYPE indicator + RecordType type; + /// number of samples + int nSamples; + /// number of data elements per sample + int n; + /// number of bytes per sample + int size; + /// reference of info field data + ubyte[] data; + + /// VCFRecord refct + int * refct; + + /// ctor from VCFRecord + this(string key, bcf_fmt_t * fmt, int * refct) + { + this.refct = refct; + (*this.refct)++; + this(key, fmt); + } + + /// postblit + this(this){ + if(this.refct) (*this.refct)++; + } + + /// dtor + ~this() + { + if(this.refct) (*this.refct)--; + } + + /// string and format ctor + this(string key, bcf_fmt_t * fmt) + { + /// set all our data + this.key = key; + this.type = cast(RecordType)(fmt.type & 0b1111); + this.n = fmt.n; + this.nSamples = fmt.p_len / fmt.size; + this.data = fmt.p[0 .. fmt.p_len].dup; + this.size = fmt.size; + /// double check our work + debug{ + final switch(this.type){ + case RecordType.Int8: + assert(data.length == this.nSamples * this.n * byte.sizeof); + break; + case RecordType.Int16: + assert(data.length == this.nSamples * this.n * short.sizeof); + break; + case RecordType.Int32: + assert(data.length == this.nSamples * this.n * int.sizeof); + break; + case RecordType.Int64: + assert(data.length == this.nSamples * this.n * long.sizeof); + break; + case RecordType.Float: + assert(data.length == this.nSamples * this.n * float.sizeof); + break; + case RecordType.Char: + assert(data.length == this.nSamples * this.n * char.sizeof); + break; + case RecordType.Null: + assert(data.length == 0); + } + } + } + + /// convert to a D type array if int, float, bool, or string type array + /// very similar to InfoField.to + /// This returns chunks as we separated FORMAT values by sample + auto to(T)() + if(isIntegral!T || is(T == float) || isBoolean!T || isSomeString!T) + { + T[] ret; + static if(isIntegral!T){ + if(this.type == cast(RecordType) staticIndexOf!(T, RecordTypeToDType)){ + ret = (cast(T *)this.data.ptr)[0 .. this.n * this.nSamples * T.sizeof]; + return ret.chunks(this.n); + } + if(RecordTypeSizes[this.type] > T.sizeof) + { + hts_log_error(__FUNCTION__, "Cannot convert %s to %s".format(type, T.stringof)); + return ret.chunks(this.n); + } + switch(this.type){ + case RecordType.Int8: + ret = ((cast(byte *)this.data.ptr)[0 .. this.n * this.nSamples]).to!(T[]); + break; + case RecordType.Int16: + ret = ((cast(short *)this.data.ptr)[0 .. this.n * this.nSamples]).to!(T[]); + break; + case RecordType.Int32: + ret = ((cast(int *)this.data.ptr)[0 .. this.n * this.nSamples]).to!(T[]); + break; + case RecordType.Int64: + ret = ((cast(long *)this.data.ptr)[0 .. this.n * this.nSamples]).to!(T[]); + break; + default: + hts_log_error(__FUNCTION__, "Cannot convert %s to %s".format(this.type, T.stringof)); + return ret.chunks(this.n); + } + }else static if(isSomeString!T){ + if(!(this.type == cast(RecordType) staticIndexOf!(T, RecordTypeToDType))) + { + hts_log_error(__FUNCTION__, "Cannot convert %s to %s".format(type, T.stringof)); + return ret.chunks(this.n); + } + auto ptr = this.data.ptr; + for(auto i=0; i < nSamples; i++) + { + ret ~= (cast(char *)ptr)[i*size .. (i+1) * size].idup; + } + }else{ + if(!(this.type == cast(RecordType) staticIndexOf!(T, RecordTypeToDType))) + { + hts_log_error(__FUNCTION__, "Cannot convert %s to %s".format(type, T.stringof)); + return ret.chunks(this.n); + } + ret = (cast(T *)this.data.ptr)[0 .. this.n * this.nSamples * T.sizeof]; + } + return ret.chunks(this.n); + + } +} + /** Wrapper around bcf1_t @@ -46,16 +350,16 @@ alias BCFRecord = VCFRecord; 2019-01-23 WIP: getters for chrom, pos, id, ref, alt are complete (untested) -After parsing a BCF or VCF line, bcf1_t must be unpacked. (not applicable when building bcf1_t from scratch) +After parsing a BCF or VCF line, bcf1_t must be UnpackLeveled. (not applicable when building bcf1_t from scratch) Depending on information needed, this can be done to various levels with performance tradeoff. Unpacking symbols: -BCF_UN_ALL: all -BCF_UN_SHR: all shared information (BCF_UN_STR|BCF_UN_FLT|BCF_UN_INFO) +UNPACK.ALL: all +UNPACK.SHR: all shared information (UNPACK.ALT|UNPACK.FILT|UNPACK.INFO) - BCF_UN_STR - / BCF_UN_FLT - | / BCF_UN_INFO - | | / ____________________________ BCF_UN_FMT + UnpackLevel.ALT + / UnpackLevel.FILT + | / UnpackLevel.INFO + | | / ____________________________ UnpackLevel.FMT V V V / | | | #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001 NA00002 NA00003 ... @@ -77,10 +381,10 @@ struct VCFRecord appropriate INFO, CONTIG, and FILTER lines. Protip: specifying alternate MAX_UNPACK can speed it tremendously - as it will not unpack all fields, only up to those requested (see htslib.vcf) - For example, BCF_UN_STR is up to ALT inclusive, and BCF_UN_STR is up to FILTER + as it will not UnpackLevel all fields, only up to those requested (see htslib.vcf) + For example, UnpackLevel.ALT is up to ALT inclusive, and UnpackLevel.ALT is up to FILTER */ - this(T)(T *h, bcf1_t *b, int MAX_UNPACK = BCF_UN_ALL) + this(T)(T *h, bcf1_t *b, UnpackLevel MAX_UNPACK = UnpackLevel.None) if(is(T == VCFHeader) || is(T == bcf_hdr_t)) { static if (is(T == VCFHeader)) this.vcfheader = h; @@ -90,7 +394,7 @@ struct VCFRecord this.line = b; - // Now it must be unpacked + // Now it must be UNPACKed // Protip: specifying alternate MAX_UNPACK can speed it tremendously immutable int ret = bcf_unpack(this.line, MAX_UNPACK); // unsure what to do c̄ return value // @suppress(dscanner.suspicious.unused_variable) } @@ -111,7 +415,7 @@ struct VCFRecord this.filter = filter; } /// ditto - this(VCFHeader *vcfhdr, string line, int MAX_UNPACK = BCF_UN_ALL) + this(VCFHeader *vcfhdr, string line, UnpackLevel MAX_UNPACK = UnpackLevel.None) { this.vcfheader = vcfhdr; @@ -149,16 +453,20 @@ struct VCFRecord bcf_destroy1(this.line); } + /// ensure that vcf variable length data is unpacked to at least desired level + pragma(inline, true) + void unpack(UnpackLevel unpackLevel) + { + if (this.line.max_unpack < unpackLevel) bcf_unpack(this.line, unpackLevel); + } + //----- FIXED FIELDS -----// /* CHROM */ /// Get chromosome (CHROM) @property string chrom() - { - if (!this.line.unpacked) - bcf_unpack(this.line, BCF_UN_STR); - + { return fromStringz(bcf_hdr_id2name(this.vcfheader.hdr, this.line.rid)).idup; } /// Set chromosome (CHROM) @@ -184,7 +492,6 @@ struct VCFRecord out(coord) { assert(coord >= 0); } do { - if (!this.line.unpacked) bcf_unpack(this.line, BCF_UN_STR); return ZB(this.line.pos); } /// Set position (POS, column 2) @@ -203,7 +510,7 @@ struct VCFRecord /// Get ID string @property string id() { - if (!this.line.unpacked) bcf_unpack(this.line, BCF_UN_STR); + this.unpack(UnpackLevel.AltAllele); return fromStringz(this.line.d.id).idup; } /// Sets new ID string; comma-separated list allowed but no dup checking performed @@ -254,6 +561,7 @@ struct VCFRecord /// All alleles getter (array) @property string[] allelesAsArray() { + this.unpack(UnpackLevel.AltAllele); string[] ret; ret.length = this.line.n_allele; // n=0, no reference; n=1, ref but no alt foreach(int i; 0 .. this.line.n_allele) // ref allele is index 0 @@ -265,20 +573,20 @@ struct VCFRecord /// Reference allele getter @property string refAllele() { - if (!this.line.unpacked) bcf_unpack(this.line, BCF_UN_STR); + this.unpack(UnpackLevel.AltAllele); if (this.line.n_allele < 1) return ""; // a valid record could have no ref (or alt) alleles else return fromStringz(this.line.d.als).idup; } // NB WIP: there could be zero, or multiple alt alleles /+@property string altAlleles() { - if (!this.line.unpacked) bcf_unpack(this.line, BCF_UN_STR); + if (!this.line.unpacked) bcf_unpack(this.line, UnpackLevel.ALT); return fromStringz(this.line.d.als) }+/ /// Alternate alleles getter version 1: ["A", "ACTG", ...] @property string[] altAllelesAsArray() { - if (!this.line.unpacked) bcf_unpack(this.line, BCF_UN_STR); + this.unpack(UnpackLevel.AltAllele); string[] ret; if (this.line.n_allele < 2) return ret; // n=0, no reference; n=1, ref but no alt @@ -287,7 +595,7 @@ struct VCFRecord /// Alternate alleles getter version 2: "A,ACTG,..." @property string altAllelesAsString() { - if (!this.line.unpacked) bcf_unpack(this.line, BCF_UN_STR); + this.unpack(UnpackLevel.AltAllele); string ret; if (this.line.n_allele < 2) return ret; // n=0, no reference; n=1, ref but no alt @@ -318,6 +626,7 @@ struct VCFRecord /// Set alleles; comma-separated list @property void alleles(string a) { + this.unpack(UnpackLevel.AltAllele); if (a == "") { this.line.n_allele = 0; if (this.line.d.m_allele) this.line.d.als[0] = '\0'; // if storage allocated, zero out the REF @@ -328,6 +637,7 @@ struct VCFRecord /// Set alleles; array @property void alleles(string[] a) { + this.unpack(UnpackLevel.AltAllele); if (a.length == 0) { this.line.n_allele = 0; if (this.line.d.m_allele) this.line.d.als[0] = '\0'; // if storage allocated, zero out the REF @@ -346,30 +656,22 @@ struct VCFRecord bcf_update_alleles(this.vcfheader.hdr, this.line, &allelesp[0], cast(int)a.length); } } + /// Set REF allele only - /// param r is \0-term Cstring - /// TODO: UNTESTED - void setRefAllele(const(char)* r) + void setRefAllele(string r) { // first, get REF - if (!this.line.unpacked) bcf_unpack(this.line, BCF_UN_STR); + this.unpack(UnpackLevel.AltAllele); // a valid record could have no ref (or alt) alleles + auto alleles = [toUTFz!(char *)(r)]; if (this.line.n_allele < 2) // if none or 1 (=REF only), just add the REF we receieved - bcf_update_alleles(this.vcfheader.hdr, this.line, &r, 1); + bcf_update_alleles(this.vcfheader.hdr, this.line, alleles.ptr, 1); else { // length of REF allele is allele[1] - allele[0], (minus one more for \0) // TODO ** TODO ** : there is a property line.refLen already const auto reflen = (this.line.d.allele[1] - this.line.d.allele[0]) - 1; - if (strlen(r) <= reflen) { - memcpy(this.line.d.allele[0], r, reflen + 1); // +1 -> copy a trailing \0 in case original REF was longer than new - // TODO: do we need to call _sync ? - } else { - // slower way: replace allele[0] with r, but keep rest of pointers poitning at existing allele block, - // then call bcf_update_alleles; this will make complete new copy of this.line.d.allele, so forgive the casts - this.line.d.allele[0] = cast(char*) r; - bcf_update_alleles(this.vcfheader.hdr, this.line, - cast( const(char)** ) this.line.d.allele, this.line.n_allele); - } + alleles ~= this.altAllelesAsArray.map!(toUTFz!(char *)).array; + bcf_update_alleles(this.vcfheader.hdr, this.line, alleles.ptr, cast(int)alleles.length); } } /// Set alleles; alt can be comma separated @@ -400,7 +702,6 @@ struct VCFRecord out(result) { assert(result >= 0); } do { - if (this.line.max_unpack < BCF_UN_FLT) bcf_unpack(this.line, BCF_UN_FLT); return this.line.qual; } /// Set variant quality (QUAL, column 6) @@ -415,7 +716,7 @@ struct VCFRecord { const(char)[] ret; - if (this.line.max_unpack < BCF_UN_FLT) bcf_unpack(this.line, BCF_UN_FLT); + this.unpack(UnpackLevel.Filter); if (this.line.d.n_flt) { for(int i; i< this.line.d.n_flt; i++) { @@ -445,9 +746,11 @@ struct VCFRecord { int[] fids; foreach(f; fs) { - if(f == "") continue; const int fid = bcf_hdr_id2int(this.vcfheader.hdr, BCF_DT_ID, toStringz(f)); - if (fid == -1) hts_log_warning(__FUNCTION__, format("filter not found in header (ignoring): %s", f) ); + if (fid == -1){ + hts_log_warning(__FUNCTION__, format("ignoring filter not found in header: %s", f.empty() ? "(empty)" : f)); + continue; + } else fids ~= fid; } if (fids.length > 0) @@ -459,13 +762,24 @@ struct VCFRecord /// "If flt_id is PASS, all existing filters are removed first. If other than PASS, existing PASS is removed." int addFilter(string f) { - return bcf_add_filter(this.vcfheader.hdr, this.line, - bcf_hdr_id2int(this.vcfheader.hdr, BCF_DT_ID, toStringz(f))); + + if(f == "") hts_log_warning(__FUNCTION__, "filter was empty"); + const int fid = bcf_hdr_id2int(this.vcfheader.hdr, BCF_DT_ID, toStringz(f)); + if (fid == -1){ + hts_log_warning(__FUNCTION__, format("ignoring filter not found in header: %s", f.empty() ? "(empty)" : f)); + return -1; + } + + return bcf_add_filter(this.vcfheader.hdr, this.line, fid); } /// Remove a filter by name int removeFilter(string f) { const int fid = bcf_hdr_id2int(this.vcfheader.hdr, BCF_DT_ID, toStringz(f)); + if (fid == -1){ + hts_log_warning(__FUNCTION__, format("filter not found in header (ignoring): %s", f) ); + return -1; + } return removeFilter(fid); } /// Remove a filter by numeric id @@ -498,6 +812,7 @@ struct VCFRecord * Both singletons and arrays are supported. */ void addInfo(T)(string tag, T data) + if(isSomeString!T || ((isIntegral!T || isFloatingPoint!T || isBoolean!T) && !isArray!T)) { int ret = -1; @@ -523,19 +838,19 @@ struct VCFRecord hts_log_warning(__FUNCTION__, format("Couldn't add tag (ignoring): %s with value %s", tag, data)); } /// ditto - void addInfo(T)(string tag, T[] data) - if(!is(T==immutable(char))) // otherwise string::immutable(char)[] will match the other template + void addInfo(T)(string tag, T data) + if(isIntegral!(ElementType!T) || isFloatingPoint!(ElementType!T)) // otherwise string::immutable(char)[] will match the other template { int ret = -1; - static if(isIntegral!T) { - auto integer = cast(int) data; - ret = bcf_update_info_int32(this.vcfheader.hdr, this.line, toStringz(tag), &integer, data.length); + static if(isIntegral!(ElementType!T)) { + auto integer = data.to!(int[]); + ret = bcf_update_info_int32(this.vcfheader.hdr, this.line, toStringz(tag), integer.ptr, cast(int)data.length); } - else static if(isFloatingPoint!T) { - auto flt = cast(float) data; // simply passing "2.0" (or whatever) => data is a double - ret = bcf_update_info_float(this.vcfheader.hdr, this.line, toStringz(tag), &flt, data.length); + else static if(isFloatingPoint!(ElementType!T)) { + auto flt = data.to!(float[]); // simply passing "2.0" (or whatever) => data is a double + ret = bcf_update_info_float(this.vcfheader.hdr, this.line, toStringz(tag), flt.ptr, cast(int)data.length); } if (ret == -1) @@ -547,22 +862,24 @@ struct VCFRecord * Templated on data type, calls one of bc_update_format_{int32,float,string,flag} */ void addFormat(T)(string tag, T data) - if(!isArray!T) + if(!isArray!T || isSomeString!T) { int ret = -1; static if(isIntegral!T) { - auto integer = cast(int) data; + auto integer = data.to!int; ret = bcf_update_format_int32(this.vcfheader.hdr, this.line, toStringz(tag), &integer, 1); } else static if(isFloatingPoint!T) { - auto flt = cast(float) data; // simply passing "2.0" (or whatever) => data is a double + auto flt = data.to!float; // simply passing "2.0" (or whatever) => data is a double ret = bcf_update_format_float(this.vcfheader.hdr, this.line, toStringz(tag), &flt, 1); } - else static if(isSomeString!T) - ret = bcf_update_format_string(this.vcfheader.hdr, this.line, toStringz(tag), toStringz(data)); + else static if(isSomeString!T){ + auto strs = [data].map!(toUTFz!(char*)).array; + ret = bcf_update_format_string(this.vcfheader.hdr, this.line, toStringz(tag), strs.ptr, 1); + } else static if(isBoolean!T) { immutable int set = data ? 1 : 0; // if data == true, pass 1 to bcf_update_info_flag(n=); n=0 => clear flag @@ -573,29 +890,48 @@ struct VCFRecord } /// ditto void addFormat(T)(string tag, T[] data) + if((!isArray!T || isSomeString!T) && !is(T == immutable(char))) { int ret = -1; static if(isIntegral!T) { - auto integer = cast(int[]) data; + auto integer = data.to!(int[]); ret = bcf_update_format_int32(this.vcfheader.hdr, this.line, toStringz(tag), integer.ptr, cast(int)data.length); } else static if(isFloatingPoint!T) { - auto flt = cast(float[]) data; // simply passing "2.0" (or whatever) => data is a double + auto flt = data.to!(float[]); // simply passing "2.0" (or whatever) => data is a double ret = bcf_update_format_float(this.vcfheader.hdr, this.line, toStringz(tag), flt.ptr, cast(int)data.length); } - + + else static if(isSomeString!T){ + auto strs = data.map!(toUTFz!(char*)).array; + ret = bcf_update_format_string(this.vcfheader.hdr, this.line, toStringz(tag), strs.ptr, cast(int)strs.length); + } + if (ret == -1) hts_log_warning(__FUNCTION__, format("Couldn't add format (ignoring): %s", data)); } + /// remove an info section + void removeInfo(string tag) + { + bcf_update_info(this.vcfheader.hdr,this.line, toStringz(tag),null,0,HeaderTypes.None); + } + + /// remove a format section + void removeFormat(string tag) + { + bcf_update_format(this.vcfheader.hdr,this.line, toStringz(tag),null,0,HeaderTypes.None); + } + + /// add INFO or FORMAT key:value pairs to a record - /// add a single datapoint OR vector of values, OR, values to each sample (if tagType == FORMAT) - void add(string tagType, T)(const(char)[] tag, T data) - if((tagType == "INFO" || tagType == "FORMAT") && + /// add a single datapoint OR vector of values, OR, values to each sample (if lineType == FORMAT) + void add(HeaderRecordType lineType, T)(const(char)[] tag, T data) + if((lineType == HeaderRecordType.Info || lineType == HeaderRecordType.Format) && (isIntegral!T || isIntegral!(ElementType!T) || isFloatingPoint!T || isFloatingPoint!(ElementType!T) || isSomeString!T || isSomeString!(ElementType!T) || @@ -637,7 +973,7 @@ struct VCFRecord } else static assert(0); - static if (tagType == "INFO") { + static if (lineType == HeaderRecordType.Info) { static if (is(Unqual!T == int) || is(Unqual!(ElementType!T) == int)) ret = bcf_update_info_int32(this.vcfheader.hdr, this.line, toStringz(tag), ptr, len); else static if (is(Unqual!T == float) || is(Unqual!(ElementType!T) == float)) @@ -648,7 +984,7 @@ struct VCFRecord ret = bcf_update_info_flag(this.vcfheader.hdr, this.line, toStringz(tag), ptr, len); else static assert(0, "Type not recognized for INFO tag"); } - else static if (tagType == "FORMAT") { + else static if (lineType == HeaderRecordType.Format) { static if (is(Unqual!T == int) || is(Unqual!(ElementType!T) == int)) ret = bcf_update_format_int32(this.vcfheader.hdr, this.line, toStringz(tag), ptr, len); else static if (is(Unqual!T == float) || is(Unqual!(ElementType!T) == float)) @@ -663,6 +999,44 @@ struct VCFRecord hts_log_warning(__FUNCTION__, format("Couldn't add tag (ignoring): %s with value %s", tag, data)); } + /// returns a hashmap of info data by field + InfoField[string] getInfos() + { + /// UnpackLevel + this.unpack(UnpackLevel.Info); + + /// copy some data + InfoField[string] infoMap; + bcf_info_t[] infos = this.line.d.info[0..this.line.n_info].dup; + + foreach (bcf_info_t info; infos) + { + /// if variable data is null then value is set for deletion + /// skip + if(!info.vptr) continue; + auto key = fromStringz(this.vcfheader.hdr.id[HeaderDictTypes.Id][info.key].key).idup; + infoMap[key] = InfoField(key, &info, &this.refct); + } + return infoMap; + } + + /// returns a hashmap of format data by field + FormatField[string] getFormats() + { + this.unpack(UnpackLevel.Format); + FormatField[string] fmtMap; + bcf_fmt_t[] fmts = this.line.d.fmt[0..this.line.n_fmt].dup; + foreach (bcf_fmt_t fmt; fmts) + { + /// if variable data is null then value is set for deletion + /// skip + if(!fmt.p) continue; + auto key = fromStringz(this.vcfheader.hdr.id[HeaderDictTypes.Id][fmt.id].key).idup; + fmtMap[key] = FormatField(key, &fmt, &this.refct); + } + return fmtMap; + } + /// Return a string representation of the VCFRecord (i.e. as would appear in .vcf) /// /// As a bonus, there is a kstring_t memory leak @@ -702,15 +1076,27 @@ unittest vw.addHeaderLineRaw("##INFO="); vw.addHeaderLineKV("INFO", ""); // ##INFO= - vw.addTag!"INFO"("AF", "A", "Integer", "Number of Samples With Data"); + vw.vcfhdr.addHeaderLine!(HeaderRecordType.Info)("AF", HeaderLengths.OnePerAltAllele, HeaderTypes.Integer, "Number of Samples With Data"); vw.addHeaderLineRaw("##contig="); // @suppress(dscanner.style.long_line) vw.addHeaderLineRaw("##FILTER="); + HeaderRecord hrec; + hrec.setHeaderRecordType(HeaderRecordType.Format); + hrec.setID("AF"); + hrec.setLength(HeaderLengths.OnePerAltAllele); + hrec.setValueType(HeaderTypes.Float); + hrec.setDescription("Allele Freq"); + vw.vcfhdr.addHeaderRecord(hrec); + + assert(vw.vcfhdr.getHeaderRecord(HeaderRecordType.Format, "ID","AF")["ID"] == "AF"); + vw.addHeaderLineRaw("##FORMAT="); // Exercise header assert(vw.vcfhdr.nsamples == 0); vw.addSample("NA12878"); assert(vw.vcfhdr.nsamples == 1); + vw.writeHeader(); + auto r = new VCFRecord(vw.vcfhdr, bcf_init1()); r.chrom = "20"; @@ -809,13 +1195,24 @@ unittest assert(r.refAllele == "A"); assert(r.altAllelesAsString == "C,T"); + r.setRefAllele("G"); + assert(r.allelesAsArray == ["G", "C", "T"]); + assert(r.refAllele == "G"); + assert(r.altAllelesAsString == "C,T"); + + r.setRefAllele("A"); + assert(r.allelesAsArray == ["A", "C", "T"]); + assert(r.refAllele == "A"); + assert(r.altAllelesAsString == "C,T"); + + // Test QUAL r.qual = 1.0; assert(r.qual == 1.0); - // now test setting qual without unpacking + // now test setting qual without UnpackLeveling // TODO: see https://forum.dlang.org/post/hebouvswxlslqhovzaia@forum.dlang.org, once resolved (or once factory function written), - // add template value param BCF_UN_STR + // add template value param UnpackLevel.ALT auto rr = new VCFRecord(vw.vcfhdr, "20\t17330\t.\tT\tA\t3\t.\tNS=3;DP=11;AF=0.017\n"); // @suppress(dscanner.style.long_line) rr.qual = 3.0; assert(rr.qual == 3.0); @@ -843,10 +1240,140 @@ unittest assert(r.hasFilter("q10")); r.removeFilter("q10"); assert(r.hasFilter("PASS")); + assert(r.toString == "20\t62435966\trs001;rs999\tA\tC,T\t1\t.\t.\n"); + + r.addFormat("AF",[0.1, 0.5]); + assert(r.toString == "20\t62435966\trs001;rs999\tA\tC,T\t1\t.\t.\tAF\t0.1,0.5\n"); + + rr.addFormat("AF",[0.1]); + writeln(rr.toString); + assert(rr.toString == "20\t17330\t.\tT\tA\t3\t.\tNS=3;DP=11;AF=0\tAF\t0.1\n"); + + + assert(rr.getFormats["AF"].to!float[0][0] == 0.1f); + + rr.addInfo("AF",0.5); + assert(rr.getInfos["AF"].to!float == 0.5f); + + rr.removeInfo("AF"); + + rr.removeFormat("AF"); + + rr.addFormat("CH", "test"); + + + writeln(rr.toString); + writeln(rr.getFormats["CH"].to!string); + assert(rr.getFormats["CH"].to!string[0][0] == "test"); + + assert(rr.toString == "20\t17330\t.\tT\tA\t3\t.\tNS=3;DP=11\tCH\ttest\n"); + + assert(rr.coordinates == ZBHO(17329,17330)); + vw.writeRecord(*r); + vw.writeRecord(vw.vcfhdr.hdr, r.line); // Finally, print the records: - writefln("VCF records via toString:\n%s%s", r, rr); + writefln("VCF records via toString:\n%s%s", (*r), (*rr)); writeln("\ndhtslib.vcf: all tests passed\n"); } + +debug(dhtslib_unittest) unittest +{ + import std.stdio; + import htslib.hts_log; + import std.algorithm : map, count; + import std.array : array; + import std.path : buildPath, dirName; + import std.math : approxEqual, isNaN; + import dhtslib.tabix; + import dhtslib.vcf; + hts_set_log_level(htsLogLevel.HTS_LOG_INFO); + hts_log_info(__FUNCTION__, "Testing VCFReader"); + hts_log_info(__FUNCTION__, "Loading test file"); + auto fn = buildPath(dirName(dirName(dirName(dirName(__FILE__)))),"htslib","test","tabix","vcf_file.vcf.gz"); + auto tbx = TabixIndexedFile(fn); + auto reg = getIntervalFromString("1:3000151-3062916"); + auto vcf = VCFReader(tbx, reg.contig, reg.interval); + assert(!vcf.empty); + + VCFRecord rec = vcf.front; + assert(rec.getInfos["AN"].to!int == 4); + rec.addInfo("AN", rec.getInfos["AN"].to!int + 1); + assert(rec.getInfos["AN"].to!int == 5); + assert(rec.getInfos["AN"].to!byte == 5); + assert(rec.getInfos["AN"].to!short == 5); + assert(rec.getInfos["AN"].to!long == 5); + + assert(rec.getInfos["AN"].to!uint == 5); + assert(rec.getInfos["AN"].to!ubyte == 5); + assert(rec.getInfos["AN"].to!ushort == 5); + assert(rec.getInfos["AN"].to!ulong == 5); + + assert(rec.getInfos["AN"].to!float.isNaN); + + rec.addInfo("AN", 128); + assert(rec.getInfos["AN"].to!int == 128); + assert(rec.getInfos["AN"].to!byte == 0); + assert(rec.getInfos["AN"].to!short == 128); + assert(rec.getInfos["AN"].to!long == 128); + + rec.addInfo("AN", 32768); + assert(rec.getInfos["AN"].to!int == 32768); + assert(rec.getInfos["AN"].to!byte == 0); + assert(rec.getInfos["AN"].to!short == 0); + assert(rec.getInfos["AN"].to!long == 32768); + + assert(rec.getInfos["AN"].to!long == 32768); + + // rec.addInfo("AN", 2147483648); + // assert(rec.getInfos["AN"].to!int == 0); + // assert(rec.getInfos["AN"].to!byte == 0); + // assert(rec.getInfos["AN"].to!short == 0); + // assert(rec.getInfos["AN"].to!long == 2147483648); + + vcf.popFront; + rec = vcf.front; + assert(rec.refAllele == "GTTT"); + assert(rec.getInfos["INDEL"].to!bool == true); + + assert(rec.getInfos["DP4"].to!(int[]) == [1, 2, 3, 4]); + assert(rec.getInfos["DP4"].to!(byte[]) == [1, 2, 3, 4]); + assert(rec.getInfos["DP4"].to!(short[]) == [1, 2, 3, 4]); + assert(rec.getInfos["DP4"].to!(long[]) == [1, 2, 3, 4]); + + assert(rec.getInfos["DP4"].to!(uint[]) == [1, 2, 3, 4]); + assert(rec.getInfos["DP4"].to!(ubyte[]) == [1, 2, 3, 4]); + assert(rec.getInfos["DP4"].to!(ushort[]) == [1, 2, 3, 4]); + assert(rec.getInfos["DP4"].to!(ulong[]) == [1, 2, 3, 4]); + + assert(rec.getInfos["DP4"].to!(float[]) == []); + rec.addInfo("DP4", [2, 2, 3, 4]); + assert(rec.getInfos["DP4"].to!(int[]) == [2, 2, 3, 4]); + + assert(rec.getFormats["GQ"].to!int.array == [[409], [409]]); + + rec.addFormat("GQ", [32768,32768]); + + assert(rec.getFormats["GQ"].to!byte.array == []); + assert(rec.getFormats["GQ"].to!short.array == []); + assert(rec.getFormats["GQ"].to!int[0][0] == 32768); + + + assert(rec.getInfos["STR"].to!string == "test"); + + assert(rec.getInfos["DP4"].to!string == ""); + + assert(rec.getFormats["GT"].to!int.array == [[2, 4], [2, 4]]); + + + vcf.popFront; + rec = vcf.front; + + auto fmts = rec.getFormats; + auto sam = vcf.vcfhdr.getSampleId("A"); + assert(fmts["GT"].to!int[sam] == [2, 4]); + sam = vcf.vcfhdr.getSampleId("B"); + assert(fmts["GT"].to!int[sam] == [6, -127]); +} \ No newline at end of file diff --git a/source/dhtslib/vcf/writer.d b/source/dhtslib/vcf/writer.d index 8e22feb..dde1e9f 100644 --- a/source/dhtslib/vcf/writer.d +++ b/source/dhtslib/vcf/writer.d @@ -6,8 +6,7 @@ import std.traits: isArray, isDynamicArray, isBoolean, isIntegral, isFloatingPoi import std.conv: to, ConvException; import std.format: format; -import dhtslib.vcf.header; -import dhtslib.vcf.record; +import dhtslib.vcf; import htslib.vcf; import htslib.hts_log; @@ -82,54 +81,36 @@ struct VCFWriter return this.vcfhdr; } - // TODO - /// copy header lines from a template without overwiting existing lines - void copyHeaderLines(bcf_hdr_t *other) - { - assert(this.vcfhdr != null); - assert(0); - // bcf_hdr_t *bcf_hdr_merge(bcf_hdr_t *dst, const(bcf_hdr_t) *src); - } - /// Add sample to this VCF /// * int bcf_hdr_add_sample(bcf_hdr_t *hdr, const(char) *sample); + deprecated("use VCFHeader methods instead") int addSample(string name) in { assert(name != ""); } do { - assert(this.vcfhdr != null); - - bcf_hdr_add_sample(this.vcfhdr.hdr, toStringz(name)); - - // AARRRRGGGHHHH - // https://github.com/samtools/htslib/issues/767 - bcf_hdr_sync(this.vcfhdr.hdr); - - return 0; + return this.vcfhdr.addSample(name); } + deprecated("use VCFHeader methods instead") /// Add a new header line int addHeaderLineKV(string key, string value) { - // TODO check that key is not INFO, FILTER, FORMAT (or contig?) - string line = format("##%s=%s", key, value); - - return bcf_hdr_append(this.vcfhdr.hdr, toStringz(line)); + return this.vcfhdr.addHeaderLineKV(key, value); } + + deprecated("use VCFHeader methods instead") /// Add a new header line -- must be formatted ##key=value int addHeaderLineRaw(string line) { - assert(this.vcfhdr != null); - // int bcf_hdr_append(bcf_hdr_t *h, const(char) *line); - const auto ret = bcf_hdr_append(this.vcfhdr.hdr, toStringz(line)); - bcf_hdr_sync(this.vcfhdr.hdr); - return ret; + return this.vcfhdr.addHeaderLineRaw(line); } + + deprecated("use VCFHeader methods instead") /// Add a filedate= headerline, which is not called out specifically in the spec, /// but appears in the spec's example files. We could consider allowing a param here. int addFiledate() { - return addHeaderLineKV("filedate", (cast(Date) Clock.currTime()).toISOString ); + return this.vcfhdr.addFiledate; } /** Add INFO (§1.2.2) or FORMAT (§1.2.4) tag @@ -149,105 +130,42 @@ struct VCFWriter * source: Annotation source (eg dbSNP) * version: Annotation version (eg 142) */ - void addTag(string tagType, T)( string id, - T number, - string type, + deprecated("use VCFHeader methods instead") + void addTag(HeaderRecordType tagType)( string id, + HeaderLengths number, + HeaderTypes type, string description, string source="", string _version="") - if((tagType == "INFO" || tagType == "FORMAT") && (isIntegral!T || isSomeString!T)) + if(tagType == HeaderRecordType.Info || tagType == HeaderRecordType.Format) { - string line; // we'll suffix with \0, don't worry - - // check ID - if (id == "") { - hts_log_error(__FUNCTION__, "no ID"); - return; - } - - // check Number is either a special code {A,R,G,.} or an integer - static if(isSomeString!T) { - if (number != "A" && - number != "R" && - number != "G" && - number != ".") { - // not a special ; check if integer - try { - number.to!int; // don't need to store result, will use format/%s - } - catch (ConvException e) { - hts_log_error(__FUNCTION__, "Number not A/R/G/. nor an integer"); - return; - } - } - } - - // check Type - if (type != "Integer" && - type != "Float" && - type != "Flag" && - type != "Character" && - type != "String") { - hts_log_error(__FUNCTION__, "unrecognized type"); - return; - } - - // check Description - if (description == "") hts_log_error(__FUNCTION__, "no description"); - - // check Source and Version - if (source == "" && _version != "") hts_log_error(__FUNCTION__, "version wo source"); - - // Format params - if (source != "" && _version != "") - line = format("##%s=\0", - tagType, id, number, type, description, source, _version); - else if (source != "" && _version == "") - line = format("##%s=\0", - tagType, id, number, type, description, source); - else - line = format("##%s=\0", - tagType, id, number, type, description); - - bcf_hdr_append(this.vcfhdr.hdr, line.ptr); + this.vcfhdr.addHeaderLine!(tagType)(id, number, type, description, source, _version); } /** Add FILTER tag (§1.2.3) */ - void addTag(string tagType)(string id, string description) - if(tagType == "FILTER") + deprecated("use VCFHeader methods instead") + void addTag(HeaderRecordType tagType)(string id, string description) + if(tagType == HeaderRecordType.Filter) { - // check ID - if (id == "") { - hts_log_error(__FUNCTION__, "no ID"); - return; - } - // check Description - if (description == "") hts_log_error(__FUNCTION__, "no description"); - - string line = format("##FILTER=\0", id, description); - bcf_hdr_append(this.vcfhdr.hdr, line.ptr); + this.vcfhdr.addHeaderLine!(tagType)(id, description); } /** Add FILTER tag (§1.2.3) */ - deprecated void addFilterTag(string id, string description) + deprecated("use VCFHeader methods instead") + void addFilterTag(string id, string description) { - string filter = format("##FILTER=\0", id, description); - bcf_hdr_append(this.vcfhdr.hdr, filter.ptr); + this.vcfhdr.addFilter(id, description); } /** Add contig definition (§1.2.7) to header meta-info other: "url=...,md5=...,etc." */ - auto addTag(string tagType)(const(char)[] id, const int length = 0, string other = "") - if(tagType == "contig" || tagType == "CONTIG") + deprecated("use VCFHeader methods instead") + auto addTag(HeaderRecordType tagType)(const(char)[] id, const int length = 0, string other = "") + if(tagType == HeaderRecordType.Contig) { - const(char)[] contig = "##contig= 0 ? ",length=" ~ length.to!string : "") ~ - (other != "" ? "," ~ other : "") ~ - ">\0"; - - return bcf_hdr_append(this.vcfhdr.hdr, contig.ptr); + return this.vcfhdr.addTag!(tagType)(id, length, other); } /** @@ -344,4 +262,28 @@ struct VCFWriter if (ret != 0) hts_log_error(__FUNCTION__, "bcf_write error"); return ret; } +} + +/// +debug(dhtslib_unittest) +unittest +{ + import std.exception: assertThrown; + import std.stdio: writeln, writefln; + import dhtslib.vcf.writer; + + hts_set_log_level(htsLogLevel.HTS_LOG_TRACE); + + + auto vw = VCFWriter("/dev/null"); + + vw.addHeaderLineRaw("##INFO="); + vw.addHeaderLineKV("INFO", ""); + // ##INFO= + vw.addTag!(HeaderRecordType.Info)("AF", HeaderLengths.OnePerAltAllele, HeaderTypes.Integer, "Number of Samples With Data"); + vw.addTag!(HeaderRecordType.Filter)("filt","test"); + vw.addFilterTag("filt2","test2"); + + assert(vw.getHeader.getHeaderRecord(HeaderRecordType.Filter, "filt2").getDescription == "\"test2\""); + vw.writeHeader(); } \ No newline at end of file