From daf950202b25b2cc214fe63838a1bfa0ee55b7ce Mon Sep 17 00:00:00 2001 From: John Marshall Date: Sun, 31 Oct 2021 23:15:42 +0000 Subject: [PATCH] Add sam_hdr_passes_filter() for SAM header filter expressions --- Makefile | 2 +- header.c | 94 ++++++++++++++++++++++++++++++++++++++++++++++++++++ htslib/sam.h | 13 ++++++-- 3 files changed, 105 insertions(+), 4 deletions(-) diff --git a/Makefile b/Makefile index 37ae76565d..627276172d 100644 --- a/Makefile +++ b/Makefile @@ -359,7 +359,7 @@ hts-object-files: $(LIBHTS_OBJS) bgzf.o bgzf.pico: bgzf.c config.h $(htslib_hts_h) $(htslib_bgzf_h) $(htslib_hfile_h) $(htslib_thread_pool_h) $(htslib_hts_endian_h) cram/pooled_alloc.h $(hts_internal_h) $(htslib_khash_h) errmod.o errmod.pico: errmod.c config.h $(htslib_hts_h) $(htslib_ksort_h) $(htslib_hts_os_h) kstring.o kstring.pico: kstring.c config.h $(htslib_kstring_h) -header.o header.pico: header.c config.h $(textutils_internal_h) $(header_h) +header.o header.pico: header.c config.h $(htslib_hts_expr_h) $(textutils_internal_h) $(header_h) hfile.o hfile.pico: hfile.c config.h $(htslib_hfile_h) $(hfile_internal_h) $(htslib_kstring_h) $(hts_internal_h) $(htslib_khash_h) hfile_gcs.o hfile_gcs.pico: hfile_gcs.c config.h $(htslib_hts_h) $(htslib_kstring_h) $(hfile_internal_h) hfile_libcurl.o hfile_libcurl.pico: hfile_libcurl.c config.h $(hfile_internal_h) $(htslib_hts_h) $(htslib_kstring_h) $(htslib_khash_h) diff --git a/header.c b/header.c index 9be4ec0773..b8bbbf0d3d 100644 --- a/header.c +++ b/header.c @@ -34,6 +34,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include #include +#include "htslib/hts_expr.h" #include "textutils_internal.h" #include "header.h" @@ -898,6 +899,84 @@ static int sam_hrecs_parse_lines(sam_hrecs_t *hrecs, const char *hdr, size_t len return 0; } +static int sam_hrecs_tag_lookup(sam_hrec_type_t *h_type, const char *tagname, + char type, hts_expr_val_t *result) { + sam_hrec_tag_t *tag = sam_hrecs_find_key(h_type, tagname, NULL); + if (tag == NULL || tag->len < 3) { + result->is_str = (type == 'Z'); + result->s.l = 0; + result->d = 0.0; + result->is_true = 0; + return 0; + } + + result->is_true = 1; + switch (type) { + case 'f': + result->is_str = 0; + result->d = strtod(&tag->str[3], NULL); + return 0; + + case 'i': + result->is_str = 0; + result->d = strtoll(&tag->str[3], NULL, 0); + return 0; + + case 'Z': + result->is_str = 1; + ks_clear(&result->s); + return (kputsn(&tag->str[3], tag->len-3, &result->s) >= 0)? 0 : -1; + } + + return -1; +} + +static int sam_hrecs_sym_lookup(void *data, char *str, char **end, + hts_expr_val_t *result) { + sam_hrec_type_t *h_type = (sam_hrec_type_t *) data; + + switch (*str) { + case 't': + if (memcmp(str, "type", 4) == 0) { + *end = &str[4]; + result->is_str = 1; + ks_clear(&result->s); + kputc_(h_type->type >> 8, &result->s); + kputc(h_type->type & 0xff, &result->s); + return (ks_len(&result->s) == 2)? 0 : -1; + } + break; + + case '@': + if (isalpha_c(str[1]) && isalpha_c(str[2])) { + *end = &str[3]; + result->is_str = 0; + result->is_true = result->d = (h_type->type == TYPEKEY(&str[1])); + return 0; + } + break; + + case '[': + if (str[1] & str[2]) { + if (str[3] == ']') { + *end = &str[4]; + return sam_hrecs_tag_lookup(h_type, &str[1], 'Z', result); + } + else if (str[3] == ':' && str[4] && str[5] == ']') { + if (!strchr("fiZ", str[4])) { + hts_log_error("Unrecognised %.6s type code", str); + return -1; + } + *end = &str[6]; + return sam_hrecs_tag_lookup(h_type, &str[1], str[4], result); + } + } + break; + } + + return -1; +} + /*! Update sam_hdr_t target_name and target_len arrays * * @return 0 on success; -1 on failure @@ -1762,6 +1841,21 @@ int sam_hdr_remove_lines(sam_hdr_t *bh, const char *type, const char *id, void * return ret; } +int sam_hdr_passes_filter(sam_hdr_t *bh, sam_hdr_line_itr_t *iter, hts_filter_t *filt) { + hts_expr_val_t result; + int ret; + if (hts_filter_eval(filt, iter, sam_hrecs_sym_lookup, &result) >= 0) { + ret = result.is_true; + } + else { + hts_log_error("Couldn't process sam_hdr filter expression"); + ret = -1; + } + + hts_expr_val_free(&result); + return ret; +} + int sam_hdr_count_lines(sam_hdr_t *bh, const char *type) { int count; sam_hrec_type_t *first_ty, *itr_ty; diff --git a/htslib/sam.h b/htslib/sam.h index 7717cae117..298422e218 100644 --- a/htslib/sam.h +++ b/htslib/sam.h @@ -37,6 +37,7 @@ DEALINGS IN THE SOFTWARE. */ extern "C" { #endif +struct hts_filter_t; struct sam_hrec_type_s; /// Highest SAM format version supported by this library @@ -845,6 +846,15 @@ hts_pos_t sam_hdr_tid2len(const sam_hdr_t *h, int tid); */ static inline int bam_name2id(sam_hdr_t *h, const char *ref) { return sam_hdr_name2tid(h, ref); } +/// Check whether a header line passes an hts_filter +/** @param h Pointer to the header structure previously read + * @param iter Iterator pointing to a header line + * @param filt Pointer to the filter, created from hts_filter_init + * @return 1 if it passes, 0 if not, and <0 on error + */ +HTSLIB_EXPORT +int sam_hdr_passes_filter(sam_hdr_t *h, sam_hdr_line_itr_t *iter, struct hts_filter_t *filt); + /// Generate a unique \@PG ID: value /*! * @param name Name of the program. Eg. samtools @@ -1483,9 +1493,6 @@ const char *sam_parse_region(sam_hdr_t *h, const char *s, int *tid, HTSLIB_EXPORT int sam_write1(samFile *fp, const sam_hdr_t *h, const bam1_t *b) HTS_RESULT_USED; -// Forward declaration, see hts_expr.h for full. -struct hts_filter_t; - /// sam_passes_filter - Checks whether a record passes an hts_filter. /** @param h Pointer to the header structure previously read * @param b Pointer to the BAM record to be checked