From b5e1e91c2e6515d9b30bc535afa3169912bf8087 Mon Sep 17 00:00:00 2001 From: ab63 Date: Mon, 29 Apr 2024 16:53:46 +0100 Subject: [PATCH 001/165] the shadow of a form in the marble --- .gitignore | 5 +++++ hairpin/__init__.py | 0 hairpin/main.py | 13 +++++++++++++ pyproject.toml | 11 +++++++++++ 4 files changed, 29 insertions(+) create mode 100644 .gitignore create mode 100644 hairpin/__init__.py create mode 100644 hairpin/main.py create mode 100644 pyproject.toml diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..2c77f38 --- /dev/null +++ b/.gitignore @@ -0,0 +1,5 @@ +data/ +.env/ +dist/ +hairpin.egg-info/ + diff --git a/hairpin/__init__.py b/hairpin/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/hairpin/main.py b/hairpin/main.py new file mode 100644 index 0000000..f57c0af --- /dev/null +++ b/hairpin/main.py @@ -0,0 +1,13 @@ +import pysam + + +# pysam AlignmentFile.Fetch will return iterator over reads +# which I think itself returns iterator row +# hence the typing of this function +# needs checking +def start_end_mate_pairs(record: pysam.IteratorRow) -> list[int]: + + return [] + +if __name__ == '__main__': + \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..1976f43 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,11 @@ +[build-system] +requires = ["setuptools"] +build-backend = "setuptools.build_meta" + +[project] +name = "hairpin" +version = "0.0.1" +requires-python = ">= 3.7" +dependencies = [ + 'pysam' +] \ No newline at end of file From fc03ddf65b6f1e268d4ebe031313df27e70531c4 Mon Sep 17 00:00:00 2001 From: ab63 Date: Tue, 30 Apr 2024 14:33:23 +0100 Subject: [PATCH 002/165] start_end_mate_pairs reproduced with author errors --- hairpin/main.py | 55 ++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 50 insertions(+), 5 deletions(-) diff --git a/hairpin/main.py b/hairpin/main.py index f57c0af..21feee2 100644 --- a/hairpin/main.py +++ 
b/hairpin/main.py @@ -1,13 +1,58 @@ import pysam +from enum import Enum + +SOFT_CLIP_OP = 4 +# for start_end_mate_pairs +Pairs = Enum('Pairs', [('start', 0), ('end', 1), ('mate_start', 2), ('mate_end', 3)]) + # pysam AlignmentFile.Fetch will return iterator over reads -# which I think itself returns iterator row -# hence the typing of this function -# needs checking -def start_end_mate_pairs(record: pysam.IteratorRow) -> list[int]: +# which yields AlignedSegment +def start_end_mate_pairs( + record: pysam.AlignedSegment, + bam: pysam.AlignmentFile +) -> list[int]: + # want the start of the record, the end + # start of mate, end of mate + # .reference_start + mate: pysam.AlignedSegment = bam.mate(record) + # Julia XAM gets cigar info differently, checking CG:B,I tag + # does this matter? + cig: list[tuple[int, int]] = record.cigartuples + mate_cig: list[tuple[int, int]] = mate.cigartuples + + start: int = record.reference_start + end: int = record.reference_end + mate_start: int = record.reference_start + mate_end: int = record.reference_end + + # behaviour on cig = None? 
+ if cig[0][0] == SOFT_CLIP_OP: + start -= cig[0][1] + if cig[-1][0] == SOFT_CLIP_OP: + end += cig[-1][1] + if mate_cig[0][0] == SOFT_CLIP_OP: + mate_start -= mate_cig[0][1] + if mate_cig[-1][0] == SOFT_CLIP_OP: + mate_end += mate_cig[-1][1] + + # appears mate posns simply aren't assigned if none + return [start, end, mate_start, mate_end] + + + + + + + + + + + return [] if __name__ == '__main__': - \ No newline at end of file + # do stuff + print('hello world') \ No newline at end of file From 4073cf0e58f1b2a8157bae1235ccbe65feb24c43 Mon Sep 17 00:00:00 2001 From: ab63 Date: Fri, 3 May 2024 16:35:08 +0100 Subject: [PATCH 003/165] most funcs broadly declared --- hairpin/main.py | 63 +++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 53 insertions(+), 10 deletions(-) diff --git a/hairpin/main.py b/hairpin/main.py index 21feee2..2af309a 100644 --- a/hairpin/main.py +++ b/hairpin/main.py @@ -5,7 +5,18 @@ # for start_end_mate_pairs Pairs = Enum('Pairs', [('start', 0), ('end', 1), ('mate_start', 2), ('mate_end', 3)]) - +# Peter modifies a dict, nonref_reads, inplace +# i.e. each sample has a dict entry +# dict is sample name : bam record +# return bam record, handle dict above +def get_mutant_reads( + vcf_record, # called per record + bam, # type depends on streaming approach + min_basequal: int, + clip_qual_cutoff: int, + min_mapqual: int +): # return type probably pysam.AlignedSegment + return # pysam AlignmentFile.Fetch will return iterator over reads # which yields AlignedSegment @@ -39,19 +50,51 @@ def start_end_mate_pairs( # appears mate posns simply aren't assigned if none return [start, end, mate_start, mate_end] + + +# Peter does this in place. Any particular reason? 
+# is structure of nonref_reads optimal/appropriate +def remove_dups_with_wobble( + nonref_reads: dict, + max_span_ends: int +) -> dict: + return 'hello world' - - - - - - - - + +# filters analysed on a cohort basis +def test_filters( + vcf_posn: int, + mutant_reads: list[pysam.AlignedSegment], + cent90_thresh: float, + AL_filt_thresh: float +) -> tuple[bool, bool]: + mut_pos_f: list[int] = [] + mut_fracs_f: list[float] = [] + mut_pos_r: list[int] = [] + mut_fracs_r: list[float] = [] + aln_scores: list[float] = [] + + return tuple() + + +# is streaming approach necessary? +def main( + bam_paths: list, + intervals, # type? + vcf_in_path: str, + vcf_out_path: str, + clip_qual_cutoff: int, + min_mapqual: int, + min_basequal: int, + max_span: int, + AL_thresh: float, + cent90_thresh:float, + header: bool +) -> None: - return [] + return if __name__ == '__main__': # do stuff From 8e240e21e5a83a52860e615cd7dd9e46b66a1c89 Mon Sep 17 00:00:00 2001 From: ab63 Date: Wed, 29 May 2024 14:05:19 +0100 Subject: [PATCH 004/165] get mutant reads --- .gitignore | 2 + hairpin/main.py | 101 ----------------------- hairpin/ref2seq.py | 50 +++++++++++ main.py | 202 +++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 254 insertions(+), 101 deletions(-) delete mode 100644 hairpin/main.py create mode 100644 hairpin/ref2seq.py create mode 100644 main.py diff --git a/.gitignore b/.gitignore index 2c77f38..d905a25 100644 --- a/.gitignore +++ b/.gitignore @@ -3,3 +3,5 @@ data/ dist/ hairpin.egg-info/ +data/ +data diff --git a/hairpin/main.py b/hairpin/main.py deleted file mode 100644 index 2af309a..0000000 --- a/hairpin/main.py +++ /dev/null @@ -1,101 +0,0 @@ -import pysam -from enum import Enum - -SOFT_CLIP_OP = 4 -# for start_end_mate_pairs -Pairs = Enum('Pairs', [('start', 0), ('end', 1), ('mate_start', 2), ('mate_end', 3)]) - -# Peter modifies a dict, nonref_reads, inplace -# i.e. 
each sample has a dict entry -# dict is sample name : bam record -# return bam record, handle dict above -def get_mutant_reads( - vcf_record, # called per record - bam, # type depends on streaming approach - min_basequal: int, - clip_qual_cutoff: int, - min_mapqual: int -): # return type probably pysam.AlignedSegment - return - -# pysam AlignmentFile.Fetch will return iterator over reads -# which yields AlignedSegment -def start_end_mate_pairs( - record: pysam.AlignedSegment, - bam: pysam.AlignmentFile -) -> list[int]: - # want the start of the record, the end - # start of mate, end of mate - # .reference_start - mate: pysam.AlignedSegment = bam.mate(record) - # Julia XAM gets cigar info differently, checking CG:B,I tag - # does this matter? - cig: list[tuple[int, int]] = record.cigartuples - mate_cig: list[tuple[int, int]] = mate.cigartuples - - start: int = record.reference_start - end: int = record.reference_end - mate_start: int = record.reference_start - mate_end: int = record.reference_end - - # behaviour on cig = None? - if cig[0][0] == SOFT_CLIP_OP: - start -= cig[0][1] - if cig[-1][0] == SOFT_CLIP_OP: - end += cig[-1][1] - if mate_cig[0][0] == SOFT_CLIP_OP: - mate_start -= mate_cig[0][1] - if mate_cig[-1][0] == SOFT_CLIP_OP: - mate_end += mate_cig[-1][1] - - # appears mate posns simply aren't assigned if none - return [start, end, mate_start, mate_end] - - -# Peter does this in place. Any particular reason? 
-# is structure of nonref_reads optimal/appropriate -def remove_dups_with_wobble( - nonref_reads: dict, - max_span_ends: int -) -> dict: - - return 'hello world' - - -# filters analysed on a cohort basis -def test_filters( - vcf_posn: int, - mutant_reads: list[pysam.AlignedSegment], - cent90_thresh: float, - AL_filt_thresh: float -) -> tuple[bool, bool]: - - mut_pos_f: list[int] = [] - mut_fracs_f: list[float] = [] - mut_pos_r: list[int] = [] - mut_fracs_r: list[float] = [] - aln_scores: list[float] = [] - - return tuple() - - -# is streaming approach necessary? -def main( - bam_paths: list, - intervals, # type? - vcf_in_path: str, - vcf_out_path: str, - clip_qual_cutoff: int, - min_mapqual: int, - min_basequal: int, - max_span: int, - AL_thresh: float, - cent90_thresh:float, - header: bool -) -> None: - - return - -if __name__ == '__main__': - # do stuff - print('hello world') \ No newline at end of file diff --git a/hairpin/ref2seq.py b/hairpin/ref2seq.py new file mode 100644 index 0000000..01fe796 --- /dev/null +++ b/hairpin/ref2seq.py @@ -0,0 +1,50 @@ +import pysam + +def ref2querypos( + record: pysam.AlignedSegment, + ref_pos: int +) -> int: + pos_aln = record.get_aligned_pairs() + while True: + try: + aln_pair = pos_aln.pop() + except IndexError: + return -1 # ref_pos not on read + if aln_pair[1] == ref_pos: + return aln_pair[0] + + +def pos2op( + seq_pos: int, + record: pysam.AlignedSegment +) -> int: + cig = record.cigartuples + if cig is None: + exit(1) # No cigar tuples for record + sum_len = 0 + while True: + try: + cig_pair = cig.pop(0) + except IndexError: + raise RuntimeError # seq_pos not in cigar string + sum_len += cig_pair[1] + if seq_pos < sum_len: + return cig_pair[0] + + +""" +In Julia, ref2seq takes a run of cigar ops +and a position on the reference genome. 
+Here, the cigar ops come from the bam read/record +under examination, and the pos on ref comes from +the position given by the vcf for the vcf record +under examination (one or more bam records will be +examined for each vcf record). It returns an array +of 2 values, the position on the bam read, and the +cigar operation applicable to that position. + +since peter wants to discard if not OP_MATCH +i.e. if clipped +we can use reference start in pysam +which is position where read begins alignment sans clipping etc +""" \ No newline at end of file diff --git a/main.py b/main.py new file mode 100644 index 0000000..e5012c2 --- /dev/null +++ b/main.py @@ -0,0 +1,202 @@ +import pysam +from enum import Enum +from typing import Optional +from hairpin import ref2seq +import statistics + +Ops = Enum('Ops', + ['match', 'ins', 'delete', 'skip', 'soft', 'hard', 'pad', 'equal', 'diff', 'back'], + start = 0) +# for start_end_mate_pairs +Pairs = Enum('Pairs', + ['start', 'end', 'mate_start', 'mate_end'], + star = 0) + +# pysam AlignmentFile.Fetch will return iterator over reads +# which yields AlignedSegment +def start_end_mate_pairs( + record: pysam.AlignedSegment, + bam: pysam.AlignmentFile +) -> list[int]: + # want the start of the record, the end + # start of mate, end of mate + # .reference_start + mate: pysam.AlignedSegment = bam.mate(record) + # Julia XAM gets cigar info differently, checking CG:B,I tag + # does this matter? + cig: list[tuple[int, int]] = record.cigartuples + mate_cig: list[tuple[int, int]] = mate.cigartuples + + # this gets pos wrt to reference, not query sequence, is that desired? + start: int = record.reference_start + end: int = record.reference_end + mate_start: int = record.reference_start + mate_end: int = record.reference_end + + # behaviour on cig = None? 
+ if cig[0][0] == Ops.soft: + start -= cig[0][1] + if cig[-1][0] == Ops.soft: + end += cig[-1][1] + if mate_cig[0][0] == Ops.soft: + mate_start -= mate_cig[0][1] + if mate_cig[-1][0] == Ops.soft: + mate_end += mate_cig[-1][1] + + # appears mate posns simply aren't assigned if none + return [start, end, mate_start, mate_end] + + +# Peter does this in place. Any particular reason? +# is structure of nonref_reads optimal/appropriate +def remove_dups_with_wobble( + nonref_reads: dict, + max_span_ends: int +) -> dict: + + return dict() + + +# filters analysed on a cohort basis +def test_filters( + vcf_posn: int, + mutant_reads: list[pysam.AlignedSegment], + cent90_thresh: float, + AL_filt_thresh: float +) -> tuple[bool, bool]: + + mut_pos_f: list[int] = [] + mut_fracs_f: list[float] = [] + mut_pos_r: list[int] = [] + mut_fracs_r: list[float] = [] + aln_scores: list[float] = [] + + return tuple() + + +# is streaming approach necessary? +def main( + bam_paths: list, + intervals, # type? + vcf_in_path: str, + vcf_out_path: str, + clip_qual_cutoff: int, + min_mapqual: int, + min_basequal: int, + max_span: int, + AL_thresh: float, + cent90_thresh:float, + header: bool +) -> None: + + vcf_obj: pysam.VariantFile = pysam.VariantFile(vcf_in_path) + sample_names: list[str] = list(vcf_obj.header.samples) + mut_reads: dict[str, list] = {key: [] for key in sample_names} + + # try excepts + bam_reader_dict: dict[str, Optional[pysam.AlignmentFile]] = dict.fromkeys(sample_names) + for path in bam_paths: + bam = pysam.AlignmentFile(path, 'rb') + # grab the sample name from first SM field + # in header field RG + # this may cause problems? 
+ # check with Peter + if bam_sample := bam.header.to_dict()['RG'][1]['SM'] not in sample_names: + exit(1) # error + else: + bam_reader_dict[bam_sample] = bam # type: ignore + + # init output + if header: + pass + + # since intervals are unnecessary + # - they were an artifact of the shearwater mp - + # just iterate through all records in the vcf + for vcf_rec in vcf_obj.fetch(): + + if vcf_rec.alts is None: + continue # ? + alt_test: bool = len(vcf_rec.alts[0]) == 1 + if vcf_rec.rlen == 1: + mut_type = "sub" if alt_test else "ins" + elif alt_test: + mut_type = "del" + else: + mut_type = "complex" + + # check with Peter + samples_w_mutants = [name for name in sample_names if vcf_rec.samples[name]["GT"] == (0, 1)] + + for mut_sample_name in samples_w_mutants: + + for read in bam_reader_dict[mut_sample_name].fetch(vcf_rec.chrom, vcf_rec.start, vcf_rec.stop) # type: ignore + + if any(x is None for x in [read.query_sequence, read.query_qualities, read.cigarstring, read.reference_start, read.reference_end]): + continue # ? + + if read.flag & 0xE02 or read.mapping_quality < min_mapqual: + continue + + mut_pos = ref2seq.ref2querypos(read, vcf_rec.pos) + mut_op = ref2seq.pos2op(mut_pos, read) if mut_pos != -1 else None + + # Check whether read reports variant or not - skip the rest of the loop if does not report variant + # First, check for sub + # does read.query_x work? or should it be read.query_alignment_x? + if (mut_type == "sub" and + (mut_op != Ops.match or + read.query_sequence[mut_pos] != vcf_rec.alts[0] or + read.query_qualities[mut_pos] < min_basequal)): + continue + + # Second, check whether length of read can accommodate size of indel + # what if other alt is longer? 
+ if (mut_pos + vcf_rec.rlen > read.query_length or + mut_pos + len(vcf_rec.alts[0]) > read.query_length): + continue + + if mut_type == "del": + mut_rng = map(lambda x: ref2seq.ref2querypos(read, x), range(vcf_rec.start, vcf_rec.stop + 1)) # check with peter re in/exclusivity of range + mut_rng_ops = list(map(lambda x: ref2seq.pos2op(x, read), mut_rng)) + if (mut_rng_ops[0] != Ops.match or + mut_rng_ops[-1] != Ops.match or + any(x != Ops.delete for x in mut_rng_ops)): + continue + elif mut_type == "ins": + mut_rng = map(lambda x: ref2seq.ref2querypos(read, x), range(mut_pos, mut_pos + len(vcf_rec.alts[0]) + 1)) + mut_rng_ops = list(map(lambda x: ref2seq.pos2op(x, read), mut_rng)) + if (mut_rng_ops[0] != Ops.match or + mut_rng_ops[-1] != Ops.match or + any(x != Ops.ins for x in mut_rng_ops) or + read.query_sequence[mut_pos:len(vcf_rec.alts[0])] != vcf_rec.alts[0]): + continue + + if ('S' in read.cigarstring and # type: ignore + statistics.mean(read.query_alignment_qualities) < clip_qual_cutoff): # type: ignore + continue + + if read.flag & 0x40: # read first in pair + # ADD READ TO DICT OR SOMETHING + mut_reads[mut_sample_name].append(read) + else: # read second in pair + if read.flag & 0x10: + mate = bam_reader_dict[mut_sample_name].mate(read) + if mate.reference_end is None: + continue # ? + if read.reference_start <= mate.reference_end: # check with Peter, does this map on to his code accurately + read_start = mate.query_alignment_end + 1 + else: + if read.reference_end >= read.next_reference_start: + read_end = read.next_reference_start - 1 + if read_start <= vcf_rec.pos <= read_end: + # ADD READ TO DICT OR SOMETHING + mut_reads[mut_sample_name].append(read) + + + + return + +if __name__ == '__main__': + # do stuff + print('hello world') \ No newline at end of file From 71fb790f8920723c3411632510782f9fbf51dca3 Mon Sep 17 00:00:00 2001 From: ab63 Date: Thu, 30 May 2024 12:19:02 +0100 Subject: [PATCH 005/165] remove dups with wobble, get start end pairs. 
- on to check hairpin filter --- main.py | 107 ++++++++++++++++++++++++++++++++++++-------------------- 1 file changed, 69 insertions(+), 38 deletions(-) diff --git a/main.py b/main.py index e5012c2..3deb08f 100644 --- a/main.py +++ b/main.py @@ -12,40 +12,6 @@ ['start', 'end', 'mate_start', 'mate_end'], star = 0) -# pysam AlignmentFile.Fetch will return iterator over reads -# which yields AlignedSegment -def start_end_mate_pairs( - record: pysam.AlignedSegment, - bam: pysam.AlignmentFile -) -> list[int]: - # want the start of the record, the end - # start of mate, end of mate - # .reference_start - mate: pysam.AlignedSegment = bam.mate(record) - # Julia XAM gets cigar info differently, checking CG:B,I tag - # does this matter? - cig: list[tuple[int, int]] = record.cigartuples - mate_cig: list[tuple[int, int]] = mate.cigartuples - - # this gets pos wrt to reference, not query sequence, is that desired? - start: int = record.reference_start - end: int = record.reference_end - mate_start: int = record.reference_start - mate_end: int = record.reference_end - - # behaviour on cig = None? - if cig[0][0] == Ops.soft: - start -= cig[0][1] - if cig[-1][0] == Ops.soft: - end += cig[-1][1] - if mate_cig[0][0] == Ops.soft: - mate_start -= mate_cig[0][1] - if mate_cig[-1][0] == Ops.soft: - mate_end += mate_cig[-1][1] - - # appears mate posns simply aren't assigned if none - return [start, end, mate_start, mate_end] - # Peter does this in place. Any particular reason? 
# is structure of nonref_reads optimal/appropriate @@ -91,7 +57,7 @@ def main( vcf_obj: pysam.VariantFile = pysam.VariantFile(vcf_in_path) sample_names: list[str] = list(vcf_obj.header.samples) - mut_reads: dict[str, list] = {key: [] for key in sample_names} + mut_reads: dict[str, list[pysam.AlignedSegment]] = {key: [] for key in sample_names} # try excepts bam_reader_dict: dict[str, Optional[pysam.AlignmentFile]] = dict.fromkeys(sample_names) @@ -129,7 +95,7 @@ def main( samples_w_mutants = [name for name in sample_names if vcf_rec.samples[name]["GT"] == (0, 1)] for mut_sample_name in samples_w_mutants: - + ### get_mutant_reads for read in bam_reader_dict[mut_sample_name].fetch(vcf_rec.chrom, vcf_rec.start, vcf_rec.stop) # type: ignore if any(x is None for x in [read.query_sequence, read.query_qualities, read.cigarstring, read.reference_start, read.reference_end]): @@ -192,8 +158,73 @@ def main( if read_start <= vcf_rec.pos <= read_end: # ADD READ TO DICT OR SOMETHING mut_reads[mut_sample_name].append(read) - - + ### end + ### remove_dups_with_wobble + for _, reads in mut_reads.items(): + if len(reads) == 0: + continue + # want the start of the record, the end + ### start_mate_end_pairs() + # incidentally, I suppose hairpin only works for paired data? + sorted_ends = [] + for read in reads: + mate = bam.mate(read) + + if any(x is None for x in [read.reference_start, + read.reference_end, + read.cigartuples, + mate.reference_start, + mate.reference_end, + mate.cigartuples]): + continue # ? + + # this gets pos wrt to reference, not query sequence, is that desired? + start: int = read.reference_start + end: int = read.reference_end + mate_start: int = mate.reference_start + mate_end: int = mate.reference_end + + # Peter + # behaviour on cig = None? + # Julia XAM gets cigar info differently, checking CG:B,I tag + # does this matter? 
+ if read.cigartuples[0][0] == Ops.soft: + start -= read.cigartuples[0][1] + if read.cigartuples[-1][0] == Ops.soft: + end += cig[-1][1] + if mate.cigartuples[0][0] == Ops.soft: + mate_start -= mate.cigartuples[0][1] + if mate.cigartuples[-1][0] == Ops.soft: + mate_end += mate.cigartuples[-1][1] + + # appears mate posns simply aren't assigned if none + sorted_ends.append(sorted([start, end, mate_start, mate_end])) + ### end + # I don't really understand this + sorted_ends: list[list[int]] = sorted(sorted_ends) + min_ends: list[list[int]] = [sorted_ends.pop(0)] + i = 1 + while len(sorted_ends) != 0: + loop_ends: list[int] = sorted_ends.pop(0) + max_spans = map(lambda sublist: max([abs(x - y) for x, y in zip(sublist, loop_ends)]), min_ends) + + if all([x <= max_span for x in max_spans]): + min_ends.append(loop_ends) + reads.pop(i) + else: + min_ends = [loop_ends] + i += 1 + del(i) + ### end + + ### check hairpin filter + vcf_rec + mut_reads + samples_w_mutants + + for samp, reads in mut_reads: + for read in reads: + return From 0d46902345402d37a3231053541da472f80811b6 Mon Sep 17 00:00:00 2001 From: ab63 Date: Thu, 30 May 2024 14:20:13 +0100 Subject: [PATCH 006/165] filter testing --- main.py | 95 ++++++++++++++++++++++++++++++++++----------------------- 1 file changed, 57 insertions(+), 38 deletions(-) diff --git a/main.py b/main.py index 3deb08f..7df17a3 100644 --- a/main.py +++ b/main.py @@ -7,43 +7,10 @@ Ops = Enum('Ops', ['match', 'ins', 'delete', 'skip', 'soft', 'hard', 'pad', 'equal', 'diff', 'back'], start = 0) -# for start_end_mate_pairs -Pairs = Enum('Pairs', - ['start', 'end', 'mate_start', 'mate_end'], - star = 0) - - -# Peter does this in place. Any particular reason? 
-# is structure of nonref_reads optimal/appropriate -def remove_dups_with_wobble( - nonref_reads: dict, - max_span_ends: int -) -> dict: - - return dict() - - -# filters analysed on a cohort basis -def test_filters( - vcf_posn: int, - mutant_reads: list[pysam.AlignedSegment], - cent90_thresh: float, - AL_filt_thresh: float -) -> tuple[bool, bool]: - - mut_pos_f: list[int] = [] - mut_fracs_f: list[float] = [] - mut_pos_r: list[int] = [] - mut_fracs_r: list[float] = [] - aln_scores: list[float] = [] - - return tuple() - # is streaming approach necessary? def main( bam_paths: list, - intervals, # type? vcf_in_path: str, vcf_out_path: str, clip_qual_cutoff: int, @@ -218,13 +185,65 @@ def main( ### end ### check hairpin filter - vcf_rec - mut_reads - samples_w_mutants - - for samp, reads in mut_reads: + mut_read_pos_f: list[int] = [] + mut_read_pos_r: list[int] = [] + mut_read_fracs_f: list[float] = [] + mut_read_fracs_r: list[float] = [] + aln_scores: list[float] = [] + for _, reads in mut_reads.items(): for read in reads: + if any([x is None for x in [read.reference_start, read.reference_end]]): + continue # ? + mut_pos = ref2seq.ref2querypos(read, vcf_rec.pos) + if mut_pos == -1: + continue # ? + if read.flag & 0x10: + read_loc = read.reference_end - mut_pos + 1 + mut_read_fracs_r.append(read_loc / (read.reference_start - read.reference_end + 1)) + mut_read_pos_r.append(read_loc) + else: + read_loc = (mut_pos - read.reference_start + 1) + mut_read_fracs_f.append(read_loc / (read.reference_end - read.reference_start + 1)) + mut_read_pos_f.append(read_loc) + try: + read.get_tag('AS') + except KeyError: + continue # ? + aln_scores.append(read.get_tag('AS') / read.query_length) # or should this be .query_alignment_length? 
(Peter) + al_filt = statistics.median(aln_scores) <= AL_thresh + + if len(mut_read_pos_f) > 1: + mad_f = max(mut_read_pos_f) - min(mut_read_pos_f) + sd_f = statistics.stdev(mut_read_pos_f) + if len(mut_read_pos_r) > 1: + mad_r = max(mut_read_pos_r) - min(mut_read_pos_r) + sd_r = statistics.stdev(mut_read_pos_r) + # hairpin conditions from Ellis et al. + hp_filt = True + # these branches all lead to the same result! + if len(mut_read_pos_f) > 1 and len(mut_read_pos_r) > 1: + frac_lt_thresh = sum([x <= cent90_thresh for x in mut_read_fracs_f + mut_read_fracs_r]) / (len(mut_read_pos_f) + len(mut_read_pos_r)) + if (frac_lt_thresh < 0.9 or + (mad_f > 2 and mad_r > 2 and sd_f > 2 and sd_r > 2) or + (mad_f > 1 and sd_f > 10) or + (mad_r > 1 and sd_r > 10)): + hp_filt = False + elif len(mut_read_pos_f) > 1: + if (((sum([x <= cent90_thresh for x in mut_read_pos_f]) / len(mut_read_pos_f)) < 0.9) and + mad_f > 0 and + sd_f > 4): + hp_filt = False + elif len(mut_read_pos_r) > 1: + if (((sum([x <= cent90_thresh for x in mut_read_pos_r]) / len(mut_read_pos_r)) < 0.9) and + mad_r > 0 and + sd_r > 4): + hp_filt = False + else: + hp_filt = False + ### end + + ### update vcf record return From 40bb38adc6b0ef58d9965e92c5696cace3097fd8 Mon Sep 17 00:00:00 2001 From: ab63 Date: Thu, 30 May 2024 16:57:40 +0100 Subject: [PATCH 007/165] basic argparse --- .gitignore | 1 + main.py | 64 +++++++++++++++++++++++++++++++++++++----------------- 2 files changed, 45 insertions(+), 20 deletions(-) diff --git a/.gitignore b/.gitignore index d905a25..7ac04f4 100644 --- a/.gitignore +++ b/.gitignore @@ -5,3 +5,4 @@ hairpin.egg-info/ data/ data +__pycache__/ diff --git a/main.py b/main.py index 7df17a3..2752f49 100644 --- a/main.py +++ b/main.py @@ -2,12 +2,14 @@ from enum import Enum from typing import Optional from hairpin import ref2seq -import statistics +from statistics import mean, median, stdev +import argparse Ops = Enum('Ops', ['match', 'ins', 'delete', 'skip', 'soft', 'hard', 'pad', 'equal', 
'diff', 'back'], start = 0) + # is streaming approach necessary? def main( bam_paths: list, @@ -17,15 +19,19 @@ def main( min_mapqual: int, min_basequal: int, max_span: int, - AL_thresh: float, - cent90_thresh:float, - header: bool + al_thresh: float, + cent90_thresh:float ) -> None: - vcf_obj: pysam.VariantFile = pysam.VariantFile(vcf_in_path) - sample_names: list[str] = list(vcf_obj.header.samples) - mut_reads: dict[str, list[pysam.AlignedSegment]] = {key: [] for key in sample_names} + vcf_obj = pysam.VariantFile(vcf_in_path) + # init output + out_head = vcf_obj.header + out_head.add_line("##FILTER=".format(al_thresh)) + out_head.add_line("##FILTER=") + vcf_out = pysam.VariantFile(vcf_out_path, 'w', header=out_head) + sample_names: list[str] = list(vcf_obj.header.samples) + # try excepts bam_reader_dict: dict[str, Optional[pysam.AlignmentFile]] = dict.fromkeys(sample_names) for path in bam_paths: @@ -38,11 +44,8 @@ def main( exit(1) # error else: bam_reader_dict[bam_sample] = bam # type: ignore - - # init output - if header: - pass - + + mut_reads: dict[str, list[pysam.AlignedSegment]] = {key: [] for key in sample_names} # since intervals are unnecessary # - they were an artifact of the shearwater mp - # just iterate through all records in the vcf @@ -63,7 +66,7 @@ def main( for mut_sample_name in samples_w_mutants: ### get_mutant_reads - for read in bam_reader_dict[mut_sample_name].fetch(vcf_rec.chrom, vcf_rec.start, vcf_rec.stop) # type: ignore + for read in bam_reader_dict[mut_sample_name].fetch(vcf_rec.chrom, vcf_rec.start, vcf_rec.stop): # type: ignore if any(x is None for x in [read.query_sequence, read.query_qualities, read.cigarstring, read.reference_start, read.reference_end]): continue # ? 
@@ -106,7 +109,7 @@ def main( continue if ('S' in read.cigarstring and # type: ignore - statistics.mean(read.query_alignment_qualities) < clip_qual_cutoff): # type: ignore + mean(read.query_alignment_qualities) < clip_qual_cutoff): # type: ignore continue if read.flag & 0x40: # read first in pair @@ -211,14 +214,14 @@ def main( except KeyError: continue # ? aln_scores.append(read.get_tag('AS') / read.query_length) # or should this be .query_alignment_length? (Peter) - al_filt = statistics.median(aln_scores) <= AL_thresh + al_filt = median(aln_scores) <= al_thresh if len(mut_read_pos_f) > 1: mad_f = max(mut_read_pos_f) - min(mut_read_pos_f) - sd_f = statistics.stdev(mut_read_pos_f) + sd_f = stdev(mut_read_pos_f) if len(mut_read_pos_r) > 1: mad_r = max(mut_read_pos_r) - min(mut_read_pos_r) - sd_r = statistics.stdev(mut_read_pos_r) + sd_r = stdev(mut_read_pos_r) # hairpin conditions from Ellis et al. hp_filt = True # these branches all lead to the same result! @@ -244,9 +247,30 @@ def main( ### end ### update vcf record - + if al_filt: + vcf_rec.filter.add("ALF") + if hp_filt: + vcf_rec.filter.add("HPF") + + # try except + vcf_out.write(vcf_rec) + return + if __name__ == '__main__': - # do stuff - print('hello world') \ No newline at end of file + + parser = argparse.ArgumentParser(prog="hairpin") + parser.add_argument('-i', '--vcf-in', help="path to input vcf", nargs=1, type=str, required=True) + parser.add_argument('-o', '--vcf-out', help="path to vcf out", nargs=1, type=str, required=True) + parser.add_argument('-b', '--bams', help="list of paths to bams for samples in input vcf, whitespace separated", nargs='+', type=list, required=True) + parser.add_argument('-cq', '--clip-quality-cutoff', default=35) + parser.add_argument('-mq', '--min-mapping-quality', default=11) + parser.add_argument('-mb', '--min-base-quality', default=25) + parser.add_argument('-ms', '--max-read-span', default=6) + parser.add_argument('-al', '--AL-filter-threshold', default=0.93) + 
parser.add_argument('-c9', '--cent90-threshold', default=0.15) + + args = parser.parse_args() + + \ No newline at end of file From 34c55a49c9565e780cd1f74c6a72713b49764565 Mon Sep 17 00:00:00 2001 From: ab63 Date: Wed, 19 Jun 2024 17:06:48 +0100 Subject: [PATCH 008/165] matches Peter's output; first pass --- .gitignore | 1 + main.py | 391 ++++++++++++++++++++++++++++++++--------------------- 2 files changed, 235 insertions(+), 157 deletions(-) diff --git a/.gitignore b/.gitignore index 7ac04f4..5f5a449 100644 --- a/.gitignore +++ b/.gitignore @@ -6,3 +6,4 @@ hairpin.egg-info/ data/ data __pycache__/ +.helix/ diff --git a/main.py b/main.py index 2752f49..2353a4e 100644 --- a/main.py +++ b/main.py @@ -4,13 +4,15 @@ from hairpin import ref2seq from statistics import mean, median, stdev import argparse +import logging +from itertools import tee +import sys Ops = Enum('Ops', ['match', 'ins', 'delete', 'skip', 'soft', 'hard', 'pad', 'equal', 'diff', 'back'], start = 0) -# is streaming approach necessary? def main( bam_paths: list, vcf_in_path: str, @@ -32,174 +34,221 @@ def main( sample_names: list[str] = list(vcf_obj.header.samples) - # try excepts - bam_reader_dict: dict[str, Optional[pysam.AlignmentFile]] = dict.fromkeys(sample_names) - for path in bam_paths: - bam = pysam.AlignmentFile(path, 'rb') - # grab the sample name from first SM field - # in header field RG - # this may cause problems? - # check with Peter - if bam_sample := bam.header.to_dict()['RG'][1]['SM'] not in sample_names: - exit(1) # error - else: - bam_reader_dict[bam_sample] = bam # type: ignore - + # add try excepts + + # func for local scope + bam_reader_dict: dict[str, pysam.AlignmentFile] = dict.fromkeys(sample_names) + def init_bam_dict() -> None: + for path in bam_paths: + bam = pysam.AlignmentFile(path, 'rb') + # grab the sample name from first SM field + # in header field RG + # this may cause problems? 
+ # check with Peter + bam_sample = bam.header.to_dict()['RG'][1]['SM'] + if bam_sample not in sample_names: + logging.error('bam doesnt match') + exit(1) # error + else: + bam_reader_dict[bam_sample] = bam + if any([x is None for x in bam_reader_dict.values()]): + logging.error('not enough bams') + exit(1) + init_bam_dict() # execute w/o polluting namespace + mut_reads: dict[str, list[pysam.AlignedSegment]] = {key: [] for key in sample_names} # since intervals are unnecessary # - they were an artifact of the shearwater mp - # just iterate through all records in the vcf for vcf_rec in vcf_obj.fetch(): - + if vcf_rec.alts is None: - continue # ? - alt_test: bool = len(vcf_rec.alts[0]) == 1 - if vcf_rec.rlen == 1: - mut_type = "sub" if alt_test else "ins" - elif alt_test: - mut_type = "del" - else: - mut_type = "complex" - - # check with Peter - samples_w_mutants = [name for name in sample_names if vcf_rec.samples[name]["GT"] == (0, 1)] - - for mut_sample_name in samples_w_mutants: - ### get_mutant_reads - for read in bam_reader_dict[mut_sample_name].fetch(vcf_rec.chrom, vcf_rec.start, vcf_rec.stop): # type: ignore - - if any(x is None for x in [read.query_sequence, read.query_qualities, read.cigarstring, read.reference_start, read.reference_end]): - continue # ? - - if read.flag & 0xE02 or read.mapping_quality < min_mapqual: - continue + logging.error('vcf rec has no alts') + continue # ? 
ask Peter + + def fill_mut_reads() -> None: + alt_test: bool = len(vcf_rec.alts[0]) == 1 + if vcf_rec.rlen == 1: + mut_type = "sub" if alt_test else "ins" + elif alt_test: + mut_type = "del" + else: + mut_type = "complex" - mut_pos = ref2seq.ref2querypos(read, vcf_rec.pos) - mut_op = ref2seq.pos2op(mut_pos, read) if mut_pos != -1 else None + # check with Peter + samples_w_mutants = [name for name in sample_names if vcf_rec.samples[name]["GT"] == (0, 1)] - # Check whether read reports variant or not - skip the rest of the loop if does not report variant - # First, check for sub - # does read.query_x work? or should it be read.query_alignment_x? - if (mut_type == "sub" and - (mut_op != Ops.match or - read.query_sequence[mut_pos] != vcf_rec.alts[0] or - read.query_qualities[mut_pos] < min_basequal)): - continue + if len(samples_w_mutants) == 0: + logging.error('no mutants') + sys.exit(1) + for mut_sample_name in samples_w_mutants: + ### get_mutant_reads + read_iter, test = tee(bam_reader_dict[mut_sample_name].fetch(vcf_rec.chrom, (vcf_rec.pos - 2), vcf_rec.pos)) + try: + next(test) + except StopAsyncIteration: + logging.error('empty iterator') + sys.exit(1) + for read in read_iter: # type: ignore + if any(x is None for x in [read.query_sequence, read.query_qualities, read.cigarstring, read.reference_start, read.reference_end]): + # breakpoint() + continue # ? - # Second, check whether length of read can accommodate size of indel - # what if other alt is longer? 
- if (mut_pos + vcf_rec.rlen > read.query_length or - mut_pos + len(vcf_rec.alts[0]) > read.query_length): - continue + if not (read.flag & 0x2) or read.flag & 0xE00 or read.mapping_quality < min_mapqual: + # breakpoint() + continue - if mut_type == "del": - mut_rng = map(lambda x: ref2seq.ref2querypos(read, x), range(vcf_rec.start, vcf_rec.stop + 1)) # check with peter re in/exclusivity of range - mut_rng_ops = list(map(lambda x: ref2seq.pos2op(x, read), mut_rng)) - if (mut_rng_ops[0] != Ops.match or - mut_rng_ops[-1] != Ops.match or - any(x != Ops.delete for x in mut_rng_ops)): + mut_pos = ref2seq.ref2querypos(read, (vcf_rec.pos - 1)) # VCF 1-INDEXED, BAM 0-INDEXED + mut_op = ref2seq.pos2op(mut_pos, read) if mut_pos != -1 else None + + # Check whether read reports variant or not - skip the rest of the loop if does not report variant + # First, check for sub + # does read.query_x work? or should it be read.query_alignment_x? + if (mut_type == "sub" and + (not (mut_op == Ops.match.value or mut_op == Ops.diff.value) or + read.query_sequence[mut_pos] != vcf_rec.alts[0] or + read.query_qualities[mut_pos] < min_basequal)): + # breakpoint() continue - elif mut_type == "ins": - mut_rng = map(lambda x: ref2seq.ref2querypos(read, x), range(mut_pos, mut_pos + len(vcf_rec.alts[0]) + 1)) - mut_rng_ops = list(map(lambda x: ref2seq.pos2op(x, read), mut_rng)) - if (mut_rng_ops[0] != Ops.match or - mut_rng_ops[-1] != Ops.match or - any(x != Ops.ins for x in mut_rng_ops) or - read.query_sequence[mut_pos:len(vcf_rec.alts[0])] != vcf_rec.alts[0]): + # Second, check whether length of read can accommodate size of indel + # what if other alt is longer? 
+ if (mut_pos + vcf_rec.rlen > read.query_length or + mut_pos + len(vcf_rec.alts[0]) > read.query_length): + # breakpoint() continue + + if mut_type == "del": + mut_rng = map(lambda x: ref2seq.ref2querypos(read, x), range((vcf_rec.pos - 1), vcf_rec.pos)) # check with peter re in/exclusivity of range + mut_rng_ops = list(map(lambda x: ref2seq.pos2op(x, read), mut_rng)) + if (mut_rng_ops[0] != Ops.match.value or + mut_rng_ops[-1] != Ops.match.value or + any(x != Ops.delete.value for x in mut_rng_ops)): + # breakpoint() + continue + elif mut_type == "ins": + mut_rng = map(lambda x: ref2seq.ref2querypos(read, x), range(mut_pos, mut_pos + len(vcf_rec.alts[0]) + 1)) + mut_rng_ops = list(map(lambda x: ref2seq.pos2op(x, read), mut_rng)) + if (mut_rng_ops[0] != Ops.match.value or + mut_rng_ops[-1] != Ops.match.value or + any(x != Ops.ins.value for x in mut_rng_ops) or + read.query_sequence[mut_pos:len(vcf_rec.alts[0])] != vcf_rec.alts[0]): + # breakpoint() + continue - if ('S' in read.cigarstring and # type: ignore - mean(read.query_alignment_qualities) < clip_qual_cutoff): # type: ignore - continue - - if read.flag & 0x40: # read first in pair - # ADD READ TO DICT OR SOMETHING - mut_reads[mut_sample_name].append(read) - else: # read second in pair - if read.flag & 0x10: - mate = bam_reader_dict[mut_sample_name].mate(read) - if mate.reference_end is None: - continue # ? 
- if read.reference_start <= mate.reference_end: # check with Peter, does this map on to his code accurately - read_start = mate.query_alignment_end + 1 - else: - if read.reference_end >= read.next_reference_start: - read_end = read.next_reference_start - 1 - if read_start <= vcf_rec.pos <= read_end: - # ADD READ TO DICT OR SOMETHING + if ('S' in read.cigarstring and # type: ignore + mean(read.query_alignment_qualities) < clip_qual_cutoff): # type: ignore + # breakpoint() + continue + + if read.flag & 0x40: # read first in pair mut_reads[mut_sample_name].append(read) - ### end + else: # read second in pair + read_start = read.reference_start + read_end = read.reference_end + if read.flag & 0x10: + mate = bam_reader_dict[mut_sample_name].mate(read) + if mate.reference_end is None: + # breakpoint() + continue # ? + if read.reference_start <= mate.reference_end: + read_start = mate.reference_end + 1 + else: + if read.reference_end >= read.next_reference_start: # check with Peter, does this map on to his code accurately + read_end = read.next_reference_start - 1 + # Peter's implemenation comments that this conditional below + # checks if mutant overlaps with first read in pair + # I don't see it, perhaps it refers to the code above + if read_start <= (vcf_rec.pos - 1) <= read_end: + mut_reads[mut_sample_name].append(read) + else: + # breakpoint() + continue + fill_mut_reads() + # breakpoint() + if all([x is None for x in mut_reads.values()]): + logging.error('empty mut_reads') + sys.exit(1) + ### remove_dups_with_wobble - for _, reads in mut_reads.items(): - if len(reads) == 0: - continue + def mut_reads_remove_dups_w_wobble() -> None: + for mut_sample_name, reads in mut_reads.items(): + if len(reads) == 0: + continue # want the start of the record, the end - ### start_mate_end_pairs() - # incidentally, I suppose hairpin only works for paired data? 
- sorted_ends = [] - for read in reads: - mate = bam.mate(read) + ### start_mate_end_pairs() + # incidentally, I suppose hairpin only works for paired data? + sorted_ends = [] + for read in reads: + mate = bam_reader_dict[mut_sample_name].mate(read) - if any(x is None for x in [read.reference_start, - read.reference_end, - read.cigartuples, - mate.reference_start, - mate.reference_end, - mate.cigartuples]): - continue # ? + if any(x is None for x in [read.reference_start, + read.reference_end, + read.cigartuples, + mate.reference_start, + mate.reference_end, + mate.cigartuples]): + continue # ? - # this gets pos wrt to reference, not query sequence, is that desired? - start: int = read.reference_start - end: int = read.reference_end - mate_start: int = mate.reference_start - mate_end: int = mate.reference_end + # this gets pos wrt to alignment against reference + start: int = read.reference_start + end: int = read.reference_end + mate_start: int = mate.reference_start + mate_end: int = mate.reference_end - # Peter - # behaviour on cig = None? - # Julia XAM gets cigar info differently, checking CG:B,I tag - # does this matter? - if read.cigartuples[0][0] == Ops.soft: - start -= read.cigartuples[0][1] - if read.cigartuples[-1][0] == Ops.soft: - end += cig[-1][1] - if mate.cigartuples[0][0] == Ops.soft: - mate_start -= mate.cigartuples[0][1] - if mate.cigartuples[-1][0] == Ops.soft: - mate_end += mate.cigartuples[-1][1] + # Peter + # behaviour on cig = None? + # Julia XAM gets cigar info differently, checking CG:B,I tag + # does this matter? 
+ if read.cigartuples[0][0] == Ops.soft.value: + start -= read.cigartuples[0][1] + if read.cigartuples[-1][0] == Ops.soft.value: + end += read.cigartuples[-1][1] + if mate.cigartuples[0][0] == Ops.soft.value: + mate_start -= mate.cigartuples[0][1] + if mate.cigartuples[-1][0] == Ops.soft.value: + mate_end += mate.cigartuples[-1][1] - # appears mate posns simply aren't assigned if none - sorted_ends.append(sorted([start, end, mate_start, mate_end])) - ### end - # I don't really understand this - sorted_ends: list[list[int]] = sorted(sorted_ends) - min_ends: list[list[int]] = [sorted_ends.pop(0)] - i = 1 - while len(sorted_ends) != 0: - loop_ends: list[int] = sorted_ends.pop(0) - max_spans = map(lambda sublist: max([abs(x - y) for x, y in zip(sublist, loop_ends)]), min_ends) + # appears mate posns simply aren't assigned if none + sorted_ends.append(sorted([start, end, mate_start, mate_end])) + ### end + sorted_ends: list[list[int]] = sorted(sorted_ends) # sort sublists on first element + min_ends: list[list[int]] = [sorted_ends.pop(0)] + # I dont' fully understand this segment but I think it recapitulates the Julia (ask Peter) + i = 1 + while len(sorted_ends) != 0: + loop_ends: list[int] = sorted_ends.pop(0) + max_spans = map(lambda sublist: max([abs(x - y) for x, y in zip(sublist, loop_ends)]), min_ends) - if all([x <= max_span for x in max_spans]): - min_ends.append(loop_ends) - reads.pop(i) - else: - min_ends = [loop_ends] - i += 1 - del(i) - ### end - + if all([x <= max_span for x in max_spans]): + min_ends.append(loop_ends) + reads.pop(i) + else: + min_ends = [loop_ends] + i += 1 + mut_reads[mut_sample_name] = reads + mut_reads_remove_dups_w_wobble() + # breakpoint() + ### check hairpin filter mut_read_pos_f: list[int] = [] mut_read_pos_r: list[int] = [] mut_read_fracs_f: list[float] = [] mut_read_fracs_r: list[float] = [] aln_scores: list[float] = [] + # for my test vcf, by the time we get here there is only on read, for one sample, in mut_reads + # one sample 
is correct, since 0/1 only appears on in l0045 + # but Peter's implementation flags HPF, so perhaps there should be more samples? + # need to follow that bam through this process, see what happens to each read for _, reads in mut_reads.items(): for read in reads: + # breakpoint() if any([x is None for x in [read.reference_start, read.reference_end]]): + # breakpoint() continue # ? - mut_pos = ref2seq.ref2querypos(read, vcf_rec.pos) + mut_pos = ref2seq.ref2querypos(read, (vcf_rec.pos - 1)) if mut_pos == -1: + # breakpoint() continue # ? if read.flag & 0x10: read_loc = read.reference_end - mut_pos + 1 @@ -209,40 +258,48 @@ def main( read_loc = (mut_pos - read.reference_start + 1) mut_read_fracs_f.append(read_loc / (read.reference_end - read.reference_start + 1)) mut_read_pos_f.append(read_loc) + # breakpoint() try: - read.get_tag('AS') + aln_scores.append(read.get_tag('AS') / read.query_length) # or should this be .query_alignment_length? (Peter) except KeyError: + # breakpoint() continue # ? - aln_scores.append(read.get_tag('AS') / read.query_length) # or should this be .query_alignment_length? (Peter) al_filt = median(aln_scores) <= al_thresh - - if len(mut_read_pos_f) > 1: + + # breakpoint() + fbool = len(mut_read_pos_f) > 1 + rbool = len(mut_read_pos_r) > 1 + if fbool: mad_f = max(mut_read_pos_f) - min(mut_read_pos_f) sd_f = stdev(mut_read_pos_f) - if len(mut_read_pos_r) > 1: + if rbool: mad_r = max(mut_read_pos_r) - min(mut_read_pos_r) sd_r = stdev(mut_read_pos_r) # hairpin conditions from Ellis et al. hp_filt = True # these branches all lead to the same result! 
- if len(mut_read_pos_f) > 1 and len(mut_read_pos_r) > 1: + if fbool and rbool: frac_lt_thresh = sum([x <= cent90_thresh for x in mut_read_fracs_f + mut_read_fracs_r]) / (len(mut_read_pos_f) + len(mut_read_pos_r)) if (frac_lt_thresh < 0.9 or (mad_f > 2 and mad_r > 2 and sd_f > 2 and sd_r > 2) or (mad_f > 1 and sd_f > 10) or (mad_r > 1 and sd_r > 10)): + # breakpoint() hp_filt = False - elif len(mut_read_pos_f) > 1: + elif fbool: if (((sum([x <= cent90_thresh for x in mut_read_pos_f]) / len(mut_read_pos_f)) < 0.9) and mad_f > 0 and sd_f > 4): + # breakpoint() hp_filt = False - elif len(mut_read_pos_r) > 1: + elif rbool: if (((sum([x <= cent90_thresh for x in mut_read_pos_r]) / len(mut_read_pos_r)) < 0.9) and mad_r > 0 and sd_r > 4): + # breakpoint() hp_filt = False else: + # breakpoint() hp_filt = False ### end @@ -251,26 +308,46 @@ def main( vcf_rec.filter.add("ALF") if hp_filt: vcf_rec.filter.add("HPF") + breakpoint() - # try except + # try except + # breakpoint() vcf_out.write(vcf_rec) - return - if __name__ == '__main__': + logging.basicConfig(level=logging.INFO) + parser = argparse.ArgumentParser(prog="hairpin") - parser.add_argument('-i', '--vcf-in', help="path to input vcf", nargs=1, type=str, required=True) - parser.add_argument('-o', '--vcf-out', help="path to vcf out", nargs=1, type=str, required=True) - parser.add_argument('-b', '--bams', help="list of paths to bams for samples in input vcf, whitespace separated", nargs='+', type=list, required=True) - parser.add_argument('-cq', '--clip-quality-cutoff', default=35) - parser.add_argument('-mq', '--min-mapping-quality', default=11) - parser.add_argument('-mb', '--min-base-quality', default=25) - parser.add_argument('-ms', '--max-read-span', default=6) - parser.add_argument('-al', '--AL-filter-threshold', default=0.93) - parser.add_argument('-c9', '--cent90-threshold', default=0.15) + parser._optionals.title = 'info' + parser.add_argument('-v', '--version', help='print version', action='version', 
version='hairpin 1.0.0') + req = parser.add_argument_group('required') + req.add_argument('-i', '--vcf-in', help="path to input vcf", required=True) + req.add_argument('-o', '--vcf-out', help="path to vcf out", required=True) + req.add_argument('-b', '--bams', help="list of paths to bams for samples in input vcf, whitespace separated", nargs='+', required=True) + opt = parser.add_argument_group('options') + opt.add_argument('-cq', '--clip-quality-cutoff', help='default: 35', type=int) + opt.add_argument('-mq', '--min-mapping-quality', help='default: 11', type=int) + opt.add_argument('-mb', '--min-base-quality', help='default: 25', type=int) + opt.add_argument('-ms', '--max-read-span', help='default: 6', type=int) + opt.add_argument('-al', '--AL-filter-threshold', help='default: 0.93', type=float) + opt.add_argument('-c9', '--cent90-threshold', help='default: 0.15', type=float) args = parser.parse_args() + if any([x is None for _, x in vars(args).items()]): + logging.info('option(s) not provided, using defaults') + + main( + bam_paths=args.bams, + vcf_in_path=args.vcf_in, + vcf_out_path=args.vcf_out, + clip_qual_cutoff=args.clip_quality_cutoff if args.clip_quality_cutoff else 35, + min_mapqual=args.min_mapping_quality if args.min_mapping_quality else 11, + min_basequal=args.min_base_quality if args.min_base_quality else 25, + max_span=args.max_read_span if args.max_read_span else 6, + al_thresh=args.AL_filter_threshold if args.AL_filter_threshold else 0.93, + cent90_thresh=args.cent90_threshold if args.cent90_threshold else 0.15 + ) + - \ No newline at end of file From 89204b4b1a99cbf7ebd3be84d888f116ad0faa30 Mon Sep 17 00:00:00 2001 From: ab63 Date: Thu, 20 Jun 2024 10:26:44 +0100 Subject: [PATCH 009/165] rethink block scope --- main.py | 289 ++++++++++++++++++++++++++++---------------------------- 1 file changed, 143 insertions(+), 146 deletions(-) diff --git a/main.py b/main.py index 2353a4e..a6036c3 100644 --- a/main.py +++ b/main.py @@ -1,6 +1,5 @@ import pysam 
from enum import Enum -from typing import Optional from hairpin import ref2seq from statistics import mean, median, stdev import argparse @@ -66,167 +65,165 @@ def init_bam_dict() -> None: logging.error('vcf rec has no alts') continue # ? ask Peter - def fill_mut_reads() -> None: - alt_test: bool = len(vcf_rec.alts[0]) == 1 - if vcf_rec.rlen == 1: - mut_type = "sub" if alt_test else "ins" - elif alt_test: - mut_type = "del" - else: - mut_type = "complex" - - # check with Peter - samples_w_mutants = [name for name in sample_names if vcf_rec.samples[name]["GT"] == (0, 1)] + alt_test: bool = len(vcf_rec.alts[0]) == 1 + if vcf_rec.rlen == 1: + mut_type = "sub" if alt_test else "ins" + elif alt_test: + mut_type = "del" + else: + mut_type = "complex" + + # check with Peter + samples_w_mutants = [name for name in sample_names if vcf_rec.samples[name]["GT"] == (0, 1)] - if len(samples_w_mutants) == 0: - logging.error('no mutants') + if len(samples_w_mutants) == 0: + logging.error('no mutants') + sys.exit(1) + for mut_sample_name in samples_w_mutants: + ### get_mutant_reads + read_iter, test = tee(bam_reader_dict[mut_sample_name].fetch(vcf_rec.chrom, (vcf_rec.pos - 2), vcf_rec.pos)) + try: + next(test) + except StopAsyncIteration: + logging.error('empty iterator') sys.exit(1) - for mut_sample_name in samples_w_mutants: - ### get_mutant_reads - read_iter, test = tee(bam_reader_dict[mut_sample_name].fetch(vcf_rec.chrom, (vcf_rec.pos - 2), vcf_rec.pos)) - try: - next(test) - except StopAsyncIteration: - logging.error('empty iterator') - sys.exit(1) - for read in read_iter: # type: ignore - if any(x is None for x in [read.query_sequence, read.query_qualities, read.cigarstring, read.reference_start, read.reference_end]): - # breakpoint() - continue # ? 
- - if not (read.flag & 0x2) or read.flag & 0xE00 or read.mapping_quality < min_mapqual: - # breakpoint() - continue - - mut_pos = ref2seq.ref2querypos(read, (vcf_rec.pos - 1)) # VCF 1-INDEXED, BAM 0-INDEXED - mut_op = ref2seq.pos2op(mut_pos, read) if mut_pos != -1 else None + for read in read_iter: # type: ignore + if any(x is None for x in [read.query_sequence, read.query_qualities, read.cigarstring, read.reference_start, read.reference_end]): + # breakpoint() + continue # ? + + if not (read.flag & 0x2) or read.flag & 0xE00 or read.mapping_quality < min_mapqual: + # breakpoint() + continue + + mut_pos = ref2seq.ref2querypos(read, (vcf_rec.pos - 1)) # VCF 1-INDEXED, BAM 0-INDEXED + mut_op = ref2seq.pos2op(mut_pos, read) if mut_pos != -1 else None + + # Check whether read reports variant or not - skip the rest of the loop if does not report variant + # First, check for sub + # does read.query_x work? or should it be read.query_alignment_x? + if (mut_type == "sub" and + (not (mut_op == Ops.match.value or mut_op == Ops.diff.value) or + read.query_sequence[mut_pos] != vcf_rec.alts[0] or + read.query_qualities[mut_pos] < min_basequal)): + # breakpoint() + continue + # Second, check whether length of read can accommodate size of indel + # what if other alt is longer? + if (mut_pos + vcf_rec.rlen > read.query_length or + mut_pos + len(vcf_rec.alts[0]) > read.query_length): + # breakpoint() + continue - # Check whether read reports variant or not - skip the rest of the loop if does not report variant - # First, check for sub - # does read.query_x work? or should it be read.query_alignment_x? 
- if (mut_type == "sub" and - (not (mut_op == Ops.match.value or mut_op == Ops.diff.value) or - read.query_sequence[mut_pos] != vcf_rec.alts[0] or - read.query_qualities[mut_pos] < min_basequal)): + if mut_type == "del": + mut_rng = map(lambda x: ref2seq.ref2querypos(read, x), range((vcf_rec.pos - 1), vcf_rec.pos)) # check with peter re in/exclusivity of range + mut_rng_ops = list(map(lambda x: ref2seq.pos2op(x, read), mut_rng)) + if (mut_rng_ops[0] != Ops.match.value or + mut_rng_ops[-1] != Ops.match.value or + any(x != Ops.delete.value for x in mut_rng_ops)): # breakpoint() continue - # Second, check whether length of read can accommodate size of indel - # what if other alt is longer? - if (mut_pos + vcf_rec.rlen > read.query_length or - mut_pos + len(vcf_rec.alts[0]) > read.query_length): + elif mut_type == "ins": + mut_rng = map(lambda x: ref2seq.ref2querypos(read, x), range(mut_pos, mut_pos + len(vcf_rec.alts[0]) + 1)) + mut_rng_ops = list(map(lambda x: ref2seq.pos2op(x, read), mut_rng)) + if (mut_rng_ops[0] != Ops.match.value or + mut_rng_ops[-1] != Ops.match.value or + any(x != Ops.ins.value for x in mut_rng_ops) or + read.query_sequence[mut_pos:len(vcf_rec.alts[0])] != vcf_rec.alts[0]): # breakpoint() continue + + if ('S' in read.cigarstring and # type: ignore + mean(read.query_alignment_qualities) < clip_qual_cutoff): # type: ignore + # breakpoint() + continue - if mut_type == "del": - mut_rng = map(lambda x: ref2seq.ref2querypos(read, x), range((vcf_rec.pos - 1), vcf_rec.pos)) # check with peter re in/exclusivity of range - mut_rng_ops = list(map(lambda x: ref2seq.pos2op(x, read), mut_rng)) - if (mut_rng_ops[0] != Ops.match.value or - mut_rng_ops[-1] != Ops.match.value or - any(x != Ops.delete.value for x in mut_rng_ops)): - # breakpoint() - continue - elif mut_type == "ins": - mut_rng = map(lambda x: ref2seq.ref2querypos(read, x), range(mut_pos, mut_pos + len(vcf_rec.alts[0]) + 1)) - mut_rng_ops = list(map(lambda x: ref2seq.pos2op(x, read), mut_rng)) - 
if (mut_rng_ops[0] != Ops.match.value or - mut_rng_ops[-1] != Ops.match.value or - any(x != Ops.ins.value for x in mut_rng_ops) or - read.query_sequence[mut_pos:len(vcf_rec.alts[0])] != vcf_rec.alts[0]): + if read.flag & 0x40: # read first in pair + mut_reads[mut_sample_name].append(read) + else: # read second in pair + read_start = read.reference_start + read_end = read.reference_end + if read.flag & 0x10: + mate = bam_reader_dict[mut_sample_name].mate(read) + if mate.reference_end is None: # breakpoint() - continue - - if ('S' in read.cigarstring and # type: ignore - mean(read.query_alignment_qualities) < clip_qual_cutoff): # type: ignore - # breakpoint() - continue - - if read.flag & 0x40: # read first in pair + continue # ? + if read.reference_start <= mate.reference_end: + read_start = mate.reference_end + 1 + else: + if read.reference_end >= read.next_reference_start: # check with Peter, does this map on to his code accurately + read_end = read.next_reference_start - 1 + # Peter's implemenation comments that this conditional below + # checks if mutant overlaps with first read in pair + # I don't see it, perhaps it refers to the code above + if read_start <= (vcf_rec.pos - 1) <= read_end: mut_reads[mut_sample_name].append(read) - else: # read second in pair - read_start = read.reference_start - read_end = read.reference_end - if read.flag & 0x10: - mate = bam_reader_dict[mut_sample_name].mate(read) - if mate.reference_end is None: - # breakpoint() - continue # ? 
- if read.reference_start <= mate.reference_end: - read_start = mate.reference_end + 1 - else: - if read.reference_end >= read.next_reference_start: # check with Peter, does this map on to his code accurately - read_end = read.next_reference_start - 1 - # Peter's implemenation comments that this conditional below - # checks if mutant overlaps with first read in pair - # I don't see it, perhaps it refers to the code above - if read_start <= (vcf_rec.pos - 1) <= read_end: - mut_reads[mut_sample_name].append(read) - else: - # breakpoint() - continue - fill_mut_reads() + else: + # breakpoint() + continue + del(read_iter, test, read, mut_sample_name) # type: ignore # breakpoint() if all([x is None for x in mut_reads.values()]): logging.error('empty mut_reads') sys.exit(1) ### remove_dups_with_wobble - def mut_reads_remove_dups_w_wobble() -> None: - for mut_sample_name, reads in mut_reads.items(): - if len(reads) == 0: - continue - # want the start of the record, the end - ### start_mate_end_pairs() - # incidentally, I suppose hairpin only works for paired data? - sorted_ends = [] - for read in reads: - mate = bam_reader_dict[mut_sample_name].mate(read) - - if any(x is None for x in [read.reference_start, - read.reference_end, - read.cigartuples, - mate.reference_start, - mate.reference_end, - mate.cigartuples]): - continue # ? - - # this gets pos wrt to alignment against reference - start: int = read.reference_start - end: int = read.reference_end - mate_start: int = mate.reference_start - mate_end: int = mate.reference_end - - # Peter - # behaviour on cig = None? - # Julia XAM gets cigar info differently, checking CG:B,I tag - # does this matter? 
- if read.cigartuples[0][0] == Ops.soft.value: - start -= read.cigartuples[0][1] - if read.cigartuples[-1][0] == Ops.soft.value: - end += read.cigartuples[-1][1] - if mate.cigartuples[0][0] == Ops.soft.value: - mate_start -= mate.cigartuples[0][1] - if mate.cigartuples[-1][0] == Ops.soft.value: - mate_end += mate.cigartuples[-1][1] - - # appears mate posns simply aren't assigned if none - sorted_ends.append(sorted([start, end, mate_start, mate_end])) - ### end - sorted_ends: list[list[int]] = sorted(sorted_ends) # sort sublists on first element - min_ends: list[list[int]] = [sorted_ends.pop(0)] - # I dont' fully understand this segment but I think it recapitulates the Julia (ask Peter) - i = 1 - while len(sorted_ends) != 0: - loop_ends: list[int] = sorted_ends.pop(0) - max_spans = map(lambda sublist: max([abs(x - y) for x, y in zip(sublist, loop_ends)]), min_ends) - - if all([x <= max_span for x in max_spans]): - min_ends.append(loop_ends) - reads.pop(i) - else: - min_ends = [loop_ends] - i += 1 - mut_reads[mut_sample_name] = reads - mut_reads_remove_dups_w_wobble() + for mut_sample_name, reads in mut_reads.items(): + if len(reads) == 0: + continue + # want the start of the record, the end + ### start_mate_end_pairs() + # incidentally, I suppose hairpin only works for paired data? + sorted_ends = [] + for read in reads: + mate = bam_reader_dict[mut_sample_name].mate(read) + + if any(x is None for x in [read.reference_start, + read.reference_end, + read.cigartuples, + mate.reference_start, + mate.reference_end, + mate.cigartuples]): + continue # ? + + # this gets pos wrt to alignment against reference + start: int = read.reference_start + end: int = read.reference_end + mate_start: int = mate.reference_start + mate_end: int = mate.reference_end + + # Peter + # behaviour on cig = None? + # Julia XAM gets cigar info differently, checking CG:B,I tag + # does this matter? 
+ if read.cigartuples[0][0] == Ops.soft.value: + start -= read.cigartuples[0][1] + if read.cigartuples[-1][0] == Ops.soft.value: + end += read.cigartuples[-1][1] + if mate.cigartuples[0][0] == Ops.soft.value: + mate_start -= mate.cigartuples[0][1] + if mate.cigartuples[-1][0] == Ops.soft.value: + mate_end += mate.cigartuples[-1][1] + + # appears mate posns simply aren't assigned if none + sorted_ends.append(sorted([start, end, mate_start, mate_end])) + ### end + sorted_ends: list[list[int]] = sorted(sorted_ends) # sort sublists on first element + min_ends: list[list[int]] = [sorted_ends.pop(0)] + # I dont' fully understand this segment but I think it recapitulates the Julia (ask Peter) + i = 1 + while len(sorted_ends) != 0: + loop_ends: list[int] = sorted_ends.pop(0) + max_spans = map(lambda sublist: max([abs(x - y) for x, y in zip(sublist, loop_ends)]), min_ends) + + if all([x <= max_span for x in max_spans]): + min_ends.append(loop_ends) + reads.pop(i) + else: + min_ends = [loop_ends] + i += 1 + mut_reads[mut_sample_name] = reads + del(read, reads, mut_sample_name) # breakpoint() ### check hairpin filter From 680ca38d97a9797d5cd3ab505893297247c6455e Mon Sep 17 00:00:00 2001 From: ab63 Date: Thu, 20 Jun 2024 18:51:37 +0100 Subject: [PATCH 010/165] bash into vaguely DRY, linear, procedural state (WIP) --- main.py | 262 ++++++++++++++++++++++++++++---------------------------- 1 file changed, 130 insertions(+), 132 deletions(-) diff --git a/main.py b/main.py index a6036c3..0f64578 100644 --- a/main.py +++ b/main.py @@ -5,12 +5,22 @@ import argparse import logging from itertools import tee -import sys +from sys import exit as sysexit +from dataclasses import dataclass Ops = Enum('Ops', ['match', 'ins', 'delete', 'skip', 'soft', 'hard', 'pad', 'equal', 'diff', 'back'], start = 0) +@dataclass +class MutReadInfo: + + read: pysam.AlignedSegment + mate: pysam.AlignedSegment + local_posn: int + cig_op: int + + def main( bam_paths: list, @@ -23,18 +33,18 @@ def main( 
al_thresh: float, cent90_thresh:float ) -> None: - + vcf_obj = pysam.VariantFile(vcf_in_path) # init output out_head = vcf_obj.header out_head.add_line("##FILTER=".format(al_thresh)) out_head.add_line("##FILTER=") vcf_out = pysam.VariantFile(vcf_out_path, 'w', header=out_head) - + sample_names: list[str] = list(vcf_obj.header.samples) # add try excepts - + # func for local scope bam_reader_dict: dict[str, pysam.AlignmentFile] = dict.fromkeys(sample_names) def init_bam_dict() -> None: @@ -54,11 +64,7 @@ def init_bam_dict() -> None: logging.error('not enough bams') exit(1) init_bam_dict() # execute w/o polluting namespace - - mut_reads: dict[str, list[pysam.AlignedSegment]] = {key: [] for key in sample_names} - # since intervals are unnecessary - # - they were an artifact of the shearwater mp - - # just iterate through all records in the vcf + for vcf_rec in vcf_obj.fetch(): if vcf_rec.alts is None: @@ -78,73 +84,104 @@ def init_bam_dict() -> None: if len(samples_w_mutants) == 0: logging.error('no mutants') - sys.exit(1) + sysexit(1) + + mut_reads: dict[str, list[pysam.AlignedSegment]] = {key: [] for key in samples_w_mutants} + + mut_read_pos_f: list[int] = [] + mut_read_pos_r: list[int] = [] + mut_read_fracs_f: list[float] = [] + mut_read_fracs_r: list[float] = [] + aln_scores: list[float] = [] + + for mut_sample_name in samples_w_mutants: ### get_mutant_reads - read_iter, test = tee(bam_reader_dict[mut_sample_name].fetch(vcf_rec.chrom, (vcf_rec.pos - 2), vcf_rec.pos)) + read_iter, test_iter = tee(bam_reader_dict[mut_sample_name].fetch(vcf_rec.chrom, vcf_rec.start, (vcf_rec.start + 1))) try: - next(test) - except StopAsyncIteration: + next(test_iter) + except StopIteration: logging.error('empty iterator') - sys.exit(1) + continue + # Peter the above will skip the sample if no reads are returned by the fetch + # is that acceptable? 
Would we not expect reads in the bam if the sample + # exhibits 0/1 + sorted_ends = [] + read = None for read in read_iter: # type: ignore - if any(x is None for x in [read.query_sequence, read.query_qualities, read.cigarstring, read.reference_start, read.reference_end]): - # breakpoint() - continue # ? - + if not (read.flag & 0x2) or read.flag & 0xE00 or read.mapping_quality < min_mapqual: # breakpoint() continue - - mut_pos = ref2seq.ref2querypos(read, (vcf_rec.pos - 1)) # VCF 1-INDEXED, BAM 0-INDEXED + + if any(x is None for x in [read.reference_start, + read.reference_end, + read.query_sequence, + read.query_qualities, + read.cigarstring, + read.cigartuples]): + # error state or continue? + sysexit(1) + + mut_pos = ref2seq.ref2querypos(read, vcf_rec.start) # VCF 1-INDEXED, BAM 0-INDEXED mut_op = ref2seq.pos2op(mut_pos, read) if mut_pos != -1 else None + # Check whether read reports variant or not - skip the rest of the loop if does not report variant # First, check for sub # does read.query_x work? or should it be read.query_alignment_x? - if (mut_type == "sub" and - (not (mut_op == Ops.match.value or mut_op == Ops.diff.value) or - read.query_sequence[mut_pos] != vcf_rec.alts[0] or - read.query_qualities[mut_pos] < min_basequal)): - # breakpoint() - continue + if mut_type == "sub": + if (mut_op not in [Ops.match.value, Ops.diff.value] or + read.query_sequence[mut_pos] != vcf_rec.alts[0] or # what about other alts? + read.query_qualities[mut_pos] < min_basequal): + # breakpoint() + continue # Second, check whether length of read can accommodate size of indel # what if other alt is longer? 
- if (mut_pos + vcf_rec.rlen > read.query_length or - mut_pos + len(vcf_rec.alts[0]) > read.query_length): + elif (mut_pos + vcf_rec.rlen > read.query_length or + mut_pos + len(vcf_rec.alts[0]) > read.query_length): # breakpoint() continue if mut_type == "del": - mut_rng = map(lambda x: ref2seq.ref2querypos(read, x), range((vcf_rec.pos - 1), vcf_rec.pos)) # check with peter re in/exclusivity of range + mut_rng = map(lambda x: ref2seq.ref2querypos(read, x), range(vcf_rec.start, vcf_rec.stop)) mut_rng_ops = list(map(lambda x: ref2seq.pos2op(x, read), mut_rng)) - if (mut_rng_ops[0] != Ops.match.value or - mut_rng_ops[-1] != Ops.match.value or - any(x != Ops.delete.value for x in mut_rng_ops)): + if (mut_rng_ops[0] not in [Ops.match.value, Ops.diff.value] or # Peter should diff op be considered here? + mut_rng_ops[-1] not in [Ops.match.value, Ops.diff.value] or + any(x != Ops.delete.value for x in mut_rng_ops[1:-2])): # breakpoint() continue elif mut_type == "ins": - mut_rng = map(lambda x: ref2seq.ref2querypos(read, x), range(mut_pos, mut_pos + len(vcf_rec.alts[0]) + 1)) + mut_rng = map(lambda x: ref2seq.ref2querypos(read, x), range(vcf_rec.start, (vcf_rec.start + len(vcf_rec.alts[0])))) # other alt? mut_rng_ops = list(map(lambda x: ref2seq.pos2op(x, read), mut_rng)) - if (mut_rng_ops[0] != Ops.match.value or - mut_rng_ops[-1] != Ops.match.value or - any(x != Ops.ins.value for x in mut_rng_ops) or - read.query_sequence[mut_pos:len(vcf_rec.alts[0])] != vcf_rec.alts[0]): + if (mut_rng_ops[0] not in [Ops.match.value, Ops.diff.value] or + mut_rng_ops[-1] not in [Ops.match.value, Ops.diff.value] or + any(x != Ops.ins.value for x in mut_rng_ops[1:-2]) or + read.query_sequence[mut_pos:len(vcf_rec.alts[0])] != vcf_rec.alts[0]): # further checks needed, should it be query_alignment_sequence? # breakpoint() continue - + + # n.b. 
nothing done if complex read + if ('S' in read.cigarstring and # type: ignore mean(read.query_alignment_qualities) < clip_qual_cutoff): # type: ignore # breakpoint() continue + mate = bam_reader_dict[mut_sample_name].mate(read) + + if any(x is None for x in [mate.reference_start, + mate.reference_end, + mate.cigartuples]): + logging.error('badmate') + continue + if read.flag & 0x40: # read first in pair mut_reads[mut_sample_name].append(read) else: # read second in pair read_start = read.reference_start read_end = read.reference_end if read.flag & 0x10: - mate = bam_reader_dict[mut_sample_name].mate(read) if mate.reference_end is None: # breakpoint() continue # ? @@ -156,94 +193,50 @@ def init_bam_dict() -> None: # Peter's implemenation comments that this conditional below # checks if mutant overlaps with first read in pair # I don't see it, perhaps it refers to the code above - if read_start <= (vcf_rec.pos - 1) <= read_end: + if read_start <= vcf_rec.start <= read_end: mut_reads[mut_sample_name].append(read) else: # breakpoint() continue - del(read_iter, test, read, mut_sample_name) # type: ignore - # breakpoint() - if all([x is None for x in mut_reads.values()]): - logging.error('empty mut_reads') - sys.exit(1) - - ### remove_dups_with_wobble - for mut_sample_name, reads in mut_reads.items(): - if len(reads) == 0: - continue - # want the start of the record, the end - ### start_mate_end_pairs() - # incidentally, I suppose hairpin only works for paired data? - sorted_ends = [] - for read in reads: - mate = bam_reader_dict[mut_sample_name].mate(read) - - if any(x is None for x in [read.reference_start, - read.reference_end, - read.cigartuples, - mate.reference_start, - mate.reference_end, - mate.cigartuples]): - continue # ? 
- - # this gets pos wrt to alignment against reference - start: int = read.reference_start - end: int = read.reference_end - mate_start: int = mate.reference_start - mate_end: int = mate.reference_end - + # if we've got this far, then read has been added to mut_reads + # get pos wrt to aligned region # Peter # behaviour on cig = None? # Julia XAM gets cigar info differently, checking CG:B,I tag # does this matter? - if read.cigartuples[0][0] == Ops.soft.value: - start -= read.cigartuples[0][1] - if read.cigartuples[-1][0] == Ops.soft.value: - end += read.cigartuples[-1][1] - if mate.cigartuples[0][0] == Ops.soft.value: - mate_start -= mate.cigartuples[0][1] - if mate.cigartuples[-1][0] == Ops.soft.value: - mate_end += mate.cigartuples[-1][1] - - # appears mate posns simply aren't assigned if none - sorted_ends.append(sorted([start, end, mate_start, mate_end])) - ### end + soft_start = (read.reference_start - read.cigartuples[0][1]) if read.cigartuples[0][0] == Ops.soft.value else read.reference_start + soft_end = (read.reference_end + read.cigartuples[-1][1]) if read.cigartuples[-1][0] == Ops.soft.value else read.reference_end + soft_mate_start = (mate.reference_start - mate.cigartuples[0][1]) if mate.cigartuples[0][0] == Ops.soft.value else mate.reference_start + soft_mate_end = (mate.reference_end + mate.cigartuples[-1][1]) if mate.cigartuples[-1][0] == Ops.soft.value else mate.reference_end + + sorted_ends.append(sorted([soft_start, soft_end, soft_mate_start, soft_mate_end])) sorted_ends: list[list[int]] = sorted(sorted_ends) # sort sublists on first element min_ends: list[list[int]] = [sorted_ends.pop(0)] # I dont' fully understand this segment but I think it recapitulates the Julia (ask Peter) i = 1 + # not sure this is right below... 
while len(sorted_ends) != 0: loop_ends: list[int] = sorted_ends.pop(0) max_spans = map(lambda sublist: max([abs(x - y) for x, y in zip(sublist, loop_ends)]), min_ends) - + if all([x <= max_span for x in max_spans]): min_ends.append(loop_ends) - reads.pop(i) + mut_reads[mut_sample_name].pop(i) else: min_ends = [loop_ends] i += 1 - mut_reads[mut_sample_name] = reads - del(read, reads, mut_sample_name) - # breakpoint() - - ### check hairpin filter - mut_read_pos_f: list[int] = [] - mut_read_pos_r: list[int] = [] - mut_read_fracs_f: list[float] = [] - mut_read_fracs_r: list[float] = [] - aln_scores: list[float] = [] - # for my test vcf, by the time we get here there is only on read, for one sample, in mut_reads - # one sample is correct, since 0/1 only appears on in l0045 - # but Peter's implementation flags HPF, so perhaps there should be more samples? - # need to follow that bam through this process, see what happens to each read + + if read: + del(read) + # if we got nothing for that sample + + if len(mut_reads[mut_sample_name]) == 0: + # is this an error state? if the sample showed 0/1 would we expect viable reads supporting that mutation? + continue + for _, reads in mut_reads.items(): for read in reads: - # breakpoint() - if any([x is None for x in [read.reference_start, read.reference_end]]): - # breakpoint() - continue # ? - - mut_pos = ref2seq.ref2querypos(read, (vcf_rec.pos - 1)) + mut_pos = ref2seq.ref2querypos(read, vcf_rec.start) if mut_pos == -1: # breakpoint() continue # ? @@ -259,44 +252,46 @@ def init_bam_dict() -> None: try: aln_scores.append(read.get_tag('AS') / read.query_length) # or should this be .query_alignment_length? (Peter) except KeyError: + # Peter what is the correct approach to this state? # breakpoint() continue # ? + + if len(aln_scores) == 0: + # Peter what is the correct approach to this state? 
+ logging.error('bad') + al_filt = median(aln_scores) <= al_thresh - # breakpoint() fbool = len(mut_read_pos_f) > 1 rbool = len(mut_read_pos_r) > 1 - if fbool: - mad_f = max(mut_read_pos_f) - min(mut_read_pos_f) - sd_f = stdev(mut_read_pos_f) - if rbool: - mad_r = max(mut_read_pos_r) - min(mut_read_pos_r) - sd_r = stdev(mut_read_pos_r) # hairpin conditions from Ellis et al. hp_filt = True - # these branches all lead to the same result! - if fbool and rbool: - frac_lt_thresh = sum([x <= cent90_thresh for x in mut_read_fracs_f + mut_read_fracs_r]) / (len(mut_read_pos_f) + len(mut_read_pos_r)) - if (frac_lt_thresh < 0.9 or - (mad_f > 2 and mad_r > 2 and sd_f > 2 and sd_r > 2) or - (mad_f > 1 and sd_f > 10) or - (mad_r > 1 and sd_r > 10)): - # breakpoint() - hp_filt = False - elif fbool: + if fbool and not rbool: + mad_f = max(mut_read_pos_f) - min(mut_read_pos_f) + sd_f = stdev(mut_read_pos_f) if (((sum([x <= cent90_thresh for x in mut_read_pos_f]) / len(mut_read_pos_f)) < 0.9) and mad_f > 0 and sd_f > 4): - # breakpoint() hp_filt = False - elif rbool: + elif rbool and not fbool: + mad_r = max(mut_read_pos_r) - min(mut_read_pos_r) + sd_r = stdev(mut_read_pos_r) if (((sum([x <= cent90_thresh for x in mut_read_pos_r]) / len(mut_read_pos_r)) < 0.9) and - mad_r > 0 and - sd_r > 4): - # breakpoint() + mad_r > 0 and + sd_r > 4): + hp_filt = False + elif fbool and rbool: + mad_f = max(mut_read_pos_f) - min(mut_read_pos_f) + sd_f = stdev(mut_read_pos_f) + mad_r = max(mut_read_pos_r) - min(mut_read_pos_r) + sd_r = stdev(mut_read_pos_r) + frac_lt_thresh = sum([x <= cent90_thresh for x in mut_read_fracs_f + mut_read_fracs_r]) / (len(mut_read_pos_f) + len(mut_read_pos_r)) + if (frac_lt_thresh < 0.9 or + (mad_f > 2 and mad_r > 2 and sd_f > 2 and sd_r > 2) or + (mad_f > 1 and sd_f > 10) or + (mad_r > 1 and sd_r > 10)): hp_filt = False else: - # breakpoint() hp_filt = False ### end @@ -305,10 +300,9 @@ def init_bam_dict() -> None: vcf_rec.filter.add("ALF") if hp_filt: 
vcf_rec.filter.add("HPF") - breakpoint() # try except - # breakpoint() + breakpoint() vcf_out.write(vcf_rec) @@ -330,6 +324,11 @@ def init_bam_dict() -> None: opt.add_argument('-ms', '--max-read-span', help='default: 6', type=int) opt.add_argument('-al', '--AL-filter-threshold', help='default: 0.93', type=float) opt.add_argument('-c9', '--cent90-threshold', help='default: 0.15', type=float) + proc_opt = parser.add_argument_group('procedural options') + log_sev_opt = proc_opt.add_mutually_exclusive_group() + log_sev_opt.add_argument('-l', help='log reason for flags to file', nargs=1) + log_sev_opt.add_argument('-ll', help='as -l, and additionally log reason for NOT flagging to file', nargs=1) + log_sev_opt.add_argument('-lll', help='as -ll, and additionaly log reason for discarding reads', nargs=1) args = parser.parse_args() if any([x is None for _, x in vars(args).items()]): @@ -346,5 +345,4 @@ def init_bam_dict() -> None: al_thresh=args.AL_filter_threshold if args.AL_filter_threshold else 0.93, cent90_thresh=args.cent90_threshold if args.cent90_threshold else 0.15 ) - - + From 394b456c7c7e916373930afc2dcdfcd3362f99e1 Mon Sep 17 00:00:00 2001 From: ab63 Date: Fri, 21 Jun 2024 16:52:17 +0100 Subject: [PATCH 011/165] logging --- hairpin/utilities.py | 31 +++++++ main.py | 215 +++++++++++++++++++++++++++---------------- 2 files changed, 165 insertions(+), 81 deletions(-) create mode 100644 hairpin/utilities.py diff --git a/hairpin/utilities.py b/hairpin/utilities.py new file mode 100644 index 0000000..23da78f --- /dev/null +++ b/hairpin/utilities.py @@ -0,0 +1,31 @@ +from typing import TextIO, Optional +import argparse +import re + +def log_decision( + msg: str, + decision_lvl: int, + log_lvl: int, + log_file: Optional[TextIO] +) -> None: + if log_file is not None and decision_lvl >= log_lvl: + print(msg, file = log_file) + + +# Define a custom action to set a string and counter when the argument is present +class SetLogAndSeverity(argparse.Action): + def 
__call__(self, parser, namespace, values, option_string=None): + setattr(namespace, self.dest, values) + + # count ls to set sev + if option_string is not None: + pattern = re.compile('[^-l]') + if pattern.search(option_string): + raise KeyError + else: + sev = option_string.count('l') + else: + raise KeyError + + setattr(namespace, 'severity', sev) + diff --git a/main.py b/main.py index 0f64578..0ce328c 100644 --- a/main.py +++ b/main.py @@ -1,12 +1,17 @@ import pysam from enum import Enum -from hairpin import ref2seq +from hairpin import ref2seq, utilities from statistics import mean, median, stdev import argparse import logging from itertools import tee +from functools import partial from sys import exit as sysexit from dataclasses import dataclass +from typing import Callable, Optional + +EXIT_SUCCESS = 0 +EXIT_FAILURE = 1 Ops = Enum('Ops', ['match', 'ins', 'delete', 'skip', 'soft', 'hard', 'pad', 'equal', 'diff', 'back'], @@ -22,56 +27,43 @@ class MutReadInfo: + +def cleanup(code: int, msg: Optional[str] = None) -> None: + if code != EXIT_SUCCESS and msg: + logging.error(msg) + for obj_name in ['vcf_in_handle', 'vcf_out_handle']: + if obj_name in locals(): + locals()[obj_name].close() # lol + if 'bam_reader_d' in locals(): + locals()['bam_reader_d'].close() + if 'log_file' in locals() and locals()['log_file']: + locals()['log_file'].close() + if code == EXIT_SUCCESS: + logging.info('hairpin complete') + sysexit(code) + + def main( - bam_paths: list, - vcf_in_path: str, - vcf_out_path: str, + bams: dict[str, pysam.AlignmentFile], + vcf_in: pysam.VariantFile, + vcf_out: pysam.VariantFile, clip_qual_cutoff: int, min_mapqual: int, min_basequal: int, max_span: int, al_thresh: float, - cent90_thresh:float -) -> None: - - vcf_obj = pysam.VariantFile(vcf_in_path) - # init output - out_head = vcf_obj.header - out_head.add_line("##FILTER=".format(al_thresh)) - out_head.add_line("##FILTER=") - vcf_out = pysam.VariantFile(vcf_out_path, 'w', header=out_head) - - 
sample_names: list[str] = list(vcf_obj.header.samples) - - # add try excepts - - # func for local scope - bam_reader_dict: dict[str, pysam.AlignmentFile] = dict.fromkeys(sample_names) - def init_bam_dict() -> None: - for path in bam_paths: - bam = pysam.AlignmentFile(path, 'rb') - # grab the sample name from first SM field - # in header field RG - # this may cause problems? - # check with Peter - bam_sample = bam.header.to_dict()['RG'][1]['SM'] - if bam_sample not in sample_names: - logging.error('bam doesnt match') - exit(1) # error - else: - bam_reader_dict[bam_sample] = bam - if any([x is None for x in bam_reader_dict.values()]): - logging.error('not enough bams') - exit(1) - init_bam_dict() # execute w/o polluting namespace - - for vcf_rec in vcf_obj.fetch(): + cent90_thresh: float, + log_func: Callable, +) -> int: + + for vcf_rec in vcf_in.fetch(): if vcf_rec.alts is None: logging.error('vcf rec has no alts') - continue # ? ask Peter + # Peter what is the correct approach to this state? + continue # ? - alt_test: bool = len(vcf_rec.alts[0]) == 1 + alt_test = len(vcf_rec.alts[0]) == 1 if vcf_rec.rlen == 1: mut_type = "sub" if alt_test else "ins" elif alt_test: @@ -83,8 +75,9 @@ def init_bam_dict() -> None: samples_w_mutants = [name for name in sample_names if vcf_rec.samples[name]["GT"] == (0, 1)] if len(samples_w_mutants) == 0: - logging.error('no mutants') - sysexit(1) + logging.error('Mutation {}:{} has no samples exhibiting mutation') + # Peter what is the correct approach to this situtation? 
+ return EXIT_FAILURE mut_reads: dict[str, list[pysam.AlignedSegment]] = {key: [] for key in samples_w_mutants} @@ -97,7 +90,7 @@ def init_bam_dict() -> None: for mut_sample_name in samples_w_mutants: ### get_mutant_reads - read_iter, test_iter = tee(bam_reader_dict[mut_sample_name].fetch(vcf_rec.chrom, vcf_rec.start, (vcf_rec.start + 1))) + read_iter, test_iter = tee(bams[mut_sample_name].fetch(vcf_rec.chrom, vcf_rec.start, (vcf_rec.start + 1))) try: next(test_iter) except StopIteration: @@ -111,7 +104,6 @@ def init_bam_dict() -> None: for read in read_iter: # type: ignore if not (read.flag & 0x2) or read.flag & 0xE00 or read.mapping_quality < min_mapqual: - # breakpoint() continue if any(x is None for x in [read.reference_start, @@ -120,8 +112,8 @@ def init_bam_dict() -> None: read.query_qualities, read.cigarstring, read.cigartuples]): - # error state or continue? - sysexit(1) + # Peter what is the correct response to this state? + continue mut_pos = ref2seq.ref2querypos(read, vcf_rec.start) # VCF 1-INDEXED, BAM 0-INDEXED mut_op = ref2seq.pos2op(mut_pos, read) if mut_pos != -1 else None @@ -168,12 +160,12 @@ def init_bam_dict() -> None: # breakpoint() continue - mate = bam_reader_dict[mut_sample_name].mate(read) + mate = bams[mut_sample_name].mate(read) if any(x is None for x in [mate.reference_start, mate.reference_end, mate.cigartuples]): - logging.error('badmate') + # Peter what is the correct approach to this state continue if read.flag & 0x40: # read first in pair @@ -188,7 +180,7 @@ def init_bam_dict() -> None: if read.reference_start <= mate.reference_end: read_start = mate.reference_end + 1 else: - if read.reference_end >= read.next_reference_start: # check with Peter, does this map on to his code accurately + if read.reference_end >= read.next_reference_start: read_end = read.next_reference_start - 1 # Peter's implemenation comments that this conditional below # checks if mutant overlaps with first read in pair @@ -211,10 +203,10 @@ def init_bam_dict() 
-> None: sorted_ends.append(sorted([soft_start, soft_end, soft_mate_start, soft_mate_end])) sorted_ends: list[list[int]] = sorted(sorted_ends) # sort sublists on first element + # Peter what if sorted_ends contains only 1 item? min_ends: list[list[int]] = [sorted_ends.pop(0)] # I dont' fully understand this segment but I think it recapitulates the Julia (ask Peter) i = 1 - # not sure this is right below... while len(sorted_ends) != 0: loop_ends: list[int] = sorted_ends.pop(0) max_spans = map(lambda sublist: max([abs(x - y) for x, y in zip(sublist, loop_ends)]), min_ends) @@ -225,11 +217,12 @@ def init_bam_dict() -> None: else: min_ends = [loop_ends] i += 1 + del(i) if read: del(read) - # if we got nothing for that sample + # if we got nothing for that sample if len(mut_reads[mut_sample_name]) == 0: # is this an error state? if the sample showed 0/1 would we expect viable reads supporting that mutation? continue @@ -253,33 +246,35 @@ def init_bam_dict() -> None: aln_scores.append(read.get_tag('AS') / read.query_length) # or should this be .query_alignment_length? (Peter) except KeyError: # Peter what is the correct approach to this state? - # breakpoint() continue # ? - if len(aln_scores) == 0: - # Peter what is the correct approach to this state? - logging.error('bad') - - al_filt = median(aln_scores) <= al_thresh + al_filt = (avg_AS := median(aln_scores) <= al_thresh) fbool = len(mut_read_pos_f) > 1 rbool = len(mut_read_pos_r) > 1 # hairpin conditions from Ellis et al. - hp_filt = True if fbool and not rbool: mad_f = max(mut_read_pos_f) - min(mut_read_pos_f) sd_f = stdev(mut_read_pos_f) if (((sum([x <= cent90_thresh for x in mut_read_pos_f]) / len(mut_read_pos_f)) < 0.9) and - mad_f > 0 and - sd_f > 4): + mad_f > 0 and + sd_f > 4): + log_func(msg='Mutation {}:{} --- passed HPF per Ellis et al. 60B(i)'.format(vcf_rec.chrom, vcf_rec.pos), decision_lvl=2) hp_filt = False + else: + log_func(msg='Mutation {}:{} --- failed HPF per Ellis et al. 
60B(i)'.format(vcf_rec.chrom, vcf_rec.pos), decision_lvl=1) + hp_filt = True elif rbool and not fbool: mad_r = max(mut_read_pos_r) - min(mut_read_pos_r) sd_r = stdev(mut_read_pos_r) if (((sum([x <= cent90_thresh for x in mut_read_pos_r]) / len(mut_read_pos_r)) < 0.9) and mad_r > 0 and sd_r > 4): + log_func(msg='Mutation {}:{} --- passed HPF per Ellis et al. 60A(i)'.format(vcf_rec.chrom, vcf_rec.pos), decision_lvl=2) hp_filt = False + else: + log_func(msg='Mutation {}:{} --- failed HPF per Ellis et al. 60(i)'.format(vcf_rec.chrom, vcf_rec.pos), decision_lvl=1) + hp_filt = True elif fbool and rbool: mad_f = max(mut_read_pos_f) - min(mut_read_pos_f) sd_f = stdev(mut_read_pos_f) @@ -290,26 +285,35 @@ def init_bam_dict() -> None: (mad_f > 2 and mad_r > 2 and sd_f > 2 and sd_r > 2) or (mad_f > 1 and sd_f > 10) or (mad_r > 1 and sd_r > 10)): + log_func(msg='Mutation {}:{} --- passed HPF per Ellis et al. 60A(i)'.format(vcf_rec.chrom, vcf_rec.pos), decision_lvl=2) hp_filt = False + else: + log_func(msg='Mutation {}:{} --- failed HPF per Ellis et al. 
60A(i)'.format(vcf_rec.chrom, vcf_rec.pos), decision_lvl=1) + hp_filt = True else: + log_func(msg='Mutation {}:{} --- passed HPF, insufficient reads to support HPF flag', decision_lvl=2) # Peter does this comment accurately assses the situation hp_filt = False - ### end ### update vcf record if al_filt: + log_func(msg='Mutation {}:{} --- failed ALF, median AS of {}'.format(vcf_rec.chrom, vcf_rec.pos, avg_AS), decision_lvl=1) vcf_rec.filter.add("ALF") if hp_filt: vcf_rec.filter.add("HPF") - - # try except - breakpoint() - vcf_out.write(vcf_rec) + + try: + vcf_out.write(vcf_rec) + except Exception as e: + logging.error('Failed to write VCF, reporting: {}'.format(e)) + return EXIT_FAILURE + + return EXIT_SUCCESS if __name__ == '__main__': - - logging.basicConfig(level=logging.INFO) - + + logging.basicConfig(level=logging.INFO, format='%(asctime)s ¦ %(levelname)-8s ¦ %(message)s', datefmt='%I:%M:%S') + parser = argparse.ArgumentParser(prog="hairpin") parser._optionals.title = 'info' parser.add_argument('-v', '--version', help='print version', action='version', version='hairpin 1.0.0') @@ -324,25 +328,74 @@ def init_bam_dict() -> None: opt.add_argument('-ms', '--max-read-span', help='default: 6', type=int) opt.add_argument('-al', '--AL-filter-threshold', help='default: 0.93', type=float) opt.add_argument('-c9', '--cent90-threshold', help='default: 0.15', type=float) - proc_opt = parser.add_argument_group('procedural options') - log_sev_opt = proc_opt.add_mutually_exclusive_group() - log_sev_opt.add_argument('-l', help='log reason for flags to file', nargs=1) - log_sev_opt.add_argument('-ll', help='as -l, and additionally log reason for NOT flagging to file', nargs=1) - log_sev_opt.add_argument('-lll', help='as -ll, and additionaly log reason for discarding reads', nargs=1) - + log_sev_opt = opt.add_mutually_exclusive_group() + log_sev_opt.add_argument('-l', dest='log_path', help='log reason for failing records to file', action=utilities.SetLogAndSeverity) + 
log_sev_opt.add_argument('-ll', dest='log_path', help='as -l, and addtionally log reason for passing records') + log_sev_opt.add_argument('-lll', dest='log_path', help='as -ll, and additionaly log reason for discarding reads associated with a record', action=utilities.SetLogAndSeverity) + args = parser.parse_args() + + if not hasattr(args, 'severity'): + args.severity = 0 + if any([x is None for _, x in vars(args).items()]): logging.info('option(s) not provided, using defaults') - - main( - bam_paths=args.bams, - vcf_in_path=args.vcf_in, - vcf_out_path=args.vcf_out, + + + + try: + log_file = open(args.log_path) if args.log_path else None + except Exception as e: + cleanup(1, 'failed to open log file, reporting: {}'.format(e)) + primed_log_func = partial(utilities.log_decision, log_lvl=args.severity, log_file=log_file) + + try: + vcf_in_handle = pysam.VariantFile(args.vcf_in) + except Exception as e: + cleanup(1, 'failed to open VCF input, reporting: {}'.format(e)) + + # init output + out_head = vcf_in_handle.header + out_head.add_line("##FILTER=".format(args.AL_filter_threshold if args.AL_filter_threshold else None)) + out_head.add_line("##FILTER=") + + try: + vcf_out_handle = pysam.VariantFile(args.vcf_out, 'w', header=out_head) + except Exception as e: + logging.error() + cleanup(1, 'failed to open VCF output, reporting: {}'.format(e)) + + sample_names: list[str] = list(vcf_out_handle.header.samples) + + # func for local scope + bam_reader_d: dict[str, pysam.AlignmentFile] = dict.fromkeys(sample_names) + for path in args.bams: + try: + bam = pysam.AlignmentFile(path, 'rb') + except Exception as e: + cleanup(1, 'failed to read BAM at {}, reporting: {}'.format(path, e)) + # grab the sample name from first SM field + # in header field RG + # this may cause problems? 
+ # check with Peter + bam_sample = bam.header.to_dict()['RG'][1]['SM'] + if bam_sample not in sample_names: + cleanup(1, 'name in header ({}) of BAM at {} does not match any samples in VCF'.format(bam_sample, path)) + else: + bam_reader_d[bam_sample] = bam + + + main_code = main( + bams=bam_reader_d, + vcf_in=vcf_in_handle, + vcf_out=vcf_out_handle, clip_qual_cutoff=args.clip_quality_cutoff if args.clip_quality_cutoff else 35, min_mapqual=args.min_mapping_quality if args.min_mapping_quality else 11, min_basequal=args.min_base_quality if args.min_base_quality else 25, max_span=args.max_read_span if args.max_read_span else 6, al_thresh=args.AL_filter_threshold if args.AL_filter_threshold else 0.93, - cent90_thresh=args.cent90_threshold if args.cent90_threshold else 0.15 + cent90_thresh=args.cent90_threshold if args.cent90_threshold else 0.15, + log_func=primed_log_func ) + cleanup(main_code) From cdd6a69abd0555015c93def6ba79399b27fafc01 Mon Sep 17 00:00:00 2001 From: ab63 Date: Mon, 24 Jun 2024 22:10:35 +0100 Subject: [PATCH 012/165] squash some edge cases --- hairpin/ref2seq.py | 33 +++--------------- main.py | 83 ++++++++++++++++++++++++---------------------- 2 files changed, 49 insertions(+), 67 deletions(-) diff --git a/hairpin/ref2seq.py b/hairpin/ref2seq.py index 01fe796..33e5bba 100644 --- a/hairpin/ref2seq.py +++ b/hairpin/ref2seq.py @@ -3,15 +3,12 @@ def ref2querypos( record: pysam.AlignedSegment, ref_pos: int -) -> int: +) -> int | None: pos_aln = record.get_aligned_pairs() - while True: - try: - aln_pair = pos_aln.pop() - except IndexError: - return -1 # ref_pos not on read + for aln_pair in pos_aln: if aln_pair[1] == ref_pos: return aln_pair[0] + raise IndexError('reference position not covered by read') def pos2op( @@ -20,31 +17,11 @@ def pos2op( ) -> int: cig = record.cigartuples if cig is None: - exit(1) # No cigar tuples for record + raise ValueError('no cigar tuples available for pysam record') # No cigar tuples for record sum_len = 0 while True: - 
try: - cig_pair = cig.pop(0) - except IndexError: - raise RuntimeError # seq_pos not in cigar string + cig_pair = cig.pop(0) sum_len += cig_pair[1] if seq_pos < sum_len: return cig_pair[0] - -""" -In Julia, ref2seq takes a run of cigar ops -and a position on the reference genome. -Here, the cigar ops come from the bam read/record -under examination, and the pos on ref comes from -the position given by the vcf for the vcf record -under examination (one or more bam records will be -examined for each vcf record). It returns an array -of 2 values, the position on the bam read, and the -cigar operation applicable to that position. - -since peter wants to discard if not OP_MATCH -i.e. if clipped -we can use reference start in pysam -which is position where read begins alignment sans clipping etc -""" \ No newline at end of file diff --git a/main.py b/main.py index 0ce328c..637c93c 100644 --- a/main.py +++ b/main.py @@ -17,15 +17,9 @@ ['match', 'ins', 'delete', 'skip', 'soft', 'hard', 'pad', 'equal', 'diff', 'back'], start = 0) -@dataclass -class MutReadInfo: - - read: pysam.AlignedSegment - mate: pysam.AlignedSegment - local_posn: int - cig_op: int - - + +# dataclass for tracking info? +# define exit codes and do all logging outside of main? def cleanup(code: int, msg: Optional[str] = None) -> None: @@ -59,23 +53,26 @@ def main( for vcf_rec in vcf_in.fetch(): if vcf_rec.alts is None: - logging.error('vcf rec has no alts') + logging.error('Mutation {}:{} ¦ no alts in VCF') # Peter what is the correct approach to this state? continue # ? - alt_test = len(vcf_rec.alts[0]) == 1 + # Peter is it possible for other alts to be longer? 
+ # the vcf format doesn't specify as far as I can tell that + # alts should be the same length (and elsewhere) if vcf_rec.rlen == 1: - mut_type = "sub" if alt_test else "ins" - elif alt_test: + mut_type = "sub" if len(vcf_rec.alts[0]) == 1 else "ins" + elif len(vcf_rec.alts[0]) == 1: mut_type = "del" else: mut_type = "complex" - # check with Peter + + # Peter is this sufficiently specific? You search for 0/1 as a string samples_w_mutants = [name for name in sample_names if vcf_rec.samples[name]["GT"] == (0, 1)] if len(samples_w_mutants) == 0: - logging.error('Mutation {}:{} has no samples exhibiting mutation') + logging.error('Mutation {}:{} --- no samples exhibiting mutation') # Peter what is the correct approach to this situtation? return EXIT_FAILURE @@ -87,14 +84,12 @@ def main( mut_read_fracs_r: list[float] = [] aln_scores: list[float] = [] - for mut_sample_name in samples_w_mutants: - ### get_mutant_reads read_iter, test_iter = tee(bams[mut_sample_name].fetch(vcf_rec.chrom, vcf_rec.start, (vcf_rec.start + 1))) try: next(test_iter) except StopIteration: - logging.error('empty iterator') + logging.error('Mutation {}:{} --- no reads mapped to region despite 0/1 for sample {}'.format(vcf_rec.chrom, vcf_rec.pos, mut_sample_name)) continue # Peter the above will skip the sample if no reads are returned by the fetch # is that acceptable? Would we not expect reads in the bam if the sample @@ -116,12 +111,17 @@ def main( continue mut_pos = ref2seq.ref2querypos(read, vcf_rec.start) # VCF 1-INDEXED, BAM 0-INDEXED + # Peter this occurs when the cigar string for a read + # indicates a deletion over the reference position in the read + # despite the vcf calling a single-base substitution + # What is the correct approach to this state? 
+ if mut_pos is None: + continue mut_op = ref2seq.pos2op(mut_pos, read) if mut_pos != -1 else None # Check whether read reports variant or not - skip the rest of the loop if does not report variant # First, check for sub - # does read.query_x work? or should it be read.query_alignment_x? if mut_type == "sub": if (mut_op not in [Ops.match.value, Ops.diff.value] or read.query_sequence[mut_pos] != vcf_rec.alts[0] or # what about other alts? @@ -129,7 +129,7 @@ def main( # breakpoint() continue # Second, check whether length of read can accommodate size of indel - # what if other alt is longer? + # Peter again could other alt is longer? elif (mut_pos + vcf_rec.rlen > read.query_length or mut_pos + len(vcf_rec.alts[0]) > read.query_length): # breakpoint() @@ -182,9 +182,6 @@ def main( else: if read.reference_end >= read.next_reference_start: read_end = read.next_reference_start - 1 - # Peter's implemenation comments that this conditional below - # checks if mutant overlaps with first read in pair - # I don't see it, perhaps it refers to the code above if read_start <= vcf_rec.start <= read_end: mut_reads[mut_sample_name].append(read) else: @@ -194,7 +191,7 @@ def main( # get pos wrt to aligned region # Peter # behaviour on cig = None? - # Julia XAM gets cigar info differently, checking CG:B,I tag + # Also, Julia XAM gets cigar info differently, checking CG:B,I tag # does this matter? 
soft_start = (read.reference_start - read.cigartuples[0][1]) if read.cigartuples[0][0] == Ops.soft.value else read.reference_start soft_end = (read.reference_end + read.cigartuples[-1][1]) if read.cigartuples[-1][0] == Ops.soft.value else read.reference_end @@ -202,33 +199,43 @@ def main( soft_mate_end = (mate.reference_end + mate.cigartuples[-1][1]) if mate.cigartuples[-1][0] == Ops.soft.value else mate.reference_end sorted_ends.append(sorted([soft_start, soft_end, soft_mate_start, soft_mate_end])) + + # if we got nothing for that sample after cycling through all reads + if len(mut_reads[mut_sample_name]) == 0: + # Peter is this an error state? if the sample showed 0/1 would we expect viable reads supporting that mutation? + continue + + # return to per sample sorted_ends: list[list[int]] = sorted(sorted_ends) # sort sublists on first element # Peter what if sorted_ends contains only 1 item? min_ends: list[list[int]] = [sorted_ends.pop(0)] - # I dont' fully understand this segment but I think it recapitulates the Julia (ask Peter) + # Peter I dont' fully understand this segment but I think it recapitulates the Julia i = 1 + drop_idx = [] while len(sorted_ends) != 0: loop_ends: list[int] = sorted_ends.pop(0) max_spans = map(lambda sublist: max([abs(x - y) for x, y in zip(sublist, loop_ends)]), min_ends) if all([x <= max_span for x in max_spans]): min_ends.append(loop_ends) - mut_reads[mut_sample_name].pop(i) + drop_idx.append(i) else: min_ends = [loop_ends] i += 1 - del(i) - + mut_reads[mut_sample_name] = [j for i, j in enumerate(mut_reads[mut_sample_name]) if i not in drop_idx] + del(i, drop_idx) + if read: del(read) - # if we got nothing for that sample - if len(mut_reads[mut_sample_name]) == 0: - # is this an error state? if the sample showed 0/1 would we expect viable reads supporting that mutation? 
- continue + # if we got nothing for all samples exhibiting 0/1 for this mutation + # skip this mutation + # Peter please confirm correct approach to this state + if all([len(x) == 0 for x in mut_reads.values()]): + continue - for _, reads in mut_reads.items(): - for read in reads: + for read_list in mut_reads.values(): + for read in read_list: mut_pos = ref2seq.ref2querypos(read, vcf_rec.start) if mut_pos == -1: # breakpoint() @@ -243,11 +250,12 @@ def main( mut_read_pos_f.append(read_loc) # breakpoint() try: - aln_scores.append(read.get_tag('AS') / read.query_length) # or should this be .query_alignment_length? (Peter) + aln_scores.append(read.get_tag('AS') / read.query_length) except KeyError: # Peter what is the correct approach to this state? continue # ? - + if len(aln_scores) == 0: + breakpoint() al_filt = (avg_AS := median(aln_scores) <= al_thresh) fbool = len(mut_read_pos_f) > 1 @@ -341,8 +349,6 @@ def main( if any([x is None for _, x in vars(args).items()]): logging.info('option(s) not provided, using defaults') - - try: log_file = open(args.log_path) if args.log_path else None except Exception as e: @@ -362,7 +368,6 @@ def main( try: vcf_out_handle = pysam.VariantFile(args.vcf_out, 'w', header=out_head) except Exception as e: - logging.error() cleanup(1, 'failed to open VCF output, reporting: {}'.format(e)) sample_names: list[str] = list(vcf_out_handle.header.samples) From 26efdf5fe17ec6025939385cee64a88e2e175aff Mon Sep 17 00:00:00 2001 From: ab63 Date: Tue, 9 Jul 2024 14:05:16 +0100 Subject: [PATCH 013/165] final shape for prototype --- .gitignore | 2 + hairpin/constants.py | 75 ++++++ hairpin/ref2seq.py | 64 ++++-- hairpin/utilities.py | 31 --- main.py | 536 ++++++++++++++++++++----------------------- pyproject.toml | 5 +- 6 files changed, 376 insertions(+), 337 deletions(-) create mode 100644 hairpin/constants.py delete mode 100644 hairpin/utilities.py diff --git a/.gitignore b/.gitignore index 5f5a449..f287d02 100644 --- a/.gitignore +++ 
b/.gitignore @@ -7,3 +7,5 @@ data/ data __pycache__/ .helix/ +build/ +*.txt diff --git a/hairpin/constants.py b/hairpin/constants.py new file mode 100644 index 0000000..8bc305e --- /dev/null +++ b/hairpin/constants.py @@ -0,0 +1,75 @@ +from enum import Enum, IntEnum, Flag +from typing import Callable, Optional +import dataclasses as d + +EXIT_SUCCESS = 0 +EXIT_FAILURE = 1 + +FiltCodes = IntEnum('FiltCodes', + ['SIXTYAI', 'SIXTYBI', 'ON_THRESHOLD', 'INSUFFICIENT_READS', 'NO_MUTANTS'], + start=0) +Ops = IntEnum('Ops', + ['MATCH', 'INS', 'DEL', 'SKIP', 'SOFT', 'HARD', 'PAD', 'EQUAL', 'DIFF', 'BACK'], + start = 0) +ValidatorFlags = Flag('ReadFlags', + ['CLEAR', 'FLAG', 'MAPQUAL', 'READ_FIELDS_MISSING', 'NOT_ALIGNED', 'BAD_OP', 'NOT_ALT', 'BASEQUAL', 'SHORT', 'CLIPQUAL', 'MATE_MISSING_FIELDS', 'OVERLAP'], + start=0) + + +@d.dataclass +class FilterData: + name: str + flag: bool = False + code: Optional[int] = None + + def set(self): + self.flag = True + + def __iter__(self): + return (getattr(self, field.name) for field in d.fields(self)) + + +@d.dataclass +class HPFilter(FilterData): + name: str = d.field(default='HPF') + +@d.dataclass +class ALFilter(FilterData): + name: str = d.field(default='ALF') + avg_as: Optional[float] = None + +@d.dataclass +class Filters: + HP: FilterData + AL: ALFilter + + def __iter__(self): + return ((field.name, getattr(self, field.name)) for field in d.fields(self)) + + def fill_field(self, field_name, value): + if hasattr(self, field_name): + setattr(self, field_name, value) + else: + raise AttributeError + + def get_field(self, field_name): + if hasattr(self, field_name): + return getattr(self, field_name) + else: + raise AttributeError + + +FiltReturn = Callable[..., Filters] +FlagReturn = Callable[..., int] + + +def print_flag( + print_enum: Enum +) -> None: + print([':'.join([str(e), hex(e.value)]) for e in print_enum]) + +def print_enum( + print_enum: Enum +) -> None: + print([e for e in print_enum]) + diff --git a/hairpin/ref2seq.py 
b/hairpin/ref2seq.py index 33e5bba..00411a9 100644 --- a/hairpin/ref2seq.py +++ b/hairpin/ref2seq.py @@ -1,27 +1,53 @@ import pysam +from hairpin import constants as c def ref2querypos( - record: pysam.AlignedSegment, - ref_pos: int -) -> int | None: - pos_aln = record.get_aligned_pairs() + bam_record: pysam.AlignedSegment, + ref_pos: int, + get_cig: bool = True +) -> tuple[int, int | None]: + pos_aln = bam_record.get_aligned_pairs() + query_pos = pos_op = None for aln_pair in pos_aln: if aln_pair[1] == ref_pos: - return aln_pair[0] - raise IndexError('reference position not covered by read') + query_pos = aln_pair[0] + if query_pos is None or len(pos_aln) == 0: + raise IndexError('reference position not covered by read') + elif get_cig: + dist2op = ref_pos - bam_record.reference_start + 1 # since position is 0-indexed, add 1 to get distance + cig = bam_record.cigartuples + if cig is None or len(cig) == 0: + raise ValueError('no cigar tuples available for pysam record') + sum_len = 0 + while len(cig) > 0: + cig_pair = cig.pop(0) + if cig_pair[0] != c.Ops.SOFT.value: + sum_len += cig_pair[1] + if dist2op <= sum_len: + pos_op = cig_pair[0] + if pos_op is None: + raise ValueError('cigar op could not be recovered') + return query_pos, pos_op -def pos2op( - seq_pos: int, - record: pysam.AlignedSegment +def ref_end_via_cigar( + cig_str: str, + ref_start: int ) -> int: - cig = record.cigartuples - if cig is None: - raise ValueError('no cigar tuples available for pysam record') # No cigar tuples for record - sum_len = 0 - while True: - cig_pair = cig.pop(0) - sum_len += cig_pair[1] - if seq_pos < sum_len: - return cig_pair[0] - + if not cig_str[0].isdigit() or len(cig_str) < 2: + raise ValueError('cigar string misformatted') + cig_l = [] + digit_accumulator: str = '' + for char in cig_str: + if char.isdigit(): + digit_accumulator += char + else: + cig_l.append(digit_accumulator) + cig_l.append(char) + digit_accumulator = '' + cig_t = list(zip(cig_l[0::2], cig_l[1::2])) + 
for op_len, op_code in cig_t: + if op_code in ['M','D','N','=','X']: + ref_start += int(op_len) + return ref_start + diff --git a/hairpin/utilities.py b/hairpin/utilities.py deleted file mode 100644 index 23da78f..0000000 --- a/hairpin/utilities.py +++ /dev/null @@ -1,31 +0,0 @@ -from typing import TextIO, Optional -import argparse -import re - -def log_decision( - msg: str, - decision_lvl: int, - log_lvl: int, - log_file: Optional[TextIO] -) -> None: - if log_file is not None and decision_lvl >= log_lvl: - print(msg, file = log_file) - - -# Define a custom action to set a string and counter when the argument is present -class SetLogAndSeverity(argparse.Action): - def __call__(self, parser, namespace, values, option_string=None): - setattr(namespace, self.dest, values) - - # count ls to set sev - if option_string is not None: - pattern = re.compile('[^-l]') - if pattern.search(option_string): - raise KeyError - else: - sev = option_string.count('l') - else: - raise KeyError - - setattr(namespace, 'severity', sev) - diff --git a/main.py b/main.py index 637c93c..b9a667f 100644 --- a/main.py +++ b/main.py @@ -1,29 +1,16 @@ import pysam -from enum import Enum -from hairpin import ref2seq, utilities +from hairpin import ref2seq as r2s, constants as c from statistics import mean, median, stdev import argparse import logging from itertools import tee from functools import partial -from sys import exit as sysexit -from dataclasses import dataclass -from typing import Callable, Optional - -EXIT_SUCCESS = 0 -EXIT_FAILURE = 1 - -Ops = Enum('Ops', - ['match', 'ins', 'delete', 'skip', 'soft', 'hard', 'pad', 'equal', 'diff', 'back'], - start = 0) - - -# dataclass for tracking info? -# define exit codes and do all logging outside of main? 
+import sys +from typing import Optional def cleanup(code: int, msg: Optional[str] = None) -> None: - if code != EXIT_SUCCESS and msg: + if code != c.EXIT_SUCCESS and msg: logging.error(msg) for obj_name in ['vcf_in_handle', 'vcf_out_handle']: if obj_name in locals(): @@ -32,258 +19,203 @@ def cleanup(code: int, msg: Optional[str] = None) -> None: locals()['bam_reader_d'].close() if 'log_file' in locals() and locals()['log_file']: locals()['log_file'].close() - if code == EXIT_SUCCESS: + if code == c.EXIT_SUCCESS: logging.info('hairpin complete') - sysexit(code) + sys.exit(code) -def main( - bams: dict[str, pysam.AlignmentFile], - vcf_in: pysam.VariantFile, - vcf_out: pysam.VariantFile, - clip_qual_cutoff: int, +# CIGAR best retrieved from CG:B,I tag - implement in future +def validate_read( + vcf_record: pysam.VariantRecord, + read: pysam.AlignedSegment, min_mapqual: int, + clip_qual_cutoff: int, min_basequal: int, - max_span: int, - al_thresh: float, - cent90_thresh: float, - log_func: Callable, + alt: str ) -> int: - - for vcf_rec in vcf_in.fetch(): - - if vcf_rec.alts is None: - logging.error('Mutation {}:{} ¦ no alts in VCF') - # Peter what is the correct approach to this state? - continue # ? - - # Peter is it possible for other alts to be longer? - # the vcf format doesn't specify as far as I can tell that - # alts should be the same length (and elsewhere) - if vcf_rec.rlen == 1: - mut_type = "sub" if len(vcf_rec.alts[0]) == 1 else "ins" - elif len(vcf_rec.alts[0]) == 1: - mut_type = "del" - else: - mut_type = "complex" - - - # Peter is this sufficiently specific? You search for 0/1 as a string - samples_w_mutants = [name for name in sample_names if vcf_rec.samples[name]["GT"] == (0, 1)] - - if len(samples_w_mutants) == 0: - logging.error('Mutation {}:{} --- no samples exhibiting mutation') - # Peter what is the correct approach to this situtation? 
- return EXIT_FAILURE - - mut_reads: dict[str, list[pysam.AlignedSegment]] = {key: [] for key in samples_w_mutants} - - mut_read_pos_f: list[int] = [] - mut_read_pos_r: list[int] = [] - mut_read_fracs_f: list[float] = [] - mut_read_fracs_r: list[float] = [] - aln_scores: list[float] = [] - - for mut_sample_name in samples_w_mutants: - read_iter, test_iter = tee(bams[mut_sample_name].fetch(vcf_rec.chrom, vcf_rec.start, (vcf_rec.start + 1))) - try: - next(test_iter) - except StopIteration: - logging.error('Mutation {}:{} --- no reads mapped to region despite 0/1 for sample {}'.format(vcf_rec.chrom, vcf_rec.pos, mut_sample_name)) - continue - # Peter the above will skip the sample if no reads are returned by the fetch - # is that acceptable? Would we not expect reads in the bam if the sample - # exhibits 0/1 - sorted_ends = [] - read = None - for read in read_iter: # type: ignore - - if not (read.flag & 0x2) or read.flag & 0xE00 or read.mapping_quality < min_mapqual: - continue - - if any(x is None for x in [read.reference_start, - read.reference_end, - read.query_sequence, - read.query_qualities, - read.cigarstring, - read.cigartuples]): - # Peter what is the correct response to this state? - continue - - mut_pos = ref2seq.ref2querypos(read, vcf_rec.start) # VCF 1-INDEXED, BAM 0-INDEXED - # Peter this occurs when the cigar string for a read - # indicates a deletion over the reference position in the read - # despite the vcf calling a single-base substitution - # What is the correct approach to this state? - if mut_pos is None: - continue - mut_op = ref2seq.pos2op(mut_pos, read) if mut_pos != -1 else None - - - # Check whether read reports variant or not - skip the rest of the loop if does not report variant - # First, check for sub - if mut_type == "sub": - if (mut_op not in [Ops.match.value, Ops.diff.value] or - read.query_sequence[mut_pos] != vcf_rec.alts[0] or # what about other alts? 
- read.query_qualities[mut_pos] < min_basequal): - # breakpoint() - continue - # Second, check whether length of read can accommodate size of indel - # Peter again could other alt is longer? - elif (mut_pos + vcf_rec.rlen > read.query_length or - mut_pos + len(vcf_rec.alts[0]) > read.query_length): - # breakpoint() - continue - - if mut_type == "del": - mut_rng = map(lambda x: ref2seq.ref2querypos(read, x), range(vcf_rec.start, vcf_rec.stop)) - mut_rng_ops = list(map(lambda x: ref2seq.pos2op(x, read), mut_rng)) - if (mut_rng_ops[0] not in [Ops.match.value, Ops.diff.value] or # Peter should diff op be considered here? - mut_rng_ops[-1] not in [Ops.match.value, Ops.diff.value] or - any(x != Ops.delete.value for x in mut_rng_ops[1:-2])): - # breakpoint() - continue - elif mut_type == "ins": - mut_rng = map(lambda x: ref2seq.ref2querypos(read, x), range(vcf_rec.start, (vcf_rec.start + len(vcf_rec.alts[0])))) # other alt? - mut_rng_ops = list(map(lambda x: ref2seq.pos2op(x, read), mut_rng)) - if (mut_rng_ops[0] not in [Ops.match.value, Ops.diff.value] or - mut_rng_ops[-1] not in [Ops.match.value, Ops.diff.value] or - any(x != Ops.ins.value for x in mut_rng_ops[1:-2]) or - read.query_sequence[mut_pos:len(vcf_rec.alts[0])] != vcf_rec.alts[0]): # further checks needed, should it be query_alignment_sequence? - # breakpoint() - continue + read_flag = c.ValidatorFlags.CLEAR.value - # n.b. 
nothing done if complex read - - if ('S' in read.cigarstring and # type: ignore - mean(read.query_alignment_qualities) < clip_qual_cutoff): # type: ignore - # breakpoint() - continue - - mate = bams[mut_sample_name].mate(read) - - if any(x is None for x in [mate.reference_start, - mate.reference_end, - mate.cigartuples]): - # Peter what is the correct approach to this state - continue - - if read.flag & 0x40: # read first in pair - mut_reads[mut_sample_name].append(read) - else: # read second in pair - read_start = read.reference_start - read_end = read.reference_end - if read.flag & 0x10: - if mate.reference_end is None: - # breakpoint() - continue # ? - if read.reference_start <= mate.reference_end: - read_start = mate.reference_end + 1 + if not (read.flag & 0x2) or read.flag & 0xE00: + read_flag |= c.ValidatorFlags.FLAG.value + + if read.mapping_quality < min_mapqual: + read_flag |= c.ValidatorFlags.MAPQUAL.value + + try: + mate_cig = read.get_tag('MC') + except KeyError: + mate_cig = None + if any(x is None for x in [read.reference_start, + read.reference_end, + read.query_sequence, + read.query_qualities, + read.query_alignment_qualities, + read.cigarstring, + read.cigartuples, + mate_cig]): + read_flag |= c.ValidatorFlags.READ_FIELDS_MISSING.value + else: + if ('S' in read.cigarstring and # type: ignore + mean(read.query_alignment_qualities) < clip_qual_cutoff): # type: ignore + read_flag |= c.ValidatorFlags.CLIPQUAL.value + # First, check for sub + try: + mut_pos, mut_op = r2s.ref2querypos(read, vcf_record.start) # VCF 1-INDEXED, BAM 0-INDEXED - vcf_record.start = 0-indexed mutation position. 
testing with pos, 1-indexed, to see if match Peter + except IndexError: + read_flag |= c.ValidatorFlags.NOT_ALIGNED.value + else: + if vcf_record.rlen == len(alt) == 1: + if (mut_op not in [c.Ops.MATCH.value, c.Ops.DIFF.value]): + read_flag |= c.ValidatorFlags.BAD_OP.value + if read.query_sequence[mut_pos] != alt: # type: ignore + read_flag |= c.ValidatorFlags.NOT_ALT.value + if read.query_qualities[mut_pos] < min_basequal: # type: ignore + read_flag |= c.ValidatorFlags.BASEQUAL.value + # Second, check whether length of read can accommodate size of indel + elif (mut_pos + vcf_record.rlen > read.query_length or + mut_pos + len(alt) > read.query_length): + read_flag |= c.ValidatorFlags.SHORT.value + else: + if len(alt) == 1: # DEL + try: + mut_rng = list(map(lambda x: r2s.ref2querypos(read, x), range(vcf_record.start, vcf_record.stop))) + except IndexError: + read_flag |= c.ValidatorFlags.NOT_ALIGNED.value else: - if read.reference_end >= read.next_reference_start: - read_end = read.next_reference_start - 1 - if read_start <= vcf_rec.start <= read_end: - mut_reads[mut_sample_name].append(read) + if (mut_rng[0][1] != c.Ops.MATCH.value or + mut_rng[-1][1] != c.Ops.MATCH.value or + any(x[1] != c.Ops.DEL.value for x in mut_rng[1:-2])): + read_flag |= c.ValidatorFlags.BAD_OP.value + elif vcf_record.rlen == 1: # INS + try: + mut_rng = list(map(lambda x: r2s.ref2querypos(read, x), range(vcf_record.start, (vcf_record.start + len(alt))))) + except IndexError: + read_flag |= c.ValidatorFlags.NOT_ALIGNED.value else: - # breakpoint() - continue - # if we've got this far, then read has been added to mut_reads - # get pos wrt to aligned region - # Peter - # behaviour on cig = None? - # Also, Julia XAM gets cigar info differently, checking CG:B,I tag - # does this matter? 
- soft_start = (read.reference_start - read.cigartuples[0][1]) if read.cigartuples[0][0] == Ops.soft.value else read.reference_start - soft_end = (read.reference_end + read.cigartuples[-1][1]) if read.cigartuples[-1][0] == Ops.soft.value else read.reference_end - soft_mate_start = (mate.reference_start - mate.cigartuples[0][1]) if mate.cigartuples[0][0] == Ops.soft.value else mate.reference_start - soft_mate_end = (mate.reference_end + mate.cigartuples[-1][1]) if mate.cigartuples[-1][0] == Ops.soft.value else mate.reference_end - - sorted_ends.append(sorted([soft_start, soft_end, soft_mate_start, soft_mate_end])) - - # if we got nothing for that sample after cycling through all reads - if len(mut_reads[mut_sample_name]) == 0: - # Peter is this an error state? if the sample showed 0/1 would we expect viable reads supporting that mutation? - continue - - # return to per sample - sorted_ends: list[list[int]] = sorted(sorted_ends) # sort sublists on first element - # Peter what if sorted_ends contains only 1 item? - min_ends: list[list[int]] = [sorted_ends.pop(0)] - # Peter I dont' fully understand this segment but I think it recapitulates the Julia - i = 1 - drop_idx = [] - while len(sorted_ends) != 0: - loop_ends: list[int] = sorted_ends.pop(0) - max_spans = map(lambda sublist: max([abs(x - y) for x, y in zip(sublist, loop_ends)]), min_ends) + if (mut_rng[0][1] != c.Ops.MATCH.value or + mut_rng[-1][1] != c.Ops.MATCH.value or + any(x[1] != c.Ops.INS.value for x in mut_rng[1:-2])): + read_flag |= c.ValidatorFlags.BAD_OP.value + if read.query_sequence[mut_pos:len(alt)] != alt: # type: ignore + read_flag |= c.ValidatorFlags.NOT_ALT.value + # n.b. nothing done if complex read + if read_flag == c.ValidatorFlags.CLEAR.value: + # is it safe to assume this is always mate? 
+ mate_end = r2s.ref_end_via_cigar(mate_cig, read.next_reference_start) # THIS ONLY WORKS ASSUMING MATE IS NEXT READ + if not (read.flag & 0x40): + # this looks like it should be checked for indexing snags + pair_start = read.reference_start + pair_end = read.reference_end + if read.flag & 0x10: + if pair_start <= mate_end: + pair_start = mate_end + 1 + else: + if pair_end >= read.next_reference_start: + pair_end = read.next_reference_start - 1 + if not (pair_start <= vcf_record.start <= pair_end): + read_flag |= c.ValidatorFlags.OVERLAP.value + return read_flag + +def test_variant( + vcf_rec: pysam.VariantRecord, + mutant_bams: dict[str, pysam.AlignmentFile], + alt: str, + al_thresh: float, + max_span: int, + cent90_thresh: float, + read_validator: c.FlagReturn, +) -> c.Filters: + + hp_filt = c.HPFilter() + al_filt = c.ALFilter() + + mut_reads: dict[str, list[pysam.AlignedSegment]] = {key: [] for key in mutant_bams} + mut_reads_log: dict[str, list[tuple]] = {key: [] for key in mutant_bams} + mut_read_pos_f: list[int] = [] + mut_read_pos_r: list[int] = [] + mut_read_fracs_f: list[float] = [] + mut_read_fracs_r: list[float] = [] + aln_scores: list[float] = [] + + for mut_sample, bam in mutant_bams.items(): + read_iter, test_iter = tee(bam.fetch(vcf_rec.chrom, vcf_rec.start, (vcf_rec.start + 1))) + try: + next(test_iter) + except StopIteration: + continue + sample_readpair_ends = [] + read = None + for read in read_iter: # type: ignore + read_flag = c.ValidatorFlags.CLEAR.value + read_flag = read_validator(vcf_record=vcf_rec, read=read, alt=alt) + + if read_flag == c.ValidatorFlags.CLEAR.value: + mut_reads[mut_sample].append(read) + sample_readpair_ends.append([read.reference_start, read.reference_end, read.next_reference_start, r2s.ref_end_via_cigar(read.get_tag('MC'), read.next_reference_start)]) # type: ignore + mut_reads_log[mut_sample].append((read.query_name, read_flag)) + del(read) + if len(mut_reads[mut_sample]) > 1: + sample_readpair_ends_sorted: 
list[list[int]] = sorted(list(map(sorted, sample_readpair_ends))) + curr_ends = [sample_readpair_ends_sorted[0]] + drop_idx = [] + for i in range(1, len(sample_readpair_ends_sorted)): + max_spans = map(lambda sublist: max([abs(x - y) for x, y in zip(sublist, sample_readpair_ends_sorted[i])]), curr_ends) if all([x <= max_span for x in max_spans]): - min_ends.append(loop_ends) + curr_ends.append(sample_readpair_ends_sorted[i]) drop_idx.append(i) else: - min_ends = [loop_ends] - i += 1 - mut_reads[mut_sample_name] = [j for i, j in enumerate(mut_reads[mut_sample_name]) if i not in drop_idx] - del(i, drop_idx) - - if read: - del(read) - - # if we got nothing for all samples exhibiting 0/1 for this mutation - # skip this mutation - # Peter please confirm correct approach to this state - if all([len(x) == 0 for x in mut_reads.values()]): - continue - + curr_ends = [sample_readpair_ends_sorted[i]] + mut_reads[mut_sample] = [j for i, j in enumerate(mut_reads[mut_sample]) if i not in drop_idx] + if all([len(x) == 0 for x in mut_reads.values()]): + al_filt.code = c.FiltCodes.INSUFFICIENT_READS.value + hp_filt.code = c.FiltCodes.INSUFFICIENT_READS.value + else: for read_list in mut_reads.values(): for read in read_list: - mut_pos = ref2seq.ref2querypos(read, vcf_rec.start) - if mut_pos == -1: - # breakpoint() - continue # ? + mut_pos, _ = r2s.ref2querypos(read, vcf_rec.start) if read.flag & 0x10: - read_loc = read.reference_end - mut_pos + 1 - mut_read_fracs_r.append(read_loc / (read.reference_start - read.reference_end + 1)) + read_loc = read.query_alignment_end - mut_pos + 1 # Peter since we're getting query mut_pos wrt to the 0-indexed vcf pos, is + 1 correct? 
+ mut_read_fracs_r.append(read_loc / read.query_alignment_length) mut_read_pos_r.append(read_loc) else: - read_loc = (mut_pos - read.reference_start + 1) - mut_read_fracs_f.append(read_loc / (read.reference_end - read.reference_start + 1)) + read_loc = mut_pos - read.query_alignment_start + 1 + mut_read_fracs_f.append(read_loc / read.query_alignment_length) mut_read_pos_f.append(read_loc) - # breakpoint() + try: aln_scores.append(read.get_tag('AS') / read.query_length) except KeyError: - # Peter what is the correct approach to this state? - continue # ? - if len(aln_scores) == 0: - breakpoint() - al_filt = (avg_AS := median(aln_scores) <= al_thresh) - - fbool = len(mut_read_pos_f) > 1 - rbool = len(mut_read_pos_r) > 1 + pass + if len(aln_scores) != 0: + al_filt.avg_as = median(aln_scores) + al_filt.code = c.FiltCodes.ON_THRESHOLD.value + if al_filt.avg_as <= al_thresh: + al_filt.set() + else: + al_filt.code = c.FiltCodes.INSUFFICIENT_READS.value # hairpin conditions from Ellis et al. - if fbool and not rbool: + if len(mut_read_pos_f) > 1 and not len(mut_read_pos_r) > 1: mad_f = max(mut_read_pos_f) - min(mut_read_pos_f) sd_f = stdev(mut_read_pos_f) - if (((sum([x <= cent90_thresh for x in mut_read_pos_f]) / len(mut_read_pos_f)) < 0.9) and + if (((sum([x <= cent90_thresh for x in mut_read_fracs_f]) / len(mut_read_pos_f)) < 0.9) and mad_f > 0 and sd_f > 4): - log_func(msg='Mutation {}:{} --- passed HPF per Ellis et al. 60B(i)'.format(vcf_rec.chrom, vcf_rec.pos), decision_lvl=2) - hp_filt = False + hp_filt.code = c.FiltCodes.SIXTYAI.value # 60A(i) else: - log_func(msg='Mutation {}:{} --- failed HPF per Ellis et al. 
60B(i)'.format(vcf_rec.chrom, vcf_rec.pos), decision_lvl=1) - hp_filt = True - elif rbool and not fbool: + hp_filt.code = c.FiltCodes.SIXTYAI.value + hp_filt.set() + elif len(mut_read_pos_r) > 1 and not len(mut_read_pos_f) > 1: mad_r = max(mut_read_pos_r) - min(mut_read_pos_r) sd_r = stdev(mut_read_pos_r) - if (((sum([x <= cent90_thresh for x in mut_read_pos_r]) / len(mut_read_pos_r)) < 0.9) and + if (((sum([x <= cent90_thresh for x in mut_read_fracs_r]) / len(mut_read_pos_r)) < 0.9) and mad_r > 0 and sd_r > 4): - log_func(msg='Mutation {}:{} --- passed HPF per Ellis et al. 60A(i)'.format(vcf_rec.chrom, vcf_rec.pos), decision_lvl=2) - hp_filt = False + hp_filt.code = c.FiltCodes.SIXTYAI.value else: - log_func(msg='Mutation {}:{} --- failed HPF per Ellis et al. 60(i)'.format(vcf_rec.chrom, vcf_rec.pos), decision_lvl=1) - hp_filt = True - elif fbool and rbool: + hp_filt.code = c.FiltCodes.SIXTYAI.value + hp_filt.set() + elif len(mut_read_pos_f) > 1 and len(mut_read_pos_r) > 1: mad_f = max(mut_read_pos_f) - min(mut_read_pos_f) sd_f = stdev(mut_read_pos_f) mad_r = max(mut_read_pos_r) - min(mut_read_pos_r) @@ -293,29 +225,46 @@ def main( (mad_f > 2 and mad_r > 2 and sd_f > 2 and sd_r > 2) or (mad_f > 1 and sd_f > 10) or (mad_r > 1 and sd_r > 10)): - log_func(msg='Mutation {}:{} --- passed HPF per Ellis et al. 60A(i)'.format(vcf_rec.chrom, vcf_rec.pos), decision_lvl=2) - hp_filt = False + hp_filt.code = c.FiltCodes.SIXTYBI.value # 60B(i) else: - log_func(msg='Mutation {}:{} --- failed HPF per Ellis et al. 
60A(i)'.format(vcf_rec.chrom, vcf_rec.pos), decision_lvl=1) - hp_filt = True + hp_filt.code = c.FiltCodes.SIXTYBI.value + hp_filt.set() else: - log_func(msg='Mutation {}:{} --- passed HPF, insufficient reads to support HPF flag', decision_lvl=2) # Peter does this comment accurately assses the situation - hp_filt = False + hp_filt.code = c.FiltCodes.INSUFFICIENT_READS.value + return c.Filters(hp_filt, al_filt) - ### update vcf record - if al_filt: - log_func(msg='Mutation {}:{} --- failed ALF, median AS of {}'.format(vcf_rec.chrom, vcf_rec.pos, avg_AS), decision_lvl=1) - vcf_rec.filter.add("ALF") - if hp_filt: - vcf_rec.filter.add("HPF") - try: - vcf_out.write(vcf_rec) - except Exception as e: - logging.error('Failed to write VCF, reporting: {}'.format(e)) - return EXIT_FAILURE +def process_vcf_record( + bams: dict[str, pysam.AlignmentFile], + vcf_rec: pysam.VariantRecord, + variant_tester: c.FiltReturn, +) -> tuple[str, c.Filters]: - return EXIT_SUCCESS + if vcf_rec.alts is None: + raise ValueError('VCF record has no alts') + + # favour returning filter bools rather than updated record for testing/reusability + filt = c.Filters(c.HPFilter(), c.ALFilter()) + alt_log = '' + + samples_w_mutants = [name for name in vcf_rec.samples if vcf_rec.samples[name]["GT"] != (0, 0)] + if len(samples_w_mutants) == 0: + for _, state in filt: + state.code = c.FiltCodes.NO_MUTANTS.value + else: + bams_w_mutants = {k: v for k, v in bams.items() if k in samples_w_mutants} + alt_l = list(vcf_rec.alts) + # lock filters as true if flipped true + while len(alt_l) > 0 and (filt.HP.flag == False or filt.AL.flag == False): + alt = alt_l.pop() + filt_loop = variant_tester(vcf_rec, bams_w_mutants, alt) + for name, state in filt: + if not state.flag: + filt.fill_field(name, filt_loop.get_field(name)) + if any([f.flag for _, f in filt_loop]): + alt_log = alt if alt_log == '' else ':'.join([alt_log, alt]) + alt_log = '-' if alt_log == '' else alt_log + return alt_log, filt if __name__ == 
'__main__': @@ -330,30 +279,34 @@ def main( req.add_argument('-o', '--vcf-out', help="path to vcf out", required=True) req.add_argument('-b', '--bams', help="list of paths to bams for samples in input vcf, whitespace separated", nargs='+', required=True) opt = parser.add_argument_group('options') - opt.add_argument('-cq', '--clip-quality-cutoff', help='default: 35', type=int) - opt.add_argument('-mq', '--min-mapping-quality', help='default: 11', type=int) - opt.add_argument('-mb', '--min-base-quality', help='default: 25', type=int) - opt.add_argument('-ms', '--max-read-span', help='default: 6', type=int) - opt.add_argument('-al', '--AL-filter-threshold', help='default: 0.93', type=float) - opt.add_argument('-c9', '--cent90-threshold', help='default: 0.15', type=float) + opt.add_argument('-cq', '--clip-quality-cutoff', help='default: 35', type=int, default=35) + opt.add_argument('-mq', '--min-mapping-quality', help='default: 11', type=int, default=11) + opt.add_argument('-mb', '--min-base-quality', help='default: 25', type=int, default=25) + opt.add_argument('-ms', '--max-read-span', help='default: 6', type=int, default=6) + opt.add_argument('-al', '--al-filter-threshold', help='default: 0.93', type=float, default=0.93) + opt.add_argument('-c9', '--cent90-threshold', help='default: 0.15', type=float, default=0.15) log_sev_opt = opt.add_mutually_exclusive_group() - log_sev_opt.add_argument('-l', dest='log_path', help='log reason for failing records to file', action=utilities.SetLogAndSeverity) - log_sev_opt.add_argument('-ll', dest='log_path', help='as -l, and addtionally log reason for passing records') - log_sev_opt.add_argument('-lll', dest='log_path', help='as -ll, and additionaly log reason for discarding reads associated with a record', action=utilities.SetLogAndSeverity) + log_sev_opt.add_argument('-l', dest='log_path', help='log reason for failing records to file', nargs='?', const=None) args = parser.parse_args() - if not hasattr(args, 'severity'): - 
args.severity = 0 - + # needs fixing if any([x is None for _, x in vars(args).items()]): logging.info('option(s) not provided, using defaults') + al_round = len(str(args.al_filter_threshold).split('.')[1]) + + primed_validate_read = partial(validate_read, + min_mapqual=args.min_mapping_quality, + clip_qual_cutoff=args.clip_quality_cutoff, + min_basequal=args.min_base_quality) + + primed_variant_tester = partial(test_variant, al_thresh=args.al_filter_threshold, max_span=args.max_read_span, cent90_thresh=args.cent90_threshold, read_validator=primed_validate_read) + try: - log_file = open(args.log_path) if args.log_path else None + log_file = open(args.log_path, 'w') if args.log_path else sys.stderr except Exception as e: cleanup(1, 'failed to open log file, reporting: {}'.format(e)) - primed_log_func = partial(utilities.log_decision, log_lvl=args.severity, log_file=log_file) try: vcf_in_handle = pysam.VariantFile(args.vcf_in) @@ -362,7 +315,7 @@ def main( # init output out_head = vcf_in_handle.header - out_head.add_line("##FILTER=".format(args.AL_filter_threshold if args.AL_filter_threshold else None)) + out_head.add_line("##FILTER=".format(args.al_filter_threshold)) out_head.add_line("##FILTER=") try: @@ -370,10 +323,9 @@ def main( except Exception as e: cleanup(1, 'failed to open VCF output, reporting: {}'.format(e)) - sample_names: list[str] = list(vcf_out_handle.header.samples) + sample_names: list[str] = list(vcf_in_handle.header.samples) - # func for local scope - bam_reader_d: dict[str, pysam.AlignmentFile] = dict.fromkeys(sample_names) + bam_reader_d: dict[str, None | pysam.AlignmentFile] = dict.fromkeys(sample_names) for path in args.bams: try: bam = pysam.AlignmentFile(path, 'rb') @@ -389,18 +341,30 @@ def main( else: bam_reader_d[bam_sample] = bam + for record in vcf_in_handle.fetch(): + try: + trig_alts, filtering = process_vcf_record( + bams=bam_reader_d, # type: ignore + vcf_rec=record, + variant_tester=primed_variant_tester + ) + except ValueError: + 
logging.warning('{0: <7}:{1: >12} ¦ no alts for this record'.format(record.chrom, record.pos)) + else: + if any([f.code == c.FiltCodes.NO_MUTANTS.value for _, f in filtering]): + logging.warning('{0: <7}:{1: >12} ¦ no samples contain reads exhibiting record alts'.format(record.chrom, record.pos)) + + record_log: str = '{}\t{}\t{}'.format(record.chrom, record.pos, trig_alts) + for _, filter in filtering: + record_log = record_log + '\t' + ':'.join([str(round(f, 3) if type(f) == float else f) for f in filter]) + if filter.flag: + record.filter.add(filter.name) + + print(record_log, file = log_file, flush=True) + + try: + vcf_out_handle.write(record) + except Exception as e: + cleanup(1, 'failed to write to vcf, reporting: {}'.format(e)) - main_code = main( - bams=bam_reader_d, - vcf_in=vcf_in_handle, - vcf_out=vcf_out_handle, - clip_qual_cutoff=args.clip_quality_cutoff if args.clip_quality_cutoff else 35, - min_mapqual=args.min_mapping_quality if args.min_mapping_quality else 11, - min_basequal=args.min_base_quality if args.min_base_quality else 25, - max_span=args.max_read_span if args.max_read_span else 6, - al_thresh=args.AL_filter_threshold if args.AL_filter_threshold else 0.93, - cent90_thresh=args.cent90_threshold if args.cent90_threshold else 0.15, - log_func=primed_log_func - ) - - cleanup(main_code) + cleanup(0) diff --git a/pyproject.toml b/pyproject.toml index 1976f43..c39babe 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,3 +1,6 @@ +[tool.setuptools] +packages = ["hairpin"] + [build-system] requires = ["setuptools"] build-backend = "setuptools.build_meta" @@ -8,4 +11,4 @@ version = "0.0.1" requires-python = ">= 3.7" dependencies = [ 'pysam' -] \ No newline at end of file +] From 0e2fb5c370bb9dad288ceacc348fd7203e3deb26 Mon Sep 17 00:00:00 2001 From: ab63 Date: Tue, 9 Jul 2024 14:56:02 +0100 Subject: [PATCH 014/165] directory structure and install setup --- README.md | 100 +++++++------------------------------ hairpin/constants.py | 3 +- main.py => 
hairpin/main.py | 25 +++++----- pyproject.toml | 5 +- 4 files changed, 35 insertions(+), 98 deletions(-) rename main.py => hairpin/main.py (95%) diff --git a/README.md b/README.md index 129e5a0..860d5c9 100644 --- a/README.md +++ b/README.md @@ -1,93 +1,27 @@ # hairpin-core +Maintainable, transparent, implementation of the hairpin detection and flagging algorithm concieved by Mathijs' Sanders. Implemented here by Peter Campbell and Alex Byrne +### REQUIREMENTS -## Getting started +* Python3 - tested with 3.12 -To make it easy for you to get started with GitLab, here's a list of recommended next steps. +### INSTALLATION -Already a pro? Just edit this README.md and make it your own. Want to make it easy? [Use the template at the bottom](#editing-this-readme)! - -## Add your files - -- [ ] [Create](https://docs.gitlab.com/ee/user/project/repository/web_editor.html#create-a-file) or [upload](https://docs.gitlab.com/ee/user/project/repository/web_editor.html#upload-a-file) files -- [ ] [Add files using the command line](https://docs.gitlab.com/ee/gitlab-basics/add-file.html#add-a-file-using-the-command-line) or push an existing Git repository with the following command: +Within a virtual environment: ``` -cd existing_repo -git remote add origin https://gitlab.internal.sanger.ac.uk/casm/team78/hairpin-core.git -git branch -M main -git push -uf origin main +python -m venv .env +source .env/bin/activate +pip install . 
+hairpin -h ``` -## Integrate with your tools - -- [ ] [Set up project integrations](https://gitlab.internal.sanger.ac.uk/casm/team78/hairpin-core/-/settings/integrations) - -## Collaborate with your team - -- [ ] [Invite team members and collaborators](https://docs.gitlab.com/ee/user/project/members/) -- [ ] [Create a new merge request](https://docs.gitlab.com/ee/user/project/merge_requests/creating_merge_requests.html) -- [ ] [Automatically close issues from merge requests](https://docs.gitlab.com/ee/user/project/issues/managing_issues.html#closing-issues-automatically) -- [ ] [Enable merge request approvals](https://docs.gitlab.com/ee/user/project/merge_requests/approvals/) -- [ ] [Set auto-merge](https://docs.gitlab.com/ee/user/project/merge_requests/merge_when_pipeline_succeeds.html) - -## Test and Deploy - -Use the built-in continuous integration in GitLab. - -- [ ] [Get started with GitLab CI/CD](https://docs.gitlab.com/ee/ci/quick_start/index.html) -- [ ] [Analyze your code for known vulnerabilities with Static Application Security Testing (SAST)](https://docs.gitlab.com/ee/user/application_security/sast/) -- [ ] [Deploy to Kubernetes, Amazon EC2, or Amazon ECS using Auto Deploy](https://docs.gitlab.com/ee/topics/autodevops/requirements.html) -- [ ] [Use pull-based deployments for improved Kubernetes management](https://docs.gitlab.com/ee/user/clusters/agent/) -- [ ] [Set up protected environments](https://docs.gitlab.com/ee/ci/environments/protected_environments.html) - -*** - -# Editing this README - -When you're ready to make this README your own, just edit this file and use the handy template below (or feel free to structure it however you want - this is just a starting point!). Thanks to [makeareadme.com](https://www.makeareadme.com/) for this template. - -## Suggestions for a good README - -Every project is different, so consider which of these sections apply to yours. The sections used in the template are suggestions for most open source projects. 
Also keep in mind that while a README can be too long and detailed, too long is better than too short. If you think your README is too long, consider utilizing another form of documentation rather than cutting out information. - -## Name -Choose a self-explaining name for your project. - -## Description -Let people know what your project can do specifically. Provide context and add a link to any reference visitors might be unfamiliar with. A list of Features or a Background subsection can also be added here. If there are alternatives to your project, this is a good place to list differentiating factors. - -## Badges -On some READMEs, you may see small images that convey metadata, such as whether or not all the tests are passing for the project. You can use Shields to add some to your README. Many services also have instructions for adding a badge. - -## Visuals -Depending on what you are making, it can be a good idea to include screenshots or even a video (you'll frequently see GIFs rather than actual videos). Tools like ttygif can help, but check out Asciinema for a more sophisticated method. - -## Installation -Within a particular ecosystem, there may be a common way of installing things, such as using Yarn, NuGet, or Homebrew. However, consider the possibility that whoever is reading your README is a novice and would like more guidance. Listing specific steps helps remove ambiguity and gets people to using your project as quickly as possible. If it only runs in a specific context like a particular programming language version or operating system or has dependencies that have to be installed manually, also add a Requirements subsection. - -## Usage -Use examples liberally, and show the expected output if you can. It's helpful to have inline the smallest example of usage that you can demonstrate, while providing links to more sophisticated examples if they are too long to reasonably include in the README. - -## Support -Tell people where they can go to for help. 
It can be any combination of an issue tracker, a chat room, an email address, etc. - -## Roadmap -If you have ideas for releases in the future, it is a good idea to list them in the README. - -## Contributing -State if you are open to contributions and what your requirements are for accepting them. - -For people who want to make changes to your project, it's helpful to have some documentation on how to get started. Perhaps there is a script that they should run or some environment variables that they need to set. Make these steps explicit. These instructions could also be useful to your future self. - -You can also document commands to lint the code or run tests. These steps help to ensure high code quality and reduce the likelihood that the changes inadvertently break something. Having instructions for running tests is especially helpful if it requires external setup, such as starting a Selenium server for testing in a browser. - -## Authors and acknowledgment -Show your appreciation to those who have contributed to the project. - -## License -For open source projects, say how it is licensed. - -## Project status -If you have run out of energy or time for your project, put a note at the top of the README saying that development has slowed down or stopped completely. Someone may choose to fork your project or volunteer to step in as a maintainer or owner, allowing your project to keep going. You can also make an explicit request for maintainers. +For system-wide access: +``` +export INST_PATH=/path/to/install/location/ +mkdir -p $INST_PATH +pip install . 
--target $INST_PATH +export PATH=${PATH}:${INST_PATH}/bin +hairpin -h +``` diff --git a/hairpin/constants.py b/hairpin/constants.py index 8bc305e..b1b4cdc 100644 --- a/hairpin/constants.py +++ b/hairpin/constants.py @@ -2,6 +2,7 @@ from typing import Callable, Optional import dataclasses as d +VERSION = '0.0.1' EXIT_SUCCESS = 0 EXIT_FAILURE = 1 @@ -40,8 +41,8 @@ class ALFilter(FilterData): @d.dataclass class Filters: - HP: FilterData AL: ALFilter + HP: HPFilter def __iter__(self): return ((field.name, getattr(self, field.name)) for field in d.fields(self)) diff --git a/main.py b/hairpin/main.py similarity index 95% rename from main.py rename to hairpin/main.py index b9a667f..251e6cc 100644 --- a/main.py +++ b/hairpin/main.py @@ -9,7 +9,7 @@ from typing import Optional -def cleanup(code: int, msg: Optional[str] = None) -> None: +def cleanup(code: int = c.EXIT_FAILURE, msg: Optional[str] = None) -> None: if code != c.EXIT_SUCCESS and msg: logging.error(msg) for obj_name in ['vcf_in_handle', 'vcf_out_handle']: @@ -231,7 +231,7 @@ def test_variant( hp_filt.set() else: hp_filt.code = c.FiltCodes.INSUFFICIENT_READS.value - return c.Filters(hp_filt, al_filt) + return c.Filters(al_filt, hp_filt) def process_vcf_record( @@ -244,7 +244,7 @@ def process_vcf_record( raise ValueError('VCF record has no alts') # favour returning filter bools rather than updated record for testing/reusability - filt = c.Filters(c.HPFilter(), c.ALFilter()) + filt = c.Filters(c.ALFilter(), c.HPFilter()) alt_log = '' samples_w_mutants = [name for name in vcf_rec.samples if vcf_rec.samples[name]["GT"] != (0, 0)] @@ -267,13 +267,12 @@ def process_vcf_record( return alt_log, filt -if __name__ == '__main__': - +def main_cli() -> None: logging.basicConfig(level=logging.INFO, format='%(asctime)s ¦ %(levelname)-8s ¦ %(message)s', datefmt='%I:%M:%S') parser = argparse.ArgumentParser(prog="hairpin") parser._optionals.title = 'info' - parser.add_argument('-v', '--version', help='print version', 
action='version', version='hairpin 1.0.0') + parser.add_argument('-v', '--version', help='print version', action='version', version=c.VERSION) req = parser.add_argument_group('required') req.add_argument('-i', '--vcf-in', help="path to input vcf", required=True) req.add_argument('-o', '--vcf-out', help="path to vcf out", required=True) @@ -306,12 +305,12 @@ def process_vcf_record( try: log_file = open(args.log_path, 'w') if args.log_path else sys.stderr except Exception as e: - cleanup(1, 'failed to open log file, reporting: {}'.format(e)) + cleanup(msg='failed to open log file, reporting: {}'.format(e)) try: vcf_in_handle = pysam.VariantFile(args.vcf_in) except Exception as e: - cleanup(1, 'failed to open VCF input, reporting: {}'.format(e)) + cleanup(msg='failed to open VCF input, reporting: {}'.format(e)) # init output out_head = vcf_in_handle.header @@ -321,7 +320,7 @@ def process_vcf_record( try: vcf_out_handle = pysam.VariantFile(args.vcf_out, 'w', header=out_head) except Exception as e: - cleanup(1, 'failed to open VCF output, reporting: {}'.format(e)) + cleanup(msg='failed to open VCF output, reporting: {}'.format(e)) sample_names: list[str] = list(vcf_in_handle.header.samples) @@ -330,14 +329,14 @@ def process_vcf_record( try: bam = pysam.AlignmentFile(path, 'rb') except Exception as e: - cleanup(1, 'failed to read BAM at {}, reporting: {}'.format(path, e)) + cleanup(msg='failed to read BAM at {}, reporting: {}'.format(path, e)) # grab the sample name from first SM field # in header field RG # this may cause problems? 
# check with Peter bam_sample = bam.header.to_dict()['RG'][1]['SM'] if bam_sample not in sample_names: - cleanup(1, 'name in header ({}) of BAM at {} does not match any samples in VCF'.format(bam_sample, path)) + cleanup(msg='name in header ({}) of BAM at {} does not match any samples in VCF'.format(bam_sample, path)) else: bam_reader_d[bam_sample] = bam @@ -365,6 +364,6 @@ def process_vcf_record( try: vcf_out_handle.write(record) except Exception as e: - cleanup(1, 'failed to write to vcf, reporting: {}'.format(e)) + cleanup(msg='failed to write to vcf, reporting: {}'.format(e)) - cleanup(0) + cleanup(c.EXIT_SUCCESS) diff --git a/pyproject.toml b/pyproject.toml index c39babe..602ff6c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -10,5 +10,8 @@ name = "hairpin" version = "0.0.1" requires-python = ">= 3.7" dependencies = [ - 'pysam' + 'pysam == 0.22.1' ] + +[project.scripts] +hairpin = "hairpin.main:main_cli" From 7825dffd956182365ca5404d58a3edb5a9e68a32 Mon Sep 17 00:00:00 2001 From: ab63 Date: Tue, 9 Jul 2024 15:32:11 +0100 Subject: [PATCH 015/165] add parameter logging --- README.md | 2 +- hairpin/main.py | 16 ++++++++-------- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/README.md b/README.md index 860d5c9..de1decc 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ # hairpin-core -Maintainable, transparent, implementation of the hairpin detection and flagging algorithm concieved by Mathijs' Sanders. Implemented here by Peter Campbell and Alex Byrne +Maintainable, transparent, implementation of the hairpin detection and flagging algorithm concieved by Mathijs' Sanders. 
Implemented by Peter Campbell and Alex Byrne ### REQUIREMENTS diff --git a/hairpin/main.py b/hairpin/main.py index 251e6cc..ef49716 100644 --- a/hairpin/main.py +++ b/hairpin/main.py @@ -3,6 +3,7 @@ from statistics import mean, median, stdev import argparse import logging +import json from itertools import tee from functools import partial import sys @@ -284,17 +285,11 @@ def main_cli() -> None: opt.add_argument('-ms', '--max-read-span', help='default: 6', type=int, default=6) opt.add_argument('-al', '--al-filter-threshold', help='default: 0.93', type=float, default=0.93) opt.add_argument('-c9', '--cent90-threshold', help='default: 0.15', type=float, default=0.15) - log_sev_opt = opt.add_mutually_exclusive_group() - log_sev_opt.add_argument('-l', dest='log_path', help='log reason for failing records to file', nargs='?', const=None) + opt.add_argument('-j', '--json-path', help='log parameters for hairpin execution', nargs='?', type=str) + opt.add_argument('-l', '--log-path', help='log reason for failing records', nargs='?') args = parser.parse_args() - # needs fixing - if any([x is None for _, x in vars(args).items()]): - logging.info('option(s) not provided, using defaults') - - al_round = len(str(args.al_filter_threshold).split('.')[1]) - primed_validate_read = partial(validate_read, min_mapqual=args.min_mapping_quality, clip_qual_cutoff=args.clip_quality_cutoff, @@ -366,4 +361,9 @@ def main_cli() -> None: except Exception as e: cleanup(msg='failed to write to vcf, reporting: {}'.format(e)) + try: + with open(args.json_path, "w") as jo: + json.dump(vars(args), jo) + except Exception as e: + logging.warning('retaining output, but failed to write to parameters json, reporting {}'.format(e)) cleanup(c.EXIT_SUCCESS) From b1484e00b01132975c0ef9eb50c89ab8d892d4ec Mon Sep 17 00:00:00 2001 From: ab63 Date: Tue, 9 Jul 2024 17:15:58 +0100 Subject: [PATCH 016/165] fix calculation error --- hairpin/main.py | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 
deletions(-) diff --git a/hairpin/main.py b/hairpin/main.py index ef49716..76bf9cb 100644 --- a/hairpin/main.py +++ b/hairpin/main.py @@ -176,13 +176,13 @@ def test_variant( for read in read_list: mut_pos, _ = r2s.ref2querypos(read, vcf_rec.start) if read.flag & 0x10: - read_loc = read.query_alignment_end - mut_pos + 1 # Peter since we're getting query mut_pos wrt to the 0-indexed vcf pos, is + 1 correct? - mut_read_fracs_r.append(read_loc / read.query_alignment_length) - mut_read_pos_r.append(read_loc) + read_idx_wrt_aln = read.query_alignment_end - mut_pos # 1-based position where start, idx 1, is alignment end + mut_read_fracs_r.append(read_idx_wrt_aln / read.query_alignment_length) + mut_read_pos_r.append(read_idx_wrt_aln) else: - read_loc = mut_pos - read.query_alignment_start + 1 - mut_read_fracs_f.append(read_loc / read.query_alignment_length) - mut_read_pos_f.append(read_loc) + read_idx_wrt_aln = mut_pos - read.query_alignment_start + 1 + mut_read_fracs_f.append(read_idx_wrt_aln / read.query_alignment_length) + mut_read_pos_f.append(read_idx_wrt_aln) try: aln_scores.append(read.get_tag('AS') / read.query_length) @@ -360,10 +360,10 @@ def main_cli() -> None: vcf_out_handle.write(record) except Exception as e: cleanup(msg='failed to write to vcf, reporting: {}'.format(e)) - - try: - with open(args.json_path, "w") as jo: - json.dump(vars(args), jo) - except Exception as e: - logging.warning('retaining output, but failed to write to parameters json, reporting {}'.format(e)) + if args.json_path: + try: + with open(args.json_path, "w") as jo: + json.dump(vars(args), jo) + except Exception as e: + logging.warning('retaining output, but failed to write to parameters json, reporting {}'.format(e)) cleanup(c.EXIT_SUCCESS) From af4ab6c0975c724b52ee3f0881b5ab950000b774 Mon Sep 17 00:00:00 2001 From: ab63 Date: Wed, 10 Jul 2024 10:22:57 +0100 Subject: [PATCH 017/165] add behaviour for complex read, plus housekeeping --- README.md | 2 +- hairpin/constants.py | 10 
+++++----- hairpin/main.py | 20 ++++++++++++++++---- pyproject.toml | 2 +- 4 files changed, 23 insertions(+), 11 deletions(-) diff --git a/README.md b/README.md index de1decc..d3502bb 100644 --- a/README.md +++ b/README.md @@ -4,7 +4,7 @@ Maintainable, transparent, implementation of the hairpin detection and flagging ### REQUIREMENTS -* Python3 - tested with 3.12 +* Python >= 3.10 ### INSTALLATION diff --git a/hairpin/constants.py b/hairpin/constants.py index b1b4cdc..04cd479 100644 --- a/hairpin/constants.py +++ b/hairpin/constants.py @@ -1,5 +1,5 @@ from enum import Enum, IntEnum, Flag -from typing import Callable, Optional +from typing import Callable import dataclasses as d VERSION = '0.0.1' @@ -21,7 +21,7 @@ class FilterData: name: str flag: bool = False - code: Optional[int] = None + code: int | None = None def set(self): self.flag = True @@ -37,7 +37,7 @@ class HPFilter(FilterData): @d.dataclass class ALFilter(FilterData): name: str = d.field(default='ALF') - avg_as: Optional[float] = None + avg_as: float | None = None @d.dataclass class Filters: @@ -65,12 +65,12 @@ def get_field(self, field_name): def print_flag( - print_enum: Enum + print_enum: Flag ) -> None: print([':'.join([str(e), hex(e.value)]) for e in print_enum]) def print_enum( - print_enum: Enum + print_enum: IntEnum ) -> None: print([e for e in print_enum]) diff --git a/hairpin/main.py b/hairpin/main.py index 76bf9cb..c396fae 100644 --- a/hairpin/main.py +++ b/hairpin/main.py @@ -7,10 +7,9 @@ from itertools import tee from functools import partial import sys -from typing import Optional -def cleanup(code: int = c.EXIT_FAILURE, msg: Optional[str] = None) -> None: +def cleanup(code: int = c.EXIT_FAILURE, msg: None | str = None) -> None: if code != c.EXIT_SUCCESS and msg: logging.error(msg) for obj_name in ['vcf_in_handle', 'vcf_out_handle']: @@ -99,6 +98,19 @@ def validate_read( read_flag |= c.ValidatorFlags.BAD_OP.value if read.query_sequence[mut_pos:len(alt)] != alt: # type: ignore read_flag |= 
c.ValidatorFlags.NOT_ALT.value + else: # COMPLEX + max_rng = range(vcf_record.start, vcf_record.stop) if (vcf_record.start + vcf_record.rlen) > (vcf_record.start + len(alt)) else range(vcf_record.start, (vcf_record.start + len(alt))) + try: + mut_rng = list(map(lambda x: r2s.ref2querypos(read, x), max_rng)) + except IndexError: + read_flag |= c.ValidatorFlags.NOT_ALIGNED.value + else: + if (mut_rng[0][1] != c.Ops.MATCH.value or + mut_rng[-1][1] != c.Ops.MATCH.value): + read_flag |= c.ValidatorFlags.BAD_OP.value + if read.query_sequence[mut_pos:len(alt)] != alt: # type: ignore + read_flag |= c.ValidatorFlags.NOT_ALT.value + # n.b. nothing done if complex read if read_flag == c.ValidatorFlags.CLEAR.value: # is it safe to assume this is always mate? @@ -285,8 +297,8 @@ def main_cli() -> None: opt.add_argument('-ms', '--max-read-span', help='default: 6', type=int, default=6) opt.add_argument('-al', '--al-filter-threshold', help='default: 0.93', type=float, default=0.93) opt.add_argument('-c9', '--cent90-threshold', help='default: 0.15', type=float, default=0.15) - opt.add_argument('-j', '--json-path', help='log parameters for hairpin execution', nargs='?', type=str) - opt.add_argument('-l', '--log-path', help='log reason for failing records', nargs='?') + opt.add_argument('-j', '--arg-log', dest='json-path' help='log input parameters to JSON', nargs='?', type=str) + opt.add_argument('-l', '--record-log', dest='log-path' help='log basis for decisions on each record to TSV', nargs='?') args = parser.parse_args() diff --git a/pyproject.toml b/pyproject.toml index 602ff6c..e95eb20 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -8,7 +8,7 @@ build-backend = "setuptools.build_meta" [project] name = "hairpin" version = "0.0.1" -requires-python = ">= 3.7" +requires-python = ">= 3.10" dependencies = [ 'pysam == 0.22.1' ] From ce412b077655be534c761053b3b1e0f93ce28e75 Mon Sep 17 00:00:00 2001 From: ab63 Date: Wed, 10 Jul 2024 10:38:32 +0100 Subject: [PATCH 018/165] look 
before you leap --- hairpin/main.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/hairpin/main.py b/hairpin/main.py index c396fae..924cf89 100644 --- a/hairpin/main.py +++ b/hairpin/main.py @@ -297,8 +297,8 @@ def main_cli() -> None: opt.add_argument('-ms', '--max-read-span', help='default: 6', type=int, default=6) opt.add_argument('-al', '--al-filter-threshold', help='default: 0.93', type=float, default=0.93) opt.add_argument('-c9', '--cent90-threshold', help='default: 0.15', type=float, default=0.15) - opt.add_argument('-j', '--arg-log', dest='json-path' help='log input parameters to JSON', nargs='?', type=str) - opt.add_argument('-l', '--record-log', dest='log-path' help='log basis for decisions on each record to TSV', nargs='?') + opt.add_argument('-j', '--arg-log', dest='json_path', help='log input parameters to JSON', nargs='?', type=str) + opt.add_argument('-l', '--record-log', dest='log_path', help='log basis for decisions on each record to TSV', nargs='?') args = parser.parse_args() From 852944e9007ac127a9182e699a26e42a997640cf Mon Sep 17 00:00:00 2001 From: ab63 Date: Wed, 10 Jul 2024 11:12:16 +0000 Subject: [PATCH 019/165] containerise --- Singularity.def | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) create mode 100644 Singularity.def diff --git a/Singularity.def b/Singularity.def new file mode 100644 index 0000000..8c948d1 --- /dev/null +++ b/Singularity.def @@ -0,0 +1,19 @@ +Bootstrap: docker +From: python:3.12-slim + +%files +. 
hairpin/ + +%post +pip install hairpin/ + +%test + LOC=$(which hairpin) + if [ -z "$LOC"]; then + echo "hairpin install failed" + else + echo "hairpin install successful" + fi + +%runscript + exec hairpin "$@" From 74d96f098a06a68d3e9df19dfefb2eab07d859ab Mon Sep 17 00:00:00 2001 From: ab63 Date: Thu, 11 Jul 2024 13:59:12 +0100 Subject: [PATCH 020/165] add filter decision details to VCF log (self-logging) --- hairpin/constants.py | 11 +++++--- hairpin/main.py | 63 +++++++++++++++----------------------------- 2 files changed, 29 insertions(+), 45 deletions(-) diff --git a/hairpin/constants.py b/hairpin/constants.py index 04cd479..24e6a9b 100644 --- a/hairpin/constants.py +++ b/hairpin/constants.py @@ -1,4 +1,4 @@ -from enum import Enum, IntEnum, Flag +from enum import IntEnum, Flag from typing import Callable import dataclasses as d @@ -16,6 +16,12 @@ ['CLEAR', 'FLAG', 'MAPQUAL', 'READ_FIELDS_MISSING', 'NOT_ALIGNED', 'BAD_OP', 'NOT_ALT', 'BASEQUAL', 'SHORT', 'CLIPQUAL', 'MATE_MISSING_FIELDS', 'OVERLAP'], start=0) +class NoAlts(ValueError): + pass + +class NoMutants(ValueError): + pass + @d.dataclass class FilterData: @@ -45,7 +51,7 @@ class Filters: HP: HPFilter def __iter__(self): - return ((field.name, getattr(self, field.name)) for field in d.fields(self)) + return (getattr(self, field.name) for field in d.fields(self)) def fill_field(self, field_name, value): if hasattr(self, field_name): @@ -73,4 +79,3 @@ def print_enum( print_enum: IntEnum ) -> None: print([e for e in print_enum]) - diff --git a/hairpin/main.py b/hairpin/main.py index 924cf89..7d5a995 100644 --- a/hairpin/main.py +++ b/hairpin/main.py @@ -247,37 +247,23 @@ def test_variant( return c.Filters(al_filt, hp_filt) -def process_vcf_record( +def test_record_per_alt( bams: dict[str, pysam.AlignmentFile], vcf_rec: pysam.VariantRecord, variant_tester: c.FiltReturn, -) -> tuple[str, c.Filters]: +) -> dict[str, c.Filters]: if vcf_rec.alts is None: - raise ValueError('VCF record has no alts') - - # favour 
returning filter bools rather than updated record for testing/reusability - filt = c.Filters(c.ALFilter(), c.HPFilter()) - alt_log = '' - + raise c.NoAlts samples_w_mutants = [name for name in vcf_rec.samples if vcf_rec.samples[name]["GT"] != (0, 0)] if len(samples_w_mutants) == 0: - for _, state in filt: - state.code = c.FiltCodes.NO_MUTANTS.value - else: - bams_w_mutants = {k: v for k, v in bams.items() if k in samples_w_mutants} - alt_l = list(vcf_rec.alts) - # lock filters as true if flipped true - while len(alt_l) > 0 and (filt.HP.flag == False or filt.AL.flag == False): - alt = alt_l.pop() - filt_loop = variant_tester(vcf_rec, bams_w_mutants, alt) - for name, state in filt: - if not state.flag: - filt.fill_field(name, filt_loop.get_field(name)) - if any([f.flag for _, f in filt_loop]): - alt_log = alt if alt_log == '' else ':'.join([alt_log, alt]) - alt_log = '-' if alt_log == '' else alt_log - return alt_log, filt + raise c.NoMutants + + bams_w_mutants = {k: v for k, v in bams.items() if k in samples_w_mutants} + filt_d = {} + for alt in vcf_rec.alts: + filt_d[alt] = variant_tester(vcf_rec, bams_w_mutants, alt) + return filt_d def main_cli() -> None: @@ -298,7 +284,6 @@ def main_cli() -> None: opt.add_argument('-al', '--al-filter-threshold', help='default: 0.93', type=float, default=0.93) opt.add_argument('-c9', '--cent90-threshold', help='default: 0.15', type=float, default=0.15) opt.add_argument('-j', '--arg-log', dest='json_path', help='log input parameters to JSON', nargs='?', type=str) - opt.add_argument('-l', '--record-log', dest='log_path', help='log basis for decisions on each record to TSV', nargs='?') args = parser.parse_args() @@ -309,11 +294,6 @@ def main_cli() -> None: primed_variant_tester = partial(test_variant, al_thresh=args.al_filter_threshold, max_span=args.max_read_span, cent90_thresh=args.cent90_threshold, read_validator=primed_validate_read) - try: - log_file = open(args.log_path, 'w') if args.log_path else sys.stderr - except Exception 
as e: - cleanup(msg='failed to open log file, reporting: {}'.format(e)) - try: vcf_in_handle = pysam.VariantFile(args.vcf_in) except Exception as e: @@ -323,6 +303,8 @@ def main_cli() -> None: out_head = vcf_in_handle.header out_head.add_line("##FILTER=".format(args.al_filter_threshold)) out_head.add_line("##FILTER=") + out_head.add_line("##INFO=") + out_head.add_line("##INFO=") try: vcf_out_handle = pysam.VariantFile(args.vcf_out, 'w', header=out_head) @@ -349,24 +331,21 @@ def main_cli() -> None: for record in vcf_in_handle.fetch(): try: - trig_alts, filtering = process_vcf_record( + filter_d: dict[str, c.Filters] = test_record_per_alt( bams=bam_reader_d, # type: ignore vcf_rec=record, variant_tester=primed_variant_tester ) - except ValueError: + except c.NoAlts: logging.warning('{0: <7}:{1: >12} ¦ no alts for this record'.format(record.chrom, record.pos)) + except c.NoMutants: + logging.warning('{0: <7}:{1: >12} ¦ no samples contain reads exhibiting record alts'.format(record.chrom, record.pos)) else: - if any([f.code == c.FiltCodes.NO_MUTANTS.value for _, f in filtering]): - logging.warning('{0: <7}:{1: >12} ¦ no samples contain reads exhibiting record alts'.format(record.chrom, record.pos)) - - record_log: str = '{}\t{}\t{}'.format(record.chrom, record.pos, trig_alts) - for _, filter in filtering: - record_log = record_log + '\t' + ':'.join([str(round(f, 3) if type(f) == float else f) for f in filter]) - if filter.flag: - record.filter.add(filter.name) - - print(record_log, file = log_file, flush=True) + for alt, filter_bundle in filter_d.items(): + for filter in filter_bundle: + if filter.flag: + record.filter.add(filter.name) + record.info.update({filter.name: '|'.join([alt] + [str(f) if not type(f) == float else str(round(f, 3)) for f in filter][2:])}) try: vcf_out_handle.write(record) From 618983cf15c1b13e831f1401ff7f4dae4bba9b06 Mon Sep 17 00:00:00 2001 From: ab63 Date: Fri, 19 Jul 2024 13:52:48 +0100 Subject: [PATCH 021/165] test markdown for readme --- 
README.md | 42 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) diff --git a/README.md b/README.md index d3502bb..3b0afc9 100644 --- a/README.md +++ b/README.md @@ -25,3 +25,45 @@ pip install . --target $INST_PATH export PATH=${PATH}:${INST_PATH}/bin hairpin -h ``` + +### DETAILS + +``` +usage: hairpin [-h] [-v] -i VCF_IN -o VCF_OUT -b BAMS [BAMS ...] [-cq CLIP_QUALITY_CUTOFF] [-mq MIN_MAPPING_QUALITY] + [-mb MIN_BASE_QUALITY] [-ms MAX_READ_SPAN] [-al AL_FILTER_THRESHOLD] [-c9 CENT90_THRESHOLD] [-j [JSON_PATH]] + + info: + -h, --help show this help message and exit + -v, --version print version + + required: + -i VCF_IN, --vcf-in VCF_IN + path to input vcf + -o VCF_OUT, --vcf-out VCF_OUT + path to vcf out + -b BAMS [BAMS ...], --bams BAMS [BAMS ...] + list of paths to bams for samples in input vcf, whitespace separated + + options: + -cq CLIP_QUALITY_CUTOFF, --clip-quality-cutoff CLIP_QUALITY_CUTOFF + default: 35 + -mq MIN_MAPPING_QUALITY, --min-mapping-quality MIN_MAPPING_QUALITY + default: 11 + -mb MIN_BASE_QUALITY, --min-base-quality MIN_BASE_QUALITY + default: 25 + -ms MAX_READ_SPAN, --max-read-span MAX_READ_SPAN + default: 6 + -al AL_FILTER_THRESHOLD, --al-filter-threshold AL_FILTER_THRESHOLD + default: 0.93 + -c9 CENT90_THRESHOLD, --cent90-threshold CENT90_THRESHOLD + default: 0.15 + -j [JSON_PATH], --arg-log [JSON_PATH] + log input parameters to JSON +``` + +The basic procedure of this implementation is as follows: +> For each record in the VCF, test every alt for that record by: +> * retrieving reads from samples exhibiting the mutations, then +> * testing each read for validity for use in hairpin testing (i.e. 
base quality, do they express the correct alt, and so on), then +> * performing statistical analysis on aggregates of the position of the mutatation relative to the start and end of the aligned portion of the reads +> * on the results of the statistical analysis pass or fail the record for the filters ALF and HPF, and log a code and relevant info to the INFO field indicating the reason for the decision From 6dd71cbc49dee959e790e9411dba19ab3f23bc9c Mon Sep 17 00:00:00 2001 From: ab63 Date: Fri, 19 Jul 2024 14:01:43 +0100 Subject: [PATCH 022/165] basic doc, parameter clarity --- README.md | 8 ++++---- hairpin/main.py | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 3b0afc9..43bd1a1 100644 --- a/README.md +++ b/README.md @@ -57,13 +57,13 @@ usage: hairpin [-h] [-v] -i VCF_IN -o VCF_OUT -b BAMS [BAMS ...] [-cq CLIP_QUALI default: 0.93 -c9 CENT90_THRESHOLD, --cent90-threshold CENT90_THRESHOLD default: 0.15 - -j [JSON_PATH], --arg-log [JSON_PATH] - log input parameters to JSON + -j JSON_PATH, --json-log JSON_PATH + log input parameters/arguments to JSON ``` The basic procedure of this implementation is as follows: > For each record in the VCF, test every alt for that record by: > * retrieving reads from samples exhibiting the mutations, then > * testing each read for validity for use in hairpin testing (i.e. 
base quality, do they express the correct alt, and so on), then -> * performing statistical analysis on aggregates of the position of the mutatation relative to the start and end of the aligned portion of the reads -> * on the results of the statistical analysis pass or fail the record for the filters ALF and HPF, and log a code and relevant info to the INFO field indicating the reason for the decision +> * performing statistical analysis on aggregates of the position of the mutatation relative to the start and end of the aligned portion of the reads, then +> * on the results of the statistical analysis, pass or fail the record for the filters ALF and HPF, and log a code and relevant info to the INFO field indicating the reason for the decision diff --git a/hairpin/main.py b/hairpin/main.py index 7d5a995..19fe565 100644 --- a/hairpin/main.py +++ b/hairpin/main.py @@ -283,7 +283,7 @@ def main_cli() -> None: opt.add_argument('-ms', '--max-read-span', help='default: 6', type=int, default=6) opt.add_argument('-al', '--al-filter-threshold', help='default: 0.93', type=float, default=0.93) opt.add_argument('-c9', '--cent90-threshold', help='default: 0.15', type=float, default=0.15) - opt.add_argument('-j', '--arg-log', dest='json_path', help='log input parameters to JSON', nargs='?', type=str) + opt.add_argument('-j', '--json-log', dest='json_path', help='log input parameters/arguments to JSON', type=str) args = parser.parse_args() From 81aa7e33aa9d9fd2a48289ed5c7a39fd0f00d8f9 Mon Sep 17 00:00:00 2001 From: ab63 Date: Mon, 22 Jul 2024 15:43:14 +0100 Subject: [PATCH 023/165] rename, reorder --- README.md | 6 +++--- hairpin2.egg-info/PKG-INFO | 5 +++++ {hairpin => hairpin2}/__init__.py | 0 {hairpin => hairpin2}/constants.py | 2 +- {hairpin => hairpin2}/main.py | 4 ++-- {hairpin => hairpin2}/ref2seq.py | 2 +- pyproject.toml | 8 ++++---- 7 files changed, 16 insertions(+), 11 deletions(-) create mode 100644 hairpin2.egg-info/PKG-INFO rename {hairpin => hairpin2}/__init__.py 
(100%) rename {hairpin => hairpin2}/constants.py (99%) rename {hairpin => hairpin2}/main.py (98%) rename {hairpin => hairpin2}/ref2seq.py (97%) diff --git a/README.md b/README.md index 43bd1a1..aa9be98 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -# hairpin-core +# hairpin2 Maintainable, transparent, implementation of the hairpin detection and flagging algorithm concieved by Mathijs' Sanders. Implemented by Peter Campbell and Alex Byrne @@ -29,8 +29,8 @@ hairpin -h ### DETAILS ``` -usage: hairpin [-h] [-v] -i VCF_IN -o VCF_OUT -b BAMS [BAMS ...] [-cq CLIP_QUALITY_CUTOFF] [-mq MIN_MAPPING_QUALITY] - [-mb MIN_BASE_QUALITY] [-ms MAX_READ_SPAN] [-al AL_FILTER_THRESHOLD] [-c9 CENT90_THRESHOLD] [-j [JSON_PATH]] +usage: hairpin2 [-h] [-v] -i VCF_IN -o VCF_OUT -b BAMS [BAMS ...] [-cq CLIP_QUALITY_CUTOFF] [-mq MIN_MAPPING_QUALITY] + [-mb MIN_BASE_QUALITY] [-ms MAX_READ_SPAN] [-al AL_FILTER_THRESHOLD] [-c9 CENT90_THRESHOLD] [-j JSON_PATH] info: -h, --help show this help message and exit diff --git a/hairpin2.egg-info/PKG-INFO b/hairpin2.egg-info/PKG-INFO new file mode 100644 index 0000000..59629a2 --- /dev/null +++ b/hairpin2.egg-info/PKG-INFO @@ -0,0 +1,5 @@ +Metadata-Version: 2.1 +Name: hairpin2 +Version: 0.0.1a0 +Requires-Python: >=3.10 +Requires-Dist: pysam==0.22.1 diff --git a/hairpin/__init__.py b/hairpin2/__init__.py similarity index 100% rename from hairpin/__init__.py rename to hairpin2/__init__.py diff --git a/hairpin/constants.py b/hairpin2/constants.py similarity index 99% rename from hairpin/constants.py rename to hairpin2/constants.py index 24e6a9b..c9a494e 100644 --- a/hairpin/constants.py +++ b/hairpin2/constants.py @@ -2,7 +2,7 @@ from typing import Callable import dataclasses as d -VERSION = '0.0.1' +VERSION = '0.0.1a' EXIT_SUCCESS = 0 EXIT_FAILURE = 1 diff --git a/hairpin/main.py b/hairpin2/main.py similarity index 98% rename from hairpin/main.py rename to hairpin2/main.py index 19fe565..fc4e2bd 100644 --- a/hairpin/main.py +++ b/hairpin2/main.py 
@@ -1,5 +1,5 @@ import pysam -from hairpin import ref2seq as r2s, constants as c +from hairpin2 import ref2seq as r2s, constants as c from statistics import mean, median, stdev import argparse import logging @@ -339,7 +339,7 @@ def main_cli() -> None: except c.NoAlts: logging.warning('{0: <7}:{1: >12} ¦ no alts for this record'.format(record.chrom, record.pos)) except c.NoMutants: - logging.warning('{0: <7}:{1: >12} ¦ no samples contain reads exhibiting record alts'.format(record.chrom, record.pos)) + logging.warning('{0: <7}:{1: >12} ¦ no samples exhibit record alts'.format(record.chrom, record.pos)) else: for alt, filter_bundle in filter_d.items(): for filter in filter_bundle: diff --git a/hairpin/ref2seq.py b/hairpin2/ref2seq.py similarity index 97% rename from hairpin/ref2seq.py rename to hairpin2/ref2seq.py index 00411a9..436fd3a 100644 --- a/hairpin/ref2seq.py +++ b/hairpin2/ref2seq.py @@ -1,5 +1,5 @@ import pysam -from hairpin import constants as c +from hairpin2 import constants as c def ref2querypos( bam_record: pysam.AlignedSegment, diff --git a/pyproject.toml b/pyproject.toml index e95eb20..236db44 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,17 +1,17 @@ [tool.setuptools] -packages = ["hairpin"] +packages = ["hairpin2"] [build-system] requires = ["setuptools"] build-backend = "setuptools.build_meta" [project] -name = "hairpin" -version = "0.0.1" +name = "hairpin2" +version = "0.0.1a" requires-python = ">= 3.10" dependencies = [ 'pysam == 0.22.1' ] [project.scripts] -hairpin = "hairpin.main:main_cli" +hairpin2 = "hairpin2.main:main_cli" From abf9b28dd1f2e9135ac768bf4345eabddc89af65 Mon Sep 17 00:00:00 2001 From: Alex Byrne Date: Mon, 22 Jul 2024 15:44:25 +0100 Subject: [PATCH 024/165] Update README.md --- README.md | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index aa9be98..d187ad7 100644 --- a/README.md +++ b/README.md @@ -14,7 +14,7 @@ Within a virtual environment: python -m venv 
.env source .env/bin/activate pip install . -hairpin -h +hairpin2 -h ``` For system-wide access: @@ -23,7 +23,7 @@ export INST_PATH=/path/to/install/location/ mkdir -p $INST_PATH pip install . --target $INST_PATH export PATH=${PATH}:${INST_PATH}/bin -hairpin -h +hairpin2 -h ``` ### DETAILS @@ -61,6 +61,20 @@ usage: hairpin2 [-h] [-v] -i VCF_IN -o VCF_OUT -b BAMS [BAMS ...] [-cq CLIP_QUAL log input parameters/arguments to JSON ``` +The tool tests records in a VCF file and applies the HPF, indicating a hairpin, and ALF, flags as appropriate. It records reasoning for its decisions in the INFO field of the VCF records, in the form HPF=| and ALF=|| + +The codes are as follows + +0 - passed/failed on condition 60A(i) of Ellis et al. (HPF only) + +1 - passed/failed on condition 60B(i) of Ellis et al. (HPF only) + +2 - passed/failed on filter threshold (ALF only) + +3 - insufficient appropriate reads to support calling flag (This covers a lot of possiblities, if more granularity is desired, please request it) + +4 - no samples have non 0,0 genotype for the record + The basic procedure of this implementation is as follows: > For each record in the VCF, test every alt for that record by: > * retrieving reads from samples exhibiting the mutations, then From ae611df3a73924134806586118750c5ffaa9b921 Mon Sep 17 00:00:00 2001 From: Alex Byrne Date: Mon, 22 Jul 2024 15:45:20 +0100 Subject: [PATCH 025/165] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index d187ad7..2dae4a3 100644 --- a/README.md +++ b/README.md @@ -61,7 +61,7 @@ usage: hairpin2 [-h] [-v] -i VCF_IN -o VCF_OUT -b BAMS [BAMS ...] [-cq CLIP_QUAL log input parameters/arguments to JSON ``` -The tool tests records in a VCF file and applies the HPF, indicating a hairpin, and ALF, flags as appropriate. 
It records reasoning for its decisions in the INFO field of the VCF records, in the form HPF=| and ALF=|| +The tool tests records in a VCF file and applies the `HPF`, indicating a hairpin, and `ALF`, flags as appropriate. It records reasoning for its decisions in the `INFO` field of the VCF records, in the form `HPF=|` and `ALF=||` The codes are as follows From 300e8b06d3f9f05bbd7bd6c86ec8e40ab7983d78 Mon Sep 17 00:00:00 2001 From: ab63 Date: Mon, 22 Jul 2024 15:47:25 +0100 Subject: [PATCH 026/165] remove unwanted file --- hairpin2.egg-info/PKG-INFO | 5 ----- 1 file changed, 5 deletions(-) delete mode 100644 hairpin2.egg-info/PKG-INFO diff --git a/hairpin2.egg-info/PKG-INFO b/hairpin2.egg-info/PKG-INFO deleted file mode 100644 index 59629a2..0000000 --- a/hairpin2.egg-info/PKG-INFO +++ /dev/null @@ -1,5 +0,0 @@ -Metadata-Version: 2.1 -Name: hairpin2 -Version: 0.0.1a0 -Requires-Python: >=3.10 -Requires-Dist: pysam==0.22.1 From e4c4d8b474956e638f0f42338498e3b7aaf6f7c5 Mon Sep 17 00:00:00 2001 From: ab63 Date: Mon, 22 Jul 2024 15:48:26 +0100 Subject: [PATCH 027/165] update gitignore --- .gitignore | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index f287d02..a1e5cef 100644 --- a/.gitignore +++ b/.gitignore @@ -1,7 +1,7 @@ data/ .env/ dist/ -hairpin.egg-info/ +*.egg-info/ data/ data From af8900b32b3adfe1bf0be7a2228a877636243838 Mon Sep 17 00:00:00 2001 From: Alex Byrne Date: Mon, 22 Jul 2024 15:50:16 +0100 Subject: [PATCH 028/165] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 2dae4a3..d419810 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -# hairpin2 +# hairpin2 - 0.0.1a Maintainable, transparent, implementation of the hairpin detection and flagging algorithm concieved by Mathijs' Sanders. 
Implemented by Peter Campbell and Alex Byrne From c7df2a40d58a239de158a68c87f0b16ff8e746df Mon Sep 17 00:00:00 2001 From: Alex Byrne Date: Mon, 22 Jul 2024 15:50:48 +0100 Subject: [PATCH 029/165] Update README.md --- README.md | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index d419810..b1b2e06 100644 --- a/README.md +++ b/README.md @@ -65,15 +65,15 @@ The tool tests records in a VCF file and applies the `HPF`, indicating a hairpin The codes are as follows -0 - passed/failed on condition 60A(i) of Ellis et al. (HPF only) +**0** - passed/failed on condition 60A(i) of Ellis et al. (HPF only) -1 - passed/failed on condition 60B(i) of Ellis et al. (HPF only) +**1** - passed/failed on condition 60B(i) of Ellis et al. (HPF only) -2 - passed/failed on filter threshold (ALF only) +**2** - passed/failed on filter threshold (ALF only) -3 - insufficient appropriate reads to support calling flag (This covers a lot of possiblities, if more granularity is desired, please request it) +**3** - insufficient appropriate reads to support calling flag (This covers a lot of possiblities, if more granularity is desired, please request it) -4 - no samples have non 0,0 genotype for the record +**4** - no samples have non 0,0 genotype for the record The basic procedure of this implementation is as follows: > For each record in the VCF, test every alt for that record by: From 038182ff4c7290ccdc8fc2005cc72af0ef328640 Mon Sep 17 00:00:00 2001 From: Alex Byrne Date: Mon, 22 Jul 2024 15:51:32 +0100 Subject: [PATCH 030/165] Update README.md --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index b1b2e06..fa7fd65 100644 --- a/README.md +++ b/README.md @@ -65,9 +65,9 @@ The tool tests records in a VCF file and applies the `HPF`, indicating a hairpin The codes are as follows -**0** - passed/failed on condition 60A(i) of Ellis et al. 
(HPF only) +**0** - passed/failed on condition 60A(i) of Ellis _et al._ (HPF only) -**1** - passed/failed on condition 60B(i) of Ellis et al. (HPF only) +**1** - passed/failed on condition 60B(i) of Ellis _et al._ (HPF only) **2** - passed/failed on filter threshold (ALF only) From 1878647a16ce2fe85cff8d62201f8c0133428d8c Mon Sep 17 00:00:00 2001 From: Alex Byrne Date: Mon, 22 Jul 2024 16:13:20 +0100 Subject: [PATCH 031/165] Update README.md --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index fa7fd65..959ec8b 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ # hairpin2 - 0.0.1a -Maintainable, transparent, implementation of the hairpin detection and flagging algorithm concieved by Mathijs' Sanders. Implemented by Peter Campbell and Alex Byrne +Maintainable, transparent, and fast, implementation of the hairpin detection and flagging algorithm concieved by Mathijs' Sanders. Implemented by Peter Campbell and Alex Byrne ### REQUIREMENTS @@ -61,7 +61,7 @@ usage: hairpin2 [-h] [-v] -i VCF_IN -o VCF_OUT -b BAMS [BAMS ...] [-cq CLIP_QUAL log input parameters/arguments to JSON ``` -The tool tests records in a VCF file and applies the `HPF`, indicating a hairpin, and `ALF`, flags as appropriate. It records reasoning for its decisions in the `INFO` field of the VCF records, in the form `HPF=|` and `ALF=||` +The tool tests records in a VCF file and applies the filter flags `HPF`, indicating a hairpin, and `ALF`, as appropriate. 
It records reasoning for its decisions in the `INFO` field of the VCF records, in the form `HPF=|` and `ALF=||` The codes are as follows From 47338574c32c510b62b090862ef4d5552bf3458b Mon Sep 17 00:00:00 2001 From: Alex Byrne Date: Mon, 22 Jul 2024 16:47:06 +0100 Subject: [PATCH 032/165] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 959ec8b..c593ad8 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ # hairpin2 - 0.0.1a -Maintainable, transparent, and fast, implementation of the hairpin detection and flagging algorithm concieved by Mathijs' Sanders. Implemented by Peter Campbell and Alex Byrne +**Maintainable, transparent, fast, and error-free** implementation of the hairpin detection and flagging algorithm concieved by Mathijs' Sanders. Implemented by Peter Campbell and Alex Byrne ### REQUIREMENTS From fb8a900665102a7732a917d20be94b1aca325443 Mon Sep 17 00:00:00 2001 From: ab63 Date: Tue, 23 Jul 2024 16:06:42 +0100 Subject: [PATCH 033/165] update readme and param descs --- README.md | 24 +++++------------------- hairpin2/main.py | 2 +- 2 files changed, 6 insertions(+), 20 deletions(-) diff --git a/README.md b/README.md index c593ad8..1829d9a 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ -# hairpin2 - 0.0.1a +# hairpin2 -**Maintainable, transparent, fast, and error-free** implementation of the hairpin detection and flagging algorithm concieved by Mathijs' Sanders. Implemented by Peter Campbell and Alex Byrne +Maintainable, transparent, implementation of the hairpin detection and flagging algorithm concieved by Mathijs' Sanders. Implemented by Peter Campbell and Alex Byrne ### REQUIREMENTS @@ -14,7 +14,7 @@ Within a virtual environment: python -m venv .env source .env/bin/activate pip install . -hairpin2 -h +hairpin -h ``` For system-wide access: @@ -23,7 +23,7 @@ export INST_PATH=/path/to/install/location/ mkdir -p $INST_PATH pip install . 
--target $INST_PATH export PATH=${PATH}:${INST_PATH}/bin -hairpin2 -h +hairpin -h ``` ### DETAILS @@ -42,7 +42,7 @@ usage: hairpin2 [-h] [-v] -i VCF_IN -o VCF_OUT -b BAMS [BAMS ...] [-cq CLIP_QUAL -o VCF_OUT, --vcf-out VCF_OUT path to vcf out -b BAMS [BAMS ...], --bams BAMS [BAMS ...] - list of paths to bams for samples in input vcf, whitespace separated + list of paths to name-sorted bams for samples in input vcf, whitespace separated options: -cq CLIP_QUALITY_CUTOFF, --clip-quality-cutoff CLIP_QUALITY_CUTOFF @@ -61,20 +61,6 @@ usage: hairpin2 [-h] [-v] -i VCF_IN -o VCF_OUT -b BAMS [BAMS ...] [-cq CLIP_QUAL log input parameters/arguments to JSON ``` -The tool tests records in a VCF file and applies the filter flags `HPF`, indicating a hairpin, and `ALF`, as appropriate. It records reasoning for its decisions in the `INFO` field of the VCF records, in the form `HPF=|` and `ALF=||` - -The codes are as follows - -**0** - passed/failed on condition 60A(i) of Ellis _et al._ (HPF only) - -**1** - passed/failed on condition 60B(i) of Ellis _et al._ (HPF only) - -**2** - passed/failed on filter threshold (ALF only) - -**3** - insufficient appropriate reads to support calling flag (This covers a lot of possiblities, if more granularity is desired, please request it) - -**4** - no samples have non 0,0 genotype for the record - The basic procedure of this implementation is as follows: > For each record in the VCF, test every alt for that record by: > * retrieving reads from samples exhibiting the mutations, then diff --git a/hairpin2/main.py b/hairpin2/main.py index fc4e2bd..a41fcbc 100644 --- a/hairpin2/main.py +++ b/hairpin2/main.py @@ -275,7 +275,7 @@ def main_cli() -> None: req = parser.add_argument_group('required') req.add_argument('-i', '--vcf-in', help="path to input vcf", required=True) req.add_argument('-o', '--vcf-out', help="path to vcf out", required=True) - req.add_argument('-b', '--bams', help="list of paths to bams for samples in input vcf, whitespace 
separated", nargs='+', required=True) + req.add_argument('-b', '--bams', help="list of paths to name-sorted bams for samples in input vcf, whitespace separated", nargs='+', required=True) opt = parser.add_argument_group('options') opt.add_argument('-cq', '--clip-quality-cutoff', help='default: 35', type=int, default=35) opt.add_argument('-mq', '--min-mapping-quality', help='default: 11', type=int, default=11) From 8d12ace7616372931136e4393a9d2a272b1974a3 Mon Sep 17 00:00:00 2001 From: ab63 Date: Wed, 31 Jul 2024 13:17:46 +0100 Subject: [PATCH 034/165] untested pass at name mapping implementation --- hairpin2/main.py | 37 ++++++++++++++++++++++++++++--------- 1 file changed, 28 insertions(+), 9 deletions(-) diff --git a/hairpin2/main.py b/hairpin2/main.py index a41fcbc..c6a7566 100644 --- a/hairpin2/main.py +++ b/hairpin2/main.py @@ -1,3 +1,4 @@ +from enum import unique import pysam from hairpin2 import ref2seq as r2s, constants as c from statistics import mean, median, stdev @@ -60,7 +61,7 @@ def validate_read( read_flag |= c.ValidatorFlags.CLIPQUAL.value # First, check for sub try: - mut_pos, mut_op = r2s.ref2querypos(read, vcf_record.start) # VCF 1-INDEXED, BAM 0-INDEXED - vcf_record.start = 0-indexed mutation position. 
testing with pos, 1-indexed, to see if match Peter + mut_pos, mut_op = r2s.ref2querypos(read, vcf_record.start) # VCF 1-INDEXED, BAM 0-INDEXED (vcf_record.start = 0-indexed mutation position) except IndexError: read_flag |= c.ValidatorFlags.NOT_ALIGNED.value else: @@ -195,7 +196,6 @@ def test_variant( read_idx_wrt_aln = mut_pos - read.query_alignment_start + 1 mut_read_fracs_f.append(read_idx_wrt_aln / read.query_alignment_length) mut_read_pos_f.append(read_idx_wrt_aln) - try: aln_scores.append(read.get_tag('AS') / read.query_length) except KeyError: @@ -277,6 +277,7 @@ def main_cli() -> None: req.add_argument('-o', '--vcf-out', help="path to vcf out", required=True) req.add_argument('-b', '--bams', help="list of paths to name-sorted bams for samples in input vcf, whitespace separated", nargs='+', required=True) opt = parser.add_argument_group('options') + opt.add_argument('-m', '--name-mapping', help='map VCF sample names to BAM sample names', metavar='VCF:BAM', nargs='+') opt.add_argument('-cq', '--clip-quality-cutoff', help='default: 35', type=int, default=35) opt.add_argument('-mq', '--min-mapping-quality', help='default: 11', type=int, default=11) opt.add_argument('-mb', '--min-base-quality', help='default: 25', type=int, default=25) @@ -312,8 +313,7 @@ def main_cli() -> None: cleanup(msg='failed to open VCF output, reporting: {}'.format(e)) sample_names: list[str] = list(vcf_in_handle.header.samples) - - bam_reader_d: dict[str, None | pysam.AlignmentFile] = dict.fromkeys(sample_names) + bam_reader_d: dict[str, pysam.AlignmentFile] = dict.fromkeys(sample_names) # type: ignore for path in args.bams: try: bam = pysam.AlignmentFile(path, 'rb') @@ -324,15 +324,34 @@ def main_cli() -> None: # this may cause problems? 
# check with Peter bam_sample = bam.header.to_dict()['RG'][1]['SM'] - if bam_sample not in sample_names: - cleanup(msg='name in header ({}) of BAM at {} does not match any samples in VCF'.format(bam_sample, path)) - else: - bam_reader_d[bam_sample] = bam + bam_reader_d[bam_sample] = bam + if args.name_mapping: + vcf_map_names = [] + bam_map_names = [] + for pair in args.name_mapping: + kv_split = pair.split(':') # VCF:BAM + if len(kv_split) != 2: + cleanup(1, 'name mapping misformatted, more than two elements in map string {}'.format(pair)) + vcf_map_names.append(kv_split[0]) + bam_map_names.append(kv_split[1]) + if len(set(vcf_map_names)) != len(vcf_map_names): + cleanup(1, 'duplicate VCF sample names in name mapping') + if not sorted(vcf_map_names) == sorted(sample_names): + cleanup(1, 'VCF sample names in name mapping do not match VCF sample names as retrieved from VCF') + if len(set(bam_map_names)) != len(bam_map_names): + cleanup(1, 'duplicate BAM sample names in name mapping') + if not sorted(bam_map_names) == sorted(bam_reader_d.keys()): + cleanup(1, 'BAM sample names in name mapping do not match BAM sample names as retreived from BAMs') + mapped_bam_reader_d = {vcf_map_names[bam_map_names.index(k)]: v for k, v in bam_reader_d.items()} + else: + for bam_sample in bam_reader_d.keys(): + if bam_sample not in sample_names: + cleanup(msg='name in header ({}) of BAM does not match any samples in VCF'.format(bam_sample)) for record in vcf_in_handle.fetch(): try: filter_d: dict[str, c.Filters] = test_record_per_alt( - bams=bam_reader_d, # type: ignore + bams=mapped_bam_reader_d if args.name_mapping else bam_reader_d, vcf_rec=record, variant_tester=primed_variant_tester ) From 7fb8bc805346d02189b00cbd60ea25c0a5a07831 Mon Sep 17 00:00:00 2001 From: ab63 Date: Wed, 31 Jul 2024 13:19:10 +0100 Subject: [PATCH 035/165] Revert "untested pass at name mapping implementation" whoops wrong branch This reverts commit 8d12ace7616372931136e4393a9d2a272b1974a3. 
--- hairpin2/main.py | 37 +++++++++---------------------------- 1 file changed, 9 insertions(+), 28 deletions(-) diff --git a/hairpin2/main.py b/hairpin2/main.py index c6a7566..a41fcbc 100644 --- a/hairpin2/main.py +++ b/hairpin2/main.py @@ -1,4 +1,3 @@ -from enum import unique import pysam from hairpin2 import ref2seq as r2s, constants as c from statistics import mean, median, stdev @@ -61,7 +60,7 @@ def validate_read( read_flag |= c.ValidatorFlags.CLIPQUAL.value # First, check for sub try: - mut_pos, mut_op = r2s.ref2querypos(read, vcf_record.start) # VCF 1-INDEXED, BAM 0-INDEXED (vcf_record.start = 0-indexed mutation position) + mut_pos, mut_op = r2s.ref2querypos(read, vcf_record.start) # VCF 1-INDEXED, BAM 0-INDEXED - vcf_record.start = 0-indexed mutation position. testing with pos, 1-indexed, to see if match Peter except IndexError: read_flag |= c.ValidatorFlags.NOT_ALIGNED.value else: @@ -196,6 +195,7 @@ def test_variant( read_idx_wrt_aln = mut_pos - read.query_alignment_start + 1 mut_read_fracs_f.append(read_idx_wrt_aln / read.query_alignment_length) mut_read_pos_f.append(read_idx_wrt_aln) + try: aln_scores.append(read.get_tag('AS') / read.query_length) except KeyError: @@ -277,7 +277,6 @@ def main_cli() -> None: req.add_argument('-o', '--vcf-out', help="path to vcf out", required=True) req.add_argument('-b', '--bams', help="list of paths to name-sorted bams for samples in input vcf, whitespace separated", nargs='+', required=True) opt = parser.add_argument_group('options') - opt.add_argument('-m', '--name-mapping', help='map VCF sample names to BAM sample names', metavar='VCF:BAM', nargs='+') opt.add_argument('-cq', '--clip-quality-cutoff', help='default: 35', type=int, default=35) opt.add_argument('-mq', '--min-mapping-quality', help='default: 11', type=int, default=11) opt.add_argument('-mb', '--min-base-quality', help='default: 25', type=int, default=25) @@ -313,7 +312,8 @@ def main_cli() -> None: cleanup(msg='failed to open VCF output, reporting: 
{}'.format(e)) sample_names: list[str] = list(vcf_in_handle.header.samples) - bam_reader_d: dict[str, pysam.AlignmentFile] = dict.fromkeys(sample_names) # type: ignore + + bam_reader_d: dict[str, None | pysam.AlignmentFile] = dict.fromkeys(sample_names) for path in args.bams: try: bam = pysam.AlignmentFile(path, 'rb') @@ -324,34 +324,15 @@ def main_cli() -> None: # this may cause problems? # check with Peter bam_sample = bam.header.to_dict()['RG'][1]['SM'] - bam_reader_d[bam_sample] = bam - if args.name_mapping: - vcf_map_names = [] - bam_map_names = [] - for pair in args.name_mapping: - kv_split = pair.split(':') # VCF:BAM - if len(kv_split) != 2: - cleanup(1, 'name mapping misformatted, more than two elements in map string {}'.format(pair)) - vcf_map_names.append(kv_split[0]) - bam_map_names.append(kv_split[1]) - if len(set(vcf_map_names)) != len(vcf_map_names): - cleanup(1, 'duplicate VCF sample names in name mapping') - if not sorted(vcf_map_names) == sorted(sample_names): - cleanup(1, 'VCF sample names in name mapping do not match VCF sample names as retrieved from VCF') - if len(set(bam_map_names)) != len(bam_map_names): - cleanup(1, 'duplicate BAM sample names in name mapping') - if not sorted(bam_map_names) == sorted(bam_reader_d.keys()): - cleanup(1, 'BAM sample names in name mapping do not match BAM sample names as retreived from BAMs') - mapped_bam_reader_d = {vcf_map_names[bam_map_names.index(k)]: v for k, v in bam_reader_d.items()} - else: - for bam_sample in bam_reader_d.keys(): - if bam_sample not in sample_names: - cleanup(msg='name in header ({}) of BAM does not match any samples in VCF'.format(bam_sample)) + if bam_sample not in sample_names: + cleanup(msg='name in header ({}) of BAM at {} does not match any samples in VCF'.format(bam_sample, path)) + else: + bam_reader_d[bam_sample] = bam for record in vcf_in_handle.fetch(): try: filter_d: dict[str, c.Filters] = test_record_per_alt( - bams=mapped_bam_reader_d if args.name_mapping else 
bam_reader_d, + bams=bam_reader_d, # type: ignore vcf_rec=record, variant_tester=primed_variant_tester ) From 491bf335c35a4a542f99987ebb75316995517834 Mon Sep 17 00:00:00 2001 From: ab63 Date: Wed, 31 Jul 2024 13:20:21 +0100 Subject: [PATCH 036/165] untested pass at name mapping implementation --- hairpin2/main.py | 37 ++++++++++++++++++++++++++++--------- 1 file changed, 28 insertions(+), 9 deletions(-) diff --git a/hairpin2/main.py b/hairpin2/main.py index a41fcbc..c6a7566 100644 --- a/hairpin2/main.py +++ b/hairpin2/main.py @@ -1,3 +1,4 @@ +from enum import unique import pysam from hairpin2 import ref2seq as r2s, constants as c from statistics import mean, median, stdev @@ -60,7 +61,7 @@ def validate_read( read_flag |= c.ValidatorFlags.CLIPQUAL.value # First, check for sub try: - mut_pos, mut_op = r2s.ref2querypos(read, vcf_record.start) # VCF 1-INDEXED, BAM 0-INDEXED - vcf_record.start = 0-indexed mutation position. testing with pos, 1-indexed, to see if match Peter + mut_pos, mut_op = r2s.ref2querypos(read, vcf_record.start) # VCF 1-INDEXED, BAM 0-INDEXED (vcf_record.start = 0-indexed mutation position) except IndexError: read_flag |= c.ValidatorFlags.NOT_ALIGNED.value else: @@ -195,7 +196,6 @@ def test_variant( read_idx_wrt_aln = mut_pos - read.query_alignment_start + 1 mut_read_fracs_f.append(read_idx_wrt_aln / read.query_alignment_length) mut_read_pos_f.append(read_idx_wrt_aln) - try: aln_scores.append(read.get_tag('AS') / read.query_length) except KeyError: @@ -277,6 +277,7 @@ def main_cli() -> None: req.add_argument('-o', '--vcf-out', help="path to vcf out", required=True) req.add_argument('-b', '--bams', help="list of paths to name-sorted bams for samples in input vcf, whitespace separated", nargs='+', required=True) opt = parser.add_argument_group('options') + opt.add_argument('-m', '--name-mapping', help='map VCF sample names to BAM sample names', metavar='VCF:BAM', nargs='+') opt.add_argument('-cq', '--clip-quality-cutoff', help='default: 35', 
type=int, default=35) opt.add_argument('-mq', '--min-mapping-quality', help='default: 11', type=int, default=11) opt.add_argument('-mb', '--min-base-quality', help='default: 25', type=int, default=25) @@ -312,8 +313,7 @@ def main_cli() -> None: cleanup(msg='failed to open VCF output, reporting: {}'.format(e)) sample_names: list[str] = list(vcf_in_handle.header.samples) - - bam_reader_d: dict[str, None | pysam.AlignmentFile] = dict.fromkeys(sample_names) + bam_reader_d: dict[str, pysam.AlignmentFile] = dict.fromkeys(sample_names) # type: ignore for path in args.bams: try: bam = pysam.AlignmentFile(path, 'rb') @@ -324,15 +324,34 @@ def main_cli() -> None: # this may cause problems? # check with Peter bam_sample = bam.header.to_dict()['RG'][1]['SM'] - if bam_sample not in sample_names: - cleanup(msg='name in header ({}) of BAM at {} does not match any samples in VCF'.format(bam_sample, path)) - else: - bam_reader_d[bam_sample] = bam + bam_reader_d[bam_sample] = bam + if args.name_mapping: + vcf_map_names = [] + bam_map_names = [] + for pair in args.name_mapping: + kv_split = pair.split(':') # VCF:BAM + if len(kv_split) != 2: + cleanup(1, 'name mapping misformatted, more than two elements in map string {}'.format(pair)) + vcf_map_names.append(kv_split[0]) + bam_map_names.append(kv_split[1]) + if len(set(vcf_map_names)) != len(vcf_map_names): + cleanup(1, 'duplicate VCF sample names in name mapping') + if not sorted(vcf_map_names) == sorted(sample_names): + cleanup(1, 'VCF sample names in name mapping do not match VCF sample names as retrieved from VCF') + if len(set(bam_map_names)) != len(bam_map_names): + cleanup(1, 'duplicate BAM sample names in name mapping') + if not sorted(bam_map_names) == sorted(bam_reader_d.keys()): + cleanup(1, 'BAM sample names in name mapping do not match BAM sample names as retreived from BAMs') + mapped_bam_reader_d = {vcf_map_names[bam_map_names.index(k)]: v for k, v in bam_reader_d.items()} + else: + for bam_sample in 
bam_reader_d.keys(): + if bam_sample not in sample_names: + cleanup(msg='name in header ({}) of BAM does not match any samples in VCF'.format(bam_sample)) for record in vcf_in_handle.fetch(): try: filter_d: dict[str, c.Filters] = test_record_per_alt( - bams=bam_reader_d, # type: ignore + bams=mapped_bam_reader_d if args.name_mapping else bam_reader_d, vcf_rec=record, variant_tester=primed_variant_tester ) From c41dce144a484ccc0262bd5b12dd6f91a78b97d1 Mon Sep 17 00:00:00 2001 From: ab63 Date: Mon, 5 Aug 2024 16:05:00 +0100 Subject: [PATCH 037/165] tested pass at name mapping --- .gitignore | 5 +---- hairpin2/main.py | 6 +++--- 2 files changed, 4 insertions(+), 7 deletions(-) diff --git a/.gitignore b/.gitignore index a1e5cef..749ae4b 100644 --- a/.gitignore +++ b/.gitignore @@ -1,10 +1,7 @@ -data/ +data* .env/ dist/ *.egg-info/ - -data/ -data __pycache__/ .helix/ build/ diff --git a/hairpin2/main.py b/hairpin2/main.py index c6a7566..5ab866e 100644 --- a/hairpin2/main.py +++ b/hairpin2/main.py @@ -313,7 +313,7 @@ def main_cli() -> None: cleanup(msg='failed to open VCF output, reporting: {}'.format(e)) sample_names: list[str] = list(vcf_in_handle.header.samples) - bam_reader_d: dict[str, pysam.AlignmentFile] = dict.fromkeys(sample_names) # type: ignore + bam_reader_d: dict[str, pysam.AlignmentFile] = {} for path in args.bams: try: bam = pysam.AlignmentFile(path, 'rb') @@ -323,8 +323,8 @@ def main_cli() -> None: # in header field RG # this may cause problems? 
# check with Peter - bam_sample = bam.header.to_dict()['RG'][1]['SM'] - bam_reader_d[bam_sample] = bam + bam_sample_name = bam.header.to_dict()['RG'][0]['SM'] + bam_reader_d[bam_sample_name] = bam if args.name_mapping: vcf_map_names = [] bam_map_names = [] From 502763f8eeba7f5bbeda011ec2a0cbf455205fac Mon Sep 17 00:00:00 2001 From: ab63 Date: Mon, 5 Aug 2024 16:05:53 +0100 Subject: [PATCH 038/165] bump version --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 236db44..76e5504 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -7,7 +7,7 @@ build-backend = "setuptools.build_meta" [project] name = "hairpin2" -version = "0.0.1a" +version = "0.0.2a" requires-python = ">= 3.10" dependencies = [ 'pysam == 0.22.1' From e7079454e4cfd220963ae1056132147ec287a30d Mon Sep 17 00:00:00 2001 From: ab63 Date: Mon, 5 Aug 2024 16:39:16 +0100 Subject: [PATCH 039/165] remove unused dependency --- hairpin2/main.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/hairpin2/main.py b/hairpin2/main.py index 5ab866e..1188c41 100644 --- a/hairpin2/main.py +++ b/hairpin2/main.py @@ -1,4 +1,3 @@ -from enum import unique import pysam from hairpin2 import ref2seq as r2s, constants as c from statistics import mean, median, stdev @@ -114,8 +113,8 @@ def validate_read( # n.b. nothing done if complex read if read_flag == c.ValidatorFlags.CLEAR.value: - # is it safe to assume this is always mate? 
- mate_end = r2s.ref_end_via_cigar(mate_cig, read.next_reference_start) # THIS ONLY WORKS ASSUMING MATE IS NEXT READ + # "next", through an unfortunate quirk of history, means "mate", so this is reliable (pulls RNEXT) + mate_end = r2s.ref_end_via_cigar(mate_cig, read.next_reference_start) if not (read.flag & 0x40): # this looks like it should be checked for indexing snags pair_start = read.reference_start From ba3011861bdc20dc8ccbba77ff61ba65af5a07ed Mon Sep 17 00:00:00 2001 From: ab63 Date: Tue, 6 Aug 2024 15:34:13 +0100 Subject: [PATCH 040/165] incorporate Luca's review --- hairpin2/constants.py | 10 ---------- hairpin2/helpers.py | 26 ++++++++++++++++++++++++++ hairpin2/main.py | 31 +++++++++++++++++-------------- 3 files changed, 43 insertions(+), 24 deletions(-) create mode 100644 hairpin2/helpers.py diff --git a/hairpin2/constants.py b/hairpin2/constants.py index c9a494e..5db5905 100644 --- a/hairpin2/constants.py +++ b/hairpin2/constants.py @@ -69,13 +69,3 @@ def get_field(self, field_name): FiltReturn = Callable[..., Filters] FlagReturn = Callable[..., int] - -def print_flag( - print_enum: Flag -) -> None: - print([':'.join([str(e), hex(e.value)]) for e in print_enum]) - -def print_enum( - print_enum: IntEnum -) -> None: - print([e for e in print_enum]) diff --git a/hairpin2/helpers.py b/hairpin2/helpers.py new file mode 100644 index 0000000..4636971 --- /dev/null +++ b/hairpin2/helpers.py @@ -0,0 +1,26 @@ +from enum import IntEnum, Flag + + +def has_duplicates( + l: list +) -> bool: + return len(l) != len(set(l)) + + +def lists_not_equal( + l1: list, + l2: list +) -> bool: + return sorted(l1) != sorted(l2) + + +def print_flag( + print_enum: Flag +) -> None: + print([':'.join([str(e), hex(e.value)]) for e in print_enum]) + + +def print_enum( + print_enum: IntEnum +) -> None: + print([e for e in print_enum]) diff --git a/hairpin2/main.py b/hairpin2/main.py index 1188c41..313b293 100644 --- a/hairpin2/main.py +++ b/hairpin2/main.py @@ -1,5 +1,5 @@ import 
pysam -from hairpin2 import ref2seq as r2s, constants as c +from hairpin2 import ref2seq as r2s, constants as c, helpers as h from statistics import mean, median, stdev import argparse import logging @@ -311,7 +311,10 @@ def main_cli() -> None: except Exception as e: cleanup(msg='failed to open VCF output, reporting: {}'.format(e)) - sample_names: list[str] = list(vcf_in_handle.header.samples) + sample_names = list(vcf_in_handle.header.samples) # type:ignore + if len(set(sample_names)) != len(sample_names): + cleanup(msg='duplicate sample names in VCF') + sample_names: set[str] = set(sample_names) bam_reader_d: dict[str, pysam.AlignmentFile] = {} for path in args.bams: try: @@ -330,22 +333,22 @@ def main_cli() -> None: for pair in args.name_mapping: kv_split = pair.split(':') # VCF:BAM if len(kv_split) != 2: - cleanup(1, 'name mapping misformatted, more than two elements in map string {}'.format(pair)) + cleanup(msg='name mapping misformatted, more than two elements in map string {}'.format(pair)) vcf_map_names.append(kv_split[0]) bam_map_names.append(kv_split[1]) - if len(set(vcf_map_names)) != len(vcf_map_names): - cleanup(1, 'duplicate VCF sample names in name mapping') - if not sorted(vcf_map_names) == sorted(sample_names): - cleanup(1, 'VCF sample names in name mapping do not match VCF sample names as retrieved from VCF') - if len(set(bam_map_names)) != len(bam_map_names): - cleanup(1, 'duplicate BAM sample names in name mapping') - if not sorted(bam_map_names) == sorted(bam_reader_d.keys()): - cleanup(1, 'BAM sample names in name mapping do not match BAM sample names as retreived from BAMs') + if h.has_duplicates(vcf_map_names): + cleanup(msg='duplicate VCF sample names in name mapping') + if h.lists_not_equal(vcf_map_names, sample_names): + cleanup(msg='VCF sample names in name mapping do not match VCF sample names as retrieved from VCF') + if h.has_duplicates(bam_map_names): + cleanup(msg='duplicate BAM sample names in name mapping') + if 
h.lists_not_equal(bam_map_names, bam_reader_d.keys()): + cleanup(msg='BAM sample names in name mapping do not match BAM sample names as retreived from BAMs') mapped_bam_reader_d = {vcf_map_names[bam_map_names.index(k)]: v for k, v in bam_reader_d.items()} else: - for bam_sample in bam_reader_d.keys(): - if bam_sample not in sample_names: - cleanup(msg='name in header ({}) of BAM does not match any samples in VCF'.format(bam_sample)) + names_mismatch = sample_names ^ bam_reader_d.keys() + if len(names_mismatch): + cleanup(msg='name mismatch between BAMs and VCF: {}'.format(names_mismatch)) for record in vcf_in_handle.fetch(): try: From 48136ca74743ff5be7c5cdec12f2983f922d31d0 Mon Sep 17 00:00:00 2001 From: ab63 Date: Tue, 6 Aug 2024 16:55:15 +0100 Subject: [PATCH 041/165] update help --- hairpin2/main.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/hairpin2/main.py b/hairpin2/main.py index 313b293..faa5bb5 100644 --- a/hairpin2/main.py +++ b/hairpin2/main.py @@ -276,13 +276,13 @@ def main_cli() -> None: req.add_argument('-o', '--vcf-out', help="path to vcf out", required=True) req.add_argument('-b', '--bams', help="list of paths to name-sorted bams for samples in input vcf, whitespace separated", nargs='+', required=True) opt = parser.add_argument_group('options') - opt.add_argument('-m', '--name-mapping', help='map VCF sample names to BAM sample names', metavar='VCF:BAM', nargs='+') + opt.add_argument('-al', '--al-filter-threshold', help='default: 0.93', type=float, default=0.93) opt.add_argument('-cq', '--clip-quality-cutoff', help='default: 35', type=int, default=35) opt.add_argument('-mq', '--min-mapping-quality', help='default: 11', type=int, default=11) opt.add_argument('-mb', '--min-base-quality', help='default: 25', type=int, default=25) - opt.add_argument('-ms', '--max-read-span', help='default: 6', type=int, default=6) - opt.add_argument('-al', '--al-filter-threshold', help='default: 0.93', type=float, default=0.93) - 
opt.add_argument('-c9', '--cent90-threshold', help='default: 0.15', type=float, default=0.15) + opt.add_argument('-ms', '--max-read-span', help='maximum +- position to use when detecting PCR duplicates - default: 6', type=int, default=6) + opt.add_argument('-pf', '--position-fraction', help='>90%% of variant reads variant must occur within [fraction] of start/end to allow HPF flag - default: 0.15', type=float, default=0.15) + opt.add_argument('-m', '--name-mapping', help='map VCF sample names to BAM sample names; useful if they differ', metavar='VCF:BAM', nargs='+') opt.add_argument('-j', '--json-log', dest='json_path', help='log input parameters/arguments to JSON', type=str) args = parser.parse_args() From 51bbd5809963b7ecb169e149539d5787a6c79366 Mon Sep 17 00:00:00 2001 From: ab63 Date: Wed, 7 Aug 2024 16:36:25 +0100 Subject: [PATCH 042/165] update arg handling to allow for submission of config file --- hairpin2/constants.py | 2 + hairpin2/helpers.py | 4 +- hairpin2/main.py | 85 +++++++++++++++++++++++++++---------------- 3 files changed, 57 insertions(+), 34 deletions(-) diff --git a/hairpin2/constants.py b/hairpin2/constants.py index 5db5905..64a41db 100644 --- a/hairpin2/constants.py +++ b/hairpin2/constants.py @@ -6,6 +6,8 @@ EXIT_SUCCESS = 0 EXIT_FAILURE = 1 +DEFAULTS: dict[str, int | float] = dict((('al_filter_threshold', 0.93), ('min_clip_quality', 35), ('min_mapping_quality', 11), ('min_base_quality', 25), ('max_read_span', 6), ('position_fraction', 0.15))) + FiltCodes = IntEnum('FiltCodes', ['SIXTYAI', 'SIXTYBI', 'ON_THRESHOLD', 'INSUFFICIENT_READS', 'NO_MUTANTS'], start=0) diff --git a/hairpin2/helpers.py b/hairpin2/helpers.py index 4636971..b6f6ea4 100644 --- a/hairpin2/helpers.py +++ b/hairpin2/helpers.py @@ -8,8 +8,8 @@ def has_duplicates( def lists_not_equal( - l1: list, - l2: list + l1: list | set, + l2: list | set ) -> bool: return sorted(l1) != sorted(l2) diff --git a/hairpin2/main.py b/hairpin2/main.py index faa5bb5..e85d832 100644 --- 
a/hairpin2/main.py +++ b/hairpin2/main.py @@ -12,13 +12,13 @@ def cleanup(code: int = c.EXIT_FAILURE, msg: None | str = None) -> None: if code != c.EXIT_SUCCESS and msg: logging.error(msg) - for obj_name in ['vcf_in_handle', 'vcf_out_handle']: + for obj_name in ['vcf_in_handle', 'vcf_out_handle', 'output_json']: if obj_name in locals(): locals()[obj_name].close() # lol - if 'bam_reader_d' in locals(): - locals()['bam_reader_d'].close() - if 'log_file' in locals() and locals()['log_file']: - locals()['log_file'].close() + for obj_name in ['bam_reader_d', 'mapped_bam_reader_d']: + if obj_name in locals(): + for k in locals()[obj_name].keys(): + locals()[obj_name][k].close() if code == c.EXIT_SUCCESS: logging.info('hairpin complete') sys.exit(code) @@ -29,7 +29,7 @@ def validate_read( vcf_record: pysam.VariantRecord, read: pysam.AlignedSegment, min_mapqual: int, - clip_qual_cutoff: int, + min_clipqual: int, min_basequal: int, alt: str ) -> int: @@ -56,7 +56,7 @@ def validate_read( read_flag |= c.ValidatorFlags.READ_FIELDS_MISSING.value else: if ('S' in read.cigarstring and # type: ignore - mean(read.query_alignment_qualities) < clip_qual_cutoff): # type: ignore + mean(read.query_alignment_qualities) < min_clipqual): # type: ignore read_flag |= c.ValidatorFlags.CLIPQUAL.value # First, check for sub try: @@ -136,7 +136,7 @@ def test_variant( alt: str, al_thresh: float, max_span: int, - cent90_thresh: float, + position_fraction_thresh: float, read_validator: c.FlagReturn, ) -> c.Filters: @@ -210,7 +210,7 @@ def test_variant( if len(mut_read_pos_f) > 1 and not len(mut_read_pos_r) > 1: mad_f = max(mut_read_pos_f) - min(mut_read_pos_f) sd_f = stdev(mut_read_pos_f) - if (((sum([x <= cent90_thresh for x in mut_read_fracs_f]) / len(mut_read_pos_f)) < 0.9) and + if (((sum([x <= position_fraction_thresh for x in mut_read_fracs_f]) / len(mut_read_pos_f)) < 0.9) and mad_f > 0 and sd_f > 4): hp_filt.code = c.FiltCodes.SIXTYAI.value # 60A(i) @@ -220,7 +220,7 @@ def test_variant( 
elif len(mut_read_pos_r) > 1 and not len(mut_read_pos_f) > 1: mad_r = max(mut_read_pos_r) - min(mut_read_pos_r) sd_r = stdev(mut_read_pos_r) - if (((sum([x <= cent90_thresh for x in mut_read_fracs_r]) / len(mut_read_pos_r)) < 0.9) and + if (((sum([x <= position_fraction_thresh for x in mut_read_fracs_r]) / len(mut_read_pos_r)) < 0.9) and mad_r > 0 and sd_r > 4): hp_filt.code = c.FiltCodes.SIXTYAI.value @@ -232,7 +232,7 @@ def test_variant( sd_f = stdev(mut_read_pos_f) mad_r = max(mut_read_pos_r) - min(mut_read_pos_r) sd_r = stdev(mut_read_pos_r) - frac_lt_thresh = sum([x <= cent90_thresh for x in mut_read_fracs_f + mut_read_fracs_r]) / (len(mut_read_pos_f) + len(mut_read_pos_r)) + frac_lt_thresh = sum([x <= position_fraction_thresh for x in mut_read_fracs_f + mut_read_fracs_r]) / (len(mut_read_pos_f) + len(mut_read_pos_r)) if (frac_lt_thresh < 0.9 or (mad_f > 2 and mad_r > 2 and sd_f > 2 and sd_r > 2) or (mad_f > 1 and sd_f > 10) or @@ -271,28 +271,55 @@ def main_cli() -> None: parser = argparse.ArgumentParser(prog="hairpin") parser._optionals.title = 'info' parser.add_argument('-v', '--version', help='print version', action='version', version=c.VERSION) - req = parser.add_argument_group('required') - req.add_argument('-i', '--vcf-in', help="path to input vcf", required=True) - req.add_argument('-o', '--vcf-out', help="path to vcf out", required=True) - req.add_argument('-b', '--bams', help="list of paths to name-sorted bams for samples in input vcf, whitespace separated", nargs='+', required=True) - opt = parser.add_argument_group('options') - opt.add_argument('-al', '--al-filter-threshold', help='default: 0.93', type=float, default=0.93) - opt.add_argument('-cq', '--clip-quality-cutoff', help='default: 35', type=int, default=35) - opt.add_argument('-mq', '--min-mapping-quality', help='default: 11', type=int, default=11) - opt.add_argument('-mb', '--min-base-quality', help='default: 25', type=int, default=25) - opt.add_argument('-ms', '--max-read-span', 
help='maximum +- position to use when detecting PCR duplicates - default: 6', type=int, default=6) - opt.add_argument('-pf', '--position-fraction', help='>90%% of variant reads variant must occur within [fraction] of start/end to allow HPF flag - default: 0.15', type=float, default=0.15) + req = parser.add_argument_group('basic') + req.add_argument('-i', '--vcf-in', help="path to input vcf") + req.add_argument('-o', '--vcf-out', help="path to vcf out") + req.add_argument('-b', '--bams', help="list of paths to name-sorted bams for samples in input vcf, whitespace separated", nargs='+') + opt = parser.add_argument_group('extended') + opt.add_argument('-ji', '--input-json', help='path to JSON of input parameters; overridden by arguments provided on command line', type=str) + opt.add_argument('-jo', '--output-json', help='log input arguments to JSON', type=str) opt.add_argument('-m', '--name-mapping', help='map VCF sample names to BAM sample names; useful if they differ', metavar='VCF:BAM', nargs='+') - opt.add_argument('-j', '--json-log', dest='json_path', help='log input parameters/arguments to JSON', type=str) + opt.add_argument('-al', '--al-filter-threshold', help='median read alignment score below which a variant is flagged as ALF - default: 0.93', type=float) + opt.add_argument('-mc', '--min-clip-quality', help='discard reads with mean base quality of aligned bases below this value, if they have soft-clipped bases - default: 35', type=int) + opt.add_argument('-mq', '--min-mapping-quality', help='discard reads with mapping quality below this value - default: 11', type=int) + opt.add_argument('-mb', '--min-base-quality', help='discard reads with base quality at variant position below this value - default: 25', type=int ) + opt.add_argument('-ms', '--max-read-span', help='maximum +- position to use when detecting PCR duplicates - default: 6', type=int) + opt.add_argument('-pf', '--position-fraction', help='>90%% of variant reads variant must occur within [fraction] 
of start/end to allow HPF flag - default: 0.15', type=float) args = parser.parse_args() + json_config: dict | None = None + if args.input_json: + logging.info('args JSON provided, optional arguments will be loaded from args JSON if not present on command line') + try: + with open(args.input_json, 'r') as f: + json_config = json.load(f) + except Exception as e: + cleanup(msg='failed to open input JSON, reporting: {}'.format(e)) + + # set arg defaults + for k in vars(args).keys(): + if not vars(args)[k]: + if json_config and k in json_config.keys(): + setattr(args, k, json_config[k]) + elif k in c.DEFAULTS.keys(): + setattr(args, k, c.DEFAULTS[k]) + + # test args are sensible + + if args.output_json: + try: + with open(args.output_json, "w") as output_json: + json.dump({k: vars(args)[k] for k in (vars(args).keys() - set(['input_json', 'output_json']))}, output_json, indent="") + except Exception as e: + cleanup(msg='failed to write output JSON, reporting: {}'.format(e)) + primed_validate_read = partial(validate_read, min_mapqual=args.min_mapping_quality, - clip_qual_cutoff=args.clip_quality_cutoff, + min_clipqual=args.min_clip_quality, min_basequal=args.min_base_quality) - primed_variant_tester = partial(test_variant, al_thresh=args.al_filter_threshold, max_span=args.max_read_span, cent90_thresh=args.cent90_threshold, read_validator=primed_validate_read) + primed_variant_tester = partial(test_variant, al_thresh=args.al_filter_threshold, max_span=args.max_read_span, position_fraction_thresh=args.position_fraction, read_validator=primed_validate_read) try: vcf_in_handle = pysam.VariantFile(args.vcf_in) @@ -369,13 +396,7 @@ def main_cli() -> None: record.info.update({filter.name: '|'.join([alt] + [str(f) if not type(f) == float else str(round(f, 3)) for f in filter][2:])}) try: - vcf_out_handle.write(record) + vcf_out_handle.write(record) # type:ignore except Exception as e: cleanup(msg='failed to write to vcf, reporting: {}'.format(e)) - if args.json_path: - try: - 
with open(args.json_path, "w") as jo: - json.dump(vars(args), jo) - except Exception as e: - logging.warning('retaining output, but failed to write to parameters json, reporting {}'.format(e)) cleanup(c.EXIT_SUCCESS) From ec20ecf74e34763480b192b1ab183e9fdb7e3006 Mon Sep 17 00:00:00 2001 From: ab63 Date: Thu, 8 Aug 2024 11:11:14 +0100 Subject: [PATCH 043/165] verify args --- hairpin2/helpers.py | 40 +++++++++++++++++++++++++++++++ hairpin2/main.py | 57 +++++++++++++++++---------------------------- 2 files changed, 62 insertions(+), 35 deletions(-) diff --git a/hairpin2/helpers.py b/hairpin2/helpers.py index b6f6ea4..12384dc 100644 --- a/hairpin2/helpers.py +++ b/hairpin2/helpers.py @@ -1,4 +1,44 @@ from enum import IntEnum, Flag +import logging +import sys +from hairpin2 import constants as c + + +def cleanup(code: int = c.EXIT_FAILURE, msg: None | str = None) -> None: + if code != c.EXIT_SUCCESS and msg: + logging.error(msg) + for obj_name in ['vcf_in_handle', 'vcf_out_handle', 'output_json']: + if obj_name in locals(): + locals()[obj_name].close() + for obj_name in ['bam_reader_d', 'mapped_bam_reader_d']: + if obj_name in locals(): + for v in locals()[obj_name].values(): + v.close() + if code == c.EXIT_SUCCESS: + logging.info('hairpin complete') + sys.exit(code) + + +# <= - is subset of +def verify_json(jd: dict) -> bool: + return jd.keys() <= {'vcf_in', 'vcf_out', 'bams', 'input_json', 'ouput_json', 'name_mapping', 'al_filter_threshold', 'min_clip_quality', 'min_mapping_quality', 'min_base_quality', 'max_read_span', 'position_fraction'} + + +def test_options(args): + if not args.vcf_in: + cleanup(msg='--vcf-in required') + if not args.vcf_out: + cleanup(msg='--vcf-out required') + if not args.bams: + cleanup(msg='--bams required') + if not (0 < args.min_clip_quality < 93): + cleanup(msg='invalid --min-clip-quality; range 0-93') + if not (0 < args.min_mapping_quality < 60): + cleanup(msg='invalid --min-mapping-quality; range 0-60') + if not (0 < 
args.min_base_quality < 93): + cleanup(msg='invalid --min-base-quality; range 0-93') + if not (0 < args.position_fraction < 1): + cleanup(msg='invalid --position-fraction; range 0-1') def has_duplicates( diff --git a/hairpin2/main.py b/hairpin2/main.py index e85d832..2fd3bd1 100644 --- a/hairpin2/main.py +++ b/hairpin2/main.py @@ -6,23 +6,6 @@ import json from itertools import tee from functools import partial -import sys - - -def cleanup(code: int = c.EXIT_FAILURE, msg: None | str = None) -> None: - if code != c.EXIT_SUCCESS and msg: - logging.error(msg) - for obj_name in ['vcf_in_handle', 'vcf_out_handle', 'output_json']: - if obj_name in locals(): - locals()[obj_name].close() # lol - for obj_name in ['bam_reader_d', 'mapped_bam_reader_d']: - if obj_name in locals(): - for k in locals()[obj_name].keys(): - locals()[obj_name][k].close() - if code == c.EXIT_SUCCESS: - logging.info('hairpin complete') - sys.exit(code) - # CIGAR best retrieved from CG:B,I tag - implement in future def validate_read( @@ -279,7 +262,7 @@ def main_cli() -> None: opt.add_argument('-ji', '--input-json', help='path to JSON of input parameters; overridden by arguments provided on command line', type=str) opt.add_argument('-jo', '--output-json', help='log input arguments to JSON', type=str) opt.add_argument('-m', '--name-mapping', help='map VCF sample names to BAM sample names; useful if they differ', metavar='VCF:BAM', nargs='+') - opt.add_argument('-al', '--al-filter-threshold', help='median read alignment score below which a variant is flagged as ALF - default: 0.93', type=float) + opt.add_argument('-al', '--al-filter-threshold', help='threshhold for median of read alignment scores over read length, below which a variant is flagged as ALF - default: 0.93', type=float) opt.add_argument('-mc', '--min-clip-quality', help='discard reads with mean base quality of aligned bases below this value, if they have soft-clipped bases - default: 35', type=int) opt.add_argument('-mq', 
'--min-mapping-quality', help='discard reads with mapping quality below this value - default: 11', type=int) opt.add_argument('-mb', '--min-base-quality', help='discard reads with base quality at variant position below this value - default: 25', type=int ) @@ -290,12 +273,15 @@ def main_cli() -> None: json_config: dict | None = None if args.input_json: - logging.info('args JSON provided, optional arguments will be loaded from args JSON if not present on command line') + logging.info('args JSON provided, arguments will be loaded from JSON if not present on command line') try: with open(args.input_json, 'r') as f: json_config = json.load(f) except Exception as e: - cleanup(msg='failed to open input JSON, reporting: {}'.format(e)) + h.cleanup(msg='failed to open input JSON, reporting: {}'.format(e)) + else: + if not h.verify_json(json_config): # type:ignore + h.cleanup(msg='JSON keys are not subset of available arguments (excluding --input-json and --output_json)') # set arg defaults for k in vars(args).keys(): @@ -305,14 +291,15 @@ def main_cli() -> None: elif k in c.DEFAULTS.keys(): setattr(args, k, c.DEFAULTS[k]) - # test args are sensible + # test args are sensible, exit if not + h.test_options(args) if args.output_json: try: with open(args.output_json, "w") as output_json: - json.dump({k: vars(args)[k] for k in (vars(args).keys() - set(['input_json', 'output_json']))}, output_json, indent="") + json.dump({k: vars(args)[k] for k in (vars(args).keys() - {'input_json', 'output_json'})}, output_json, indent="") except Exception as e: - cleanup(msg='failed to write output JSON, reporting: {}'.format(e)) + h.cleanup(msg='failed to write output JSON, reporting: {}'.format(e)) primed_validate_read = partial(validate_read, min_mapqual=args.min_mapping_quality, @@ -324,7 +311,7 @@ def main_cli() -> None: try: vcf_in_handle = pysam.VariantFile(args.vcf_in) except Exception as e: - cleanup(msg='failed to open VCF input, reporting: {}'.format(e)) + h.cleanup(msg='failed to 
open VCF input, reporting: {}'.format(e)) # init output out_head = vcf_in_handle.header @@ -336,18 +323,18 @@ def main_cli() -> None: try: vcf_out_handle = pysam.VariantFile(args.vcf_out, 'w', header=out_head) except Exception as e: - cleanup(msg='failed to open VCF output, reporting: {}'.format(e)) + h.cleanup(msg='failed to open VCF output, reporting: {}'.format(e)) sample_names = list(vcf_in_handle.header.samples) # type:ignore if len(set(sample_names)) != len(sample_names): - cleanup(msg='duplicate sample names in VCF') + h.cleanup(msg='duplicate sample names in VCF') sample_names: set[str] = set(sample_names) bam_reader_d: dict[str, pysam.AlignmentFile] = {} for path in args.bams: try: bam = pysam.AlignmentFile(path, 'rb') except Exception as e: - cleanup(msg='failed to read BAM at {}, reporting: {}'.format(path, e)) + h.cleanup(msg='failed to read BAM at {}, reporting: {}'.format(path, e)) # grab the sample name from first SM field # in header field RG # this may cause problems? @@ -360,22 +347,22 @@ def main_cli() -> None: for pair in args.name_mapping: kv_split = pair.split(':') # VCF:BAM if len(kv_split) != 2: - cleanup(msg='name mapping misformatted, more than two elements in map string {}'.format(pair)) + h.cleanup(msg='name mapping misformatted, more than two elements in map string {}'.format(pair)) vcf_map_names.append(kv_split[0]) bam_map_names.append(kv_split[1]) if h.has_duplicates(vcf_map_names): - cleanup(msg='duplicate VCF sample names in name mapping') + h.cleanup(msg='duplicate VCF sample names in name mapping') if h.lists_not_equal(vcf_map_names, sample_names): - cleanup(msg='VCF sample names in name mapping do not match VCF sample names as retrieved from VCF') + h.cleanup(msg='VCF sample names in name mapping do not match VCF sample names as retrieved from VCF') if h.has_duplicates(bam_map_names): - cleanup(msg='duplicate BAM sample names in name mapping') + h.cleanup(msg='duplicate BAM sample names in name mapping') if 
h.lists_not_equal(bam_map_names, bam_reader_d.keys()): - cleanup(msg='BAM sample names in name mapping do not match BAM sample names as retreived from BAMs') + h.cleanup(msg='BAM sample names in name mapping do not match BAM sample names as retreived from BAMs') mapped_bam_reader_d = {vcf_map_names[bam_map_names.index(k)]: v for k, v in bam_reader_d.items()} else: names_mismatch = sample_names ^ bam_reader_d.keys() if len(names_mismatch): - cleanup(msg='name mismatch between BAMs and VCF: {}'.format(names_mismatch)) + h.cleanup(msg='name mismatch between BAMs and VCF: {}'.format(names_mismatch)) for record in vcf_in_handle.fetch(): try: @@ -398,5 +385,5 @@ def main_cli() -> None: try: vcf_out_handle.write(record) # type:ignore except Exception as e: - cleanup(msg='failed to write to vcf, reporting: {}'.format(e)) - cleanup(c.EXIT_SUCCESS) + h.cleanup(msg='failed to write to vcf, reporting: {}'.format(e)) + h.cleanup(c.EXIT_SUCCESS) From 7250ef429668781cb8c5d746473c945f53a4a6d0 Mon Sep 17 00:00:00 2001 From: ab63 Date: Fri, 9 Aug 2024 10:20:00 +0100 Subject: [PATCH 044/165] remove unnecessary cleanup code --- hairpin2/helpers.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/hairpin2/helpers.py b/hairpin2/helpers.py index 12384dc..785edfc 100644 --- a/hairpin2/helpers.py +++ b/hairpin2/helpers.py @@ -7,13 +7,6 @@ def cleanup(code: int = c.EXIT_FAILURE, msg: None | str = None) -> None: if code != c.EXIT_SUCCESS and msg: logging.error(msg) - for obj_name in ['vcf_in_handle', 'vcf_out_handle', 'output_json']: - if obj_name in locals(): - locals()[obj_name].close() - for obj_name in ['bam_reader_d', 'mapped_bam_reader_d']: - if obj_name in locals(): - for v in locals()[obj_name].values(): - v.close() if code == c.EXIT_SUCCESS: logging.info('hairpin complete') sys.exit(code) From 07392f48a6135677f7f692fd26400f10f650b415 Mon Sep 17 00:00:00 2001 From: ab63 Date: Tue, 13 Aug 2024 14:05:23 +0100 Subject: [PATCH 045/165] allow for ignoring VCF samples --- 
hairpin2/main.py | 61 +++++++++++++++++++++++++----------------------- 1 file changed, 32 insertions(+), 29 deletions(-) diff --git a/hairpin2/main.py b/hairpin2/main.py index 2fd3bd1..4f1ce37 100644 --- a/hairpin2/main.py +++ b/hairpin2/main.py @@ -97,7 +97,7 @@ def validate_read( # n.b. nothing done if complex read if read_flag == c.ValidatorFlags.CLEAR.value: # "next", through an unfortunate quirk of history, means "mate", so this is reliable (pulls RNEXT) - mate_end = r2s.ref_end_via_cigar(mate_cig, read.next_reference_start) + mate_end = r2s.ref_end_via_cigar(mate_cig, read.next_reference_start) # type:ignore if not (read.flag & 0x40): # this looks like it should be checked for indexing snags pair_start = read.reference_start @@ -106,9 +106,9 @@ def validate_read( if pair_start <= mate_end: pair_start = mate_end + 1 else: - if pair_end >= read.next_reference_start: + if pair_end >= read.next_reference_start: # type:ignore pair_end = read.next_reference_start - 1 - if not (pair_start <= vcf_record.start <= pair_end): + if not (pair_start <= vcf_record.start <= pair_end): # type:ignore read_flag |= c.ValidatorFlags.OVERLAP.value return read_flag @@ -179,7 +179,7 @@ def test_variant( mut_read_fracs_f.append(read_idx_wrt_aln / read.query_alignment_length) mut_read_pos_f.append(read_idx_wrt_aln) try: - aln_scores.append(read.get_tag('AS') / read.query_length) + aln_scores.append(read.get_tag('AS') / read.query_length) # type:ignore except KeyError: pass if len(aln_scores) != 0: @@ -313,23 +313,11 @@ def main_cli() -> None: except Exception as e: h.cleanup(msg='failed to open VCF input, reporting: {}'.format(e)) - # init output - out_head = vcf_in_handle.header - out_head.add_line("##FILTER=".format(args.al_filter_threshold)) - out_head.add_line("##FILTER=") - out_head.add_line("##INFO=") - out_head.add_line("##INFO=") - - try: - vcf_out_handle = pysam.VariantFile(args.vcf_out, 'w', header=out_head) - except Exception as e: - h.cleanup(msg='failed to open VCF 
output, reporting: {}'.format(e)) - sample_names = list(vcf_in_handle.header.samples) # type:ignore if len(set(sample_names)) != len(sample_names): h.cleanup(msg='duplicate sample names in VCF') sample_names: set[str] = set(sample_names) - bam_reader_d: dict[str, pysam.AlignmentFile] = {} + vcf_sample_to_bam_file: dict[str, pysam.AlignmentFile] = {} for path in args.bams: try: bam = pysam.AlignmentFile(path, 'rb') @@ -339,9 +327,11 @@ def main_cli() -> None: # in header field RG # this may cause problems? # check with Peter - bam_sample_name = bam.header.to_dict()['RG'][0]['SM'] - bam_reader_d[bam_sample_name] = bam + bam_sample_name = bam.header.to_dict()['RG'][0]['SM'] # type:ignore + vcf_sample_to_bam_file[bam_sample_name] = bam # type:ignore if args.name_mapping: + if len(args.name_mapping) > len(args.bams): + h.cleanup(msg="more name mappings provided than BAMs") vcf_map_names = [] bam_map_names = [] for pair in args.name_mapping: @@ -352,22 +342,35 @@ def main_cli() -> None: bam_map_names.append(kv_split[1]) if h.has_duplicates(vcf_map_names): h.cleanup(msg='duplicate VCF sample names in name mapping') - if h.lists_not_equal(vcf_map_names, sample_names): - h.cleanup(msg='VCF sample names in name mapping do not match VCF sample names as retrieved from VCF') + if not set(vcf_map_names) <= sample_names: + h.cleanup(msg="VCF sample names provided to flag are not equal to, or a subset of, VCF sample names as retrieved from VCF") if h.has_duplicates(bam_map_names): h.cleanup(msg='duplicate BAM sample names in name mapping') - if h.lists_not_equal(bam_map_names, bam_reader_d.keys()): - h.cleanup(msg='BAM sample names in name mapping do not match BAM sample names as retreived from BAMs') - mapped_bam_reader_d = {vcf_map_names[bam_map_names.index(k)]: v for k, v in bam_reader_d.items()} + if h.lists_not_equal(bam_map_names, vcf_sample_to_bam_file.keys()): # type:ignore + h.cleanup(msg='BAM sample names provided to name mapping flags do not match BAM sample names as 
retreived from BAM SM tags') + vcf_sample_to_bam_file = {vcf_map_names[bam_map_names.index(k)]: v for k, v in vcf_sample_to_bam_file.items()} else: - names_mismatch = sample_names ^ bam_reader_d.keys() - if len(names_mismatch): - h.cleanup(msg='name mismatch between BAMs and VCF: {}'.format(names_mismatch)) + if not vcf_sample_to_bam_file.keys() <= sample_names: + h.cleanup(msg='SM tags of BAMs provided do not match VCF sample names: {}'.format(vcf_sample_to_bam_file.keys() - sample_names)) + if sample_names != vcf_sample_to_bam_file.keys(): + logging.info("BAMs not provided for all VCF samples; {} will be ignored".format(sample_names - vcf_sample_to_bam_file.keys())) + + # init output + out_head = vcf_in_handle.header # type:ignore + out_head.add_line("##FILTER=".format(args.al_filter_threshold, ', '.join(vcf_sample_to_bam_file.keys()))) + out_head.add_line("##FILTER=".format(', '.join(vcf_sample_to_bam_file.keys()))) + out_head.add_line("##INFO=") + out_head.add_line("##INFO=") + + try: + vcf_out_handle = pysam.VariantFile(args.vcf_out, 'w', header=out_head) + except Exception as e: + h.cleanup(msg='failed to open VCF output, reporting: {}'.format(e)) - for record in vcf_in_handle.fetch(): + for record in vcf_in_handle.fetch(): # type:ignore try: filter_d: dict[str, c.Filters] = test_record_per_alt( - bams=mapped_bam_reader_d if args.name_mapping else bam_reader_d, + bams=vcf_sample_to_bam_file, vcf_rec=record, variant_tester=primed_variant_tester ) From 52a05742348b8571a7587128b929810ff8137a65 Mon Sep 17 00:00:00 2001 From: ab63 Date: Tue, 13 Aug 2024 15:40:03 +0100 Subject: [PATCH 046/165] grammar --- hairpin2/main.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/hairpin2/main.py b/hairpin2/main.py index 4f1ce37..087be05 100644 --- a/hairpin2/main.py +++ b/hairpin2/main.py @@ -343,11 +343,11 @@ def main_cli() -> None: if h.has_duplicates(vcf_map_names): h.cleanup(msg='duplicate VCF sample names in name mapping') if not set(vcf_map_names) 
<= sample_names: - h.cleanup(msg="VCF sample names provided to flag are not equal to, or a subset of, VCF sample names as retrieved from VCF") + h.cleanup(msg="VCF sample names provided to name mapping flag are not equal to, or a subset of, VCF sample names as retrieved from VCF") if h.has_duplicates(bam_map_names): h.cleanup(msg='duplicate BAM sample names in name mapping') if h.lists_not_equal(bam_map_names, vcf_sample_to_bam_file.keys()): # type:ignore - h.cleanup(msg='BAM sample names provided to name mapping flags do not match BAM sample names as retreived from BAM SM tags') + h.cleanup(msg='BAM sample names provided to name mapping flag do not match BAM sample names as retreived from BAM SM tags') vcf_sample_to_bam_file = {vcf_map_names[bam_map_names.index(k)]: v for k, v in vcf_sample_to_bam_file.items()} else: if not vcf_sample_to_bam_file.keys() <= sample_names: From aa8bb232d62319089c6787ccc0dce7100c68b2f5 Mon Sep 17 00:00:00 2001 From: ab63 Date: Tue, 13 Aug 2024 15:43:26 +0100 Subject: [PATCH 047/165] message consistency --- hairpin2/main.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/hairpin2/main.py b/hairpin2/main.py index 087be05..5a41706 100644 --- a/hairpin2/main.py +++ b/hairpin2/main.py @@ -341,17 +341,17 @@ def main_cli() -> None: vcf_map_names.append(kv_split[0]) bam_map_names.append(kv_split[1]) if h.has_duplicates(vcf_map_names): - h.cleanup(msg='duplicate VCF sample names in name mapping') + h.cleanup(msg='duplicate VCF sample names provided to name mapping flag') if not set(vcf_map_names) <= sample_names: h.cleanup(msg="VCF sample names provided to name mapping flag are not equal to, or a subset of, VCF sample names as retrieved from VCF") if h.has_duplicates(bam_map_names): - h.cleanup(msg='duplicate BAM sample names in name mapping') + h.cleanup(msg='duplicate BAM sample names provided to name mapping flag') if h.lists_not_equal(bam_map_names, vcf_sample_to_bam_file.keys()): # type:ignore - 
h.cleanup(msg='BAM sample names provided to name mapping flag do not match BAM sample names as retreived from BAM SM tags') + h.cleanup(msg='BAM sample names provided to name mapping flag do not match BAM SM tags') vcf_sample_to_bam_file = {vcf_map_names[bam_map_names.index(k)]: v for k, v in vcf_sample_to_bam_file.items()} else: if not vcf_sample_to_bam_file.keys() <= sample_names: - h.cleanup(msg='SM tags of BAMs provided do not match VCF sample names: {}'.format(vcf_sample_to_bam_file.keys() - sample_names)) + h.cleanup(msg='SM tags of BAMs do not match VCF sample names: {}'.format(vcf_sample_to_bam_file.keys() - sample_names)) if sample_names != vcf_sample_to_bam_file.keys(): logging.info("BAMs not provided for all VCF samples; {} will be ignored".format(sample_names - vcf_sample_to_bam_file.keys())) From af2e1beb5295012ecb34329693887d7c254170f2 Mon Sep 17 00:00:00 2001 From: ab63 Date: Tue, 13 Aug 2024 15:47:12 +0100 Subject: [PATCH 048/165] message consistency 2 --- hairpin2/main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hairpin2/main.py b/hairpin2/main.py index 5a41706..fab1bc9 100644 --- a/hairpin2/main.py +++ b/hairpin2/main.py @@ -351,7 +351,7 @@ def main_cli() -> None: vcf_sample_to_bam_file = {vcf_map_names[bam_map_names.index(k)]: v for k, v in vcf_sample_to_bam_file.items()} else: if not vcf_sample_to_bam_file.keys() <= sample_names: - h.cleanup(msg='SM tags of BAMs do not match VCF sample names: {}'.format(vcf_sample_to_bam_file.keys() - sample_names)) + h.cleanup(msg='BAM SM tags do not match VCF sample names: {}'.format(vcf_sample_to_bam_file.keys() - sample_names)) if sample_names != vcf_sample_to_bam_file.keys(): logging.info("BAMs not provided for all VCF samples; {} will be ignored".format(sample_names - vcf_sample_to_bam_file.keys())) From 11f10a1444afecb2f3ddeac02e1a606320150163 Mon Sep 17 00:00:00 2001 From: ab63 Date: Tue, 13 Aug 2024 15:50:36 +0100 Subject: [PATCH 049/165] rename variable for clarity --- 
hairpin2/main.py | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/hairpin2/main.py b/hairpin2/main.py index fab1bc9..64c9e66 100644 --- a/hairpin2/main.py +++ b/hairpin2/main.py @@ -317,7 +317,7 @@ def main_cli() -> None: if len(set(sample_names)) != len(sample_names): h.cleanup(msg='duplicate sample names in VCF') sample_names: set[str] = set(sample_names) - vcf_sample_to_bam_file: dict[str, pysam.AlignmentFile] = {} + vcf_sample_to_bam_file_map: dict[str, pysam.AlignmentFile] = {} for path in args.bams: try: bam = pysam.AlignmentFile(path, 'rb') @@ -328,7 +328,7 @@ def main_cli() -> None: # this may cause problems? # check with Peter bam_sample_name = bam.header.to_dict()['RG'][0]['SM'] # type:ignore - vcf_sample_to_bam_file[bam_sample_name] = bam # type:ignore + vcf_sample_to_bam_file_map[bam_sample_name] = bam # type:ignore if args.name_mapping: if len(args.name_mapping) > len(args.bams): h.cleanup(msg="more name mappings provided than BAMs") @@ -346,19 +346,19 @@ def main_cli() -> None: h.cleanup(msg="VCF sample names provided to name mapping flag are not equal to, or a subset of, VCF sample names as retrieved from VCF") if h.has_duplicates(bam_map_names): h.cleanup(msg='duplicate BAM sample names provided to name mapping flag') - if h.lists_not_equal(bam_map_names, vcf_sample_to_bam_file.keys()): # type:ignore + if h.lists_not_equal(bam_map_names, vcf_sample_to_bam_file_map.keys()): # type:ignore h.cleanup(msg='BAM sample names provided to name mapping flag do not match BAM SM tags') - vcf_sample_to_bam_file = {vcf_map_names[bam_map_names.index(k)]: v for k, v in vcf_sample_to_bam_file.items()} + vcf_sample_to_bam_file_map = {vcf_map_names[bam_map_names.index(k)]: v for k, v in vcf_sample_to_bam_file_map.items()} else: - if not vcf_sample_to_bam_file.keys() <= sample_names: - h.cleanup(msg='BAM SM tags do not match VCF sample names: {}'.format(vcf_sample_to_bam_file.keys() - sample_names)) - if sample_names != 
vcf_sample_to_bam_file.keys(): - logging.info("BAMs not provided for all VCF samples; {} will be ignored".format(sample_names - vcf_sample_to_bam_file.keys())) + if not vcf_sample_to_bam_file_map.keys() <= sample_names: + h.cleanup(msg='SM tagsdo not match VCF sample names: {}'.format(vcf_sample_to_bam_file_map.keys() - sample_names)) + if sample_names != vcf_sample_to_bam_file_map.keys(): + logging.info("BAMs not provided for all VCF samples; {} will be ignored".format(sample_names - vcf_sample_to_bam_file_map.keys())) # init output out_head = vcf_in_handle.header # type:ignore - out_head.add_line("##FILTER=".format(args.al_filter_threshold, ', '.join(vcf_sample_to_bam_file.keys()))) - out_head.add_line("##FILTER=".format(', '.join(vcf_sample_to_bam_file.keys()))) + out_head.add_line("##FILTER=".format(args.al_filter_threshold, ', '.join(vcf_sample_to_bam_file_map.keys()))) + out_head.add_line("##FILTER=".format(', '.join(vcf_sample_to_bam_file_map.keys()))) out_head.add_line("##INFO=") out_head.add_line("##INFO=") @@ -370,7 +370,7 @@ def main_cli() -> None: for record in vcf_in_handle.fetch(): # type:ignore try: filter_d: dict[str, c.Filters] = test_record_per_alt( - bams=vcf_sample_to_bam_file, + bams=vcf_sample_to_bam_file_map, vcf_rec=record, variant_tester=primed_variant_tester ) From af68633de76d3f0dfac72951976f3423317c167c Mon Sep 17 00:00:00 2001 From: ab63 Date: Wed, 14 Aug 2024 10:47:00 +0000 Subject: [PATCH 050/165] update singularity def --- .gitignore | 1 + Singularity.def | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/.gitignore b/.gitignore index 749ae4b..c16f40e 100644 --- a/.gitignore +++ b/.gitignore @@ -6,3 +6,4 @@ __pycache__/ .helix/ build/ *.txt +*.sif diff --git a/Singularity.def b/Singularity.def index 8c948d1..3979113 100644 --- a/Singularity.def +++ b/Singularity.def @@ -8,7 +8,7 @@ From: python:3.12-slim pip install hairpin/ %test - LOC=$(which hairpin) + LOC=$(which hairpin2) if [ -z "$LOC"]; then echo "hairpin 
install failed" else @@ -16,4 +16,4 @@ pip install hairpin/ fi %runscript - exec hairpin "$@" + exec hairpin2 "$@" From 3b46c0f85ca6ad2372ae06b6295e882e0ea1dce8 Mon Sep 17 00:00:00 2001 From: ab63 Date: Wed, 14 Aug 2024 11:15:51 +0000 Subject: [PATCH 051/165] better args order --- hairpin2/main.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/hairpin2/main.py b/hairpin2/main.py index 64c9e66..dbc3a2d 100644 --- a/hairpin2/main.py +++ b/hairpin2/main.py @@ -259,15 +259,16 @@ def main_cli() -> None: req.add_argument('-o', '--vcf-out', help="path to vcf out") req.add_argument('-b', '--bams', help="list of paths to name-sorted bams for samples in input vcf, whitespace separated", nargs='+') opt = parser.add_argument_group('extended') - opt.add_argument('-ji', '--input-json', help='path to JSON of input parameters; overridden by arguments provided on command line', type=str) - opt.add_argument('-jo', '--output-json', help='log input arguments to JSON', type=str) - opt.add_argument('-m', '--name-mapping', help='map VCF sample names to BAM sample names; useful if they differ', metavar='VCF:BAM', nargs='+') opt.add_argument('-al', '--al-filter-threshold', help='threshhold for median of read alignment scores over read length, below which a variant is flagged as ALF - default: 0.93', type=float) opt.add_argument('-mc', '--min-clip-quality', help='discard reads with mean base quality of aligned bases below this value, if they have soft-clipped bases - default: 35', type=int) opt.add_argument('-mq', '--min-mapping-quality', help='discard reads with mapping quality below this value - default: 11', type=int) opt.add_argument('-mb', '--min-base-quality', help='discard reads with base quality at variant position below this value - default: 25', type=int ) opt.add_argument('-ms', '--max-read-span', help='maximum +- position to use when detecting PCR duplicates - default: 6', type=int) opt.add_argument('-pf', '--position-fraction', help='>90%% of variant 
reads variant must occur within [fraction] of start/end to allow HPF flag - default: 0.15', type=float) + proc = parser.add_argument_group('procedural') + proc.add_argument('-m', '--name-mapping', help='map VCF sample names to BAM sample names; useful if they differ', metavar='VCF:BAM', nargs='+') + proc.add_argument('-ji', '--input-json', help='path to JSON of input parameters; overridden by arguments provided on command line', type=str) + proc.add_argument('-jo', '--output-json', help='log input arguments to JSON', type=str) args = parser.parse_args() From 2f2f7780717f8d5fb0f05ae6c6877481b762c282 Mon Sep 17 00:00:00 2001 From: ab63 Date: Wed, 14 Aug 2024 13:44:53 +0100 Subject: [PATCH 052/165] small help clarity improvements --- hairpin2/main.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/hairpin2/main.py b/hairpin2/main.py index dbc3a2d..b1fb9b8 100644 --- a/hairpin2/main.py +++ b/hairpin2/main.py @@ -332,7 +332,7 @@ def main_cli() -> None: vcf_sample_to_bam_file_map[bam_sample_name] = bam # type:ignore if args.name_mapping: if len(args.name_mapping) > len(args.bams): - h.cleanup(msg="more name mappings provided than BAMs") + h.cleanup(msg="more name mappings than BAMs provided") vcf_map_names = [] bam_map_names = [] for pair in args.name_mapping: @@ -352,7 +352,7 @@ def main_cli() -> None: vcf_sample_to_bam_file_map = {vcf_map_names[bam_map_names.index(k)]: v for k, v in vcf_sample_to_bam_file_map.items()} else: if not vcf_sample_to_bam_file_map.keys() <= sample_names: - h.cleanup(msg='SM tagsdo not match VCF sample names: {}'.format(vcf_sample_to_bam_file_map.keys() - sample_names)) + h.cleanup(msg='BAM SM tags do not match VCF sample names: {}'.format(vcf_sample_to_bam_file_map.keys() - sample_names)) if sample_names != vcf_sample_to_bam_file_map.keys(): logging.info("BAMs not provided for all VCF samples; {} will be ignored".format(sample_names - vcf_sample_to_bam_file_map.keys())) From 1abc9c6378a9bdff7a442cfe095dc0462a149034 Mon 
Sep 17 00:00:00 2001 From: ab63 Date: Wed, 14 Aug 2024 14:13:56 +0100 Subject: [PATCH 053/165] add doc for internal use --- internal_doc.md | 117 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 117 insertions(+) create mode 100644 internal_doc.md diff --git a/internal_doc.md b/internal_doc.md new file mode 100644 index 0000000..bbb2891 --- /dev/null +++ b/internal_doc.md @@ -0,0 +1,117 @@ +### INTRODUCTION + +`hairpin2` - CLI implementation of the hairpin detection and flagging algorithm concieved by [Ellis et al, 2020](https://www.nature.com/articles/s41596-020-00437-6). Implemented by Peter Campbell and Alex Byrne (primary contact for this tool - ab63). Code not yet public, but availabe on internal gitlab at https://gitlab.internal.sanger.ac.uk/casm/team78/hairpin-core + +For paired data, given a VCF, and BAM files for the samples of that VCF, return a VCF with variants flagged with **HPF** if they are suspected cruciform artefacts, and **ALF** if they exhibit a lower median alignment score than a specified threshold. The **ALF** filter indicates poor signal-to-noise, and provides additional confidence in the **HPF** filter - cruciform artefacts usually cause a marked decrease in alignment score. The **ALF** flag also may appear on variants without **HPF**, often indicating other artefacts associated with poor signal-to-noise. + +`hairpin2` should replace, as far as is possible, the tools known as "Mathijs' Scripts", "AdditionalBamStatistics", "Tim Butler's Scripts" and, unfortunately, probably many other names. It also supersedes `hairpin`, a stopgap version /of Mathijs' Scripts that relied on some of Mathijs' original code, and therefore was unreliable and error prone (though less so than the raw scripts themselves). 
+However, this incarnation is not a total replacement for Mathijs' Scripts at this time (and has changed in functionality since the stopgap tool, the original hairpin): + +> Mathjis LCM filters includes the following steps: +> - Preselect: Filters the CaVEMan calls for “PASS” && “CLPM=0” && “ASMD>=140” +> - Hairpin Filtering +> - Filtering based on fragment numbers. +> Which are split across the following steps: (As per his scripts) +> - preselect +> - imitateANNOVAR +> - annotateBAMStatistics +> - additionalBAMStatistics +> - filtering +> The `hairpin2` module replaces the “additionalBAMStatistics” and most of the “filtering” code. So [one may still need] to run the preselect and fragment based filter. + +Since the versions available of "Mathijs' Scripts" are many and varied, we cannot account for all differences/changes, but in general: + - No more ambiguous/cryptic/unfixable errors - the tool should work on all appropriate data, and if it is unable to produce the expected output it will clearly inform the user (but see N.B. at end of this section) + - Transparency - reasoning for flagging decisions logged in VCF + - Single tool centrally maintained and versioned - for reproducibility/citing/distribution + - Significant speedup (on testing data at least) – 50s runtime on 542-variant caveman VCF + - The module adds **filter flags**, **HPF** and **ALF**, to a VCF. It **does not** output into separate files containing passed and failed positions + - The module **does not** prefilter, or perform fragment filtering +With regard to prefiltering - this is not performed by this module, as the filtering is not relevant to hairpin detection and should be performed separately. Filtering can be performed using the `vcfilter` or `bcftools` modules. + +**N.B.** this program is currently in an alpha/testing phase - it is available on the farm, but is likely to change, or have new features added, rapidly, per user responses. 
It also may be broken in some way; if so please get in touch. It is not currently publicly available - it will be made public as soon as it is out of this alpha phase. + + +### ACCESS + +For local or VM use, see GitLab for install instructions. +For farm22 use, available as a module. +``` +module avail hairpin2 +module load +``` +N.B. do not confuse with the module `hairpin` - this is `hairpin2` + + +### ASSUMPTIONS + +`hairpin2` is designed for paired data where reads have the MC tag. If this tag is not present in your data, it can be added using samtools fixmate or biobambam2 bamsormadup. The tool expects data specifically in the VCF and BAM formats; support for a wider variety of formats could be implemented if desired. + + +### USAGE + +``` +usage: hairpin [-h] [-v] [-i VCF_IN] [-o VCF_OUT] [-b BAMS [BAMS ...]] [-al AL_FILTER_THRESHOLD] [-mc MIN_CLIP_QUALITY] [-mq MIN_MAPPING_QUALITY] [-mb MIN_BASE_QUALITY] [-ms MAX_READ_SPAN] [-pf POSITION_FRACTION] + [-m VCF:BAM [VCF:BAM ...]] [-ji INPUT_JSON] [-jo OUTPUT_JSON] + +info: + -h, --help show this help message and exit + -v, --version print version + +basic: + -i VCF_IN, --vcf-in VCF_IN + path to input vcf + -o VCF_OUT, --vcf-out VCF_OUT + path to vcf out + -b BAMS [BAMS ...], --bams BAMS [BAMS ...] 
+ list of paths to name-sorted bams for samples in input vcf, whitespace separated + +extended: + -al AL_FILTER_THRESHOLD, --al-filter-threshold AL_FILTER_THRESHOLD + threshhold for median of read alignment scores over read length, below which a variant is flagged as **ALF** - default: 0.93 + -mc MIN_CLIP_QUALITY, --min-clip-quality MIN_CLIP_QUALITY + discard reads with mean base quality of aligned bases below this value, if they have soft-clipped bases - default: 35 + -mq MIN_MAPPING_QUALITY, --min-mapping-quality MIN_MAPPING_QUALITY + discard reads with mapping quality below this value - default: 11 + -mb MIN_BASE_QUALITY, --min-base-quality MIN_BASE_QUALITY + discard reads with base quality at variant position below this value - default: 25 + -ms MAX_READ_SPAN, --max-read-span MAX_READ_SPAN + maximum +- position to use when detecting PCR duplicates - default: 6 + -pf POSITION_FRACTION, --position-fraction POSITION_FRACTION + >90% of variant reads variant must occur within [fraction] of start/end to allow **HPF** flag - default: 0.15 + +procedural: + -m VCF:BAM [VCF:BAM ...], --name-mapping VCF:BAM [VCF:BAM ...] + map VCF sample names to BAM sample names; useful if they differ + -ji INPUT_JSON, --input-json INPUT_JSON + path to JSON of input parameters; overridden by arguments provided on command line + -jo OUTPUT_JSON, --output-json OUTPUT_JSON +``` + +**N.B.** the above usage block indicates the call for the tool is `hairpin2` - this is correct for local/vm installs, but for farm usage, for the time being, it is `hairpin2-alpha` + +Parameters are hopefully clear from the helptext, but two are more unusual: + + `--max-read-span` - Long homopolymer tracts can cause stuttering, where a PCR duplicate will have, for example, an additional A in a tract of As. These reads will align a base or two earlier on the reference genome than they should. As a result pcr duplicate flag machinery fails and they are not flagged as duplicates. 
max-read-span is then the maximum +- position to use when detecting PCR duplicates. + `--position-fraction` - cruciform artefacts usually contain segments that align beause the segment is not in ref genome, and so the segment is soft clipped – this pushes the false variants associated with the arterfact to edges of the reads; unlike true variants. If more than 90% of the reads are within that first/last fraction, allow for calling **HPF** flag + + +### DETAILS + +The tool tests records in a VCF file and applies the **HPF**, indicating a hairpin/cruciform artefact, and **ALF** filter flags as appropriate. It records reasoning for its decisions in the INFO field of the VCF records, in the form `HPF=|` and `ALF=||`. +The codes are as follows: + 0 - passed/failed on condition 60A(i) of Ellis et al. (**HPF** only) + 1 - passed/failed on condition 60B(i) of Ellis et al. (**HPF** only) + 2 - passed/failed on filter threshold (**ALF** only) + 3 - insufficient appropriate reads to support calling flag (pass only) (This covers a lot of possiblities, if more granularity is desired, please request it) + 4 - no samples have non 0,0 genotype for the record (pass only) +For the **ALF** flag, the median alignment score is also recorded. + +The basic procedure of this implementation is as follows: + + For each record in the VCF, test every alt for that record by: + + - retrieving reads from samples exhibiting the mutations + - testing each read for validity for use in hairpin testing (i.e. 
base quality, do they express the correct alt, and so on) + - performing statistical analysis on aggregates of the position of the mutatation relative to the start and end of the aligned portion of the reads + - on the results of the statistical analysis, pass or fail the record for the filters **ALF** and **HPF**, and log a code and relevant info to the INFO field indicating the reason for the decision + From 04d294947cf6d1cb0a3fbc184e7bd2fa081ea958 Mon Sep 17 00:00:00 2001 From: ab63 Date: Wed, 14 Aug 2024 14:15:12 +0100 Subject: [PATCH 054/165] fix doc formatting 1 --- internal_doc.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/internal_doc.md b/internal_doc.md index bbb2891..217ea92 100644 --- a/internal_doc.md +++ b/internal_doc.md @@ -7,17 +7,17 @@ For paired data, given a VCF, and BAM files for the samples of that VCF, return `hairpin2` should replace, as far as is possible, the tools known as "Mathijs' Scripts", "AdditionalBamStatistics", "Tim Butler's Scripts" and, unfortunately, probably many other names. It also supersedes `hairpin`, a stopgap version /of Mathijs' Scripts that relied on some of Mathijs' original code, and therefore was unreliable and error prone (though less so than the raw scripts themselves). However, this incarnation is not a total replacement for Mathijs' Scripts at this time (and has changed in functionality since the stopgap tool, the original hairpin): -> Mathjis LCM filters includes the following steps: +>Mathjis LCM filters includes the following steps: > - Preselect: Filters the CaVEMan calls for “PASS” && “CLPM=0” && “ASMD>=140” > - Hairpin Filtering > - Filtering based on fragment numbers. 
-> Which are split across the following steps: (As per his scripts) +>Which are split across the following steps: (As per his scripts) > - preselect > - imitateANNOVAR > - annotateBAMStatistics > - additionalBAMStatistics > - filtering -> The `hairpin2` module replaces the “additionalBAMStatistics” and most of the “filtering” code. So [one may still need] to run the preselect and fragment based filter. +>The `hairpin2` module replaces the “additionalBAMStatistics” and most of the “filtering” code. So [one may still need] to run the preselect and fragment based filter. Since the versions available of "Mathijs' Scripts" are many and varied, we cannot account for all differences/changes, but in general: - No more ambiguous/cryptic/unfixable errors - the tool should work on all appropriate data, and if it is unable to produce the expected output it will clearly inform the user (but see N.B. at end of this section) From 7b201d6153140e372d3819a4c4f3f1962fa09a68 Mon Sep 17 00:00:00 2001 From: ab63 Date: Wed, 14 Aug 2024 14:16:55 +0100 Subject: [PATCH 055/165] fix doc formatting 2 --- internal_doc.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/internal_doc.md b/internal_doc.md index 217ea92..67fe13c 100644 --- a/internal_doc.md +++ b/internal_doc.md @@ -11,13 +11,13 @@ However, this incarnation is not a total replacement for Mathijs' Scripts at thi > - Preselect: Filters the CaVEMan calls for “PASS” && “CLPM=0” && “ASMD>=140” > - Hairpin Filtering > - Filtering based on fragment numbers. ->Which are split across the following steps: (As per his scripts) + >Which are split across the following steps: (As per his scripts) > - preselect > - imitateANNOVAR > - annotateBAMStatistics > - additionalBAMStatistics > - filtering ->The `hairpin2` module replaces the “additionalBAMStatistics” and most of the “filtering” code. So [one may still need] to run the preselect and fragment based filter. 
+ >The `hairpin2` module replaces the “additionalBAMStatistics” and most of the “filtering” code. So [one may still need] to run the preselect and fragment based filter. Since the versions available of "Mathijs' Scripts" are many and varied, we cannot account for all differences/changes, but in general: - No more ambiguous/cryptic/unfixable errors - the tool should work on all appropriate data, and if it is unable to produce the expected output it will clearly inform the user (but see N.B. at end of this section) From cccc793de7e229d5df9267d91ec1479ff15a313f Mon Sep 17 00:00:00 2001 From: ab63 Date: Wed, 14 Aug 2024 14:19:13 +0100 Subject: [PATCH 056/165] fix doc formatting 2 --- internal_doc.md | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/internal_doc.md b/internal_doc.md index 67fe13c..41d8235 100644 --- a/internal_doc.md +++ b/internal_doc.md @@ -7,17 +7,17 @@ For paired data, given a VCF, and BAM files for the samples of that VCF, return `hairpin2` should replace, as far as is possible, the tools known as "Mathijs' Scripts", "AdditionalBamStatistics", "Tim Butler's Scripts" and, unfortunately, probably many other names. It also supersedes `hairpin`, a stopgap version /of Mathijs' Scripts that relied on some of Mathijs' original code, and therefore was unreliable and error prone (though less so than the raw scripts themselves). However, this incarnation is not a total replacement for Mathijs' Scripts at this time (and has changed in functionality since the stopgap tool, the original hairpin): ->Mathjis LCM filters includes the following steps: -> - Preselect: Filters the CaVEMan calls for “PASS” && “CLPM=0” && “ASMD>=140” -> - Hairpin Filtering -> - Filtering based on fragment numbers. 
- >Which are split across the following steps: (As per his scripts) -> - preselect -> - imitateANNOVAR -> - annotateBAMStatistics -> - additionalBAMStatistics -> - filtering - >The `hairpin2` module replaces the “additionalBAMStatistics” and most of the “filtering” code. So [one may still need] to run the preselect and fragment based filter. +> Mathjis LCM filters includes the following steps: +> 1. Preselect: Filters the CaVEMan calls for “PASS” && “CLPM=0” && “ASMD>=140” +> 2. Hairpin Filtering +> 3. Filtering based on fragment numbers. +> Which are split across the following steps: (As per his scripts) +> - preselect +> - imitateANNOVAR +> - annotateBAMStatistics +> - additionalBAMStatistics +> - filtering +> The `hairpin2` module replaces the “additionalBAMStatistics” and most of the “filtering” code. So [one may still need] to run the preselect and fragment based filter. Since the versions available of "Mathijs' Scripts" are many and varied, we cannot account for all differences/changes, but in general: - No more ambiguous/cryptic/unfixable errors - the tool should work on all appropriate data, and if it is unable to produce the expected output it will clearly inform the user (but see N.B. at end of this section) From be45bb312e571185fd3bb1abbad4ae502dbc5bdc Mon Sep 17 00:00:00 2001 From: ab63 Date: Wed, 14 Aug 2024 14:20:29 +0100 Subject: [PATCH 057/165] fix doc formatting 3 --- internal_doc.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/internal_doc.md b/internal_doc.md index 41d8235..067eb4b 100644 --- a/internal_doc.md +++ b/internal_doc.md @@ -10,13 +10,13 @@ However, this incarnation is not a total replacement for Mathijs' Scripts at thi > Mathjis LCM filters includes the following steps: > 1. Preselect: Filters the CaVEMan calls for “PASS” && “CLPM=0” && “ASMD>=140” > 2. Hairpin Filtering -> 3. Filtering based on fragment numbers. -> Which are split across the following steps: (As per his scripts) +> 3. 
Filtering based on fragment numbers. +> Which are split across the following steps: (As per his scripts) > - preselect > - imitateANNOVAR > - annotateBAMStatistics > - additionalBAMStatistics -> - filtering +> - filtering > The `hairpin2` module replaces the “additionalBAMStatistics” and most of the “filtering” code. So [one may still need] to run the preselect and fragment based filter. Since the versions available of "Mathijs' Scripts" are many and varied, we cannot account for all differences/changes, but in general: From 9304ebca2670b338ec02ca32ec00c1f901b902bb Mon Sep 17 00:00:00 2001 From: ab63 Date: Wed, 14 Aug 2024 14:22:26 +0100 Subject: [PATCH 058/165] fix doc formatting 4 --- internal_doc.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/internal_doc.md b/internal_doc.md index 067eb4b..55c411a 100644 --- a/internal_doc.md +++ b/internal_doc.md @@ -11,12 +11,14 @@ However, this incarnation is not a total replacement for Mathijs' Scripts at thi > 1. Preselect: Filters the CaVEMan calls for “PASS” && “CLPM=0” && “ASMD>=140” > 2. Hairpin Filtering > 3. Filtering based on fragment numbers. +> > Which are split across the following steps: (As per his scripts) > - preselect > - imitateANNOVAR > - annotateBAMStatistics > - additionalBAMStatistics > - filtering +> > The `hairpin2` module replaces the “additionalBAMStatistics” and most of the “filtering” code. So [one may still need] to run the preselect and fragment based filter. 
Since the versions available of "Mathijs' Scripts" are many and varied, we cannot account for all differences/changes, but in general: From 78c154e281a6093253fea08dd99a6e7bf9e2ca98 Mon Sep 17 00:00:00 2001 From: Alex Byrne Date: Wed, 14 Aug 2024 14:47:30 +0100 Subject: [PATCH 059/165] Update internal_doc.md --- internal_doc.md | 41 +++++++++++++++++++---------------------- 1 file changed, 19 insertions(+), 22 deletions(-) diff --git a/internal_doc.md b/internal_doc.md index 55c411a..925f410 100644 --- a/internal_doc.md +++ b/internal_doc.md @@ -30,7 +30,7 @@ Since the versions available of "Mathijs' Scripts" are many and varied, we canno - The module **does not** prefilter, or perform fragment filtering With regard to prefiltering - this is not performed by this module, as the filtering is not relevant to hairpin detection and should be performed separately. Filtering can be performed using the `vcfilter` or `bcftools` modules. -**N.B.** this program is currently in an alpha/testing phase - it is available on the farm, but is likely to change, or have new features added, rapidly, per user responses. It also may be broken in some way; if so please get in touch. It is not currently publicly available - it will be made public as soon as it is out of this alpha phase. +**N.B.** this program is currently in an alpha/testing phase - it is available on the farm, but is likely to change, or have new features added, rapidly, per user responses. **It also may be broken in some way; if so please get in touch**. It is not currently publicly available - it will be made public as soon as it is out of this alpha phase. ### ACCESS @@ -41,12 +41,12 @@ For farm22 use, available as a module. module avail hairpin2 module load ``` -N.B. do not confuse with the module `hairpin` - this is `hairpin2` + **N.B. do not confuse with the module `hairpin` - this is `hairpin2`** ### ASSUMPTIONS -`hairpin2` is designed for paired data where reads have the MC tag. 
If this tag is not present in your data, it can be added using samtools fixmate or biobambam2 bamsormadup. The tool expects data specifically in the VCF and BAM formats; support for a wider variety of formats could be implemented if desired. +`hairpin2` is designed for paired data where reads have the **MC** tag. If this tag is not present in your data, it can be added using `samtools fixmate` or `biobambam2 bamsormadup`. The tool expects data specifically in the VCF and BAM formats; support for a wider variety of formats could be implemented if desired. ### USAGE @@ -91,29 +91,26 @@ procedural: **N.B.** the above usage block indicates the call for the tool is `hairpin2` - this is correct for local/vm installs, but for farm usage, for the time being, it is `hairpin2-alpha` -Parameters are hopefully clear from the helptext, but two are more unusual: +Parameters are hopefully mostly clear from the helptext, but some warrant further explanation: - `--max-read-span` - Long homopolymer tracts can cause stuttering, where a PCR duplicate will have, for example, an additional A in a tract of As. These reads will align a base or two earlier on the reference genome than they should. As a result pcr duplicate flag machinery fails and they are not flagged as duplicates. max-read-span is then the maximum +- position to use when detecting PCR duplicates. - `--position-fraction` - cruciform artefacts usually contain segments that align beause the segment is not in ref genome, and so the segment is soft clipped – this pushes the false variants associated with the arterfact to edges of the reads; unlike true variants. If more than 90% of the reads are within that first/last fraction, allow for calling **HPF** flag +`--al-filter-threshold` - the default value of 0.93 was arrived at by trial and error. Another way to consider +`--max-read-span` - Long homopolymer tracts can cause stuttering, where a PCR duplicate will have, for example, an additional A in a tract of As. 
These reads will align a base or two earlier on the reference genome than they should. As a result pcr duplicate flag machinery fails and they are not flagged as duplicates. max-read-span is then the maximum +- position to use when detecting PCR duplicates. +`--position-fraction` - cruciform artefacts usually contain segments that align beause the segment is not in ref genome, and so the segment is soft clipped – this pushes the false variants associated with the arterfact to edges of the reads; unlike true variants. If more than 90% of the reads are within that first/last fraction, allow for calling **HPF** flag ### DETAILS -The tool tests records in a VCF file and applies the **HPF**, indicating a hairpin/cruciform artefact, and **ALF** filter flags as appropriate. It records reasoning for its decisions in the INFO field of the VCF records, in the form `HPF=|` and `ALF=||`. -The codes are as follows: - 0 - passed/failed on condition 60A(i) of Ellis et al. (**HPF** only) - 1 - passed/failed on condition 60B(i) of Ellis et al. (**HPF** only) - 2 - passed/failed on filter threshold (**ALF** only) - 3 - insufficient appropriate reads to support calling flag (pass only) (This covers a lot of possiblities, if more granularity is desired, please request it) - 4 - no samples have non 0,0 genotype for the record (pass only) -For the **ALF** flag, the median alignment score is also recorded. +The tool tests records in a VCF file and applies the **HPF**, indicating a hairpin/cruciform artefact, and **ALF** filter flags as appropriate. It records reasoning for its decisions in the INFO field of the VCF records, in the form `HPF=|` and `ALF=||`. The codes are as follows: -The basic procedure of this implementation is as follows: - - For each record in the VCF, test every alt for that record by: - - - retrieving reads from samples exhibiting the mutations - - testing each read for validity for use in hairpin testing (i.e. 
base quality, do they express the correct alt, and so on) - - performing statistical analysis on aggregates of the position of the mutatation relative to the start and end of the aligned portion of the reads - - on the results of the statistical analysis, pass or fail the record for the filters **ALF** and **HPF**, and log a code and relevant info to the INFO field indicating the reason for the decision +**0** - passed/failed on condition 60A(i) of Ellis et al. (HPF only) +**1** - passed/failed on condition 60B(i) of Ellis et al. (HPF only) +**2** - passed/failed on filter threshold (ALF only) +**3** - insufficient appropriate reads to support calling flag (pass only) (This covers a lot of possiblities, if more granularity is desired, please request it) +**4** - no samples have non 0,0 genotype for the record (pass only) +The basic procedure of this implementation is as follows: +> For each record in the VCF, test every alt for that record by: +> 1. retrieving reads from samples exhibiting the mutations +> 2. testing each read for validity for use in hairpin testing (i.e. base quality, do they express the correct alt, and so on) +> 3. performing statistical analysis on aggregates of the position of the mutatation relative to the start and end of the aligned portion of the reads +> 4. 
on the results of the statistical analysis, pass or fail the record for the filters **ALF** and **HPF**, and log a code and relevant info to the **INFO** field indicating the reason for the decision From 356dc9e484099034715c227c013cf360d1cc4354 Mon Sep 17 00:00:00 2001 From: ab63 Date: Wed, 14 Aug 2024 14:47:54 +0100 Subject: [PATCH 060/165] helptext clarity --- hairpin2/main.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/hairpin2/main.py b/hairpin2/main.py index b1fb9b8..4a8b213 100644 --- a/hairpin2/main.py +++ b/hairpin2/main.py @@ -259,12 +259,12 @@ def main_cli() -> None: req.add_argument('-o', '--vcf-out', help="path to vcf out") req.add_argument('-b', '--bams', help="list of paths to name-sorted bams for samples in input vcf, whitespace separated", nargs='+') opt = parser.add_argument_group('extended') - opt.add_argument('-al', '--al-filter-threshold', help='threshhold for median of read alignment scores over read length, below which a variant is flagged as ALF - default: 0.93', type=float) + opt.add_argument('-al', '--al-filter-threshold', help='threshhold for median of read alignment score per base of all relevant reads, below which a variant is flagged as ALF - default: 0.93', type=float) opt.add_argument('-mc', '--min-clip-quality', help='discard reads with mean base quality of aligned bases below this value, if they have soft-clipped bases - default: 35', type=int) opt.add_argument('-mq', '--min-mapping-quality', help='discard reads with mapping quality below this value - default: 11', type=int) opt.add_argument('-mb', '--min-base-quality', help='discard reads with base quality at variant position below this value - default: 25', type=int ) opt.add_argument('-ms', '--max-read-span', help='maximum +- position to use when detecting PCR duplicates - default: 6', type=int) - opt.add_argument('-pf', '--position-fraction', help='>90%% of variant reads variant must occur within [fraction] of start/end to allow HPF flag - default: 
0.15', type=float) + opt.add_argument('-pf', '--position-fraction', help='>90%% of variant reads variant must occur within [POSITION_FRACTION] of start/end to allow HPF flag - default: 0.15', type=float) proc = parser.add_argument_group('procedural') proc.add_argument('-m', '--name-mapping', help='map VCF sample names to BAM sample names; useful if they differ', metavar='VCF:BAM', nargs='+') proc.add_argument('-ji', '--input-json', help='path to JSON of input parameters; overridden by arguments provided on command line', type=str) From db6fa148f70a61f1514020de361da9e5dfeef2e7 Mon Sep 17 00:00:00 2001 From: Alex Byrne Date: Wed, 14 Aug 2024 14:49:44 +0100 Subject: [PATCH 061/165] Update internal_doc.md --- internal_doc.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/internal_doc.md b/internal_doc.md index 925f410..13a893a 100644 --- a/internal_doc.md +++ b/internal_doc.md @@ -69,7 +69,7 @@ basic: extended: -al AL_FILTER_THRESHOLD, --al-filter-threshold AL_FILTER_THRESHOLD - threshhold for median of read alignment scores over read length, below which a variant is flagged as **ALF** - default: 0.93 + threshhold for median of read alignment score per base of all relevant reads, below which a variant is flagged as **ALF** - default: 0.93 -mc MIN_CLIP_QUALITY, --min-clip-quality MIN_CLIP_QUALITY discard reads with mean base quality of aligned bases below this value, if they have soft-clipped bases - default: 35 -mq MIN_MAPPING_QUALITY, --min-mapping-quality MIN_MAPPING_QUALITY @@ -93,7 +93,7 @@ procedural: Parameters are hopefully mostly clear from the helptext, but some warrant further explanation: -`--al-filter-threshold` - the default value of 0.93 was arrived at by trial and error. Another way to consider +`--al-filter-threshold` - the default value of 0.93 was arrived at by trial and error - since different aligners/platforms calculate alignment score differently, you may want to modify this value appropriately. 
`--max-read-span` - Long homopolymer tracts can cause stuttering, where a PCR duplicate will have, for example, an additional A in a tract of As. These reads will align a base or two earlier on the reference genome than they should. As a result pcr duplicate flag machinery fails and they are not flagged as duplicates. max-read-span is then the maximum +- position to use when detecting PCR duplicates. `--position-fraction` - cruciform artefacts usually contain segments that align beause the segment is not in ref genome, and so the segment is soft clipped – this pushes the false variants associated with the arterfact to edges of the reads; unlike true variants. If more than 90% of the reads are within that first/last fraction, allow for calling **HPF** flag From e47597240c039edc11a857d80332667d54bd8a03 Mon Sep 17 00:00:00 2001 From: Alex Byrne Date: Wed, 14 Aug 2024 14:50:42 +0100 Subject: [PATCH 062/165] Update internal_doc.md --- internal_doc.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/internal_doc.md b/internal_doc.md index 13a893a..6206190 100644 --- a/internal_doc.md +++ b/internal_doc.md @@ -93,7 +93,7 @@ procedural: Parameters are hopefully mostly clear from the helptext, but some warrant further explanation: -`--al-filter-threshold` - the default value of 0.93 was arrived at by trial and error - since different aligners/platforms calculate alignment score differently, you may want to modify this value appropriately. +`--al-filter-threshold` - the default value of 0.93 was arrived at by trial and error - since different aligners/platforms calculate alignment score differently, you may want to modify this value appropriately. `--max-read-span` - Long homopolymer tracts can cause stuttering, where a PCR duplicate will have, for example, an additional A in a tract of As. These reads will align a base or two earlier on the reference genome than they should. 
As a result pcr duplicate flag machinery fails and they are not flagged as duplicates. max-read-span is then the maximum +- position to use when detecting PCR duplicates. `--position-fraction` - cruciform artefacts usually contain segments that align beause the segment is not in ref genome, and so the segment is soft clipped – this pushes the false variants associated with the arterfact to edges of the reads; unlike true variants. If more than 90% of the reads are within that first/last fraction, allow for calling **HPF** flag From f23568633f4e5722096a009b591310d6f36e676d Mon Sep 17 00:00:00 2001 From: Alex Byrne Date: Wed, 14 Aug 2024 14:53:35 +0100 Subject: [PATCH 063/165] Update internal_doc.md --- internal_doc.md | 38 +++++++++++++++++++++----------------- 1 file changed, 21 insertions(+), 17 deletions(-) diff --git a/internal_doc.md b/internal_doc.md index 6206190..f25f25f 100644 --- a/internal_doc.md +++ b/internal_doc.md @@ -7,6 +7,7 @@ For paired data, given a VCF, and BAM files for the samples of that VCF, return `hairpin2` should replace, as far as is possible, the tools known as "Mathijs' Scripts", "AdditionalBamStatistics", "Tim Butler's Scripts" and, unfortunately, probably many other names. It also supersedes `hairpin`, a stopgap version /of Mathijs' Scripts that relied on some of Mathijs' original code, and therefore was unreliable and error prone (though less so than the raw scripts themselves). However, this incarnation is not a total replacement for Mathijs' Scripts at this time (and has changed in functionality since the stopgap tool, the original hairpin): +> """ > Mathjis LCM filters includes the following steps: > 1. Preselect: Filters the CaVEMan calls for “PASS” && “CLPM=0” && “ASMD>=140” > 2. 
Hairpin Filtering @@ -19,16 +20,17 @@ However, this incarnation is not a total replacement for Mathijs' Scripts at thi > - additionalBAMStatistics > - filtering > -> The `hairpin2` module replaces the “additionalBAMStatistics” and most of the “filtering” code. So [one may still need] to run the preselect and fragment based filter. +> The `hairpin2` module replaces the “additionalBAMStatistics” and most of the “filtering” code. So [one may still need] to run the preselect and fragment based filter. +> """ Since the versions available of "Mathijs' Scripts" are many and varied, we cannot account for all differences/changes, but in general: - - No more ambiguous/cryptic/unfixable errors - the tool should work on all appropriate data, and if it is unable to produce the expected output it will clearly inform the user (but see N.B. at end of this section) - - Transparency - reasoning for flagging decisions logged in VCF - - Single tool centrally maintained and versioned - for reproducibility/citing/distribution - - Significant speedup (on testing data at least) – 50s runtime on 542-variant caveman VCF - - The module adds **filter flags**, **HPF** and **ALF**, to a VCF. It **does not** output into separate files containing passed and failed positions - - The module **does not** prefilter, or perform fragment filtering -With regard to prefiltering - this is not performed by this module, as the filtering is not relevant to hairpin detection and should be performed separately. Filtering can be performed using the `vcfilter` or `bcftools` modules. +> - No more ambiguous/cryptic/unfixable errors - the tool should work on all appropriate data, and if it is unable to produce the expected output it will clearly inform the user (but see N.B. 
at end of this section) +> - Transparency - reasoning for flagging decisions logged in VCF +> - Single tool centrally maintained and versioned - for reproducibility/citing/distribution +> - Significant speedup (on testing data at least) – 50s runtime on 542-variant caveman VCF +> - The module adds **filter flags**, **HPF** and **ALF**, to a VCF. It **does not** output into separate files containing passed and failed positions +> - The module **does not** prefilter, or perform fragment filtering +> With regard to prefiltering - this is not performed by this module, as the filtering is not relevant to hairpin detection and should be performed separately. Filtering can be performed using the `vcfilter` or `bcftools` modules. **N.B.** this program is currently in an alpha/testing phase - it is available on the farm, but is likely to change, or have new features added, rapidly, per user responses. **It also may be broken in some way; if so please get in touch**. It is not currently publicly available - it will be made public as soon as it is out of this alpha phase. @@ -93,24 +95,26 @@ procedural: Parameters are hopefully mostly clear from the helptext, but some warrant further explanation: -`--al-filter-threshold` - the default value of 0.93 was arrived at by trial and error - since different aligners/platforms calculate alignment score differently, you may want to modify this value appropriately. -`--max-read-span` - Long homopolymer tracts can cause stuttering, where a PCR duplicate will have, for example, an additional A in a tract of As. These reads will align a base or two earlier on the reference genome than they should. As a result pcr duplicate flag machinery fails and they are not flagged as duplicates. max-read-span is then the maximum +- position to use when detecting PCR duplicates. 
-`--position-fraction` - cruciform artefacts usually contain segments that align beause the segment is not in ref genome, and so the segment is soft clipped – this pushes the false variants associated with the arterfact to edges of the reads; unlike true variants. If more than 90% of the reads are within that first/last fraction, allow for calling **HPF** flag +> `--al-filter-threshold` - the default value of 0.93 was arrived at by trial and error - since different aligners/platforms calculate alignment score differently, you may want to modify this value appropriately. +> `--max-read-span` - Long homopolymer tracts can cause stuttering, where a PCR duplicate will have, for example, an additional A in a tract of As. These reads will align a base or two earlier on the reference genome than they should. As a result pcr duplicate flag machinery fails and they are not flagged as duplicates. max-read-span is then the maximum +- position to use when detecting PCR duplicates. +> `--position-fraction` - cruciform artefacts usually contain segments that align beause the segment is not in ref genome, and so the segment is soft clipped – this pushes the false variants associated with the arterfact to edges of the reads; unlike true variants. If more than 90% of the reads are within that first/last fraction, allow for calling **HPF** flag + ### DETAILS The tool tests records in a VCF file and applies the **HPF**, indicating a hairpin/cruciform artefact, and **ALF** filter flags as appropriate. It records reasoning for its decisions in the INFO field of the VCF records, in the form `HPF=|` and `ALF=||`. The codes are as follows: -**0** - passed/failed on condition 60A(i) of Ellis et al. (HPF only) -**1** - passed/failed on condition 60B(i) of Ellis et al. 
(HPF only) -**2** - passed/failed on filter threshold (ALF only) -**3** - insufficient appropriate reads to support calling flag (pass only) (This covers a lot of possiblities, if more granularity is desired, please request it) -**4** - no samples have non 0,0 genotype for the record (pass only) +> **0** - passed/failed on condition 60A(i) of Ellis et al. (HPF only) +> **1** - passed/failed on condition 60B(i) of Ellis et al. (HPF only) +> **2** - passed/failed on filter threshold (ALF only) +> **3** - insufficient appropriate reads to support calling flag (pass only) (This covers a lot of possiblities, if more granularity is desired, please request it) +> **4** - no samples have non 0,0 genotype for the record (pass only) + The basic procedure of this implementation is as follows: > For each record in the VCF, test every alt for that record by: > 1. retrieving reads from samples exhibiting the mutations > 2. testing each read for validity for use in hairpin testing (i.e. base quality, do they express the correct alt, and so on) > 3. performing statistical analysis on aggregates of the position of the mutatation relative to the start and end of the aligned portion of the reads -> 4. on the results of the statistical analysis, pass or fail the record for the filters **ALF** and **HPF**, and log a code and relevant info to the **INFO** field indicating the reason for the decision +> 4. 
on the results of the statistical analysis, pass or fail the record for the filters **ALF** and **HPF**, and log a code and relevant info to the **INFO** field indicating the reason for the decision \ No newline at end of file From 6fff44867ac8eefec0bee53ea9aab5026b55932a Mon Sep 17 00:00:00 2001 From: Alex Byrne Date: Wed, 14 Aug 2024 14:56:10 +0100 Subject: [PATCH 064/165] Update README.md --- README.md | 107 +++++++++++++++++++++++++++++++++++------------------- 1 file changed, 69 insertions(+), 38 deletions(-) diff --git a/README.md b/README.md index 1829d9a..e1b549a 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,9 @@ # hairpin2 -Maintainable, transparent, implementation of the hairpin detection and flagging algorithm concieved by Mathijs' Sanders. Implemented by Peter Campbell and Alex Byrne +`hairpin2` - CLI implementation of the hairpin detection and flagging algorithm concieved by [Ellis et al, 2020](https://www.nature.com/articles/s41596-020-00437-6). Implemented by Peter Campbell and Alex Byrne. + +For paired data, given a VCF, and BAM files for the samples of that VCF, return a VCF with variants flagged with **HPF** if they are suspected cruciform artefacts, and **ALF** if they exhibit a lower median alignment score than a specified threshold. The **ALF** filter indicates poor signal-to-noise, and provides additional confidence in the **HPF** filter - cruciform artefacts usually cause a marked decrease in alignment score. The **ALF** flag also may appear on variants without **HPF**, often indicating other artefacts associated with poor signal-to-noise. + ### REQUIREMENTS @@ -26,44 +29,72 @@ export PATH=${PATH}:${INST_PATH}/bin hairpin -h ``` -### DETAILS +### ASSUMPTIONS + +`hairpin2` is designed for paired data where reads have the **MC** tag. If this tag is not present in your data, it can be added using `samtools fixmate` or `biobambam2 bamsormadup`. 
The tool expects data specifically in the VCF and BAM formats; support for a wider variety of formats could be implemented if desired. + + +### USAGE ``` -usage: hairpin2 [-h] [-v] -i VCF_IN -o VCF_OUT -b BAMS [BAMS ...] [-cq CLIP_QUALITY_CUTOFF] [-mq MIN_MAPPING_QUALITY] - [-mb MIN_BASE_QUALITY] [-ms MAX_READ_SPAN] [-al AL_FILTER_THRESHOLD] [-c9 CENT90_THRESHOLD] [-j JSON_PATH] - - info: - -h, --help show this help message and exit - -v, --version print version - - required: - -i VCF_IN, --vcf-in VCF_IN - path to input vcf - -o VCF_OUT, --vcf-out VCF_OUT - path to vcf out - -b BAMS [BAMS ...], --bams BAMS [BAMS ...] - list of paths to name-sorted bams for samples in input vcf, whitespace separated - - options: - -cq CLIP_QUALITY_CUTOFF, --clip-quality-cutoff CLIP_QUALITY_CUTOFF - default: 35 - -mq MIN_MAPPING_QUALITY, --min-mapping-quality MIN_MAPPING_QUALITY - default: 11 - -mb MIN_BASE_QUALITY, --min-base-quality MIN_BASE_QUALITY - default: 25 - -ms MAX_READ_SPAN, --max-read-span MAX_READ_SPAN - default: 6 - -al AL_FILTER_THRESHOLD, --al-filter-threshold AL_FILTER_THRESHOLD - default: 0.93 - -c9 CENT90_THRESHOLD, --cent90-threshold CENT90_THRESHOLD - default: 0.15 - -j JSON_PATH, --json-log JSON_PATH - log input parameters/arguments to JSON +usage: hairpin [-h] [-v] [-i VCF_IN] [-o VCF_OUT] [-b BAMS [BAMS ...]] [-al AL_FILTER_THRESHOLD] [-mc MIN_CLIP_QUALITY] [-mq MIN_MAPPING_QUALITY] [-mb MIN_BASE_QUALITY] [-ms MAX_READ_SPAN] [-pf POSITION_FRACTION] + [-m VCF:BAM [VCF:BAM ...]] [-ji INPUT_JSON] [-jo OUTPUT_JSON] + +info: + -h, --help show this help message and exit + -v, --version print version + +basic: + -i VCF_IN, --vcf-in VCF_IN + path to input vcf + -o VCF_OUT, --vcf-out VCF_OUT + path to vcf out + -b BAMS [BAMS ...], --bams BAMS [BAMS ...] 
+ list of paths to name-sorted bams for samples in input vcf, whitespace separated + +extended: + -al AL_FILTER_THRESHOLD, --al-filter-threshold AL_FILTER_THRESHOLD + threshhold for median of read alignment score per base of all relevant reads, below which a variant is flagged as **ALF** - default: 0.93 + -mc MIN_CLIP_QUALITY, --min-clip-quality MIN_CLIP_QUALITY + discard reads with mean base quality of aligned bases below this value, if they have soft-clipped bases - default: 35 + -mq MIN_MAPPING_QUALITY, --min-mapping-quality MIN_MAPPING_QUALITY + discard reads with mapping quality below this value - default: 11 + -mb MIN_BASE_QUALITY, --min-base-quality MIN_BASE_QUALITY + discard reads with base quality at variant position below this value - default: 25 + -ms MAX_READ_SPAN, --max-read-span MAX_READ_SPAN + maximum +- position to use when detecting PCR duplicates - default: 6 + -pf POSITION_FRACTION, --position-fraction POSITION_FRACTION + >90% of variant reads variant must occur within [fraction] of start/end to allow **HPF** flag - default: 0.15 + +procedural: + -m VCF:BAM [VCF:BAM ...], --name-mapping VCF:BAM [VCF:BAM ...] + map VCF sample names to BAM sample names; useful if they differ + -ji INPUT_JSON, --input-json INPUT_JSON + path to JSON of input parameters; overridden by arguments provided on command line + -jo OUTPUT_JSON, --output-json OUTPUT_JSON ``` -The basic procedure of this implementation is as follows: -> For each record in the VCF, test every alt for that record by: -> * retrieving reads from samples exhibiting the mutations, then -> * testing each read for validity for use in hairpin testing (i.e. 
base quality, do they express the correct alt, and so on), then -> * performing statistical analysis on aggregates of the position of the mutatation relative to the start and end of the aligned portion of the reads, then -> * on the results of the statistical analysis, pass or fail the record for the filters ALF and HPF, and log a code and relevant info to the INFO field indicating the reason for the decision +Parameters are hopefully mostly clear from the helptext, but some warrant further explanation: + +> `--al-filter-threshold` - the default value of 0.93 was arrived at by trial and error - since different aligners/platforms calculate alignment score differently, you may want to modify this value appropriately. +> `--max-read-span` - Long homopolymer tracts can cause stuttering, where a PCR duplicate will have, for example, an additional A in a tract of As. These reads will align a base or two earlier on the reference genome than they should. As a result pcr duplicate flag machinery fails and they are not flagged as duplicates. max-read-span is then the maximum +- position to use when detecting PCR duplicates. +> `--position-fraction` - cruciform artefacts usually contain segments that align beause the segment is not in ref genome, and so the segment is soft clipped – this pushes the false variants associated with the arterfact to edges of the reads; unlike true variants. If more than 90% of the reads are within that first/last fraction, allow for calling **HPF** flag + + +### DETAILS + +The tool tests records in a VCF file and applies the **HPF**, indicating a hairpin/cruciform artefact, and **ALF** filter flags as appropriate. It records reasoning for its decisions in the INFO field of the VCF records, in the form `HPF=|` and `ALF=||`. The codes are as follows: + +> **0** - passed/failed on condition 60A(i) of Ellis et al. (HPF only) +> **1** - passed/failed on condition 60B(i) of Ellis et al. 
(HPF only) +> **2** - passed/failed on filter threshold (ALF only) +> **3** - insufficient appropriate reads to support calling flag (pass only) (This covers a lot of possiblities, if more granularity is desired, please request it) +> **4** - no samples have non 0,0 genotype for the record (pass only) + + +The basic procedure of this implementation is as follows: +> For each record in the VCF, test every alt for that record by: +> 1. retrieving reads from samples exhibiting the mutations +> 2. testing each read for validity for use in hairpin testing (i.e. base quality, do they express the correct alt, and so on) +> 3. performing statistical analysis on aggregates of the position of the mutatation relative to the start and end of the aligned portion of the reads +> 4. on the results of the statistical analysis, pass or fail the record for the filters **ALF** and **HPF**, and log a code and relevant info to the **INFO** field indicating the reason for the decision \ No newline at end of file From 64c62cc26bf755131cf4980796eeef824cf17232 Mon Sep 17 00:00:00 2001 From: Alex Byrne Date: Wed, 14 Aug 2024 14:56:16 +0100 Subject: [PATCH 065/165] Update internal_doc.md --- internal_doc.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/internal_doc.md b/internal_doc.md index f25f25f..1b9c68f 100644 --- a/internal_doc.md +++ b/internal_doc.md @@ -35,7 +35,7 @@ Since the versions available of "Mathijs' Scripts" are many and varied, we canno **N.B.** this program is currently in an alpha/testing phase - it is available on the farm, but is likely to change, or have new features added, rapidly, per user responses. **It also may be broken in some way; if so please get in touch**. It is not currently publicly available - it will be made public as soon as it is out of this alpha phase. -### ACCESS +### MODULE ACCESS For local or VM use, see GitLab for install instructions. For farm22 use, available as a module. 
@@ -43,7 +43,7 @@ For farm22 use, available as a module. module avail hairpin2 module load ``` - **N.B. do not confuse with the module `hairpin` - this is `hairpin2`** +**N.B. do not confuse with the module `hairpin` - this is `hairpin2`** ### ASSUMPTIONS From a470a2b8a64a1b17c960a409f2547a9e252cc44c Mon Sep 17 00:00:00 2001 From: ab63 Date: Thu, 15 Aug 2024 09:35:30 +0000 Subject: [PATCH 066/165] cleaner args, build --- Singularity.def | 2 +- hairpin2/main.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/Singularity.def b/Singularity.def index 3979113..d01e4e1 100644 --- a/Singularity.def +++ b/Singularity.def @@ -5,7 +5,7 @@ From: python:3.12-slim . hairpin/ %post -pip install hairpin/ +pip install --root-user-action ignore hairpin/ %test LOC=$(which hairpin2) diff --git a/hairpin2/main.py b/hairpin2/main.py index 4a8b213..72d4b9c 100644 --- a/hairpin2/main.py +++ b/hairpin2/main.py @@ -264,9 +264,9 @@ def main_cli() -> None: opt.add_argument('-mq', '--min-mapping-quality', help='discard reads with mapping quality below this value - default: 11', type=int) opt.add_argument('-mb', '--min-base-quality', help='discard reads with base quality at variant position below this value - default: 25', type=int ) opt.add_argument('-ms', '--max-read-span', help='maximum +- position to use when detecting PCR duplicates - default: 6', type=int) - opt.add_argument('-pf', '--position-fraction', help='>90%% of variant reads variant must occur within [POSITION_FRACTION] of start/end to allow HPF flag - default: 0.15', type=float) + opt.add_argument('-pf', '--position-fraction', help='>90%% of variant reads variant must occur within POSITION_FRACTION of start/end to allow HPF flag - default: 0.15', type=float) proc = parser.add_argument_group('procedural') - proc.add_argument('-m', '--name-mapping', help='map VCF sample names to BAM sample names; useful if they differ', metavar='VCF:BAM', nargs='+') + proc.add_argument('-m', '--name-mapping', help='map VCF 
sample names to BAM SM tags; useful if they differ', metavar='VCF:BAM', nargs='+') proc.add_argument('-ji', '--input-json', help='path to JSON of input parameters; overridden by arguments provided on command line', type=str) proc.add_argument('-jo', '--output-json', help='log input arguments to JSON', type=str) From a2ac6dfb325b0b3b326f463ad3aec5d44f9be329 Mon Sep 17 00:00:00 2001 From: ab63 Date: Thu, 15 Aug 2024 10:21:42 +0000 Subject: [PATCH 067/165] further helptext refining --- hairpin2/main.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/hairpin2/main.py b/hairpin2/main.py index 72d4b9c..d5dec81 100644 --- a/hairpin2/main.py +++ b/hairpin2/main.py @@ -251,20 +251,20 @@ def test_record_per_alt( def main_cli() -> None: logging.basicConfig(level=logging.INFO, format='%(asctime)s ¦ %(levelname)-8s ¦ %(message)s', datefmt='%I:%M:%S') - parser = argparse.ArgumentParser(prog="hairpin") + parser = argparse.ArgumentParser(prog="hairpin2", description='cruciform artefact flagging algorithm based on Ellis et al. 
2020 (DOI: 10.1038/s41596-020-00437-6)') parser._optionals.title = 'info' parser.add_argument('-v', '--version', help='print version', action='version', version=c.VERSION) req = parser.add_argument_group('basic') - req.add_argument('-i', '--vcf-in', help="path to input vcf") - req.add_argument('-o', '--vcf-out', help="path to vcf out") - req.add_argument('-b', '--bams', help="list of paths to name-sorted bams for samples in input vcf, whitespace separated", nargs='+') + req.add_argument('-i', '--vcf-in', help="path to input VCF") + req.add_argument('-o', '--vcf-out', help="path to write output VCF") + req.add_argument('-b', '--bams', help="list of paths to BAMs for samples in input VCF, whitespace separated", nargs='+') opt = parser.add_argument_group('extended') opt.add_argument('-al', '--al-filter-threshold', help='threshhold for median of read alignment score per base of all relevant reads, below which a variant is flagged as ALF - default: 0.93', type=float) opt.add_argument('-mc', '--min-clip-quality', help='discard reads with mean base quality of aligned bases below this value, if they have soft-clipped bases - default: 35', type=int) opt.add_argument('-mq', '--min-mapping-quality', help='discard reads with mapping quality below this value - default: 11', type=int) opt.add_argument('-mb', '--min-base-quality', help='discard reads with base quality at variant position below this value - default: 25', type=int ) opt.add_argument('-ms', '--max-read-span', help='maximum +- position to use when detecting PCR duplicates - default: 6', type=int) - opt.add_argument('-pf', '--position-fraction', help='>90%% of variant reads variant must occur within POSITION_FRACTION of start/end to allow HPF flag - default: 0.15', type=float) + opt.add_argument('-pf', '--position-fraction', help='>90%% of variant must occur within POSITION_FRACTION of start/end of reads to allow HPF flag - default: 0.15', type=float) proc = parser.add_argument_group('procedural') 
proc.add_argument('-m', '--name-mapping', help='map VCF sample names to BAM SM tags; useful if they differ', metavar='VCF:BAM', nargs='+') proc.add_argument('-ji', '--input-json', help='path to JSON of input parameters; overridden by arguments provided on command line', type=str) From 1a0dec75cf0e666e8f6142d1bdec36ff2b63717b Mon Sep 17 00:00:00 2001 From: Alex Byrne Date: Thu, 15 Aug 2024 11:24:10 +0100 Subject: [PATCH 068/165] Update internal_doc.md --- internal_doc.md | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/internal_doc.md b/internal_doc.md index 1b9c68f..95c30f5 100644 --- a/internal_doc.md +++ b/internal_doc.md @@ -48,14 +48,16 @@ module load ### ASSUMPTIONS -`hairpin2` is designed for paired data where reads have the **MC** tag. If this tag is not present in your data, it can be added using `samtools fixmate` or `biobambam2 bamsormadup`. The tool expects data specifically in the VCF and BAM formats; support for a wider variety of formats could be implemented if desired. +`hairpin2` is designed for paired data where reads have the **MC** tag. If this tag is not present in your data, it can be added using `samtools fixmate` or `biobambam2 bamsormadup`. The tool expects data specifically in the VCF and BAM formats; support for a wider variety of formats could be implemented if desired. There are no further assumptions. 
### USAGE ``` -usage: hairpin [-h] [-v] [-i VCF_IN] [-o VCF_OUT] [-b BAMS [BAMS ...]] [-al AL_FILTER_THRESHOLD] [-mc MIN_CLIP_QUALITY] [-mq MIN_MAPPING_QUALITY] [-mb MIN_BASE_QUALITY] [-ms MAX_READ_SPAN] [-pf POSITION_FRACTION] - [-m VCF:BAM [VCF:BAM ...]] [-ji INPUT_JSON] [-jo OUTPUT_JSON] +usage: hairpin2 [-h] [-v] [-i VCF_IN] [-o VCF_OUT] [-b BAMS [BAMS ...]] [-al AL_FILTER_THRESHOLD] [-mc MIN_CLIP_QUALITY] [-mq MIN_MAPPING_QUALITY] [-mb MIN_BASE_QUALITY] + [-ms MAX_READ_SPAN] [-pf POSITION_FRACTION] [-m VCF:BAM [VCF:BAM ...]] [-ji INPUT_JSON] [-jo OUTPUT_JSON] + +cruciform artefact flagging algorithm based on Ellis et al. 2020 (DOI: 10.1038/s41596-020-00437-6) info: -h, --help show this help message and exit @@ -63,15 +65,15 @@ info: basic: -i VCF_IN, --vcf-in VCF_IN - path to input vcf + path to input VCF -o VCF_OUT, --vcf-out VCF_OUT - path to vcf out + path to write output VCF -b BAMS [BAMS ...], --bams BAMS [BAMS ...] - list of paths to name-sorted bams for samples in input vcf, whitespace separated + list of paths to BAMs for samples in input VCF, whitespace separated extended: -al AL_FILTER_THRESHOLD, --al-filter-threshold AL_FILTER_THRESHOLD - threshhold for median of read alignment score per base of all relevant reads, below which a variant is flagged as **ALF** - default: 0.93 + threshhold for median of read alignment score per base of all relevant reads, below which a variant is flagged as ALF - default: 0.93 -mc MIN_CLIP_QUALITY, --min-clip-quality MIN_CLIP_QUALITY discard reads with mean base quality of aligned bases below this value, if they have soft-clipped bases - default: 35 -mq MIN_MAPPING_QUALITY, --min-mapping-quality MIN_MAPPING_QUALITY @@ -81,14 +83,15 @@ extended: -ms MAX_READ_SPAN, --max-read-span MAX_READ_SPAN maximum +- position to use when detecting PCR duplicates - default: 6 -pf POSITION_FRACTION, --position-fraction POSITION_FRACTION - >90% of variant reads variant must occur within [fraction] of start/end to allow **HPF** flag - 
default: 0.15 + >90% of variant must occur within POSITION_FRACTION of start/end of reads to allow HPF flag - default: 0.15 procedural: -m VCF:BAM [VCF:BAM ...], --name-mapping VCF:BAM [VCF:BAM ...] - map VCF sample names to BAM sample names; useful if they differ + map VCF sample names to BAM SM tags; useful if they differ -ji INPUT_JSON, --input-json INPUT_JSON path to JSON of input parameters; overridden by arguments provided on command line -jo OUTPUT_JSON, --output-json OUTPUT_JSON + log input arguments to JSON ``` **N.B.** the above usage block indicates the call for the tool is `hairpin2` - this is correct for local/vm installs, but for farm usage, for the time being, it is `hairpin2-alpha` From b68af1a0298905c5a3c04903d587598be9e1d7fa Mon Sep 17 00:00:00 2001 From: Alex Byrne Date: Thu, 15 Aug 2024 11:25:56 +0100 Subject: [PATCH 069/165] Update internal_doc.md --- internal_doc.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/internal_doc.md b/internal_doc.md index 95c30f5..19b6cb0 100644 --- a/internal_doc.md +++ b/internal_doc.md @@ -1,6 +1,6 @@ ### INTRODUCTION -`hairpin2` - CLI implementation of the hairpin detection and flagging algorithm concieved by [Ellis et al, 2020](https://www.nature.com/articles/s41596-020-00437-6). Implemented by Peter Campbell and Alex Byrne (primary contact for this tool - ab63). Code not yet public, but availabe on internal gitlab at https://gitlab.internal.sanger.ac.uk/casm/team78/hairpin-core +`hairpin2` - CLI implementation of the hairpin detection algorithm concieved by [Ellis et al, 2020](https://www.nature.com/articles/s41596-020-00437-6). Implemented by Peter Campbell and Alex Byrne (primary contact for this tool - ab63). 
Code not yet public, but availabe on internal gitlab at https://gitlab.internal.sanger.ac.uk/casm/team78/hairpin-core For paired data, given a VCF, and BAM files for the samples of that VCF, return a VCF with variants flagged with **HPF** if they are suspected cruciform artefacts, and **ALF** if they exhibit a lower median alignment score than a specified threshold. The **ALF** filter indicates poor signal-to-noise, and provides additional confidence in the **HPF** filter - cruciform artefacts usually cause a marked decrease in alignment score. The **ALF** flag also may appear on variants without **HPF**, often indicating other artefacts associated with poor signal-to-noise. From 6761c2cd9bd5d4f2b3283d8344a08b817c4b17dc Mon Sep 17 00:00:00 2001 From: Alex Byrne Date: Thu, 15 Aug 2024 11:30:23 +0100 Subject: [PATCH 070/165] Update README.md --- README.md | 26 +++++++++++++++----------- 1 file changed, 15 insertions(+), 11 deletions(-) diff --git a/README.md b/README.md index e1b549a..3c5af7c 100644 --- a/README.md +++ b/README.md @@ -5,9 +5,10 @@ For paired data, given a VCF, and BAM files for the samples of that VCF, return a VCF with variants flagged with **HPF** if they are suspected cruciform artefacts, and **ALF** if they exhibit a lower median alignment score than a specified threshold. The **ALF** filter indicates poor signal-to-noise, and provides additional confidence in the **HPF** filter - cruciform artefacts usually cause a marked decrease in alignment score. The **ALF** flag also may appear on variants without **HPF**, often indicating other artefacts associated with poor signal-to-noise. -### REQUIREMENTS +### DEPENDENCIES -* Python >= 3.10 +* Python >= 3.10 (required) +* pysam >= 0.22.1 (installed automatically during install process) - tested with 0.22.1 only ### INSTALLATION @@ -31,14 +32,16 @@ hairpin -h ### ASSUMPTIONS -`hairpin2` is designed for paired data where reads have the **MC** tag. 
If this tag is not present in your data, it can be added using `samtools fixmate` or `biobambam2 bamsormadup`. The tool expects data specifically in the VCF and BAM formats; support for a wider variety of formats could be implemented if desired. +`hairpin2` is designed for paired data where reads have the **MC** tag. If this tag is not present in your data, it can be added using `samtools fixmate` or `biobambam2 bamsormadup`. The tool expects data specifically in the VCF and BAM formats; support for a wider variety of formats could be implemented if desired. No further assumptions are made. ### USAGE ``` -usage: hairpin [-h] [-v] [-i VCF_IN] [-o VCF_OUT] [-b BAMS [BAMS ...]] [-al AL_FILTER_THRESHOLD] [-mc MIN_CLIP_QUALITY] [-mq MIN_MAPPING_QUALITY] [-mb MIN_BASE_QUALITY] [-ms MAX_READ_SPAN] [-pf POSITION_FRACTION] - [-m VCF:BAM [VCF:BAM ...]] [-ji INPUT_JSON] [-jo OUTPUT_JSON] +usage: hairpin2 [-h] [-v] [-i VCF_IN] [-o VCF_OUT] [-b BAMS [BAMS ...]] [-al AL_FILTER_THRESHOLD] [-mc MIN_CLIP_QUALITY] [-mq MIN_MAPPING_QUALITY] [-mb MIN_BASE_QUALITY] + [-ms MAX_READ_SPAN] [-pf POSITION_FRACTION] [-m VCF:BAM [VCF:BAM ...]] [-ji INPUT_JSON] [-jo OUTPUT_JSON] + +cruciform artefact flagging algorithm based on Ellis et al. 2020 (DOI: 10.1038/s41596-020-00437-6) info: -h, --help show this help message and exit @@ -46,15 +49,15 @@ info: basic: -i VCF_IN, --vcf-in VCF_IN - path to input vcf + path to input VCF -o VCF_OUT, --vcf-out VCF_OUT - path to vcf out + path to write output VCF -b BAMS [BAMS ...], --bams BAMS [BAMS ...] 
- list of paths to name-sorted bams for samples in input vcf, whitespace separated + list of paths to BAMs for samples in input VCF, whitespace separated extended: -al AL_FILTER_THRESHOLD, --al-filter-threshold AL_FILTER_THRESHOLD - threshhold for median of read alignment score per base of all relevant reads, below which a variant is flagged as **ALF** - default: 0.93 + threshhold for median of read alignment score per base of all relevant reads, below which a variant is flagged as ALF - default: 0.93 -mc MIN_CLIP_QUALITY, --min-clip-quality MIN_CLIP_QUALITY discard reads with mean base quality of aligned bases below this value, if they have soft-clipped bases - default: 35 -mq MIN_MAPPING_QUALITY, --min-mapping-quality MIN_MAPPING_QUALITY @@ -64,14 +67,15 @@ extended: -ms MAX_READ_SPAN, --max-read-span MAX_READ_SPAN maximum +- position to use when detecting PCR duplicates - default: 6 -pf POSITION_FRACTION, --position-fraction POSITION_FRACTION - >90% of variant reads variant must occur within [fraction] of start/end to allow **HPF** flag - default: 0.15 + >90% of variant must occur within POSITION_FRACTION of start/end of reads to allow HPF flag - default: 0.15 procedural: -m VCF:BAM [VCF:BAM ...], --name-mapping VCF:BAM [VCF:BAM ...] 
- map VCF sample names to BAM sample names; useful if they differ + map VCF sample names to BAM SM tags; useful if they differ -ji INPUT_JSON, --input-json INPUT_JSON path to JSON of input parameters; overridden by arguments provided on command line -jo OUTPUT_JSON, --output-json OUTPUT_JSON + log input arguments to JSON ``` Parameters are hopefully mostly clear from the helptext, but some warrant further explanation: From 3560a14092d0958e49cea41c47cfc797cb365f04 Mon Sep 17 00:00:00 2001 From: Alex Byrne Date: Thu, 15 Aug 2024 11:31:16 +0100 Subject: [PATCH 071/165] Update README.md --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 3c5af7c..97c0b5f 100644 --- a/README.md +++ b/README.md @@ -7,8 +7,8 @@ For paired data, given a VCF, and BAM files for the samples of that VCF, return ### DEPENDENCIES -* Python >= 3.10 (required) -* pysam >= 0.22.1 (installed automatically during install process) - tested with 0.22.1 only +* Python >= 3.10 - required +* pysam >= 0.22.1 - installed automatically during install process (tested with 0.22.1 only) ### INSTALLATION From d4c4d3c20e654b60492656c4a5f83121c0f16c50 Mon Sep 17 00:00:00 2001 From: Alex Byrne Date: Thu, 15 Aug 2024 11:33:32 +0100 Subject: [PATCH 072/165] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 97c0b5f..f3b3a2e 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ # hairpin2 -`hairpin2` - CLI implementation of the hairpin detection and flagging algorithm concieved by [Ellis et al, 2020](https://www.nature.com/articles/s41596-020-00437-6). Implemented by Peter Campbell and Alex Byrne. +`hairpin2` - CLI implementation of the hairpin detection algorithm concieved by [Ellis et al, 2020](https://www.nature.com/articles/s41596-020-00437-6). 
For paired data, given a VCF, and BAM files for the samples of that VCF, return a VCF with variants flagged with **HPF** if they are suspected cruciform artefacts, and **ALF** if they exhibit a lower median alignment score than a specified threshold. The **ALF** filter indicates poor signal-to-noise, and provides additional confidence in the **HPF** filter - cruciform artefacts usually cause a marked decrease in alignment score. The **ALF** flag also may appear on variants without **HPF**, often indicating other artefacts associated with poor signal-to-noise. From f100160be407855feb5f886e8f5b9388778deffc Mon Sep 17 00:00:00 2001 From: Alex Byrne Date: Thu, 15 Aug 2024 12:06:58 +0100 Subject: [PATCH 073/165] Update internal_doc.md --- internal_doc.md | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/internal_doc.md b/internal_doc.md index 19b6cb0..4a38bef 100644 --- a/internal_doc.md +++ b/internal_doc.md @@ -2,7 +2,7 @@ `hairpin2` - CLI implementation of the hairpin detection algorithm concieved by [Ellis et al, 2020](https://www.nature.com/articles/s41596-020-00437-6). Implemented by Peter Campbell and Alex Byrne (primary contact for this tool - ab63). Code not yet public, but availabe on internal gitlab at https://gitlab.internal.sanger.ac.uk/casm/team78/hairpin-core -For paired data, given a VCF, and BAM files for the samples of that VCF, return a VCF with variants flagged with **HPF** if they are suspected cruciform artefacts, and **ALF** if they exhibit a lower median alignment score than a specified threshold. The **ALF** filter indicates poor signal-to-noise, and provides additional confidence in the **HPF** filter - cruciform artefacts usually cause a marked decrease in alignment score. The **ALF** flag also may appear on variants without **HPF**, often indicating other artefacts associated with poor signal-to-noise. 
+For paired data, given a VCF, and BAM files for the samples of that VCF, return a VCF with variants flagged with **HPF** if they are suspected cruciform artefacts, and **ALF** if relevant reads have lower median alignment score per base than a specified threshold. The **ALF** filter indicates poor signal-to-noise, and provides additional confidence in the **HPF** filter – cruciform artefacts usually cause a marked decrease in alignment score. The **ALF** flag also may appear on variants without **HPF**, often indicating other artefacts associated with poor signal-to-noise. `hairpin2` should replace, as far as is possible, the tools known as "Mathijs' Scripts", "AdditionalBamStatistics", "Tim Butler's Scripts" and, unfortunately, probably many other names. It also supersedes `hairpin`, a stopgap version /of Mathijs' Scripts that relied on some of Mathijs' original code, and therefore was unreliable and error prone (though less so than the raw scripts themselves). However, this incarnation is not a total replacement for Mathijs' Scripts at this time (and has changed in functionality since the stopgap tool, the original hairpin): @@ -48,7 +48,7 @@ module load ### ASSUMPTIONS -`hairpin2` is designed for paired data where reads have the **MC** tag. If this tag is not present in your data, it can be added using `samtools fixmate` or `biobambam2 bamsormadup`. The tool expects data specifically in the VCF and BAM formats; support for a wider variety of formats could be implemented if desired. There are no further assumptions. +`hairpin2` is designed for paired data where BAM records have the **MC** tag. If this tag is not present in your data, it can be added using `samtools fixmate` or `biobambam2 bamsormadup`. The tool expects data specifically in the VCF and BAM formats; support for a wider variety of formats could be implemented if desired. 
No further assumptions are made – other BAM tags and VCF fields are used, however they are mandatory per the format specification. ### USAGE @@ -83,7 +83,7 @@ extended: -ms MAX_READ_SPAN, --max-read-span MAX_READ_SPAN maximum +- position to use when detecting PCR duplicates - default: 6 -pf POSITION_FRACTION, --position-fraction POSITION_FRACTION - >90% of variant must occur within POSITION_FRACTION of start/end of reads to allow HPF flag - default: 0.15 + >90% of variant must occur within POSITION_FRACTION of read edges to allow HPF flag - default: 0.15 procedural: -m VCF:BAM [VCF:BAM ...], --name-mapping VCF:BAM [VCF:BAM ...] @@ -98,26 +98,26 @@ procedural: Parameters are hopefully mostly clear from the helptext, but some warrant further explanation: -> `--al-filter-threshold` - the default value of 0.93 was arrived at by trial and error - since different aligners/platforms calculate alignment score differently, you may want to modify this value appropriately. -> `--max-read-span` - Long homopolymer tracts can cause stuttering, where a PCR duplicate will have, for example, an additional A in a tract of As. These reads will align a base or two earlier on the reference genome than they should. As a result pcr duplicate flag machinery fails and they are not flagged as duplicates. max-read-span is then the maximum +- position to use when detecting PCR duplicates. -> `--position-fraction` - cruciform artefacts usually contain segments that align beause the segment is not in ref genome, and so the segment is soft clipped – this pushes the false variants associated with the arterfact to edges of the reads; unlike true variants. If more than 90% of the reads are within that first/last fraction, allow for calling **HPF** flag +> `--al-filter-threshold` – the default value of 0.93 was arrived at by trial and error – since different aligners/platforms calculate alignment score differently, you may want to modify this value appropriately. 
+> `--max-read-span` – long homopolymer tracts can cause stuttering, where a PCR duplicate will have, for example, an additional A in a tract of As. These reads will align a base or two earlier on the reference genome than they should. As a result pcr duplicate flag machinery fails and they are not flagged as duplicates. `MAX_READ_SPAN` is then the maximum +- position to use when detecting PCR duplicates. +> `--position-fraction` – cruciform artefacts usually contain segments that do not align to the reference genome, resulting in the segment being soft-clipped. The subsequent aligned portion will then contain false variants, which arise from the artefact. These false variants tend to cluster at the edge of the alignment, appearing with anomalous regularity at the alignment boundaries – unlike true variants. If, for a given variant, more than 90% of the variant bases are within `POSITION_FRACTION` of read edges, allow for calling **HPF** flag ### DETAILS -The tool tests records in a VCF file and applies the **HPF**, indicating a hairpin/cruciform artefact, and **ALF** filter flags as appropriate. It records reasoning for its decisions in the INFO field of the VCF records, in the form `HPF=|` and `ALF=||`. The codes are as follows: +The tool tests records in a VCF file and applies the **HPF** and **ALF** filter flags as appropriate. Reasoning for decisions is recorded in the INFO field of the VCF records, in the form `HPF=|` and `ALF=||`. The codes are as follows: -> **0** - passed/failed on condition 60A(i) of Ellis et al. (HPF only) -> **1** - passed/failed on condition 60B(i) of Ellis et al. (HPF only) -> **2** - passed/failed on filter threshold (ALF only) -> **3** - insufficient appropriate reads to support calling flag (pass only) (This covers a lot of possiblities, if more granularity is desired, please request it) -> **4** - no samples have non 0,0 genotype for the record (pass only) +> **0** – passed/failed on condition 60A(i) of Ellis et al. 
(HPF only) +> **1** – passed/failed on condition 60B(i) of Ellis et al. (HPF only) +> **2** – passed/failed on filter threshold (ALF only) +> **3** – insufficient appropriate reads to support calling flag (pass only) (This covers a lot of possiblities, if more granularity is desired, please request it) +> **4** – no samples have non 0,0 genotype for the record (pass only) The basic procedure of this implementation is as follows: > For each record in the VCF, test every alt for that record by: > 1. retrieving reads from samples exhibiting the mutations > 2. testing each read for validity for use in hairpin testing (i.e. base quality, do they express the correct alt, and so on) -> 3. performing statistical analysis on aggregates of the position of the mutatation relative to the start and end of the aligned portion of the reads +> 3. performing statistical analysis on aggregates of the position of the mutation relative to the start and end of the aligned portion of the reads > 4. on the results of the statistical analysis, pass or fail the record for the filters **ALF** and **HPF**, and log a code and relevant info to the **INFO** field indicating the reason for the decision \ No newline at end of file From c22bbb5850d85ebe64ff7c1634efa0f253994a40 Mon Sep 17 00:00:00 2001 From: Alex Byrne Date: Thu, 15 Aug 2024 12:07:04 +0100 Subject: [PATCH 074/165] Update README.md --- README.md | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/README.md b/README.md index f3b3a2e..d14350e 100644 --- a/README.md +++ b/README.md @@ -1,14 +1,14 @@ # hairpin2 -`hairpin2` - CLI implementation of the hairpin detection algorithm concieved by [Ellis et al, 2020](https://www.nature.com/articles/s41596-020-00437-6). +`hairpin2` – CLI implementation of the hairpin detection algorithm concieved by [Ellis et al, 2020](https://www.nature.com/articles/s41596-020-00437-6). 
-For paired data, given a VCF, and BAM files for the samples of that VCF, return a VCF with variants flagged with **HPF** if they are suspected cruciform artefacts, and **ALF** if they exhibit a lower median alignment score than a specified threshold. The **ALF** filter indicates poor signal-to-noise, and provides additional confidence in the **HPF** filter - cruciform artefacts usually cause a marked decrease in alignment score. The **ALF** flag also may appear on variants without **HPF**, often indicating other artefacts associated with poor signal-to-noise. +For paired data, given a VCF, and BAM files for the samples of that VCF, return a VCF with variants flagged with **HPF** if they are suspected cruciform artefacts, and **ALF** if relevant reads have lower median alignment score per base than a specified threshold. The **ALF** filter indicates poor signal-to-noise, and provides additional confidence in the **HPF** filter – cruciform artefacts usually cause a marked decrease in alignment score. The **ALF** flag also may appear on variants without **HPF**, often indicating other artefacts associated with poor signal-to-noise. ### DEPENDENCIES -* Python >= 3.10 - required -* pysam >= 0.22.1 - installed automatically during install process (tested with 0.22.1 only) +* Python >= 3.10 – required +* pysam >= 0.22.1 – installed automatically during install process (tested with 0.22.1 only) ### INSTALLATION @@ -32,7 +32,7 @@ hairpin -h ### ASSUMPTIONS -`hairpin2` is designed for paired data where reads have the **MC** tag. If this tag is not present in your data, it can be added using `samtools fixmate` or `biobambam2 bamsormadup`. The tool expects data specifically in the VCF and BAM formats; support for a wider variety of formats could be implemented if desired. No further assumptions are made. +`hairpin2` is designed for paired data where BAM records have the **MC** tag. 
If this tag is not present in your data, it can be added using `samtools fixmate` or `biobambam2 bamsormadup`. The tool expects data specifically in the VCF and BAM formats; support for a wider variety of formats could be implemented if desired. No further assumptions are made – other BAM tags and VCF fields are used, however they are mandatory per the format specification. ### USAGE @@ -67,7 +67,7 @@ extended: -ms MAX_READ_SPAN, --max-read-span MAX_READ_SPAN maximum +- position to use when detecting PCR duplicates - default: 6 -pf POSITION_FRACTION, --position-fraction POSITION_FRACTION - >90% of variant must occur within POSITION_FRACTION of start/end of reads to allow HPF flag - default: 0.15 + >90% of variant must occur within POSITION_FRACTION of read edges to allow HPF flag - default: 0.15 procedural: -m VCF:BAM [VCF:BAM ...], --name-mapping VCF:BAM [VCF:BAM ...] @@ -80,25 +80,25 @@ procedural: Parameters are hopefully mostly clear from the helptext, but some warrant further explanation: -> `--al-filter-threshold` - the default value of 0.93 was arrived at by trial and error - since different aligners/platforms calculate alignment score differently, you may want to modify this value appropriately. -> `--max-read-span` - Long homopolymer tracts can cause stuttering, where a PCR duplicate will have, for example, an additional A in a tract of As. These reads will align a base or two earlier on the reference genome than they should. As a result pcr duplicate flag machinery fails and they are not flagged as duplicates. max-read-span is then the maximum +- position to use when detecting PCR duplicates. -> `--position-fraction` - cruciform artefacts usually contain segments that align beause the segment is not in ref genome, and so the segment is soft clipped – this pushes the false variants associated with the arterfact to edges of the reads; unlike true variants. 
If more than 90% of the reads are within that first/last fraction, allow for calling **HPF** flag +> `--al-filter-threshold` – the default value of 0.93 was arrived at by trial and error – since different aligners/platforms calculate alignment score differently, you may want to modify this value appropriately. +> `--max-read-span` – long homopolymer tracts can cause stuttering, where a PCR duplicate will have, for example, an additional A in a tract of As. These reads will align a base or two earlier on the reference genome than they should. As a result pcr duplicate flag machinery fails and they are not flagged as duplicates. `MAX_READ_SPAN` is then the maximum +- position to use when detecting PCR duplicates. +> `--position-fraction` – cruciform artefacts usually contain segments that do not align to the reference genome, resulting in the segment being soft-clipped. The subsequent aligned portion will then contain false variants, which arise from the artefact. These false variants tend to cluster at the edge of the alignment, appearing with anomalous regularity at the alignment boundaries – unlike true variants. If, for a given variant, more than 90% of the variant bases are within `POSITION_FRACTION` of read edges, allow for calling **HPF** flag ### DETAILS -The tool tests records in a VCF file and applies the **HPF**, indicating a hairpin/cruciform artefact, and **ALF** filter flags as appropriate. It records reasoning for its decisions in the INFO field of the VCF records, in the form `HPF=|` and `ALF=||`. The codes are as follows: +The tool tests records in a VCF file and applies the **HPF** and **ALF** filter flags as appropriate. Reasoning for decisions is recorded in the INFO field of the VCF records, in the form `HPF=|` and `ALF=||`. The codes are as follows: -> **0** - passed/failed on condition 60A(i) of Ellis et al. (HPF only) -> **1** - passed/failed on condition 60B(i) of Ellis et al. 
(HPF only) -> **2** - passed/failed on filter threshold (ALF only) -> **3** - insufficient appropriate reads to support calling flag (pass only) (This covers a lot of possiblities, if more granularity is desired, please request it) -> **4** - no samples have non 0,0 genotype for the record (pass only) +> **0** – passed/failed on condition 60A(i) of Ellis et al. (HPF only) +> **1** – passed/failed on condition 60B(i) of Ellis et al. (HPF only) +> **2** – passed/failed on filter threshold (ALF only) +> **3** – insufficient appropriate reads to support calling flag (pass only) (This covers a lot of possiblities, if more granularity is desired, please request it) +> **4** – no samples have non 0,0 genotype for the record (pass only) The basic procedure of this implementation is as follows: > For each record in the VCF, test every alt for that record by: > 1. retrieving reads from samples exhibiting the mutations > 2. testing each read for validity for use in hairpin testing (i.e. base quality, do they express the correct alt, and so on) -> 3. performing statistical analysis on aggregates of the position of the mutatation relative to the start and end of the aligned portion of the reads +> 3. performing statistical analysis on aggregates of the position of the mutation relative to the start and end of the aligned portion of the reads > 4. 
on the results of the statistical analysis, pass or fail the record for the filters **ALF** and **HPF**, and log a code and relevant info to the **INFO** field indicating the reason for the decision \ No newline at end of file From 160cd744d303a33b009d071ebb344f7fd6c2eb2d Mon Sep 17 00:00:00 2001 From: Alex Byrne Date: Thu, 15 Aug 2024 12:09:35 +0100 Subject: [PATCH 075/165] Update internal_doc.md --- internal_doc.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/internal_doc.md b/internal_doc.md index 4a38bef..19b84ea 100644 --- a/internal_doc.md +++ b/internal_doc.md @@ -100,7 +100,7 @@ Parameters are hopefully mostly clear from the helptext, but some warrant furthe > `--al-filter-threshold` – the default value of 0.93 was arrived at by trial and error – since different aligners/platforms calculate alignment score differently, you may want to modify this value appropriately. > `--max-read-span` – long homopolymer tracts can cause stuttering, where a PCR duplicate will have, for example, an additional A in a tract of As. These reads will align a base or two earlier on the reference genome than they should. As a result pcr duplicate flag machinery fails and they are not flagged as duplicates. `MAX_READ_SPAN` is then the maximum +- position to use when detecting PCR duplicates. -> `--position-fraction` – cruciform artefacts usually contain segments that do not align to the reference genome, resulting in the segment being soft-clipped. The subsequent aligned portion will then contain false variants, which arise from the artefact. These false variants tend to cluster at the edge of the alignment, appearing with anomalous regularity at the alignment boundaries – unlike true variants. 
If, for a given variant, more than 90% of the variant bases are within `POSITION_FRACTION` of read edges, allow for calling **HPF** flag +> `--position-fraction` – cruciform artefacts usually contain segments that do not align to the reference genome, resulting in the segment being soft-clipped. The subsequent aligned portion will then contain false variants, which arise from the artefact. These false variants appear with anomalous regularity at alignment boundaries – unlike true variants. If, for a given variant, more than 90% of the variant bases are within `POSITION_FRACTION` of read edges, allow for calling **HPF** flag. From 14ca1b3b6fcc6e83e7fde363dec255cd87320949 Mon Sep 17 00:00:00 2001 From: Alex Byrne Date: Thu, 15 Aug 2024 12:09:39 +0100 Subject: [PATCH 076/165] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index d14350e..b9104e7 100644 --- a/README.md +++ b/README.md @@ -82,7 +82,7 @@ Parameters are hopefully mostly clear from the helptext, but some warrant furthe > `--al-filter-threshold` – the default value of 0.93 was arrived at by trial and error – since different aligners/platforms calculate alignment score differently, you may want to modify this value appropriately. > `--max-read-span` – long homopolymer tracts can cause stuttering, where a PCR duplicate will have, for example, an additional A in a tract of As. These reads will align a base or two earlier on the reference genome than they should. As a result pcr duplicate flag machinery fails and they are not flagged as duplicates. `MAX_READ_SPAN` is then the maximum +- position to use when detecting PCR duplicates. -> `--position-fraction` – cruciform artefacts usually contain segments that do not align to the reference genome, resulting in the segment being soft-clipped. The subsequent aligned portion will then contain false variants, which arise from the artefact. 
These false variants tend to cluster at the edge of the alignment, appearing with anomalous regularity at the alignment boundaries – unlike true variants. If, for a given variant, more than 90% of the variant bases are within `POSITION_FRACTION` of read edges, allow for calling **HPF** flag +> `--position-fraction` – cruciform artefacts usually contain segments that do not align to the reference genome, resulting in the segment being soft-clipped. The subsequent aligned portion will then contain false variants, which arise from the artefact. These false variants appear with anomalous regularity at alignment boundaries – unlike true variants. If, for a given variant, more than 90% of the variant bases are within `POSITION_FRACTION` of read edges, allow for calling **HPF** flag. ### DETAILS From 9b02b3c85285f5cd6e3a9a5b12153780b672f302 Mon Sep 17 00:00:00 2001 From: ab63 Date: Thu, 15 Aug 2024 11:09:56 +0000 Subject: [PATCH 077/165] further helptext refining --- hairpin2/main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hairpin2/main.py b/hairpin2/main.py index d5dec81..78b9242 100644 --- a/hairpin2/main.py +++ b/hairpin2/main.py @@ -264,7 +264,7 @@ def main_cli() -> None: opt.add_argument('-mq', '--min-mapping-quality', help='discard reads with mapping quality below this value - default: 11', type=int) opt.add_argument('-mb', '--min-base-quality', help='discard reads with base quality at variant position below this value - default: 25', type=int ) opt.add_argument('-ms', '--max-read-span', help='maximum +- position to use when detecting PCR duplicates - default: 6', type=int) - opt.add_argument('-pf', '--position-fraction', help='>90%% of variant must occur within POSITION_FRACTION of start/end of reads to allow HPF flag - default: 0.15', type=float) + opt.add_argument('-pf', '--position-fraction', help='>90%% of variant must occur within POSITION_FRACTION of read edges to allow HPF flag - default: 0.15', type=float) proc = 
parser.add_argument_group('procedural') proc.add_argument('-m', '--name-mapping', help='map VCF sample names to BAM SM tags; useful if they differ', metavar='VCF:BAM', nargs='+') proc.add_argument('-ji', '--input-json', help='path to JSON of input parameters; overridden by arguments provided on command line', type=str) From be2004acea601b004036a085bb0ed0755bc4b45b Mon Sep 17 00:00:00 2001 From: Alex Byrne Date: Thu, 15 Aug 2024 12:13:13 +0100 Subject: [PATCH 078/165] Update README.md --- README.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index b9104e7..8fef067 100644 --- a/README.md +++ b/README.md @@ -101,4 +101,6 @@ The basic procedure of this implementation is as follows: > 1. retrieving reads from samples exhibiting the mutations > 2. testing each read for validity for use in hairpin testing (i.e. base quality, do they express the correct alt, and so on) > 3. performing statistical analysis on aggregates of the position of the mutation relative to the start and end of the aligned portion of the reads -> 4. on the results of the statistical analysis, pass or fail the record for the filters **ALF** and **HPF**, and log a code and relevant info to the **INFO** field indicating the reason for the decision \ No newline at end of file +> 4. on the results of the statistical analysis, pass or fail the record for the filters **ALF** and **HPF**, and log a code and relevant info to the **INFO** field indicating the reason for the decision + +The code has been written with the intention of clarity and extensibility – further understanding may be achieved by reading `hairpin2/main.py`. 
\ No newline at end of file From d7921b02b91d91dfeed6a91ef2d6d40306ae75fd Mon Sep 17 00:00:00 2001 From: Alex Byrne Date: Thu, 15 Aug 2024 12:15:01 +0100 Subject: [PATCH 079/165] Update internal_doc.md --- internal_doc.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/internal_doc.md b/internal_doc.md index 19b84ea..84b1462 100644 --- a/internal_doc.md +++ b/internal_doc.md @@ -1,6 +1,6 @@ ### INTRODUCTION -`hairpin2` - CLI implementation of the hairpin detection algorithm concieved by [Ellis et al, 2020](https://www.nature.com/articles/s41596-020-00437-6). Implemented by Peter Campbell and Alex Byrne (primary contact for this tool - ab63). Code not yet public, but availabe on internal gitlab at https://gitlab.internal.sanger.ac.uk/casm/team78/hairpin-core +`hairpin2` - CLI implementation of the hairpin detection algorithm concieved by [Ellis et al, 2020](https://www.nature.com/articles/s41596-020-00437-6). Implemented by Peter Campbell and Alex Byrne (primary contact for this tool - ab63). For paired data, given a VCF, and BAM files for the samples of that VCF, return a VCF with variants flagged with **HPF** if they are suspected cruciform artefacts, and **ALF** if relevant reads have lower median alignment score per base than a specified threshold. The **ALF** filter indicates poor signal-to-noise, and provides additional confidence in the **HPF** filter – cruciform artefacts usually cause a marked decrease in alignment score. The **ALF** flag also may appear on variants without **HPF**, often indicating other artefacts associated with poor signal-to-noise. 
From 648d9c821cc559d8346869ca506c8ef093686fc2 Mon Sep 17 00:00:00 2001 From: Alex Byrne Date: Thu, 15 Aug 2024 12:16:20 +0100 Subject: [PATCH 080/165] Update internal_doc.md --- internal_doc.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/internal_doc.md b/internal_doc.md index 84b1462..10e1623 100644 --- a/internal_doc.md +++ b/internal_doc.md @@ -37,7 +37,7 @@ Since the versions available of "Mathijs' Scripts" are many and varied, we canno ### MODULE ACCESS -For local or VM use, see GitLab for install instructions. +For local or VM use, see README for install instructions. For farm22 use, available as a module. ``` module avail hairpin2 From 3a3e19e06ced0cee8a1ec2daed9905e51caeaf88 Mon Sep 17 00:00:00 2001 From: Alex Byrne Date: Thu, 15 Aug 2024 12:16:53 +0100 Subject: [PATCH 081/165] Update internal_doc.md --- internal_doc.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/internal_doc.md b/internal_doc.md index 10e1623..92251f2 100644 --- a/internal_doc.md +++ b/internal_doc.md @@ -48,7 +48,7 @@ module load ### ASSUMPTIONS -`hairpin2` is designed for paired data where BAM records have the **MC** tag. If this tag is not present in your data, it can be added using `samtools fixmate` or `biobambam2 bamsormadup`. The tool expects data specifically in the VCF and BAM formats; support for a wider variety of formats could be implemented if desired. No further assumptions are made – other BAM tags and VCF fields are used, however they are mandatory per the format specification. +`hairpin2` is designed for paired data where BAM records have the **MC** tag. If this tag is not present in your data, it can be added using `samtools fixmate` or `biobambam2 bamsormadup`. The tool expects data specifically in the VCF and BAM formats; support for a wider variety of formats could be implemented if desired. No further assumptions are made – other BAM tags and VCF fields are used, however they are mandatory per the format specifications. 
### USAGE From 8615097f3688a865a293b46778a13e246f3bb6db Mon Sep 17 00:00:00 2001 From: Alex Byrne Date: Thu, 15 Aug 2024 12:17:06 +0100 Subject: [PATCH 082/165] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 8fef067..0071789 100644 --- a/README.md +++ b/README.md @@ -32,7 +32,7 @@ hairpin -h ### ASSUMPTIONS -`hairpin2` is designed for paired data where BAM records have the **MC** tag. If this tag is not present in your data, it can be added using `samtools fixmate` or `biobambam2 bamsormadup`. The tool expects data specifically in the VCF and BAM formats; support for a wider variety of formats could be implemented if desired. No further assumptions are made – other BAM tags and VCF fields are used, however they are mandatory per the format specification. +`hairpin2` is designed for paired data where BAM records have the **MC** tag. If this tag is not present in your data, it can be added using `samtools fixmate` or `biobambam2 bamsormadup`. The tool expects data specifically in the VCF and BAM formats; support for a wider variety of formats could be implemented if desired. No further assumptions are made – other BAM tags and VCF fields are used, however they are mandatory per the format specifications. ### USAGE From f3dd13faaf0e8b040d579cd028f637e4310cbc9a Mon Sep 17 00:00:00 2001 From: Alex Byrne Date: Thu, 15 Aug 2024 12:21:17 +0100 Subject: [PATCH 083/165] Update README.md --- README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 0071789..62f5e64 100644 --- a/README.md +++ b/README.md @@ -80,9 +80,9 @@ procedural: Parameters are hopefully mostly clear from the helptext, but some warrant further explanation: -> `--al-filter-threshold` – the default value of 0.93 was arrived at by trial and error – since different aligners/platforms calculate alignment score differently, you may want to modify this value appropriately. 
-> `--max-read-span` – long homopolymer tracts can cause stuttering, where a PCR duplicate will have, for example, an additional A in a tract of As. These reads will align a base or two earlier on the reference genome than they should. As a result pcr duplicate flag machinery fails and they are not flagged as duplicates. `MAX_READ_SPAN` is then the maximum +- position to use when detecting PCR duplicates. -> `--position-fraction` – cruciform artefacts usually contain segments that do not align to the reference genome, resulting in the segment being soft-clipped. The subsequent aligned portion will then contain false variants, which arise from the artefact. These false variants appear with anomalous regularity at alignment boundaries – unlike true variants. If, for a given variant, more than 90% of the variant bases are within `POSITION_FRACTION` of read edges, allow for calling **HPF** flag. +- `--al-filter-threshold` – the default value of 0.93 was arrived at by trial and error – since different aligners/platforms calculate alignment score differently, you may want to modify this value appropriately. +- `--max-read-span` – long homopolymer tracts can cause stuttering, where a PCR duplicate will have, for example, an additional A in a tract of As. These reads will align a base or two earlier on the reference genome than they should. As a result pcr duplicate flag machinery fails and they are not flagged as duplicates. `MAX_READ_SPAN` is then the maximum +- position to use when detecting PCR duplicates. +- `--position-fraction` – cruciform artefacts usually contain segments that do not align to the reference genome, resulting in the segment being soft-clipped. The subsequent aligned portion will then contain false variants, which arise from the artefact. These false variants appear with anomalous regularity at alignment boundaries – unlike true variants. 
If, for a given variant, more than 90% of the variant bases are within `POSITION_FRACTION` of read edges, allow for calling **HPF** flag. ### DETAILS From f3df35808e7507edf542a3cf22be536a068f3f18 Mon Sep 17 00:00:00 2001 From: Alex Byrne Date: Thu, 15 Aug 2024 12:23:25 +0100 Subject: [PATCH 084/165] Update internal_doc.md --- internal_doc.md | 44 +++++++++++++++++++++----------------------- 1 file changed, 21 insertions(+), 23 deletions(-) diff --git a/internal_doc.md b/internal_doc.md index 92251f2..2d24084 100644 --- a/internal_doc.md +++ b/internal_doc.md @@ -7,7 +7,6 @@ For paired data, given a VCF, and BAM files for the samples of that VCF, return `hairpin2` should replace, as far as is possible, the tools known as "Mathijs' Scripts", "AdditionalBamStatistics", "Tim Butler's Scripts" and, unfortunately, probably many other names. It also supersedes `hairpin`, a stopgap version /of Mathijs' Scripts that relied on some of Mathijs' original code, and therefore was unreliable and error prone (though less so than the raw scripts themselves). However, this incarnation is not a total replacement for Mathijs' Scripts at this time (and has changed in functionality since the stopgap tool, the original hairpin): -> """ > Mathjis LCM filters includes the following steps: > 1. Preselect: Filters the CaVEMan calls for “PASS” && “CLPM=0” && “ASMD>=140” > 2. Hairpin Filtering @@ -21,16 +20,16 @@ However, this incarnation is not a total replacement for Mathijs' Scripts at thi > - filtering > > The `hairpin2` module replaces the “additionalBAMStatistics” and most of the “filtering” code. So [one may still need] to run the preselect and fragment based filter. 
-> """ Since the versions available of "Mathijs' Scripts" are many and varied, we cannot account for all differences/changes, but in general: -> - No more ambiguous/cryptic/unfixable errors - the tool should work on all appropriate data, and if it is unable to produce the expected output it will clearly inform the user (but see N.B. at end of this section) -> - Transparency - reasoning for flagging decisions logged in VCF -> - Single tool centrally maintained and versioned - for reproducibility/citing/distribution -> - Significant speedup (on testing data at least) – 50s runtime on 542-variant caveman VCF -> - The module adds **filter flags**, **HPF** and **ALF**, to a VCF. It **does not** output into separate files containing passed and failed positions -> - The module **does not** prefilter, or perform fragment filtering -> With regard to prefiltering - this is not performed by this module, as the filtering is not relevant to hairpin detection and should be performed separately. Filtering can be performed using the `vcfilter` or `bcftools` modules. +- No more ambiguous/cryptic/unfixable errors - the tool should work on all appropriate data, and if it is unable to produce the expected output it will clearly inform the user (but see N.B. at end of this section) +- Transparency - reasoning for flagging decisions logged in VCF +- Single tool centrally maintained and versioned - for reproducibility/citing/distribution +- Significant speedup (on testing data at least) – 50s runtime on 542-variant caveman VCF +- The module adds **filter flags**, **HPF** and **ALF**, to a VCF. It **does not** output into separate files containing passed and failed positions +- The module **does not** prefilter, or perform fragment filtering + +With regard to prefiltering - this is not performed by this module, as the filtering is not relevant to hairpin detection and should be performed separately. Filtering can be performed using the `vcfilter` or `bcftools` modules. 
**N.B.** this program is currently in an alpha/testing phase - it is available on the farm, but is likely to change, or have new features added, rapidly, per user responses. **It also may be broken in some way; if so please get in touch**. It is not currently publicly available - it will be made public as soon as it is out of this alpha phase. @@ -98,9 +97,9 @@ procedural: Parameters are hopefully mostly clear from the helptext, but some warrant further explanation: -> `--al-filter-threshold` – the default value of 0.93 was arrived at by trial and error – since different aligners/platforms calculate alignment score differently, you may want to modify this value appropriately. -> `--max-read-span` – long homopolymer tracts can cause stuttering, where a PCR duplicate will have, for example, an additional A in a tract of As. These reads will align a base or two earlier on the reference genome than they should. As a result pcr duplicate flag machinery fails and they are not flagged as duplicates. `MAX_READ_SPAN` is then the maximum +- position to use when detecting PCR duplicates. -> `--position-fraction` – cruciform artefacts usually contain segments that do not align to the reference genome, resulting in the segment being soft-clipped. The subsequent aligned portion will then contain false variants, which arise from the artefact. These false variants appear with anomalous regularity at alignment boundaries – unlike true variants. If, for a given variant, more than 90% of the variant bases are within `POSITION_FRACTION` of read edges, allow for calling **HPF** flag. +- `--al-filter-threshold` – the default value of 0.93 was arrived at by trial and error – since different aligners/platforms calculate alignment score differently, you may want to modify this value appropriately. +- `--max-read-span` – long homopolymer tracts can cause stuttering, where a PCR duplicate will have, for example, an additional A in a tract of As. 
These reads will align a base or two earlier on the reference genome than they should. As a result pcr duplicate flag machinery fails and they are not flagged as duplicates. `MAX_READ_SPAN` is then the maximum +- position to use when detecting PCR duplicates. +- `--position-fraction` – cruciform artefacts usually contain segments that do not align to the reference genome, resulting in the segment being soft-clipped. The subsequent aligned portion will then contain false variants, which arise from the artefact. These false variants appear with anomalous regularity at alignment boundaries – unlike true variants. If, for a given variant, more than 90% of the variant bases are within `POSITION_FRACTION` of read edges, allow for calling **HPF** flag. @@ -108,16 +107,15 @@ Parameters are hopefully mostly clear from the helptext, but some warrant furthe The tool tests records in a VCF file and applies the **HPF** and **ALF** filter flags as appropriate. Reasoning for decisions is recorded in the INFO field of the VCF records, in the form `HPF=|` and `ALF=||`. The codes are as follows: -> **0** – passed/failed on condition 60A(i) of Ellis et al. (HPF only) -> **1** – passed/failed on condition 60B(i) of Ellis et al. (HPF only) -> **2** – passed/failed on filter threshold (ALF only) -> **3** – insufficient appropriate reads to support calling flag (pass only) (This covers a lot of possiblities, if more granularity is desired, please request it) -> **4** – no samples have non 0,0 genotype for the record (pass only) +**0** – passed/failed on condition 60A(i) of Ellis et al. (HPF only) +**1** – passed/failed on condition 60B(i) of Ellis et al. 
(HPF only) +**2** – passed/failed on filter threshold (ALF only) +**3** – insufficient appropriate reads to support calling flag (pass only) (This covers a lot of possiblities, if more granularity is desired, please request it) +**4** – no samples have non 0,0 genotype for the record (pass only) -The basic procedure of this implementation is as follows: -> For each record in the VCF, test every alt for that record by: -> 1. retrieving reads from samples exhibiting the mutations -> 2. testing each read for validity for use in hairpin testing (i.e. base quality, do they express the correct alt, and so on) -> 3. performing statistical analysis on aggregates of the position of the mutation relative to the start and end of the aligned portion of the reads -> 4. on the results of the statistical analysis, pass or fail the record for the filters **ALF** and **HPF**, and log a code and relevant info to the **INFO** field indicating the reason for the decision \ No newline at end of file +The basic procedure of this implementation is as follows. For each record in the VCF, test every alt for that record by: +1. retrieving reads from samples exhibiting the mutations +2. testing each read for validity for use in hairpin testing (i.e. base quality, do they express the correct alt, and so on) +3. performing statistical analysis on aggregates of the position of the mutation relative to the start and end of the aligned portion of the reads +4. 
on the results of the statistical analysis, pass or fail the record for the filters **ALF** and **HPF**, and log a code and relevant info to the **INFO** field indicating the reason for the decision \ No newline at end of file From b0c809b2342c833d8713ebb052581bff920d459a Mon Sep 17 00:00:00 2001 From: ab63 Date: Thu, 15 Aug 2024 14:47:31 +0100 Subject: [PATCH 085/165] untested cram support --- README.md | 4 +- hairpin2/helpers.py | 8 +--- hairpin2/main.py | 93 +++++++++++++++++++++++++-------------------- internal_doc.md | 4 +- 4 files changed, 56 insertions(+), 53 deletions(-) diff --git a/README.md b/README.md index 62f5e64..19226e4 100644 --- a/README.md +++ b/README.md @@ -32,7 +32,7 @@ hairpin -h ### ASSUMPTIONS -`hairpin2` is designed for paired data where BAM records have the **MC** tag. If this tag is not present in your data, it can be added using `samtools fixmate` or `biobambam2 bamsormadup`. The tool expects data specifically in the VCF and BAM formats; support for a wider variety of formats could be implemented if desired. No further assumptions are made – other BAM tags and VCF fields are used, however they are mandatory per the format specifications. +`hairpin2` is designed for paired data where alignment records have the `MC` tag and the complete CIGAR string is present in the `CIGAR` field (rather than the `CG:B,I` tag). If the `MC` tag is not present in your data, it can be added using `samtools fixmate` or `biobambam2 bamsormadup`. No further assumptions are made – other alignment tags and VCF fields are used, however they are mandatory per the relevant format specifications. ### USAGE @@ -103,4 +103,4 @@ The basic procedure of this implementation is as follows: > 3. performing statistical analysis on aggregates of the position of the mutation relative to the start and end of the aligned portion of the reads > 4. 
on the results of the statistical analysis, pass or fail the record for the filters **ALF** and **HPF**, and log a code and relevant info to the **INFO** field indicating the reason for the decision -The code has been written with the intention of clarity and extensibility – further understanding may be achieved by reading `hairpin2/main.py`. \ No newline at end of file +The code has been written with the intention of clarity and extensibility – further understanding may be achieved by reading `hairpin2/main.py`. diff --git a/hairpin2/helpers.py b/hairpin2/helpers.py index 785edfc..2df74ea 100644 --- a/hairpin2/helpers.py +++ b/hairpin2/helpers.py @@ -14,16 +14,10 @@ def cleanup(code: int = c.EXIT_FAILURE, msg: None | str = None) -> None: # <= - is subset of def verify_json(jd: dict) -> bool: - return jd.keys() <= {'vcf_in', 'vcf_out', 'bams', 'input_json', 'ouput_json', 'name_mapping', 'al_filter_threshold', 'min_clip_quality', 'min_mapping_quality', 'min_base_quality', 'max_read_span', 'position_fraction'} + return jd.keys() <= {'vcf_in', 'vcf_out', 'alignments', 'format', 'name_mapping', 'al_filter_threshold', 'min_clip_quality', 'min_mapping_quality', 'min_base_quality', 'max_read_span', 'position_fraction'} def test_options(args): - if not args.vcf_in: - cleanup(msg='--vcf-in required') - if not args.vcf_out: - cleanup(msg='--vcf-out required') - if not args.bams: - cleanup(msg='--bams required') if not (0 < args.min_clip_quality < 93): cleanup(msg='invalid --min-clip-quality; range 0-93') if not (0 < args.min_mapping_quality < 60): diff --git a/hairpin2/main.py b/hairpin2/main.py index 78b9242..e47c7d3 100644 --- a/hairpin2/main.py +++ b/hairpin2/main.py @@ -7,7 +7,6 @@ from itertools import tee from functools import partial -# CIGAR best retrieved from CG:B,I tag - implement in future def validate_read( vcf_record: pysam.VariantRecord, read: pysam.AlignedSegment, @@ -115,7 +114,7 @@ def validate_read( def test_variant( vcf_rec: pysam.VariantRecord, - 
mutant_bams: dict[str, pysam.AlignmentFile], + mutant_alignments: dict[str, pysam.AlignmentFile], alt: str, al_thresh: float, max_span: int, @@ -126,15 +125,15 @@ def test_variant( hp_filt = c.HPFilter() al_filt = c.ALFilter() - mut_reads: dict[str, list[pysam.AlignedSegment]] = {key: [] for key in mutant_bams} - mut_reads_log: dict[str, list[tuple]] = {key: [] for key in mutant_bams} + mut_reads: dict[str, list[pysam.AlignedSegment]] = {key: [] for key in mutant_alignments} + mut_reads_log: dict[str, list[tuple]] = {key: [] for key in mutant_alignments} mut_read_pos_f: list[int] = [] mut_read_pos_r: list[int] = [] mut_read_fracs_f: list[float] = [] mut_read_fracs_r: list[float] = [] aln_scores: list[float] = [] - for mut_sample, bam in mutant_bams.items(): + for mut_sample, bam in mutant_alignments.items(): read_iter, test_iter = tee(bam.fetch(vcf_rec.chrom, vcf_rec.start, (vcf_rec.start + 1))) try: next(test_iter) @@ -230,7 +229,7 @@ def test_variant( def test_record_per_alt( - bams: dict[str, pysam.AlignmentFile], + alignments: dict[str, pysam.AlignmentFile], vcf_rec: pysam.VariantRecord, variant_tester: c.FiltReturn, ) -> dict[str, c.Filters]: @@ -241,10 +240,10 @@ def test_record_per_alt( if len(samples_w_mutants) == 0: raise c.NoMutants - bams_w_mutants = {k: v for k, v in bams.items() if k in samples_w_mutants} + alignments_w_mutants = {k: v for k, v in alignments.items() if k in samples_w_mutants} filt_d = {} for alt in vcf_rec.alts: - filt_d[alt] = variant_tester(vcf_rec, bams_w_mutants, alt) + filt_d[alt] = variant_tester(vcf_rec, alignments_w_mutants, alt) return filt_d @@ -254,10 +253,11 @@ def main_cli() -> None: parser = argparse.ArgumentParser(prog="hairpin2", description='cruciform artefact flagging algorithm based on Ellis et al. 
2020 (DOI: 10.1038/s41596-020-00437-6)') parser._optionals.title = 'info' parser.add_argument('-v', '--version', help='print version', action='version', version=c.VERSION) - req = parser.add_argument_group('basic') - req.add_argument('-i', '--vcf-in', help="path to input VCF") - req.add_argument('-o', '--vcf-out', help="path to write output VCF") - req.add_argument('-b', '--bams', help="list of paths to BAMs for samples in input VCF, whitespace separated", nargs='+') + req = parser.add_argument_group('mandatory') + req.add_argument('-i', '--vcf-in', help="path to input VCF", required=True) + req.add_argument('-o', '--vcf-out', help="path to write output VCF", required=True) + req.add_argument('-a', '--alignments', help="list of paths to S/B/CR/AMs (indicated by --format) for samples in input VCF, whitespace separated", nargs='+', required=True) + req.add_argument('-f', "--format", help="format of alignment files; s indicates SAM, b indicates BAM, and c indicates CRAM", choices=["s", "b", "c"], type=str, required=True) opt = parser.add_argument_group('extended') opt.add_argument('-al', '--al-filter-threshold', help='threshhold for median of read alignment score per base of all relevant reads, below which a variant is flagged as ALF - default: 0.93', type=float) opt.add_argument('-mc', '--min-clip-quality', help='discard reads with mean base quality of aligned bases below this value, if they have soft-clipped bases - default: 35', type=int) @@ -266,6 +266,7 @@ def main_cli() -> None: opt.add_argument('-ms', '--max-read-span', help='maximum +- position to use when detecting PCR duplicates - default: 6', type=int) opt.add_argument('-pf', '--position-fraction', help='>90%% of variant must occur within POSITION_FRACTION of read edges to allow HPF flag - default: 0.15', type=float) proc = parser.add_argument_group('procedural') + proc.add_argument('-r', '--cram-reference', help="path to FASTA format CRAM reference, overrides $REF_PATH and UR tags - ignored if --format is 
not CRAM") proc.add_argument('-m', '--name-mapping', help='map VCF sample names to BAM SM tags; useful if they differ', metavar='VCF:BAM', nargs='+') proc.add_argument('-ji', '--input-json', help='path to JSON of input parameters; overridden by arguments provided on command line', type=str) proc.add_argument('-jo', '--output-json', help='log input arguments to JSON', type=str) @@ -295,13 +296,6 @@ def main_cli() -> None: # test args are sensible, exit if not h.test_options(args) - if args.output_json: - try: - with open(args.output_json, "w") as output_json: - json.dump({k: vars(args)[k] for k in (vars(args).keys() - {'input_json', 'output_json'})}, output_json, indent="") - except Exception as e: - h.cleanup(msg='failed to write output JSON, reporting: {}'.format(e)) - primed_validate_read = partial(validate_read, min_mapqual=args.min_mapping_quality, min_clipqual=args.min_clip_quality, @@ -313,53 +307,60 @@ def main_cli() -> None: vcf_in_handle = pysam.VariantFile(args.vcf_in) except Exception as e: h.cleanup(msg='failed to open VCF input, reporting: {}'.format(e)) - sample_names = list(vcf_in_handle.header.samples) # type:ignore if len(set(sample_names)) != len(sample_names): h.cleanup(msg='duplicate sample names in VCF') sample_names: set[str] = set(sample_names) - vcf_sample_to_bam_file_map: dict[str, pysam.AlignmentFile] = {} - for path in args.bams: + + vcf_sample_to_alignment_map: dict[str, pysam.AlignmentFile] = {} + match args.format: + case "s": + mode = "r" + case "b": + mode = "rb" + case "c": + mode = "rc" + for path in args.alignments: try: - bam = pysam.AlignmentFile(path, 'rb') + alignment = pysam.AlignmentFile(path, mode, reference_filename=args.cram_reference if args.cram_reference and args.format == "c" else None) except Exception as e: - h.cleanup(msg='failed to read BAM at {}, reporting: {}'.format(path, e)) + h.cleanup(msg='failed to read alignment file at {}, reporting: {}'.format(path, e)) # grab the sample name from first SM field # in 
header field RG # this may cause problems? # check with Peter - bam_sample_name = bam.header.to_dict()['RG'][0]['SM'] # type:ignore - vcf_sample_to_bam_file_map[bam_sample_name] = bam # type:ignore + alignment_sample_name = alignment.header.to_dict()['RG'][0]['SM'] + vcf_sample_to_alignment_map[alignment_sample_name] = alignment if args.name_mapping: - if len(args.name_mapping) > len(args.bams): - h.cleanup(msg="more name mappings than BAMs provided") + if len(args.name_mapping) > len(args.alignments): + h.cleanup(msg="more name mappings than alignments provided") vcf_map_names = [] - bam_map_names = [] + alignment_map_names = [] for pair in args.name_mapping: kv_split = pair.split(':') # VCF:BAM if len(kv_split) != 2: h.cleanup(msg='name mapping misformatted, more than two elements in map string {}'.format(pair)) vcf_map_names.append(kv_split[0]) - bam_map_names.append(kv_split[1]) + alignment_map_names.append(kv_split[1]) if h.has_duplicates(vcf_map_names): h.cleanup(msg='duplicate VCF sample names provided to name mapping flag') if not set(vcf_map_names) <= sample_names: h.cleanup(msg="VCF sample names provided to name mapping flag are not equal to, or a subset of, VCF sample names as retrieved from VCF") - if h.has_duplicates(bam_map_names): - h.cleanup(msg='duplicate BAM sample names provided to name mapping flag') - if h.lists_not_equal(bam_map_names, vcf_sample_to_bam_file_map.keys()): # type:ignore - h.cleanup(msg='BAM sample names provided to name mapping flag do not match BAM SM tags') - vcf_sample_to_bam_file_map = {vcf_map_names[bam_map_names.index(k)]: v for k, v in vcf_sample_to_bam_file_map.items()} + if h.has_duplicates(alignment_map_names): + h.cleanup(msg='duplicate aligment sample names provided to name mapping flag') + if h.lists_not_equal(alignment_map_names, vcf_sample_to_alignment_map.keys()): + h.cleanup(msg='alignment sample names provided to name mapping flag do not match alignment SM tags') + vcf_sample_to_alignment_map = 
{vcf_map_names[alignment_map_names.index(k)]: v for k, v in vcf_sample_to_alignment_map.items()} else: - if not vcf_sample_to_bam_file_map.keys() <= sample_names: - h.cleanup(msg='BAM SM tags do not match VCF sample names: {}'.format(vcf_sample_to_bam_file_map.keys() - sample_names)) - if sample_names != vcf_sample_to_bam_file_map.keys(): - logging.info("BAMs not provided for all VCF samples; {} will be ignored".format(sample_names - vcf_sample_to_bam_file_map.keys())) + if not vcf_sample_to_alignment_map.keys() <= sample_names: + h.cleanup(msg='BAM SM tags do not match VCF sample names: {}'.format(vcf_sample_to_alignment_map.keys() - sample_names)) + if sample_names != vcf_sample_to_alignment_map.keys(): + logging.info("alignments not provided for all VCF samples; {} will be ignored".format(sample_names - vcf_sample_to_alignment_map.keys())) # init output out_head = vcf_in_handle.header # type:ignore - out_head.add_line("##FILTER=".format(args.al_filter_threshold, ', '.join(vcf_sample_to_bam_file_map.keys()))) - out_head.add_line("##FILTER=".format(', '.join(vcf_sample_to_bam_file_map.keys()))) + out_head.add_line("##FILTER=".format(args.al_filter_threshold, ', '.join(vcf_sample_to_alignment_map.keys()))) + out_head.add_line("##FILTER=".format(', '.join(vcf_sample_to_alignment_map.keys()))) out_head.add_line("##INFO=") out_head.add_line("##INFO=") @@ -368,10 +369,18 @@ def main_cli() -> None: except Exception as e: h.cleanup(msg='failed to open VCF output, reporting: {}'.format(e)) + # write args once all verified + if args.output_json: + try: + with open(args.output_json, "w") as output_json: + json.dump({k: vars(args)[k] for k in (vars(args).keys() - {'input_json', 'output_json'})}, output_json, indent="") + except Exception as e: + h.cleanup(msg='failed to write output JSON, reporting: {}'.format(e)) + for record in vcf_in_handle.fetch(): # type:ignore try: filter_d: dict[str, c.Filters] = test_record_per_alt( - bams=vcf_sample_to_bam_file_map, + 
alignments=vcf_sample_to_alignment_map, vcf_rec=record, variant_tester=primed_variant_tester ) diff --git a/internal_doc.md b/internal_doc.md index 2d24084..4eae276 100644 --- a/internal_doc.md +++ b/internal_doc.md @@ -47,7 +47,7 @@ module load ### ASSUMPTIONS -`hairpin2` is designed for paired data where BAM records have the **MC** tag. If this tag is not present in your data, it can be added using `samtools fixmate` or `biobambam2 bamsormadup`. The tool expects data specifically in the VCF and BAM formats; support for a wider variety of formats could be implemented if desired. No further assumptions are made – other BAM tags and VCF fields are used, however they are mandatory per the format specifications. +`hairpin2` is designed for paired data where alignment records have the `MC` tag and the complete CIGAR string is present in the `CIGAR` field (rather than the `CG:B,I` tag). If the `MC` tag is not present in your data, it can be added using `samtools fixmate` or `biobambam2 bamsormadup`. No further assumptions are made – other alignment tags and VCF fields are used, however they are mandatory per the relevant format specifications. ### USAGE @@ -118,4 +118,4 @@ The basic procedure of this implementation is as follows. For each record in the 1. retrieving reads from samples exhibiting the mutations 2. testing each read for validity for use in hairpin testing (i.e. base quality, do they express the correct alt, and so on) 3. performing statistical analysis on aggregates of the position of the mutation relative to the start and end of the aligned portion of the reads -4. on the results of the statistical analysis, pass or fail the record for the filters **ALF** and **HPF**, and log a code and relevant info to the **INFO** field indicating the reason for the decision \ No newline at end of file +4. 
on the results of the statistical analysis, pass or fail the record for the filters **ALF** and **HPF**, and log a code and relevant info to the **INFO** field indicating the reason for the decision From 7d2046b74cb0ca8e3fe6b94ab97ba845cc3c275a Mon Sep 17 00:00:00 2001 From: Alex Byrne Date: Thu, 15 Aug 2024 14:49:26 +0100 Subject: [PATCH 086/165] Update internal_doc.md --- internal_doc.md | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/internal_doc.md b/internal_doc.md index 4eae276..c19a743 100644 --- a/internal_doc.md +++ b/internal_doc.md @@ -53,8 +53,8 @@ module load ### USAGE ``` -usage: hairpin2 [-h] [-v] [-i VCF_IN] [-o VCF_OUT] [-b BAMS [BAMS ...]] [-al AL_FILTER_THRESHOLD] [-mc MIN_CLIP_QUALITY] [-mq MIN_MAPPING_QUALITY] [-mb MIN_BASE_QUALITY] - [-ms MAX_READ_SPAN] [-pf POSITION_FRACTION] [-m VCF:BAM [VCF:BAM ...]] [-ji INPUT_JSON] [-jo OUTPUT_JSON] +usage: hairpin2 [-h] [-v] -i VCF_IN -o VCF_OUT -a ALIGNMENTS [ALIGNMENTS ...] -f {s,b,c} [-al AL_FILTER_THRESHOLD] [-mc MIN_CLIP_QUALITY] [-mq MIN_MAPPING_QUALITY] + [-mb MIN_BASE_QUALITY] [-ms MAX_READ_SPAN] [-pf POSITION_FRACTION] [-r CRAM_REFERENCE] [-m VCF:BAM [VCF:BAM ...]] [-ji INPUT_JSON] [-jo OUTPUT_JSON] cruciform artefact flagging algorithm based on Ellis et al. 2020 (DOI: 10.1038/s41596-020-00437-6) @@ -62,13 +62,15 @@ info: -h, --help show this help message and exit -v, --version print version -basic: +mandatory: -i VCF_IN, --vcf-in VCF_IN path to input VCF -o VCF_OUT, --vcf-out VCF_OUT path to write output VCF - -b BAMS [BAMS ...], --bams BAMS [BAMS ...] - list of paths to BAMs for samples in input VCF, whitespace separated + -a ALIGNMENTS [ALIGNMENTS ...], --alignments ALIGNMENTS [ALIGNMENTS ...] 
+ list of paths to S/B/CR/AMs (indicated by --format) for samples in input VCF, whitespace separated + -f {s,b,c}, --format {s,b,c} + format of alignment files; s indicates SAM, b indicates BAM, and c indicates CRAM extended: -al AL_FILTER_THRESHOLD, --al-filter-threshold AL_FILTER_THRESHOLD @@ -85,6 +87,8 @@ extended: >90% of variant must occur within POSITION_FRACTION of read edges to allow HPF flag - default: 0.15 procedural: + -r CRAM_REFERENCE, --cram-reference CRAM_REFERENCE + path to FASTA format CRAM reference, overrides $REF_PATH and UR tags - ignored if --format is not CRAM -m VCF:BAM [VCF:BAM ...], --name-mapping VCF:BAM [VCF:BAM ...] map VCF sample names to BAM SM tags; useful if they differ -ji INPUT_JSON, --input-json INPUT_JSON From 4bd87cc3365da19d2a00f3140d1d272f08b39136 Mon Sep 17 00:00:00 2001 From: Alex Byrne Date: Thu, 15 Aug 2024 14:49:50 +0100 Subject: [PATCH 087/165] Update README.md --- README.md | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 19226e4..3f2c2bf 100644 --- a/README.md +++ b/README.md @@ -38,8 +38,8 @@ hairpin -h ### USAGE ``` -usage: hairpin2 [-h] [-v] [-i VCF_IN] [-o VCF_OUT] [-b BAMS [BAMS ...]] [-al AL_FILTER_THRESHOLD] [-mc MIN_CLIP_QUALITY] [-mq MIN_MAPPING_QUALITY] [-mb MIN_BASE_QUALITY] - [-ms MAX_READ_SPAN] [-pf POSITION_FRACTION] [-m VCF:BAM [VCF:BAM ...]] [-ji INPUT_JSON] [-jo OUTPUT_JSON] +usage: hairpin2 [-h] [-v] -i VCF_IN -o VCF_OUT -a ALIGNMENTS [ALIGNMENTS ...] -f {s,b,c} [-al AL_FILTER_THRESHOLD] [-mc MIN_CLIP_QUALITY] [-mq MIN_MAPPING_QUALITY] + [-mb MIN_BASE_QUALITY] [-ms MAX_READ_SPAN] [-pf POSITION_FRACTION] [-r CRAM_REFERENCE] [-m VCF:BAM [VCF:BAM ...]] [-ji INPUT_JSON] [-jo OUTPUT_JSON] cruciform artefact flagging algorithm based on Ellis et al. 
2020 (DOI: 10.1038/s41596-020-00437-6) @@ -47,13 +47,15 @@ info: -h, --help show this help message and exit -v, --version print version -basic: +mandatory: -i VCF_IN, --vcf-in VCF_IN path to input VCF -o VCF_OUT, --vcf-out VCF_OUT path to write output VCF - -b BAMS [BAMS ...], --bams BAMS [BAMS ...] - list of paths to BAMs for samples in input VCF, whitespace separated + -a ALIGNMENTS [ALIGNMENTS ...], --alignments ALIGNMENTS [ALIGNMENTS ...] + list of paths to S/B/CR/AMs (indicated by --format) for samples in input VCF, whitespace separated + -f {s,b,c}, --format {s,b,c} + format of alignment files; s indicates SAM, b indicates BAM, and c indicates CRAM extended: -al AL_FILTER_THRESHOLD, --al-filter-threshold AL_FILTER_THRESHOLD @@ -70,6 +72,8 @@ extended: >90% of variant must occur within POSITION_FRACTION of read edges to allow HPF flag - default: 0.15 procedural: + -r CRAM_REFERENCE, --cram-reference CRAM_REFERENCE + path to FASTA format CRAM reference, overrides $REF_PATH and UR tags - ignored if --format is not CRAM -m VCF:BAM [VCF:BAM ...], --name-mapping VCF:BAM [VCF:BAM ...] 
map VCF sample names to BAM SM tags; useful if they differ -ji INPUT_JSON, --input-json INPUT_JSON From 885d35a494a3708c08f3124edc3b1ac3019a5cb0 Mon Sep 17 00:00:00 2001 From: ab63 Date: Thu, 15 Aug 2024 16:08:32 +0100 Subject: [PATCH 088/165] tested cram support --- .gitignore | 1 + hairpin2/helpers.py | 2 +- hairpin2/main.py | 28 ++++++++++++++-------------- 3 files changed, 16 insertions(+), 15 deletions(-) diff --git a/.gitignore b/.gitignore index c16f40e..95ac7c6 100644 --- a/.gitignore +++ b/.gitignore @@ -7,3 +7,4 @@ __pycache__/ build/ *.txt *.sif +*.json diff --git a/hairpin2/helpers.py b/hairpin2/helpers.py index 2df74ea..a6613e3 100644 --- a/hairpin2/helpers.py +++ b/hairpin2/helpers.py @@ -14,7 +14,7 @@ def cleanup(code: int = c.EXIT_FAILURE, msg: None | str = None) -> None: # <= - is subset of def verify_json(jd: dict) -> bool: - return jd.keys() <= {'vcf_in', 'vcf_out', 'alignments', 'format', 'name_mapping', 'al_filter_threshold', 'min_clip_quality', 'min_mapping_quality', 'min_base_quality', 'max_read_span', 'position_fraction'} + return {'al_filter_threshold', 'min_clip_quality', 'min_mapping_quality', 'min_base_quality', 'max_read_span', 'position_fraction'} <= jd.keys() def test_options(args): diff --git a/hairpin2/main.py b/hairpin2/main.py index e47c7d3..8dbeccf 100644 --- a/hairpin2/main.py +++ b/hairpin2/main.py @@ -42,7 +42,7 @@ def validate_read( read_flag |= c.ValidatorFlags.CLIPQUAL.value # First, check for sub try: - mut_pos, mut_op = r2s.ref2querypos(read, vcf_record.start) # VCF 1-INDEXED, BAM 0-INDEXED (vcf_record.start = 0-indexed mutation position) + mut_pos, mut_op = r2s.ref2querypos(read, vcf_record.start) # VCF 1-INDEXED, alignments 0-INDEXED (vcf_record.start = 0-indexed mutation position) except IndexError: read_flag |= c.ValidatorFlags.NOT_ALIGNED.value else: @@ -133,8 +133,8 @@ def test_variant( mut_read_fracs_r: list[float] = [] aln_scores: list[float] = [] - for mut_sample, bam in mutant_alignments.items(): - read_iter, 
test_iter = tee(bam.fetch(vcf_rec.chrom, vcf_rec.start, (vcf_rec.start + 1))) + for mut_sample, alignment in mutant_alignments.items(): + read_iter, test_iter = tee(alignment.fetch(vcf_rec.chrom, vcf_rec.start, (vcf_rec.start + 1))) try: next(test_iter) except StopIteration: @@ -256,7 +256,7 @@ def main_cli() -> None: req = parser.add_argument_group('mandatory') req.add_argument('-i', '--vcf-in', help="path to input VCF", required=True) req.add_argument('-o', '--vcf-out', help="path to write output VCF", required=True) - req.add_argument('-a', '--alignments', help="list of paths to S/B/CR/AMs (indicated by --format) for samples in input VCF, whitespace separated", nargs='+', required=True) + req.add_argument('-a', '--alignments', help="list of paths to (S/B/CR)AMs (indicated by --format) for samples in input VCF, whitespace separated - (s/b/cr)ai expected in same directories", nargs='+', required=True) req.add_argument('-f', "--format", help="format of alignment files; s indicates SAM, b indicates BAM, and c indicates CRAM", choices=["s", "b", "c"], type=str, required=True) opt = parser.add_argument_group('extended') opt.add_argument('-al', '--al-filter-threshold', help='threshhold for median of read alignment score per base of all relevant reads, below which a variant is flagged as ALF - default: 0.93', type=float) @@ -267,28 +267,25 @@ def main_cli() -> None: opt.add_argument('-pf', '--position-fraction', help='>90%% of variant must occur within POSITION_FRACTION of read edges to allow HPF flag - default: 0.15', type=float) proc = parser.add_argument_group('procedural') proc.add_argument('-r', '--cram-reference', help="path to FASTA format CRAM reference, overrides $REF_PATH and UR tags - ignored if --format is not CRAM") - proc.add_argument('-m', '--name-mapping', help='map VCF sample names to BAM SM tags; useful if they differ', metavar='VCF:BAM', nargs='+') - proc.add_argument('-ji', '--input-json', help='path to JSON of input parameters; overridden by 
arguments provided on command line', type=str) + proc.add_argument('-m', '--name-mapping', help='map VCF sample names to alignment SM tags; useful if they differ', metavar='VCF:aln', nargs='+') + proc.add_argument('-ji', '--input-json', help='path to JSON of input parameters, from which extended arguments will be loaded - overridden by arguments provided on command line', type=str) proc.add_argument('-jo', '--output-json', help='log input arguments to JSON', type=str) args = parser.parse_args() json_config: dict | None = None if args.input_json: - logging.info('args JSON provided, arguments will be loaded from JSON if not present on command line') + logging.info('args JSON provided, extended arguments will be loaded from JSON if not present on command line') try: with open(args.input_json, 'r') as f: json_config = json.load(f) except Exception as e: h.cleanup(msg='failed to open input JSON, reporting: {}'.format(e)) - else: - if not h.verify_json(json_config): # type:ignore - h.cleanup(msg='JSON keys are not subset of available arguments (excluding --input-json and --output_json)') # set arg defaults for k in vars(args).keys(): if not vars(args)[k]: - if json_config and k in json_config.keys(): + if json_config and k in json_config.keys() and k in c.DEFAULTS.keys(): setattr(args, k, json_config[k]) elif k in c.DEFAULTS.keys(): setattr(args, k, c.DEFAULTS[k]) @@ -316,10 +313,13 @@ def main_cli() -> None: match args.format: case "s": mode = "r" + logging.info("SAM format specified") case "b": mode = "rb" + logging.info("BAM format specified") case "c": mode = "rc" + logging.info("CRAM format specified") for path in args.alignments: try: alignment = pysam.AlignmentFile(path, mode, reference_filename=args.cram_reference if args.cram_reference and args.format == "c" else None) @@ -337,7 +337,7 @@ def main_cli() -> None: vcf_map_names = [] alignment_map_names = [] for pair in args.name_mapping: - kv_split = pair.split(':') # VCF:BAM + kv_split = pair.split(':') # VCF:aln 
if len(kv_split) != 2: h.cleanup(msg='name mapping misformatted, more than two elements in map string {}'.format(pair)) vcf_map_names.append(kv_split[0]) @@ -353,7 +353,7 @@ def main_cli() -> None: vcf_sample_to_alignment_map = {vcf_map_names[alignment_map_names.index(k)]: v for k, v in vcf_sample_to_alignment_map.items()} else: if not vcf_sample_to_alignment_map.keys() <= sample_names: - h.cleanup(msg='BAM SM tags do not match VCF sample names: {}'.format(vcf_sample_to_alignment_map.keys() - sample_names)) + h.cleanup(msg='alignment SM tags do not match VCF sample names: {}'.format(vcf_sample_to_alignment_map.keys() - sample_names)) if sample_names != vcf_sample_to_alignment_map.keys(): logging.info("alignments not provided for all VCF samples; {} will be ignored".format(sample_names - vcf_sample_to_alignment_map.keys())) @@ -373,7 +373,7 @@ def main_cli() -> None: if args.output_json: try: with open(args.output_json, "w") as output_json: - json.dump({k: vars(args)[k] for k in (vars(args).keys() - {'input_json', 'output_json'})}, output_json, indent="") + json.dump({k: vars(args)[k] for k in (vars(args).keys() - {'input_json', 'output_json', 'format'})}, output_json, indent="") except Exception as e: h.cleanup(msg='failed to write output JSON, reporting: {}'.format(e)) From c7d0cd254e323d1347c52f0acb937f8c92e5f4b0 Mon Sep 17 00:00:00 2001 From: Alex Byrne Date: Thu, 15 Aug 2024 16:10:11 +0100 Subject: [PATCH 089/165] Update README.md --- README.md | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 3f2c2bf..7b5f0a6 100644 --- a/README.md +++ b/README.md @@ -39,7 +39,7 @@ hairpin -h ``` usage: hairpin2 [-h] [-v] -i VCF_IN -o VCF_OUT -a ALIGNMENTS [ALIGNMENTS ...] 
-f {s,b,c} [-al AL_FILTER_THRESHOLD] [-mc MIN_CLIP_QUALITY] [-mq MIN_MAPPING_QUALITY] - [-mb MIN_BASE_QUALITY] [-ms MAX_READ_SPAN] [-pf POSITION_FRACTION] [-r CRAM_REFERENCE] [-m VCF:BAM [VCF:BAM ...]] [-ji INPUT_JSON] [-jo OUTPUT_JSON] + [-mb MIN_BASE_QUALITY] [-ms MAX_READ_SPAN] [-pf POSITION_FRACTION] [-r CRAM_REFERENCE] [-m VCF:aln [VCF:aln ...]] [-ji INPUT_JSON] [-jo OUTPUT_JSON] cruciform artefact flagging algorithm based on Ellis et al. 2020 (DOI: 10.1038/s41596-020-00437-6) @@ -53,7 +53,7 @@ mandatory: -o VCF_OUT, --vcf-out VCF_OUT path to write output VCF -a ALIGNMENTS [ALIGNMENTS ...], --alignments ALIGNMENTS [ALIGNMENTS ...] - list of paths to S/B/CR/AMs (indicated by --format) for samples in input VCF, whitespace separated + list of paths to (S/B/CR)AMs (indicated by --format) for samples in input VCF, whitespace separated - (s/b/cr)ai expected in same directories -f {s,b,c}, --format {s,b,c} format of alignment files; s indicates SAM, b indicates BAM, and c indicates CRAM @@ -74,10 +74,10 @@ extended: procedural: -r CRAM_REFERENCE, --cram-reference CRAM_REFERENCE path to FASTA format CRAM reference, overrides $REF_PATH and UR tags - ignored if --format is not CRAM - -m VCF:BAM [VCF:BAM ...], --name-mapping VCF:BAM [VCF:BAM ...] - map VCF sample names to BAM SM tags; useful if they differ + -m VCF:aln [VCF:aln ...], --name-mapping VCF:aln [VCF:aln ...] 
+ map VCF sample names to alignment SM tags; useful if they differ -ji INPUT_JSON, --input-json INPUT_JSON - path to JSON of input parameters; overridden by arguments provided on command line + path to JSON of input parameters, from which extended arguments will be loaded - overridden by arguments provided on command line -jo OUTPUT_JSON, --output-json OUTPUT_JSON log input arguments to JSON ``` From d749423e30e317e4d96ccfc58fe5792c4f700498 Mon Sep 17 00:00:00 2001 From: ab63 Date: Thu, 15 Aug 2024 16:10:42 +0100 Subject: [PATCH 090/165] add changelog --- CHANGES.md | 11 +++++++++++ 1 file changed, 11 insertions(+) create mode 100644 CHANGES.md diff --git a/CHANGES.md b/CHANGES.md new file mode 100644 index 0000000..c51e12a --- /dev/null +++ b/CHANGES.md @@ -0,0 +1,11 @@ +### 0.0.2a + +- CRAM/SAM support +- JSON argument input/output +- added ability to ignore samples in VCF (e.g. normal) by not providing alignments for them to the -a flag +- added `--name-mapping` flag to allow mapping VCF sample names to alignment SM tags when they do not match, as these are used to connect VCF samples with alignments +- improved doc, helptext, and argument clarity + +### 0.0.1a + +- first release From a00b359ee7f8ebec4adce6e556c28a71bc18b85a Mon Sep 17 00:00:00 2001 From: Alex Byrne Date: Thu, 15 Aug 2024 16:12:45 +0100 Subject: [PATCH 091/165] Update internal_doc.md --- internal_doc.md | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/internal_doc.md b/internal_doc.md index c19a743..1c56352 100644 --- a/internal_doc.md +++ b/internal_doc.md @@ -54,7 +54,7 @@ module load ``` usage: hairpin2 [-h] [-v] -i VCF_IN -o VCF_OUT -a ALIGNMENTS [ALIGNMENTS ...] 
-f {s,b,c} [-al AL_FILTER_THRESHOLD] [-mc MIN_CLIP_QUALITY] [-mq MIN_MAPPING_QUALITY] - [-mb MIN_BASE_QUALITY] [-ms MAX_READ_SPAN] [-pf POSITION_FRACTION] [-r CRAM_REFERENCE] [-m VCF:BAM [VCF:BAM ...]] [-ji INPUT_JSON] [-jo OUTPUT_JSON] + [-mb MIN_BASE_QUALITY] [-ms MAX_READ_SPAN] [-pf POSITION_FRACTION] [-r CRAM_REFERENCE] [-m VCF:aln [VCF:aln ...]] [-ji INPUT_JSON] [-jo OUTPUT_JSON] cruciform artefact flagging algorithm based on Ellis et al. 2020 (DOI: 10.1038/s41596-020-00437-6) @@ -68,7 +68,7 @@ mandatory: -o VCF_OUT, --vcf-out VCF_OUT path to write output VCF -a ALIGNMENTS [ALIGNMENTS ...], --alignments ALIGNMENTS [ALIGNMENTS ...] - list of paths to S/B/CR/AMs (indicated by --format) for samples in input VCF, whitespace separated + list of paths to (S/B/CR)AMs (indicated by --format) for samples in input VCF, whitespace separated - (s/b/cr)ai expected in same directories -f {s,b,c}, --format {s,b,c} format of alignment files; s indicates SAM, b indicates BAM, and c indicates CRAM @@ -89,10 +89,10 @@ extended: procedural: -r CRAM_REFERENCE, --cram-reference CRAM_REFERENCE path to FASTA format CRAM reference, overrides $REF_PATH and UR tags - ignored if --format is not CRAM - -m VCF:BAM [VCF:BAM ...], --name-mapping VCF:BAM [VCF:BAM ...] - map VCF sample names to BAM SM tags; useful if they differ + -m VCF:aln [VCF:aln ...], --name-mapping VCF:aln [VCF:aln ...] 
+ map VCF sample names to alignment SM tags; useful if they differ -ji INPUT_JSON, --input-json INPUT_JSON - path to JSON of input parameters; overridden by arguments provided on command line + path to JSON of input parameters, from which extended arguments will be loaded - overridden by arguments provided on command line -jo OUTPUT_JSON, --output-json OUTPUT_JSON log input arguments to JSON ``` From 227e5bc4975fe5fcb7fa8a747745d1066cd7c1c3 Mon Sep 17 00:00:00 2001 From: ab63 Date: Thu, 15 Aug 2024 16:27:03 +0100 Subject: [PATCH 092/165] remove obsolete function --- hairpin2/helpers.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/hairpin2/helpers.py b/hairpin2/helpers.py index a6613e3..387a2ea 100644 --- a/hairpin2/helpers.py +++ b/hairpin2/helpers.py @@ -12,11 +12,6 @@ def cleanup(code: int = c.EXIT_FAILURE, msg: None | str = None) -> None: sys.exit(code) -# <= - is subset of -def verify_json(jd: dict) -> bool: - return {'al_filter_threshold', 'min_clip_quality', 'min_mapping_quality', 'min_base_quality', 'max_read_span', 'position_fraction'} <= jd.keys() - - def test_options(args): if not (0 < args.min_clip_quality < 93): cleanup(msg='invalid --min-clip-quality; range 0-93') From e7731d638fd2aca82a4dd178d00965e1ae265ad4 Mon Sep 17 00:00:00 2001 From: ab63 Date: Fri, 16 Aug 2024 12:29:31 +0000 Subject: [PATCH 093/165] bump version --- hairpin2/constants.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hairpin2/constants.py b/hairpin2/constants.py index 64a41db..dabc36b 100644 --- a/hairpin2/constants.py +++ b/hairpin2/constants.py @@ -2,7 +2,7 @@ from typing import Callable import dataclasses as d -VERSION = '0.0.1a' +VERSION = '0.0.2a' EXIT_SUCCESS = 0 EXIT_FAILURE = 1 From f160bdd97d147a50b570e5c3d2e5e5718ba06f73 Mon Sep 17 00:00:00 2001 From: ab63 Date: Fri, 16 Aug 2024 12:34:21 +0000 Subject: [PATCH 094/165] modify dependecy --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml 
b/pyproject.toml index 76e5504..a9f252e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -10,7 +10,7 @@ name = "hairpin2" version = "0.0.2a" requires-python = ">= 3.10" dependencies = [ - 'pysam == 0.22.1' + 'pysam >= 0.22.1' ] [project.scripts] From acf25f9b726b330db1dfe035c54f5606f97f3087 Mon Sep 17 00:00:00 2001 From: ab63 Date: Fri, 16 Aug 2024 13:47:45 +0100 Subject: [PATCH 095/165] small doc improvement --- README.md | 18 +++++++++--------- internal_doc.md | 20 ++++++++++---------- 2 files changed, 19 insertions(+), 19 deletions(-) diff --git a/README.md b/README.md index 7b5f0a6..bb2c898 100644 --- a/README.md +++ b/README.md @@ -2,7 +2,7 @@ `hairpin2` – CLI implementation of the hairpin detection algorithm concieved by [Ellis et al, 2020](https://www.nature.com/articles/s41596-020-00437-6). -For paired data, given a VCF, and BAM files for the samples of that VCF, return a VCF with variants flagged with **HPF** if they are suspected cruciform artefacts, and **ALF** if relevant reads have lower median alignment score per base than a specified threshold. The **ALF** filter indicates poor signal-to-noise, and provides additional confidence in the **HPF** filter – cruciform artefacts usually cause a marked decrease in alignment score. The **ALF** flag also may appear on variants without **HPF**, often indicating other artefacts associated with poor signal-to-noise. +For paired data, given a VCF, and BAM files for the samples of that VCF, return a VCF with variants flagged with `HPF` if they are suspected cruciform artefacts, and `ALF` if relevant reads have lower median alignment score per base than a specified threshold. The `ALF` filter indicates poor signal-to-noise, and provides additional confidence in the `HPF` filter – cruciform artefacts usually cause a marked decrease in alignment score. The `ALF` flag also may appear on variants without `HPF`, often indicating other artefacts associated with poor signal-to-noise. 
### DEPENDENCIES @@ -86,18 +86,18 @@ Parameters are hopefully mostly clear from the helptext, but some warrant furthe - `--al-filter-threshold` – the default value of 0.93 was arrived at by trial and error – since different aligners/platforms calculate alignment score differently, you may want to modify this value appropriately. - `--max-read-span` – long homopolymer tracts can cause stuttering, where a PCR duplicate will have, for example, an additional A in a tract of As. These reads will align a base or two earlier on the reference genome than they should. As a result pcr duplicate flag machinery fails and they are not flagged as duplicates. `MAX_READ_SPAN` is then the maximum +- position to use when detecting PCR duplicates. -- `--position-fraction` – cruciform artefacts usually contain segments that do not align to the reference genome, resulting in the segment being soft-clipped. The subsequent aligned portion will then contain false variants, which arise from the artefact. These false variants appear with anomalous regularity at alignment boundaries – unlike true variants. If, for a given variant, more than 90% of the variant bases are within `POSITION_FRACTION` of read edges, allow for calling **HPF** flag. +- `--position-fraction` – cruciform artefacts usually contain segments that do not align to the reference genome, resulting in the segment being soft-clipped. The subsequent aligned portion will then contain false variants, which arise from the artefact. These false variants appear with anomalous regularity at alignment boundaries – unlike true variants. If, for a given variant, more than 90% of the variant bases are within `POSITION_FRACTION` of read edges, allow for calling `HPF` flag. ### DETAILS -The tool tests records in a VCF file and applies the **HPF** and **ALF** filter flags as appropriate. Reasoning for decisions is recorded in the INFO field of the VCF records, in the form `HPF=|` and `ALF=||`. 
The codes are as follows: +The tool tests records in a VCF file and applies the `HPF` and `ALF` filter flags as appropriate. Reasoning for decisions is recorded in the INFO field of the VCF records, in the form `HPF=|` and `ALF=||`. The codes are as follows: -> **0** – passed/failed on condition 60A(i) of Ellis et al. (HPF only) -> **1** – passed/failed on condition 60B(i) of Ellis et al. (HPF only) -> **2** – passed/failed on filter threshold (ALF only) -> **3** – insufficient appropriate reads to support calling flag (pass only) (This covers a lot of possiblities, if more granularity is desired, please request it) -> **4** – no samples have non 0,0 genotype for the record (pass only) +> **0** – passed/failed on condition 60A(i) of Ellis et al. (`HPF` only) +> **1** – passed/failed on condition 60B(i) of Ellis et al. (`HPF` only) +> **2** – passed/failed on filter threshold (`ALF` only) +> **3** – insufficient appropriate reads to support calling flag - this covers a lot of possiblities, if more granularity is desired, please request it +> **4** – no samples have non 0,0 genotype for the record The basic procedure of this implementation is as follows: @@ -105,6 +105,6 @@ The basic procedure of this implementation is as follows: > 1. retrieving reads from samples exhibiting the mutations > 2. testing each read for validity for use in hairpin testing (i.e. base quality, do they express the correct alt, and so on) > 3. performing statistical analysis on aggregates of the position of the mutation relative to the start and end of the aligned portion of the reads -> 4. on the results of the statistical analysis, pass or fail the record for the filters **ALF** and **HPF**, and log a code and relevant info to the **INFO** field indicating the reason for the decision +> 4. 
on the results of the statistical analysis, pass or fail the record for the filters `ALF` and `HPF`, and log a code and relevant info to the **INFO** field indicating the reason for the decision The code has been written with the intention of clarity and extensibility – further understanding may be achieved by reading `hairpin2/main.py`. diff --git a/internal_doc.md b/internal_doc.md index 1c56352..36ec41f 100644 --- a/internal_doc.md +++ b/internal_doc.md @@ -2,7 +2,7 @@ `hairpin2` - CLI implementation of the hairpin detection algorithm concieved by [Ellis et al, 2020](https://www.nature.com/articles/s41596-020-00437-6). Implemented by Peter Campbell and Alex Byrne (primary contact for this tool - ab63). -For paired data, given a VCF, and BAM files for the samples of that VCF, return a VCF with variants flagged with **HPF** if they are suspected cruciform artefacts, and **ALF** if relevant reads have lower median alignment score per base than a specified threshold. The **ALF** filter indicates poor signal-to-noise, and provides additional confidence in the **HPF** filter – cruciform artefacts usually cause a marked decrease in alignment score. The **ALF** flag also may appear on variants without **HPF**, often indicating other artefacts associated with poor signal-to-noise. +For paired data, given a VCF, and BAM files for the samples of that VCF, return a VCF with variants flagged with `HPF` if they are suspected cruciform artefacts, and `ALF` if relevant reads have lower median alignment score per base than a specified threshold. The `ALF` filter indicates poor signal-to-noise, and provides additional confidence in the `HPF` filter – cruciform artefacts usually cause a marked decrease in alignment score. The `ALF` flag also may appear on variants without `HPF`, often indicating other artefacts associated with poor signal-to-noise. 
`hairpin2` should replace, as far as is possible, the tools known as "Mathijs' Scripts", "AdditionalBamStatistics", "Tim Butler's Scripts" and, unfortunately, probably many other names. It also supersedes `hairpin`, a stopgap version /of Mathijs' Scripts that relied on some of Mathijs' original code, and therefore was unreliable and error prone (though less so than the raw scripts themselves). However, this incarnation is not a total replacement for Mathijs' Scripts at this time (and has changed in functionality since the stopgap tool, the original hairpin): @@ -26,7 +26,7 @@ Since the versions available of "Mathijs' Scripts" are many and varied, we canno - Transparency - reasoning for flagging decisions logged in VCF - Single tool centrally maintained and versioned - for reproducibility/citing/distribution - Significant speedup (on testing data at least) – 50s runtime on 542-variant caveman VCF -- The module adds **filter flags**, **HPF** and **ALF**, to a VCF. It **does not** output into separate files containing passed and failed positions +- The module adds **filter flags**, `HPF` and `ALF`, to a VCF. It **does not** output into separate files containing passed and failed positions - The module **does not** prefilter, or perform fragment filtering With regard to prefiltering - this is not performed by this module, as the filtering is not relevant to hairpin detection and should be performed separately. Filtering can be performed using the `vcfilter` or `bcftools` modules. @@ -103,23 +103,23 @@ Parameters are hopefully mostly clear from the helptext, but some warrant furthe - `--al-filter-threshold` – the default value of 0.93 was arrived at by trial and error – since different aligners/platforms calculate alignment score differently, you may want to modify this value appropriately. - `--max-read-span` – long homopolymer tracts can cause stuttering, where a PCR duplicate will have, for example, an additional A in a tract of As. 
These reads will align a base or two earlier on the reference genome than they should. As a result pcr duplicate flag machinery fails and they are not flagged as duplicates. `MAX_READ_SPAN` is then the maximum +- position to use when detecting PCR duplicates. -- `--position-fraction` – cruciform artefacts usually contain segments that do not align to the reference genome, resulting in the segment being soft-clipped. The subsequent aligned portion will then contain false variants, which arise from the artefact. These false variants appear with anomalous regularity at alignment boundaries – unlike true variants. If, for a given variant, more than 90% of the variant bases are within `POSITION_FRACTION` of read edges, allow for calling **HPF** flag. +- `--position-fraction` – cruciform artefacts usually contain segments that do not align to the reference genome, resulting in the segment being soft-clipped. The subsequent aligned portion will then contain false variants, which arise from the artefact. These false variants appear with anomalous regularity at alignment boundaries – unlike true variants. If, for a given variant, more than 90% of the variant bases are within `POSITION_FRACTION` of read edges, allow for calling `HPF` flag. ### DETAILS -The tool tests records in a VCF file and applies the **HPF** and **ALF** filter flags as appropriate. Reasoning for decisions is recorded in the INFO field of the VCF records, in the form `HPF=|` and `ALF=||`. The codes are as follows: +The tool tests records in a VCF file and applies the `HPF` and `ALF` filter flags as appropriate. Reasoning for decisions is recorded in the INFO field of the VCF records, in the form `HPF=|` and `ALF=||`. The codes are as follows: -**0** – passed/failed on condition 60A(i) of Ellis et al. (HPF only) -**1** – passed/failed on condition 60B(i) of Ellis et al. 
(HPF only) -**2** – passed/failed on filter threshold (ALF only) -**3** – insufficient appropriate reads to support calling flag (pass only) (This covers a lot of possiblities, if more granularity is desired, please request it) -**4** – no samples have non 0,0 genotype for the record (pass only) +**0** – passed/failed on condition 60A(i) of Ellis et al. (`HPF` only) +**1** – passed/failed on condition 60B(i) of Ellis et al. (`HPF` only) +**2** – passed/failed on filter threshold (`ALF` only) +**3** – insufficient appropriate reads to support calling flag - this covers a lot of possiblities, if more granularity is desired, please request it +**4** – no samples have non 0,0 genotype for the record The basic procedure of this implementation is as follows. For each record in the VCF, test every alt for that record by: 1. retrieving reads from samples exhibiting the mutations 2. testing each read for validity for use in hairpin testing (i.e. base quality, do they express the correct alt, and so on) 3. performing statistical analysis on aggregates of the position of the mutation relative to the start and end of the aligned portion of the reads -4. on the results of the statistical analysis, pass or fail the record for the filters **ALF** and **HPF**, and log a code and relevant info to the **INFO** field indicating the reason for the decision +4. 
on the results of the statistical analysis, pass or fail the record for the filters `ALF` and `HPF`, and log a code and relevant info to the **INFO** field indicating the reason for the decision From a8dd8455d16127e5bd25311f28339e3a25314777 Mon Sep 17 00:00:00 2001 From: Alex Byrne Date: Fri, 16 Aug 2024 13:48:53 +0100 Subject: [PATCH 096/165] Update internal_doc.md --- internal_doc.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/internal_doc.md b/internal_doc.md index 36ec41f..0fb9d31 100644 --- a/internal_doc.md +++ b/internal_doc.md @@ -114,7 +114,7 @@ The tool tests records in a VCF file and applies the `HPF` and `ALF` filter flag **0** – passed/failed on condition 60A(i) of Ellis et al. (`HPF` only) **1** – passed/failed on condition 60B(i) of Ellis et al. (`HPF` only) **2** – passed/failed on filter threshold (`ALF` only) -**3** – insufficient appropriate reads to support calling flag - this covers a lot of possiblities, if more granularity is desired, please request it +**3** – insufficient appropriate reads to support calling flag – this covers a lot of possiblities, if more granularity is desired, please request it **4** – no samples have non 0,0 genotype for the record From 7e27b9b10553ab993a4a9ceb7318b16658de5c7d Mon Sep 17 00:00:00 2001 From: Alex Byrne Date: Fri, 16 Aug 2024 13:49:18 +0100 Subject: [PATCH 097/165] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index bb2c898..74ff14d 100644 --- a/README.md +++ b/README.md @@ -96,7 +96,7 @@ The tool tests records in a VCF file and applies the `HPF` and `ALF` filter flag > **0** – passed/failed on condition 60A(i) of Ellis et al. (`HPF` only) > **1** – passed/failed on condition 60B(i) of Ellis et al. 
(`HPF` only) > **2** – passed/failed on filter threshold (`ALF` only) -> **3** – insufficient appropriate reads to support calling flag - this covers a lot of possiblities, if more granularity is desired, please request it +> **3** – insufficient appropriate reads to support calling flag – this covers a lot of possiblities, if more granularity is desired, please request it > **4** – no samples have non 0,0 genotype for the record From fe627cfb67d8a751515a588a3e8cd4dccfdb535e Mon Sep 17 00:00:00 2001 From: ab63 Date: Fri, 16 Aug 2024 15:55:01 +0100 Subject: [PATCH 098/165] small doc improvement --- README.md | 2 +- internal_doc.md | 5 ++++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index bb2c898..8d481c1 100644 --- a/README.md +++ b/README.md @@ -105,6 +105,6 @@ The basic procedure of this implementation is as follows: > 1. retrieving reads from samples exhibiting the mutations > 2. testing each read for validity for use in hairpin testing (i.e. base quality, do they express the correct alt, and so on) > 3. performing statistical analysis on aggregates of the position of the mutation relative to the start and end of the aligned portion of the reads -> 4. on the results of the statistical analysis, pass or fail the record for the filters `ALF` and `HPF`, and log a code and relevant info to the **INFO** field indicating the reason for the decision +> 4. on the results of the statistical analysis, pass or fail the record for the filters `ALF` and `HPF`, and log a code and relevant info to the `INFO` field indicating the reason for the decision The code has been written with the intention of clarity and extensibility – further understanding may be achieved by reading `hairpin2/main.py`. diff --git a/internal_doc.md b/internal_doc.md index 36ec41f..834d241 100644 --- a/internal_doc.md +++ b/internal_doc.md @@ -122,4 +122,7 @@ The basic procedure of this implementation is as follows. For each record in the 1. 
retrieving reads from samples exhibiting the mutations 2. testing each read for validity for use in hairpin testing (i.e. base quality, do they express the correct alt, and so on) 3. performing statistical analysis on aggregates of the position of the mutation relative to the start and end of the aligned portion of the reads -4. on the results of the statistical analysis, pass or fail the record for the filters `ALF` and `HPF`, and log a code and relevant info to the **INFO** field indicating the reason for the decision +4. on the results of the statistical analysis, pass or fail the record for the filters `ALF` and `HPF`, and log a code and relevant info to the `INFO` field indicating the reason for the decision + +The code has been written with the intention of clarity and extensibility – further understanding may be achieved by reading `hairpin2/main.py`. + From 9899604965bc39ae46695fffcb8eb538e2e6430a Mon Sep 17 00:00:00 2001 From: Alex Byrne Date: Fri, 16 Aug 2024 16:00:09 +0100 Subject: [PATCH 099/165] Update internal_doc.md --- internal_doc.md | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/internal_doc.md b/internal_doc.md index f639c9b..19dd187 100644 --- a/internal_doc.md +++ b/internal_doc.md @@ -22,16 +22,16 @@ However, this incarnation is not a total replacement for Mathijs' Scripts at thi > The `hairpin2` module replaces the “additionalBAMStatistics” and most of the “filtering” code. So [one may still need] to run the preselect and fragment based filter. Since the versions available of "Mathijs' Scripts" are many and varied, we cannot account for all differences/changes, but in general: -- No more ambiguous/cryptic/unfixable errors - the tool should work on all appropriate data, and if it is unable to produce the expected output it will clearly inform the user (but see N.B. 
at end of this section) -- Transparency - reasoning for flagging decisions logged in VCF -- Single tool centrally maintained and versioned - for reproducibility/citing/distribution +- No more ambiguous/cryptic/unfixable errors – the tool should work on all appropriate data, and if it is unable to produce the expected output it will clearly inform the user (but see N.B. at end of this section) +- Transparency – reasoning for flagging decisions logged in VCF +- Single tool centrally maintained and versioned – for reproducibility/citing/distribution - Significant speedup (on testing data at least) – 50s runtime on 542-variant caveman VCF - The module adds **filter flags**, `HPF` and `ALF`, to a VCF. It **does not** output into separate files containing passed and failed positions - The module **does not** prefilter, or perform fragment filtering -With regard to prefiltering - this is not performed by this module, as the filtering is not relevant to hairpin detection and should be performed separately. Filtering can be performed using the `vcfilter` or `bcftools` modules. +With regard to prefiltering – this is not performed by this module, as the filtering is not relevant to hairpin detection and should be performed separately. Filtering can be performed using the `vcfilter` or `bcftools` modules. -**N.B.** this program is currently in an alpha/testing phase - it is available on the farm, but is likely to change, or have new features added, rapidly, per user responses. **It also may be broken in some way; if so please get in touch**. It is not currently publicly available - it will be made public as soon as it is out of this alpha phase. +**N.B.** this program is currently in an alpha/testing phase – it is available on the farm, but is likely to change, or have new features added, rapidly, per user responses. **It also may be broken in some way; if so please get in touch**. 
It is not currently publicly available – it will be made public as soon as it is out of this alpha phase. ### MODULE ACCESS @@ -42,7 +42,7 @@ For farm22 use, available as a module. module avail hairpin2 module load ``` -**N.B. do not confuse with the module `hairpin` - this is `hairpin2`** +**N.B. do not confuse with the module `hairpin` – this is `hairpin2`** ### ASSUMPTIONS @@ -97,12 +97,12 @@ procedural: log input arguments to JSON ``` -**N.B.** the above usage block indicates the call for the tool is `hairpin2` - this is correct for local/vm installs, but for farm usage, for the time being, it is `hairpin2-alpha` +**N.B.** the above usage block indicates the call for the tool is `hairpin2` – this is correct for local/vm installs, but for farm usage, for the time being, it is `hairpin2-alpha` Parameters are hopefully mostly clear from the helptext, but some warrant further explanation: - `--al-filter-threshold` – the default value of 0.93 was arrived at by trial and error – since different aligners/platforms calculate alignment score differently, you may want to modify this value appropriately. -- `--max-read-span` – long homopolymer tracts can cause stuttering, where a PCR duplicate will have, for example, an additional A in a tract of As. These reads will align a base or two earlier on the reference genome than they should. As a result pcr duplicate flag machinery fails and they are not flagged as duplicates. `MAX_READ_SPAN` is then the maximum +- position to use when detecting PCR duplicates. +- `--max-read-span` – long homopolymer tracts can cause stuttering, where a PCR duplicate will have, for example, an additional A in a tract of As. These reads will align a base or two earlier on the reference genome than they should. As a result pcr duplicate flag machinery fails and they are not flagged as duplicates. `MAX_READ_SPAN` is then the maximum +- position to use when detecting PCR duplicates. 
- `--position-fraction` – cruciform artefacts usually contain segments that do not align to the reference genome, resulting in the segment being soft-clipped. The subsequent aligned portion will then contain false variants, which arise from the artefact. These false variants appear with anomalous regularity at alignment boundaries – unlike true variants. If, for a given variant, more than 90% of the variant bases are within `POSITION_FRACTION` of read edges, allow for calling `HPF` flag. From 3b4f745709890647fa8899d94f71fbd0ccc44cae Mon Sep 17 00:00:00 2001 From: Alex Byrne Date: Fri, 16 Aug 2024 16:01:09 +0100 Subject: [PATCH 100/165] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 1f75fb6..bd73b10 100644 --- a/README.md +++ b/README.md @@ -85,7 +85,7 @@ procedural: Parameters are hopefully mostly clear from the helptext, but some warrant further explanation: - `--al-filter-threshold` – the default value of 0.93 was arrived at by trial and error – since different aligners/platforms calculate alignment score differently, you may want to modify this value appropriately. -- `--max-read-span` – long homopolymer tracts can cause stuttering, where a PCR duplicate will have, for example, an additional A in a tract of As. These reads will align a base or two earlier on the reference genome than they should. As a result pcr duplicate flag machinery fails and they are not flagged as duplicates. `MAX_READ_SPAN` is then the maximum +- position to use when detecting PCR duplicates. +- `--max-read-span` – long homopolymer tracts can cause stuttering, where a PCR duplicate will have, for example, an additional A in a tract of As. These reads will align a base or two earlier on the reference genome than they should. As a result pcr duplicate flag machinery fails and they are not flagged as duplicates. `MAX_READ_SPAN` is then the maximum +- position to use when detecting PCR duplicates. 
- `--position-fraction` – cruciform artefacts usually contain segments that do not align to the reference genome, resulting in the segment being soft-clipped. The subsequent aligned portion will then contain false variants, which arise from the artefact. These false variants appear with anomalous regularity at alignment boundaries – unlike true variants. If, for a given variant, more than 90% of the variant bases are within `POSITION_FRACTION` of read edges, allow for calling `HPF` flag. From 0d474ffdc01840d89300355cf1018655dbe82cb9 Mon Sep 17 00:00:00 2001 From: Alex Byrne Date: Fri, 16 Aug 2024 16:05:44 +0100 Subject: [PATCH 101/165] Update README.md --- README.md | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index bd73b10..df352a0 100644 --- a/README.md +++ b/README.md @@ -98,13 +98,12 @@ The tool tests records in a VCF file and applies the `HPF` and `ALF` filter flag > **2** – passed/failed on filter threshold (`ALF` only) > **3** – insufficient appropriate reads to support calling flag – this covers a lot of possiblities, if more granularity is desired, please request it > **4** – no samples have non 0,0 genotype for the record - The basic procedure of this implementation is as follows: > For each record in the VCF, test every alt for that record by: > 1. retrieving reads from samples exhibiting the mutations > 2. testing each read for validity for use in hairpin testing (i.e. base quality, do they express the correct alt, and so on) > 3. performing statistical analysis on aggregates of the position of the mutation relative to the start and end of the aligned portion of the reads -> 4. on the results of the statistical analysis, pass or fail the record for the filters `ALF` and `HPF`, and log a code and relevant info to the `INFO` field indicating the reason for the decision +> 4. 
on the results of the statistical analysis, pass or fail the record for the filters `ALF` and `HPF`, and log a code and relevant info to the `INFO` field indicating the reason for the decision -The code has been written with the intention of clarity and extensibility – further understanding may be achieved by reading `hairpin2/main.py`. +The code has been written with the intention of clarity and extensibility – further understanding may be achieved by reading `hairpin2/main.py`. \ No newline at end of file From f87173eb9979dc8bd1425a06778ec7b49750fffe Mon Sep 17 00:00:00 2001 From: Alex Byrne Date: Fri, 16 Aug 2024 16:15:28 +0100 Subject: [PATCH 102/165] Update internal_doc.md to reflect some of Rashesh's guidance --- internal_doc.md | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/internal_doc.md b/internal_doc.md index 19dd187..1800feb 100644 --- a/internal_doc.md +++ b/internal_doc.md @@ -4,8 +4,7 @@ For paired data, given a VCF, and BAM files for the samples of that VCF, return a VCF with variants flagged with `HPF` if they are suspected cruciform artefacts, and `ALF` if relevant reads have lower median alignment score per base than a specified threshold. The `ALF` filter indicates poor signal-to-noise, and provides additional confidence in the `HPF` filter – cruciform artefacts usually cause a marked decrease in alignment score. The `ALF` flag also may appear on variants without `HPF`, often indicating other artefacts associated with poor signal-to-noise. -`hairpin2` should replace, as far as is possible, the tools known as "Mathijs' Scripts", "AdditionalBamStatistics", "Tim Butler's Scripts" and, unfortunately, probably many other names. It also supersedes `hairpin`, a stopgap version /of Mathijs' Scripts that relied on some of Mathijs' original code, and therefore was unreliable and error prone (though less so than the raw scripts themselves). 
-However, this incarnation is not a total replacement for Mathijs' Scripts at this time (and has changed in functionality since the stopgap tool, the original hairpin): +`hairpin2` has been designed to replace `AdditionalBamStatistics`, which forms a key part of the the LCM processing pipelines known as "Mathijs' Scripts" and "Tim Butler's scripts" (there may also be other names and other pipelines which incoroprate this tool). > Mathjis LCM filters includes the following steps: > 1. Preselect: Filters the CaVEMan calls for “PASS” && “CLPM=0” && “ASMD>=140” @@ -21,13 +20,12 @@ However, this incarnation is not a total replacement for Mathijs' Scripts at thi > > The `hairpin2` module replaces the “additionalBAMStatistics” and most of the “filtering” code. So [one may still need] to run the preselect and fragment based filter. -Since the versions available of "Mathijs' Scripts" are many and varied, we cannot account for all differences/changes, but in general: +Improvements and differences to the original `AdditionalBamStatistics` implementation include: - No more ambiguous/cryptic/unfixable errors – the tool should work on all appropriate data, and if it is unable to produce the expected output it will clearly inform the user (but see N.B. at end of this section) - Transparency – reasoning for flagging decisions logged in VCF - Single tool centrally maintained and versioned – for reproducibility/citing/distribution - Significant speedup (on testing data at least) – 50s runtime on 542-variant caveman VCF - The module adds **filter flags**, `HPF` and `ALF`, to a VCF. It **does not** output into separate files containing passed and failed positions -- The module **does not** prefilter, or perform fragment filtering With regard to prefiltering – this is not performed by this module, as the filtering is not relevant to hairpin detection and should be performed separately. Filtering can be performed using the `vcfilter` or `bcftools` modules. 
@@ -42,7 +40,7 @@ For farm22 use, available as a module. module avail hairpin2 module load ``` -**N.B. do not confuse with the module `hairpin` – this is `hairpin2`** +**N.B. do not confuse with the module `hairpin` – this is `hairpin2`**. `hairpin` was a stopgap version of Mathijs' Scripts that relied on some of Mathijs' original code, and was unreliable and error prone. ### ASSUMPTIONS From 3294398733b59a4daa6b10ba35f7d5ed4cb589ed Mon Sep 17 00:00:00 2001 From: Alex Byrne Date: Fri, 16 Aug 2024 16:16:27 +0100 Subject: [PATCH 103/165] Update internal_doc.md --- internal_doc.md | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/internal_doc.md b/internal_doc.md index 1800feb..6753e90 100644 --- a/internal_doc.md +++ b/internal_doc.md @@ -6,6 +6,14 @@ For paired data, given a VCF, and BAM files for the samples of that VCF, return `hairpin2` has been designed to replace `AdditionalBamStatistics`, which forms a key part of the the LCM processing pipelines known as "Mathijs' Scripts" and "Tim Butler's scripts" (there may also be other names and other pipelines which incoroprate this tool). +Improvements and differences to the original `AdditionalBamStatistics` implementation include: +- No more ambiguous/cryptic/unfixable errors – the tool should work on all appropriate data, and if it is unable to produce the expected output it will clearly inform the user (but see N.B. at end of this section) +- Transparency – reasoning for flagging decisions logged in VCF +- Single tool centrally maintained and versioned – for reproducibility/citing/distribution +- Significant speedup (on testing data at least) – 50s runtime on 542-variant caveman VCF +- The module adds **filter flags**, `HPF` and `ALF`, to a VCF. It **does not** output into separate files containing passed and failed positions + +####TOFIX > Mathjis LCM filters includes the following steps: > 1. Preselect: Filters the CaVEMan calls for “PASS” && “CLPM=0” && “ASMD>=140” > 2. 
Hairpin Filtering @@ -20,13 +28,6 @@ For paired data, given a VCF, and BAM files for the samples of that VCF, return > > The `hairpin2` module replaces the “additionalBAMStatistics” and most of the “filtering” code. So [one may still need] to run the preselect and fragment based filter. -Improvements and differences to the original `AdditionalBamStatistics` implementation include: -- No more ambiguous/cryptic/unfixable errors – the tool should work on all appropriate data, and if it is unable to produce the expected output it will clearly inform the user (but see N.B. at end of this section) -- Transparency – reasoning for flagging decisions logged in VCF -- Single tool centrally maintained and versioned – for reproducibility/citing/distribution -- Significant speedup (on testing data at least) – 50s runtime on 542-variant caveman VCF -- The module adds **filter flags**, `HPF` and `ALF`, to a VCF. It **does not** output into separate files containing passed and failed positions - With regard to prefiltering – this is not performed by this module, as the filtering is not relevant to hairpin detection and should be performed separately. Filtering can be performed using the `vcfilter` or `bcftools` modules. **N.B.** this program is currently in an alpha/testing phase – it is available on the farm, but is likely to change, or have new features added, rapidly, per user responses. **It also may be broken in some way; if so please get in touch**. It is not currently publicly available – it will be made public as soon as it is out of this alpha phase. 
From b3ff70f53627783a5fa46f34e486e230445bacc0 Mon Sep 17 00:00:00 2001 From: Alex Byrne Date: Fri, 16 Aug 2024 16:20:17 +0100 Subject: [PATCH 104/165] Update internal_doc.md --- internal_doc.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/internal_doc.md b/internal_doc.md index 6753e90..0d2456d 100644 --- a/internal_doc.md +++ b/internal_doc.md @@ -12,6 +12,7 @@ Improvements and differences to the original `AdditionalBamStatistics` implement - Single tool centrally maintained and versioned – for reproducibility/citing/distribution - Significant speedup (on testing data at least) – 50s runtime on 542-variant caveman VCF - The module adds **filter flags**, `HPF` and `ALF`, to a VCF. It **does not** output into separate files containing passed and failed positions +- The `ALF` flag supersedes the `ASRD` info field ####TOFIX > Mathjis LCM filters includes the following steps: @@ -100,7 +101,7 @@ procedural: Parameters are hopefully mostly clear from the helptext, but some warrant further explanation: -- `--al-filter-threshold` – the default value of 0.93 was arrived at by trial and error – since different aligners/platforms calculate alignment score differently, you may want to modify this value appropriately. +- `--al-filter-threshold` – the default value of 0.93 was arrived at by trial and error – since different aligners/platforms calculate alignment score differently, you may want to modify this value appropriately. In "Mathijs' Scripts", the default was set at 0.87 for filtering on `ASRD`. - `--max-read-span` – long homopolymer tracts can cause stuttering, where a PCR duplicate will have, for example, an additional A in a tract of As. These reads will align a base or two earlier on the reference genome than they should. As a result pcr duplicate flag machinery fails and they are not flagged as duplicates. `MAX_READ_SPAN` is then the maximum +- position to use when detecting PCR duplicates. 
- `--position-fraction` – cruciform artefacts usually contain segments that do not align to the reference genome, resulting in the segment being soft-clipped. The subsequent aligned portion will then contain false variants, which arise from the artefact. These false variants appear with anomalous regularity at alignment boundaries – unlike true variants. If, for a given variant, more than 90% of the variant bases are within `POSITION_FRACTION` of read edges, allow for calling `HPF` flag. From c1470da7be688850a4ff792ef1d75005ab517ddc Mon Sep 17 00:00:00 2001 From: Alex Byrne Date: Mon, 19 Aug 2024 09:31:23 +0100 Subject: [PATCH 105/165] Update internal_doc.md --- internal_doc.md | 39 +++++++++++++++++++++------------------ 1 file changed, 21 insertions(+), 18 deletions(-) diff --git a/internal_doc.md b/internal_doc.md index 0d2456d..5262310 100644 --- a/internal_doc.md +++ b/internal_doc.md @@ -9,28 +9,11 @@ For paired data, given a VCF, and BAM files for the samples of that VCF, return Improvements and differences to the original `AdditionalBamStatistics` implementation include: - No more ambiguous/cryptic/unfixable errors – the tool should work on all appropriate data, and if it is unable to produce the expected output it will clearly inform the user (but see N.B. at end of this section) - Transparency – reasoning for flagging decisions logged in VCF -- Single tool centrally maintained and versioned – for reproducibility/citing/distribution +- Centrally maintained and versioned – for reproducibility/citing/distribution - Significant speedup (on testing data at least) – 50s runtime on 542-variant caveman VCF - The module adds **filter flags**, `HPF` and `ALF`, to a VCF. It **does not** output into separate files containing passed and failed positions - The `ALF` flag supersedes the `ASRD` info field -####TOFIX -> Mathjis LCM filters includes the following steps: -> 1. Preselect: Filters the CaVEMan calls for “PASS” && “CLPM=0” && “ASMD>=140” -> 2. 
Hairpin Filtering -> 3. Filtering based on fragment numbers. -> -> Which are split across the following steps: (As per his scripts) -> - preselect -> - imitateANNOVAR -> - annotateBAMStatistics -> - additionalBAMStatistics -> - filtering -> -> The `hairpin2` module replaces the “additionalBAMStatistics” and most of the “filtering” code. So [one may still need] to run the preselect and fragment based filter. - -With regard to prefiltering – this is not performed by this module, as the filtering is not relevant to hairpin detection and should be performed separately. Filtering can be performed using the `vcfilter` or `bcftools` modules. - **N.B.** this program is currently in an alpha/testing phase – it is available on the farm, but is likely to change, or have new features added, rapidly, per user responses. **It also may be broken in some way; if so please get in touch**. It is not currently publicly available – it will be made public as soon as it is out of this alpha phase. @@ -105,6 +88,26 @@ Parameters are hopefully mostly clear from the helptext, but some warrant furthe - `--max-read-span` – long homopolymer tracts can cause stuttering, where a PCR duplicate will have, for example, an additional A in a tract of As. These reads will align a base or two earlier on the reference genome than they should. As a result pcr duplicate flag machinery fails and they are not flagged as duplicates. `MAX_READ_SPAN` is then the maximum +- position to use when detecting PCR duplicates. - `--position-fraction` – cruciform artefacts usually contain segments that do not align to the reference genome, resulting in the segment being soft-clipped. The subsequent aligned portion will then contain false variants, which arise from the artefact. These false variants appear with anomalous regularity at alignment boundaries – unlike true variants. If, for a given variant, more than 90% of the variant bases are within `POSITION_FRACTION` of read edges, allow for calling `HPF` flag. 
+##### Usage in context of Mathijs pipeline + +This section is under construction - if you have questions in the meantime please ask Rashesh + +> Mathjis LCM filters includes the following steps: +> 1. Preselect: Filters the CaVEMan calls for “PASS” && “CLPM=0” && “ASMD>=140” +> 2. Hairpin Filtering +> 3. Filtering based on fragment numbers. +> +> Which are split across the following steps: (As per his scripts) +> - preselect +> - imitateANNOVAR +> - annotateBAMStatistics +> - additionalBAMStatistics +> - filtering +> +> The `hairpin2` module replaces the “additionalBAMStatistics” and most of the “filtering” code. So [one may still need] to run the preselect [sans the ASMD filter] and fragment based filter. + +(pre)filtering is not performed by this module, as the filtering is not relevant to hairpin detection and should be performed separately. Filtering can be performed using the `vcfilter` or `bcftools` modules. + ### DETAILS From 8a18d7da32c759a3b2fe5d33672b2528fe8cb4aa Mon Sep 17 00:00:00 2001 From: ab63 Date: Mon, 19 Aug 2024 14:41:35 +0100 Subject: [PATCH 106/165] spelling --- hairpin2/main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hairpin2/main.py b/hairpin2/main.py index 8dbeccf..ce83cce 100644 --- a/hairpin2/main.py +++ b/hairpin2/main.py @@ -259,7 +259,7 @@ def main_cli() -> None: req.add_argument('-a', '--alignments', help="list of paths to (S/B/CR)AMs (indicated by --format) for samples in input VCF, whitespace separated - (s/b/cr)ai expected in same directories", nargs='+', required=True) req.add_argument('-f', "--format", help="format of alignment files; s indicates SAM, b indicates BAM, and c indicates CRAM", choices=["s", "b", "c"], type=str, required=True) opt = parser.add_argument_group('extended') - opt.add_argument('-al', '--al-filter-threshold', help='threshhold for median of read alignment score per base of all relevant reads, below which a variant is flagged as ALF - default: 0.93', type=float) + 
opt.add_argument('-al', '--al-filter-threshold', help='threshold for median of read alignment score per base of all relevant reads, below which a variant is flagged as ALF - default: 0.93', type=float) opt.add_argument('-mc', '--min-clip-quality', help='discard reads with mean base quality of aligned bases below this value, if they have soft-clipped bases - default: 35', type=int) opt.add_argument('-mq', '--min-mapping-quality', help='discard reads with mapping quality below this value - default: 11', type=int) opt.add_argument('-mb', '--min-base-quality', help='discard reads with base quality at variant position below this value - default: 25', type=int ) From c1b56ac0a05bf7cc51aa046868f532a1ba754340 Mon Sep 17 00:00:00 2001 From: Alex Byrne Date: Mon, 19 Aug 2024 14:42:18 +0100 Subject: [PATCH 107/165] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index df352a0..1a87628 100644 --- a/README.md +++ b/README.md @@ -59,7 +59,7 @@ mandatory: extended: -al AL_FILTER_THRESHOLD, --al-filter-threshold AL_FILTER_THRESHOLD - threshhold for median of read alignment score per base of all relevant reads, below which a variant is flagged as ALF - default: 0.93 + threshold for median of read alignment score per base of all relevant reads, below which a variant is flagged as ALF - default: 0.93 -mc MIN_CLIP_QUALITY, --min-clip-quality MIN_CLIP_QUALITY discard reads with mean base quality of aligned bases below this value, if they have soft-clipped bases - default: 35 -mq MIN_MAPPING_QUALITY, --min-mapping-quality MIN_MAPPING_QUALITY From 4de1514daf3eccf457fb2f156d264707d6c52083 Mon Sep 17 00:00:00 2001 From: Alex Byrne Date: Mon, 19 Aug 2024 14:42:38 +0100 Subject: [PATCH 108/165] Update internal_doc.md --- internal_doc.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/internal_doc.md b/internal_doc.md index 5262310..f9a0ec3 100644 --- a/internal_doc.md +++ b/internal_doc.md @@ -57,7 +57,7 
@@ mandatory: extended: -al AL_FILTER_THRESHOLD, --al-filter-threshold AL_FILTER_THRESHOLD - threshhold for median of read alignment score per base of all relevant reads, below which a variant is flagged as ALF - default: 0.93 + threshold for median of read alignment score per base of all relevant reads, below which a variant is flagged as ALF - default: 0.93 -mc MIN_CLIP_QUALITY, --min-clip-quality MIN_CLIP_QUALITY discard reads with mean base quality of aligned bases below this value, if they have soft-clipped bases - default: 35 -mq MIN_MAPPING_QUALITY, --min-mapping-quality MIN_MAPPING_QUALITY From e6ddb8415d78545426ac98e7eb9bd1c6565f5223 Mon Sep 17 00:00:00 2001 From: ab63 Date: Thu, 22 Aug 2024 16:31:59 +0100 Subject: [PATCH 109/165] super basic unit tests --- .gitignore | 1 + hairpin2/main.py | 24 ++++--- test/single_var_silico.vcf | 141 +++++++++++++++++++++++++++++++++++++ test_validate_read.py | 27 +++++++ 4 files changed, 182 insertions(+), 11 deletions(-) create mode 100644 test/single_var_silico.vcf create mode 100644 test_validate_read.py diff --git a/.gitignore b/.gitignore index 95ac7c6..5a67585 100644 --- a/.gitignore +++ b/.gitignore @@ -5,6 +5,7 @@ dist/ __pycache__/ .helix/ build/ +test_data_creation/ *.txt *.sif *.json diff --git a/hairpin2/main.py b/hairpin2/main.py index ce83cce..52fb530 100644 --- a/hairpin2/main.py +++ b/hairpin2/main.py @@ -8,12 +8,14 @@ from functools import partial def validate_read( - vcf_record: pysam.VariantRecord, read: pysam.AlignedSegment, + vcf_start: int, + vcf_stop: int, + vcf_rlen: int, + alt: str, min_mapqual: int, min_clipqual: int, min_basequal: int, - alt: str ) -> int: read_flag = c.ValidatorFlags.CLEAR.value @@ -42,11 +44,11 @@ def validate_read( read_flag |= c.ValidatorFlags.CLIPQUAL.value # First, check for sub try: - mut_pos, mut_op = r2s.ref2querypos(read, vcf_record.start) # VCF 1-INDEXED, alignments 0-INDEXED (vcf_record.start = 0-indexed mutation position) + mut_pos, mut_op = r2s.ref2querypos(read, 
vcf_start) # VCF 1-INDEXED, alignments 0-INDEXED (vcf_start = 0-indexed mutation position) except IndexError: read_flag |= c.ValidatorFlags.NOT_ALIGNED.value else: - if vcf_record.rlen == len(alt) == 1: + if vcf_rlen == len(alt) == 1: if (mut_op not in [c.Ops.MATCH.value, c.Ops.DIFF.value]): read_flag |= c.ValidatorFlags.BAD_OP.value if read.query_sequence[mut_pos] != alt: # type: ignore @@ -54,13 +56,13 @@ def validate_read( if read.query_qualities[mut_pos] < min_basequal: # type: ignore read_flag |= c.ValidatorFlags.BASEQUAL.value # Second, check whether length of read can accommodate size of indel - elif (mut_pos + vcf_record.rlen > read.query_length or + elif (mut_pos + vcf_rlen > read.query_length or mut_pos + len(alt) > read.query_length): read_flag |= c.ValidatorFlags.SHORT.value else: if len(alt) == 1: # DEL try: - mut_rng = list(map(lambda x: r2s.ref2querypos(read, x), range(vcf_record.start, vcf_record.stop))) + mut_rng = list(map(lambda x: r2s.ref2querypos(read, x), range(vcf_start, vcf_stop))) except IndexError: read_flag |= c.ValidatorFlags.NOT_ALIGNED.value else: @@ -68,9 +70,9 @@ def validate_read( mut_rng[-1][1] != c.Ops.MATCH.value or any(x[1] != c.Ops.DEL.value for x in mut_rng[1:-2])): read_flag |= c.ValidatorFlags.BAD_OP.value - elif vcf_record.rlen == 1: # INS + elif vcf_rlen == 1: # INS try: - mut_rng = list(map(lambda x: r2s.ref2querypos(read, x), range(vcf_record.start, (vcf_record.start + len(alt))))) + mut_rng = list(map(lambda x: r2s.ref2querypos(read, x), range(vcf_start, (vcf_start + len(alt))))) except IndexError: read_flag |= c.ValidatorFlags.NOT_ALIGNED.value else: @@ -81,7 +83,7 @@ def validate_read( if read.query_sequence[mut_pos:len(alt)] != alt: # type: ignore read_flag |= c.ValidatorFlags.NOT_ALT.value else: # COMPLEX - max_rng = range(vcf_record.start, vcf_record.stop) if (vcf_record.start + vcf_record.rlen) > (vcf_record.start + len(alt)) else range(vcf_record.start, (vcf_record.start + len(alt))) + max_rng = range(vcf_start, 
vcf_stop) if (vcf_start + vcf_rlen) > (vcf_start + len(alt)) else range(vcf_start, (vcf_start + len(alt))) try: mut_rng = list(map(lambda x: r2s.ref2querypos(read, x), max_rng)) except IndexError: @@ -107,7 +109,7 @@ def validate_read( else: if pair_end >= read.next_reference_start: # type:ignore pair_end = read.next_reference_start - 1 - if not (pair_start <= vcf_record.start <= pair_end): # type:ignore + if not (pair_start <= vcf_start <= pair_end): # type:ignore read_flag |= c.ValidatorFlags.OVERLAP.value return read_flag @@ -143,7 +145,7 @@ def test_variant( read = None for read in read_iter: # type: ignore read_flag = c.ValidatorFlags.CLEAR.value - read_flag = read_validator(vcf_record=vcf_rec, read=read, alt=alt) + read_flag = read_validator(read=read, alt=alt, vcf_start=vcf_rec.start, vcf_stop=vcf_rec.stop, vcf_rlen=vcf_rec.rlen) if read_flag == c.ValidatorFlags.CLEAR.value: mut_reads[mut_sample].append(read) diff --git a/test/single_var_silico.vcf b/test/single_var_silico.vcf new file mode 100644 index 0000000..3c06019 --- /dev/null +++ b/test/single_var_silico.vcf @@ -0,0 +1,141 @@ +##fileformat=VCFv4.1 +##FILTER= +##FILTER== 25 base quality"> +##FILTER= +##FILTER== 15 base quality found in the matched normal"> +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER== 3 mutant allele present in at least 1 percent unmatched normal samples in the unmatched VCF."> +##FILTER== 10 on each strand but mutant allele is only present on one strand"> +##FILTER= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##SAMPLE= +##SAMPLE= +##cavemanVersion=1.15.2 +##cgpAnalysisProc_20240726.1=11255789 +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= 
+##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##fileDate=20240726 +##source_20240726.1=AnnotateVcf.pl +##vcfProcessLog=,InputVCFSource=,InputVCFParam=> +##vcfProcessLog_20240726.1=,InputVCFSource=,InputVCFVer=<1.12.0>,InputVCFParam=> +##bcftools_viewVersion=1.19+htslib-1.19.1 +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NORMAL TUMOUR +1 100 e2585868-4b6d-11ef-a179-d73695b4a6ba G A . 
AN;MN;MNP DP=223;MP=1;GP=0.00053;TG=GG/AGGGGGGGGG;TP=1;SG=GG/AAGGGGGGGG;SP=0.0015;ASRD=0.91;CLPM=0;ASMD=138;VT=Sub GT:FAZ:FCZ:FGZ:FTZ:RAZ:RCZ:RGZ:RTZ:PM 0/0:3:0:29:0:3:0:44:0:0.076 0/1:4:0:54:0:8:0:78:0:0.083 diff --git a/test_validate_read.py b/test_validate_read.py new file mode 100644 index 0000000..a38009f --- /dev/null +++ b/test_validate_read.py @@ -0,0 +1,27 @@ +from hairpin2 import main as hp2 +from hairpin2 import constants as c +import pysam + +r = pysam.AlignedSegment() +r.query_name = 'read1' +r.query_sequence = 'CTGDAAAACC' +r.query_qualities = pysam.qualitystring_to_array('KKKKKKKKKK') +r.flag = 0x2 +r.reference_id = 0 +r.reference_start = 95 +r.next_reference_start = 105 +r.mapping_quality = 20 +r.cigarstring = '10M' +r.cigartuples = [(0,10)] +r.set_tag('MC', '10M') + + +def test_validate_read_ideal(): + assert hp2.validate_read(read=r, vcf_start=99, vcf_stop=100, vcf_rlen=1, alt='A', min_mapqual=11, min_clipqual=35, min_basequal=25) == c.ValidatorFlags.CLEAR.value + + +def test_validate_read_missing_cigar(): + rc = r + rc.cigartuples = None + assert hp2.validate_read(read=r, vcf_start=99, vcf_stop=100, vcf_rlen=1, alt='A', min_mapqual=11, min_clipqual=35, min_basequal=25) == c.ValidatorFlags.READ_FIELDS_MISSING.value + From 2d53bc44672ee04e74e551dd63e01dca99c2200f Mon Sep 17 00:00:00 2001 From: ab63 Date: Wed, 11 Sep 2024 16:35:44 +0100 Subject: [PATCH 110/165] extended unit tests and broken CI --- .gitlab-ci.yml | 76 ++++++++++++++++++++++++++++++++++++++++ README.md | 3 +- docker-run-unit-tests.sh | 16 +++++++++ hairpin2/constants.py | 2 +- hairpin2/main.py | 14 ++++---- test_validate_read.py | 70 ++++++++++++++++++++++++++++++++---- 6 files changed, 166 insertions(+), 15 deletions(-) create mode 100644 .gitlab-ci.yml create mode 100644 docker-run-unit-tests.sh diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml new file mode 100644 index 0000000..2b341be --- /dev/null +++ b/.gitlab-ci.yml @@ -0,0 +1,76 @@ +stages: + - build + - test + - publish + 
+include: +# DOCS: https://gitlab.internal.sanger.ac.uk/team113sanger/common/cicd-template/-/blob/develop/README.md + - project: 'team113sanger/common/cicd-template' + ref: 0.3.1 + file: '.gitlab-ci-components.yml' + +############# +# TEMPLATES # +############# + +.generic-wo-script-or-rules: + extends: + - .component-variables + - .component-before_script + - .component-tags-shared-large-runner + +.specific-variables: + variables: + UNIT_TEST_MOUNT_POINT: /opt/tests + RUN_SCRIPT_MOUNT_POINT: /tmp/run.sh + # We need to set this to 1 to enable BuildKit as the Dockerfile uses BuildKit features to speed up the build + DOCKER_BUILDKIT: 1 + PRE_FETCH_BASE_IMAGE: python:3.12-slim + # Incase 'docker compose' build is ever used we want to ensure the image + # does not have sudo. By default CICD jobs do not build with 'docker + # compose' but use 'docker' - so this is just a safety measure. + HAS_SUDO: 0 + +############ +# JOBS # +############ + +build: + stage: build + extends: + - .generic-wo-script-or-rules + - .specific-variables + - .component-script_docker-build + - .component-rules-except-release + +unit-test: + stage: test + extends: + - .generic-wo-script-or-rules + - .specific-variables + - .component-rules-except-release + script: + - echo "*** [SCRIPT] START ***" + - echo "I am a script - I run the Python unit tests in a docker container" + - echo "Unit test against CANDIDATE_IMAGE='${CANDIDATE_IMAGE:?not-set-in-before_script}'" + - docker pull "${CANDIDATE_IMAGE}" + # Test image against unit tests - it requires env vars + - docker run -e TEST_DIR="${UNIT_TEST_MOUNT_POINT}" -v "${PWD}/tests:${UNIT_TEST_MOUNT_POINT}:ro" -v "${PWD}/scripts/docker_run_unit_tests.sh:${RUN_SCRIPT_MOUNT_POINT}:ro" --rm "${CANDIDATE_IMAGE}" ${RUN_SCRIPT_MOUNT_POINT} + - echo "*** [SCRIPT] END ***" + +publish-develop: + stage: publish + extends: + - .generic-wo-script-or-rules + - .specific-variables + - .component-script-publish-develop-docker-image + - .component-rules-develop-only + 
+publish-tagged_and_latest_docker_images: + stage: publish + extends: + - .generic-wo-script-or-rules + - .specific-variables + - .component-script-publish-tagged+latest-docker-image + - .component-rules-tag-only + diff --git a/README.md b/README.md index 1a87628..1c1354f 100644 --- a/README.md +++ b/README.md @@ -9,6 +9,7 @@ For paired data, given a VCF, and BAM files for the samples of that VCF, return * Python >= 3.10 – required * pysam >= 0.22.1 – installed automatically during install process (tested with 0.22.1 only) +* pytest - optional, only necessary to run tests ### INSTALLATION @@ -106,4 +107,4 @@ The basic procedure of this implementation is as follows: > 3. performing statistical analysis on aggregates of the position of the mutation relative to the start and end of the aligned portion of the reads > 4. on the results of the statistical analysis, pass or fail the record for the filters `ALF` and `HPF`, and log a code and relevant info to the `INFO` field indicating the reason for the decision -The code has been written with the intention of clarity and extensibility – further understanding may be achieved by reading `hairpin2/main.py`. \ No newline at end of file +The code has been written with the intention of clarity and extensibility – further understanding may be achieved by reading `hairpin2/main.py`. diff --git a/docker-run-unit-tests.sh b/docker-run-unit-tests.sh new file mode 100644 index 0000000..b4f9169 --- /dev/null +++ b/docker-run-unit-tests.sh @@ -0,0 +1,16 @@ +#!/bin/bash + +if [ -z ${TEST_DIR} ]; then + echo "TEST_DIR not set!" 
+ exit 1 +fi +PKG_DIR=$(python -c "import os;import crispr_lib_matching;import inspect;print(os.path.dirname(inspect.getfile(crispr_lib_matching)))") + +echo "$(python --version)" +echo "Package source directory: ${PKG_DIR}" + +pip install \ + pytest==8.2.2 \ + pytest-cov==5.0.0 && \ +pytest --cov="${PKG_DIR}" "${TEST_DIR}" + diff --git a/hairpin2/constants.py b/hairpin2/constants.py index dabc36b..30e6399 100644 --- a/hairpin2/constants.py +++ b/hairpin2/constants.py @@ -15,7 +15,7 @@ ['MATCH', 'INS', 'DEL', 'SKIP', 'SOFT', 'HARD', 'PAD', 'EQUAL', 'DIFF', 'BACK'], start = 0) ValidatorFlags = Flag('ReadFlags', - ['CLEAR', 'FLAG', 'MAPQUAL', 'READ_FIELDS_MISSING', 'NOT_ALIGNED', 'BAD_OP', 'NOT_ALT', 'BASEQUAL', 'SHORT', 'CLIPQUAL', 'MATE_MISSING_FIELDS', 'OVERLAP'], + ['CLEAR', 'FLAG', 'MAPQUAL', 'READ_FIELDS_MISSING', 'NOT_ALIGNED', 'BAD_OP', 'NOT_ALT', 'BASEQUAL', 'SHORT', 'CLIPQUAL', 'NO_OVERLAP'], start=0) class NoAlts(ValueError): diff --git a/hairpin2/main.py b/hairpin2/main.py index 52fb530..2652570 100644 --- a/hairpin2/main.py +++ b/hairpin2/main.py @@ -19,7 +19,7 @@ def validate_read( ) -> int: read_flag = c.ValidatorFlags.CLEAR.value - if not (read.flag & 0x2) or read.flag & 0xE00: + if not (read.flag & 0x2) or read.flag & 0xE00: # move flag codes to constants read_flag |= c.ValidatorFlags.FLAG.value if read.mapping_quality < min_mapqual: @@ -95,22 +95,21 @@ def validate_read( if read.query_sequence[mut_pos:len(alt)] != alt: # type: ignore read_flag |= c.ValidatorFlags.NOT_ALT.value - # n.b. 
nothing done if complex read if read_flag == c.ValidatorFlags.CLEAR.value: - # "next", through an unfortunate quirk of history, means "mate", so this is reliable (pulls RNEXT) - mate_end = r2s.ref_end_via_cigar(mate_cig, read.next_reference_start) # type:ignore if not (read.flag & 0x40): # this looks like it should be checked for indexing snags pair_start = read.reference_start pair_end = read.reference_end if read.flag & 0x10: - if pair_start <= mate_end: + # "next", through an unfortunate quirk of history, means "mate", so this is reliable (pulls RNEXT) + mate_end = r2s.ref_end_via_cigar(mate_cig, read.next_reference_start) # type:ignore + if read.reference_start <= mate_end: pair_start = mate_end + 1 else: - if pair_end >= read.next_reference_start: # type:ignore + if read.reference_end >= read.next_reference_start: # type:ignore pair_end = read.next_reference_start - 1 if not (pair_start <= vcf_start <= pair_end): # type:ignore - read_flag |= c.ValidatorFlags.OVERLAP.value + read_flag |= c.ValidatorFlags.NO_OVERLAP.value return read_flag @@ -380,6 +379,7 @@ def main_cli() -> None: h.cleanup(msg='failed to write output JSON, reporting: {}'.format(e)) for record in vcf_in_handle.fetch(): # type:ignore + # need to test pysam's vcf record validation - e.g. 
what if start is after end try: filter_d: dict[str, c.Filters] = test_record_per_alt( alignments=vcf_sample_to_alignment_map, diff --git a/test_validate_read.py b/test_validate_read.py index a38009f..1370e5f 100644 --- a/test_validate_read.py +++ b/test_validate_read.py @@ -1,27 +1,85 @@ from hairpin2 import main as hp2 from hairpin2 import constants as c import pysam +import copy + +# pysam guards against: +# quality and seq length mismatch +# flag not set +# reference id is none r = pysam.AlignedSegment() r.query_name = 'read1' r.query_sequence = 'CTGDAAAACC' -r.query_qualities = pysam.qualitystring_to_array('KKKKKKKKKK') +r.query_qualities = pysam.qualitystring_to_array('AAAAAAAAAA') r.flag = 0x2 r.reference_id = 0 r.reference_start = 95 r.next_reference_start = 105 r.mapping_quality = 20 r.cigarstring = '10M' -r.cigartuples = [(0,10)] r.set_tag('MC', '10M') -def test_validate_read_ideal(): +# ideally there would be a test for each time read_flag is set +# i.e. test every path of achieving a given flag +# so far there's a test for each flag + +def test_ideal(): assert hp2.validate_read(read=r, vcf_start=99, vcf_stop=100, vcf_rlen=1, alt='A', min_mapqual=11, min_clipqual=35, min_basequal=25) == c.ValidatorFlags.CLEAR.value -def test_validate_read_missing_cigar(): - rc = r +def test_mapqual(): + assert hp2.validate_read(read=r, vcf_start=99, vcf_stop=100, vcf_rlen=1, alt='A', min_mapqual=30, min_clipqual=35, min_basequal=25) == c.ValidatorFlags.MAPQUAL.value + + +def test_not_aligned(): + assert hp2.validate_read(read=r, vcf_start=200, vcf_stop=100, vcf_rlen=1, alt='A', min_mapqual=11, min_clipqual=35, min_basequal=25) == c.ValidatorFlags.NOT_ALIGNED.value + + +def test_not_alt(): + assert hp2.validate_read(read=r, vcf_start=99, vcf_stop=100, vcf_rlen=1, alt='T', min_mapqual=11, min_clipqual=35, min_basequal=25) == c.ValidatorFlags.NOT_ALT.value + + +def test_basequal(): + assert hp2.validate_read(read=r, vcf_start=99, vcf_stop=100, vcf_rlen=1, alt='A', 
min_mapqual=11, min_clipqual=35, min_basequal=40) == c.ValidatorFlags.BASEQUAL.value + + +def test_read_short(): + assert hp2.validate_read(read=r, vcf_start=99, vcf_stop=100, vcf_rlen=1, alt='ATTTTTTTTTTTTTT', min_mapqual=11, min_clipqual=35, min_basequal=25) == c.ValidatorFlags.SHORT.value + + + + +def test_missing_cigar(): + rc = copy.deepcopy(r) + rc.cigarstring = None rc.cigartuples = None - assert hp2.validate_read(read=r, vcf_start=99, vcf_stop=100, vcf_rlen=1, alt='A', min_mapqual=11, min_clipqual=35, min_basequal=25) == c.ValidatorFlags.READ_FIELDS_MISSING.value + assert hp2.validate_read(read=rc, vcf_start=99, vcf_stop=100, vcf_rlen=1, alt='A', min_mapqual=11, min_clipqual=35, min_basequal=25) == c.ValidatorFlags.READ_FIELDS_MISSING.value + + +def test_bad_op(): + rc = copy.deepcopy(r) + rc.cigartuples = [(c.Ops.EQUAL.value, 10)] + assert hp2.validate_read(read=rc, vcf_start=99, vcf_stop=100, vcf_rlen=1, alt='A', min_mapqual=11, min_clipqual=35, min_basequal=25) == c.ValidatorFlags.BAD_OP.value + + +def test_clipqual(): + rc = copy.deepcopy(r) + rc.cigarstring = '1S9M' + assert hp2.validate_read(read=rc, vcf_start=99, vcf_stop=100, vcf_rlen=1, alt='A', min_mapqual=11, min_clipqual=40, min_basequal=25) == c.ValidatorFlags.CLIPQUAL.value + + +def test_no_overlap_0x10(): + rc = copy.deepcopy(r) + rc.flag |= 0x10 + rc.set_tag('MC', '3M') + assert hp2.validate_read(read=rc, vcf_start=99, vcf_stop=100, vcf_rlen=1, alt='A', min_mapqual=11, min_clipqual=35, min_basequal=25) == c.ValidatorFlags.NO_OVERLAP.value + + +def test_no_overlap(): + rc = copy.deepcopy(r) + rc.next_reference_start = 98 + assert hp2.validate_read(read=rc, vcf_start=99, vcf_stop=100, vcf_rlen=1, alt='A', min_mapqual=11, min_clipqual=35, min_basequal=25) == c.ValidatorFlags.NO_OVERLAP.value + From 4e52625ba930351195abdab864db4e4d2444d64e Mon Sep 17 00:00:00 2001 From: ab63 Date: Wed, 11 Sep 2024 17:10:06 +0100 Subject: [PATCH 111/165] add dockerfile --- Dockerfile | 20 ++++++++++++++++++++ 
Singularity.def | 4 ++-- 2 files changed, 22 insertions(+), 2 deletions(-) create mode 100644 Dockerfile diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..3e0a602 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,20 @@ +FROM python:3.12-slim + +# Set the working directory inside the container +WORKDIR /hairpin2 + +# Copy the current working directory contents into the container +COPY . /hairpin2 + +# Install the hairpin package +RUN pip install --root-user-action ignore /hairpin2/ + +# Define a test script to check the installation of hairpin +RUN LOC=$(which hairpin2) \ + && if [ -z "$LOC" ]; then \ + echo "hairpin install failed" && exit 1; \ + else echo "hairpin install successful"; fi + +# Set up the default command for the container +ENTRYPOINT ["hairpin2"] + diff --git a/Singularity.def b/Singularity.def index d01e4e1..1c22bea 100644 --- a/Singularity.def +++ b/Singularity.def @@ -2,10 +2,10 @@ Bootstrap: docker From: python:3.12-slim %files -. hairpin/ +. hairpin2/ %post -pip install --root-user-action ignore hairpin/ +pip install --root-user-action ignore hairpin2/ %test LOC=$(which hairpin2) From 246c9c938d78b43a892dc18bb030faa3c0b2ccf7 Mon Sep 17 00:00:00 2001 From: Ian Vermes Date: Mon, 16 Sep 2024 14:57:37 +0100 Subject: [PATCH 112/165] Ignore venv and .venv --- .gitignore | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.gitignore b/.gitignore index 5a67585..8f6224c 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,7 @@ data* .env/ +venv/ +.venv/ dist/ *.egg-info/ __pycache__/ From 376cc4410913414db75a55ec5fdf97e0327e2291 Mon Sep 17 00:00:00 2001 From: Ian Vermes Date: Mon, 16 Sep 2024 14:57:57 +0100 Subject: [PATCH 113/165] Add a poetry compliant pyproject.toml --- pyproject.toml | 31 ++++++++++++++++++------------- 1 file changed, 18 insertions(+), 13 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index a9f252e..efd7784 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,17 +1,22 @@ -[tool.setuptools] -packages = 
["hairpin2"] - -[build-system] -requires = ["setuptools"] -build-backend = "setuptools.build_meta" - -[project] +[tool.poetry] name = "hairpin2" version = "0.0.2a" -requires-python = ">= 3.10" -dependencies = [ - 'pysam >= 0.22.1' -] +description = "CLI implementation of the hairpin detection algorithm concieved by Ellis et al, 2020." +authors = ["Alex Byrne "] +license = "AGPL3" +readme = "README.md" + +[tool.poetry.dependencies] +python = "^3.10" +pysam = "^0.22" -[project.scripts] +[tool.poetry.plugins."console_scripts"] hairpin2 = "hairpin2.main:main_cli" + + +[tool.poetry.group.dev.dependencies] +pytest = "^8.3.3" + +[build-system] +requires = ["poetry-core"] +build-backend = "poetry.core.masonry.api" From fa30697dc2824e41daafa0ad5bcc1f26ba8a77c1 Mon Sep 17 00:00:00 2001 From: Ian Vermes Date: Mon, 16 Sep 2024 15:00:02 +0100 Subject: [PATCH 114/165] Streamline version handling to have use pyproject.toml as single source of truth --- hairpin2/__init__.py | 59 +++++++++++++++++++++++++++++++++++++++++++ hairpin2/constants.py | 1 - hairpin2/main.py | 3 ++- 3 files changed, 61 insertions(+), 2 deletions(-) diff --git a/hairpin2/__init__.py b/hairpin2/__init__.py index e69de29..8737543 100644 --- a/hairpin2/__init__.py +++ b/hairpin2/__init__.py @@ -0,0 +1,59 @@ +# You can single-source your package version setting the package version only in +# the project data in `pyproject.toml` or `setup.py`, using `importlib.metadata`. +# +# If your package requires Python 3.8 or above, you can use the standard library +# package `importlib.metadata` and not add a dependency. +# +# If you're supporting versions below 3.8, you need to add a dependency for the +# shim package `importlib-metadata` and import it if `importlib.metadata` +# is not present. +# +# You use the `version()` function to retrieve the version string for a package. 
+# The value of `__name__` normally provides the package name, but if you're +# running the package as a script (either `python -m my_package` or through +# a script installed with the package), `__name__` will be `"__main__"`, +# in which case you need to use `__package__` to get the package name. +# As far as I can tell there isn't a variable that covers both situations. +# +# Set the package version in your pyproject.toml or setup.py. If you're +# supporting Python versions before 3.8, add a conditional dependency for +# importlib-metadata (examples below). +# +# Choose one of the following code snippets, depending on what Python versions +# your package supports. +# Put the snippet in the __init__.py of your top-level package + +###### supporting Python versions below 3.8 ###### + + +def _set_version() -> str: # noqa: C901 + """Set the package version from the project metadata in pyproject.toml.""" + from warnings import warn + + fallback_version = "0.0.0" + try: + # importlib.metadata is present in Python 3.8 and later + import importlib.metadata as importlib_metadata + except ImportError: + # use the shim package importlib-metadata pre-3.8 + import importlib_metadata as importlib_metadata + + try: + # __package__ allows for the case where __name__ is "__main__" + version = importlib_metadata.version(__package__ or __name__) + except importlib_metadata.PackageNotFoundError: + version = fallback_version + + if version == fallback_version: + msg = ( + f"Package version will be {fallback_version} because Python could not find " + f"package {__package__ or __name__} in project metadata. Either the " + "version was not set in pyproject.toml or the package was not installed. 
" + "If developing code, please install the package in editable " + "mode with `poetry install` or `pip install -e .`" + ) + warn(msg) + return version + + +__version__ = _set_version() diff --git a/hairpin2/constants.py b/hairpin2/constants.py index 30e6399..609ead8 100644 --- a/hairpin2/constants.py +++ b/hairpin2/constants.py @@ -2,7 +2,6 @@ from typing import Callable import dataclasses as d -VERSION = '0.0.2a' EXIT_SUCCESS = 0 EXIT_FAILURE = 1 diff --git a/hairpin2/main.py b/hairpin2/main.py index 2652570..9f424ec 100644 --- a/hairpin2/main.py +++ b/hairpin2/main.py @@ -1,5 +1,6 @@ import pysam from hairpin2 import ref2seq as r2s, constants as c, helpers as h +import hairpin2 from statistics import mean, median, stdev import argparse import logging @@ -253,7 +254,7 @@ def main_cli() -> None: parser = argparse.ArgumentParser(prog="hairpin2", description='cruciform artefact flagging algorithm based on Ellis et al. 2020 (DOI: 10.1038/s41596-020-00437-6)') parser._optionals.title = 'info' - parser.add_argument('-v', '--version', help='print version', action='version', version=c.VERSION) + parser.add_argument('-v', '--version', help='print version', action='version', version=hairpin2.__version__) req = parser.add_argument_group('mandatory') req.add_argument('-i', '--vcf-in', help="path to input VCF", required=True) req.add_argument('-o', '--vcf-out', help="path to write output VCF", required=True) From b5a417a3891312459f197638e82877ec8eb1e15e Mon Sep 17 00:00:00 2001 From: ab63 Date: Tue, 17 Sep 2024 10:18:20 +0100 Subject: [PATCH 115/165] test CI permissions --- docker-run-unit-tests.sh | 2 +- pyproject.toml | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/docker-run-unit-tests.sh b/docker-run-unit-tests.sh index b4f9169..2bde71f 100644 --- a/docker-run-unit-tests.sh +++ b/docker-run-unit-tests.sh @@ -4,7 +4,7 @@ if [ -z ${TEST_DIR} ]; then echo "TEST_DIR not set!" 
exit 1 fi -PKG_DIR=$(python -c "import os;import crispr_lib_matching;import inspect;print(os.path.dirname(inspect.getfile(crispr_lib_matching)))") +PKG_DIR=$(python -c "import os;import hairpin2;import inspect;print(os.path.dirname(inspect.getfile(hairpin2)))") echo "$(python --version)" echo "Package source directory: ${PKG_DIR}" diff --git a/pyproject.toml b/pyproject.toml index efd7784..45d0788 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -13,7 +13,6 @@ pysam = "^0.22" [tool.poetry.plugins."console_scripts"] hairpin2 = "hairpin2.main:main_cli" - [tool.poetry.group.dev.dependencies] pytest = "^8.3.3" From 791a865d7f410e73a3ef4643550e36434f61f144 Mon Sep 17 00:00:00 2001 From: ab63 Date: Tue, 17 Sep 2024 11:15:17 +0100 Subject: [PATCH 116/165] CI tweaks --- .gitlab-ci.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 2b341be..8bace7f 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -21,8 +21,8 @@ include: .specific-variables: variables: - UNIT_TEST_MOUNT_POINT: /opt/tests - RUN_SCRIPT_MOUNT_POINT: /tmp/run.sh + UNIT_TEST_MOUNT_POINT: /hairpin2 + RUN_SCRIPT_MOUNT_POINT: /hairpin2/docker_run_unit_tests.sh # We need to set this to 1 to enable BuildKit as the Dockerfile uses BuildKit features to speed up the build DOCKER_BUILDKIT: 1 PRE_FETCH_BASE_IMAGE: python:3.12-slim @@ -55,7 +55,7 @@ unit-test: - echo "Unit test against CANDIDATE_IMAGE='${CANDIDATE_IMAGE:?not-set-in-before_script}'" - docker pull "${CANDIDATE_IMAGE}" # Test image against unit tests - it requires env vars - - docker run -e TEST_DIR="${UNIT_TEST_MOUNT_POINT}" -v "${PWD}/tests:${UNIT_TEST_MOUNT_POINT}:ro" -v "${PWD}/scripts/docker_run_unit_tests.sh:${RUN_SCRIPT_MOUNT_POINT}:ro" --rm "${CANDIDATE_IMAGE}" ${RUN_SCRIPT_MOUNT_POINT} + - docker run -e TEST_DIR="${UNIT_TEST_MOUNT_POINT}" -v "${PWD}:${UNIT_TEST_MOUNT_POINT}:ro" -v "${PWD}/docker_run_unit_tests.sh:${RUN_SCRIPT_MOUNT_POINT}:ro" --rm "${CANDIDATE_IMAGE}" 
${RUN_SCRIPT_MOUNT_POINT} - echo "*** [SCRIPT] END ***" publish-develop: From ac6cfb38fbe0423758e1a5df1370dff10201c4f6 Mon Sep 17 00:00:00 2001 From: ab63 Date: Tue, 17 Sep 2024 11:26:08 +0100 Subject: [PATCH 117/165] CI tweaks --- .gitlab-ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 8bace7f..9dc7421 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -55,7 +55,7 @@ unit-test: - echo "Unit test against CANDIDATE_IMAGE='${CANDIDATE_IMAGE:?not-set-in-before_script}'" - docker pull "${CANDIDATE_IMAGE}" # Test image against unit tests - it requires env vars - - docker run -e TEST_DIR="${UNIT_TEST_MOUNT_POINT}" -v "${PWD}:${UNIT_TEST_MOUNT_POINT}:ro" -v "${PWD}/docker_run_unit_tests.sh:${RUN_SCRIPT_MOUNT_POINT}:ro" --rm "${CANDIDATE_IMAGE}" ${RUN_SCRIPT_MOUNT_POINT} + - docker run -e TEST_DIR="${UNIT_TEST_MOUNT_POINT}" -v "${PWD}:${UNIT_TEST_MOUNT_POINT}:ro" -v "${PWD}/docker_run_unit_tests.sh:${RUN_SCRIPT_MOUNT_POINT}:ro" --rm "${CANDIDATE_IMAGE}" bash "${RUN_SCRIPT_MOUNT_POINT}" - echo "*** [SCRIPT] END ***" publish-develop: From eeac5f834be53e6cffedca9c584062c59734909b Mon Sep 17 00:00:00 2001 From: ab63 Date: Tue, 17 Sep 2024 11:28:03 +0100 Subject: [PATCH 118/165] missed a spot --- .gitlab-ci.yml | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 9dc7421..5d3badd 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -55,7 +55,12 @@ unit-test: - echo "Unit test against CANDIDATE_IMAGE='${CANDIDATE_IMAGE:?not-set-in-before_script}'" - docker pull "${CANDIDATE_IMAGE}" # Test image against unit tests - it requires env vars - - docker run -e TEST_DIR="${UNIT_TEST_MOUNT_POINT}" -v "${PWD}:${UNIT_TEST_MOUNT_POINT}:ro" -v "${PWD}/docker_run_unit_tests.sh:${RUN_SCRIPT_MOUNT_POINT}:ro" --rm "${CANDIDATE_IMAGE}" bash "${RUN_SCRIPT_MOUNT_POINT}" + - docker run --entrypoint "" \ + -e TEST_DIR="${UNIT_TEST_MOUNT_POINT}" \ + -v 
"${PWD}:${UNIT_TEST_MOUNT_POINT}:ro" \ + -v "${PWD}/docker_run_unit_tests.sh:${RUN_SCRIPT_MOUNT_POINT}:ro" \ + --rm \ + "${CANDIDATE_IMAGE}" bash "${RUN_SCRIPT_MOUNT_POINT}" - echo "*** [SCRIPT] END ***" publish-develop: From 29441cdc91636a4ee428eb5c6541a5e653987ede Mon Sep 17 00:00:00 2001 From: ab63 Date: Tue, 17 Sep 2024 11:35:28 +0100 Subject: [PATCH 119/165] fix formatting --- .gitlab-ci.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 5d3badd..fcdab47 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -59,8 +59,8 @@ unit-test: -e TEST_DIR="${UNIT_TEST_MOUNT_POINT}" \ -v "${PWD}:${UNIT_TEST_MOUNT_POINT}:ro" \ -v "${PWD}/docker_run_unit_tests.sh:${RUN_SCRIPT_MOUNT_POINT}:ro" \ - --rm \ - "${CANDIDATE_IMAGE}" bash "${RUN_SCRIPT_MOUNT_POINT}" + --rm "${CANDIDATE_IMAGE}" \ + bash "${RUN_SCRIPT_MOUNT_POINT}" - echo "*** [SCRIPT] END ***" publish-develop: From 2d8b59696286c24745a76833f170a3f5f7b121fa Mon Sep 17 00:00:00 2001 From: ab63 Date: Tue, 17 Sep 2024 11:48:17 +0100 Subject: [PATCH 120/165] ok now I understand I think --- .gitlab-ci.yml | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index fcdab47..6c2fb2a 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -21,8 +21,8 @@ include: .specific-variables: variables: - UNIT_TEST_MOUNT_POINT: /hairpin2 - RUN_SCRIPT_MOUNT_POINT: /hairpin2/docker_run_unit_tests.sh + UNIT_TEST_MOUNT_POINT: /opt/tests + RUN_SCRIPT_MOUNT_POINT: /tmp/run.sh # We need to set this to 1 to enable BuildKit as the Dockerfile uses BuildKit features to speed up the build DOCKER_BUILDKIT: 1 PRE_FETCH_BASE_IMAGE: python:3.12-slim @@ -55,12 +55,10 @@ unit-test: - echo "Unit test against CANDIDATE_IMAGE='${CANDIDATE_IMAGE:?not-set-in-before_script}'" - docker pull "${CANDIDATE_IMAGE}" # Test image against unit tests - it requires env vars - - docker run --entrypoint "" \ - -e TEST_DIR="${UNIT_TEST_MOUNT_POINT}" \ + - docker 
run -e TEST_DIR="${UNIT_TEST_MOUNT_POINT}" \ -v "${PWD}:${UNIT_TEST_MOUNT_POINT}:ro" \ -v "${PWD}/docker_run_unit_tests.sh:${RUN_SCRIPT_MOUNT_POINT}:ro" \ - --rm "${CANDIDATE_IMAGE}" \ - bash "${RUN_SCRIPT_MOUNT_POINT}" + --rm "${CANDIDATE_IMAGE}" ${RUN_SCRIPT_MOUNT_POINT} - echo "*** [SCRIPT] END ***" publish-develop: From a5616687eb3c76e54b8a3f71287bcb3877d41931 Mon Sep 17 00:00:00 2001 From: ab63 Date: Tue, 17 Sep 2024 13:35:38 +0100 Subject: [PATCH 121/165] no bash line breaks --- .gitlab-ci.yml | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 6c2fb2a..77fd55d 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -55,10 +55,7 @@ unit-test: - echo "Unit test against CANDIDATE_IMAGE='${CANDIDATE_IMAGE:?not-set-in-before_script}'" - docker pull "${CANDIDATE_IMAGE}" # Test image against unit tests - it requires env vars - - docker run -e TEST_DIR="${UNIT_TEST_MOUNT_POINT}" \ - -v "${PWD}:${UNIT_TEST_MOUNT_POINT}:ro" \ - -v "${PWD}/docker_run_unit_tests.sh:${RUN_SCRIPT_MOUNT_POINT}:ro" \ - --rm "${CANDIDATE_IMAGE}" ${RUN_SCRIPT_MOUNT_POINT} + - docker run -e TEST_DIR="${UNIT_TEST_MOUNT_POINT}" -v "${PWD}:${UNIT_TEST_MOUNT_POINT}:ro" -v "${PWD}/docker_run_unit_tests.sh:${RUN_SCRIPT_MOUNT_POINT}:ro" --rm "${CANDIDATE_IMAGE}" ${RUN_SCRIPT_MOUNT_POINT} - echo "*** [SCRIPT] END ***" publish-develop: From d92bd5456b033297037ca5926cd23be7c0e44ad3 Mon Sep 17 00:00:00 2001 From: ab63 Date: Tue, 17 Sep 2024 13:44:53 +0100 Subject: [PATCH 122/165] override entrypoint --- .gitlab-ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 77fd55d..7b0ded9 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -55,7 +55,7 @@ unit-test: - echo "Unit test against CANDIDATE_IMAGE='${CANDIDATE_IMAGE:?not-set-in-before_script}'" - docker pull "${CANDIDATE_IMAGE}" # Test image against unit tests - it requires env vars - - docker run -e TEST_DIR="${UNIT_TEST_MOUNT_POINT}" -v 
"${PWD}:${UNIT_TEST_MOUNT_POINT}:ro" -v "${PWD}/docker_run_unit_tests.sh:${RUN_SCRIPT_MOUNT_POINT}:ro" --rm "${CANDIDATE_IMAGE}" ${RUN_SCRIPT_MOUNT_POINT} + - docker run --entrypoint "bash" -e TEST_DIR="${UNIT_TEST_MOUNT_POINT}" -v "${PWD}:${UNIT_TEST_MOUNT_POINT}:ro" -v "${PWD}/docker_run_unit_tests.sh:${RUN_SCRIPT_MOUNT_POINT}:ro" --rm "${CANDIDATE_IMAGE}" ${RUN_SCRIPT_MOUNT_POINT} - echo "*** [SCRIPT] END ***" publish-develop: From a18f15de1086f3d9afaf3b17fa3a9f67bbb17b20 Mon Sep 17 00:00:00 2001 From: ab63 Date: Tue, 17 Sep 2024 13:53:55 +0100 Subject: [PATCH 123/165] correct path --- .gitignore | 1 + .gitlab-ci.yml | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 8f6224c..7315945 100644 --- a/.gitignore +++ b/.gitignore @@ -11,3 +11,4 @@ test_data_creation/ *.txt *.sif *.json +poetry.lock diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 7b0ded9..f9e1181 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -55,7 +55,7 @@ unit-test: - echo "Unit test against CANDIDATE_IMAGE='${CANDIDATE_IMAGE:?not-set-in-before_script}'" - docker pull "${CANDIDATE_IMAGE}" # Test image against unit tests - it requires env vars - - docker run --entrypoint "bash" -e TEST_DIR="${UNIT_TEST_MOUNT_POINT}" -v "${PWD}:${UNIT_TEST_MOUNT_POINT}:ro" -v "${PWD}/docker_run_unit_tests.sh:${RUN_SCRIPT_MOUNT_POINT}:ro" --rm "${CANDIDATE_IMAGE}" ${RUN_SCRIPT_MOUNT_POINT} + - docker run --entrypoint "bash" -e TEST_DIR="${UNIT_TEST_MOUNT_POINT}" -v "${PWD}:${UNIT_TEST_MOUNT_POINT}:ro" -v "${PWD}/docker-run-unit-tests.sh:${RUN_SCRIPT_MOUNT_POINT}:ro" --rm "${CANDIDATE_IMAGE}" ${RUN_SCRIPT_MOUNT_POINT} - echo "*** [SCRIPT] END ***" publish-develop: From 1eb120d943aa6ac0a0eba1f908d67c23578a2daf Mon Sep 17 00:00:00 2001 From: ab63 Date: Tue, 17 Sep 2024 15:51:44 +0100 Subject: [PATCH 124/165] restructure for cov testing --- .gitlab-ci.yml | 2 +- test/single_var_silico.vcf | 141 ------------------ .../test_validate_read.py | 0 3 files changed, 1 
insertion(+), 142 deletions(-) delete mode 100644 test/single_var_silico.vcf rename test_validate_read.py => test/test_validate_read.py (100%) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index f9e1181..f9759f9 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -55,7 +55,7 @@ unit-test: - echo "Unit test against CANDIDATE_IMAGE='${CANDIDATE_IMAGE:?not-set-in-before_script}'" - docker pull "${CANDIDATE_IMAGE}" # Test image against unit tests - it requires env vars - - docker run --entrypoint "bash" -e TEST_DIR="${UNIT_TEST_MOUNT_POINT}" -v "${PWD}:${UNIT_TEST_MOUNT_POINT}:ro" -v "${PWD}/docker-run-unit-tests.sh:${RUN_SCRIPT_MOUNT_POINT}:ro" --rm "${CANDIDATE_IMAGE}" ${RUN_SCRIPT_MOUNT_POINT} + - docker run --entrypoint "bash" -e TEST_DIR="${UNIT_TEST_MOUNT_POINT}" -v "${PWD}/test:${UNIT_TEST_MOUNT_POINT}:ro" -v "${PWD}/docker-run-unit-tests.sh:${RUN_SCRIPT_MOUNT_POINT}:ro" --rm "${CANDIDATE_IMAGE}" ${RUN_SCRIPT_MOUNT_POINT} - echo "*** [SCRIPT] END ***" publish-develop: diff --git a/test/single_var_silico.vcf b/test/single_var_silico.vcf deleted file mode 100644 index 3c06019..0000000 --- a/test/single_var_silico.vcf +++ /dev/null @@ -1,141 +0,0 @@ -##fileformat=VCFv4.1 -##FILTER= -##FILTER== 25 base quality"> -##FILTER= -##FILTER== 15 base quality found in the matched normal"> -##FILTER= -##FILTER= -##FILTER= -##FILTER= -##FILTER= -##FILTER= -##FILTER= -##FILTER= -##FILTER= -##FILTER== 3 mutant allele present in at least 1 percent unmatched normal samples in the unmatched VCF."> -##FILTER== 10 on each strand but mutant allele is only present on one strand"> -##FILTER= -##FORMAT= -##FORMAT= -##FORMAT= -##FORMAT= -##FORMAT= -##FORMAT= -##FORMAT= -##FORMAT= -##FORMAT= -##FORMAT= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##SAMPLE= -##SAMPLE= -##cavemanVersion=1.15.2 -##cgpAnalysisProc_20240726.1=11255789 -##contig= -##contig= -##contig= -##contig= -##contig= 
-##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##fileDate=20240726 -##source_20240726.1=AnnotateVcf.pl -##vcfProcessLog=,InputVCFSource=,InputVCFParam=> -##vcfProcessLog_20240726.1=,InputVCFSource=,InputVCFVer=<1.12.0>,InputVCFParam=> -##bcftools_viewVersion=1.19+htslib-1.19.1 -#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NORMAL TUMOUR -1 100 e2585868-4b6d-11ef-a179-d73695b4a6ba G A . 
AN;MN;MNP DP=223;MP=1;GP=0.00053;TG=GG/AGGGGGGGGG;TP=1;SG=GG/AAGGGGGGGG;SP=0.0015;ASRD=0.91;CLPM=0;ASMD=138;VT=Sub GT:FAZ:FCZ:FGZ:FTZ:RAZ:RCZ:RGZ:RTZ:PM 0/0:3:0:29:0:3:0:44:0:0.076 0/1:4:0:54:0:8:0:78:0:0.083 diff --git a/test_validate_read.py b/test/test_validate_read.py similarity index 100% rename from test_validate_read.py rename to test/test_validate_read.py From 2c60342f4054ad87ffe80d2409f5117ea442c84c Mon Sep 17 00:00:00 2001 From: ab63 Date: Tue, 17 Sep 2024 16:12:18 +0100 Subject: [PATCH 125/165] add license --- LICENSE | 662 +++++++++++++++++++++++++++++++++++++ hairpin2/__init__.py | 20 ++ hairpin2/constants.py | 20 ++ hairpin2/helpers.py | 20 ++ hairpin2/main.py | 20 ++ hairpin2/ref2seq.py | 20 ++ test/test_validate_read.py | 20 ++ 7 files changed, 782 insertions(+) create mode 100644 LICENSE diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..fe6b903 --- /dev/null +++ b/LICENSE @@ -0,0 +1,662 @@ + GNU AFFERO GENERAL PUBLIC LICENSE + Version 3, 19 November 2007 + + Copyright (C) 2007 Free Software Foundation, Inc. + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + + Preamble + + The GNU Affero General Public License is a free, copyleft license for +software and other kinds of works, specifically designed to ensure +cooperation with the community in the case of network server software. + + The licenses for most software and other practical works are designed +to take away your freedom to share and change the works. By contrast, +our General Public Licenses are intended to guarantee your freedom to +share and change all versions of a program--to make sure it remains free +software for all its users. + + When we speak of free software, we are referring to freedom, not +price. 
Our General Public Licenses are designed to make sure that you +have the freedom to distribute copies of free software (and charge for +them if you wish), that you receive source code or can get it if you +want it, that you can change the software or use pieces of it in new +free programs, and that you know you can do these things. + + Developers that use our General Public Licenses protect your rights +with two steps: (1) assert copyright on the software, and (2) offer +you this License which gives you legal permission to copy, distribute +and/or modify the software. + + A secondary benefit of defending all users' freedom is that +improvements made in alternate versions of the program, if they +receive widespread use, become available for other developers to +incorporate. Many developers of free software are heartened and +encouraged by the resulting cooperation. However, in the case of +software used on network servers, this result may fail to come about. +The GNU General Public License permits making a modified version and +letting the public access it on a server without ever releasing its +source code to the public. + + The GNU Affero General Public License is designed specifically to +ensure that, in such cases, the modified source code becomes available +to the community. It requires the operator of a network server to +provide the source code of the modified version running there to the +users of that server. Therefore, public use of a modified version, on +a publicly accessible server, gives the public access to the source +code of the modified version. + + An older license, called the Affero General Public License and +published by Affero, was designed to accomplish similar goals. This is +a different license, not a version of the Affero GPL, but Affero has +released a new version of the Affero GPL which permits relicensing under +this license. + + The precise terms and conditions for copying, distribution and +modification follow. 
+ + TERMS AND CONDITIONS + + 0. Definitions. + + "This License" refers to version 3 of the GNU Affero General Public License. + + "Copyright" also means copyright-like laws that apply to other kinds of +works, such as semiconductor masks. + + "The Program" refers to any copyrightable work licensed under this +License. Each licensee is addressed as "you". "Licensees" and +"recipients" may be individuals or organizations. + + To "modify" a work means to copy from or adapt all or part of the work +in a fashion requiring copyright permission, other than the making of an +exact copy. The resulting work is called a "modified version" of the +earlier work or a work "based on" the earlier work. + + A "covered work" means either the unmodified Program or a work based +on the Program. + + To "propagate" a work means to do anything with it that, without +permission, would make you directly or secondarily liable for +infringement under applicable copyright law, except executing it on a +computer or modifying a private copy. Propagation includes copying, +distribution (with or without modification), making available to the +public, and in some countries other activities as well. + + To "convey" a work means any kind of propagation that enables other +parties to make or receive copies. Mere interaction with a user through +a computer network, with no transfer of a copy, is not conveying. + + An interactive user interface displays "Appropriate Legal Notices" +to the extent that it includes a convenient and prominently visible +feature that (1) displays an appropriate copyright notice, and (2) +tells the user that there is no warranty for the work (except to the +extent that warranties are provided), that licensees may convey the +work under this License, and how to view a copy of this License. If +the interface presents a list of user commands or options, such as a +menu, a prominent item in the list meets this criterion. + + 1. Source Code. 
+ + The "source code" for a work means the preferred form of the work +for making modifications to it. "Object code" means any non-source +form of a work. + + A "Standard Interface" means an interface that either is an official +standard defined by a recognized standards body, or, in the case of +interfaces specified for a particular programming language, one that +is widely used among developers working in that language. + + The "System Libraries" of an executable work include anything, other +than the work as a whole, that (a) is included in the normal form of +packaging a Major Component, but which is not part of that Major +Component, and (b) serves only to enable use of the work with that +Major Component, or to implement a Standard Interface for which an +implementation is available to the public in source code form. A +"Major Component", in this context, means a major essential component +(kernel, window system, and so on) of the specific operating system +(if any) on which the executable work runs, or a compiler used to +produce the work, or an object code interpreter used to run it. + + The "Corresponding Source" for a work in object code form means all +the source code needed to generate, install, and (for an executable +work) run the object code and to modify the work, including scripts to +control those activities. However, it does not include the work's +System Libraries, or general-purpose tools or generally available free +programs which are used unmodified in performing those activities but +which are not part of the work. For example, Corresponding Source +includes interface definition files associated with source files for +the work, and the source code for shared libraries and dynamically +linked subprograms that the work is specifically designed to require, +such as by intimate data communication or control flow between those +subprograms and other parts of the work. 
+ + The Corresponding Source need not include anything that users +can regenerate automatically from other parts of the Corresponding +Source. + + The Corresponding Source for a work in source code form is that +same work. + + 2. Basic Permissions. + + All rights granted under this License are granted for the term of +copyright on the Program, and are irrevocable provided the stated +conditions are met. This License explicitly affirms your unlimited +permission to run the unmodified Program. The output from running a +covered work is covered by this License only if the output, given its +content, constitutes a covered work. This License acknowledges your +rights of fair use or other equivalent, as provided by copyright law. + + You may make, run and propagate covered works that you do not +convey, without conditions so long as your license otherwise remains +in force. You may convey covered works to others for the sole purpose +of having them make modifications exclusively for you, or provide you +with facilities for running those works, provided that you comply with +the terms of this License in conveying all material for which you do +not control copyright. Those thus making or running the covered works +for you must do so exclusively on your behalf, under your direction +and control, on terms that prohibit them from making any copies of +your copyrighted material outside their relationship with you. + + Conveying under any other circumstances is permitted solely under +the conditions stated below. Sublicensing is not allowed; section 10 +makes it unnecessary. + + 3. Protecting Users' Legal Rights From Anti-Circumvention Law. + + No covered work shall be deemed part of an effective technological +measure under any applicable law fulfilling obligations under article +11 of the WIPO copyright treaty adopted on 20 December 1996, or +similar laws prohibiting or restricting circumvention of such +measures. 
+ + When you convey a covered work, you waive any legal power to forbid +circumvention of technological measures to the extent such circumvention +is effected by exercising rights under this License with respect to +the covered work, and you disclaim any intention to limit operation or +modification of the work as a means of enforcing, against the work's +users, your or third parties' legal rights to forbid circumvention of +technological measures. + + 4. Conveying Verbatim Copies. + + You may convey verbatim copies of the Program's source code as you +receive it, in any medium, provided that you conspicuously and +appropriately publish on each copy an appropriate copyright notice; +keep intact all notices stating that this License and any +non-permissive terms added in accord with section 7 apply to the code; +keep intact all notices of the absence of any warranty; and give all +recipients a copy of this License along with the Program. + + You may charge any price or no price for each copy that you convey, +and you may offer support or warranty protection for a fee. + + 5. Conveying Modified Source Versions. + + You may convey a work based on the Program, or the modifications to +produce it from the Program, in the form of source code under the +terms of section 4, provided that you also meet all of these conditions: + + a) The work must carry prominent notices stating that you modified + it, and giving a relevant date. + + b) The work must carry prominent notices stating that it is + released under this License and any conditions added under section + 7. This requirement modifies the requirement in section 4 to + "keep intact all notices". + + c) You must license the entire work, as a whole, under this + License to anyone who comes into possession of a copy. This + License will therefore apply, along with any applicable section 7 + additional terms, to the whole of the work, and all its parts, + regardless of how they are packaged. 
This License gives no + permission to license the work in any other way, but it does not + invalidate such permission if you have separately received it. + + d) If the work has interactive user interfaces, each must display + Appropriate Legal Notices; however, if the Program has interactive + interfaces that do not display Appropriate Legal Notices, your + work need not make them do so. + + A compilation of a covered work with other separate and independent +works, which are not by their nature extensions of the covered work, +and which are not combined with it such as to form a larger program, +in or on a volume of a storage or distribution medium, is called an +"aggregate" if the compilation and its resulting copyright are not +used to limit the access or legal rights of the compilation's users +beyond what the individual works permit. Inclusion of a covered work +in an aggregate does not cause this License to apply to the other +parts of the aggregate. + + 6. Conveying Non-Source Forms. + + You may convey a covered work in object code form under the terms +of sections 4 and 5, provided that you also convey the +machine-readable Corresponding Source under the terms of this License, +in one of these ways: + + a) Convey the object code in, or embodied in, a physical product + (including a physical distribution medium), accompanied by the + Corresponding Source fixed on a durable physical medium + customarily used for software interchange. 
+ + b) Convey the object code in, or embodied in, a physical product + (including a physical distribution medium), accompanied by a + written offer, valid for at least three years and valid for as + long as you offer spare parts or customer support for that product + model, to give anyone who possesses the object code either (1) a + copy of the Corresponding Source for all the software in the + product that is covered by this License, on a durable physical + medium customarily used for software interchange, for a price no + more than your reasonable cost of physically performing this + conveying of source, or (2) access to copy the + Corresponding Source from a network server at no charge. + + c) Convey individual copies of the object code with a copy of the + written offer to provide the Corresponding Source. This + alternative is allowed only occasionally and noncommercially, and + only if you received the object code with such an offer, in accord + with subsection 6b. + + d) Convey the object code by offering access from a designated + place (gratis or for a charge), and offer equivalent access to the + Corresponding Source in the same way through the same place at no + further charge. You need not require recipients to copy the + Corresponding Source along with the object code. If the place to + copy the object code is a network server, the Corresponding Source + may be on a different server (operated by you or a third party) + that supports equivalent copying facilities, provided you maintain + clear directions next to the object code saying where to find the + Corresponding Source. Regardless of what server hosts the + Corresponding Source, you remain obligated to ensure that it is + available for as long as needed to satisfy these requirements. 
+ + e) Convey the object code using peer-to-peer transmission, provided + you inform other peers where the object code and Corresponding + Source of the work are being offered to the general public at no + charge under subsection 6d. + + A separable portion of the object code, whose source code is excluded +from the Corresponding Source as a System Library, need not be +included in conveying the object code work. + + A "User Product" is either (1) a "consumer product", which means any +tangible personal property which is normally used for personal, family, +or household purposes, or (2) anything designed or sold for incorporation +into a dwelling. In determining whether a product is a consumer product, +doubtful cases shall be resolved in favor of coverage. For a particular +product received by a particular user, "normally used" refers to a +typical or common use of that class of product, regardless of the status +of the particular user or of the way in which the particular user +actually uses, or expects or is expected to use, the product. A product +is a consumer product regardless of whether the product has substantial +commercial, industrial or non-consumer uses, unless such uses represent +the only significant mode of use of the product. + + "Installation Information" for a User Product means any methods, +procedures, authorization keys, or other information required to install +and execute modified versions of a covered work in that User Product from +a modified version of its Corresponding Source. The information must +suffice to ensure that the continued functioning of the modified object +code is in no case prevented or interfered with solely because +modification has been made. 
+ + If you convey an object code work under this section in, or with, or +specifically for use in, a User Product, and the conveying occurs as +part of a transaction in which the right of possession and use of the +User Product is transferred to the recipient in perpetuity or for a +fixed term (regardless of how the transaction is characterized), the +Corresponding Source conveyed under this section must be accompanied +by the Installation Information. But this requirement does not apply +if neither you nor any third party retains the ability to install +modified object code on the User Product (for example, the work has +been installed in ROM). + + The requirement to provide Installation Information does not include a +requirement to continue to provide support service, warranty, or updates +for a work that has been modified or installed by the recipient, or for +the User Product in which it has been modified or installed. Access to a +network may be denied when the modification itself materially and +adversely affects the operation of the network or violates the rules and +protocols for communication across the network. + + Corresponding Source conveyed, and Installation Information provided, +in accord with this section must be in a format that is publicly +documented (and with an implementation available to the public in +source code form), and must require no special password or key for +unpacking, reading or copying. + + 7. Additional Terms. + + "Additional permissions" are terms that supplement the terms of this +License by making exceptions from one or more of its conditions. +Additional permissions that are applicable to the entire Program shall +be treated as though they were included in this License, to the extent +that they are valid under applicable law. 
If additional permissions +apply only to part of the Program, that part may be used separately +under those permissions, but the entire Program remains governed by +this License without regard to the additional permissions. + + When you convey a copy of a covered work, you may at your option +remove any additional permissions from that copy, or from any part of +it. (Additional permissions may be written to require their own +removal in certain cases when you modify the work.) You may place +additional permissions on material, added by you to a covered work, +for which you have or can give appropriate copyright permission. + + Notwithstanding any other provision of this License, for material you +add to a covered work, you may (if authorized by the copyright holders of +that material) supplement the terms of this License with terms: + + a) Disclaiming warranty or limiting liability differently from the + terms of sections 15 and 16 of this License; or + + b) Requiring preservation of specified reasonable legal notices or + author attributions in that material or in the Appropriate Legal + Notices displayed by works containing it; or + + c) Prohibiting misrepresentation of the origin of that material, or + requiring that modified versions of such material be marked in + reasonable ways as different from the original version; or + + d) Limiting the use for publicity purposes of names of licensors or + authors of the material; or + + e) Declining to grant rights under trademark law for use of some + trade names, trademarks, or service marks; or + + f) Requiring indemnification of licensors and authors of that + material by anyone who conveys the material (or modified versions of + it) with contractual assumptions of liability to the recipient, for + any liability that these contractual assumptions directly impose on + those licensors and authors. + + All other non-permissive additional terms are considered "further +restrictions" within the meaning of section 10. 
If the Program as you +received it, or any part of it, contains a notice stating that it is +governed by this License along with a term that is a further +restriction, you may remove that term. If a license document contains +a further restriction but permits relicensing or conveying under this +License, you may add to a covered work material governed by the terms +of that license document, provided that the further restriction does +not survive such relicensing or conveying. + + If you add terms to a covered work in accord with this section, you +must place, in the relevant source files, a statement of the +additional terms that apply to those files, or a notice indicating +where to find the applicable terms. + + Additional terms, permissive or non-permissive, may be stated in the +form of a separately written license, or stated as exceptions; +the above requirements apply either way. + + 8. Termination. + + You may not propagate or modify a covered work except as expressly +provided under this License. Any attempt otherwise to propagate or +modify it is void, and will automatically terminate your rights under +this License (including any patent licenses granted under the third +paragraph of section 11). + + However, if you cease all violation of this License, then your +license from a particular copyright holder is reinstated (a) +provisionally, unless and until the copyright holder explicitly and +finally terminates your license, and (b) permanently, if the copyright +holder fails to notify you of the violation by some reasonable means +prior to 60 days after the cessation. + + Moreover, your license from a particular copyright holder is +reinstated permanently if the copyright holder notifies you of the +violation by some reasonable means, this is the first time you have +received notice of violation of this License (for any work) from that +copyright holder, and you cure the violation prior to 30 days after +your receipt of the notice. 
+ + Termination of your rights under this section does not terminate the +licenses of parties who have received copies or rights from you under +this License. If your rights have been terminated and not permanently +reinstated, you do not qualify to receive new licenses for the same +material under section 10. + + 9. Acceptance Not Required for Having Copies. + + You are not required to accept this License in order to receive or +run a copy of the Program. Ancillary propagation of a covered work +occurring solely as a consequence of using peer-to-peer transmission +to receive a copy likewise does not require acceptance. However, +nothing other than this License grants you permission to propagate or +modify any covered work. These actions infringe copyright if you do +not accept this License. Therefore, by modifying or propagating a +covered work, you indicate your acceptance of this License to do so. + + 10. Automatic Licensing of Downstream Recipients. + + Each time you convey a covered work, the recipient automatically +receives a license from the original licensors, to run, modify and +propagate that work, subject to this License. You are not responsible +for enforcing compliance by third parties with this License. + + An "entity transaction" is a transaction transferring control of an +organization, or substantially all assets of one, or subdividing an +organization, or merging organizations. If propagation of a covered +work results from an entity transaction, each party to that +transaction who receives a copy of the work also receives whatever +licenses to the work the party's predecessor in interest had or could +give under the previous paragraph, plus a right to possession of the +Corresponding Source of the work from the predecessor in interest, if +the predecessor has it or can get it with reasonable efforts. + + You may not impose any further restrictions on the exercise of the +rights granted or affirmed under this License. 
For example, you may +not impose a license fee, royalty, or other charge for exercise of +rights granted under this License, and you may not initiate litigation +(including a cross-claim or counterclaim in a lawsuit) alleging that +any patent claim is infringed by making, using, selling, offering for +sale, or importing the Program or any portion of it. + + 11. Patents. + + A "contributor" is a copyright holder who authorizes use under this +License of the Program or a work on which the Program is based. The +work thus licensed is called the contributor's "contributor version". + + A contributor's "essential patent claims" are all patent claims +owned or controlled by the contributor, whether already acquired or +hereafter acquired, that would be infringed by some manner, permitted +by this License, of making, using, or selling its contributor version, +but do not include claims that would be infringed only as a +consequence of further modification of the contributor version. For +purposes of this definition, "control" includes the right to grant +patent sublicenses in a manner consistent with the requirements of +this License. + + Each contributor grants you a non-exclusive, worldwide, royalty-free +patent license under the contributor's essential patent claims, to +make, use, sell, offer for sale, import and otherwise run, modify and +propagate the contents of its contributor version. + + In the following three paragraphs, a "patent license" is any express +agreement or commitment, however denominated, not to enforce a patent +(such as an express permission to practice a patent or covenant not to +sue for patent infringement). To "grant" such a patent license to a +party means to make such an agreement or commitment not to enforce a +patent against the party. 
+ + If you convey a covered work, knowingly relying on a patent license, +and the Corresponding Source of the work is not available for anyone +to copy, free of charge and under the terms of this License, through a +publicly available network server or other readily accessible means, +then you must either (1) cause the Corresponding Source to be so +available, or (2) arrange to deprive yourself of the benefit of the +patent license for this particular work, or (3) arrange, in a manner +consistent with the requirements of this License, to extend the patent +license to downstream recipients. "Knowingly relying" means you have +actual knowledge that, but for the patent license, your conveying the +covered work in a country, or your recipient's use of the covered work +in a country, would infringe one or more identifiable patents in that +country that you have reason to believe are valid. + + If, pursuant to or in connection with a single transaction or +arrangement, you convey, or propagate by procuring conveyance of, a +covered work, and grant a patent license to some of the parties +receiving the covered work authorizing them to use, propagate, modify +or convey a specific copy of the covered work, then the patent license +you grant is automatically extended to all recipients of the covered +work and works based on it. + + A patent license is "discriminatory" if it does not include within +the scope of its coverage, prohibits the exercise of, or is +conditioned on the non-exercise of one or more of the rights that are +specifically granted under this License. 
You may not convey a covered +work if you are a party to an arrangement with a third party that is +in the business of distributing software, under which you make payment +to the third party based on the extent of your activity of conveying +the work, and under which the third party grants, to any of the +parties who would receive the covered work from you, a discriminatory +patent license (a) in connection with copies of the covered work +conveyed by you (or copies made from those copies), or (b) primarily +for and in connection with specific products or compilations that +contain the covered work, unless you entered into that arrangement, +or that patent license was granted, prior to 28 March 2007. + + Nothing in this License shall be construed as excluding or limiting +any implied license or other defenses to infringement that may +otherwise be available to you under applicable patent law. + + 12. No Surrender of Others' Freedom. + + If conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot convey a +covered work so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you may +not convey it at all. For example, if you agree to terms that obligate you +to collect a royalty for further conveying from those to whom you convey +the Program, the only way you could satisfy both those terms and this +License would be to refrain entirely from conveying the Program. + + 13. Remote Network Interaction; Use with the GNU General Public License. 
+ + Notwithstanding any other provision of this License, if you modify the +Program, your modified version must prominently offer all users +interacting with it remotely through a computer network (if your version +supports such interaction) an opportunity to receive the Corresponding +Source of your version by providing access to the Corresponding Source +from a network server at no charge, through some standard or customary +means of facilitating copying of software. This Corresponding Source +shall include the Corresponding Source for any work covered by version 3 +of the GNU General Public License that is incorporated pursuant to the +following paragraph. + + Notwithstanding any other provision of this License, you have +permission to link or combine any covered work with a work licensed +under version 3 of the GNU General Public License into a single +combined work, and to convey the resulting work. The terms of this +License will continue to apply to the part which is the covered work, +but the work with which it is combined will remain governed by version +3 of the GNU General Public License. + + 14. Revised Versions of this License. + + The Free Software Foundation may publish revised and/or new versions of +the GNU Affero General Public License from time to time. Such new versions +will be similar in spirit to the present version, but may differ in detail to +address new problems or concerns. + + Each version is given a distinguishing version number. If the +Program specifies that a certain numbered version of the GNU Affero General +Public License "or any later version" applies to it, you have the +option of following the terms and conditions either of that numbered +version or of any later version published by the Free Software +Foundation. If the Program does not specify a version number of the +GNU Affero General Public License, you may choose any version ever published +by the Free Software Foundation. 
+ + If the Program specifies that a proxy can decide which future +versions of the GNU Affero General Public License can be used, that proxy's +public statement of acceptance of a version permanently authorizes you +to choose that version for the Program. + + Later license versions may give you additional or different +permissions. However, no additional obligations are imposed on any +author or copyright holder as a result of your choosing to follow a +later version. + + 15. Disclaimer of Warranty. + + THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY +APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT +HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY +OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, +THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM +IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF +ALL NECESSARY SERVICING, REPAIR OR CORRECTION. + + 16. Limitation of Liability. + + IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING +WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS +THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY +GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE +USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF +DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD +PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), +EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF +SUCH DAMAGES. + + 17. Interpretation of Sections 15 and 16. 
+ + If the disclaimer of warranty and limitation of liability provided +above cannot be given local legal effect according to their terms, +reviewing courts shall apply local law that most closely approximates +an absolute waiver of all civil liability in connection with the +Program, unless a warranty or assumption of liability accompanies a +copy of the Program in return for a fee. + + END OF TERMS AND CONDITIONS + + How to Apply These Terms to Your New Programs + + If you develop a new program, and you want it to be of the greatest +possible use to the public, the best way to achieve this is to make it +free software which everyone can redistribute and change under these terms. + + To do so, attach the following notices to the program. It is safest +to attach them to the start of each source file to most effectively +state the exclusion of warranty; and each file should have at least +the "copyright" line and a pointer to where the full notice is found. + + + Copyright (C) + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published + by the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see . + +Also add information on how to contact you by electronic and paper mail. + + If your software can interact with users remotely through a computer +network, you should also make sure that it provides a way for users to +get its source. For example, if your program is a web application, its +interface could display a "Source" link that leads users to an archive +of the code. 
There are many ways you could offer source, and different +solutions will be better for different programs; see section 13 for the +specific requirements. + + You should also get your employer (if you work as a programmer) or school, +if any, to sign a "copyright disclaimer" for the program, if necessary. +For more information on this, and how to apply and follow the GNU AGPL, see +. + diff --git a/hairpin2/__init__.py b/hairpin2/__init__.py index 8737543..52cc277 100644 --- a/hairpin2/__init__.py +++ b/hairpin2/__init__.py @@ -1,3 +1,23 @@ +# hairpin2 +# +# Copyright (C) 2024 Genome Research Ltd. +# +# Author: Alex Byrne +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . + + # You can single-source your package version setting the package version only in # the project data in `pyproject.toml` or `setup.py`, using `importlib.metadata`. # diff --git a/hairpin2/constants.py b/hairpin2/constants.py index 609ead8..0c356bd 100644 --- a/hairpin2/constants.py +++ b/hairpin2/constants.py @@ -1,3 +1,23 @@ +# hairpin2 +# +# Copyright (C) 2024 Genome Research Ltd. +# +# Author: Alex Byrne +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. 
+# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . + + from enum import IntEnum, Flag from typing import Callable import dataclasses as d diff --git a/hairpin2/helpers.py b/hairpin2/helpers.py index 387a2ea..441830d 100644 --- a/hairpin2/helpers.py +++ b/hairpin2/helpers.py @@ -1,3 +1,23 @@ +# hairpin2 +# +# Copyright (C) 2024 Genome Research Ltd. +# +# Author: Alex Byrne +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . + + from enum import IntEnum, Flag import logging import sys diff --git a/hairpin2/main.py b/hairpin2/main.py index 9f424ec..911d284 100644 --- a/hairpin2/main.py +++ b/hairpin2/main.py @@ -1,3 +1,23 @@ +# hairpin2 +# +# Copyright (C) 2024 Genome Research Ltd. +# +# Author: Alex Byrne +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. 
+# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . + + import pysam from hairpin2 import ref2seq as r2s, constants as c, helpers as h import hairpin2 diff --git a/hairpin2/ref2seq.py b/hairpin2/ref2seq.py index 436fd3a..d84b482 100644 --- a/hairpin2/ref2seq.py +++ b/hairpin2/ref2seq.py @@ -1,3 +1,23 @@ +# hairpin2 +# +# Copyright (C) 2024 Genome Research Ltd. +# +# Author: Alex Byrne +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . + + import pysam from hairpin2 import constants as c diff --git a/test/test_validate_read.py b/test/test_validate_read.py index 1370e5f..70122f3 100644 --- a/test/test_validate_read.py +++ b/test/test_validate_read.py @@ -1,3 +1,23 @@ +# hairpin2 +# +# Copyright (C) 2024 Genome Research Ltd. +# +# Author: Alex Byrne +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. 
+# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . + + from hairpin2 import main as hp2 from hairpin2 import constants as c import pysam From cc14d89a87a69fffb7414ceacbf4d11238540a18 Mon Sep 17 00:00:00 2001 From: ab63 Date: Fri, 27 Sep 2024 10:53:47 +0000 Subject: [PATCH 126/165] Luca's suggestions --- hairpin2/__init__.py | 38 +---- hairpin2/constants.py | 59 ++++++-- hairpin2/helpers.py | 12 +- hairpin2/main.py | 286 +++++++++++++++++++++++++++---------- hairpin2/ref2seq.py | 19 +-- test/test_validate_read.py | 115 ++++++++++++--- 6 files changed, 372 insertions(+), 157 deletions(-) diff --git a/hairpin2/__init__.py b/hairpin2/__init__.py index 52cc277..c7fe64c 100644 --- a/hairpin2/__init__.py +++ b/hairpin2/__init__.py @@ -5,45 +5,17 @@ # Author: Alex Byrne # # This program is free software: you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by +# it under the terms of the GNU Affero General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. -# + # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with this program. If not, see . - - -# You can single-source your package version setting the package version only in -# the project data in `pyproject.toml` or `setup.py`, using `importlib.metadata`. 
-# -# If your package requires Python 3.8 or above, you can use the standard library -# package `importlib.metadata` and not add a dependency. -# -# If you're supporting versions below 3.8, you need to add a dependency for the -# shim package `importlib-metadata` and import it if `importlib.metadata` -# is not present. -# -# You use the `version()` function to retrieve the version string for a package. -# The value of `__name__` normally provides the package name, but if you're -# running the package as a script (either `python -m my_package` or through -# a script installed with the package), `__name__` will be `"__main__"`, -# in which case you need to use `__package__` to get the package name. -# As far as I can tell there isn't a variable that covers both situations. -# -# Set the package version in your pyproject.toml or setup.py. If you're -# supporting Python versions before 3.8, add a conditional dependency for -# importlib-metadata (examples below). -# -# Choose one of the following code snippets, depending on what Python versions -# your package supports. -# Put the snippet in the __init__.py of your top-level package +# GNU Affero General Public License for more details. -###### supporting Python versions below 3.8 ###### +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . def _set_version() -> str: # noqa: C901 diff --git a/hairpin2/constants.py b/hairpin2/constants.py index 0c356bd..bf98afb 100644 --- a/hairpin2/constants.py +++ b/hairpin2/constants.py @@ -5,17 +5,17 @@ # Author: Alex Byrne # # This program is free software: you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by +# it under the terms of the GNU Affero General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. 
-# + # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with this program. If not, see . +# GNU Affero General Public License for more details. + +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . from enum import IntEnum, Flag @@ -25,21 +25,51 @@ EXIT_SUCCESS = 0 EXIT_FAILURE = 1 -DEFAULTS: dict[str, int | float] = dict((('al_filter_threshold', 0.93), ('min_clip_quality', 35), ('min_mapping_quality', 11), ('min_base_quality', 25), ('max_read_span', 6), ('position_fraction', 0.15))) +DEFAULTS: dict[str, int | float] = dict((('al_filter_threshold', 0.93), + ('min_clip_quality', 35), + ('min_mapping_quality', 11), + ('min_base_quality', 25), + ('max_read_span', 6), + ('position_fraction', 0.15))) FiltCodes = IntEnum('FiltCodes', - ['SIXTYAI', 'SIXTYBI', 'ON_THRESHOLD', 'INSUFFICIENT_READS', 'NO_MUTANTS'], - start=0) + ['SIXTYAI', + 'SIXTYBI', + 'ON_THRESHOLD', + 'INSUFFICIENT_READS', + 'NO_MUTANTS'], + start=0) Ops = IntEnum('Ops', - ['MATCH', 'INS', 'DEL', 'SKIP', 'SOFT', 'HARD', 'PAD', 'EQUAL', 'DIFF', 'BACK'], - start = 0) + ['MATCH', + 'INS', + 'DEL', + 'SKIP', + 'SOFT', + 'HARD', + 'PAD', + 'EQUAL', + 'DIFF', + 'BACK'], + start=0) ValidatorFlags = Flag('ReadFlags', - ['CLEAR', 'FLAG', 'MAPQUAL', 'READ_FIELDS_MISSING', 'NOT_ALIGNED', 'BAD_OP', 'NOT_ALT', 'BASEQUAL', 'SHORT', 'CLIPQUAL', 'NO_OVERLAP'], - start=0) + ['CLEAR', + 'FLAG', + 'MAPQUAL', + 'READ_FIELDS_MISSING', + 'NOT_ALIGNED', + 'BAD_OP', + 'NOT_ALT', + 'BASEQUAL', + 'SHORT', + 'CLIPQUAL', + 'NO_OVERLAP'], + start=0) + class NoAlts(ValueError): pass + class NoMutants(ValueError): pass @@ -61,11 +91,13 @@ def __iter__(self): class HPFilter(FilterData): name: str = 
d.field(default='HPF') + @d.dataclass class ALFilter(FilterData): name: str = d.field(default='ALF') avg_as: float | None = None + @d.dataclass class Filters: AL: ALFilter @@ -89,4 +121,3 @@ def get_field(self, field_name): FiltReturn = Callable[..., Filters] FlagReturn = Callable[..., int] - diff --git a/hairpin2/helpers.py b/hairpin2/helpers.py index 441830d..b65d295 100644 --- a/hairpin2/helpers.py +++ b/hairpin2/helpers.py @@ -5,17 +5,17 @@ # Author: Alex Byrne # # This program is free software: you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by +# it under the terms of the GNU Affero General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. -# + # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with this program. If not, see . +# GNU Affero General Public License for more details. + +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . from enum import IntEnum, Flag diff --git a/hairpin2/main.py b/hairpin2/main.py index 911d284..31427c8 100644 --- a/hairpin2/main.py +++ b/hairpin2/main.py @@ -5,17 +5,17 @@ # Author: Alex Byrne # # This program is free software: you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by +# it under the terms of the GNU Affero General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. 
-# + # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with this program. If not, see . +# GNU Affero General Public License for more details. + +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . import pysam @@ -28,6 +28,7 @@ from itertools import tee from functools import partial + def validate_read( read: pysam.AlignedSegment, vcf_start: int, @@ -50,22 +51,25 @@ def validate_read( mate_cig = read.get_tag('MC') except KeyError: mate_cig = None - if any(x is None for x in [read.reference_start, - read.reference_end, - read.query_sequence, - read.query_qualities, - read.query_alignment_qualities, - read.cigarstring, - read.cigartuples, - mate_cig]): + if any(x is None for x in + [read.reference_start, + read.reference_end, + read.query_sequence, + read.query_qualities, + read.query_alignment_qualities, + read.cigarstring, + read.cigartuples, + mate_cig]): read_flag |= c.ValidatorFlags.READ_FIELDS_MISSING.value else: if ('S' in read.cigarstring and # type: ignore - mean(read.query_alignment_qualities) < min_clipqual): # type: ignore + mean(read.query_alignment_qualities) < min_clipqual): # type: ignore read_flag |= c.ValidatorFlags.CLIPQUAL.value # First, check for sub try: - mut_pos, mut_op = r2s.ref2querypos(read, vcf_start) # VCF 1-INDEXED, alignments 0-INDEXED (vcf_start = 0-indexed mutation position) + # VCF 1-INDEXED, alignments 0-INDEXED + # (vcf_start = 0-indexed mutation position) + mut_pos, mut_op = r2s.ref2querypos(read, vcf_start) except IndexError: read_flag |= c.ValidatorFlags.NOT_ALIGNED.value else: @@ -75,7 +79,7 @@ def validate_read( if read.query_sequence[mut_pos] != alt: # type: ignore read_flag |= 
c.ValidatorFlags.NOT_ALT.value if read.query_qualities[mut_pos] < min_basequal: # type: ignore - read_flag |= c.ValidatorFlags.BASEQUAL.value + read_flag |= c.ValidatorFlags.BASEQUAL.value # Second, check whether length of read can accommodate size of indel elif (mut_pos + vcf_rlen > read.query_length or mut_pos + len(alt) > read.query_length): @@ -83,35 +87,44 @@ def validate_read( else: if len(alt) == 1: # DEL try: - mut_rng = list(map(lambda x: r2s.ref2querypos(read, x), range(vcf_start, vcf_stop))) + mut_rng = list(map(lambda x: r2s.ref2querypos(read, x), + range(vcf_start, vcf_stop))) except IndexError: read_flag |= c.ValidatorFlags.NOT_ALIGNED.value else: if (mut_rng[0][1] != c.Ops.MATCH.value or mut_rng[-1][1] != c.Ops.MATCH.value or - any(x[1] != c.Ops.DEL.value for x in mut_rng[1:-2])): + any(x[1] != c.Ops.DEL.value for x in + mut_rng[1:-2])): read_flag |= c.ValidatorFlags.BAD_OP.value elif vcf_rlen == 1: # INS try: - mut_rng = list(map(lambda x: r2s.ref2querypos(read, x), range(vcf_start, (vcf_start + len(alt))))) + mut_rng = list(map(lambda x: r2s.ref2querypos(read, x), + range(vcf_start, + (vcf_start + len(alt))))) except IndexError: read_flag |= c.ValidatorFlags.NOT_ALIGNED.value else: if (mut_rng[0][1] != c.Ops.MATCH.value or mut_rng[-1][1] != c.Ops.MATCH.value or - any(x[1] != c.Ops.INS.value for x in mut_rng[1:-2])): + any(x[1] != c.Ops.INS.value for x in + mut_rng[1:-2])): read_flag |= c.ValidatorFlags.BAD_OP.value if read.query_sequence[mut_pos:len(alt)] != alt: # type: ignore read_flag |= c.ValidatorFlags.NOT_ALT.value else: # COMPLEX - max_rng = range(vcf_start, vcf_stop) if (vcf_start + vcf_rlen) > (vcf_start + len(alt)) else range(vcf_start, (vcf_start + len(alt))) + max_rng = (range(vcf_start, vcf_stop) + if (vcf_start + vcf_rlen) > + (vcf_start + len(alt)) + else range(vcf_start, (vcf_start + len(alt)))) try: - mut_rng = list(map(lambda x: r2s.ref2querypos(read, x), max_rng)) + mut_rng = list(map( + lambda x: r2s.ref2querypos(read, x), 
max_rng)) except IndexError: read_flag |= c.ValidatorFlags.NOT_ALIGNED.value else: if (mut_rng[0][1] != c.Ops.MATCH.value or - mut_rng[-1][1] != c.Ops.MATCH.value): + mut_rng[-1][1] != c.Ops.MATCH.value): read_flag |= c.ValidatorFlags.BAD_OP.value if read.query_sequence[mut_pos:len(alt)] != alt: # type: ignore read_flag |= c.ValidatorFlags.NOT_ALT.value @@ -122,8 +135,11 @@ def validate_read( pair_start = read.reference_start pair_end = read.reference_end if read.flag & 0x10: - # "next", through an unfortunate quirk of history, means "mate", so this is reliable (pulls RNEXT) - mate_end = r2s.ref_end_via_cigar(mate_cig, read.next_reference_start) # type:ignore + # through an unfortunate quirk of history + # "next" means "mate" + # so this is reliable (pulls RNEXT) + mate_end = r2s.ref_end_via_cigar(mate_cig, + read.next_reference_start) # type:ignore if read.reference_start <= mate_end: pair_start = mate_end + 1 else: @@ -156,34 +172,59 @@ def test_variant( aln_scores: list[float] = [] for mut_sample, alignment in mutant_alignments.items(): - read_iter, test_iter = tee(alignment.fetch(vcf_rec.chrom, vcf_rec.start, (vcf_rec.start + 1))) + read_iter, test_iter = tee(alignment.fetch(vcf_rec.chrom, + vcf_rec.start, + (vcf_rec.start + 1))) try: next(test_iter) except StopIteration: continue sample_readpair_ends = [] read = None - for read in read_iter: # type: ignore + for read in read_iter: # type: ignore read_flag = c.ValidatorFlags.CLEAR.value - read_flag = read_validator(read=read, alt=alt, vcf_start=vcf_rec.start, vcf_stop=vcf_rec.stop, vcf_rlen=vcf_rec.rlen) + read_flag = read_validator(read=read, + alt=alt, + vcf_start=vcf_rec.start, + vcf_stop=vcf_rec.stop, + vcf_rlen=vcf_rec.rlen) if read_flag == c.ValidatorFlags.CLEAR.value: mut_reads[mut_sample].append(read) - sample_readpair_ends.append([read.reference_start, read.reference_end, read.next_reference_start, r2s.ref_end_via_cigar(read.get_tag('MC'), read.next_reference_start)]) # type: ignore + 
sample_readpair_ends.append( + [read.reference_start, + read.reference_end, + read.next_reference_start, + r2s.ref_end_via_cigar( + read.get_tag('MC'), + read.next_reference_start)]) # type: ignore mut_reads_log[mut_sample].append((read.query_name, read_flag)) - del(read) + del (read) if len(mut_reads[mut_sample]) > 1: - sample_readpair_ends_sorted: list[list[int]] = sorted(list(map(sorted, sample_readpair_ends))) + sample_readpair_ends_sorted: list[list[int]] = sorted(list(map( + sorted, + sample_readpair_ends))) curr_ends = [sample_readpair_ends_sorted[0]] drop_idx = [] for i in range(1, len(sample_readpair_ends_sorted)): - max_spans = map(lambda sublist: max([abs(x - y) for x, y in zip(sublist, sample_readpair_ends_sorted[i])]), curr_ends) + max_spans = map(lambda sublist: + max( + [abs(x - y) + for x, y + in zip(sublist, + sample_readpair_ends_sorted[i]) + ] + ), + curr_ends) if all([x <= max_span for x in max_spans]): curr_ends.append(sample_readpair_ends_sorted[i]) drop_idx.append(i) else: curr_ends = [sample_readpair_ends_sorted[i]] - mut_reads[mut_sample] = [j for i, j in enumerate(mut_reads[mut_sample]) if i not in drop_idx] + mut_reads[mut_sample] = [j + for i, j + in enumerate(mut_reads[mut_sample]) + if i not in drop_idx] if all([len(x) == 0 for x in mut_reads.values()]): al_filt.code = c.FiltCodes.INSUFFICIENT_READS.value hp_filt.code = c.FiltCodes.INSUFFICIENT_READS.value @@ -192,12 +233,15 @@ def test_variant( for read in read_list: mut_pos, _ = r2s.ref2querypos(read, vcf_rec.start) if read.flag & 0x10: - read_idx_wrt_aln = read.query_alignment_end - mut_pos # 1-based position where start, idx 1, is alignment end - mut_read_fracs_r.append(read_idx_wrt_aln / read.query_alignment_length) + # 1-based position where start, idx 1, is alignment end + read_idx_wrt_aln = read.query_alignment_end - mut_pos + mut_read_fracs_r.append(read_idx_wrt_aln + / read.query_alignment_length) mut_read_pos_r.append(read_idx_wrt_aln) else: - read_idx_wrt_aln = mut_pos - 
read.query_alignment_start + 1 - mut_read_fracs_f.append(read_idx_wrt_aln / read.query_alignment_length) + read_idx_wrt_aln = mut_pos - read.query_alignment_start + 1 + mut_read_fracs_f.append(read_idx_wrt_aln + / read.query_alignment_length) mut_read_pos_f.append(read_idx_wrt_aln) try: aln_scores.append(read.get_tag('AS') / read.query_length) # type:ignore @@ -214,9 +258,12 @@ def test_variant( if len(mut_read_pos_f) > 1 and not len(mut_read_pos_r) > 1: mad_f = max(mut_read_pos_f) - min(mut_read_pos_f) sd_f = stdev(mut_read_pos_f) - if (((sum([x <= position_fraction_thresh for x in mut_read_fracs_f]) / len(mut_read_pos_f)) < 0.9) and - mad_f > 0 and - sd_f > 4): + if ( + ((sum([x <= position_fraction_thresh + for x + in mut_read_fracs_f]) / len(mut_read_pos_f)) < 0.9) and + mad_f > 0 and + sd_f > 4): hp_filt.code = c.FiltCodes.SIXTYAI.value # 60A(i) else: hp_filt.code = c.FiltCodes.SIXTYAI.value @@ -224,9 +271,12 @@ def test_variant( elif len(mut_read_pos_r) > 1 and not len(mut_read_pos_f) > 1: mad_r = max(mut_read_pos_r) - min(mut_read_pos_r) sd_r = stdev(mut_read_pos_r) - if (((sum([x <= position_fraction_thresh for x in mut_read_fracs_r]) / len(mut_read_pos_r)) < 0.9) and - mad_r > 0 and - sd_r > 4): + if ( + ((sum([x <= position_fraction_thresh + for x + in mut_read_fracs_r]) / len(mut_read_pos_r)) < 0.9) and + mad_r > 0 and + sd_r > 4): hp_filt.code = c.FiltCodes.SIXTYAI.value else: hp_filt.code = c.FiltCodes.SIXTYAI.value @@ -236,7 +286,10 @@ def test_variant( sd_f = stdev(mut_read_pos_f) mad_r = max(mut_read_pos_r) - min(mut_read_pos_r) sd_r = stdev(mut_read_pos_r) - frac_lt_thresh = sum([x <= position_fraction_thresh for x in mut_read_fracs_f + mut_read_fracs_r]) / (len(mut_read_pos_f) + len(mut_read_pos_r)) + frac_lt_thresh = (sum([x <= position_fraction_thresh + for x + in mut_read_fracs_f + mut_read_fracs_r]) / + (len(mut_read_pos_f) + len(mut_read_pos_r))) if (frac_lt_thresh < 0.9 or (mad_f > 2 and mad_r > 2 and sd_f > 2 and sd_r > 2) or (mad_f > 1 and 
sd_f > 10) or @@ -258,11 +311,17 @@ def test_record_per_alt( if vcf_rec.alts is None: raise c.NoAlts - samples_w_mutants = [name for name in vcf_rec.samples if vcf_rec.samples[name]["GT"] != (0, 0)] + samples_w_mutants = [name + for name + in vcf_rec.samples + if vcf_rec.samples[name]["GT"] != (0, 0)] if len(samples_w_mutants) == 0: raise c.NoMutants - alignments_w_mutants = {k: v for k, v in alignments.items() if k in samples_w_mutants} + alignments_w_mutants = {k: v + for k, v + in alignments.items() + if k in samples_w_mutants} filt_d = {} for alt in vcf_rec.alts: filt_d[alt] = variant_tester(vcf_rec, alignments_w_mutants, alt) @@ -270,28 +329,80 @@ def test_record_per_alt( def main_cli() -> None: - logging.basicConfig(level=logging.INFO, format='%(asctime)s ¦ %(levelname)-8s ¦ %(message)s', datefmt='%I:%M:%S') + logging.basicConfig(level=logging.INFO, + format='%(asctime)s ¦ %(levelname)-8s ¦ %(message)s', + datefmt='%I:%M:%S') - parser = argparse.ArgumentParser(prog="hairpin2", description='cruciform artefact flagging algorithm based on Ellis et al. 2020 (DOI: 10.1038/s41596-020-00437-6)') + parser = argparse.ArgumentParser(prog="hairpin2", + description='cruciform artefact flagging algorithm based on Ellis et al. 
2020 (DOI: 10.1038/s41596-020-00437-6)') parser._optionals.title = 'info' - parser.add_argument('-v', '--version', help='print version', action='version', version=hairpin2.__version__) + parser.add_argument('-v', + '--version', + help='print version', + action='version', + version=hairpin2.__version__) req = parser.add_argument_group('mandatory') - req.add_argument('-i', '--vcf-in', help="path to input VCF", required=True) - req.add_argument('-o', '--vcf-out', help="path to write output VCF", required=True) - req.add_argument('-a', '--alignments', help="list of paths to (S/B/CR)AMs (indicated by --format) for samples in input VCF, whitespace separated - (s/b/cr)ai expected in same directories", nargs='+', required=True) - req.add_argument('-f', "--format", help="format of alignment files; s indicates SAM, b indicates BAM, and c indicates CRAM", choices=["s", "b", "c"], type=str, required=True) + req.add_argument('-i', + '--vcf-in', + help="path to input VCF", + required=True) + req.add_argument('-o', + '--vcf-out', + help="path to write output VCF", + required=True) + req.add_argument('-a', + '--alignments', + help="list of paths to (S/B/CR)AMs (indicated by --format) for samples in input VCF, whitespace separated - (s/b/cr)ai expected in same directories", + nargs='+', + required=True) + req.add_argument('-f', + "--format", + help="format of alignment files; s indicates SAM, b indicates BAM, and c indicates CRAM", + choices=["s", "b", "c"], + type=str, + required=True) opt = parser.add_argument_group('extended') - opt.add_argument('-al', '--al-filter-threshold', help='threshold for median of read alignment score per base of all relevant reads, below which a variant is flagged as ALF - default: 0.93', type=float) - opt.add_argument('-mc', '--min-clip-quality', help='discard reads with mean base quality of aligned bases below this value, if they have soft-clipped bases - default: 35', type=int) - opt.add_argument('-mq', '--min-mapping-quality', help='discard reads 
with mapping quality below this value - default: 11', type=int) - opt.add_argument('-mb', '--min-base-quality', help='discard reads with base quality at variant position below this value - default: 25', type=int ) - opt.add_argument('-ms', '--max-read-span', help='maximum +- position to use when detecting PCR duplicates - default: 6', type=int) - opt.add_argument('-pf', '--position-fraction', help='>90%% of variant must occur within POSITION_FRACTION of read edges to allow HPF flag - default: 0.15', type=float) + opt.add_argument('-al', + '--al-filter-threshold', + help='threshold for median of read alignment score per base of all relevant reads, below which a variant is flagged as ALF - default: 0.93', + type=float) + opt.add_argument('-mc', + '--min-clip-quality', + help='discard reads with mean base quality of aligned bases below this value, if they have soft-clipped bases - default: 35', + type=int) + opt.add_argument('-mq', + '--min-mapping-quality', + help='discard reads with mapping quality below this value - default: 11', + type=int) + opt.add_argument('-mb', + '--min-base-quality', + help='discard reads with base quality at variant position below this value - default: 25', + type=int) + opt.add_argument('-ms', + '--max-read-span', + help='maximum +- position to use when detecting PCR duplicates - default: 6', + type=int) + opt.add_argument('-pf', + '--position-fraction', + help='>90%% of variant must occur within POSITION_FRACTION of read edges to allow HPF flag - default: 0.15', + type=float) proc = parser.add_argument_group('procedural') - proc.add_argument('-r', '--cram-reference', help="path to FASTA format CRAM reference, overrides $REF_PATH and UR tags - ignored if --format is not CRAM") - proc.add_argument('-m', '--name-mapping', help='map VCF sample names to alignment SM tags; useful if they differ', metavar='VCF:aln', nargs='+') - proc.add_argument('-ji', '--input-json', help='path to JSON of input parameters, from which extended arguments will be 
loaded - overridden by arguments provided on command line', type=str) - proc.add_argument('-jo', '--output-json', help='log input arguments to JSON', type=str) + proc.add_argument('-r', + '--cram-reference', + help="path to FASTA format CRAM reference, overrides $REF_PATH and UR tags - ignored if --format is not CRAM") + proc.add_argument('-m', + '--name-mapping', + help='map VCF sample names to alignment SM tags; useful if they differ', + metavar='VCF:aln', + nargs='+') + proc.add_argument('-ji', + '--input-json', + help='path to JSON of input parameters, from which extended arguments will be loaded - overridden by arguments provided on command line', + type=str) + proc.add_argument('-jo', + '--output-json', + help='log input arguments to JSON', + type=str) args = parser.parse_args() @@ -307,7 +418,9 @@ def main_cli() -> None: # set arg defaults for k in vars(args).keys(): if not vars(args)[k]: - if json_config and k in json_config.keys() and k in c.DEFAULTS.keys(): + if (json_config and k + in json_config.keys() + and k in c.DEFAULTS.keys()): setattr(args, k, json_config[k]) elif k in c.DEFAULTS.keys(): setattr(args, k, c.DEFAULTS[k]) @@ -320,7 +433,11 @@ def main_cli() -> None: min_clipqual=args.min_clip_quality, min_basequal=args.min_base_quality) - primed_variant_tester = partial(test_variant, al_thresh=args.al_filter_threshold, max_span=args.max_read_span, position_fraction_thresh=args.position_fraction, read_validator=primed_validate_read) + primed_variant_tester = partial(test_variant, + al_thresh=args.al_filter_threshold, + max_span=args.max_read_span, + position_fraction_thresh=args.position_fraction, + read_validator=primed_validate_read) try: vcf_in_handle = pysam.VariantFile(args.vcf_in) @@ -344,13 +461,16 @@ def main_cli() -> None: logging.info("CRAM format specified") for path in args.alignments: try: - alignment = pysam.AlignmentFile(path, mode, reference_filename=args.cram_reference if args.cram_reference and args.format == "c" else None) + 
alignment = pysam.AlignmentFile(path, + mode, + reference_filename=(args.cram_reference + if args.cram_reference + and args.format == "c" + else None)) except Exception as e: h.cleanup(msg='failed to read alignment file at {}, reporting: {}'.format(path, e)) # grab the sample name from first SM field # in header field RG - # this may cause problems? - # check with Peter alignment_sample_name = alignment.header.to_dict()['RG'][0]['SM'] vcf_sample_to_alignment_map[alignment_sample_name] = alignment if args.name_mapping: @@ -370,9 +490,12 @@ def main_cli() -> None: h.cleanup(msg="VCF sample names provided to name mapping flag are not equal to, or a subset of, VCF sample names as retrieved from VCF") if h.has_duplicates(alignment_map_names): h.cleanup(msg='duplicate aligment sample names provided to name mapping flag') - if h.lists_not_equal(alignment_map_names, vcf_sample_to_alignment_map.keys()): + if h.lists_not_equal(alignment_map_names, + vcf_sample_to_alignment_map.keys()): h.cleanup(msg='alignment sample names provided to name mapping flag do not match alignment SM tags') - vcf_sample_to_alignment_map = {vcf_map_names[alignment_map_names.index(k)]: v for k, v in vcf_sample_to_alignment_map.items()} + vcf_sample_to_alignment_map = {vcf_map_names[alignment_map_names.index(k)]: v + for k, v + in vcf_sample_to_alignment_map.items()} else: if not vcf_sample_to_alignment_map.keys() <= sample_names: h.cleanup(msg='alignment SM tags do not match VCF sample names: {}'.format(vcf_sample_to_alignment_map.keys() - sample_names)) @@ -395,12 +518,19 @@ def main_cli() -> None: if args.output_json: try: with open(args.output_json, "w") as output_json: - json.dump({k: vars(args)[k] for k in (vars(args).keys() - {'input_json', 'output_json', 'format'})}, output_json, indent="") + json.dump( + { + k: vars(args)[k] + for k + in (vars(args).keys() - {'input_json', 'output_json', 'format'}) + }, + output_json, indent="") except Exception as e: h.cleanup(msg='failed to write output 
JSON, reporting: {}'.format(e)) for record in vcf_in_handle.fetch(): # type:ignore - # need to test pysam's vcf record validation - e.g. what if start is after end + # need to test pysam's vcf record validation + # e.g. what if start is after end try: filter_d: dict[str, c.Filters] = test_record_per_alt( alignments=vcf_sample_to_alignment_map, @@ -416,7 +546,15 @@ def main_cli() -> None: for filter in filter_bundle: if filter.flag: record.filter.add(filter.name) - record.info.update({filter.name: '|'.join([alt] + [str(f) if not type(f) == float else str(round(f, 3)) for f in filter][2:])}) + record.info.update({filter.name: '|'.join( + [alt] + + [str(f) + if type(f) + is not float + else str(round(f, 3)) + for f in filter + ][2:] + )}) try: vcf_out_handle.write(record) # type:ignore diff --git a/hairpin2/ref2seq.py b/hairpin2/ref2seq.py index d84b482..4b507f1 100644 --- a/hairpin2/ref2seq.py +++ b/hairpin2/ref2seq.py @@ -5,22 +5,23 @@ # Author: Alex Byrne # # This program is free software: you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by +# it under the terms of the GNU Affero General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. -# + # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with this program. If not, see . +# GNU Affero General Public License for more details. + +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . 
import pysam from hairpin2 import constants as c + def ref2querypos( bam_record: pysam.AlignedSegment, ref_pos: int, @@ -34,7 +35,8 @@ def ref2querypos( if query_pos is None or len(pos_aln) == 0: raise IndexError('reference position not covered by read') elif get_cig: - dist2op = ref_pos - bam_record.reference_start + 1 # since position is 0-indexed, add 1 to get distance + # since position is 0-indexed, add 1 to get distance + dist2op = ref_pos - bam_record.reference_start + 1 cig = bam_record.cigartuples if cig is None or len(cig) == 0: raise ValueError('no cigar tuples available for pysam record') @@ -67,7 +69,6 @@ def ref_end_via_cigar( digit_accumulator = '' cig_t = list(zip(cig_l[0::2], cig_l[1::2])) for op_len, op_code in cig_t: - if op_code in ['M','D','N','=','X']: + if op_code in ['M', 'D', 'N', '=', 'X']: ref_start += int(op_len) return ref_start - diff --git a/test/test_validate_read.py b/test/test_validate_read.py index 70122f3..9a39f76 100644 --- a/test/test_validate_read.py +++ b/test/test_validate_read.py @@ -5,17 +5,17 @@ # Author: Alex Byrne # # This program is free software: you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by +# it under the terms of the GNU Affero General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. -# + # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with this program. If not, see . +# GNU Affero General Public License for more details. + +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . 
from hairpin2 import main as hp2 @@ -46,60 +46,133 @@ # so far there's a test for each flag def test_ideal(): - assert hp2.validate_read(read=r, vcf_start=99, vcf_stop=100, vcf_rlen=1, alt='A', min_mapqual=11, min_clipqual=35, min_basequal=25) == c.ValidatorFlags.CLEAR.value + assert hp2.validate_read(read=r, + vcf_start=99, + vcf_stop=100, + vcf_rlen=1, + alt='A', + min_mapqual=11, + min_clipqual=35, + min_basequal=25) == c.ValidatorFlags.CLEAR.value def test_mapqual(): - assert hp2.validate_read(read=r, vcf_start=99, vcf_stop=100, vcf_rlen=1, alt='A', min_mapqual=30, min_clipqual=35, min_basequal=25) == c.ValidatorFlags.MAPQUAL.value + assert hp2.validate_read(read=r, + vcf_start=99, + vcf_stop=100, + vcf_rlen=1, + alt='A', + min_mapqual=30, + min_clipqual=35, + min_basequal=25) == c.ValidatorFlags.MAPQUAL.value def test_not_aligned(): - assert hp2.validate_read(read=r, vcf_start=200, vcf_stop=100, vcf_rlen=1, alt='A', min_mapqual=11, min_clipqual=35, min_basequal=25) == c.ValidatorFlags.NOT_ALIGNED.value + assert hp2.validate_read(read=r, + vcf_start=200, + vcf_stop=100, + vcf_rlen=1, + alt='A', + min_mapqual=11, + min_clipqual=35, + min_basequal=25) == c.ValidatorFlags.NOT_ALIGNED.value def test_not_alt(): - assert hp2.validate_read(read=r, vcf_start=99, vcf_stop=100, vcf_rlen=1, alt='T', min_mapqual=11, min_clipqual=35, min_basequal=25) == c.ValidatorFlags.NOT_ALT.value + assert hp2.validate_read(read=r, + vcf_start=99, + vcf_stop=100, + vcf_rlen=1, + alt='T', + min_mapqual=11, + min_clipqual=35, + min_basequal=25) == c.ValidatorFlags.NOT_ALT.value def test_basequal(): - assert hp2.validate_read(read=r, vcf_start=99, vcf_stop=100, vcf_rlen=1, alt='A', min_mapqual=11, min_clipqual=35, min_basequal=40) == c.ValidatorFlags.BASEQUAL.value + assert hp2.validate_read(read=r, + vcf_start=99, + vcf_stop=100, + vcf_rlen=1, + alt='A', + min_mapqual=11, + min_clipqual=35, + min_basequal=40) == c.ValidatorFlags.BASEQUAL.value def test_read_short(): - assert 
hp2.validate_read(read=r, vcf_start=99, vcf_stop=100, vcf_rlen=1, alt='ATTTTTTTTTTTTTT', min_mapqual=11, min_clipqual=35, min_basequal=25) == c.ValidatorFlags.SHORT.value - - + assert hp2.validate_read(read=r, + vcf_start=99, + vcf_stop=100, + vcf_rlen=1, + alt='ATTTTTTTTTTTTTT', + min_mapqual=11, + min_clipqual=35, + min_basequal=25) == c.ValidatorFlags.SHORT.value def test_missing_cigar(): rc = copy.deepcopy(r) rc.cigarstring = None rc.cigartuples = None - assert hp2.validate_read(read=rc, vcf_start=99, vcf_stop=100, vcf_rlen=1, alt='A', min_mapqual=11, min_clipqual=35, min_basequal=25) == c.ValidatorFlags.READ_FIELDS_MISSING.value + assert hp2.validate_read(read=rc, + vcf_start=99, + vcf_stop=100, + vcf_rlen=1, + alt='A', + min_mapqual=11, + min_clipqual=35, + min_basequal=25) == c.ValidatorFlags.READ_FIELDS_MISSING.value def test_bad_op(): rc = copy.deepcopy(r) rc.cigartuples = [(c.Ops.EQUAL.value, 10)] - assert hp2.validate_read(read=rc, vcf_start=99, vcf_stop=100, vcf_rlen=1, alt='A', min_mapqual=11, min_clipqual=35, min_basequal=25) == c.ValidatorFlags.BAD_OP.value + assert hp2.validate_read(read=rc, + vcf_start=99, + vcf_stop=100, + vcf_rlen=1, + alt='A', + min_mapqual=11, + min_clipqual=35, + min_basequal=25) == c.ValidatorFlags.BAD_OP.value def test_clipqual(): rc = copy.deepcopy(r) rc.cigarstring = '1S9M' - assert hp2.validate_read(read=rc, vcf_start=99, vcf_stop=100, vcf_rlen=1, alt='A', min_mapqual=11, min_clipqual=40, min_basequal=25) == c.ValidatorFlags.CLIPQUAL.value + assert hp2.validate_read(read=rc, + vcf_start=99, + vcf_stop=100, + vcf_rlen=1, + alt='A', + min_mapqual=11, + min_clipqual=40, + min_basequal=25) == c.ValidatorFlags.CLIPQUAL.value def test_no_overlap_0x10(): rc = copy.deepcopy(r) rc.flag |= 0x10 rc.set_tag('MC', '3M') - assert hp2.validate_read(read=rc, vcf_start=99, vcf_stop=100, vcf_rlen=1, alt='A', min_mapqual=11, min_clipqual=35, min_basequal=25) == c.ValidatorFlags.NO_OVERLAP.value + assert hp2.validate_read(read=rc, + 
vcf_start=99, + vcf_stop=100, + vcf_rlen=1, + alt='A', + min_mapqual=11, + min_clipqual=35, + min_basequal=25) == c.ValidatorFlags.NO_OVERLAP.value def test_no_overlap(): rc = copy.deepcopy(r) rc.next_reference_start = 98 - assert hp2.validate_read(read=rc, vcf_start=99, vcf_stop=100, vcf_rlen=1, alt='A', min_mapqual=11, min_clipqual=35, min_basequal=25) == c.ValidatorFlags.NO_OVERLAP.value - - + assert hp2.validate_read(read=rc, + vcf_start=99, + vcf_stop=100, + vcf_rlen=1, + alt='A', + min_mapqual=11, + min_clipqual=35, + min_basequal=25) == c.ValidatorFlags.NO_OVERLAP.value From 1b50e9180baf1abe173e2ee6626f329cb47e9e90 Mon Sep 17 00:00:00 2001 From: ab63 Date: Thu, 3 Oct 2024 08:59:59 +0100 Subject: [PATCH 127/165] smoke test validate read --- test/test_validate_read.py | 98 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 98 insertions(+) diff --git a/test/test_validate_read.py b/test/test_validate_read.py index 9a39f76..41bc011 100644 --- a/test/test_validate_read.py +++ b/test/test_validate_read.py @@ -22,6 +22,104 @@ from hairpin2 import constants as c import pysam import copy +import pytest +import factory +import factory.random +from faker import Faker +from faker_biology.bioseq import Bioseq +import random + +factory.random.reseed_random(2501) +random.seed(2501) + +# smoke test validate_read +class ExtendedBioProvider(Bioseq): + def quality_string(self, length): + if length < 1: + raise ValueError('length must be geater than 1') + allowed_chars = [chr(i) for i in range(33, 75)] + return ''.join([random.choice(allowed_chars) for _ in range(length)]) + + def cigar_string(self, length): + if length < 1: + raise ValueError('length must be greater than 1') + opchars = 'MIDNSHP=XB' + opchars_noclip = 'MIDNP=XB' + bound = 200 if length > 200 else length + cig_op_lengths = [] + while(bound > 0): + oplen = random.randint(1, bound) + cig_op_lengths.append(random.randint(1, oplen)) + cig_sum = sum(cig_op_lengths) + bound = 200 if length - cig_sum > 200 else 
length - cig_sum + cig_op_lengths[-1] = cig_op_lengths[-1] - (cig_sum - length) + cig_ops = [] + last_opchar = '' + # first and last op can be S or H, but not others + # first op H last op S, i.e. only clipping ops, seg faults pysam + # reads with only clipping ops seem to segfault pysam... report bug + if len(cig_op_lengths) == 1: + cig_ops.append(random.choice(opchars_noclip)) + else: + cig_ops.append(random.choice(opchars)) + for _ in range(max([len(cig_op_lengths) - 2, 0])): + iter_opchars = opchars_noclip.replace(last_opchar, '') + cig_ops.append(random.choice(iter_opchars)) + last_opchar = cig_ops[-1] + if len(cig_ops) != 1: + cig_ops.append(random.choice(opchars_noclip if cig_ops[-1] in ['H', 'S'] else opchars)) + return ''.join([str(x) for pair in zip(cig_op_lengths, cig_ops) for x in pair]) + + +fake = Faker() +fake.add_provider(ExtendedBioProvider) + + +class AlignedSegmentWrapper: + def __init__(self, query_name, query_sequence, query_qualities, flag, reference_id, reference_start, next_reference_start, mapping_quality, cigarstring, mc): + self.segment = pysam.AlignedSegment() + self.segment.query_name = query_name + self.segment.query_sequence = query_sequence + self.segment.query_qualities = pysam.qualitystring_to_array(query_qualities) + self.segment.flag = flag + self.segment.reference_id = reference_id + self.segment.reference_start = reference_start + self.segment.next_reference_start = next_reference_start + self.segment.mapping_quality = mapping_quality + self.segment.cigarstring = cigarstring + self.segment.set_tag('MC', mc) + + +class ReadFactory(factory.Factory): + class Meta: + model = AlignedSegmentWrapper + + query_name = 'read1' # should one assume pysam handles all bizarre query names gracefully? I am... 
+ query_sequence = factory.LazyAttribute(lambda _: fake.dna(length=random.randint(50, 200))) + query_qualities = factory.LazyAttribute(lambda o: fake.quality_string(length=len(o.query_sequence))) + flag = factory.LazyAttribute(lambda _: random.getrandbits(16)) + reference_id = 0 + reference_start = factory.LazyAttribute(lambda _: random.randint(1, 300000000)) + next_reference_start = factory.LazyAttribute(lambda o: o.reference_start - random.randint(-700, 700)) + mapping_quality = factory.LazyAttribute(lambda _: random.randint(0, 255)) + cigarstring = factory.LazyAttribute(lambda o: fake.cigar_string(length=len(o.query_sequence))) + mc = factory.LazyAttribute(lambda _: fake.cigar_string(length=random.randint(50, 200))) + + +def test_factory(): + t = ReadFactory().segment + print(t.flag) + + +@pytest.mark.parametrize("test_read", [ReadFactory().segment for _ in range(100)]) +def test_smoke(test_read): + start_lower_bound = test_read.reference_start + 2 + start_upper_bound = test_read.reference_start + len(test_read.query_sequence) - 4 + start = random.randint(start_lower_bound, start_upper_bound) + alt = test_read.query_sequence[start:start + random.randint(1, 4)] + stop = start + len(alt) + hp2.validate_read(test_read, vcf_start = start, vcf_stop = stop, vcf_rlen = stop - start, alt = alt, min_mapqual=11, min_clipqual=35, min_basequal=25) # need to verify I'm actually getting the positions right so it's not just all "not aligned" (print flags) + # pysam guards against: # quality and seq length mismatch From b39955f900b563bd8448cc682c721388e9b8de30 Mon Sep 17 00:00:00 2001 From: ab63 Date: Mon, 7 Oct 2024 15:43:18 +0000 Subject: [PATCH 128/165] refactor to handle indel issues unearthed by testing --- .gitignore | 1 + hairpin2/helpers.py | 5 +- hairpin2/main.py | 166 +++++++++--------- pyproject.toml | 3 + test/test_validate_read.py | 348 +++++++++++++++++++++---------------- 5 files changed, 282 insertions(+), 241 deletions(-) diff --git a/.gitignore b/.gitignore 
index 7315945..351fca9 100644 --- a/.gitignore +++ b/.gitignore @@ -12,3 +12,4 @@ test_data_creation/ *.sif *.json poetry.lock +.coverage diff --git a/hairpin2/helpers.py b/hairpin2/helpers.py index b65d295..5ac97d1 100644 --- a/hairpin2/helpers.py +++ b/hairpin2/helpers.py @@ -57,9 +57,10 @@ def lists_not_equal( def print_flag( - print_enum: Flag + print_enum: Flag, + hex: bool = False ) -> None: - print([':'.join([str(e), hex(e.value)]) for e in print_enum]) + print([':'.join([str(e), hex(e.value) if hex else bin(e.value)]) for e in print_enum]) def print_enum( diff --git a/hairpin2/main.py b/hairpin2/main.py index 31427c8..7cfa22b 100644 --- a/hairpin2/main.py +++ b/hairpin2/main.py @@ -27,6 +27,7 @@ import json from itertools import tee from functools import partial +from typing import Literal def validate_read( @@ -34,23 +35,20 @@ def validate_read( vcf_start: int, vcf_stop: int, vcf_rlen: int, - alt: str, + alt: Literal['A', 'C', 'G', 'T', 'N', '*'], + mut_type: Literal['S', 'D', 'I'], min_mapqual: int, min_clipqual: int, min_basequal: int, ) -> int: + # write checks for alt and mut type! 
read_flag = c.ValidatorFlags.CLEAR.value - if not (read.flag & 0x2) or read.flag & 0xE00: # move flag codes to constants - read_flag |= c.ValidatorFlags.FLAG.value - - if read.mapping_quality < min_mapqual: - read_flag |= c.ValidatorFlags.MAPQUAL.value - try: mate_cig = read.get_tag('MC') except KeyError: mate_cig = None + if any(x is None for x in [read.reference_start, read.reference_end, @@ -59,94 +57,74 @@ def validate_read( read.query_alignment_qualities, read.cigarstring, read.cigartuples, + read.flag, + read.mapping_quality, mate_cig]): read_flag |= c.ValidatorFlags.READ_FIELDS_MISSING.value else: + if not (read.flag & 0x2) or read.flag & 0xE00: + read_flag |= c.ValidatorFlags.FLAG.value + + if read.mapping_quality < min_mapqual: + read_flag |= c.ValidatorFlags.MAPQUAL.value + if ('S' in read.cigarstring and # type: ignore mean(read.query_alignment_qualities) < min_clipqual): # type: ignore read_flag |= c.ValidatorFlags.CLIPQUAL.value # First, check for sub - try: - # VCF 1-INDEXED, alignments 0-INDEXED - # (vcf_start = 0-indexed mutation position) - mut_pos, mut_op = r2s.ref2querypos(read, vcf_start) - except IndexError: - read_flag |= c.ValidatorFlags.NOT_ALIGNED.value - else: - if vcf_rlen == len(alt) == 1: - if (mut_op not in [c.Ops.MATCH.value, c.Ops.DIFF.value]): - read_flag |= c.ValidatorFlags.BAD_OP.value - if read.query_sequence[mut_pos] != alt: # type: ignore - read_flag |= c.ValidatorFlags.NOT_ALT.value - if read.query_qualities[mut_pos] < min_basequal: # type: ignore - read_flag |= c.ValidatorFlags.BASEQUAL.value - # Second, check whether length of read can accommodate size of indel - elif (mut_pos + vcf_rlen > read.query_length or - mut_pos + len(alt) > read.query_length): - read_flag |= c.ValidatorFlags.SHORT.value + if mut_type == 'S': + try: + # VCF 1-INDEXED, alignments 0-INDEXED + # (vcf_start = 0-indexed mutation position) + mut_pos, mut_op = r2s.ref2querypos(read, vcf_start) + except IndexError: + read_flag |= 
c.ValidatorFlags.NOT_ALIGNED.value else: - if len(alt) == 1: # DEL - try: - mut_rng = list(map(lambda x: r2s.ref2querypos(read, x), - range(vcf_start, vcf_stop))) - except IndexError: - read_flag |= c.ValidatorFlags.NOT_ALIGNED.value - else: - if (mut_rng[0][1] != c.Ops.MATCH.value or - mut_rng[-1][1] != c.Ops.MATCH.value or - any(x[1] != c.Ops.DEL.value for x in - mut_rng[1:-2])): - read_flag |= c.ValidatorFlags.BAD_OP.value - elif vcf_rlen == 1: # INS - try: - mut_rng = list(map(lambda x: r2s.ref2querypos(read, x), - range(vcf_start, - (vcf_start + len(alt))))) - except IndexError: - read_flag |= c.ValidatorFlags.NOT_ALIGNED.value - else: - if (mut_rng[0][1] != c.Ops.MATCH.value or - mut_rng[-1][1] != c.Ops.MATCH.value or - any(x[1] != c.Ops.INS.value for x in - mut_rng[1:-2])): - read_flag |= c.ValidatorFlags.BAD_OP.value - if read.query_sequence[mut_pos:len(alt)] != alt: # type: ignore - read_flag |= c.ValidatorFlags.NOT_ALT.value - else: # COMPLEX - max_rng = (range(vcf_start, vcf_stop) - if (vcf_start + vcf_rlen) > - (vcf_start + len(alt)) - else range(vcf_start, (vcf_start + len(alt)))) - try: - mut_rng = list(map( - lambda x: r2s.ref2querypos(read, x), max_rng)) - except IndexError: - read_flag |= c.ValidatorFlags.NOT_ALIGNED.value - else: - if (mut_rng[0][1] != c.Ops.MATCH.value or - mut_rng[-1][1] != c.Ops.MATCH.value): - read_flag |= c.ValidatorFlags.BAD_OP.value - if read.query_sequence[mut_pos:len(alt)] != alt: # type: ignore - read_flag |= c.ValidatorFlags.NOT_ALT.value - - if read_flag == c.ValidatorFlags.CLEAR.value: - if not (read.flag & 0x40): - # this looks like it should be checked for indexing snags - pair_start = read.reference_start - pair_end = read.reference_end - if read.flag & 0x10: - # through an unfortunate quirk of history - # "next" means "mate" - # so this is reliable (pulls RNEXT) - mate_end = r2s.ref_end_via_cigar(mate_cig, - read.next_reference_start) # type:ignore - if read.reference_start <= mate_end: - pair_start = mate_end + 1 
+ if vcf_rlen == len(alt) == 1: + # checking bad_op superfluous + # if it aligns then + # either match or mismatch + if read.query_sequence[mut_pos] != alt: # type: ignore + read_flag |= c.ValidatorFlags.NOT_ALT.value + if read.query_qualities[mut_pos] < min_basequal: # type: ignore + read_flag |= c.ValidatorFlags.BASEQUAL.value + elif mut_type == 'D': # DEL - doesn't check for matches before and after... + mut_alns = [q for q, r in read.get_aligned_pairs() if r in range(vcf_start, vcf_stop)] + if any([x is not None for x in mut_alns]): + read_flag |= c.ValidatorFlags.BAD_OP.value + elif mut_type == 'I': # INS + try: + first_pos, _ = r2s.ref2querypos(read, vcf_start) + except IndexError: + read_flag |= c.ValidatorFlags.NOT_ALIGNED.value + else: + if first_pos + len(alt) > read.query_length: + read_flag |= c.ValidatorFlags.SHORT.value else: - if read.reference_end >= read.next_reference_start: # type:ignore - pair_end = read.next_reference_start - 1 - if not (pair_start <= vcf_start <= pair_end): # type:ignore - read_flag |= c.ValidatorFlags.NO_OVERLAP.value + # MUST TEST + mut_alns = [(q, r) for q, r in read.get_aligned_pairs() if q in range(first_pos + 1, first_pos + vcf_rlen)] + if any([r is not None for _, r in mut_alns]): + read_flag |= c.ValidatorFlags.BAD_OP.value + if read.query_sequence[first_pos:len(alt)] != alt: + read_flag |= c.ValidatorFlags.NOT_ALT.value + + if read_flag == c.ValidatorFlags.CLEAR.value and not (read.flag & 0x40): + # check for indexing snags + pair_start = read.reference_start + pair_end = read.reference_end + if read.flag & 0x10: + # through an unfortunate quirk of history + # "next" means "mate" + # so this is reliable + mate_end = r2s.ref_end_via_cigar(mate_cig, + read.next_reference_start) # type:ignore + if read.reference_start <= mate_end: + pair_start = mate_end + 1 + else: + if read.reference_end >= read.next_reference_start: # type:ignore + pair_end = read.next_reference_start - 1 + if not (pair_start <= vcf_start <= 
pair_end): # type:ignore + read_flag |= c.ValidatorFlags.NO_OVERLAP.value return read_flag @@ -154,6 +132,7 @@ def test_variant( vcf_rec: pysam.VariantRecord, mutant_alignments: dict[str, pysam.AlignmentFile], alt: str, + mut_type: str, al_thresh: float, max_span: int, position_fraction_thresh: float, @@ -187,7 +166,8 @@ def test_variant( alt=alt, vcf_start=vcf_rec.start, vcf_stop=vcf_rec.stop, - vcf_rlen=vcf_rec.rlen) + vcf_rlen=vcf_rec.rlen, + mut_type=mut_type) if read_flag == c.ValidatorFlags.CLEAR.value: mut_reads[mut_sample].append(read) @@ -324,7 +304,17 @@ def test_record_per_alt( if k in samples_w_mutants} filt_d = {} for alt in vcf_rec.alts: - filt_d[alt] = variant_tester(vcf_rec, alignments_w_mutants, alt) + if vcf_rec.rlen == len(alt) and set(alt).issubset(set(['A', 'C', 'T', 'G', 'N', '*'])): + mut_type = 'S' + elif len(alt) < vcf_rec.rlen or alt == '.': # DEL - DOES NOT SUPPORT TYPE IDS + mut_type = 'D' + elif vcf_rec.rlen == 1 and set(alt).issubset(set(['A', 'C', 'T', 'G', 'N', '*'])): # INS - DOES NOT SUPPORT TYPE IDS + mut_type = 'I' + else: + ## ERROR + logging.warning('could not type mutation POS={} REF={} ALT={}, skipping alt'.format(vcf_rec.pos, vcf_rec.ref, alt)) + continue + filt_d[alt] = variant_tester(vcf_rec, alignments_w_mutants, alt, mut_type) return filt_d diff --git a/pyproject.toml b/pyproject.toml index 45d0788..e44bf0f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -15,6 +15,9 @@ hairpin2 = "hairpin2.main:main_cli" [tool.poetry.group.dev.dependencies] pytest = "^8.3.3" +pytest-cov = "^5.0.0" +faker-biology = "^0.6.4" +factory-boy = "^3.3.1" [build-system] requires = ["poetry-core"] diff --git a/test/test_validate_read.py b/test/test_validate_read.py index 41bc011..92f015b 100644 --- a/test/test_validate_read.py +++ b/test/test_validate_read.py @@ -32,6 +32,7 @@ factory.random.reseed_random(2501) random.seed(2501) + # smoke test validate_read class ExtendedBioProvider(Bioseq): def quality_string(self, length): @@ -106,26 +107,17 
@@ class Meta: mc = factory.LazyAttribute(lambda _: fake.cigar_string(length=random.randint(50, 200))) -def test_factory(): - t = ReadFactory().segment - print(t.flag) - - -@pytest.mark.parametrize("test_read", [ReadFactory().segment for _ in range(100)]) -def test_smoke(test_read): - start_lower_bound = test_read.reference_start + 2 - start_upper_bound = test_read.reference_start + len(test_read.query_sequence) - 4 - start = random.randint(start_lower_bound, start_upper_bound) - alt = test_read.query_sequence[start:start + random.randint(1, 4)] - stop = start + len(alt) - hp2.validate_read(test_read, vcf_start = start, vcf_stop = stop, vcf_rlen = stop - start, alt = alt, min_mapqual=11, min_clipqual=35, min_basequal=25) # need to verify I'm actually getting the positions right so it's not just all "not aligned" (print flags) - +# @pytest.mark.parametrize("test_read", [ReadFactory().segment for _ in range(100)]) +# def test_smoke(test_read): +# mut_pos = random.randint(1, len(test_read.query_sequence) - 1) +# start = test_read.reference_start + mut_pos +# alt = test_read.query_sequence[mut_pos:mut_pos + random.randint(1, 3)] # n.b. func does not fail with no alt +# stop = start + len(alt) +# vflag = hp2.validate_read(test_read, vcf_start=start, vcf_stop=stop, vcf_rlen=stop - start, alt=alt, min_mapqual=11, min_clipqual=35, min_basequal=25) # need to verify I'm actually getting the positions right so it's not just all "not aligned" (print flags) +# print(format(vflag, '010b')) -# pysam guards against: -# quality and seq length mismatch -# flag not set -# reference id is none +# INDEL TESTING r = pysam.AlignedSegment() r.query_name = 'read1' r.query_sequence = 'CTGDAAAACC' @@ -135,142 +127,196 @@ def test_smoke(test_read): r.reference_start = 95 r.next_reference_start = 105 r.mapping_quality = 20 -r.cigarstring = '10M' -r.set_tag('MC', '10M') - - -# ideally there would be a test for each time read_flag is set -# i.e. 
test every path of achieving a given flag -# so far there's a test for each flag - -def test_ideal(): - assert hp2.validate_read(read=r, - vcf_start=99, - vcf_stop=100, - vcf_rlen=1, - alt='A', - min_mapqual=11, - min_clipqual=35, - min_basequal=25) == c.ValidatorFlags.CLEAR.value - +r.cigarstring = '4M3D3M' +r.set_tag('MC', '4M3D3M') -def test_mapqual(): - assert hp2.validate_read(read=r, +def test_indel(): + vrf = hp2.validate_read(read=r, vcf_start=99, - vcf_stop=100, - vcf_rlen=1, - alt='A', - min_mapqual=30, - min_clipqual=35, - min_basequal=25) == c.ValidatorFlags.MAPQUAL.value - - -def test_not_aligned(): - assert hp2.validate_read(read=r, - vcf_start=200, - vcf_stop=100, - vcf_rlen=1, - alt='A', - min_mapqual=11, - min_clipqual=35, - min_basequal=25) == c.ValidatorFlags.NOT_ALIGNED.value - - -def test_not_alt(): - assert hp2.validate_read(read=r, - vcf_start=99, - vcf_stop=100, - vcf_rlen=1, - alt='T', + vcf_stop=102, + vcf_rlen=3, + alt='.', + mut_type='D', min_mapqual=11, min_clipqual=35, - min_basequal=25) == c.ValidatorFlags.NOT_ALT.value + min_basequal=25) -def test_basequal(): - assert hp2.validate_read(read=r, - vcf_start=99, - vcf_stop=100, - vcf_rlen=1, - alt='A', - min_mapqual=11, - min_clipqual=35, - min_basequal=40) == c.ValidatorFlags.BASEQUAL.value - - -def test_read_short(): - assert hp2.validate_read(read=r, - vcf_start=99, - vcf_stop=100, - vcf_rlen=1, - alt='ATTTTTTTTTTTTTT', - min_mapqual=11, - min_clipqual=35, - min_basequal=25) == c.ValidatorFlags.SHORT.value - - -def test_missing_cigar(): - rc = copy.deepcopy(r) - rc.cigarstring = None - rc.cigartuples = None - assert hp2.validate_read(read=rc, - vcf_start=99, - vcf_stop=100, - vcf_rlen=1, - alt='A', - min_mapqual=11, - min_clipqual=35, - min_basequal=25) == c.ValidatorFlags.READ_FIELDS_MISSING.value - - -def test_bad_op(): - rc = copy.deepcopy(r) - rc.cigartuples = [(c.Ops.EQUAL.value, 10)] - assert hp2.validate_read(read=rc, - vcf_start=99, - vcf_stop=100, - vcf_rlen=1, - alt='A', - 
min_mapqual=11, - min_clipqual=35, - min_basequal=25) == c.ValidatorFlags.BAD_OP.value - - -def test_clipqual(): - rc = copy.deepcopy(r) - rc.cigarstring = '1S9M' - assert hp2.validate_read(read=rc, - vcf_start=99, - vcf_stop=100, - vcf_rlen=1, - alt='A', - min_mapqual=11, - min_clipqual=40, - min_basequal=25) == c.ValidatorFlags.CLIPQUAL.value - - -def test_no_overlap_0x10(): - rc = copy.deepcopy(r) - rc.flag |= 0x10 - rc.set_tag('MC', '3M') - assert hp2.validate_read(read=rc, - vcf_start=99, - vcf_stop=100, - vcf_rlen=1, - alt='A', - min_mapqual=11, - min_clipqual=35, - min_basequal=25) == c.ValidatorFlags.NO_OVERLAP.value - - -def test_no_overlap(): - rc = copy.deepcopy(r) - rc.next_reference_start = 98 - assert hp2.validate_read(read=rc, - vcf_start=99, - vcf_stop=100, - vcf_rlen=1, - alt='A', - min_mapqual=11, - min_clipqual=35, - min_basequal=25) == c.ValidatorFlags.NO_OVERLAP.value +# pysam guards against: +# quality and seq length mismatch +# flag not set +# reference id is none +# r = pysam.AlignedSegment() +# r.query_name = 'read1' +# r.query_sequence = 'CTGDAAAACC' +# r.query_qualities = pysam.qualitystring_to_array('AAAAAAAAAA') +# r.flag = 0x2 +# r.reference_id = 0 +# r.reference_start = 95 +# r.next_reference_start = 105 +# r.mapping_quality = 20 +# r.cigarstring = '10M' +# r.set_tag('MC', '10M') + + +# BASIS PATH TESTING +# test every statement at least once +###### + +# def test_set_flag(): +# rc = copy.deepcopy(r) +# rc.flag = 0x10 +# assert hp2.validate_read(read=rc, +# vcf_start=99, +# vcf_stop=100, +# vcf_rlen=1, +# alt='A', +# min_mapqual=11, +# min_clipqual=35, +# min_basequal=25) == c.ValidatorFlags.FLAG.value + + +# def test_set_mapqual(): +# assert hp2.validate_read(read=r, +# vcf_start=99, +# vcf_stop=100, +# vcf_rlen=1, +# alt='A', +# min_mapqual=30, +# min_clipqual=35, +# min_basequal=25) == c.ValidatorFlags.MAPQUAL.value + + +# def test_missing_mc(): +# rc = copy.deepcopy(r) +# rc.set_tag('MC', None) +# assert hp2.validate_read(read=rc, 
+# vcf_start=99, +# vcf_stop=100, +# vcf_rlen=1, +# alt='A', +# min_mapqual=11, +# min_clipqual=35, +# min_basequal=25) == c.ValidatorFlags.READ_FIELDS_MISSING.value + + +# def test_set_missing_fields(): +# rc = copy.deepcopy(r) +# rc.cigarstring = None +# rc.cigartuples = None +# assert hp2.validate_read(read=rc, +# vcf_start=99, +# vcf_stop=100, +# vcf_rlen=1, +# alt='A', +# min_mapqual=11, +# min_clipqual=35, +# min_basequal=25) == c.ValidatorFlags.READ_FIELDS_MISSING.value + + +# def test_set_clipqual(): +# rc = copy.deepcopy(r) +# rc.cigarstring = '1S9M' +# assert hp2.validate_read(read=rc, +# vcf_start=99, +# vcf_stop=100, +# vcf_rlen=1, +# alt='A', +# min_mapqual=11, +# min_clipqual=40, +# min_basequal=25) == c.ValidatorFlags.CLIPQUAL.value + + +# def test_not_aligned_first(): +# assert hp2.validate_read(read=r, +# vcf_start=200, +# vcf_stop=100, +# vcf_rlen=1, +# alt='A', +# min_mapqual=11, +# min_clipqual=35, +# min_basequal=25) == c.ValidatorFlags.NOT_ALIGNED.value + + +# def test_bad_sub(): +# assert hp2.validate_read(read=r, +# vcf_start=99, +# vcf_stop=100, +# vcf_rlen=1, +# alt='T', +# min_mapqual=11, +# min_clipqual=35, +# min_basequal=50) == (c.ValidatorFlags.NOT_ALT.value | c.ValidatorFlags.BASEQUAL.value) + + +# def test_good_sub(): +# assert hp2.validate_read(read=r, +# vcf_start=99, +# vcf_stop=100, +# vcf_rlen=1, +# alt='A', +# min_mapqual=11, +# min_clipqual=35, +# min_basequal=25) == c.ValidatorFlags.CLEAR.value + + +# def test_basequal(): +# assert hp2.validate_read(read=r, +# vcf_start=99, +# vcf_stop=100, +# vcf_rlen=1, +# alt='A', +# min_mapqual=11, +# min_clipqual=35, +# min_basequal=40) == c.ValidatorFlags.BASEQUAL.value + + +# def test_read_short(): +# assert hp2.validate_read(read=r, +# vcf_start=99, +# vcf_stop=100, +# vcf_rlen=1, +# alt='ATTTTTTTTTTTTTT', +# min_mapqual=11, +# min_clipqual=35, +# min_basequal=25) == c.ValidatorFlags.SHORT.value + + +# def test_bad_op(): +# rc = copy.deepcopy(r) +# rc.cigartuples = 
[(c.Ops.EQUAL.value, 10)] +# assert hp2.validate_read(read=rc, +# vcf_start=99, +# vcf_stop=100, +# vcf_rlen=1, +# alt='A', +# min_mapqual=11, +# min_clipqual=35, +# min_basequal=25) == c.ValidatorFlags.BAD_OP.value + + +# def test_no_overlap_0x10(): +# rc = copy.deepcopy(r) +# rc.flag |= 0x10 +# rc.set_tag('MC', '3M') +# assert hp2.validate_read(read=rc, +# vcf_start=99, +# vcf_stop=100, +# vcf_rlen=1, +# alt='A', +# min_mapqual=11, +# min_clipqual=35, +# min_basequal=25) == c.ValidatorFlags.NO_OVERLAP.value + + +# def test_no_overlap(): +# rc = copy.deepcopy(r) +# rc.next_reference_start = 98 +# assert hp2.validate_read(read=rc, +# vcf_start=99, +# vcf_stop=100, +# vcf_rlen=1, +# alt='A', +# min_mapqual=11, +# min_clipqual=35, +# min_basequal=25) == c.ValidatorFlags.NO_OVERLAP.value From 3db388a8b0efc19fd1a29d7d80d8e4ee949bc9f2 Mon Sep 17 00:00:00 2001 From: ab63 Date: Mon, 14 Oct 2024 14:18:51 +0000 Subject: [PATCH 129/165] a good pass at basis path testing validate read, and shoring it up in the process --- hairpin2/constants.py | 2 +- hairpin2/main.py | 70 +++---- test/__init__.py | 0 test/test_validate_read.py | 417 ++++++++++++++++++++----------------- 4 files changed, 252 insertions(+), 237 deletions(-) create mode 100644 test/__init__.py diff --git a/hairpin2/constants.py b/hairpin2/constants.py index bf98afb..109ab7e 100644 --- a/hairpin2/constants.py +++ b/hairpin2/constants.py @@ -62,7 +62,7 @@ 'BASEQUAL', 'SHORT', 'CLIPQUAL', - 'NO_OVERLAP'], + 'OVERLAP'], start=0) diff --git a/hairpin2/main.py b/hairpin2/main.py index 7cfa22b..f243ade 100644 --- a/hairpin2/main.py +++ b/hairpin2/main.py @@ -34,14 +34,18 @@ def validate_read( read: pysam.AlignedSegment, vcf_start: int, vcf_stop: int, - vcf_rlen: int, - alt: Literal['A', 'C', 'G', 'T', 'N', '*'], + alt: str, mut_type: Literal['S', 'D', 'I'], min_mapqual: int, min_clipqual: int, min_basequal: int, ) -> int: - # write checks for alt and mut type! 
+ sup_char_alt = ['A', 'C', 'G', 'T', 'N', '*', '.'] + if any([b not in sup_char_alt for b in alt]): + raise ValueError('unsupported character in alt: {} - supports {}'.format(alt, ', '.join(sup_char_alt))) + if mut_type not in ['S', 'D', 'I']: + raise ValueError('unsupported mut_type: {} - supports \'S\' (SUB) \'D\' (DEL) \'I\' (INS)'.format(mut_type)) + read_flag = c.ValidatorFlags.CLEAR.value try: @@ -50,15 +54,12 @@ def validate_read( mate_cig = None if any(x is None for x in - [read.reference_start, - read.reference_end, + [read.reference_end, read.query_sequence, read.query_qualities, read.query_alignment_qualities, read.cigarstring, read.cigartuples, - read.flag, - read.mapping_quality, mate_cig]): read_flag |= c.ValidatorFlags.READ_FIELDS_MISSING.value else: @@ -71,60 +72,48 @@ def validate_read( if ('S' in read.cigarstring and # type: ignore mean(read.query_alignment_qualities) < min_clipqual): # type: ignore read_flag |= c.ValidatorFlags.CLIPQUAL.value - # First, check for sub - if mut_type == 'S': + + if mut_type == 'S': # SUB try: - # VCF 1-INDEXED, alignments 0-INDEXED - # (vcf_start = 0-indexed mutation position) mut_pos, mut_op = r2s.ref2querypos(read, vcf_start) except IndexError: read_flag |= c.ValidatorFlags.NOT_ALIGNED.value else: - if vcf_rlen == len(alt) == 1: - # checking bad_op superfluous - # if it aligns then - # either match or mismatch - if read.query_sequence[mut_pos] != alt: # type: ignore - read_flag |= c.ValidatorFlags.NOT_ALT.value - if read.query_qualities[mut_pos] < min_basequal: # type: ignore - read_flag |= c.ValidatorFlags.BASEQUAL.value + if read.query_sequence[mut_pos:mut_pos + len(alt)] != alt: # type: ignore + read_flag |= c.ValidatorFlags.NOT_ALT.value + if any([bq < min_basequal for bq in read.query_qualities[mut_pos:mut_pos + len(alt)]]): # type: ignore + read_flag |= c.ValidatorFlags.BASEQUAL.value elif mut_type == 'D': # DEL - doesn't check for matches before and after... 
+ # this could error if read doesn't cover region (as could all) mut_alns = [q for q, r in read.get_aligned_pairs() if r in range(vcf_start, vcf_stop)] if any([x is not None for x in mut_alns]): read_flag |= c.ValidatorFlags.BAD_OP.value elif mut_type == 'I': # INS try: - first_pos, _ = r2s.ref2querypos(read, vcf_start) + prior_pos, _ = r2s.ref2querypos(read, vcf_start) except IndexError: read_flag |= c.ValidatorFlags.NOT_ALIGNED.value else: - if first_pos + len(alt) > read.query_length: + if prior_pos + len(alt) > read.query_length: read_flag |= c.ValidatorFlags.SHORT.value else: - # MUST TEST - mut_alns = [(q, r) for q, r in read.get_aligned_pairs() if q in range(first_pos + 1, first_pos + vcf_rlen)] + mut_alns = [(q, r) for q, r in read.get_aligned_pairs() if q in range(prior_pos + 1, prior_pos + len(alt) + 1)] if any([r is not None for _, r in mut_alns]): read_flag |= c.ValidatorFlags.BAD_OP.value - if read.query_sequence[first_pos:len(alt)] != alt: + if read.query_sequence[prior_pos + 1:prior_pos + len(alt) + 1] != alt: read_flag |= c.ValidatorFlags.NOT_ALT.value if read_flag == c.ValidatorFlags.CLEAR.value and not (read.flag & 0x40): - # check for indexing snags - pair_start = read.reference_start - pair_end = read.reference_end - if read.flag & 0x10: - # through an unfortunate quirk of history - # "next" means "mate" - # so this is reliable - mate_end = r2s.ref_end_via_cigar(mate_cig, - read.next_reference_start) # type:ignore - if read.reference_start <= mate_end: - pair_start = mate_end + 1 - else: - if read.reference_end >= read.next_reference_start: # type:ignore - pair_end = read.next_reference_start - 1 - if not (pair_start <= vcf_start <= pair_end): # type:ignore - read_flag |= c.ValidatorFlags.NO_OVERLAP.value + read_range = range(read.reference_start, + read.reference_end) + mate_range = range(read.next_reference_start, + r2s.ref_end_via_cigar(mate_cig, + read.next_reference_start) + ) + ref_overlap = set(read_range).intersection(mate_range) + if 
vcf_start in ref_overlap: + read_flag |= c.ValidatorFlags.OVERLAP.value + return read_flag @@ -166,7 +155,6 @@ def test_variant( alt=alt, vcf_start=vcf_rec.start, vcf_stop=vcf_rec.stop, - vcf_rlen=vcf_rec.rlen, mut_type=mut_type) if read_flag == c.ValidatorFlags.CLEAR.value: diff --git a/test/__init__.py b/test/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/test/test_validate_read.py b/test/test_validate_read.py index 92f015b..fd7d624 100644 --- a/test/test_validate_read.py +++ b/test/test_validate_read.py @@ -48,7 +48,7 @@ def cigar_string(self, length): opchars_noclip = 'MIDNP=XB' bound = 200 if length > 200 else length cig_op_lengths = [] - while(bound > 0): + while (bound > 0): oplen = random.randint(1, bound) cig_op_lengths.append(random.randint(1, oplen)) cig_sum = sum(cig_op_lengths) @@ -107,216 +107,243 @@ class Meta: mc = factory.LazyAttribute(lambda _: fake.cigar_string(length=random.randint(50, 200))) -# @pytest.mark.parametrize("test_read", [ReadFactory().segment for _ in range(100)]) -# def test_smoke(test_read): -# mut_pos = random.randint(1, len(test_read.query_sequence) - 1) -# start = test_read.reference_start + mut_pos -# alt = test_read.query_sequence[mut_pos:mut_pos + random.randint(1, 3)] # n.b. 
func does not fail with no alt -# stop = start + len(alt) -# vflag = hp2.validate_read(test_read, vcf_start=start, vcf_stop=stop, vcf_rlen=stop - start, alt=alt, min_mapqual=11, min_clipqual=35, min_basequal=25) # need to verify I'm actually getting the positions right so it's not just all "not aligned" (print flags) -# print(format(vflag, '010b')) +@pytest.mark.parametrize("test_read", [ReadFactory().segment for _ in range(1000)]) +def test_smoke(test_read): + mut_pos = random.randint(1, len(test_read.query_sequence) - 1) + start = test_read.reference_start + mut_pos + alt = random.choices([test_read.query_sequence[mut_pos:mut_pos + random.randint(1, 3)], '.'], cum_weights=[66, 100])[0] + if alt == '.': + mut_type_str = 'D' + stop = start + random.randint(1, 3) + elif len(alt) == 1: + mut_type_str = random.choice(['S', 'I']) + stop = start + 1 + else: + mut_type_str = random.choice(['S', 'I']) + stop = start + 1 if mut_type_str == 'I' else start + len(alt) + vflag = hp2.validate_read(test_read, + vcf_start=start, + vcf_stop=stop, + alt=alt, + mut_type=mut_type_str, + min_mapqual=11, + min_clipqual=35, + min_basequal=25) + print(format(vflag, '010b')) -# INDEL TESTING +# BASIS PATH TESTING +# test every node and edge at least once +# N.B. 
+# pysam guards against: +# quality and seq length mismatch +# reference id is none +# ---- +# perfect read pair: r = pysam.AlignedSegment() r.query_name = 'read1' r.query_sequence = 'CTGDAAAACC' r.query_qualities = pysam.qualitystring_to_array('AAAAAAAAAA') -r.flag = 0x2 +r.flag = 0x43 r.reference_id = 0 r.reference_start = 95 -r.next_reference_start = 105 +r.next_reference_start = 95 r.mapping_quality = 20 -r.cigarstring = '4M3D3M' -r.set_tag('MC', '4M3D3M') +r.cigarstring = '10M' +r.set_tag('MC', '10M') + + +def test_path_unsupported_alt(): + with pytest.raises(ValueError): + hp2.validate_read(read=r, + vcf_start=99, + vcf_stop=100, + alt='8', + mut_type='S', + min_mapqual=11, + min_clipqual=35, + min_basequal=25) + + +def test_path_unsupported_mut_type(): + with pytest.raises(ValueError): + hp2.validate_read(read=r, + vcf_start=99, + vcf_stop=100, + alt='A', + mut_type='8', + min_mapqual=11, + min_clipqual=35, + min_basequal=25) + + +def test_path_missing_mc(): + rc = copy.deepcopy(r) + rc.set_tag('MC', None) + assert hp2.validate_read(read=rc, + vcf_start=99, + vcf_stop=100, + alt='A', + mut_type='S', + min_mapqual=11, + min_clipqual=35, + min_basequal=25) == c.ValidatorFlags.READ_FIELDS_MISSING.value + + +def test_path_missing_field(): + rc = copy.deepcopy(r) + rc.cigarstring = None + assert hp2.validate_read(read=rc, + vcf_start=99, + vcf_stop=100, + alt='A', + mut_type='S', + min_mapqual=11, + min_clipqual=35, + min_basequal=25) == c.ValidatorFlags.READ_FIELDS_MISSING.value + + +def test_path_set_flag_mapqual_clipqual(): + rc = copy.deepcopy(r) + rc.flag = 0x200 + rc.cigarstring = '1S9M' + assert hp2.validate_read(read=rc, + vcf_start=99, + vcf_stop=100, + alt='A', + mut_type='S', + min_mapqual=30, + min_clipqual=40, + min_basequal=25) == (c.ValidatorFlags.FLAG.value | c.ValidatorFlags.MAPQUAL.value | c.ValidatorFlags.CLIPQUAL.value) + + +def test_path_sub_not_aligned(): + assert hp2.validate_read(read=r, + vcf_start=200, + vcf_stop=100, + alt='A', + 
mut_type='S', + min_mapqual=11, + min_clipqual=35, + min_basequal=25) == c.ValidatorFlags.NOT_ALIGNED.value + + +def test_path_bad_sub(): + assert hp2.validate_read(read=r, + vcf_start=99, + vcf_stop=100, + alt='T', + mut_type='S', + min_mapqual=11, + min_clipqual=35, + min_basequal=50) == (c.ValidatorFlags.NOT_ALT.value | c.ValidatorFlags.BASEQUAL.value) + + +def test_path_good_sub(): + assert hp2.validate_read(read=r, + vcf_start=99, + vcf_stop=100, + alt='A', + mut_type='S', + min_mapqual=11, + min_clipqual=35, + min_basequal=25) == c.ValidatorFlags.CLEAR.value + -def test_indel(): - vrf = hp2.validate_read(read=r, +# checks cigar ops +def test_path_del_bad_op(): + assert hp2.validate_read(read=r, vcf_start=99, - vcf_stop=102, - vcf_rlen=3, + vcf_stop=100, alt='.', mut_type='D', min_mapqual=11, min_clipqual=35, - min_basequal=25) + min_basequal=25) == c.ValidatorFlags.BAD_OP.value -# pysam guards against: -# quality and seq length mismatch -# flag not set -# reference id is none -# r = pysam.AlignedSegment() -# r.query_name = 'read1' -# r.query_sequence = 'CTGDAAAACC' -# r.query_qualities = pysam.qualitystring_to_array('AAAAAAAAAA') -# r.flag = 0x2 -# r.reference_id = 0 -# r.reference_start = 95 -# r.next_reference_start = 105 -# r.mapping_quality = 20 -# r.cigarstring = '10M' -# r.set_tag('MC', '10M') +# 2bp del +def test_path_good_del(): + rc = copy.deepcopy(r) + rc.cigarstring = '4M2D6M' + assert hp2.validate_read(read=rc, + vcf_start=99, + vcf_stop=101, + alt='.', + mut_type='D', + min_mapqual=11, + min_clipqual=35, + min_basequal=25) == c.ValidatorFlags.CLEAR.value -# BASIS PATH TESTING -# test every statement at least once -###### - -# def test_set_flag(): -# rc = copy.deepcopy(r) -# rc.flag = 0x10 -# assert hp2.validate_read(read=rc, -# vcf_start=99, -# vcf_stop=100, -# vcf_rlen=1, -# alt='A', -# min_mapqual=11, -# min_clipqual=35, -# min_basequal=25) == c.ValidatorFlags.FLAG.value - - -# def test_set_mapqual(): -# assert hp2.validate_read(read=r, -# 
vcf_start=99, -# vcf_stop=100, -# vcf_rlen=1, -# alt='A', -# min_mapqual=30, -# min_clipqual=35, -# min_basequal=25) == c.ValidatorFlags.MAPQUAL.value - - -# def test_missing_mc(): -# rc = copy.deepcopy(r) -# rc.set_tag('MC', None) -# assert hp2.validate_read(read=rc, -# vcf_start=99, -# vcf_stop=100, -# vcf_rlen=1, -# alt='A', -# min_mapqual=11, -# min_clipqual=35, -# min_basequal=25) == c.ValidatorFlags.READ_FIELDS_MISSING.value - - -# def test_set_missing_fields(): -# rc = copy.deepcopy(r) -# rc.cigarstring = None -# rc.cigartuples = None -# assert hp2.validate_read(read=rc, -# vcf_start=99, -# vcf_stop=100, -# vcf_rlen=1, -# alt='A', -# min_mapqual=11, -# min_clipqual=35, -# min_basequal=25) == c.ValidatorFlags.READ_FIELDS_MISSING.value - - -# def test_set_clipqual(): -# rc = copy.deepcopy(r) -# rc.cigarstring = '1S9M' -# assert hp2.validate_read(read=rc, -# vcf_start=99, -# vcf_stop=100, -# vcf_rlen=1, -# alt='A', -# min_mapqual=11, -# min_clipqual=40, -# min_basequal=25) == c.ValidatorFlags.CLIPQUAL.value - - -# def test_not_aligned_first(): -# assert hp2.validate_read(read=r, -# vcf_start=200, -# vcf_stop=100, -# vcf_rlen=1, -# alt='A', -# min_mapqual=11, -# min_clipqual=35, -# min_basequal=25) == c.ValidatorFlags.NOT_ALIGNED.value - - -# def test_bad_sub(): -# assert hp2.validate_read(read=r, -# vcf_start=99, -# vcf_stop=100, -# vcf_rlen=1, -# alt='T', -# min_mapqual=11, -# min_clipqual=35, -# min_basequal=50) == (c.ValidatorFlags.NOT_ALT.value | c.ValidatorFlags.BASEQUAL.value) - - -# def test_good_sub(): -# assert hp2.validate_read(read=r, -# vcf_start=99, -# vcf_stop=100, -# vcf_rlen=1, -# alt='A', -# min_mapqual=11, -# min_clipqual=35, -# min_basequal=25) == c.ValidatorFlags.CLEAR.value - - -# def test_basequal(): -# assert hp2.validate_read(read=r, -# vcf_start=99, -# vcf_stop=100, -# vcf_rlen=1, -# alt='A', -# min_mapqual=11, -# min_clipqual=35, -# min_basequal=40) == c.ValidatorFlags.BASEQUAL.value - - -# def test_read_short(): -# assert 
hp2.validate_read(read=r, -# vcf_start=99, -# vcf_stop=100, -# vcf_rlen=1, -# alt='ATTTTTTTTTTTTTT', -# min_mapqual=11, -# min_clipqual=35, -# min_basequal=25) == c.ValidatorFlags.SHORT.value - - -# def test_bad_op(): -# rc = copy.deepcopy(r) -# rc.cigartuples = [(c.Ops.EQUAL.value, 10)] -# assert hp2.validate_read(read=rc, -# vcf_start=99, -# vcf_stop=100, -# vcf_rlen=1, -# alt='A', -# min_mapqual=11, -# min_clipqual=35, -# min_basequal=25) == c.ValidatorFlags.BAD_OP.value - - -# def test_no_overlap_0x10(): -# rc = copy.deepcopy(r) -# rc.flag |= 0x10 -# rc.set_tag('MC', '3M') -# assert hp2.validate_read(read=rc, -# vcf_start=99, -# vcf_stop=100, -# vcf_rlen=1, -# alt='A', -# min_mapqual=11, -# min_clipqual=35, -# min_basequal=25) == c.ValidatorFlags.NO_OVERLAP.value - - -# def test_no_overlap(): -# rc = copy.deepcopy(r) -# rc.next_reference_start = 98 -# assert hp2.validate_read(read=rc, -# vcf_start=99, -# vcf_stop=100, -# vcf_rlen=1, -# alt='A', -# min_mapqual=11, -# min_clipqual=35, -# min_basequal=25) == c.ValidatorFlags.NO_OVERLAP.value +def test_path_ins_not_aligned(): + assert hp2.validate_read(read=r, + vcf_start=200, + vcf_stop=100, + alt='A', + mut_type='I', + min_mapqual=11, + min_clipqual=35, + min_basequal=25) == c.ValidatorFlags.NOT_ALIGNED.value + + +def test_path_ins_short(): + assert hp2.validate_read(read=r, + vcf_start=99, + vcf_stop=100, + alt='ATTTTTTTTTTTTTT', + mut_type='I', + min_mapqual=11, + min_clipqual=35, + min_basequal=25) == c.ValidatorFlags.SHORT.value + + +def test_path_bad_ins(): + assert hp2.validate_read(read=r, + vcf_start=99, + vcf_stop=100, + alt='AC', + mut_type='I', + min_mapqual=11, + min_clipqual=35, + min_basequal=25) == (c.ValidatorFlags.BAD_OP.value | c.ValidatorFlags.NOT_ALT.value) + + +def test_path_good_ins(): + rc = copy.deepcopy(r) + rc.cigarstring = '5M2I3M' + assert hp2.validate_read(read=rc, + vcf_start=99, + vcf_stop=100, + alt='AA', + mut_type='I', + min_mapqual=11, + min_clipqual=35, + min_basequal=25) == 
c.ValidatorFlags.CLEAR.value + + +def test_path_overlap(): + rc = copy.deepcopy(r) + rc.flag = 0x83 + assert hp2.validate_read(read=rc, + vcf_start=99, + vcf_stop=100, + alt='A', + mut_type='S', + min_mapqual=11, + min_clipqual=35, + min_basequal=25) == c.ValidatorFlags.OVERLAP.value + + +def test_path_no_overlap(): + rc = copy.deepcopy(r) + rc.flag = 0x83 + rc.set_tag('MC', '3M') + assert hp2.validate_read(read=rc, + vcf_start=99, + vcf_stop=100, + alt='A', + mut_type='S', + min_mapqual=11, + min_clipqual=35, + min_basequal=25) == c.ValidatorFlags.CLEAR.value From 6d404f6f603020b372f7595071b0a6a3a50939cb Mon Sep 17 00:00:00 2001 From: ab63 Date: Mon, 14 Oct 2024 14:54:33 +0000 Subject: [PATCH 130/165] fix docker tests --- docker-run-unit-tests.sh | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/docker-run-unit-tests.sh b/docker-run-unit-tests.sh index 2bde71f..ccc2dbd 100644 --- a/docker-run-unit-tests.sh +++ b/docker-run-unit-tests.sh @@ -11,6 +11,9 @@ echo "Package source directory: ${PKG_DIR}" pip install \ pytest==8.2.2 \ - pytest-cov==5.0.0 && \ + pytest-cov==5.0.0 \ + faker-biology==0.6.4 \ + factory-boy==3.3.1 \ + pysam==0.22 && \ pytest --cov="${PKG_DIR}" "${TEST_DIR}" From f648028e90ec0525599c10cb1b1a4538f6557009 Mon Sep 17 00:00:00 2001 From: ab63 Date: Mon, 14 Oct 2024 15:18:35 +0000 Subject: [PATCH 131/165] modify dockerfile user --- Dockerfile | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index 3e0a602..822edda 100644 --- a/Dockerfile +++ b/Dockerfile @@ -6,8 +6,13 @@ WORKDIR /hairpin2 # Copy the current working directory contents into the container COPY . 
/hairpin2 +RUN adduser --disabled-password --gecos '' ubuntu && chsh -s /bin/bash && mkdir -p /home/ubuntu + +USER ubuntu +WORKDIR /home/ubuntu + # Install the hairpin package -RUN pip install --root-user-action ignore /hairpin2/ +RUN pip install hairpin2/ # Define a test script to check the installation of hairpin RUN LOC=$(which hairpin2) \ From 69ec35636805842ca333e3aa501bc7ed6ac88af4 Mon Sep 17 00:00:00 2001 From: ab63 Date: Mon, 14 Oct 2024 15:23:47 +0000 Subject: [PATCH 132/165] missed a slash lol --- Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index 822edda..8522de5 100644 --- a/Dockerfile +++ b/Dockerfile @@ -12,7 +12,7 @@ USER ubuntu WORKDIR /home/ubuntu # Install the hairpin package -RUN pip install hairpin2/ +RUN pip install /hairpin2 # Define a test script to check the installation of hairpin RUN LOC=$(which hairpin2) \ From 95b9f91a38accefab3a08b44e7b44d5c50e86cd6 Mon Sep 17 00:00:00 2001 From: ab63 Date: Mon, 14 Oct 2024 15:31:58 +0000 Subject: [PATCH 133/165] add to path --- Dockerfile | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Dockerfile b/Dockerfile index 8522de5..da9a4d7 100644 --- a/Dockerfile +++ b/Dockerfile @@ -14,6 +14,8 @@ WORKDIR /home/ubuntu # Install the hairpin package RUN pip install /hairpin2 +ENV PATH=$PATH:/home/ubuntu/.local/bin + # Define a test script to check the installation of hairpin RUN LOC=$(which hairpin2) \ && if [ -z "$LOC" ]; then \ From 26a02409c658725541e1556225ca17ca6eeb0c70 Mon Sep 17 00:00:00 2001 From: ab63 Date: Mon, 14 Oct 2024 15:46:13 +0000 Subject: [PATCH 134/165] attend to some trivia; tie bows; dot i's --- Dockerfile | 2 +- test/test_validate_read.py | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index da9a4d7..fa7e60f 100644 --- a/Dockerfile +++ b/Dockerfile @@ -12,7 +12,7 @@ USER ubuntu WORKDIR /home/ubuntu # Install the hairpin package -RUN pip install /hairpin2 +RUN pip install 
--no-warn-script-location /hairpin2 ENV PATH=$PATH:/home/ubuntu/.local/bin diff --git a/test/test_validate_read.py b/test/test_validate_read.py index fd7d624..e27f4c4 100644 --- a/test/test_validate_read.py +++ b/test/test_validate_read.py @@ -34,6 +34,7 @@ # smoke test validate_read +# ---- class ExtendedBioProvider(Bioseq): def quality_string(self, length): if length < 1: @@ -130,6 +131,7 @@ def test_smoke(test_read): min_clipqual=35, min_basequal=25) print(format(vflag, '010b')) +# ---- # BASIS PATH TESTING From 8ff0d96b816c3582dc193e7a4ec39851d9c65e04 Mon Sep 17 00:00:00 2001 From: ab63 Date: Mon, 21 Oct 2024 16:18:06 +0000 Subject: [PATCH 135/165] various changes, most importantly to removing PCR dups. needs testing --- .gitignore | 1 + hairpin2/main.py | 95 +++++----- pyproject.toml | 7 + test/test_validate_read.py | 351 ------------------------------------- 4 files changed, 56 insertions(+), 398 deletions(-) delete mode 100644 test/test_validate_read.py diff --git a/.gitignore b/.gitignore index 351fca9..d577ecd 100644 --- a/.gitignore +++ b/.gitignore @@ -13,3 +13,4 @@ test_data_creation/ *.json poetry.lock .coverage +test/sim-data/ diff --git a/hairpin2/main.py b/hairpin2/main.py index f243ade..d278b2c 100644 --- a/hairpin2/main.py +++ b/hairpin2/main.py @@ -28,6 +28,7 @@ from itertools import tee from functools import partial from typing import Literal +from collections.abc import Iterable def validate_read( @@ -117,10 +118,28 @@ def validate_read( return read_flag +def get_hidden_PCRdup_indices(readpair_ends: list[list[int]], max_span: int): + dup_idcs: list[int] = [] + ends_sorted = sorted([(i, sorted(l)) for i, l in enumerate(readpair_ends)], + key=lambda x: x[1]) + testing_ends = [ends_sorted[0][1]] + for i in range(1, len(ends_sorted)): + max_diffs = [] + for sublist in testing_ends: + max_diffs.append(max([abs(x - y) for x, y in zip(sublist, ends_sorted[i][1])])) + if all([x <= max_span for x in max_diffs]): + testing_ends.append(ends_sorted[i][1]) 
+ dup_idcs.append(ends_sorted[i][0]) + else: + testing_ends = [ends_sorted[i][1]] + return dup_idcs + + def test_variant( - vcf_rec: pysam.VariantRecord, - mutant_alignments: dict[str, pysam.AlignmentFile], + vstart: int, + vstop: int, alt: str, + region_reads_by_sample: dict[str, Iterable[pysam.AlignedSegment]], mut_type: str, al_thresh: float, max_span: int, @@ -131,30 +150,22 @@ def test_variant( hp_filt = c.HPFilter() al_filt = c.ALFilter() - mut_reads: dict[str, list[pysam.AlignedSegment]] = {key: [] for key in mutant_alignments} - mut_reads_log: dict[str, list[tuple]] = {key: [] for key in mutant_alignments} + mut_reads: dict[str, list[pysam.AlignedSegment]] = {key: [] for key in region_reads_by_sample} + mut_reads_log: dict[str, list[tuple]] = {key: [] for key in region_reads_by_sample} mut_read_pos_f: list[int] = [] mut_read_pos_r: list[int] = [] mut_read_fracs_f: list[float] = [] mut_read_fracs_r: list[float] = [] aln_scores: list[float] = [] - for mut_sample, alignment in mutant_alignments.items(): - read_iter, test_iter = tee(alignment.fetch(vcf_rec.chrom, - vcf_rec.start, - (vcf_rec.start + 1))) - try: - next(test_iter) - except StopIteration: - continue - sample_readpair_ends = [] + for mut_sample, read_iter in region_reads_by_sample.items(): + sample_readpair_ends: list[list[int]] = [] read = None - for read in read_iter: # type: ignore - read_flag = c.ValidatorFlags.CLEAR.value + for read in read_iter: read_flag = read_validator(read=read, alt=alt, - vcf_start=vcf_rec.start, - vcf_stop=vcf_rec.stop, + vcf_start=vstart, + vcf_stop=vstop, mut_type=mut_type) if read_flag == c.ValidatorFlags.CLEAR.value: @@ -167,39 +178,21 @@ def test_variant( read.get_tag('MC'), read.next_reference_start)]) # type: ignore mut_reads_log[mut_sample].append((read.query_name, read_flag)) - del (read) + del (read) + # detect PCR duplicates previously missed due to (hairpin) artefacts if len(mut_reads[mut_sample]) > 1: - sample_readpair_ends_sorted: list[list[int]] = 
sorted(list(map( - sorted, - sample_readpair_ends))) - curr_ends = [sample_readpair_ends_sorted[0]] - drop_idx = [] - for i in range(1, len(sample_readpair_ends_sorted)): - max_spans = map(lambda sublist: - max( - [abs(x - y) - for x, y - in zip(sublist, - sample_readpair_ends_sorted[i]) - ] - ), - curr_ends) - if all([x <= max_span for x in max_spans]): - curr_ends.append(sample_readpair_ends_sorted[i]) - drop_idx.append(i) - else: - curr_ends = [sample_readpair_ends_sorted[i]] + drop_idcs = get_hidden_PCRdup_indices(sample_readpair_ends, max_span=max_span) mut_reads[mut_sample] = [j for i, j in enumerate(mut_reads[mut_sample]) - if i not in drop_idx] + if i not in drop_idcs] if all([len(x) == 0 for x in mut_reads.values()]): al_filt.code = c.FiltCodes.INSUFFICIENT_READS.value hp_filt.code = c.FiltCodes.INSUFFICIENT_READS.value else: for read_list in mut_reads.values(): for read in read_list: - mut_pos, _ = r2s.ref2querypos(read, vcf_rec.start) + mut_pos, _ = r2s.ref2querypos(read, vstart) if read.flag & 0x10: # 1-based position where start, idx 1, is alignment end read_idx_wrt_aln = read.query_alignment_end - mut_pos @@ -286,10 +279,19 @@ def test_record_per_alt( if len(samples_w_mutants) == 0: raise c.NoMutants - alignments_w_mutants = {k: v - for k, v - in alignments.items() - if k in samples_w_mutants} + region_reads_by_sample: dict[str, pysam.IteratorRow] = {} + for k, v in alignments.items(): + if k in samples_w_mutants: + read_iter, test_iter = tee(v.fetch(vcf_rec.chrom, + vcf_rec.start, + (vcf_rec.start + 1))) + try: + next(test_iter) + except StopIteration: + continue + else: + region_reads_by_sample[k] = read_iter # doesn't check for overwrite + filt_d = {} for alt in vcf_rec.alts: if vcf_rec.rlen == len(alt) and set(alt).issubset(set(['A', 'C', 'T', 'G', 'N', '*'])): @@ -299,10 +301,9 @@ def test_record_per_alt( elif vcf_rec.rlen == 1 and set(alt).issubset(set(['A', 'C', 'T', 'G', 'N', '*'])): # INS - DOES NOT SUPPORT TYPE IDS mut_type = 'I' else: - ## 
ERROR - logging.warning('could not type mutation POS={} REF={} ALT={}, skipping alt'.format(vcf_rec.pos, vcf_rec.ref, alt)) + logging.warning('could not infer mutation type, POS={} REF={} ALT={}, skipping variant'.format(vcf_rec.pos, vcf_rec.ref, alt)) continue - filt_d[alt] = variant_tester(vcf_rec, alignments_w_mutants, alt, mut_type) + filt_d[alt] = variant_tester(vcf_rec, region_reads_by_sample, alt, mut_type) return filt_d diff --git a/pyproject.toml b/pyproject.toml index e44bf0f..f4b2dde 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -22,3 +22,10 @@ factory-boy = "^3.3.1" [build-system] requires = ["poetry-core"] build-backend = "poetry.core.masonry.api" + + +[tool.pytest.ini_options] +markers = [ + "dev: development tests", + "validate: scientific validation tests" +] diff --git a/test/test_validate_read.py b/test/test_validate_read.py deleted file mode 100644 index e27f4c4..0000000 --- a/test/test_validate_read.py +++ /dev/null @@ -1,351 +0,0 @@ -# hairpin2 -# -# Copyright (C) 2024 Genome Research Ltd. -# -# Author: Alex Byrne -# -# This program is free software: you can redistribute it and/or modify -# it under the terms of the GNU Affero General Public License as published by -# the Free Software Foundation, either version 3 of the License, or -# (at your option) any later version. - -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU Affero General Public License for more details. - -# You should have received a copy of the GNU Affero General Public License -# along with this program. If not, see . 
- - -from hairpin2 import main as hp2 -from hairpin2 import constants as c -import pysam -import copy -import pytest -import factory -import factory.random -from faker import Faker -from faker_biology.bioseq import Bioseq -import random - -factory.random.reseed_random(2501) -random.seed(2501) - - -# smoke test validate_read -# ---- -class ExtendedBioProvider(Bioseq): - def quality_string(self, length): - if length < 1: - raise ValueError('length must be geater than 1') - allowed_chars = [chr(i) for i in range(33, 75)] - return ''.join([random.choice(allowed_chars) for _ in range(length)]) - - def cigar_string(self, length): - if length < 1: - raise ValueError('length must be greater than 1') - opchars = 'MIDNSHP=XB' - opchars_noclip = 'MIDNP=XB' - bound = 200 if length > 200 else length - cig_op_lengths = [] - while (bound > 0): - oplen = random.randint(1, bound) - cig_op_lengths.append(random.randint(1, oplen)) - cig_sum = sum(cig_op_lengths) - bound = 200 if length - cig_sum > 200 else length - cig_sum - cig_op_lengths[-1] = cig_op_lengths[-1] - (cig_sum - length) - cig_ops = [] - last_opchar = '' - # first and last op can be S or H, but not others - # first op H last op S, i.e. only clipping ops, seg faults pysam - # reads with only clipping ops seem to segfault pysam... 
report bug - if len(cig_op_lengths) == 1: - cig_ops.append(random.choice(opchars_noclip)) - else: - cig_ops.append(random.choice(opchars)) - for _ in range(max([len(cig_op_lengths) - 2, 0])): - iter_opchars = opchars_noclip.replace(last_opchar, '') - cig_ops.append(random.choice(iter_opchars)) - last_opchar = cig_ops[-1] - if len(cig_ops) != 1: - cig_ops.append(random.choice(opchars_noclip if cig_ops[-1] in ['H', 'S'] else opchars)) - return ''.join([str(x) for pair in zip(cig_op_lengths, cig_ops) for x in pair]) - - -fake = Faker() -fake.add_provider(ExtendedBioProvider) - - -class AlignedSegmentWrapper: - def __init__(self, query_name, query_sequence, query_qualities, flag, reference_id, reference_start, next_reference_start, mapping_quality, cigarstring, mc): - self.segment = pysam.AlignedSegment() - self.segment.query_name = query_name - self.segment.query_sequence = query_sequence - self.segment.query_qualities = pysam.qualitystring_to_array(query_qualities) - self.segment.flag = flag - self.segment.reference_id = reference_id - self.segment.reference_start = reference_start - self.segment.next_reference_start = next_reference_start - self.segment.mapping_quality = mapping_quality - self.segment.cigarstring = cigarstring - self.segment.set_tag('MC', mc) - - -class ReadFactory(factory.Factory): - class Meta: - model = AlignedSegmentWrapper - - query_name = 'read1' # should one assume pysam handles all bizarre query names gracefully? I am... 
- query_sequence = factory.LazyAttribute(lambda _: fake.dna(length=random.randint(50, 200))) - query_qualities = factory.LazyAttribute(lambda o: fake.quality_string(length=len(o.query_sequence))) - flag = factory.LazyAttribute(lambda _: random.getrandbits(16)) - reference_id = 0 - reference_start = factory.LazyAttribute(lambda _: random.randint(1, 300000000)) - next_reference_start = factory.LazyAttribute(lambda o: o.reference_start - random.randint(-700, 700)) - mapping_quality = factory.LazyAttribute(lambda _: random.randint(0, 255)) - cigarstring = factory.LazyAttribute(lambda o: fake.cigar_string(length=len(o.query_sequence))) - mc = factory.LazyAttribute(lambda _: fake.cigar_string(length=random.randint(50, 200))) - - -@pytest.mark.parametrize("test_read", [ReadFactory().segment for _ in range(1000)]) -def test_smoke(test_read): - mut_pos = random.randint(1, len(test_read.query_sequence) - 1) - start = test_read.reference_start + mut_pos - alt = random.choices([test_read.query_sequence[mut_pos:mut_pos + random.randint(1, 3)], '.'], cum_weights=[66, 100])[0] - if alt == '.': - mut_type_str = 'D' - stop = start + random.randint(1, 3) - elif len(alt) == 1: - mut_type_str = random.choice(['S', 'I']) - stop = start + 1 - else: - mut_type_str = random.choice(['S', 'I']) - stop = start + 1 if mut_type_str == 'I' else start + len(alt) - vflag = hp2.validate_read(test_read, - vcf_start=start, - vcf_stop=stop, - alt=alt, - mut_type=mut_type_str, - min_mapqual=11, - min_clipqual=35, - min_basequal=25) - print(format(vflag, '010b')) -# ---- - - -# BASIS PATH TESTING -# test every node and edge at least once -# N.B. 
-# pysam guards against: -# quality and seq length mismatch -# reference id is none -# ---- -# perfect read pair: -r = pysam.AlignedSegment() -r.query_name = 'read1' -r.query_sequence = 'CTGDAAAACC' -r.query_qualities = pysam.qualitystring_to_array('AAAAAAAAAA') -r.flag = 0x43 -r.reference_id = 0 -r.reference_start = 95 -r.next_reference_start = 95 -r.mapping_quality = 20 -r.cigarstring = '10M' -r.set_tag('MC', '10M') - - -def test_path_unsupported_alt(): - with pytest.raises(ValueError): - hp2.validate_read(read=r, - vcf_start=99, - vcf_stop=100, - alt='8', - mut_type='S', - min_mapqual=11, - min_clipqual=35, - min_basequal=25) - - -def test_path_unsupported_mut_type(): - with pytest.raises(ValueError): - hp2.validate_read(read=r, - vcf_start=99, - vcf_stop=100, - alt='A', - mut_type='8', - min_mapqual=11, - min_clipqual=35, - min_basequal=25) - - -def test_path_missing_mc(): - rc = copy.deepcopy(r) - rc.set_tag('MC', None) - assert hp2.validate_read(read=rc, - vcf_start=99, - vcf_stop=100, - alt='A', - mut_type='S', - min_mapqual=11, - min_clipqual=35, - min_basequal=25) == c.ValidatorFlags.READ_FIELDS_MISSING.value - - -def test_path_missing_field(): - rc = copy.deepcopy(r) - rc.cigarstring = None - assert hp2.validate_read(read=rc, - vcf_start=99, - vcf_stop=100, - alt='A', - mut_type='S', - min_mapqual=11, - min_clipqual=35, - min_basequal=25) == c.ValidatorFlags.READ_FIELDS_MISSING.value - - -def test_path_set_flag_mapqual_clipqual(): - rc = copy.deepcopy(r) - rc.flag = 0x200 - rc.cigarstring = '1S9M' - assert hp2.validate_read(read=rc, - vcf_start=99, - vcf_stop=100, - alt='A', - mut_type='S', - min_mapqual=30, - min_clipqual=40, - min_basequal=25) == (c.ValidatorFlags.FLAG.value | c.ValidatorFlags.MAPQUAL.value | c.ValidatorFlags.CLIPQUAL.value) - - -def test_path_sub_not_aligned(): - assert hp2.validate_read(read=r, - vcf_start=200, - vcf_stop=100, - alt='A', - mut_type='S', - min_mapqual=11, - min_clipqual=35, - min_basequal=25) == 
c.ValidatorFlags.NOT_ALIGNED.value - - -def test_path_bad_sub(): - assert hp2.validate_read(read=r, - vcf_start=99, - vcf_stop=100, - alt='T', - mut_type='S', - min_mapqual=11, - min_clipqual=35, - min_basequal=50) == (c.ValidatorFlags.NOT_ALT.value | c.ValidatorFlags.BASEQUAL.value) - - -def test_path_good_sub(): - assert hp2.validate_read(read=r, - vcf_start=99, - vcf_stop=100, - alt='A', - mut_type='S', - min_mapqual=11, - min_clipqual=35, - min_basequal=25) == c.ValidatorFlags.CLEAR.value - - -# checks cigar ops -def test_path_del_bad_op(): - assert hp2.validate_read(read=r, - vcf_start=99, - vcf_stop=100, - alt='.', - mut_type='D', - min_mapqual=11, - min_clipqual=35, - min_basequal=25) == c.ValidatorFlags.BAD_OP.value - - -# 2bp del -def test_path_good_del(): - rc = copy.deepcopy(r) - rc.cigarstring = '4M2D6M' - assert hp2.validate_read(read=rc, - vcf_start=99, - vcf_stop=101, - alt='.', - mut_type='D', - min_mapqual=11, - min_clipqual=35, - min_basequal=25) == c.ValidatorFlags.CLEAR.value - - -def test_path_ins_not_aligned(): - assert hp2.validate_read(read=r, - vcf_start=200, - vcf_stop=100, - alt='A', - mut_type='I', - min_mapqual=11, - min_clipqual=35, - min_basequal=25) == c.ValidatorFlags.NOT_ALIGNED.value - - -def test_path_ins_short(): - assert hp2.validate_read(read=r, - vcf_start=99, - vcf_stop=100, - alt='ATTTTTTTTTTTTTT', - mut_type='I', - min_mapqual=11, - min_clipqual=35, - min_basequal=25) == c.ValidatorFlags.SHORT.value - - -def test_path_bad_ins(): - assert hp2.validate_read(read=r, - vcf_start=99, - vcf_stop=100, - alt='AC', - mut_type='I', - min_mapqual=11, - min_clipqual=35, - min_basequal=25) == (c.ValidatorFlags.BAD_OP.value | c.ValidatorFlags.NOT_ALT.value) - - -def test_path_good_ins(): - rc = copy.deepcopy(r) - rc.cigarstring = '5M2I3M' - assert hp2.validate_read(read=rc, - vcf_start=99, - vcf_stop=100, - alt='AA', - mut_type='I', - min_mapqual=11, - min_clipqual=35, - min_basequal=25) == c.ValidatorFlags.CLEAR.value - - -def 
test_path_overlap(): - rc = copy.deepcopy(r) - rc.flag = 0x83 - assert hp2.validate_read(read=rc, - vcf_start=99, - vcf_stop=100, - alt='A', - mut_type='S', - min_mapqual=11, - min_clipqual=35, - min_basequal=25) == c.ValidatorFlags.OVERLAP.value - - -def test_path_no_overlap(): - rc = copy.deepcopy(r) - rc.flag = 0x83 - rc.set_tag('MC', '3M') - assert hp2.validate_read(read=rc, - vcf_start=99, - vcf_stop=100, - alt='A', - mut_type='S', - min_mapqual=11, - min_clipqual=35, - min_basequal=25) == c.ValidatorFlags.CLEAR.value From 4573845908569730ceddacd68a88e48dfd2b5bcd Mon Sep 17 00:00:00 2001 From: ab63 Date: Mon, 21 Oct 2024 16:22:21 +0000 Subject: [PATCH 136/165] fix test_variant call --- hairpin2/main.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/hairpin2/main.py b/hairpin2/main.py index d278b2c..662968a 100644 --- a/hairpin2/main.py +++ b/hairpin2/main.py @@ -139,8 +139,8 @@ def test_variant( vstart: int, vstop: int, alt: str, - region_reads_by_sample: dict[str, Iterable[pysam.AlignedSegment]], mut_type: str, + region_reads_by_sample: dict[str, Iterable[pysam.AlignedSegment]], al_thresh: float, max_span: int, position_fraction_thresh: float, @@ -303,7 +303,7 @@ def test_record_per_alt( else: logging.warning('could not infer mutation type, POS={} REF={} ALT={}, skipping variant'.format(vcf_rec.pos, vcf_rec.ref, alt)) continue - filt_d[alt] = variant_tester(vcf_rec, region_reads_by_sample, alt, mut_type) + filt_d[alt] = variant_tester(vcf_rec.start, vcf_rec.stop, vcf_rec.alt, mut_type, region_reads_by_sample) return filt_d From 60da54dfa6a9c60ad91262a4fd7e7a19fad28314 Mon Sep 17 00:00:00 2001 From: ab63 Date: Mon, 21 Oct 2024 16:23:45 +0000 Subject: [PATCH 137/165] fix test_variant call again --- hairpin2/main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hairpin2/main.py b/hairpin2/main.py index 662968a..efdad56 100644 --- a/hairpin2/main.py +++ b/hairpin2/main.py @@ -303,7 +303,7 @@ def test_record_per_alt( 
else: logging.warning('could not infer mutation type, POS={} REF={} ALT={}, skipping variant'.format(vcf_rec.pos, vcf_rec.ref, alt)) continue - filt_d[alt] = variant_tester(vcf_rec.start, vcf_rec.stop, vcf_rec.alt, mut_type, region_reads_by_sample) + filt_d[alt] = variant_tester(vcf_rec.start, vcf_rec.stop, alt, mut_type, region_reads_by_sample) return filt_d From ebf7e51b049f3d1b5a82dc37063141c8e8b81d0a Mon Sep 17 00:00:00 2001 From: ab63 Date: Mon, 21 Oct 2024 17:00:02 +0000 Subject: [PATCH 138/165] ingress into validating test_variant --- hairpin2/main.py | 29 ++-- test/test_test_variant_validate.py | 81 +++++++++ test/test_validate_read_dev.py | 132 ++++++++++++++ test/test_validate_read_validate.py | 258 ++++++++++++++++++++++++++++ 4 files changed, 490 insertions(+), 10 deletions(-) create mode 100644 test/test_test_variant_validate.py create mode 100644 test/test_validate_read_dev.py create mode 100644 test/test_validate_read_validate.py diff --git a/hairpin2/main.py b/hairpin2/main.py index efdad56..f701f0c 100644 --- a/hairpin2/main.py +++ b/hairpin2/main.py @@ -118,20 +118,30 @@ def validate_read( return read_flag +# detect PCR duplicates previously missed due to (hairpin) artefacts def get_hidden_PCRdup_indices(readpair_ends: list[list[int]], max_span: int): dup_idcs: list[int] = [] - ends_sorted = sorted([(i, sorted(l)) for i, l in enumerate(readpair_ends)], - key=lambda x: x[1]) - testing_ends = [ends_sorted[0][1]] - for i in range(1, len(ends_sorted)): + read_ends_sorted: list[list[int]] = sorted([(i, sorted(l)) + for i, l + in enumerate(readpair_ends)], + key=lambda x: x[1]) + base_read_ends_list: list[list[int]] = [read_ends_sorted[0][1]] # smallest first element. What was Peter's intention here? 
+ for i in range(1, len(read_ends_sorted)): + comparison_read_ends = read_ends_sorted[i] max_diffs = [] - for sublist in testing_ends: - max_diffs.append(max([abs(x - y) for x, y in zip(sublist, ends_sorted[i][1])])) + for sublist in base_read_ends_list: + max_diffs.append(max([abs(x - y) + for x, y + in zip(sublist, comparison_read_ends[1])])) if all([x <= max_span for x in max_diffs]): - testing_ends.append(ends_sorted[i][1]) - dup_idcs.append(ends_sorted[i][0]) + # dups + base_read_ends_list.append(comparison_read_ends[1]) + dup_idcs.append(comparison_read_ends[0]) else: - testing_ends = [ends_sorted[i][1]] + # read at i is not dup of reads in base_read_ends_list + # start again, test read at i + # against reads subsequent to i in ends_sorted + base_read_ends_list = [comparison_read_ends[1]] return dup_idcs @@ -179,7 +189,6 @@ def test_variant( read.next_reference_start)]) # type: ignore mut_reads_log[mut_sample].append((read.query_name, read_flag)) del (read) - # detect PCR duplicates previously missed due to (hairpin) artefacts if len(mut_reads[mut_sample]) > 1: drop_idcs = get_hidden_PCRdup_indices(sample_readpair_ends, max_span=max_span) mut_reads[mut_sample] = [j diff --git a/test/test_test_variant_validate.py b/test/test_test_variant_validate.py new file mode 100644 index 0000000..394dcca --- /dev/null +++ b/test/test_test_variant_validate.py @@ -0,0 +1,81 @@ +# hairpin2 +# +# Copyright (C) 2024 Genome Research Ltd. +# +# Author: Alex Byrne +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. 
+ +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . + + +from hairpin2 import main as hp2 +import pysam +import pytest +import copy +from functools import partial + + +# BASIS PATH TESTING +# test every node and edge at least once +# ---- +# perfect read pair: +r = pysam.AlignedSegment() +r.query_name = 'read1' +r.query_sequence = 'CTGDAAAACC' * 10 +r.query_qualities = pysam.qualitystring_to_array('AAAAAAAAAA' * 10) +r.flag = 0x43 +r.reference_id = 0 +r.reference_start = 100 +r.next_reference_start = 100 +r.mapping_quality = 20 +r.cigarstring = '100M' +r.set_tag('MC', '100M') + +# S1 needs 2 good reads to give True on if len(mut_reads...) > 1 +s1r1 = copy.deepcopy(r) +s1r2 = copy.deepcopy(r) +s1r3 = copy.deepcopy(r) +# S2 needs a bad read to give a False on if read_flag == ... CLEAR +# and len(mut_reads...) +s2r1 = copy.deepcopy(r) +s2r2 = copy.deepcopy(r) +s2r2.flag = 0xE00 +iter1 = [s1r1, s1r2] +iter2 = [s2r1, s2r2] +readd = {'S1': iter1, 'S2': iter2} + + +# max spans... +### 21/10/24 HERE: I separated out max spans to make testing easier +### or at least to make understanding max spans easier +### I'm not super keen on the separate function so +### maybe it's going back in but now I actually understand it +### perhaps do test suite without it, put it back and cover at the end. 
+# don't forget to install updated main.py +@pytest.mark.validate +def test_path_simple(): + f = hp2.test_variant( + vstart=160, + vstop=161, + alt='A', + region_reads_by_sample=readd, + mut_type='S', + al_thresh=0.93, + max_span=6, + position_fraction_thresh=0.15, + read_validator=partial(hp2.validate_read, + min_mapqual=11, + min_clipqual=35, + min_basequal=25) + ) + breakpoint() diff --git a/test/test_validate_read_dev.py b/test/test_validate_read_dev.py new file mode 100644 index 0000000..ad018c8 --- /dev/null +++ b/test/test_validate_read_dev.py @@ -0,0 +1,132 @@ +# hairpin2 +# +# Copyright (C) 2024 Genome Research Ltd. +# +# Author: Alex Byrne +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. + +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . 
+ + +from hairpin2 import main as hp2 +import pysam +import pytest +import factory +import factory.random +from faker import Faker +from faker_biology.bioseq import Bioseq +import random + +factory.random.reseed_random(2501) +random.seed(2501) + + +# smoke test validate_read +# ---- +class ExtendedBioProvider(Bioseq): + def quality_string(self, length): + if length < 1: + raise ValueError('length must be geater than 1') + allowed_chars = [chr(i) for i in range(33, 75)] + return ''.join([random.choice(allowed_chars) for _ in range(length)]) + + def cigar_string(self, length): + if length < 1: + raise ValueError('length must be greater than 1') + opchars = 'MIDNSHP=XB' + opchars_noclip = 'MIDNP=XB' + bound = 200 if length > 200 else length + cig_op_lengths = [] + while (bound > 0): + oplen = random.randint(1, bound) + cig_op_lengths.append(random.randint(1, oplen)) + cig_sum = sum(cig_op_lengths) + bound = 200 if length - cig_sum > 200 else length - cig_sum + cig_op_lengths[-1] = cig_op_lengths[-1] - (cig_sum - length) + cig_ops = [] + last_opchar = '' + # first and last op can be S or H, but not others + # first op H last op S, i.e. only clipping ops, seg faults pysam + # reads with only clipping ops seem to segfault pysam... 
report bug + if len(cig_op_lengths) == 1: + cig_ops.append(random.choice(opchars_noclip)) + else: + cig_ops.append(random.choice(opchars)) + for _ in range(max([len(cig_op_lengths) - 2, 0])): + iter_opchars = opchars_noclip.replace(last_opchar, '') + cig_ops.append(random.choice(iter_opchars)) + last_opchar = cig_ops[-1] + if len(cig_ops) != 1: + cig_ops.append(random.choice(opchars_noclip if cig_ops[-1] in ['H', 'S'] else opchars)) + return ''.join([str(x) for pair in zip(cig_op_lengths, cig_ops) for x in pair]) + + +fake = Faker() +fake.add_provider(ExtendedBioProvider) + + +class AlignedSegmentWrapper: + def __init__(self, query_name, query_sequence, query_qualities, flag, reference_id, reference_start, next_reference_start, mapping_quality, cigarstring, mc): + self.segment = pysam.AlignedSegment() + self.segment.query_name = query_name + self.segment.query_sequence = query_sequence + self.segment.query_qualities = pysam.qualitystring_to_array(query_qualities) + self.segment.flag = flag + self.segment.reference_id = reference_id + self.segment.reference_start = reference_start + self.segment.next_reference_start = next_reference_start + self.segment.mapping_quality = mapping_quality + self.segment.cigarstring = cigarstring + self.segment.set_tag('MC', mc) + + +class ReadFactory(factory.Factory): + class Meta: + model = AlignedSegmentWrapper + + query_name = 'read1' # should one assume pysam handles all bizarre query names gracefully? I am... 
+ query_sequence = factory.LazyAttribute(lambda _: fake.dna(length=random.randint(50, 200))) + query_qualities = factory.LazyAttribute(lambda o: fake.quality_string(length=len(o.query_sequence))) + flag = factory.LazyAttribute(lambda _: random.getrandbits(16)) + reference_id = 0 + reference_start = factory.LazyAttribute(lambda _: random.randint(1, 300000000)) + next_reference_start = factory.LazyAttribute(lambda o: o.reference_start - random.randint(-700, 700)) + mapping_quality = factory.LazyAttribute(lambda _: random.randint(0, 255)) + cigarstring = factory.LazyAttribute(lambda o: fake.cigar_string(length=len(o.query_sequence))) + mc = factory.LazyAttribute(lambda _: fake.cigar_string(length=random.randint(50, 200))) + + +@pytest.mark.dev +@pytest.mark.parametrize("test_read", [ReadFactory().segment for _ in range(1000)]) +def test_smoke(test_read): + mut_pos = random.randint(1, len(test_read.query_sequence) - 1) + start = test_read.reference_start + mut_pos + alt = random.choices([test_read.query_sequence[mut_pos:mut_pos + random.randint(1, 3)], '.'], cum_weights=[66, 100])[0] + if alt == '.': + mut_type_str = 'D' + stop = start + random.randint(1, 3) + elif len(alt) == 1: + mut_type_str = random.choice(['S', 'I']) + stop = start + 1 + else: + mut_type_str = random.choice(['S', 'I']) + stop = start + 1 if mut_type_str == 'I' else start + len(alt) + vflag = hp2.validate_read(test_read, + vcf_start=start, + vcf_stop=stop, + alt=alt, + mut_type=mut_type_str, + min_mapqual=11, + min_clipqual=35, + min_basequal=25) + print(format(vflag, '010b')) diff --git a/test/test_validate_read_validate.py b/test/test_validate_read_validate.py new file mode 100644 index 0000000..09cc308 --- /dev/null +++ b/test/test_validate_read_validate.py @@ -0,0 +1,258 @@ +# hairpin2 +# +# Copyright (C) 2024 Genome Research Ltd. 
+# +# Author: Alex Byrne +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. + +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . + + +from hairpin2 import main as hp2 +from hairpin2 import constants as c +import pysam +import copy +import pytest + + +# BASIS PATH TESTING +# test every node and edge at least once +# N.B. +# pysam guards against: +# quality and seq length mismatch +# reference id is none +# ---- +# perfect read pair: +r = pysam.AlignedSegment() +r.query_name = 'read1' +r.query_sequence = 'CTGDAAAACC' +r.query_qualities = pysam.qualitystring_to_array('AAAAAAAAAA') +r.flag = 0x43 +r.reference_id = 0 +r.reference_start = 95 +r.next_reference_start = 95 +r.mapping_quality = 20 +r.cigarstring = '10M' +r.set_tag('MC', '10M') + + +@pytest.mark.validate +def test_path_unsupported_alt(): + with pytest.raises(ValueError): + hp2.validate_read(read=r, + vcf_start=99, + vcf_stop=100, + alt='8', + mut_type='S', + min_mapqual=11, + min_clipqual=35, + min_basequal=25) + + +@pytest.mark.validate +def test_path_unsupported_mut_type(): + with pytest.raises(ValueError): + hp2.validate_read(read=r, + vcf_start=99, + vcf_stop=100, + alt='A', + mut_type='8', + min_mapqual=11, + min_clipqual=35, + min_basequal=25) + + +@pytest.mark.validate +def test_path_missing_mc(): + rc = copy.deepcopy(r) + rc.set_tag('MC', None) + assert hp2.validate_read(read=rc, + vcf_start=99, + vcf_stop=100, + alt='A', + mut_type='S', + min_mapqual=11, + min_clipqual=35, + 
min_basequal=25) == c.ValidatorFlags.READ_FIELDS_MISSING.value + + +@pytest.mark.validate +def test_path_missing_field(): + rc = copy.deepcopy(r) + rc.cigarstring = None + assert hp2.validate_read(read=rc, + vcf_start=99, + vcf_stop=100, + alt='A', + mut_type='S', + min_mapqual=11, + min_clipqual=35, + min_basequal=25) == c.ValidatorFlags.READ_FIELDS_MISSING.value + + +@pytest.mark.validate +def test_path_set_flag_mapqual_clipqual(): + rc = copy.deepcopy(r) + rc.flag = 0x200 + rc.cigarstring = '1S9M' + assert hp2.validate_read(read=rc, + vcf_start=99, + vcf_stop=100, + alt='A', + mut_type='S', + min_mapqual=30, + min_clipqual=40, + min_basequal=25) == (c.ValidatorFlags.FLAG.value | c.ValidatorFlags.MAPQUAL.value | c.ValidatorFlags.CLIPQUAL.value) + + +@pytest.mark.validate +def test_path_sub_not_aligned(): + assert hp2.validate_read(read=r, + vcf_start=200, + vcf_stop=100, + alt='A', + mut_type='S', + min_mapqual=11, + min_clipqual=35, + min_basequal=25) == c.ValidatorFlags.NOT_ALIGNED.value + + +@pytest.mark.validate +def test_path_bad_sub(): + assert hp2.validate_read(read=r, + vcf_start=99, + vcf_stop=100, + alt='T', + mut_type='S', + min_mapqual=11, + min_clipqual=35, + min_basequal=50) == (c.ValidatorFlags.NOT_ALT.value | c.ValidatorFlags.BASEQUAL.value) + + +@pytest.mark.validate +def test_path_good_sub(): + assert hp2.validate_read(read=r, + vcf_start=99, + vcf_stop=100, + alt='A', + mut_type='S', + min_mapqual=11, + min_clipqual=35, + min_basequal=25) == c.ValidatorFlags.CLEAR.value + + +# checks cigar ops +@pytest.mark.validate +def test_path_del_bad_op(): + assert hp2.validate_read(read=r, + vcf_start=99, + vcf_stop=100, + alt='.', + mut_type='D', + min_mapqual=11, + min_clipqual=35, + min_basequal=25) == c.ValidatorFlags.BAD_OP.value + + +# 2bp del +@pytest.mark.validate +def test_path_good_del(): + rc = copy.deepcopy(r) + rc.cigarstring = '4M2D6M' + assert hp2.validate_read(read=rc, + vcf_start=99, + vcf_stop=101, + alt='.', + mut_type='D', + 
min_mapqual=11, + min_clipqual=35, + min_basequal=25) == c.ValidatorFlags.CLEAR.value + + +@pytest.mark.validate +def test_path_ins_not_aligned(): + assert hp2.validate_read(read=r, + vcf_start=200, + vcf_stop=100, + alt='A', + mut_type='I', + min_mapqual=11, + min_clipqual=35, + min_basequal=25) == c.ValidatorFlags.NOT_ALIGNED.value + + +@pytest.mark.validate +def test_path_ins_short(): + assert hp2.validate_read(read=r, + vcf_start=99, + vcf_stop=100, + alt='ATTTTTTTTTTTTTT', + mut_type='I', + min_mapqual=11, + min_clipqual=35, + min_basequal=25) == c.ValidatorFlags.SHORT.value + + +@pytest.mark.validate +def test_path_bad_ins(): + assert hp2.validate_read(read=r, + vcf_start=99, + vcf_stop=100, + alt='AC', + mut_type='I', + min_mapqual=11, + min_clipqual=35, + min_basequal=25) == (c.ValidatorFlags.BAD_OP.value | c.ValidatorFlags.NOT_ALT.value) + + +@pytest.mark.validate +def test_path_good_ins(): + rc = copy.deepcopy(r) + rc.cigarstring = '5M2I3M' + assert hp2.validate_read(read=rc, + vcf_start=99, + vcf_stop=100, + alt='AA', + mut_type='I', + min_mapqual=11, + min_clipqual=35, + min_basequal=25) == c.ValidatorFlags.CLEAR.value + + +@pytest.mark.validate +def test_path_overlap(): + rc = copy.deepcopy(r) + rc.flag = 0x83 + assert hp2.validate_read(read=rc, + vcf_start=99, + vcf_stop=100, + alt='A', + mut_type='S', + min_mapqual=11, + min_clipqual=35, + min_basequal=25) == c.ValidatorFlags.OVERLAP.value + + +@pytest.mark.validate +def test_path_no_overlap(): + rc = copy.deepcopy(r) + rc.flag = 0x83 + rc.set_tag('MC', '3M') + assert hp2.validate_read(read=rc, + vcf_start=99, + vcf_stop=100, + alt='A', + mut_type='S', + min_mapqual=11, + min_clipqual=35, + min_basequal=25) == c.ValidatorFlags.CLEAR.value From e9abc646cc3f137ac29ac432c971ebd5a519d833 Mon Sep 17 00:00:00 2001 From: ab63 Date: Tue, 22 Oct 2024 13:56:17 +0000 Subject: [PATCH 139/165] restructure to separate concerns --- hairpin2/main.py | 502 +++++++++++++++++------------ 
test/test_test_variant_validate.py | 116 +++++-- 2 files changed, 390 insertions(+), 228 deletions(-) diff --git a/hairpin2/main.py b/hairpin2/main.py index f701f0c..9e0fd30 100644 --- a/hairpin2/main.py +++ b/hairpin2/main.py @@ -26,34 +26,22 @@ import logging import json from itertools import tee -from functools import partial from typing import Literal from collections.abc import Iterable -def validate_read( +def validate_read_broad( read: pysam.AlignedSegment, vcf_start: int, - vcf_stop: int, - alt: str, - mut_type: Literal['S', 'D', 'I'], min_mapqual: int, min_clipqual: int, - min_basequal: int, ) -> int: - sup_char_alt = ['A', 'C', 'G', 'T', 'N', '*', '.'] - if any([b not in sup_char_alt for b in alt]): - raise ValueError('unsupported character in alt: {} - supports {}'.format(alt, ', '.join(sup_char_alt))) - if mut_type not in ['S', 'D', 'I']: - raise ValueError('unsupported mut_type: {} - supports \'S\' (SUB) \'D\' (DEL) \'I\' (INS)'.format(mut_type)) - read_flag = c.ValidatorFlags.CLEAR.value try: mate_cig = read.get_tag('MC') except KeyError: mate_cig = None - if any(x is None for x in [read.reference_end, read.query_sequence, @@ -70,47 +58,16 @@ def validate_read( if read.mapping_quality < min_mapqual: read_flag |= c.ValidatorFlags.MAPQUAL.value - if ('S' in read.cigarstring and # type: ignore - mean(read.query_alignment_qualities) < min_clipqual): # type: ignore + if ('S' in read.cigarstring and + mean(read.query_alignment_qualities) < min_clipqual): read_flag |= c.ValidatorFlags.CLIPQUAL.value - if mut_type == 'S': # SUB - try: - mut_pos, mut_op = r2s.ref2querypos(read, vcf_start) - except IndexError: - read_flag |= c.ValidatorFlags.NOT_ALIGNED.value - else: - if read.query_sequence[mut_pos:mut_pos + len(alt)] != alt: # type: ignore - read_flag |= c.ValidatorFlags.NOT_ALT.value - if any([bq < min_basequal for bq in read.query_qualities[mut_pos:mut_pos + len(alt)]]): # type: ignore - read_flag |= c.ValidatorFlags.BASEQUAL.value - elif mut_type == 'D': # 
DEL - doesn't check for matches before and after... - # this could error if read doesn't cover region (as could all) - mut_alns = [q for q, r in read.get_aligned_pairs() if r in range(vcf_start, vcf_stop)] - if any([x is not None for x in mut_alns]): - read_flag |= c.ValidatorFlags.BAD_OP.value - elif mut_type == 'I': # INS - try: - prior_pos, _ = r2s.ref2querypos(read, vcf_start) - except IndexError: - read_flag |= c.ValidatorFlags.NOT_ALIGNED.value - else: - if prior_pos + len(alt) > read.query_length: - read_flag |= c.ValidatorFlags.SHORT.value - else: - mut_alns = [(q, r) for q, r in read.get_aligned_pairs() if q in range(prior_pos + 1, prior_pos + len(alt) + 1)] - if any([r is not None for _, r in mut_alns]): - read_flag |= c.ValidatorFlags.BAD_OP.value - if read.query_sequence[prior_pos + 1:prior_pos + len(alt) + 1] != alt: - read_flag |= c.ValidatorFlags.NOT_ALT.value - - if read_flag == c.ValidatorFlags.CLEAR.value and not (read.flag & 0x40): + if not read.flag & 0x40: read_range = range(read.reference_start, read.reference_end) mate_range = range(read.next_reference_start, r2s.ref_end_via_cigar(mate_cig, - read.next_reference_start) - ) + read.next_reference_start)) ref_overlap = set(read_range).intersection(mate_range) if vcf_start in ref_overlap: read_flag |= c.ValidatorFlags.OVERLAP.value @@ -118,14 +75,79 @@ def validate_read( return read_flag +def validate_read_alt( + read: pysam.AlignedSegment, + vcf_start: int, + vcf_stop: int, + alt: str, + mut_type: Literal['S', 'D', 'I'], + min_basequal: int +) -> int: + if mut_type not in ['S', 'D', 'I']: + raise ValueError( + 'unsupported mut_type: {} - supports \'S\' (SUB) \'D\' (DEL) \'I\' (INS)'.format(mut_type)) + + read_flag = c.ValidatorFlags.CLEAR.value + + if mut_type == 'S': # SUB + try: + mut_pos, mut_op = r2s.ref2querypos(read, vcf_start) + except IndexError: + read_flag |= c.ValidatorFlags.NOT_ALIGNED.value + else: + if read.query_sequence[mut_pos:mut_pos + len(alt)] != alt: + read_flag |= 
c.ValidatorFlags.NOT_ALT.value + if any([bq < min_basequal + for bq + in read.query_qualities[mut_pos:mut_pos + len(alt)]]): + read_flag |= c.ValidatorFlags.BASEQUAL.value + # DEL - doesn't check for matches before and after... + elif mut_type == 'D': + # this could error if read doesn't cover region (as could all) + mut_alns = [q + for q, r + in read.get_aligned_pairs() + if r in range(vcf_start, vcf_stop)] + if any([x is not None for x in mut_alns]): + read_flag |= c.ValidatorFlags.BAD_OP.value + elif mut_type == 'I': # INS + try: + prior_pos, _ = r2s.ref2querypos(read, vcf_start) + except IndexError: + read_flag |= c.ValidatorFlags.NOT_ALIGNED.value + else: + if prior_pos + len(alt) > read.query_length: + read_flag |= c.ValidatorFlags.SHORT.value + else: + mut_alns = [(q, r) + for q, r + in read.get_aligned_pairs() + if q in range(prior_pos + 1, prior_pos + len(alt) + 1)] + if any([r is not None for _, r in mut_alns]): + read_flag |= c.ValidatorFlags.BAD_OP.value + if read.query_sequence[prior_pos + 1:prior_pos + len(alt) + 1] != alt: + read_flag |= c.ValidatorFlags.NOT_ALT.value + + return read_flag + + # detect PCR duplicates previously missed due to (hairpin) artefacts +# this implementation assumes that sorting on first element of each sublist +# is appropriate, per Peter's initial implementation. +# is an all against all comparison between all read lists more appropriate? +# and between pairs of readlists, why is comparing sorted pairs most appropriate? +# again, does all against all make more sense? +# (if so, maybe two pointer comparison?) +# it bothers me that it matters where in the chain this occurs +# with more reads it's more likely they'll cluster as dupes right? 
def get_hidden_PCRdup_indices(readpair_ends: list[list[int]], max_span: int): dup_idcs: list[int] = [] read_ends_sorted: list[list[int]] = sorted([(i, sorted(l)) for i, l in enumerate(readpair_ends)], key=lambda x: x[1]) - base_read_ends_list: list[list[int]] = [read_ends_sorted[0][1]] # smallest first element. What was Peter's intention here? + # smallest first element. What was Peter's intention here? + base_read_ends_list: list[list[int]] = [read_ends_sorted[0][1]] for i in range(1, len(read_ends_sorted)): comparison_read_ends = read_ends_sorted[i] max_diffs = [] @@ -145,138 +167,163 @@ def get_hidden_PCRdup_indices(readpair_ends: list[list[int]], max_span: int): return dup_idcs -def test_variant( +def alt_filter_reads( vstart: int, vstop: int, alt: str, mut_type: str, region_reads_by_sample: dict[str, Iterable[pysam.AlignedSegment]], - al_thresh: float, max_span: int, + min_basequal: float +) -> list[pysam.AlignedSegment]: + rrbs_filt: dict[str, list[pysam.AlignedSegment]] = {key: [] + for key + in region_reads_by_sample} + filtered_reads: list[pysam.AlignedSegment] = [] + + for mut_sample, read_iter in region_reads_by_sample.items(): + sample_readpair_ends: list[list[int]] = [] + for read in read_iter: + if not validate_read_alt(read, + vstart, + vstop, + alt, + mut_type, + min_basequal): + rrbs_filt[mut_sample].append(read) + next_ref_end = r2s.ref_end_via_cigar( + read.get_tag('MC'), + read.next_reference_start) + sample_readpair_ends.append([read.reference_start, + read.reference_end, + read.next_reference_start, + next_ref_end]) + if len(rrbs_filt[mut_sample]) > 1: + drop_idcs = get_hidden_PCRdup_indices(sample_readpair_ends, + max_span=max_span) + filtered_reads = filtered_reads + [j + for i, j + in enumerate(rrbs_filt[mut_sample]) + if i not in drop_idcs] + return filtered_reads + + +def test_variant_AL( + mut_reads: Iterable[pysam.AlignedSegment], + al_thresh: float +) -> c.ALFilter: + al_filt = c.ALFilter() + aln_scores: list[int] = [] + + for read in 
mut_reads: + try: + aln_scores.append(read.get_tag('AS') / read.query_length) + except KeyError: + pass + if len(aln_scores) != 0: + al_filt.avg_as = median(aln_scores) + al_filt.code = c.FiltCodes.ON_THRESHOLD.value + if al_filt.avg_as <= al_thresh: + al_filt.set() + else: + al_filt.code = c.FiltCodes.INSUFFICIENT_READS.value + + return al_filt + + +def test_variant_HP( + vstart: int, + vstop: int, + alt: str, + mut_type: str, + mut_reads: Iterable[pysam.AlignedSegment], position_fraction_thresh: float, - read_validator: c.FlagReturn, ) -> c.Filters: hp_filt = c.HPFilter() - al_filt = c.ALFilter() - - mut_reads: dict[str, list[pysam.AlignedSegment]] = {key: [] for key in region_reads_by_sample} - mut_reads_log: dict[str, list[tuple]] = {key: [] for key in region_reads_by_sample} mut_read_pos_f: list[int] = [] mut_read_pos_r: list[int] = [] mut_read_fracs_f: list[float] = [] mut_read_fracs_r: list[float] = [] - aln_scores: list[float] = [] - for mut_sample, read_iter in region_reads_by_sample.items(): - sample_readpair_ends: list[list[int]] = [] - read = None - for read in read_iter: - read_flag = read_validator(read=read, - alt=alt, - vcf_start=vstart, - vcf_stop=vstop, - mut_type=mut_type) - - if read_flag == c.ValidatorFlags.CLEAR.value: - mut_reads[mut_sample].append(read) - sample_readpair_ends.append( - [read.reference_start, - read.reference_end, - read.next_reference_start, - r2s.ref_end_via_cigar( - read.get_tag('MC'), - read.next_reference_start)]) # type: ignore - mut_reads_log[mut_sample].append((read.query_name, read_flag)) - del (read) - if len(mut_reads[mut_sample]) > 1: - drop_idcs = get_hidden_PCRdup_indices(sample_readpair_ends, max_span=max_span) - mut_reads[mut_sample] = [j - for i, j - in enumerate(mut_reads[mut_sample]) - if i not in drop_idcs] - if all([len(x) == 0 for x in mut_reads.values()]): - al_filt.code = c.FiltCodes.INSUFFICIENT_READS.value - hp_filt.code = c.FiltCodes.INSUFFICIENT_READS.value - else: - for read_list in 
mut_reads.values(): - for read in read_list: - mut_pos, _ = r2s.ref2querypos(read, vstart) - if read.flag & 0x10: - # 1-based position where start, idx 1, is alignment end - read_idx_wrt_aln = read.query_alignment_end - mut_pos - mut_read_fracs_r.append(read_idx_wrt_aln - / read.query_alignment_length) - mut_read_pos_r.append(read_idx_wrt_aln) - else: - read_idx_wrt_aln = mut_pos - read.query_alignment_start + 1 - mut_read_fracs_f.append(read_idx_wrt_aln - / read.query_alignment_length) - mut_read_pos_f.append(read_idx_wrt_aln) - try: - aln_scores.append(read.get_tag('AS') / read.query_length) # type:ignore - except KeyError: - pass - if len(aln_scores) != 0: - al_filt.avg_as = median(aln_scores) - al_filt.code = c.FiltCodes.ON_THRESHOLD.value - if al_filt.avg_as <= al_thresh: - al_filt.set() - else: - al_filt.code = c.FiltCodes.INSUFFICIENT_READS.value - # hairpin conditions from Ellis et al. - if len(mut_read_pos_f) > 1 and not len(mut_read_pos_r) > 1: - mad_f = max(mut_read_pos_f) - min(mut_read_pos_f) - sd_f = stdev(mut_read_pos_f) - if ( - ((sum([x <= position_fraction_thresh - for x - in mut_read_fracs_f]) / len(mut_read_pos_f)) < 0.9) and - mad_f > 0 and - sd_f > 4): - hp_filt.code = c.FiltCodes.SIXTYAI.value # 60A(i) - else: - hp_filt.code = c.FiltCodes.SIXTYAI.value - hp_filt.set() - elif len(mut_read_pos_r) > 1 and not len(mut_read_pos_f) > 1: - mad_r = max(mut_read_pos_r) - min(mut_read_pos_r) - sd_r = stdev(mut_read_pos_r) - if ( - ((sum([x <= position_fraction_thresh - for x - in mut_read_fracs_r]) / len(mut_read_pos_r)) < 0.9) and - mad_r > 0 and - sd_r > 4): - hp_filt.code = c.FiltCodes.SIXTYAI.value - else: - hp_filt.code = c.FiltCodes.SIXTYAI.value - hp_filt.set() - elif len(mut_read_pos_f) > 1 and len(mut_read_pos_r) > 1: - mad_f = max(mut_read_pos_f) - min(mut_read_pos_f) - sd_f = stdev(mut_read_pos_f) - mad_r = max(mut_read_pos_r) - min(mut_read_pos_r) - sd_r = stdev(mut_read_pos_r) - frac_lt_thresh = (sum([x <= position_fraction_thresh - for x 
- in mut_read_fracs_f + mut_read_fracs_r]) / - (len(mut_read_pos_f) + len(mut_read_pos_r))) - if (frac_lt_thresh < 0.9 or - (mad_f > 2 and mad_r > 2 and sd_f > 2 and sd_r > 2) or - (mad_f > 1 and sd_f > 10) or - (mad_r > 1 and sd_r > 10)): - hp_filt.code = c.FiltCodes.SIXTYBI.value # 60B(i) + # if all([len(x) == 0 for x in mut_reads.values()]): + # al_filt.code = c.FiltCodes.INSUFFICIENT_READS.value + # hp_filt.code = c.FiltCodes.INSUFFICIENT_READS.value + # else: + for read_list in mut_reads.values(): + for read in read_list: + mut_pos, _ = r2s.ref2querypos(read, vstart) + if read.flag & 0x10: + # 1-based position where start, idx 1, is alignment end + mut_idx_wrt_query_aln = read.query_alignment_end - mut_pos + mut_read_fracs_r.append(mut_idx_wrt_query_aln + / read.query_alignment_length) + mut_read_pos_r.append(mut_idx_wrt_query_aln) else: - hp_filt.code = c.FiltCodes.SIXTYBI.value - hp_filt.set() + mut_idx_wrt_query_aln = mut_pos - read.query_alignment_start + 1 + mut_read_fracs_f.append(mut_idx_wrt_query_aln + / read.query_alignment_length) + mut_read_pos_f.append(mut_idx_wrt_query_aln) + # hairpin conditions from Ellis et al. 
+ if len(mut_read_pos_f) > 1 and not len(mut_read_pos_r) > 1: + breakpoint() + mad_f = max(mut_read_pos_f) - min(mut_read_pos_f) + sd_f = stdev(mut_read_pos_f) + if ( + ((sum([x <= position_fraction_thresh + for x + in mut_read_fracs_f]) / len(mut_read_pos_f)) < 0.9) and + mad_f > 0 and + sd_f > 4): + hp_filt.code = c.FiltCodes.SIXTYAI.value # 60A(i) else: - hp_filt.code = c.FiltCodes.INSUFFICIENT_READS.value - return c.Filters(al_filt, hp_filt) + hp_filt.code = c.FiltCodes.SIXTYAI.value + hp_filt.set() + elif len(mut_read_pos_r) > 1 and not len(mut_read_pos_f) > 1: + mad_r = max(mut_read_pos_r) - min(mut_read_pos_r) + sd_r = stdev(mut_read_pos_r) + if ( + ((sum([x <= position_fraction_thresh + for x + in mut_read_fracs_r]) / len(mut_read_pos_r)) < 0.9) and + mad_r > 0 and + sd_r > 4): + hp_filt.code = c.FiltCodes.SIXTYAI.value + else: + hp_filt.code = c.FiltCodes.SIXTYAI.value + hp_filt.set() + elif len(mut_read_pos_f) > 1 and len(mut_read_pos_r) > 1: + mad_f = max(mut_read_pos_f) - min(mut_read_pos_f) + sd_f = stdev(mut_read_pos_f) + mad_r = max(mut_read_pos_r) - min(mut_read_pos_r) + sd_r = stdev(mut_read_pos_r) + frac_lt_thresh = (sum([x <= position_fraction_thresh + for x + in mut_read_fracs_f + mut_read_fracs_r]) / + (len(mut_read_pos_f) + len(mut_read_pos_r))) + if (frac_lt_thresh < 0.9 or + (mad_f > 2 and mad_r > 2 and sd_f > 2 and sd_r > 2) or + (mad_f > 1 and sd_f > 10) or + (mad_r > 1 and sd_r > 10)): + hp_filt.code = c.FiltCodes.SIXTYBI.value # 60B(i) + else: + hp_filt.code = c.FiltCodes.SIXTYBI.value + hp_filt.set() + else: + hp_filt.code = c.FiltCodes.INSUFFICIENT_READS.value + + return hp_filt def test_record_per_alt( alignments: dict[str, pysam.AlignmentFile], vcf_rec: pysam.VariantRecord, - variant_tester: c.FiltReturn, + min_mapqual: int, + min_clipqual: int, + min_basequal: int, + max_span: int, + al_thresh: float, + position_fraction: float ) -> dict[str, c.Filters]: if vcf_rec.alts is None: @@ -299,20 +346,48 @@ def test_record_per_alt( except 
StopIteration: continue else: - region_reads_by_sample[k] = read_iter # doesn't check for overwrite + broad_filtered_iter = (read + for read + in read_iter + if validate_read_broad(read, + vcf_rec.start, + min_mapqual=min_mapqual, + min_clipqual=min_clipqual)) + # doesn't check for overwrite + region_reads_by_sample[k] = broad_filtered_iter filt_d = {} for alt in vcf_rec.alts: - if vcf_rec.rlen == len(alt) and set(alt).issubset(set(['A', 'C', 'T', 'G', 'N', '*'])): + if (vcf_rec.rlen == len(alt) + and set(alt).issubset(set(['A', 'C', 'T', 'G', 'N', '*']))): mut_type = 'S' elif len(alt) < vcf_rec.rlen or alt == '.': # DEL - DOES NOT SUPPORT TYPE IDS mut_type = 'D' - elif vcf_rec.rlen == 1 and set(alt).issubset(set(['A', 'C', 'T', 'G', 'N', '*'])): # INS - DOES NOT SUPPORT TYPE IDS + elif (vcf_rec.rlen == 1 + and set(alt).issubset(set(['A', 'C', 'T', 'G', 'N', '*']))): # INS - DOES NOT SUPPORT TYPE IDS mut_type = 'I' else: - logging.warning('could not infer mutation type, POS={} REF={} ALT={}, skipping variant'.format(vcf_rec.pos, vcf_rec.ref, alt)) + logging.warning('could not infer mutation type, POS={} REF={} ALT={}, skipping variant'.format( + vcf_rec.pos, vcf_rec.ref, alt)) continue - filt_d[alt] = variant_tester(vcf_rec.start, vcf_rec.stop, alt, mut_type, region_reads_by_sample) + alt_filt_reads: list = alt_filter_reads(vcf_rec.start, + vcf_rec.stop, + alt, + mut_type, + region_reads_by_sample, + max_span) + if len(alt_filt_reads) == 0: + filt_d[alt] = c.Filters(c.ALFilter(code=c.FiltCodes.INSUFFICIENT_READS.value), + c.HPFilter(code=c.FiltCodes.INSUFFICIENT_READS.value)) + else: + filt_d[alt] = c.Filters(test_variant_AL(alt_filt_reads, + al_thresh), + test_variant_HP(vcf_rec.start, + vcf_rec.stop, + alt, + mut_type, + alt_filt_reads, + position_fraction)) return filt_d @@ -396,7 +471,8 @@ def main_cli() -> None: json_config: dict | None = None if args.input_json: - logging.info('args JSON provided, extended arguments will be loaded from JSON if not present on 
command line') + logging.info( + 'args JSON provided, extended arguments will be loaded from JSON if not present on command line') try: with open(args.input_json, 'r') as f: json_config = json.load(f) @@ -416,17 +492,6 @@ def main_cli() -> None: # test args are sensible, exit if not h.test_options(args) - primed_validate_read = partial(validate_read, - min_mapqual=args.min_mapping_quality, - min_clipqual=args.min_clip_quality, - min_basequal=args.min_base_quality) - - primed_variant_tester = partial(test_variant, - al_thresh=args.al_filter_threshold, - max_span=args.max_read_span, - position_fraction_thresh=args.position_fraction, - read_validator=primed_validate_read) - try: vcf_in_handle = pysam.VariantFile(args.vcf_in) except Exception as e: @@ -456,7 +521,8 @@ def main_cli() -> None: and args.format == "c" else None)) except Exception as e: - h.cleanup(msg='failed to read alignment file at {}, reporting: {}'.format(path, e)) + h.cleanup( + msg='failed to read alignment file at {}, reporting: {}'.format(path, e)) # grab the sample name from first SM field # in header field RG alignment_sample_name = alignment.header.to_dict()['RG'][0]['SM'] @@ -469,33 +535,44 @@ def main_cli() -> None: for pair in args.name_mapping: kv_split = pair.split(':') # VCF:aln if len(kv_split) != 2: - h.cleanup(msg='name mapping misformatted, more than two elements in map string {}'.format(pair)) + h.cleanup( + msg='name mapping misformatted, more than two elements in map string {}'.format(pair)) vcf_map_names.append(kv_split[0]) alignment_map_names.append(kv_split[1]) if h.has_duplicates(vcf_map_names): - h.cleanup(msg='duplicate VCF sample names provided to name mapping flag') + h.cleanup( + msg='duplicate VCF sample names provided to name mapping flag') if not set(vcf_map_names) <= sample_names: - h.cleanup(msg="VCF sample names provided to name mapping flag are not equal to, or a subset of, VCF sample names as retrieved from VCF") + h.cleanup( + msg="VCF sample names provided to 
name mapping flag are not equal to, or a subset of, VCF sample names as retrieved from VCF") if h.has_duplicates(alignment_map_names): - h.cleanup(msg='duplicate aligment sample names provided to name mapping flag') + h.cleanup( + msg='duplicate aligment sample names provided to name mapping flag') if h.lists_not_equal(alignment_map_names, vcf_sample_to_alignment_map.keys()): - h.cleanup(msg='alignment sample names provided to name mapping flag do not match alignment SM tags') + h.cleanup( + msg='alignment sample names provided to name mapping flag do not match alignment SM tags') vcf_sample_to_alignment_map = {vcf_map_names[alignment_map_names.index(k)]: v for k, v in vcf_sample_to_alignment_map.items()} else: if not vcf_sample_to_alignment_map.keys() <= sample_names: - h.cleanup(msg='alignment SM tags do not match VCF sample names: {}'.format(vcf_sample_to_alignment_map.keys() - sample_names)) + h.cleanup(msg='alignment SM tags do not match VCF sample names: {}'.format( + vcf_sample_to_alignment_map.keys() - sample_names)) if sample_names != vcf_sample_to_alignment_map.keys(): - logging.info("alignments not provided for all VCF samples; {} will be ignored".format(sample_names - vcf_sample_to_alignment_map.keys())) + logging.info("alignments not provided for all VCF samples; {} will be ignored".format( + sample_names - vcf_sample_to_alignment_map.keys())) # init output out_head = vcf_in_handle.header # type:ignore - out_head.add_line("##FILTER=".format(args.al_filter_threshold, ', '.join(vcf_sample_to_alignment_map.keys()))) - out_head.add_line("##FILTER=".format(', '.join(vcf_sample_to_alignment_map.keys()))) - out_head.add_line("##INFO=") - out_head.add_line("##INFO=") + out_head.add_line("##FILTER=".format( + args.al_filter_threshold, ', '.join(vcf_sample_to_alignment_map.keys()))) + out_head.add_line("##FILTER=".format( + ', '.join(vcf_sample_to_alignment_map.keys()))) + out_head.add_line( + "##INFO=") + out_head.add_line( + "##INFO=") try: vcf_out_handle = 
pysam.VariantFile(args.vcf_out, 'w', header=out_head) @@ -507,42 +584,47 @@ def main_cli() -> None: try: with open(args.output_json, "w") as output_json: json.dump( - { - k: vars(args)[k] - for k - in (vars(args).keys() - {'input_json', 'output_json', 'format'}) - }, - output_json, indent="") + { + k: vars(args)[k] + for k + in (vars(args).keys() - {'input_json', 'output_json', 'format'}) + }, + output_json, indent="") except Exception as e: h.cleanup(msg='failed to write output JSON, reporting: {}'.format(e)) - for record in vcf_in_handle.fetch(): # type:ignore - # need to test pysam's vcf record validation - # e.g. what if start is after end + for record in vcf_in_handle.fetch(): try: filter_d: dict[str, c.Filters] = test_record_per_alt( - alignments=vcf_sample_to_alignment_map, - vcf_rec=record, - variant_tester=primed_variant_tester + vcf_sample_to_alignment_map, + record, + args.min_mapping_quality, + args.min_clip_quality, + args.min_base_quality, + args.max_read_spans, + args.al_filter_threshold, + args.position_fraction ) except c.NoAlts: - logging.warning('{0: <7}:{1: >12} ¦ no alts for this record'.format(record.chrom, record.pos)) + logging.warning('{0: <7}:{1: >12} ¦ no alts for this record'.format( + record.chrom, record.pos)) except c.NoMutants: - logging.warning('{0: <7}:{1: >12} ¦ no samples exhibit record alts'.format(record.chrom, record.pos)) + logging.warning('{0: <7}:{1: >12} ¦ no samples exhibit record alts'.format( + record.chrom, record.pos)) else: for alt, filter_bundle in filter_d.items(): for filter in filter_bundle: if filter.flag: record.filter.add(filter.name) record.info.update({filter.name: '|'.join( - [alt] + - [str(f) - if type(f) - is not float - else str(round(f, 3)) - for f in filter - ][2:] - )}) + [alt] + + [str(f) + if type(f) + is not float + else str(round(f, 3)) + for f in filter + ][2:] + )}) try: vcf_out_handle.write(record) # type:ignore diff --git a/test/test_test_variant_validate.py 
b/test/test_test_variant_validate.py index 394dcca..59c05bb 100644 --- a/test/test_test_variant_validate.py +++ b/test/test_test_variant_validate.py @@ -19,6 +19,7 @@ from hairpin2 import main as hp2 +from hairpin2 import constants as c import pysam import pytest import copy @@ -41,19 +42,6 @@ r.cigarstring = '100M' r.set_tag('MC', '100M') -# S1 needs 2 good reads to give True on if len(mut_reads...) > 1 -s1r1 = copy.deepcopy(r) -s1r2 = copy.deepcopy(r) -s1r3 = copy.deepcopy(r) -# S2 needs a bad read to give a False on if read_flag == ... CLEAR -# and len(mut_reads...) -s2r1 = copy.deepcopy(r) -s2r2 = copy.deepcopy(r) -s2r2.flag = 0xE00 -iter1 = [s1r1, s1r2] -iter2 = [s2r1, s2r2] -readd = {'S1': iter1, 'S2': iter2} - # max spans... ### 21/10/24 HERE: I separated out max spans to make testing easier @@ -62,11 +50,19 @@ ### maybe it's going back in but now I actually understand it ### perhaps do test suite without it, put it back and cover at the end. # don't forget to install updated main.py + + +# use this test to test all initial loops, and other tests to test unique outcomes +# e.g. read_flag != clear and so on @pytest.mark.validate -def test_path_simple(): - f = hp2.test_variant( - vstart=160, - vstop=161, +def test_path_insufficient_reads(): + expected = c.Filters(AL=c.ALFilter(code=3), + HP=c.HPFilter(code=3)) + readd = {'S1': []} + # using defaults where not otherwise noted + actual = hp2.test_variant( + vstart=166, + vstop=167, alt='A', region_reads_by_sample=readd, mut_type='S', @@ -78,4 +74,88 @@ def test_path_simple(): min_clipqual=35, min_basequal=25) ) - breakpoint() + assert expected == actual + + +# N.B. 
copy r to several reads +# combine into dict[str, Iterable[pysam.AlignedSegment]] +# where keys are samples +# yank this to register +@pytest.mark.validate +def test_path_xxx(): + pass + + +@pytest.mark.validate +def test_path_AL_true_code_2(): + expected = c.ALFilter(flag=True, code=2, avg_as=0.5) + s1r1 = copy.deepcopy(r) # no AS, cover except KeyError + s1r2 = copy.deepcopy(r) + s1r2.set_tag('AS', 50) # low AS + readd = {'S1': [s1r1, s1r2]} + result = hp2.test_variant( + vstart=166, + vstop=167, + alt='A', + region_reads_by_sample=readd, + mut_type='S', + al_thresh=0.93, + max_span=-1, # don't trigger PCR dedup + position_fraction_thresh=0.15, + read_validator=partial(hp2.validate_read, + min_mapqual=11, + min_clipqual=35, + min_basequal=25) + ) + assert expected == result.AL + + +@pytest.mark.validate +def test_path_AL_false_code_2_HP_false_code_3(): + expected = c.Filters(c.ALFilter(flag=False, code=2, avg_as=0.99), + c.HPFilter(code=3)) + s1r1 = copy.deepcopy(r) + s1r1.set_tag('AS', 99) # high AS + readd = {'S1': [s1r1]} + result = hp2.test_variant( + vstart=166, + vstop=167, + alt='A', + region_reads_by_sample=readd, + mut_type='S', + al_thresh=0.93, + max_span=-1, # don't trigger PCR dedup + position_fraction_thresh=0.15, + read_validator=partial(hp2.validate_read, + min_mapqual=11, + min_clipqual=35, + min_basequal=25) + ) + assert expected == result + + +@pytest.mark.validate +def test_path_AL_false_code_3(): + expected = c.ALFilter(code=3) + s1r1 = copy.deepcopy(r) + readd = {'S1': [s1r1]} + result = hp2.test_variant( + vstart=166, + vstop=167, + alt='A', + region_reads_by_sample=readd, + mut_type='S', + al_thresh=0.93, + max_span=-1, # don't trigger PCR dedup + position_fraction_thresh=0.15, + read_validator=partial(hp2.validate_read, + min_mapqual=11, + min_clipqual=35, + min_basequal=25) + ) + assert expected == result.AL + + +@pytest.mark.validate +def test_path_HP_insufficient_reads(): + s1r1 = copy.deepcopy(r) From 
f4c2e538670ea250cbc45b20d7f1ae445609457c Mon Sep 17 00:00:00 2001 From: ab63 Date: Tue, 22 Oct 2024 13:58:00 +0000 Subject: [PATCH 140/165] typo --- hairpin2/main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hairpin2/main.py b/hairpin2/main.py index 9e0fd30..9be0681 100644 --- a/hairpin2/main.py +++ b/hairpin2/main.py @@ -601,7 +601,7 @@ def main_cli() -> None: args.min_mapping_quality, args.min_clip_quality, args.min_base_quality, - args.max_read_spans, + args.max_read_span, args.al_filter_threshold, args.position_fraction ) From 1d050a4255062e43443088875092bc4ce5ac40fa Mon Sep 17 00:00:00 2001 From: ab63 Date: Tue, 22 Oct 2024 13:59:39 +0000 Subject: [PATCH 141/165] missing arg --- hairpin2/main.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/hairpin2/main.py b/hairpin2/main.py index 9be0681..53536b3 100644 --- a/hairpin2/main.py +++ b/hairpin2/main.py @@ -375,7 +375,8 @@ def test_record_per_alt( alt, mut_type, region_reads_by_sample, - max_span) + max_span, + min_basequal) if len(alt_filt_reads) == 0: filt_d[alt] = c.Filters(c.ALFilter(code=c.FiltCodes.INSUFFICIENT_READS.value), c.HPFilter(code=c.FiltCodes.INSUFFICIENT_READS.value)) From 2cd0cc38fb226703ef6f62af12af5b51416c2b42 Mon Sep 17 00:00:00 2001 From: ab63 Date: Tue, 22 Oct 2024 16:08:15 +0100 Subject: [PATCH 142/165] cleanup, fix; post restructure --- hairpin2/main.py | 141 ++++++++++++++++++++++------------------------- 1 file changed, 65 insertions(+), 76 deletions(-) diff --git a/hairpin2/main.py b/hairpin2/main.py index 53536b3..7f9c735 100644 --- a/hairpin2/main.py +++ b/hairpin2/main.py @@ -30,16 +30,16 @@ from collections.abc import Iterable -def validate_read_broad( +def flag_read_broad( read: pysam.AlignedSegment, vcf_start: int, min_mapqual: int, min_clipqual: int, ) -> int: - read_flag = c.ValidatorFlags.CLEAR.value + invalid_flag = c.ValidatorFlags.CLEAR.value # 0 - evaluates false try: - mate_cig = read.get_tag('MC') + mate_cig = 
str(read.get_tag('MC')) except KeyError: mate_cig = None if any(x is None for x in @@ -50,32 +50,33 @@ def validate_read_broad( read.cigarstring, read.cigartuples, mate_cig]): - read_flag |= c.ValidatorFlags.READ_FIELDS_MISSING.value + invalid_flag |= c.ValidatorFlags.READ_FIELDS_MISSING.value else: if not (read.flag & 0x2) or read.flag & 0xE00: - read_flag |= c.ValidatorFlags.FLAG.value + invalid_flag |= c.ValidatorFlags.FLAG.value if read.mapping_quality < min_mapqual: - read_flag |= c.ValidatorFlags.MAPQUAL.value + invalid_flag |= c.ValidatorFlags.MAPQUAL.value - if ('S' in read.cigarstring and - mean(read.query_alignment_qualities) < min_clipqual): - read_flag |= c.ValidatorFlags.CLIPQUAL.value + if ('S' in read.cigarstring and # type: ignore - not detecting cigarstring can't be none + mean(read.query_alignment_qualities) < min_clipqual): # type: ignore - legit type issue here with pysam but I can't fix it + invalid_flag |= c.ValidatorFlags.CLIPQUAL.value - if not read.flag & 0x40: + if (not (invalid_flag & c.ValidatorFlags.FLAG.value) + and not (read.flag & 0x40)): read_range = range(read.reference_start, - read.reference_end) + read.reference_end) # type: ignore - can't be none mate_range = range(read.next_reference_start, - r2s.ref_end_via_cigar(mate_cig, + r2s.ref_end_via_cigar(mate_cig, # type: ignore read.next_reference_start)) ref_overlap = set(read_range).intersection(mate_range) if vcf_start in ref_overlap: - read_flag |= c.ValidatorFlags.OVERLAP.value + invalid_flag |= c.ValidatorFlags.OVERLAP.value - return read_flag + return invalid_flag -def validate_read_alt( +def flag_read_alt( read: pysam.AlignedSegment, vcf_start: int, vcf_stop: int, @@ -87,20 +88,20 @@ def validate_read_alt( raise ValueError( 'unsupported mut_type: {} - supports \'S\' (SUB) \'D\' (DEL) \'I\' (INS)'.format(mut_type)) - read_flag = c.ValidatorFlags.CLEAR.value + invalid_flag = c.ValidatorFlags.CLEAR.value if mut_type == 'S': # SUB try: - mut_pos, mut_op = r2s.ref2querypos(read, 
vcf_start) + mut_pos, _ = r2s.ref2querypos(read, vcf_start) except IndexError: - read_flag |= c.ValidatorFlags.NOT_ALIGNED.value + invalid_flag |= c.ValidatorFlags.NOT_ALIGNED.value else: - if read.query_sequence[mut_pos:mut_pos + len(alt)] != alt: - read_flag |= c.ValidatorFlags.NOT_ALT.value + if read.query_sequence[mut_pos:mut_pos + len(alt)] != alt: # type: ignore - can't be none + invalid_flag |= c.ValidatorFlags.NOT_ALT.value if any([bq < min_basequal for bq - in read.query_qualities[mut_pos:mut_pos + len(alt)]]): - read_flag |= c.ValidatorFlags.BASEQUAL.value + in read.query_qualities[mut_pos:mut_pos + len(alt)]]): # type: ignore - can't be none + invalid_flag |= c.ValidatorFlags.BASEQUAL.value # DEL - doesn't check for matches before and after... elif mut_type == 'D': # this could error if read doesn't cover region (as could all) @@ -109,26 +110,26 @@ def validate_read_alt( in read.get_aligned_pairs() if r in range(vcf_start, vcf_stop)] if any([x is not None for x in mut_alns]): - read_flag |= c.ValidatorFlags.BAD_OP.value + invalid_flag |= c.ValidatorFlags.BAD_OP.value elif mut_type == 'I': # INS try: prior_pos, _ = r2s.ref2querypos(read, vcf_start) except IndexError: - read_flag |= c.ValidatorFlags.NOT_ALIGNED.value + invalid_flag |= c.ValidatorFlags.NOT_ALIGNED.value else: if prior_pos + len(alt) > read.query_length: - read_flag |= c.ValidatorFlags.SHORT.value + invalid_flag |= c.ValidatorFlags.SHORT.value else: mut_alns = [(q, r) for q, r in read.get_aligned_pairs() if q in range(prior_pos + 1, prior_pos + len(alt) + 1)] if any([r is not None for _, r in mut_alns]): - read_flag |= c.ValidatorFlags.BAD_OP.value - if read.query_sequence[prior_pos + 1:prior_pos + len(alt) + 1] != alt: - read_flag |= c.ValidatorFlags.NOT_ALT.value + invalid_flag |= c.ValidatorFlags.BAD_OP.value + if read.query_sequence[prior_pos + 1:prior_pos + len(alt) + 1] != alt: # type: ignore - can't be none + invalid_flag |= c.ValidatorFlags.NOT_ALT.value - return read_flag + return 
invalid_flag # detect PCR duplicates previously missed due to (hairpin) artefacts @@ -142,10 +143,10 @@ def validate_read_alt( # with more reads it's more likely they'll cluster as dupes right? def get_hidden_PCRdup_indices(readpair_ends: list[list[int]], max_span: int): dup_idcs: list[int] = [] - read_ends_sorted: list[list[int]] = sorted([(i, sorted(l)) - for i, l - in enumerate(readpair_ends)], - key=lambda x: x[1]) + read_ends_sorted: list[tuple[int, list[int]]] = sorted([(i, sorted(l)) + for i, l + in enumerate(readpair_ends)], + key=lambda x: x[1]) # smallest first element. What was Peter's intention here? base_read_ends_list: list[list[int]] = [read_ends_sorted[0][1]] for i in range(1, len(read_ends_sorted)): @@ -174,7 +175,7 @@ def alt_filter_reads( mut_type: str, region_reads_by_sample: dict[str, Iterable[pysam.AlignedSegment]], max_span: int, - min_basequal: float + min_basequal: int ) -> list[pysam.AlignedSegment]: rrbs_filt: dict[str, list[pysam.AlignedSegment]] = {key: [] for key @@ -184,18 +185,18 @@ def alt_filter_reads( for mut_sample, read_iter in region_reads_by_sample.items(): sample_readpair_ends: list[list[int]] = [] for read in read_iter: - if not validate_read_alt(read, + if not flag_read_alt(read, vstart, vstop, alt, - mut_type, + mut_type, # type: ignore - type checkers annoying about literals min_basequal): rrbs_filt[mut_sample].append(read) next_ref_end = r2s.ref_end_via_cigar( - read.get_tag('MC'), + str(read.get_tag('MC')), read.next_reference_start) sample_readpair_ends.append([read.reference_start, - read.reference_end, + read.reference_end, # type: ignore - won't be unbound within program read.next_reference_start, next_ref_end]) if len(rrbs_filt[mut_sample]) > 1: @@ -213,11 +214,11 @@ def test_variant_AL( al_thresh: float ) -> c.ALFilter: al_filt = c.ALFilter() - aln_scores: list[int] = [] + aln_scores: list[float] = [] for read in mut_reads: try: - aln_scores.append(read.get_tag('AS') / read.query_length) + 
aln_scores.append(int(read.get_tag('AS')) / read.query_length) except KeyError: pass if len(aln_scores) != 0: @@ -233,12 +234,9 @@ def test_variant_AL( def test_variant_HP( vstart: int, - vstop: int, - alt: str, - mut_type: str, mut_reads: Iterable[pysam.AlignedSegment], position_fraction_thresh: float, -) -> c.Filters: +) -> c.HPFilter: hp_filt = c.HPFilter() mut_read_pos_f: list[int] = [] @@ -246,27 +244,21 @@ def test_variant_HP( mut_read_fracs_f: list[float] = [] mut_read_fracs_r: list[float] = [] - # if all([len(x) == 0 for x in mut_reads.values()]): - # al_filt.code = c.FiltCodes.INSUFFICIENT_READS.value - # hp_filt.code = c.FiltCodes.INSUFFICIENT_READS.value - # else: - for read_list in mut_reads.values(): - for read in read_list: - mut_pos, _ = r2s.ref2querypos(read, vstart) - if read.flag & 0x10: - # 1-based position where start, idx 1, is alignment end - mut_idx_wrt_query_aln = read.query_alignment_end - mut_pos - mut_read_fracs_r.append(mut_idx_wrt_query_aln - / read.query_alignment_length) - mut_read_pos_r.append(mut_idx_wrt_query_aln) - else: - mut_idx_wrt_query_aln = mut_pos - read.query_alignment_start + 1 - mut_read_fracs_f.append(mut_idx_wrt_query_aln - / read.query_alignment_length) - mut_read_pos_f.append(mut_idx_wrt_query_aln) + for read in mut_reads: + mut_pos, _ = r2s.ref2querypos(read, vstart) + if read.flag & 0x10: + # 1-based position where start, idx 1, is alignment end + mut_idx_wrt_query_aln = read.query_alignment_end - mut_pos + mut_read_fracs_r.append(mut_idx_wrt_query_aln + / read.query_alignment_length) + mut_read_pos_r.append(mut_idx_wrt_query_aln) + else: + mut_idx_wrt_query_aln = mut_pos - read.query_alignment_start + 1 + mut_read_fracs_f.append(mut_idx_wrt_query_aln + / read.query_alignment_length) + mut_read_pos_f.append(mut_idx_wrt_query_aln) # hairpin conditions from Ellis et al. 
if len(mut_read_pos_f) > 1 and not len(mut_read_pos_r) > 1: - breakpoint() mad_f = max(mut_read_pos_f) - min(mut_read_pos_f) sd_f = stdev(mut_read_pos_f) if ( @@ -335,7 +327,7 @@ def test_record_per_alt( if len(samples_w_mutants) == 0: raise c.NoMutants - region_reads_by_sample: dict[str, pysam.IteratorRow] = {} + region_reads_by_sample: dict[str, Iterable[pysam.AlignedSegment]] = {} for k, v in alignments.items(): if k in samples_w_mutants: read_iter, test_iter = tee(v.fetch(vcf_rec.chrom, @@ -349,10 +341,10 @@ def test_record_per_alt( broad_filtered_iter = (read for read in read_iter - if validate_read_broad(read, - vcf_rec.start, - min_mapqual=min_mapqual, - min_clipqual=min_clipqual)) + if not flag_read_broad(read, + vcf_rec.start, + min_mapqual=min_mapqual, + min_clipqual=min_clipqual)) # doesn't check for overwrite region_reads_by_sample[k] = broad_filtered_iter @@ -384,9 +376,6 @@ def test_record_per_alt( filt_d[alt] = c.Filters(test_variant_AL(alt_filt_reads, al_thresh), test_variant_HP(vcf_rec.start, - vcf_rec.stop, - alt, - mut_type, alt_filt_reads, position_fraction)) return filt_d @@ -516,7 +505,7 @@ def main_cli() -> None: for path in args.alignments: try: alignment = pysam.AlignmentFile(path, - mode, + mode, # type: ignore - argparse ensures not unbound reference_filename=(args.cram_reference if args.cram_reference and args.format == "c" @@ -526,8 +515,8 @@ def main_cli() -> None: msg='failed to read alignment file at {}, reporting: {}'.format(path, e)) # grab the sample name from first SM field # in header field RG - alignment_sample_name = alignment.header.to_dict()['RG'][0]['SM'] - vcf_sample_to_alignment_map[alignment_sample_name] = alignment + alignment_sample_name = alignment.header.to_dict()['RG'][0]['SM'] # type: ignore - program ensures not unbound + vcf_sample_to_alignment_map[alignment_sample_name] = alignment # type: ignore - program ensures not unbound if args.name_mapping: if len(args.name_mapping) > len(args.alignments): 
h.cleanup(msg="more name mappings than alignments provided") @@ -550,7 +539,7 @@ def main_cli() -> None: h.cleanup( msg='duplicate aligment sample names provided to name mapping flag') if h.lists_not_equal(alignment_map_names, - vcf_sample_to_alignment_map.keys()): + vcf_sample_to_alignment_map.keys()): # type: ignore - dicts are stable h.cleanup( msg='alignment sample names provided to name mapping flag do not match alignment SM tags') vcf_sample_to_alignment_map = {vcf_map_names[alignment_map_names.index(k)]: v @@ -594,7 +583,7 @@ def main_cli() -> None: except Exception as e: h.cleanup(msg='failed to write output JSON, reporting: {}'.format(e)) - for record in vcf_in_handle.fetch(): + for record in vcf_in_handle.fetch(): # type: ignore - program ensures not unbound try: filter_d: dict[str, c.Filters] = test_record_per_alt( vcf_sample_to_alignment_map, From 30210be62a07d4196386882177856c4882ee2f65 Mon Sep 17 00:00:00 2001 From: ab63 Date: Wed, 23 Oct 2024 14:56:42 +0000 Subject: [PATCH 143/165] complete tests for fixed sci funcs --- .gitignore | 1 + hairpin2/constants.py | 56 +++--- hairpin2/main.py | 154 +++++++-------- test/test_flag_read_alt_validate.py | 168 +++++++++++++++++ test/test_flag_read_broad_valiate.py | 116 ++++++++++++ test/test_test_variant_AL_validate.py | 69 +++++++ test/test_test_variant_HP_validate.py | 105 +++++++++++ test/test_test_variant_validate.py | 161 ---------------- test/test_validate_read_dev.py | 132 ------------- test/test_validate_read_validate.py | 258 -------------------------- 10 files changed, 566 insertions(+), 654 deletions(-) create mode 100644 test/test_flag_read_alt_validate.py create mode 100644 test/test_flag_read_broad_valiate.py create mode 100644 test/test_test_variant_AL_validate.py create mode 100644 test/test_test_variant_HP_validate.py delete mode 100644 test/test_test_variant_validate.py delete mode 100644 test/test_validate_read_dev.py delete mode 100644 test/test_validate_read_validate.py diff --git 
a/.gitignore b/.gitignore index d577ecd..6297ddc 100644 --- a/.gitignore +++ b/.gitignore @@ -14,3 +14,4 @@ test_data_creation/ poetry.lock .coverage test/sim-data/ +test/old_* diff --git a/hairpin2/constants.py b/hairpin2/constants.py index 109ab7e..faee8a9 100644 --- a/hairpin2/constants.py +++ b/hairpin2/constants.py @@ -26,43 +26,43 @@ EXIT_FAILURE = 1 DEFAULTS: dict[str, int | float] = dict((('al_filter_threshold', 0.93), - ('min_clip_quality', 35), - ('min_mapping_quality', 11), - ('min_base_quality', 25), - ('max_read_span', 6), - ('position_fraction', 0.15))) + ('min_clip_quality', 35), + ('min_mapping_quality', 11), + ('min_base_quality', 25), + ('max_read_span', 6), + ('position_fraction', 0.15))) FiltCodes = IntEnum('FiltCodes', ['SIXTYAI', - 'SIXTYBI', - 'ON_THRESHOLD', - 'INSUFFICIENT_READS', - 'NO_MUTANTS'], + 'SIXTYBI', + 'ON_THRESHOLD', + 'INSUFFICIENT_READS', + 'NO_MUTANTS'], start=0) Ops = IntEnum('Ops', ['MATCH', - 'INS', - 'DEL', - 'SKIP', - 'SOFT', - 'HARD', - 'PAD', - 'EQUAL', - 'DIFF', - 'BACK'], + 'INS', + 'DEL', + 'SKIP', + 'SOFT', + 'HARD', + 'PAD', + 'EQUAL', + 'DIFF', + 'BACK'], start=0) ValidatorFlags = Flag('ReadFlags', ['CLEAR', - 'FLAG', - 'MAPQUAL', - 'READ_FIELDS_MISSING', - 'NOT_ALIGNED', - 'BAD_OP', - 'NOT_ALT', - 'BASEQUAL', - 'SHORT', - 'CLIPQUAL', - 'OVERLAP'], + 'FLAG', + 'MAPQUAL', + 'READ_FIELDS_MISSING', + 'NOT_ALIGNED', + 'BAD_OP', + 'NOT_ALT', + 'BASEQUAL', + 'SHORT', + 'CLIPQUAL', + 'OVERLAP'], start=0) diff --git a/hairpin2/main.py b/hairpin2/main.py index 7f9c735..37fe7d8 100644 --- a/hairpin2/main.py +++ b/hairpin2/main.py @@ -30,6 +30,10 @@ from collections.abc import Iterable +# N.B. 
+# pysam guards against: +# quality and seq length mismatch +# reference id is none def flag_read_broad( read: pysam.AlignedSegment, vcf_start: int, @@ -58,12 +62,12 @@ def flag_read_broad( if read.mapping_quality < min_mapqual: invalid_flag |= c.ValidatorFlags.MAPQUAL.value - if ('S' in read.cigarstring and # type: ignore - not detecting cigarstring can't be none + if ('S' in read.cigarstring and # type: ignore - program ensures can't be none mean(read.query_alignment_qualities) < min_clipqual): # type: ignore - legit type issue here with pysam but I can't fix it invalid_flag |= c.ValidatorFlags.CLIPQUAL.value if (not (invalid_flag & c.ValidatorFlags.FLAG.value) - and not (read.flag & 0x40)): + and not (read.flag & 0x40)): read_range = range(read.reference_start, read.reference_end) # type: ignore - can't be none mate_range = range(read.next_reference_start, @@ -141,7 +145,10 @@ def flag_read_alt( # (if so, maybe two pointer comparison?) # it bothers me that it matters where in the chain this occurs # with more reads it's more likely they'll cluster as dupes right? 
-def get_hidden_PCRdup_indices(readpair_ends: list[list[int]], max_span: int): +def get_hidden_PCRdup_indices( + readpair_ends: list[list[int]], + max_span: int +) -> list[int]: dup_idcs: list[int] = [] read_ends_sorted: list[tuple[int, list[int]]] = sorted([(i, sorted(l)) for i, l @@ -163,7 +170,7 @@ def get_hidden_PCRdup_indices(readpair_ends: list[list[int]], max_span: int): else: # read at i is not dup of reads in base_read_ends_list # start again, test read at i - # against reads subsequent to i in ends_sorted + # against reads subsequent from i in ends_sorted base_read_ends_list = [comparison_read_ends[1]] return dup_idcs @@ -174,8 +181,8 @@ def alt_filter_reads( alt: str, mut_type: str, region_reads_by_sample: dict[str, Iterable[pysam.AlignedSegment]], - max_span: int, - min_basequal: int + max_span: int = 6, + min_basequal: int = 25 ) -> list[pysam.AlignedSegment]: rrbs_filt: dict[str, list[pysam.AlignedSegment]] = {key: [] for key @@ -186,11 +193,11 @@ def alt_filter_reads( sample_readpair_ends: list[list[int]] = [] for read in read_iter: if not flag_read_alt(read, - vstart, - vstop, - alt, - mut_type, # type: ignore - type checkers annoying about literals - min_basequal): + vstart, + vstop, + alt, + mut_type, # type: ignore - type checkers annoying about literals + min_basequal): rrbs_filt[mut_sample].append(read) next_ref_end = r2s.ref_end_via_cigar( str(read.get_tag('MC')), @@ -211,7 +218,7 @@ def alt_filter_reads( def test_variant_AL( mut_reads: Iterable[pysam.AlignedSegment], - al_thresh: float + al_thresh: float = 0.93 ) -> c.ALFilter: al_filt = c.ALFilter() aln_scores: list[float] = [] @@ -232,77 +239,74 @@ def test_variant_AL( return al_filt +# per Peter's implementation +# can set hairpin for mutations nowhere near alignment start +# expose more ellis conditions as parameters? 
def test_variant_HP( vstart: int, mut_reads: Iterable[pysam.AlignedSegment], - position_fraction_thresh: float, + position_fraction_thresh: float = 0.15 ) -> c.HPFilter: hp_filt = c.HPFilter() - mut_read_pos_f: list[int] = [] - mut_read_pos_r: list[int] = [] - mut_read_fracs_f: list[float] = [] - mut_read_fracs_r: list[float] = [] + # *l*engths of *a*lignment starts *to* *m*utant query positions + la2ms_f: list[int] = [] + la2ms_r: list[int] = [] + near_start_f: list[bool] = [] + near_start_r: list[bool] = [] for read in mut_reads: - mut_pos, _ = r2s.ref2querypos(read, vstart) + mut_qpos, _ = r2s.ref2querypos(read, vstart) if read.flag & 0x10: - # 1-based position where start, idx 1, is alignment end - mut_idx_wrt_query_aln = read.query_alignment_end - mut_pos - mut_read_fracs_r.append(mut_idx_wrt_query_aln - / read.query_alignment_length) - mut_read_pos_r.append(mut_idx_wrt_query_aln) - else: - mut_idx_wrt_query_aln = mut_pos - read.query_alignment_start + 1 - mut_read_fracs_f.append(mut_idx_wrt_query_aln - / read.query_alignment_length) - mut_read_pos_f.append(mut_idx_wrt_query_aln) - # hairpin conditions from Ellis et al. 
- if len(mut_read_pos_f) > 1 and not len(mut_read_pos_r) > 1: - mad_f = max(mut_read_pos_f) - min(mut_read_pos_f) - sd_f = stdev(mut_read_pos_f) - if ( - ((sum([x <= position_fraction_thresh - for x - in mut_read_fracs_f]) / len(mut_read_pos_f)) < 0.9) and - mad_f > 0 and - sd_f > 4): - hp_filt.code = c.FiltCodes.SIXTYAI.value # 60A(i) - else: - hp_filt.code = c.FiltCodes.SIXTYAI.value - hp_filt.set() - elif len(mut_read_pos_r) > 1 and not len(mut_read_pos_f) > 1: - mad_r = max(mut_read_pos_r) - min(mut_read_pos_r) - sd_r = stdev(mut_read_pos_r) - if ( - ((sum([x <= position_fraction_thresh - for x - in mut_read_fracs_r]) / len(mut_read_pos_r)) < 0.9) and - mad_r > 0 and - sd_r > 4): - hp_filt.code = c.FiltCodes.SIXTYAI.value + # +1 to include last base in length + la2m = read.query_alignment_end - mut_qpos + 1 + near_start_r.append(((la2m / read.query_alignment_length) + <= position_fraction_thresh)) + la2ms_r.append(la2m) else: - hp_filt.code = c.FiltCodes.SIXTYAI.value - hp_filt.set() - elif len(mut_read_pos_f) > 1 and len(mut_read_pos_r) > 1: - mad_f = max(mut_read_pos_f) - min(mut_read_pos_f) - sd_f = stdev(mut_read_pos_f) - mad_r = max(mut_read_pos_r) - min(mut_read_pos_r) - sd_r = stdev(mut_read_pos_r) - frac_lt_thresh = (sum([x <= position_fraction_thresh - for x - in mut_read_fracs_f + mut_read_fracs_r]) / - (len(mut_read_pos_f) + len(mut_read_pos_r))) - if (frac_lt_thresh < 0.9 or - (mad_f > 2 and mad_r > 2 and sd_f > 2 and sd_r > 2) or - (mad_f > 1 and sd_f > 10) or - (mad_r > 1 and sd_r > 10)): - hp_filt.code = c.FiltCodes.SIXTYBI.value # 60B(i) - else: - hp_filt.code = c.FiltCodes.SIXTYBI.value - hp_filt.set() - else: + la2m = mut_qpos - read.query_alignment_start + 1 + near_start_f.append(((la2m / read.query_alignment_length) + <= position_fraction_thresh)) + la2ms_f.append(la2m) + + # hairpin conditions from Ellis et al. 
2020, Nature Protocols + # sometimes reported as 2021 + if len(la2ms_f) < 2 and len(la2ms_r) < 2: hp_filt.code = c.FiltCodes.INSUFFICIENT_READS.value + else: + if len(la2ms_f) > 1: + range_f = max(la2ms_f) - min(la2ms_f) + sd_f = stdev(la2ms_f) + if len(la2ms_r) < 2: + if (((sum(near_start_f) / len(near_start_f)) < 0.9) and + range_f > 0 and + sd_f > 4): + hp_filt.code = c.FiltCodes.SIXTYAI.value # 60A(i) + else: + hp_filt.code = c.FiltCodes.SIXTYAI.value + hp_filt.set() + if len(la2ms_r) > 1: + range_r = max(la2ms_r) - min(la2ms_r) + sd_r = stdev(la2ms_r) + if len(la2ms_f) < 2: + if (((sum(near_start_r) / len(near_start_r)) < 0.9) and + range_r > 0 and + sd_r > 4): + hp_filt.code = c.FiltCodes.SIXTYAI.value + else: + hp_filt.code = c.FiltCodes.SIXTYAI.value + hp_filt.set() + if len(la2ms_f) > 1 and len(la2ms_r) > 1: + frac_lt_thresh = (sum(near_start_f + near_start_r) + / (len(near_start_f) + len(near_start_r))) + if (frac_lt_thresh < 0.9 or + (range_f > 2 and range_r > 2 and sd_f > 2 and sd_r > 2) or + (range_f > 1 and sd_f > 10) or + (range_r > 1 and sd_r > 10)): + hp_filt.code = c.FiltCodes.SIXTYBI.value # 60B(i) + else: + hp_filt.code = c.FiltCodes.SIXTYBI.value + hp_filt.set() return hp_filt @@ -342,9 +346,9 @@ def test_record_per_alt( for read in read_iter if not flag_read_broad(read, - vcf_rec.start, - min_mapqual=min_mapqual, - min_clipqual=min_clipqual)) + vcf_rec.start, + min_mapqual, + min_clipqual)) # doesn't check for overwrite region_reads_by_sample[k] = broad_filtered_iter diff --git a/test/test_flag_read_alt_validate.py b/test/test_flag_read_alt_validate.py new file mode 100644 index 0000000..34b11bf --- /dev/null +++ b/test/test_flag_read_alt_validate.py @@ -0,0 +1,168 @@ +# hairpin2 +# +# Copyright (C) 2024 Genome Research Ltd. 
+# +# Author: Alex Byrne +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. + +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . + + +from hairpin2.main import flag_read_alt +from hairpin2 import constants as c +import pysam +import copy +import pytest + + +# BASIS PATH TESTING (ish) +# test every node and edge at least once +# ---- +# perfect read pair: +r = pysam.AlignedSegment() +r.query_name = 'read1' +r.query_sequence = 'CTGDAAAACC' +r.query_qualities = pysam.qualitystring_to_array('AAAAAAAAAA') +r.flag = 0x43 +r.reference_id = 0 +r.reference_start = 95 +r.next_reference_start = 95 +r.mapping_quality = 20 +r.cigarstring = '10M' +r.set_tag('MC', '10M') + + +@pytest.mark.validate +def test_path_unsupported_mut_type(): + with pytest.raises(ValueError): + flag_read_alt(read=r, + vcf_start=99, + vcf_stop=100, + alt='A', + mut_type='8', + min_basequal=25) + + +@pytest.mark.validate +def test_path_sub_not_aligned(): + expected = c.ValidatorFlags.NOT_ALIGNED.value + result = flag_read_alt(read=r, + vcf_start=200, + vcf_stop=100, + alt='A', + mut_type='S', + min_basequal=25) + assert expected == result + + +@pytest.mark.validate +def test_path_bad_sub(): + expected = (c.ValidatorFlags.NOT_ALT.value + | c.ValidatorFlags.BASEQUAL.value) + result = flag_read_alt(read=r, + vcf_start=99, + vcf_stop=100, + alt='T', + mut_type='S', + min_basequal=50) + assert expected == result + + +@pytest.mark.validate +def test_path_good_sub(): + expected = 
c.ValidatorFlags.CLEAR.value + result = flag_read_alt(read=r, + vcf_start=99, + vcf_stop=100, + alt='A', + mut_type='S', + min_basequal=25) + assert expected == result + + +# checks cigar ops +@pytest.mark.validate +def test_path_del_bad_op(): + expected = c.ValidatorFlags.BAD_OP.value + result = flag_read_alt(read=r, + vcf_start=99, + vcf_stop=100, + alt='.', + mut_type='D', + min_basequal=25) + assert expected == result + + +# 2bp del +@pytest.mark.validate +def test_path_good_del(): + expected = c.ValidatorFlags.CLEAR.value + rc = copy.deepcopy(r) + rc.cigarstring = '4M2D6M' + result = flag_read_alt(read=rc, + vcf_start=99, + vcf_stop=101, + alt='.', + mut_type='D', + min_basequal=25) + assert expected == result + + +@pytest.mark.validate +def test_path_ins_not_aligned(): + expected = c.ValidatorFlags.NOT_ALIGNED.value + result = flag_read_alt(read=r, + vcf_start=200, + vcf_stop=100, + alt='A', + mut_type='I', + min_basequal=25) + assert expected == result + + +@pytest.mark.validate +def test_path_ins_short(): + expected = c.ValidatorFlags.SHORT.value + result = flag_read_alt(read=r, + vcf_start=99, + vcf_stop=100, + alt='ATTTTTTTTTTTTTT', + mut_type='I', + min_basequal=25) + assert expected == result + + +@pytest.mark.validate +def test_path_bad_ins(): + expected = (c.ValidatorFlags.BAD_OP.value | c.ValidatorFlags.NOT_ALT.value) + result = flag_read_alt(read=r, + vcf_start=99, + vcf_stop=100, + alt='AC', + mut_type='I', + min_basequal=25) + assert expected == result + + +@pytest.mark.validate +def test_path_good_ins(): + expected = c.ValidatorFlags.CLEAR.value + rc = copy.deepcopy(r) + rc.cigarstring = '5M2I3M' + result = flag_read_alt(read=rc, + vcf_start=99, + vcf_stop=100, + alt='AA', + mut_type='I', + min_basequal=25) + assert expected == result diff --git a/test/test_flag_read_broad_valiate.py b/test/test_flag_read_broad_valiate.py new file mode 100644 index 0000000..d65f0c1 --- /dev/null +++ b/test/test_flag_read_broad_valiate.py @@ -0,0 +1,116 @@ +# 
hairpin2 +# +# Copyright (C) 2024 Genome Research Ltd. +# +# Author: Alex Byrne +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. + +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . + + +from hairpin2.main import flag_read_broad +from hairpin2 import constants as c +import pysam +import copy +import pytest + + +# BASIS PATH TESTING (ish) +# test every node and edge at least once +# ---- +# perfect read pair: +r = pysam.AlignedSegment() +r.query_name = 'read1' +r.query_sequence = 'CTGDAAAACC' +r.query_qualities = pysam.qualitystring_to_array('AAAAAAAAAA') +r.flag = 0x43 +r.reference_id = 0 +r.reference_start = 95 +r.next_reference_start = 95 +r.mapping_quality = 20 +r.cigarstring = '10M' +r.set_tag('MC', '10M') + + +@pytest.mark.validate +def test_path_clear(): + expected = c.ValidatorFlags.CLEAR.value + result = flag_read_broad(read=r, + vcf_start=99, + min_mapqual=11, + min_clipqual=35) + assert expected == result + + +@pytest.mark.validate +def test_path_missing_mc(): + expected = c.ValidatorFlags.READ_FIELDS_MISSING.value + rc = copy.deepcopy(r) + rc.set_tag('MC', None) + result = flag_read_broad(read=rc, + vcf_start=99, + min_mapqual=11, + min_clipqual=35) + assert expected == result + + +@pytest.mark.validate +def test_path_missing_field(): + expected = c.ValidatorFlags.READ_FIELDS_MISSING.value + rc = copy.deepcopy(r) + rc.cigarstring = None + result = flag_read_broad(read=rc, + vcf_start=99, + min_mapqual=11, + min_clipqual=35) + 
assert expected == result + + +@pytest.mark.validate +def test_path_set_flag_mapqual_clipqual(): + expected = (c.ValidatorFlags.FLAG.value + | c.ValidatorFlags.MAPQUAL.value + | c.ValidatorFlags.CLIPQUAL.value) + rc = copy.deepcopy(r) + rc.flag = 0x200 + rc.cigarstring = '1S9M' + result = flag_read_broad(read=rc, + vcf_start=99, + min_mapqual=30, + min_clipqual=40) + assert expected == result + + +@pytest.mark.validate +def test_path_overlap(): + expected = c.ValidatorFlags.OVERLAP.value + rc = copy.deepcopy(r) + rc.flag = 0x83 + result = flag_read_broad(read=rc, + vcf_start=99, + min_mapqual=11, + min_clipqual=40) + assert expected == result + + +@pytest.mark.validate +def test_path_no_overlap(): + expected = c.ValidatorFlags.CLEAR.value + rc = copy.deepcopy(r) + rc.flag = 0x83 + rc.set_tag('MC', '3M') + result = flag_read_broad(read=rc, + vcf_start=99, + min_mapqual=11, + min_clipqual=40) + assert expected == result diff --git a/test/test_test_variant_AL_validate.py b/test/test_test_variant_AL_validate.py new file mode 100644 index 0000000..8172ba8 --- /dev/null +++ b/test/test_test_variant_AL_validate.py @@ -0,0 +1,69 @@ +# hairpin2 +# +# Copyright (C) 2024 Genome Research Ltd. +# +# Author: Alex Byrne +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. + +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . 
+ + +from hairpin2.main import test_variant_AL +from hairpin2 import constants as c +import pysam +import pytest +import copy + + +# BASIS PATH TESTING (ish) +# test every node and edge at least once +# there's some duplication here but it's clearer to test explicitly +# ---- +# perfect read pair: +r = pysam.AlignedSegment() +r.query_name = 'read1' +r.query_sequence = 'CTGDAAAACC' * 10 +r.query_qualities = pysam.qualitystring_to_array('AAAAAAAAAA' * 10) +r.flag = 0x43 +r.reference_id = 0 +r.reference_start = 100 +r.next_reference_start = 100 +r.mapping_quality = 20 +r.cigarstring = '100M' +r.set_tag('MC', '100M') + + +@pytest.mark.validate +def test_path_AL_true_code_2(): + expected = c.ALFilter(flag=True, code=2, avg_as=0.5) + s1r1 = copy.deepcopy(r) # no AS, cover except KeyError + s1r2 = copy.deepcopy(r) + s1r2.set_tag('AS', 50) # low AS + result = test_variant_AL([s1r1, s1r2]) + assert expected == result + + +@pytest.mark.validate +def test_path_AL_false_code_2(): + expected = c.ALFilter(flag=False, code=2, avg_as=0.99) + s1r1 = copy.deepcopy(r) + s1r1.set_tag('AS', 99) # high AS + result = test_variant_AL([s1r1]) + assert expected == result + + +@pytest.mark.validate +def test_path_AL_false_code_3(): + expected = c.ALFilter(code=3) + result = test_variant_AL([]) + assert expected == result diff --git a/test/test_test_variant_HP_validate.py b/test/test_test_variant_HP_validate.py new file mode 100644 index 0000000..f36b9dd --- /dev/null +++ b/test/test_test_variant_HP_validate.py @@ -0,0 +1,105 @@ +# hairpin2 +# +# Copyright (C) 2024 Genome Research Ltd. +# +# Author: Alex Byrne +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. 
+ +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. + +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . + + +from hairpin2.main import test_variant_HP +from hairpin2 import constants as c +import pysam +import pytest +import copy + + +# BASIS PATH TESTING (ish) +# test every node and edge at least once +# ---- +# perfect read pair: +r = pysam.AlignedSegment() +r.query_name = 'read1' +r.query_sequence = 'CTGDAAAACC' * 10 +r.query_qualities = pysam.qualitystring_to_array('AAAAAAAAAA' * 10) +r.flag = 0x43 +r.reference_id = 0 +r.reference_start = 100 +r.next_reference_start = 100 +r.mapping_quality = 20 +r.cigarstring = '100M' +r.set_tag('MC', '100M') + + +@pytest.mark.validate +def test_path_insufficient_reads(): + expected = c.HPFilter(code=3) + result = test_variant_HP(0, []) + assert expected == result + + +@pytest.mark.validate +def test_path_f_60ai_set(): + expected = c.HPFilter(flag=True, code=0) + result = test_variant_HP(150, [r, r]) + assert expected == result + + +@pytest.mark.validate +def test_path_f_60ai_noset(): + expected = c.HPFilter(code=0) + r1 = copy.deepcopy(r) + r1.reference_start = 90 + result = test_variant_HP(150, [r, r1]) + assert expected == result + + +@pytest.mark.validate +def test_path_r_60ai_set(): + expected = c.HPFilter(flag=True, code=0) + rr = copy.deepcopy(r) + rr.flag = rr.flag | 0x10 + result = test_variant_HP(150, [rr, rr]) + assert expected == result + + +@pytest.mark.validate +def test_path_r_60ai_noset(): + expected = c.HPFilter(code=0) + rr = copy.deepcopy(r) + rr.flag = rr.flag | 0x10 + rr1 = copy.deepcopy(rr) + rr1.reference_start = 90 + result = test_variant_HP(150, [rr, rr1]) + assert expected == result + + +@pytest.mark.validate +def test_path_60bi_set(): 
+ expected = c.HPFilter(flag=True, code=1) + r1 = copy.deepcopy(r) + r1.reference_start = 190 + rr = copy.deepcopy(r) + rr.flag = rr.flag | 0x10 + result = test_variant_HP(198, [r1, r1, rr, rr]) + assert expected == result + + +@pytest.mark.validate +def test_path_60bi_noset(): + expected = c.HPFilter(code=1) + rr = copy.deepcopy(r) + rr.flag = rr.flag | 0x10 + result = test_variant_HP(150, [r, r, rr, rr]) + assert expected == result diff --git a/test/test_test_variant_validate.py b/test/test_test_variant_validate.py deleted file mode 100644 index 59c05bb..0000000 --- a/test/test_test_variant_validate.py +++ /dev/null @@ -1,161 +0,0 @@ -# hairpin2 -# -# Copyright (C) 2024 Genome Research Ltd. -# -# Author: Alex Byrne -# -# This program is free software: you can redistribute it and/or modify -# it under the terms of the GNU Affero General Public License as published by -# the Free Software Foundation, either version 3 of the License, or -# (at your option) any later version. - -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU Affero General Public License for more details. - -# You should have received a copy of the GNU Affero General Public License -# along with this program. If not, see . - - -from hairpin2 import main as hp2 -from hairpin2 import constants as c -import pysam -import pytest -import copy -from functools import partial - - -# BASIS PATH TESTING -# test every node and edge at least once -# ---- -# perfect read pair: -r = pysam.AlignedSegment() -r.query_name = 'read1' -r.query_sequence = 'CTGDAAAACC' * 10 -r.query_qualities = pysam.qualitystring_to_array('AAAAAAAAAA' * 10) -r.flag = 0x43 -r.reference_id = 0 -r.reference_start = 100 -r.next_reference_start = 100 -r.mapping_quality = 20 -r.cigarstring = '100M' -r.set_tag('MC', '100M') - - -# max spans... 
-### 21/10/24 HERE: I separated out max spans to make testing easier -### or at least to make understanding max spans easier -### I'm not super keen on the separate function so -### maybe it's going back in but now I actually understand it -### perhaps do test suite without it, put it back and cover at the end. -# don't forget to install updated main.py - - -# use this test to test all initial loops, and other tests to test unique outcomes -# e.g. read_flag != clear and so on -@pytest.mark.validate -def test_path_insufficient_reads(): - expected = c.Filters(AL=c.ALFilter(code=3), - HP=c.HPFilter(code=3)) - readd = {'S1': []} - # using defaults where not otherwise noted - actual = hp2.test_variant( - vstart=166, - vstop=167, - alt='A', - region_reads_by_sample=readd, - mut_type='S', - al_thresh=0.93, - max_span=6, - position_fraction_thresh=0.15, - read_validator=partial(hp2.validate_read, - min_mapqual=11, - min_clipqual=35, - min_basequal=25) - ) - assert expected == actual - - -# N.B. 
copy r to several reads -# combine into dict[str, Iterable[pysam.AlignedSegment]] -# where keys are samples -# yank this to register -@pytest.mark.validate -def test_path_xxx(): - pass - - -@pytest.mark.validate -def test_path_AL_true_code_2(): - expected = c.ALFilter(flag=True, code=2, avg_as=0.5) - s1r1 = copy.deepcopy(r) # no AS, cover except KeyError - s1r2 = copy.deepcopy(r) - s1r2.set_tag('AS', 50) # low AS - readd = {'S1': [s1r1, s1r2]} - result = hp2.test_variant( - vstart=166, - vstop=167, - alt='A', - region_reads_by_sample=readd, - mut_type='S', - al_thresh=0.93, - max_span=-1, # don't trigger PCR dedup - position_fraction_thresh=0.15, - read_validator=partial(hp2.validate_read, - min_mapqual=11, - min_clipqual=35, - min_basequal=25) - ) - assert expected == result.AL - - -@pytest.mark.validate -def test_path_AL_false_code_2_HP_false_code_3(): - expected = c.Filters(c.ALFilter(flag=False, code=2, avg_as=0.99), - c.HPFilter(code=3)) - s1r1 = copy.deepcopy(r) - s1r1.set_tag('AS', 99) # high AS - readd = {'S1': [s1r1]} - result = hp2.test_variant( - vstart=166, - vstop=167, - alt='A', - region_reads_by_sample=readd, - mut_type='S', - al_thresh=0.93, - max_span=-1, # don't trigger PCR dedup - position_fraction_thresh=0.15, - read_validator=partial(hp2.validate_read, - min_mapqual=11, - min_clipqual=35, - min_basequal=25) - ) - assert expected == result - - -@pytest.mark.validate -def test_path_AL_false_code_3(): - expected = c.ALFilter(code=3) - s1r1 = copy.deepcopy(r) - readd = {'S1': [s1r1]} - result = hp2.test_variant( - vstart=166, - vstop=167, - alt='A', - region_reads_by_sample=readd, - mut_type='S', - al_thresh=0.93, - max_span=-1, # don't trigger PCR dedup - position_fraction_thresh=0.15, - read_validator=partial(hp2.validate_read, - min_mapqual=11, - min_clipqual=35, - min_basequal=25) - ) - assert expected == result.AL - - -@pytest.mark.validate -def test_path_HP_insufficient_reads(): - s1r1 = copy.deepcopy(r) diff --git 
a/test/test_validate_read_dev.py b/test/test_validate_read_dev.py deleted file mode 100644 index ad018c8..0000000 --- a/test/test_validate_read_dev.py +++ /dev/null @@ -1,132 +0,0 @@ -# hairpin2 -# -# Copyright (C) 2024 Genome Research Ltd. -# -# Author: Alex Byrne -# -# This program is free software: you can redistribute it and/or modify -# it under the terms of the GNU Affero General Public License as published by -# the Free Software Foundation, either version 3 of the License, or -# (at your option) any later version. - -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU Affero General Public License for more details. - -# You should have received a copy of the GNU Affero General Public License -# along with this program. If not, see . - - -from hairpin2 import main as hp2 -import pysam -import pytest -import factory -import factory.random -from faker import Faker -from faker_biology.bioseq import Bioseq -import random - -factory.random.reseed_random(2501) -random.seed(2501) - - -# smoke test validate_read -# ---- -class ExtendedBioProvider(Bioseq): - def quality_string(self, length): - if length < 1: - raise ValueError('length must be geater than 1') - allowed_chars = [chr(i) for i in range(33, 75)] - return ''.join([random.choice(allowed_chars) for _ in range(length)]) - - def cigar_string(self, length): - if length < 1: - raise ValueError('length must be greater than 1') - opchars = 'MIDNSHP=XB' - opchars_noclip = 'MIDNP=XB' - bound = 200 if length > 200 else length - cig_op_lengths = [] - while (bound > 0): - oplen = random.randint(1, bound) - cig_op_lengths.append(random.randint(1, oplen)) - cig_sum = sum(cig_op_lengths) - bound = 200 if length - cig_sum > 200 else length - cig_sum - cig_op_lengths[-1] = cig_op_lengths[-1] - (cig_sum - length) - cig_ops = [] - last_opchar = '' - # first and last op can be S 
or H, but not others - # first op H last op S, i.e. only clipping ops, seg faults pysam - # reads with only clipping ops seem to segfault pysam... report bug - if len(cig_op_lengths) == 1: - cig_ops.append(random.choice(opchars_noclip)) - else: - cig_ops.append(random.choice(opchars)) - for _ in range(max([len(cig_op_lengths) - 2, 0])): - iter_opchars = opchars_noclip.replace(last_opchar, '') - cig_ops.append(random.choice(iter_opchars)) - last_opchar = cig_ops[-1] - if len(cig_ops) != 1: - cig_ops.append(random.choice(opchars_noclip if cig_ops[-1] in ['H', 'S'] else opchars)) - return ''.join([str(x) for pair in zip(cig_op_lengths, cig_ops) for x in pair]) - - -fake = Faker() -fake.add_provider(ExtendedBioProvider) - - -class AlignedSegmentWrapper: - def __init__(self, query_name, query_sequence, query_qualities, flag, reference_id, reference_start, next_reference_start, mapping_quality, cigarstring, mc): - self.segment = pysam.AlignedSegment() - self.segment.query_name = query_name - self.segment.query_sequence = query_sequence - self.segment.query_qualities = pysam.qualitystring_to_array(query_qualities) - self.segment.flag = flag - self.segment.reference_id = reference_id - self.segment.reference_start = reference_start - self.segment.next_reference_start = next_reference_start - self.segment.mapping_quality = mapping_quality - self.segment.cigarstring = cigarstring - self.segment.set_tag('MC', mc) - - -class ReadFactory(factory.Factory): - class Meta: - model = AlignedSegmentWrapper - - query_name = 'read1' # should one assume pysam handles all bizarre query names gracefully? I am... 
- query_sequence = factory.LazyAttribute(lambda _: fake.dna(length=random.randint(50, 200))) - query_qualities = factory.LazyAttribute(lambda o: fake.quality_string(length=len(o.query_sequence))) - flag = factory.LazyAttribute(lambda _: random.getrandbits(16)) - reference_id = 0 - reference_start = factory.LazyAttribute(lambda _: random.randint(1, 300000000)) - next_reference_start = factory.LazyAttribute(lambda o: o.reference_start - random.randint(-700, 700)) - mapping_quality = factory.LazyAttribute(lambda _: random.randint(0, 255)) - cigarstring = factory.LazyAttribute(lambda o: fake.cigar_string(length=len(o.query_sequence))) - mc = factory.LazyAttribute(lambda _: fake.cigar_string(length=random.randint(50, 200))) - - -@pytest.mark.dev -@pytest.mark.parametrize("test_read", [ReadFactory().segment for _ in range(1000)]) -def test_smoke(test_read): - mut_pos = random.randint(1, len(test_read.query_sequence) - 1) - start = test_read.reference_start + mut_pos - alt = random.choices([test_read.query_sequence[mut_pos:mut_pos + random.randint(1, 3)], '.'], cum_weights=[66, 100])[0] - if alt == '.': - mut_type_str = 'D' - stop = start + random.randint(1, 3) - elif len(alt) == 1: - mut_type_str = random.choice(['S', 'I']) - stop = start + 1 - else: - mut_type_str = random.choice(['S', 'I']) - stop = start + 1 if mut_type_str == 'I' else start + len(alt) - vflag = hp2.validate_read(test_read, - vcf_start=start, - vcf_stop=stop, - alt=alt, - mut_type=mut_type_str, - min_mapqual=11, - min_clipqual=35, - min_basequal=25) - print(format(vflag, '010b')) diff --git a/test/test_validate_read_validate.py b/test/test_validate_read_validate.py deleted file mode 100644 index 09cc308..0000000 --- a/test/test_validate_read_validate.py +++ /dev/null @@ -1,258 +0,0 @@ -# hairpin2 -# -# Copyright (C) 2024 Genome Research Ltd. 
-# -# Author: Alex Byrne -# -# This program is free software: you can redistribute it and/or modify -# it under the terms of the GNU Affero General Public License as published by -# the Free Software Foundation, either version 3 of the License, or -# (at your option) any later version. - -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU Affero General Public License for more details. - -# You should have received a copy of the GNU Affero General Public License -# along with this program. If not, see . - - -from hairpin2 import main as hp2 -from hairpin2 import constants as c -import pysam -import copy -import pytest - - -# BASIS PATH TESTING -# test every node and edge at least once -# N.B. -# pysam guards against: -# quality and seq length mismatch -# reference id is none -# ---- -# perfect read pair: -r = pysam.AlignedSegment() -r.query_name = 'read1' -r.query_sequence = 'CTGDAAAACC' -r.query_qualities = pysam.qualitystring_to_array('AAAAAAAAAA') -r.flag = 0x43 -r.reference_id = 0 -r.reference_start = 95 -r.next_reference_start = 95 -r.mapping_quality = 20 -r.cigarstring = '10M' -r.set_tag('MC', '10M') - - -@pytest.mark.validate -def test_path_unsupported_alt(): - with pytest.raises(ValueError): - hp2.validate_read(read=r, - vcf_start=99, - vcf_stop=100, - alt='8', - mut_type='S', - min_mapqual=11, - min_clipqual=35, - min_basequal=25) - - -@pytest.mark.validate -def test_path_unsupported_mut_type(): - with pytest.raises(ValueError): - hp2.validate_read(read=r, - vcf_start=99, - vcf_stop=100, - alt='A', - mut_type='8', - min_mapqual=11, - min_clipqual=35, - min_basequal=25) - - -@pytest.mark.validate -def test_path_missing_mc(): - rc = copy.deepcopy(r) - rc.set_tag('MC', None) - assert hp2.validate_read(read=rc, - vcf_start=99, - vcf_stop=100, - alt='A', - mut_type='S', - min_mapqual=11, - min_clipqual=35, - 
min_basequal=25) == c.ValidatorFlags.READ_FIELDS_MISSING.value - - -@pytest.mark.validate -def test_path_missing_field(): - rc = copy.deepcopy(r) - rc.cigarstring = None - assert hp2.validate_read(read=rc, - vcf_start=99, - vcf_stop=100, - alt='A', - mut_type='S', - min_mapqual=11, - min_clipqual=35, - min_basequal=25) == c.ValidatorFlags.READ_FIELDS_MISSING.value - - -@pytest.mark.validate -def test_path_set_flag_mapqual_clipqual(): - rc = copy.deepcopy(r) - rc.flag = 0x200 - rc.cigarstring = '1S9M' - assert hp2.validate_read(read=rc, - vcf_start=99, - vcf_stop=100, - alt='A', - mut_type='S', - min_mapqual=30, - min_clipqual=40, - min_basequal=25) == (c.ValidatorFlags.FLAG.value | c.ValidatorFlags.MAPQUAL.value | c.ValidatorFlags.CLIPQUAL.value) - - -@pytest.mark.validate -def test_path_sub_not_aligned(): - assert hp2.validate_read(read=r, - vcf_start=200, - vcf_stop=100, - alt='A', - mut_type='S', - min_mapqual=11, - min_clipqual=35, - min_basequal=25) == c.ValidatorFlags.NOT_ALIGNED.value - - -@pytest.mark.validate -def test_path_bad_sub(): - assert hp2.validate_read(read=r, - vcf_start=99, - vcf_stop=100, - alt='T', - mut_type='S', - min_mapqual=11, - min_clipqual=35, - min_basequal=50) == (c.ValidatorFlags.NOT_ALT.value | c.ValidatorFlags.BASEQUAL.value) - - -@pytest.mark.validate -def test_path_good_sub(): - assert hp2.validate_read(read=r, - vcf_start=99, - vcf_stop=100, - alt='A', - mut_type='S', - min_mapqual=11, - min_clipqual=35, - min_basequal=25) == c.ValidatorFlags.CLEAR.value - - -# checks cigar ops -@pytest.mark.validate -def test_path_del_bad_op(): - assert hp2.validate_read(read=r, - vcf_start=99, - vcf_stop=100, - alt='.', - mut_type='D', - min_mapqual=11, - min_clipqual=35, - min_basequal=25) == c.ValidatorFlags.BAD_OP.value - - -# 2bp del -@pytest.mark.validate -def test_path_good_del(): - rc = copy.deepcopy(r) - rc.cigarstring = '4M2D6M' - assert hp2.validate_read(read=rc, - vcf_start=99, - vcf_stop=101, - alt='.', - mut_type='D', - 
min_mapqual=11, - min_clipqual=35, - min_basequal=25) == c.ValidatorFlags.CLEAR.value - - -@pytest.mark.validate -def test_path_ins_not_aligned(): - assert hp2.validate_read(read=r, - vcf_start=200, - vcf_stop=100, - alt='A', - mut_type='I', - min_mapqual=11, - min_clipqual=35, - min_basequal=25) == c.ValidatorFlags.NOT_ALIGNED.value - - -@pytest.mark.validate -def test_path_ins_short(): - assert hp2.validate_read(read=r, - vcf_start=99, - vcf_stop=100, - alt='ATTTTTTTTTTTTTT', - mut_type='I', - min_mapqual=11, - min_clipqual=35, - min_basequal=25) == c.ValidatorFlags.SHORT.value - - -@pytest.mark.validate -def test_path_bad_ins(): - assert hp2.validate_read(read=r, - vcf_start=99, - vcf_stop=100, - alt='AC', - mut_type='I', - min_mapqual=11, - min_clipqual=35, - min_basequal=25) == (c.ValidatorFlags.BAD_OP.value | c.ValidatorFlags.NOT_ALT.value) - - -@pytest.mark.validate -def test_path_good_ins(): - rc = copy.deepcopy(r) - rc.cigarstring = '5M2I3M' - assert hp2.validate_read(read=rc, - vcf_start=99, - vcf_stop=100, - alt='AA', - mut_type='I', - min_mapqual=11, - min_clipqual=35, - min_basequal=25) == c.ValidatorFlags.CLEAR.value - - -@pytest.mark.validate -def test_path_overlap(): - rc = copy.deepcopy(r) - rc.flag = 0x83 - assert hp2.validate_read(read=rc, - vcf_start=99, - vcf_stop=100, - alt='A', - mut_type='S', - min_mapqual=11, - min_clipqual=35, - min_basequal=25) == c.ValidatorFlags.OVERLAP.value - - -@pytest.mark.validate -def test_path_no_overlap(): - rc = copy.deepcopy(r) - rc.flag = 0x83 - rc.set_tag('MC', '3M') - assert hp2.validate_read(read=rc, - vcf_start=99, - vcf_stop=100, - alt='A', - mut_type='S', - min_mapqual=11, - min_clipqual=35, - min_basequal=25) == c.ValidatorFlags.CLEAR.value From c3ef60f1cc193638f2d5caf8becb08ec558a82cb Mon Sep 17 00:00:00 2001 From: ab63 Date: Wed, 23 Oct 2024 15:00:23 +0000 Subject: [PATCH 144/165] update test runner; version --- docker-run-unit-tests.sh | 2 +- pyproject.toml | 3 +-- 2 files changed, 2 insertions(+), 
3 deletions(-) mode change 100644 => 100755 docker-run-unit-tests.sh diff --git a/docker-run-unit-tests.sh b/docker-run-unit-tests.sh old mode 100644 new mode 100755 index ccc2dbd..dc68b4d --- a/docker-run-unit-tests.sh +++ b/docker-run-unit-tests.sh @@ -15,5 +15,5 @@ pip install \ faker-biology==0.6.4 \ factory-boy==3.3.1 \ pysam==0.22 && \ -pytest --cov="${PKG_DIR}" "${TEST_DIR}" +pytest -m "validate" --cov="${PKG_DIR}" "${TEST_DIR}" diff --git a/pyproject.toml b/pyproject.toml index f4b2dde..be64602 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "hairpin2" -version = "0.0.2a" +version = "0.0.3rc" description = "CLI implementation of the hairpin detection algorithm concieved by Ellis et al, 2020." authors = ["Alex Byrne "] license = "AGPL3" @@ -23,7 +23,6 @@ factory-boy = "^3.3.1" requires = ["poetry-core"] build-backend = "poetry.core.masonry.api" - [tool.pytest.ini_options] markers = [ "dev: development tests", From 1de681c9b2d7262f7fa8b3f9d1c8bfec65e44835 Mon Sep 17 00:00:00 2001 From: ab63 Date: Wed, 23 Oct 2024 16:31:00 +0000 Subject: [PATCH 145/165] missed tests --- hairpin2/main.py | 50 ++++++++++++------------- hairpin2/ref2seq.py | 27 +++---------- test/test_flag_read_alt_validate.py | 20 ---------- test/test_flag_read_broad_valiate.py | 20 ---------- test/test_ref2querypos_validate.py | 33 ++++++++++++++++ test/test_ref_end_via_cigar_validate.py | 18 +++++++++ test/test_test_variant_AL_validate.py | 21 ----------- test/test_test_variant_HP_validate.py | 20 ---------- 8 files changed, 80 insertions(+), 129 deletions(-) create mode 100644 test/test_ref2querypos_validate.py create mode 100644 test/test_ref_end_via_cigar_validate.py diff --git a/hairpin2/main.py b/hairpin2/main.py index 37fe7d8..66e0e6d 100644 --- a/hairpin2/main.py +++ b/hairpin2/main.py @@ -94,20 +94,33 @@ def flag_read_alt( invalid_flag = c.ValidatorFlags.CLEAR.value - if mut_type == 'S': # SUB + if mut_type in ['S', 'I']: try: - mut_pos, _ = 
r2s.ref2querypos(read, vcf_start) + mut_pos = r2s.ref2querypos(read, vcf_start) except IndexError: invalid_flag |= c.ValidatorFlags.NOT_ALIGNED.value else: - if read.query_sequence[mut_pos:mut_pos + len(alt)] != alt: # type: ignore - can't be none - invalid_flag |= c.ValidatorFlags.NOT_ALT.value - if any([bq < min_basequal - for bq - in read.query_qualities[mut_pos:mut_pos + len(alt)]]): # type: ignore - can't be none - invalid_flag |= c.ValidatorFlags.BASEQUAL.value + if mut_type == 'S': # SUB + if read.query_sequence[mut_pos:mut_pos + len(alt)] != alt: # type: ignore - can't be none + invalid_flag |= c.ValidatorFlags.NOT_ALT.value + if any([bq < min_basequal + for bq + in read.query_qualities[mut_pos:mut_pos + len(alt)]]): # type: ignore - can't be none + invalid_flag |= c.ValidatorFlags.BASEQUAL.value + if mut_type == 'I': # INS - mut_pos is position immediately before insertion + if mut_pos + len(alt) > read.query_length: + invalid_flag |= c.ValidatorFlags.SHORT.value + else: + mut_alns = [(q, r) + for q, r + in read.get_aligned_pairs() + if q in range(mut_pos + 1, mut_pos + len(alt) + 1)] + if any([r is not None for _, r in mut_alns]): + invalid_flag |= c.ValidatorFlags.BAD_OP.value + if read.query_sequence[mut_pos + 1:mut_pos + len(alt) + 1] != alt: # type: ignore - can't be none + invalid_flag |= c.ValidatorFlags.NOT_ALT.value # DEL - doesn't check for matches before and after... 
- elif mut_type == 'D': + if mut_type == 'D': # this could error if read doesn't cover region (as could all) mut_alns = [q for q, r @@ -115,23 +128,6 @@ def flag_read_alt( if r in range(vcf_start, vcf_stop)] if any([x is not None for x in mut_alns]): invalid_flag |= c.ValidatorFlags.BAD_OP.value - elif mut_type == 'I': # INS - try: - prior_pos, _ = r2s.ref2querypos(read, vcf_start) - except IndexError: - invalid_flag |= c.ValidatorFlags.NOT_ALIGNED.value - else: - if prior_pos + len(alt) > read.query_length: - invalid_flag |= c.ValidatorFlags.SHORT.value - else: - mut_alns = [(q, r) - for q, r - in read.get_aligned_pairs() - if q in range(prior_pos + 1, prior_pos + len(alt) + 1)] - if any([r is not None for _, r in mut_alns]): - invalid_flag |= c.ValidatorFlags.BAD_OP.value - if read.query_sequence[prior_pos + 1:prior_pos + len(alt) + 1] != alt: # type: ignore - can't be none - invalid_flag |= c.ValidatorFlags.NOT_ALT.value return invalid_flag @@ -256,7 +252,7 @@ def test_variant_HP( near_start_r: list[bool] = [] for read in mut_reads: - mut_qpos, _ = r2s.ref2querypos(read, vstart) + mut_qpos = r2s.ref2querypos(read, vstart) if read.flag & 0x10: # +1 to include last base in length la2m = read.query_alignment_end - mut_qpos + 1 diff --git a/hairpin2/ref2seq.py b/hairpin2/ref2seq.py index 4b507f1..09030c8 100644 --- a/hairpin2/ref2seq.py +++ b/hairpin2/ref2seq.py @@ -19,45 +19,30 @@ import pysam -from hairpin2 import constants as c def ref2querypos( bam_record: pysam.AlignedSegment, ref_pos: int, - get_cig: bool = True ) -> tuple[int, int | None]: pos_aln = bam_record.get_aligned_pairs() - query_pos = pos_op = None + query_pos = None for aln_pair in pos_aln: if aln_pair[1] == ref_pos: query_pos = aln_pair[0] if query_pos is None or len(pos_aln) == 0: raise IndexError('reference position not covered by read') - elif get_cig: - # since position is 0-indexed, add 1 to get distance - dist2op = ref_pos - bam_record.reference_start + 1 - cig = bam_record.cigartuples - if 
cig is None or len(cig) == 0: - raise ValueError('no cigar tuples available for pysam record') - sum_len = 0 - while len(cig) > 0: - cig_pair = cig.pop(0) - if cig_pair[0] != c.Ops.SOFT.value: - sum_len += cig_pair[1] - if dist2op <= sum_len: - pos_op = cig_pair[0] - if pos_op is None: - raise ValueError('cigar op could not be recovered') - return query_pos, pos_op + return query_pos def ref_end_via_cigar( cig_str: str, ref_start: int ) -> int: - if not cig_str[0].isdigit() or len(cig_str) < 2: - raise ValueError('cigar string misformatted') + if (not cig_str[0].isdigit() or + not all([(c.isalnum() or c == '=') for c in cig_str]) or + len(cig_str) < 2): + raise ValueError('could not interpret cigar string {}'.format(cig_str)) cig_l = [] digit_accumulator: str = '' for char in cig_str: diff --git a/test/test_flag_read_alt_validate.py b/test/test_flag_read_alt_validate.py index 34b11bf..8e7515f 100644 --- a/test/test_flag_read_alt_validate.py +++ b/test/test_flag_read_alt_validate.py @@ -1,23 +1,3 @@ -# hairpin2 -# -# Copyright (C) 2024 Genome Research Ltd. -# -# Author: Alex Byrne -# -# This program is free software: you can redistribute it and/or modify -# it under the terms of the GNU Affero General Public License as published by -# the Free Software Foundation, either version 3 of the License, or -# (at your option) any later version. - -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU Affero General Public License for more details. - -# You should have received a copy of the GNU Affero General Public License -# along with this program. If not, see . 
- - from hairpin2.main import flag_read_alt from hairpin2 import constants as c import pysam diff --git a/test/test_flag_read_broad_valiate.py b/test/test_flag_read_broad_valiate.py index d65f0c1..abfe9af 100644 --- a/test/test_flag_read_broad_valiate.py +++ b/test/test_flag_read_broad_valiate.py @@ -1,23 +1,3 @@ -# hairpin2 -# -# Copyright (C) 2024 Genome Research Ltd. -# -# Author: Alex Byrne -# -# This program is free software: you can redistribute it and/or modify -# it under the terms of the GNU Affero General Public License as published by -# the Free Software Foundation, either version 3 of the License, or -# (at your option) any later version. - -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU Affero General Public License for more details. - -# You should have received a copy of the GNU Affero General Public License -# along with this program. If not, see . 
- - from hairpin2.main import flag_read_broad from hairpin2 import constants as c import pysam diff --git a/test/test_ref2querypos_validate.py b/test/test_ref2querypos_validate.py new file mode 100644 index 0000000..a1d4b9c --- /dev/null +++ b/test/test_ref2querypos_validate.py @@ -0,0 +1,33 @@ +from hairpin2.ref2seq import ref2querypos +import pytest +import pysam + + +# BASIS PATH TESTING (ish) +# test every node and edge at least once +# ---- +# perfect read pair: +r = pysam.AlignedSegment() +r.query_name = 'read1' +r.query_sequence = 'CTGDAAAACC' +r.query_qualities = pysam.qualitystring_to_array('AAAAAAAAAA') +r.flag = 0x43 +r.reference_id = 0 +r.reference_start = 95 +r.next_reference_start = 95 +r.mapping_quality = 20 +r.cigarstring = '10M' +r.set_tag('MC', '10M') + + +@pytest.mark.validate +def test_path_indexerror(): + with pytest.raises(IndexError): + ref2querypos(r, 1000) + + +@pytest.mark.validate +def test_path_good(): + expected = 5 + result = ref2querypos(r, 100) + assert expected == result diff --git a/test/test_ref_end_via_cigar_validate.py b/test/test_ref_end_via_cigar_validate.py new file mode 100644 index 0000000..c2e48e5 --- /dev/null +++ b/test/test_ref_end_via_cigar_validate.py @@ -0,0 +1,18 @@ +from hairpin2.ref2seq import ref_end_via_cigar +import pytest + + +# BASIS PATH TESTING (ish) +# test every node and edge at least once +# ---- +@pytest.mark.validate +def test_path_valueerror(): + with pytest.raises(ValueError): + ref_end_via_cigar('30M2I* ,', 0) + + +@pytest.mark.validate +def test_path_normal(): + expected = 50 + result = ref_end_via_cigar('20M10I30M', 0) + assert expected == result diff --git a/test/test_test_variant_AL_validate.py b/test/test_test_variant_AL_validate.py index 8172ba8..6b6a63b 100644 --- a/test/test_test_variant_AL_validate.py +++ b/test/test_test_variant_AL_validate.py @@ -1,23 +1,3 @@ -# hairpin2 -# -# Copyright (C) 2024 Genome Research Ltd. 
-# -# Author: Alex Byrne -# -# This program is free software: you can redistribute it and/or modify -# it under the terms of the GNU Affero General Public License as published by -# the Free Software Foundation, either version 3 of the License, or -# (at your option) any later version. - -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU Affero General Public License for more details. - -# You should have received a copy of the GNU Affero General Public License -# along with this program. If not, see . - - from hairpin2.main import test_variant_AL from hairpin2 import constants as c import pysam @@ -27,7 +7,6 @@ # BASIS PATH TESTING (ish) # test every node and edge at least once -# there's some duplication here but it's clearer to test explicitly # ---- # perfect read pair: r = pysam.AlignedSegment() diff --git a/test/test_test_variant_HP_validate.py b/test/test_test_variant_HP_validate.py index f36b9dd..d1bf628 100644 --- a/test/test_test_variant_HP_validate.py +++ b/test/test_test_variant_HP_validate.py @@ -1,23 +1,3 @@ -# hairpin2 -# -# Copyright (C) 2024 Genome Research Ltd. -# -# Author: Alex Byrne -# -# This program is free software: you can redistribute it and/or modify -# it under the terms of the GNU Affero General Public License as published by -# the Free Software Foundation, either version 3 of the License, or -# (at your option) any later version. - -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU Affero General Public License for more details. - -# You should have received a copy of the GNU Affero General Public License -# along with this program. If not, see . 
- - from hairpin2.main import test_variant_HP from hairpin2 import constants as c import pysam From 9d688e888e3b6819c3aa71ff87052246a03a52e7 Mon Sep 17 00:00:00 2001 From: ab63 Date: Wed, 23 Oct 2024 18:30:20 +0100 Subject: [PATCH 146/165] little bits and pieces --- hairpin2/main.py | 14 ++++++++------ hairpin2/ref2seq.py | 2 +- test/test_flag_read_alt_validate.py | 15 +++++++++++++-- 3 files changed, 22 insertions(+), 9 deletions(-) diff --git a/hairpin2/main.py b/hairpin2/main.py index 66e0e6d..b247cb9 100644 --- a/hairpin2/main.py +++ b/hairpin2/main.py @@ -121,11 +121,13 @@ def flag_read_alt( invalid_flag |= c.ValidatorFlags.NOT_ALT.value # DEL - doesn't check for matches before and after... if mut_type == 'D': - # this could error if read doesn't cover region (as could all) + rng = list(range(vcf_start, vcf_stop)) mut_alns = [q for q, r in read.get_aligned_pairs() - if r in range(vcf_start, vcf_stop)] + if r in rng] + if len(mut_alns) != len(rng): + invalid_flag |= c.ValidatorFlags.SHORT.value if any([x is not None for x in mut_alns]): invalid_flag |= c.ValidatorFlags.BAD_OP.value @@ -296,9 +298,9 @@ def test_variant_HP( frac_lt_thresh = (sum(near_start_f + near_start_r) / (len(near_start_f) + len(near_start_r))) if (frac_lt_thresh < 0.9 or - (range_f > 2 and range_r > 2 and sd_f > 2 and sd_r > 2) or - (range_f > 1 and sd_f > 10) or - (range_r > 1 and sd_r > 10)): + (range_f > 2 and range_r > 2 and sd_f > 2 and sd_r > 2) or # type: ignore + (range_f > 1 and sd_f > 10) or # type: ignore + (range_r > 1 and sd_r > 10)): # type: ignore hp_filt.code = c.FiltCodes.SIXTYBI.value # 60B(i) else: hp_filt.code = c.FiltCodes.SIXTYBI.value @@ -606,7 +608,7 @@ def main_cli() -> None: for filter in filter_bundle: if filter.flag: record.filter.add(filter.name) - record.info.update({filter.name: '|'.join( + record.info.update({filter.name: '|'.join( # type: ignore [alt] + [str(f) if type(f) diff --git a/hairpin2/ref2seq.py b/hairpin2/ref2seq.py index 09030c8..c923255 100644 
--- a/hairpin2/ref2seq.py +++ b/hairpin2/ref2seq.py @@ -24,7 +24,7 @@ def ref2querypos( bam_record: pysam.AlignedSegment, ref_pos: int, -) -> tuple[int, int | None]: +) -> int: pos_aln = bam_record.get_aligned_pairs() query_pos = None for aln_pair in pos_aln: diff --git a/test/test_flag_read_alt_validate.py b/test/test_flag_read_alt_validate.py index 8e7515f..515e538 100644 --- a/test/test_flag_read_alt_validate.py +++ b/test/test_flag_read_alt_validate.py @@ -29,7 +29,7 @@ def test_path_unsupported_mut_type(): vcf_start=99, vcf_stop=100, alt='A', - mut_type='8', + mut_type='8', # type: ignore min_basequal=25) @@ -70,7 +70,18 @@ def test_path_good_sub(): assert expected == result -# checks cigar ops +@pytest.mark.validate +def test_path_del_short(): + expected = c.ValidatorFlags.SHORT.value + result = flag_read_alt(read=r, + vcf_start=99, + vcf_stop=110, + alt='.', + mut_type='D', + min_basequal=25) + assert expected & result + + @pytest.mark.validate def test_path_del_bad_op(): expected = c.ValidatorFlags.BAD_OP.value From 64457d5325cb756ced9286552c221965a21e44c0 Mon Sep 17 00:00:00 2001 From: ab63 Date: Thu, 24 Oct 2024 13:15:37 +0000 Subject: [PATCH 147/165] Luca's suggestions --- hairpin2/main.py | 14 ++--- test/test_flag_read_alt_validate.py | 3 +- test/test_flag_read_broad_valiate.py | 9 +-- test/test_test_variant_AL_validate.py | 48 --------------- test/test_test_variant_HP_validate.py | 85 --------------------------- 5 files changed, 14 insertions(+), 145 deletions(-) delete mode 100644 test/test_test_variant_AL_validate.py delete mode 100644 test/test_test_variant_HP_validate.py diff --git a/hairpin2/main.py b/hairpin2/main.py index b247cb9..d761610 100644 --- a/hairpin2/main.py +++ b/hairpin2/main.py @@ -214,7 +214,7 @@ def alt_filter_reads( return filtered_reads -def test_variant_AL( +def is_variant_AL( mut_reads: Iterable[pysam.AlignedSegment], al_thresh: float = 0.93 ) -> c.ALFilter: @@ -240,7 +240,7 @@ def test_variant_AL( # per Peter's implementation 
# can set hairpin for mutations nowhere near alignment start # expose more ellis conditions as parameters? -def test_variant_HP( +def is_variant_HP( vstart: int, mut_reads: Iterable[pysam.AlignedSegment], position_fraction_thresh: float = 0.15 @@ -375,11 +375,11 @@ def test_record_per_alt( filt_d[alt] = c.Filters(c.ALFilter(code=c.FiltCodes.INSUFFICIENT_READS.value), c.HPFilter(code=c.FiltCodes.INSUFFICIENT_READS.value)) else: - filt_d[alt] = c.Filters(test_variant_AL(alt_filt_reads, - al_thresh), - test_variant_HP(vcf_rec.start, - alt_filt_reads, - position_fraction)) + filt_d[alt] = c.Filters(is_variant_AL(alt_filt_reads, + al_thresh), + is_variant_HP(vcf_rec.start, + alt_filt_reads, + position_fraction)) return filt_d diff --git a/test/test_flag_read_alt_validate.py b/test/test_flag_read_alt_validate.py index 515e538..0fb6c2a 100644 --- a/test/test_flag_read_alt_validate.py +++ b/test/test_flag_read_alt_validate.py @@ -1,6 +1,7 @@ from hairpin2.main import flag_read_alt from hairpin2 import constants as c import pysam +from pysam.libcalignedsegment import SAM_FLAGS as s import copy import pytest @@ -13,7 +14,7 @@ r.query_name = 'read1' r.query_sequence = 'CTGDAAAACC' r.query_qualities = pysam.qualitystring_to_array('AAAAAAAAAA') -r.flag = 0x43 +r.flag = s.FPAIRED | s.FPROPER_PAIR | s.FREAD1 # 0x43 r.reference_id = 0 r.reference_start = 95 r.next_reference_start = 95 diff --git a/test/test_flag_read_broad_valiate.py b/test/test_flag_read_broad_valiate.py index abfe9af..48d51b2 100644 --- a/test/test_flag_read_broad_valiate.py +++ b/test/test_flag_read_broad_valiate.py @@ -1,6 +1,7 @@ from hairpin2.main import flag_read_broad from hairpin2 import constants as c import pysam +from pysam.libcalignedsegment import SAM_FLAGS as s import copy import pytest @@ -13,7 +14,7 @@ r.query_name = 'read1' r.query_sequence = 'CTGDAAAACC' r.query_qualities = pysam.qualitystring_to_array('AAAAAAAAAA') -r.flag = 0x43 +r.flag = s.FPAIRED | s.FPROPER_PAIR | s.FREAD1 # 0x43 
r.reference_id = 0 r.reference_start = 95 r.next_reference_start = 95 @@ -62,7 +63,7 @@ def test_path_set_flag_mapqual_clipqual(): | c.ValidatorFlags.MAPQUAL.value | c.ValidatorFlags.CLIPQUAL.value) rc = copy.deepcopy(r) - rc.flag = 0x200 + rc.flag = s.FQCFAIL # 0x200 rc.cigarstring = '1S9M' result = flag_read_broad(read=rc, vcf_start=99, @@ -75,7 +76,7 @@ def test_path_set_flag_mapqual_clipqual(): def test_path_overlap(): expected = c.ValidatorFlags.OVERLAP.value rc = copy.deepcopy(r) - rc.flag = 0x83 + rc.flag = s.FPAIRED | s.FPROPER_PAIR | s.FREAD2 # 0x80 result = flag_read_broad(read=rc, vcf_start=99, min_mapqual=11, @@ -87,7 +88,7 @@ def test_path_overlap(): def test_path_no_overlap(): expected = c.ValidatorFlags.CLEAR.value rc = copy.deepcopy(r) - rc.flag = 0x83 + rc.flag = s.FPAIRED | s.FPROPER_PAIR | s.FREAD2 # 0x80 rc.set_tag('MC', '3M') result = flag_read_broad(read=rc, vcf_start=99, diff --git a/test/test_test_variant_AL_validate.py b/test/test_test_variant_AL_validate.py deleted file mode 100644 index 6b6a63b..0000000 --- a/test/test_test_variant_AL_validate.py +++ /dev/null @@ -1,48 +0,0 @@ -from hairpin2.main import test_variant_AL -from hairpin2 import constants as c -import pysam -import pytest -import copy - - -# BASIS PATH TESTING (ish) -# test every node and edge at least once -# ---- -# perfect read pair: -r = pysam.AlignedSegment() -r.query_name = 'read1' -r.query_sequence = 'CTGDAAAACC' * 10 -r.query_qualities = pysam.qualitystring_to_array('AAAAAAAAAA' * 10) -r.flag = 0x43 -r.reference_id = 0 -r.reference_start = 100 -r.next_reference_start = 100 -r.mapping_quality = 20 -r.cigarstring = '100M' -r.set_tag('MC', '100M') - - -@pytest.mark.validate -def test_path_AL_true_code_2(): - expected = c.ALFilter(flag=True, code=2, avg_as=0.5) - s1r1 = copy.deepcopy(r) # no AS, cover except KeyError - s1r2 = copy.deepcopy(r) - s1r2.set_tag('AS', 50) # low AS - result = test_variant_AL([s1r1, s1r2]) - assert expected == result - - -@pytest.mark.validate 
-def test_path_AL_false_code_2(): - expected = c.ALFilter(flag=False, code=2, avg_as=0.99) - s1r1 = copy.deepcopy(r) - s1r1.set_tag('AS', 99) # high AS - result = test_variant_AL([s1r1]) - assert expected == result - - -@pytest.mark.validate -def test_path_AL_false_code_3(): - expected = c.ALFilter(code=3) - result = test_variant_AL([]) - assert expected == result diff --git a/test/test_test_variant_HP_validate.py b/test/test_test_variant_HP_validate.py deleted file mode 100644 index d1bf628..0000000 --- a/test/test_test_variant_HP_validate.py +++ /dev/null @@ -1,85 +0,0 @@ -from hairpin2.main import test_variant_HP -from hairpin2 import constants as c -import pysam -import pytest -import copy - - -# BASIS PATH TESTING (ish) -# test every node and edge at least once -# ---- -# perfect read pair: -r = pysam.AlignedSegment() -r.query_name = 'read1' -r.query_sequence = 'CTGDAAAACC' * 10 -r.query_qualities = pysam.qualitystring_to_array('AAAAAAAAAA' * 10) -r.flag = 0x43 -r.reference_id = 0 -r.reference_start = 100 -r.next_reference_start = 100 -r.mapping_quality = 20 -r.cigarstring = '100M' -r.set_tag('MC', '100M') - - -@pytest.mark.validate -def test_path_insufficient_reads(): - expected = c.HPFilter(code=3) - result = test_variant_HP(0, []) - assert expected == result - - -@pytest.mark.validate -def test_path_f_60ai_set(): - expected = c.HPFilter(flag=True, code=0) - result = test_variant_HP(150, [r, r]) - assert expected == result - - -@pytest.mark.validate -def test_path_f_60ai_noset(): - expected = c.HPFilter(code=0) - r1 = copy.deepcopy(r) - r1.reference_start = 90 - result = test_variant_HP(150, [r, r1]) - assert expected == result - - -@pytest.mark.validate -def test_path_r_60ai_set(): - expected = c.HPFilter(flag=True, code=0) - rr = copy.deepcopy(r) - rr.flag = rr.flag | 0x10 - result = test_variant_HP(150, [rr, rr]) - assert expected == result - - -@pytest.mark.validate -def test_path_r_60ai_noset(): - expected = c.HPFilter(code=0) - rr = copy.deepcopy(r) - 
rr.flag = rr.flag | 0x10 - rr1 = copy.deepcopy(rr) - rr1.reference_start = 90 - result = test_variant_HP(150, [rr, rr1]) - assert expected == result - - -@pytest.mark.validate -def test_path_60bi_set(): - expected = c.HPFilter(flag=True, code=1) - r1 = copy.deepcopy(r) - r1.reference_start = 190 - rr = copy.deepcopy(r) - rr.flag = rr.flag | 0x10 - result = test_variant_HP(198, [r1, r1, rr, rr]) - assert expected == result - - -@pytest.mark.validate -def test_path_60bi_noset(): - expected = c.HPFilter(code=1) - rr = copy.deepcopy(r) - rr.flag = rr.flag | 0x10 - result = test_variant_HP(150, [r, r, rr, rr]) - assert expected == result From 66581c259a041b4f219c06fd2e915f9069bd5ba6 Mon Sep 17 00:00:00 2001 From: ab63 Date: Thu, 24 Oct 2024 13:25:56 +0000 Subject: [PATCH 148/165] add renames --- test/test_is_variant_AL_validate.py | 48 ++++++++++++++++ test/test_is_variant_HP_validate.py | 85 +++++++++++++++++++++++++++++ 2 files changed, 133 insertions(+) create mode 100644 test/test_is_variant_AL_validate.py create mode 100644 test/test_is_variant_HP_validate.py diff --git a/test/test_is_variant_AL_validate.py b/test/test_is_variant_AL_validate.py new file mode 100644 index 0000000..7eed3e7 --- /dev/null +++ b/test/test_is_variant_AL_validate.py @@ -0,0 +1,48 @@ +from hairpin2.main import is_variant_AL +from hairpin2 import constants as c +import pysam +import pytest +import copy + + +# BASIS PATH TESTING (ish) +# test every node and edge at least once +# ---- +# perfect read pair: +r = pysam.AlignedSegment() +r.query_name = 'read1' +r.query_sequence = 'CTGDAAAACC' * 10 +r.query_qualities = pysam.qualitystring_to_array('AAAAAAAAAA' * 10) +r.flag = 0x43 +r.reference_id = 0 +r.reference_start = 100 +r.next_reference_start = 100 +r.mapping_quality = 20 +r.cigarstring = '100M' +r.set_tag('MC', '100M') + + +@pytest.mark.validate +def test_path_AL_true_code_2(): + expected = c.ALFilter(flag=True, code=2, avg_as=0.5) + s1r1 = copy.deepcopy(r) # no AS, cover except KeyError + 
s1r2 = copy.deepcopy(r) + s1r2.set_tag('AS', 50) # low AS + result = is_variant_AL([s1r1, s1r2]) + assert expected == result + + +@pytest.mark.validate +def test_path_AL_false_code_2(): + expected = c.ALFilter(flag=False, code=2, avg_as=0.99) + s1r1 = copy.deepcopy(r) + s1r1.set_tag('AS', 99) # high AS + result = is_variant_AL([s1r1]) + assert expected == result + + +@pytest.mark.validate +def test_path_AL_false_code_3(): + expected = c.ALFilter(code=3) + result = is_variant_AL([]) + assert expected == result diff --git a/test/test_is_variant_HP_validate.py b/test/test_is_variant_HP_validate.py new file mode 100644 index 0000000..2a05da1 --- /dev/null +++ b/test/test_is_variant_HP_validate.py @@ -0,0 +1,85 @@ +from hairpin2.main import is_variant_HP +from hairpin2 import constants as c +import pysam +import pytest +import copy + + +# BASIS PATH TESTING (ish) +# test every node and edge at least once +# ---- +# perfect read pair: +r = pysam.AlignedSegment() +r.query_name = 'read1' +r.query_sequence = 'CTGDAAAACC' * 10 +r.query_qualities = pysam.qualitystring_to_array('AAAAAAAAAA' * 10) +r.flag = 0x43 +r.reference_id = 0 +r.reference_start = 100 +r.next_reference_start = 100 +r.mapping_quality = 20 +r.cigarstring = '100M' +r.set_tag('MC', '100M') + + +@pytest.mark.validate +def test_path_insufficient_reads(): + expected = c.HPFilter(code=3) + result = is_variant_HP(0, []) + assert expected == result + + +@pytest.mark.validate +def test_path_f_60ai_set(): + expected = c.HPFilter(flag=True, code=0) + result = is_variant_HP(150, [r, r]) + assert expected == result + + +@pytest.mark.validate +def test_path_f_60ai_noset(): + expected = c.HPFilter(code=0) + r1 = copy.deepcopy(r) + r1.reference_start = 90 + result = is_variant_HP(150, [r, r1]) + assert expected == result + + +@pytest.mark.validate +def test_path_r_60ai_set(): + expected = c.HPFilter(flag=True, code=0) + rr = copy.deepcopy(r) + rr.flag = rr.flag | 0x10 + result = is_variant_HP(150, [rr, rr]) + assert 
expected == result + + +@pytest.mark.validate +def test_path_r_60ai_noset(): + expected = c.HPFilter(code=0) + rr = copy.deepcopy(r) + rr.flag = rr.flag | 0x10 + rr1 = copy.deepcopy(rr) + rr1.reference_start = 90 + result = is_variant_HP(150, [rr, rr1]) + assert expected == result + + +@pytest.mark.validate +def test_path_60bi_set(): + expected = c.HPFilter(flag=True, code=1) + r1 = copy.deepcopy(r) + r1.reference_start = 190 + rr = copy.deepcopy(r) + rr.flag = rr.flag | 0x10 + result = is_variant_HP(198, [r1, r1, rr, rr]) + assert expected == result + + +@pytest.mark.validate +def test_path_60bi_noset(): + expected = c.HPFilter(code=1) + rr = copy.deepcopy(r) + rr.flag = rr.flag | 0x10 + result = is_variant_HP(150, [r, r, rr, rr]) + assert expected == result From 9d061b4b560b109201ca64d5c2363743d4b686b9 Mon Sep 17 00:00:00 2001 From: ab63 Date: Thu, 24 Oct 2024 14:01:35 +0000 Subject: [PATCH 149/165] update README --- README.md | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 1c1354f..8116b26 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,8 @@ # hairpin2 -`hairpin2` – CLI implementation of the hairpin detection algorithm concieved by [Ellis et al, 2020](https://www.nature.com/articles/s41596-020-00437-6). +`hairpin2` – CLI implementation of the hairpin detection algorithm concieved by [Ellis et al, 2020](https://www.nature.com/articles/s41596-020-00437-6). + +`hairpin2` is designed to flag variants as possible cruciform artefacts. It operates on a VCF file containing one or more samples, and alignment files for all samples to be tested. For paired data, given a VCF, and BAM files for the samples of that VCF, return a VCF with variants flagged with `HPF` if they are suspected cruciform artefacts, and `ALF` if relevant reads have lower median alignment score per base than a specified threshold. 
The `ALF` filter indicates poor signal-to-noise, and provides additional confidence in the `HPF` filter – cruciform artefacts usually cause a marked decrease in alignment score. The `ALF` flag also may appear on variants without `HPF`, often indicating other artefacts associated with poor signal-to-noise. @@ -85,9 +87,10 @@ procedural: Parameters are hopefully mostly clear from the helptext, but some warrant further explanation: -- `--al-filter-threshold` – the default value of 0.93 was arrived at by trial and error – since different aligners/platforms calculate alignment score differently, you may want to modify this value appropriately. -- `--max-read-span` – long homopolymer tracts can cause stuttering, where a PCR duplicate will have, for example, an additional A in a tract of As. These reads will align a base or two earlier on the reference genome than they should. As a result pcr duplicate flag machinery fails and they are not flagged as duplicates. `MAX_READ_SPAN` is then the maximum +- position to use when detecting PCR duplicates. -- `--position-fraction` – cruciform artefacts usually contain segments that do not align to the reference genome, resulting in the segment being soft-clipped. The subsequent aligned portion will then contain false variants, which arise from the artefact. These false variants appear with anomalous regularity at alignment boundaries – unlike true variants. If, for a given variant, more than 90% of the variant bases are within `POSITION_FRACTION` of read edges, allow for calling `HPF` flag. +- --name-mapping – some variant callers, for example caveman, output sample names such as "TUMOUR" in VCF header columns. hairpin2 uses these column names to match to BAM samples via the SM tag - if these fields do not match, you'll need to provide a mapping here, for example "TUMOR:PD3738..." 
+- --al-filter-threshold – the default value of 0.93 was arrived at by trial and error – since different aligners/platforms calculate alignment score differently, you may want to modify this value appropriately. In "Mathijs' Scripts", the default was set at 0.87 for filtering on ASRD. +- --max-read-span – long homopolymer tracts can cause stuttering, where a PCR duplicate will have, for example, an additional A in a tract of As. These reads will align a base or two earlier on the reference genome than they should. As a result pcr duplicate flag machinery fails and they are not flagged as duplicates. `hairpin2` will attempt to filter out these duplicates, and MAX_READ_SPAN is then the maximum +- position to use during duplicate detection. +- --position-fraction – cruciform artefacts usually contain segments that do not align to the reference genome, resulting in the segment being soft-clipped. The subsequent aligned portion will then contain false variants, which arise from the artefact. These false variants appear with anomalous regularity at alignment boundaries – unlike true variants. If, for a given variant, more than 90% of the variant bases are within POSITION_FRACTION of read edges, allow for calling HPF flag. ### DETAILS @@ -108,3 +111,8 @@ The basic procedure of this implementation is as follows: > 4. on the results of the statistical analysis, pass or fail the record for the filters `ALF` and `HPF`, and log a code and relevant info to the `INFO` field indicating the reason for the decision The code has been written with the intention of clarity and extensibility – further understanding may be achieved by reading `hairpin2/main.py`. + + +### TESTING + +A test suite has been provided to prove the validity of the algorithm. To run these tests run `pytest -m "validate"` from within the install directory. `hairpin2` must have been installed from that same directory, and be availble on path. The tests can be found in the `test` directory. 
They are simple, and, upon having read them, it should be very easy to add your own further tests should you want to confirm any behaviour. From b2f10b047354d91dcfdffb058b48d5953110a7d8 Mon Sep 17 00:00:00 2001 From: ab63 Date: Thu, 24 Oct 2024 14:08:41 +0000 Subject: [PATCH 150/165] doc and dependencies --- README.md | 2 +- docker-run-unit-tests.sh | 2 -- pyproject.toml | 6 +----- 3 files changed, 2 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index 8116b26..c85051f 100644 --- a/README.md +++ b/README.md @@ -11,7 +11,7 @@ For paired data, given a VCF, and BAM files for the samples of that VCF, return * Python >= 3.10 – required * pysam >= 0.22.1 – installed automatically during install process (tested with 0.22.1 only) -* pytest - optional, only necessary to run tests +* pytest >= 8.2.2 - optional, only necessary to run tests ### INSTALLATION diff --git a/docker-run-unit-tests.sh b/docker-run-unit-tests.sh index dc68b4d..cf4f919 100755 --- a/docker-run-unit-tests.sh +++ b/docker-run-unit-tests.sh @@ -12,8 +12,6 @@ echo "Package source directory: ${PKG_DIR}" pip install \ pytest==8.2.2 \ pytest-cov==5.0.0 \ - faker-biology==0.6.4 \ - factory-boy==3.3.1 \ pysam==0.22 && \ pytest -m "validate" --cov="${PKG_DIR}" "${TEST_DIR}" diff --git a/pyproject.toml b/pyproject.toml index be64602..40b0030 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -14,10 +14,7 @@ pysam = "^0.22" hairpin2 = "hairpin2.main:main_cli" [tool.poetry.group.dev.dependencies] -pytest = "^8.3.3" -pytest-cov = "^5.0.0" -faker-biology = "^0.6.4" -factory-boy = "^3.3.1" +pytest = "^8.2.2" [build-system] requires = ["poetry-core"] @@ -25,6 +22,5 @@ build-backend = "poetry.core.masonry.api" [tool.pytest.ini_options] markers = [ - "dev: development tests", "validate: scientific validation tests" ] From 7b09b5548bb0e4ebc8fdd3271b0c1bafc57f7d4b Mon Sep 17 00:00:00 2001 From: ab63 Date: Thu, 24 Oct 2024 14:12:45 +0000 Subject: [PATCH 151/165] update README --- README.md | 24 
++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/README.md b/README.md index c85051f..ebfefbd 100644 --- a/README.md +++ b/README.md @@ -116,3 +116,27 @@ The code has been written with the intention of clarity and extensibility – fu ### TESTING A test suite has been provided to prove the validity of the algorithm. To run these tests run `pytest -m "validate"` from within the install directory. `hairpin2` must have been installed from that same directory, and be availble on path. The tests can be found in the `test` directory. They are simple, and, upon having read them, it should be very easy to add your own further tests should you want to confirm any behaviour. + + +### LICENCE + +``` +# hairpin2 +# +# Copyright (C) 2024 Genome Research Ltd. +# +# Author: Alex Byrne +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. + +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . +``` From c9c8dac34c2aefc82938931e633f7a808d8014f1 Mon Sep 17 00:00:00 2001 From: ab63 Date: Thu, 24 Oct 2024 14:14:10 +0000 Subject: [PATCH 152/165] update README --- README.md | 36 ++++++++++++++++++------------------ 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/README.md b/README.md index ebfefbd..2aded1f 100644 --- a/README.md +++ b/README.md @@ -121,22 +121,22 @@ A test suite has been provided to prove the validity of the algorithm. To run th ### LICENCE ``` -# hairpin2 -# -# Copyright (C) 2024 Genome Research Ltd. 
-# -# Author: Alex Byrne -# -# This program is free software: you can redistribute it and/or modify -# it under the terms of the GNU Affero General Public License as published by -# the Free Software Foundation, either version 3 of the License, or -# (at your option) any later version. - -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU Affero General Public License for more details. - -# You should have received a copy of the GNU Affero General Public License -# along with this program. If not, see . +hairpin2 + +Copyright (C) 2024 Genome Research Ltd. + +Author: Alex Byrne + +This program is free software: you can redistribute it and/or modify +it under the terms of the GNU Affero General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU Affero General Public License for more details. + +You should have received a copy of the GNU Affero General Public License +along with this program. If not, see . 
``` From 294b500997ba7861ad41a0566b8e2bfea4f805fc Mon Sep 17 00:00:00 2001 From: ab63 Date: Thu, 24 Oct 2024 14:37:05 +0000 Subject: [PATCH 153/165] change reporting --- hairpin2/main.py | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/hairpin2/main.py b/hairpin2/main.py index d761610..f921864 100644 --- a/hairpin2/main.py +++ b/hairpin2/main.py @@ -309,7 +309,7 @@ def is_variant_HP( return hp_filt -def test_record_per_alt( +def test_record_all_alts( alignments: dict[str, pysam.AlignmentFile], vcf_rec: pysam.VariantRecord, min_mapqual: int, @@ -587,7 +587,7 @@ def main_cli() -> None: for record in vcf_in_handle.fetch(): # type: ignore - program ensures not unbound try: - filter_d: dict[str, c.Filters] = test_record_per_alt( + filter_d: dict[str, c.Filters] = test_record_all_alts( vcf_sample_to_alignment_map, record, args.min_mapping_quality, @@ -609,13 +609,8 @@ def main_cli() -> None: if filter.flag: record.filter.add(filter.name) record.info.update({filter.name: '|'.join( # type: ignore - [alt] + - [str(f) - if type(f) - is not float - else str(round(f, 3)) - for f in filter - ][2:] + [alt, int(filter.flag), str(filter.code)] + + ([filter.avg_as] if filter.name == 'ALF' else []) )}) try: From a6695bd3c041b5c893dbb7a9664c3c8510068dda Mon Sep 17 00:00:00 2001 From: ab63 Date: Thu, 24 Oct 2024 16:30:06 +0100 Subject: [PATCH 154/165] little bits and pieces --- README.md | 4 ++-- hairpin2/constants.py | 2 +- hairpin2/helpers.py | 20 ++++++-------------- hairpin2/main.py | 27 ++++++++++++++++----------- 4 files changed, 25 insertions(+), 28 deletions(-) diff --git a/README.md b/README.md index 2aded1f..c2ff863 100644 --- a/README.md +++ b/README.md @@ -4,7 +4,7 @@ `hairpin2` is designed to flag variants as possible cruciform artefacts. It operates on a VCF file containing one or more samples, and alignment files for all samples to be tested. 
-For paired data, given a VCF, and BAM files for the samples of that VCF, return a VCF with variants flagged with `HPF` if they are suspected cruciform artefacts, and `ALF` if relevant reads have lower median alignment score per base than a specified threshold. The `ALF` filter indicates poor signal-to-noise, and provides additional confidence in the `HPF` filter – cruciform artefacts usually cause a marked decrease in alignment score. The `ALF` flag also may appear on variants without `HPF`, often indicating other artefacts associated with poor signal-to-noise. +Given a VCF, and BAM files for the samples of that VCF, return a VCF with variants flagged with `HPF` if they are suspected cruciform artefacts, and `ALF` if relevant reads have lower median alignment score per base than a specified threshold. The `ALF` filter indicates poor signal-to-noise, and provides additional confidence in the `HPF` filter – cruciform artefacts usually cause a marked decrease in alignment score. The `ALF` flag also may appear on variants without `HPF`, often indicating other artefacts associated with poor signal-to-noise. ### DEPENDENCIES @@ -95,7 +95,7 @@ Parameters are hopefully mostly clear from the helptext, but some warrant furthe ### DETAILS -The tool tests records in a VCF file and applies the `HPF` and `ALF` filter flags as appropriate. Reasoning for decisions is recorded in the INFO field of the VCF records, in the form `HPF=|` and `ALF=||`. The codes are as follows: +The tool tests records in a VCF file and applies the `HPF` and `ALF` filter flags as appropriate. Reasoning for decisions is recorded in the INFO field of the VCF records, in the form `HPF=||` and `ALF=|||`. The codes are as follows: > **0** – passed/failed on condition 60A(i) of Ellis et al. (`HPF` only) > **1** – passed/failed on condition 60B(i) of Ellis et al. 
(`HPF` only) diff --git a/hairpin2/constants.py b/hairpin2/constants.py index faee8a9..2b1cc86 100644 --- a/hairpin2/constants.py +++ b/hairpin2/constants.py @@ -51,7 +51,7 @@ 'DIFF', 'BACK'], start=0) -ValidatorFlags = Flag('ReadFlags', +ValidatorFlags = Flag('ValidatorFlags', ['CLEAR', 'FLAG', 'MAPQUAL', diff --git a/hairpin2/helpers.py b/hairpin2/helpers.py index 5ac97d1..f5113ab 100644 --- a/hairpin2/helpers.py +++ b/hairpin2/helpers.py @@ -32,17 +32,6 @@ def cleanup(code: int = c.EXIT_FAILURE, msg: None | str = None) -> None: sys.exit(code) -def test_options(args): - if not (0 < args.min_clip_quality < 93): - cleanup(msg='invalid --min-clip-quality; range 0-93') - if not (0 < args.min_mapping_quality < 60): - cleanup(msg='invalid --min-mapping-quality; range 0-60') - if not (0 < args.min_base_quality < 93): - cleanup(msg='invalid --min-base-quality; range 0-93') - if not (0 < args.position_fraction < 1): - cleanup(msg='invalid --position-fraction; range 0-1') - - def has_duplicates( l: list ) -> bool: @@ -58,12 +47,15 @@ def lists_not_equal( def print_flag( print_enum: Flag, - hex: bool = False ) -> None: - print([':'.join([str(e), hex(e.value) if hex else bin(e.value)]) for e in print_enum]) + pl = [] + for e in print_enum: + vs = '-'.join([str(int(e.value)), str(hex(e.value)), str(bin(e.value))]) + pl.append(': '.join([str(e), vs])) + print(pl) def print_enum( print_enum: IntEnum ) -> None: - print([e for e in print_enum]) + print([e for e in print_enum]) # type: ignore - iterating works fine diff --git a/hairpin2/main.py b/hairpin2/main.py index f921864..529a172 100644 --- a/hairpin2/main.py +++ b/hairpin2/main.py @@ -419,27 +419,27 @@ def main_cli() -> None: opt = parser.add_argument_group('extended') opt.add_argument('-al', '--al-filter-threshold', - help='threshold for median of read alignment score per base of all relevant reads, below which a variant is flagged as ALF - default: 0.93', + help='threshold for median of read alignment score per base of 
all relevant reads, below which a variant is flagged as ALF - default: 0.93, range: 0-', type=float) opt.add_argument('-mc', '--min-clip-quality', - help='discard reads with mean base quality of aligned bases below this value, if they have soft-clipped bases - default: 35', + help='discard reads with mean base quality of aligned bases below this value, if they have soft-clipped bases - default: 35, range: 0-93', type=int) opt.add_argument('-mq', '--min-mapping-quality', - help='discard reads with mapping quality below this value - default: 11', + help='discard reads with mapping quality below this value - default: 11, range: 0-60', type=int) opt.add_argument('-mb', '--min-base-quality', - help='discard reads with base quality at variant position below this value - default: 25', + help='discard reads with base quality at variant position below this value - default: 25, range: 0-93', type=int) opt.add_argument('-ms', '--max-read-span', - help='maximum +- position to use when detecting PCR duplicates - default: 6', + help='maximum +- position to use when detecting PCR duplicates. 
-1 will disable duplicate detection - default: 6, range: -1-', type=int) opt.add_argument('-pf', '--position-fraction', - help='>90%% of variant must occur within POSITION_FRACTION of read edges to allow HPF flag - default: 0.15', + help='>90%% of variant must occur within POSITION_FRACTION of read edges to allow HPF flag - default: 0.15, range: 0-1', type=float) proc = parser.add_argument_group('procedural') proc.add_argument('-r', @@ -482,7 +482,13 @@ def main_cli() -> None: setattr(args, k, c.DEFAULTS[k]) # test args are sensible, exit if not - h.test_options(args) + if not any([(args.al_filter_threshold >= 0), + (0 <= args.min_clip_quality <= 93), + (0 <= args.min_mapping_quality <= 60), + (0 <= args.min_base_quality <= 93), + (0 <= args.position_fraction <= 1) + (args.max_read_span >= -1)]): + h.cleanup(msg='extended arg out range, check helptext for ranges') try: vcf_in_handle = pysam.VariantFile(args.vcf_in) @@ -608,11 +614,10 @@ def main_cli() -> None: for filter in filter_bundle: if filter.flag: record.filter.add(filter.name) - record.info.update({filter.name: '|'.join( # type: ignore - [alt, int(filter.flag), str(filter.code)] + - ([filter.avg_as] if filter.name == 'ALF' else []) + record.info.update({filter.name: '|'.join( # type: ignore - unclear what pysam wants + [alt, str(int(filter.flag)), str(filter.code)] + + ([str(filter.avg_as)] if filter.name == 'ALF' else []) )}) - try: vcf_out_handle.write(record) # type:ignore except Exception as e: From 19efa2627aeafdebb4aef69631bc67f6553e7851 Mon Sep 17 00:00:00 2001 From: ab63 Date: Thu, 24 Oct 2024 16:31:25 +0100 Subject: [PATCH 155/165] update README --- README.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index c2ff863..c2d8fd5 100644 --- a/README.md +++ b/README.md @@ -4,7 +4,9 @@ `hairpin2` is designed to flag variants as possible cruciform artefacts. 
It operates on a VCF file containing one or more samples, and alignment files for all samples to be tested. -Given a VCF, and BAM files for the samples of that VCF, return a VCF with variants flagged with `HPF` if they are suspected cruciform artefacts, and `ALF` if relevant reads have lower median alignment score per base than a specified threshold. The `ALF` filter indicates poor signal-to-noise, and provides additional confidence in the `HPF` filter – cruciform artefacts usually cause a marked decrease in alignment score. The `ALF` flag also may appear on variants without `HPF`, often indicating other artefacts associated with poor signal-to-noise. +Given a VCF, and BAM files for the samples of that VCF, return a VCF with variants flagged with `HPF` if they are suspected cruciform artefacts, and `ALF` if relevant reads have lower median alignment score per base than a specified threshold. + +The `ALF` filter indicates poor signal-to-noise, and provides additional confidence in the `HPF` filter – cruciform artefacts usually cause a marked decrease in alignment score. The `ALF` flag also may appear on variants without `HPF`, often indicating other artefacts associated with poor signal-to-noise. ### DEPENDENCIES From 109febee4730ba674adec07e87c72e033f0fa30a Mon Sep 17 00:00:00 2001 From: ab63 Date: Thu, 24 Oct 2024 16:33:12 +0100 Subject: [PATCH 156/165] update README --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index c2d8fd5..821ad50 100644 --- a/README.md +++ b/README.md @@ -117,7 +117,7 @@ The code has been written with the intention of clarity and extensibility – fu ### TESTING -A test suite has been provided to prove the validity of the algorithm. To run these tests run `pytest -m "validate"` from within the install directory. `hairpin2` must have been installed from that same directory, and be availble on path. The tests can be found in the `test` directory. 
They are simple, and, upon having read them, it should be very easy to add your own further tests should you want to confirm any behaviour. +A test suite has been provided to prove the validity of the algorithm. To run these tests run `pytest -m "validate"` from within the install directory. `hairpin2` must have been installed from that same directory, and be available on path. The tests can be found in the `test` directory. They are simple, and, once you have read them, it should be very easy to add your own further tests should you want to confirm any behaviour. ### LICENCE From 162f819fac4b55d3ae4bacd844f4d482198fdd3f Mon Sep 17 00:00:00 2001 From: ab63 Date: Thu, 24 Oct 2024 16:44:05 +0100 Subject: [PATCH 157/165] update helptext --- README.md | 51 +++++++++++++++++++++++++++++++++++------------- hairpin2/main.py | 6 +++--- 2 files changed, 40 insertions(+), 17 deletions(-) diff --git a/README.md b/README.md index 821ad50..a404005 100644 --- a/README.md +++ b/README.md @@ -43,10 +43,15 @@ hairpin -h ### USAGE ``` -usage: hairpin2 [-h] [-v] -i VCF_IN -o VCF_OUT -a ALIGNMENTS [ALIGNMENTS ...] -f {s,b,c} [-al AL_FILTER_THRESHOLD] [-mc MIN_CLIP_QUALITY] [-mq MIN_MAPPING_QUALITY] - [-mb MIN_BASE_QUALITY] [-ms MAX_READ_SPAN] [-pf POSITION_FRACTION] [-r CRAM_REFERENCE] [-m VCF:aln [VCF:aln ...]] [-ji INPUT_JSON] [-jo OUTPUT_JSON] +usage: hairpin2 [-h] [-v] -i VCF_IN -o VCF_OUT -a ALIGNMENTS [ALIGNMENTS ...] + -f {s,b,c} [-al AL_FILTER_THRESHOLD] [-mc MIN_CLIP_QUALITY] + [-mq MIN_MAPPING_QUALITY] [-mb MIN_BASE_QUALITY] + [-ms MAX_READ_SPAN] [-pf POSITION_FRACTION] + [-r CRAM_REFERENCE] [-m VCF:aln [VCF:aln ...]] + [-ji INPUT_JSON] [-jo OUTPUT_JSON] -cruciform artefact flagging algorithm based on Ellis et al. 2020 (DOI: 10.1038/s41596-020-00437-6) +cruciform artefact flagging algorithm based on Ellis et al. 2020 (DOI: +10.1038/s41596-020-00437-6). See README for further explanation of parameters. 
info: -h, --help show this help message and exit @@ -58,31 +63,49 @@ mandatory: -o VCF_OUT, --vcf-out VCF_OUT path to write output VCF -a ALIGNMENTS [ALIGNMENTS ...], --alignments ALIGNMENTS [ALIGNMENTS ...] - list of paths to (S/B/CR)AMs (indicated by --format) for samples in input VCF, whitespace separated - (s/b/cr)ai expected in same directories + list of paths to (S/B/CR)AMs (indicated by --format) + for samples in input VCF, whitespace separated - + (s/b/cr)ai expected in same directories -f {s,b,c}, --format {s,b,c} - format of alignment files; s indicates SAM, b indicates BAM, and c indicates CRAM + format of alignment files; s indicates SAM, b + indicates BAM, and c indicates CRAM extended: -al AL_FILTER_THRESHOLD, --al-filter-threshold AL_FILTER_THRESHOLD - threshold for median of read alignment score per base of all relevant reads, below which a variant is flagged as ALF - default: 0.93 + threshold for median of read alignment score per base + of all relevant reads, below which a variant is + flagged as ALF - default: 0.93, range: 0- -mc MIN_CLIP_QUALITY, --min-clip-quality MIN_CLIP_QUALITY - discard reads with mean base quality of aligned bases below this value, if they have soft-clipped bases - default: 35 + discard reads with mean base quality of aligned bases + below this value, if they have soft-clipped bases - + default: 35, range: 0-93 -mq MIN_MAPPING_QUALITY, --min-mapping-quality MIN_MAPPING_QUALITY - discard reads with mapping quality below this value - default: 11 + discard reads with mapping quality below this value - + default: 11, range: 0-60 -mb MIN_BASE_QUALITY, --min-base-quality MIN_BASE_QUALITY - discard reads with base quality at variant position below this value - default: 25 + discard reads with base quality at variant position + below this value - default: 25, range: 0-93 -ms MAX_READ_SPAN, --max-read-span MAX_READ_SPAN - maximum +- position to use when detecting PCR duplicates - default: 6 + maximum +- position to use when 
detecting PCR + duplicates. -1 will disable duplicate detection - + default: 6, range: -1- -pf POSITION_FRACTION, --position-fraction POSITION_FRACTION - >90% of variant must occur within POSITION_FRACTION of read edges to allow HPF flag - default: 0.15 + >90% of variant must occur within POSITION_FRACTION of + read edges to allow HPF flag - default: 0.15, range: + 0-1 procedural: -r CRAM_REFERENCE, --cram-reference CRAM_REFERENCE - path to FASTA format CRAM reference, overrides $REF_PATH and UR tags - ignored if --format is not CRAM + path to FASTA format CRAM reference, overrides + $REF_PATH and UR tags - ignored if --format is not + CRAM -m VCF:aln [VCF:aln ...], --name-mapping VCF:aln [VCF:aln ...] - map VCF sample names to alignment SM tags; useful if they differ + map VCF sample names to alignment SM tags; useful if + they differ -ji INPUT_JSON, --input-json INPUT_JSON - path to JSON of input parameters, from which extended arguments will be loaded - overridden by arguments provided on command line + path to JSON of input parameters, from which extended + arguments will be loaded - overridden by arguments + provided on command line -jo OUTPUT_JSON, --output-json OUTPUT_JSON log input arguments to JSON ``` diff --git a/hairpin2/main.py b/hairpin2/main.py index 529a172..2acb403 100644 --- a/hairpin2/main.py +++ b/hairpin2/main.py @@ -389,7 +389,7 @@ def main_cli() -> None: datefmt='%I:%M:%S') parser = argparse.ArgumentParser(prog="hairpin2", - description='cruciform artefact flagging algorithm based on Ellis et al. 2020 (DOI: 10.1038/s41596-020-00437-6)') + description='cruciform artefact flagging algorithm based on Ellis et al. 2020 (DOI: 10.1038/s41596-020-00437-6). 
See README for further explanation of parameters.') parser._optionals.title = 'info' parser.add_argument('-v', '--version', @@ -486,9 +486,9 @@ def main_cli() -> None: (0 <= args.min_clip_quality <= 93), (0 <= args.min_mapping_quality <= 60), (0 <= args.min_base_quality <= 93), - (0 <= args.position_fraction <= 1) + (0 <= args.position_fraction <= 1), (args.max_read_span >= -1)]): - h.cleanup(msg='extended arg out range, check helptext for ranges') + h.cleanup(msg='extended arg out of range, check helptext for ranges') try: vcf_in_handle = pysam.VariantFile(args.vcf_in) From 6058fb3ae340b5d07330231f1e0406ff549739a9 Mon Sep 17 00:00:00 2001 From: ab63 Date: Thu, 31 Oct 2024 15:34:50 +0000 Subject: [PATCH 158/165] expose params; update help accordingly --- README.md | 80 +++++++++++++----- hairpin2/constants.py | 9 +- hairpin2/main.py | 192 +++++++++++++++++++++++++++++------------- 3 files changed, 203 insertions(+), 78 deletions(-) diff --git a/README.md b/README.md index a404005..20ac2ef 100644 --- a/README.md +++ b/README.md @@ -70,29 +70,70 @@ mandatory: format of alignment files; s indicates SAM, b indicates BAM, and c indicates CRAM -extended: - -al AL_FILTER_THRESHOLD, --al-filter-threshold AL_FILTER_THRESHOLD - threshold for median of read alignment score per base - of all relevant reads, below which a variant is - flagged as ALF - default: 0.93, range: 0- +read validation: -mc MIN_CLIP_QUALITY, --min-clip-quality MIN_CLIP_QUALITY discard reads with mean base quality of aligned bases below this value, if they have soft-clipped bases - - default: 35, range: 0-93 + default: 35, range: 0-93, exclusive -mq MIN_MAPPING_QUALITY, --min-mapping-quality MIN_MAPPING_QUALITY discard reads with mapping quality below this value - - default: 11, range: 0-60 + default: 11, range: 0-60, exclusive -mb MIN_BASE_QUALITY, --min-base-quality MIN_BASE_QUALITY discard reads with base quality at variant position - below this value - default: 25, range: 0-93 + below this value - 
default: 25, range: 0-93, exclusive -ms MAX_READ_SPAN, --max-read-span MAX_READ_SPAN maximum +- position to use when detecting PCR duplicates. -1 will disable duplicate detection - - default: 6, range: -1- - -pf POSITION_FRACTION, --position-fraction POSITION_FRACTION - >90% of variant must occur within POSITION_FRACTION of - read edges to allow HPF flag - default: 0.15, range: - 0-1 + default: 6, range: -1-, inclusive + +filter conditions: + -al AL_FILTER_THRESHOLD, --al-filter-threshold AL_FILTER_THRESHOLD + ALF; threshold for median of read alignment score per + base of all relevant reads, at and below which a + variant is flagged as ALF - default: 0.93, range: 0-, + inclusive + -ed EDGE_DEFINITION, --edge-definition EDGE_DEFINITION + HPF; percentage of a read that is considered to be + "the edge" for the purposes of assessing variant + location distribution - default: 0.15, range: 0-0.99, + inclusive + -ef EDGE_FRACTION, --edge-fraction EDGE_FRACTION + HPF; percentage of variants must occur within + EDGE_FRACTION of read edges to allow HPF flag - + default: 0.15, range: 0-0.99, exclusive + -mos MIN_MAD_ONE_STRAND, --min-MAD-one-strand MIN_MAD_ONE_STRAND + HPF; min range of distances between variant position + and read start for valid reads when only one strand + has sufficient valid reads for testing - default: 0, + range: 0-, exclusive + -sos MIN_SD_ONE_STRAND, --min-sd-one-strand MIN_SD_ONE_STRAND + HPF; min stdev of variant position and read start for + valid reads when only one strand has sufficient valid + reads for testing - default: 4, range: 0-, exclusive + -mbsw MIN_MAD_BOTH_STRAND_WEAK, --min-MAD-both-strand-weak MIN_MAD_BOTH_STRAND_WEAK + HPF; min range of distances between variant position + and read start for valid reads when both strands have + sufficient valid reads for testing AND -sbsw is true - + default: 2, range: 0-, exclusive + -sbsw MIN_SD_BOTH_STRAND_WEAK, --min-sd-both-strand-weak MIN_SD_BOTH_STRAND_WEAK + HPF; min stdev of variant 
position and read start for + valid reads when both strands have sufficient valid + reads for testing AND -mbsw is true- default: 2, + range: 0-, exclusive + -mbss MIN_MAD_BOTH_STRAND_STRONG, --min-mad-both-strand-strong MIN_MAD_BOTH_STRAND_STRONG + HPF; min range of distances between variant position + and read start for valid reads when both strands have + sufficient valid reads for testing AND -sbss is true - + default: 1, range: 0-, exclusive + -sbss MIN_SD_BOTH_STRAND_STRONG, --min-sd-both-strand-strong MIN_SD_BOTH_STRAND_STRONG + HPF; min stdev of variant position and read start for + valid reads when both strands have sufficient valid + reads for testing AND -mbss is true - default: 10, + range: 0-, exclusive + -mr MIN_READS, --min-reads MIN_READS + HPF; number of reads at and below which the hairpin + filtering logic considers a strand to have + insufficient reads for testing - default: 1, range: 0- procedural: -r CRAM_REFERENCE, --cram-reference CRAM_REFERENCE @@ -115,10 +156,9 @@ Parameters are hopefully mostly clear from the helptext, but some warrant furthe - --name-mapping – some variant callers, for example caveman, output sample names such as "TUMOUR" in VCF header columns. hairpin2 uses these column names to match to BAM samples via the SM tag - if these fields do not match, you'll need to provide a mapping here, for example "TUMOR:PD3738..." - --al-filter-threshold – the default value of 0.93 was arrived at by trial and error – since different aligners/platforms calculate alignment score differently, you may want to modify this value appropriately. In "Mathijs' Scripts", the default was set at 0.87 for filtering on ASRD. - --max-read-span – long homopolymer tracts can cause stuttering, where a PCR duplicate will have, for example, an additional A in a tract of As. These reads will align a base or two earlier on the reference genome than they should. As a result pcr duplicate flag machinery fails and they are not flagged as duplicates. 
`hairpin2` will attempt to filter out these duplicates, and MAX_READ_SPAN is then the maximum +- position to use during duplicate detection. -- --position-fraction – cruciform artefacts usually contain segments that do not align to the reference genome, resulting in the segment being soft-clipped. The subsequent aligned portion will then contain false variants, which arise from the artefact. These false variants appear with anomalous regularity at alignment boundaries – unlike true variants. If, for a given variant, more than 90% of the variant bases are within POSITION_FRACTION of read edges, allow for calling HPF flag. - +- --edge-fraction – cruciform artefacts usually contain segments that do not align to the reference genome, resulting in the segment being soft-clipped. The subsequent aligned portion will then contain false variants, which arise from the artefact. These false variants appear with anomalous regularity at alignment boundaries – unlike true variants. If, for a given variant, more than 90% of the variant bases are within EDGE_FRACTION of read edges, allow for calling HPF flag. -### DETAILS +The parameters available for the HPF flag are probably best understood by reading the implementation of the function `is_variant_HP()` in `hairpin2/main.py`. The tool tests records in a VCF file and applies the `HPF` and `ALF` filter flags as appropriate. Reasoning for decisions is recorded in the INFO field of the VCF records, in the form `HPF=||` and `ALF=|||`. The codes are as follows: @@ -129,13 +169,13 @@ The tool tests records in a VCF file and applies the `HPF` and `ALF` filter flag > **4** – no samples have non 0,0 genotype for the record The basic procedure of this implementation is as follows: -> For each record in the VCF, test every alt for that record by: -> 1. retrieving reads from samples exhibiting the mutations -> 2. testing each read for validity for use in hairpin testing (i.e. 
base quality, do they express the correct alt, and so on) +> For each record in the VCF, test every alt for that record as follows: +> 1. for samples exhibiting the mutation, retrieve reads covering the region +> 2. test each read for validity for use in hairpin testing (i.e. base quality, do they express the correct alt, and so on) > 3. performing statistical analysis on aggregates of the position of the mutation relative to the start and end of the aligned portion of the reads > 4. on the results of the statistical analysis, pass or fail the record for the filters `ALF` and `HPF`, and log a code and relevant info to the `INFO` field indicating the reason for the decision -The code has been written with the intention of clarity and extensibility – further understanding may be achieved by reading `hairpin2/main.py`. +The code has been written with the intention of clarity and extensibility – again, further understanding may be achieved by reading `hairpin2/main.py`. ### TESTING diff --git a/hairpin2/constants.py b/hairpin2/constants.py index 2b1cc86..a2ad6e4 100644 --- a/hairpin2/constants.py +++ b/hairpin2/constants.py @@ -30,7 +30,14 @@ ('min_mapping_quality', 11), ('min_base_quality', 25), ('max_read_span', 6), - ('position_fraction', 0.15))) + ('edge_definition', 0.15), + ('edge_fraction', 0.9), + ('min_MAD_one_strand', 0), + ('min_sd_one_strand', 4), + ('min_MAD_both_strand_weak', 2), + ('min_sd_both_strand_strong', 2), + ('min_MAD_both_strand_strong', 1), + ('min_sd_both_strand_strong', 10))) FiltCodes = IntEnum('FiltCodes', ['SIXTYAI', diff --git a/hairpin2/main.py b/hairpin2/main.py index 2acb403..a7f81d1 100644 --- a/hairpin2/main.py +++ b/hairpin2/main.py @@ -101,7 +101,8 @@ def flag_read_alt( invalid_flag |= c.ValidatorFlags.NOT_ALIGNED.value else: if mut_type == 'S': # SUB - if read.query_sequence[mut_pos:mut_pos + len(alt)] != alt: # type: ignore - can't be none + # type: ignore - can't be none + if read.query_sequence[mut_pos:mut_pos + len(alt)] != 
alt: invalid_flag |= c.ValidatorFlags.NOT_ALT.value if any([bq < min_basequal for bq @@ -117,7 +118,8 @@ def flag_read_alt( if q in range(mut_pos + 1, mut_pos + len(alt) + 1)] if any([r is not None for _, r in mut_alns]): invalid_flag |= c.ValidatorFlags.BAD_OP.value - if read.query_sequence[mut_pos + 1:mut_pos + len(alt) + 1] != alt: # type: ignore - can't be none + # type: ignore - can't be none + if read.query_sequence[mut_pos + 1:mut_pos + len(alt) + 1] != alt: invalid_flag |= c.ValidatorFlags.NOT_ALT.value # DEL - doesn't check for matches before and after... if mut_type == 'D': @@ -243,7 +245,15 @@ def is_variant_AL( def is_variant_HP( vstart: int, mut_reads: Iterable[pysam.AlignedSegment], - position_fraction_thresh: float = 0.15 + edge_definition: float = 0.15, + edge_clustering_threshold: float = 0.9, + min_MAD_one_strand: int = 0, # exclusive (and subsequent) + min_sd_one_strand: float = 4, + min_MAD_both_strand_weak: int = 2, + min_sd_both_strand_weak: float = 2, + min_MAD_both_strand_strong: int = 1, + min_sd_both_strand_strong: float = 10, + min_reads: int = 1 # inclusive ) -> c.HPFilter: hp_filt = c.HPFilter() @@ -259,48 +269,50 @@ def is_variant_HP( # +1 to include last base in length la2m = read.query_alignment_end - mut_qpos + 1 near_start_r.append(((la2m / read.query_alignment_length) - <= position_fraction_thresh)) + <= edge_definition)) la2ms_r.append(la2m) else: la2m = mut_qpos - read.query_alignment_start + 1 near_start_f.append(((la2m / read.query_alignment_length) - <= position_fraction_thresh)) + <= edge_definition)) la2ms_f.append(la2m) # hairpin conditions from Ellis et al. 
2020, Nature Protocols # sometimes reported as 2021 - if len(la2ms_f) < 2 and len(la2ms_r) < 2: + if len(la2ms_f) <= min_reads and len(la2ms_r) <= min_reads: hp_filt.code = c.FiltCodes.INSUFFICIENT_READS.value else: - if len(la2ms_f) > 1: - range_f = max(la2ms_f) - min(la2ms_f) + if len(la2ms_f) > min_reads: + mad_f = max(la2ms_f) - min(la2ms_f) sd_f = stdev(la2ms_f) - if len(la2ms_r) < 2: - if (((sum(near_start_f) / len(near_start_f)) < 0.9) and - range_f > 0 and - sd_f > 4): + if len(la2ms_r) <= min_reads: + if (((sum(near_start_f) / len(near_start_f)) < edge_clustering_threshold) and + mad_f > min_MAD_one_strand and + sd_f > min_sd_one_strand): hp_filt.code = c.FiltCodes.SIXTYAI.value # 60A(i) else: hp_filt.code = c.FiltCodes.SIXTYAI.value hp_filt.set() - if len(la2ms_r) > 1: - range_r = max(la2ms_r) - min(la2ms_r) + if len(la2ms_r) > min_reads: + mad_r = max(la2ms_r) - min(la2ms_r) sd_r = stdev(la2ms_r) - if len(la2ms_f) < 2: - if (((sum(near_start_r) / len(near_start_r)) < 0.9) and - range_r > 0 and - sd_r > 4): + if len(la2ms_f) <= min_reads: + if (((sum(near_start_r) / len(near_start_r)) < edge_clustering_threshold) and + mad_r > min_MAD_one_strand and + sd_r > min_sd_one_strand): hp_filt.code = c.FiltCodes.SIXTYAI.value else: hp_filt.code = c.FiltCodes.SIXTYAI.value hp_filt.set() - if len(la2ms_f) > 1 and len(la2ms_r) > 1: + if len(la2ms_f) > min_reads and len(la2ms_r) > min_reads: frac_lt_thresh = (sum(near_start_f + near_start_r) / (len(near_start_f) + len(near_start_r))) - if (frac_lt_thresh < 0.9 or - (range_f > 2 and range_r > 2 and sd_f > 2 and sd_r > 2) or # type: ignore - (range_f > 1 and sd_f > 10) or # type: ignore - (range_r > 1 and sd_r > 10)): # type: ignore + if (frac_lt_thresh < edge_clustering_threshold or + # type: ignore + (mad_f > min_MAD_both_strand_weak and mad_r > min_MAD_both_strand_weak and sd_f > min_sd_both_strand_weak and sd_r > min_sd_both_strand_weak) or + # type: ignore + (mad_f > min_MAD_both_strand_strong and sd_f > 
min_sd_both_strand_strong) or + (mad_r > min_MAD_both_strand_strong and sd_r > min_sd_both_strand_strong)): # type: ignore hp_filt.code = c.FiltCodes.SIXTYBI.value # 60B(i) else: hp_filt.code = c.FiltCodes.SIXTYBI.value @@ -317,7 +329,15 @@ def test_record_all_alts( min_basequal: int, max_span: int, al_thresh: float, - position_fraction: float + edge_def: float, + edge_frac: float, + mos: int, + sos: float, + mbsw: int, + sbsw: float, + mbss: int, + sbss: float, + min_reads: int ) -> dict[str, c.Filters]: if vcf_rec.alts is None: @@ -379,7 +399,15 @@ def test_record_all_alts( al_thresh), is_variant_HP(vcf_rec.start, alt_filt_reads, - position_fraction)) + edge_def, + edge_frac, + mos, + sos, + mbsw, + sbsw, + mbss, + sbss, + min_reads)) return filt_d @@ -416,31 +444,64 @@ def main_cli() -> None: choices=["s", "b", "c"], type=str, required=True) - opt = parser.add_argument_group('extended') - opt.add_argument('-al', - '--al-filter-threshold', - help='threshold for median of read alignment score per base of all relevant reads, below which a variant is flagged as ALF - default: 0.93, range: 0-', - type=float) - opt.add_argument('-mc', - '--min-clip-quality', - help='discard reads with mean base quality of aligned bases below this value, if they have soft-clipped bases - default: 35, range: 0-93', - type=int) - opt.add_argument('-mq', - '--min-mapping-quality', - help='discard reads with mapping quality below this value - default: 11, range: 0-60', - type=int) - opt.add_argument('-mb', - '--min-base-quality', - help='discard reads with base quality at variant position below this value - default: 25, range: 0-93', - type=int) - opt.add_argument('-ms', - '--max-read-span', - help='maximum +- position to use when detecting PCR duplicates. 
-1 will disable duplicate detection - default: 6, range: -1-', - type=int) - opt.add_argument('-pf', - '--position-fraction', - help='>90%% of variant must occur within POSITION_FRACTION of read edges to allow HPF flag - default: 0.15, range: 0-1', - type=float) + opt_rv = parser.add_argument_group('read validation') + opt_rv.add_argument('-mc', + '--min-clip-quality', + help='discard reads with mean base quality of aligned bases below this value, if they have soft-clipped bases - default: 35, range: 0-93, exclusive', + type=int) + opt_rv.add_argument('-mq', + '--min-mapping-quality', + help='discard reads with mapping quality below this value - default: 11, range: 0-60, exclusive', + type=int) + opt_rv.add_argument('-mb', + '--min-base-quality', + help='discard reads with base quality at variant position below this value - default: 25, range: 0-93, exclusive', + type=int) + opt_rv.add_argument('-ms', + '--max-read-span', + help='maximum +- position to use when detecting PCR duplicates. 
-1 will disable duplicate detection - default: 6, range: -1-, inclusive', + type=int) + opt_fc = parser.add_argument_group('filter conditions') + opt_fc.add_argument('-al', + '--al-filter-threshold', + help='ALF; threshold for median of read alignment score per base of all relevant reads, at and below which a variant is flagged as ALF - default: 0.93, range: 0-, inclusive', + type=float) + opt_fc.add_argument('-ed', + '--edge-definition', + help='HPF; percentage of a read that is considered to be "the edge" for the purposes of assessing variant location distribution - default: 0.15, range: 0-0.99, inclusive', + type=float) + opt_fc.add_argument('-ef', + '--edge-fraction', + help='HPF; percentage of variants must occur within EDGE_FRACTION of read edges to allow HPF flag - default: 0.15, range: 0-0.99, exclusive', + type=float) + opt_fc.add_argument('-mos', + '--min-MAD-one-strand', + help='HPF; min range of distances between variant position and read start for valid reads when only one strand has sufficient valid reads for testing - default: 0, range: 0-, exclusive', + type=int) + opt_fc.add_argument('-sos', + '--min-sd-one-strand', + help='HPF; min stdev of variant position and read start for valid reads when only one strand has sufficient valid reads for testing - default: 4, range: 0-, exclusive', + type=float) + opt_fc.add_argument('-mbsw', + '--min-MAD-both-strand-weak', + help='HPF; min range of distances between variant position and read start for valid reads when both strands have sufficient valid reads for testing AND -sbsw is true - default: 2, range: 0-, exclusive', + type=int) + opt_fc.add_argument('-sbsw', + '--min-sd-both-strand-weak', + help='HPF; min stdev of variant position and read start for valid reads when both strands have sufficient valid reads for testing AND -mbsw is true- default: 2, range: 0-, exclusive', + type=float) + opt_fc.add_argument('-mbss', + '--min-MAD-both-strand-strong', + help='HPF; min range of distances between variant 
position and read start for valid reads when both strands have sufficient valid reads for testing AND -sbss is true - default: 1, range: 0-, exclusive', + type=int) + opt_fc.add_argument('-sbss', + '--min-sd-both-strand-strong', + help='HPF; min stdev of variant position and read start for valid reads when both strands have sufficient valid reads for testing AND -mbss is true - default: 10, range: 0-, exclusive', + type=float) + opt_fc.add_argument('-mr', + '--min-reads', + help='HPF; number of reads at and below which the hairpin filtering logic considers a strand to have insufficient reads for testing - default: 1, range: 0-', + type=int) proc = parser.add_argument_group('procedural') proc.add_argument('-r', '--cram-reference', @@ -482,12 +543,19 @@ def main_cli() -> None: setattr(args, k, c.DEFAULTS[k]) # test args are sensible, exit if not - if not any([(args.al_filter_threshold >= 0), - (0 <= args.min_clip_quality <= 93), + if not any([(0 <= args.min_clip_quality <= 93), (0 <= args.min_mapping_quality <= 60), (0 <= args.min_base_quality <= 93), - (0 <= args.position_fraction <= 1), - (args.max_read_span >= -1)]): + (args.max_read_span >= -1), + (args.al_filter_threshold >= 0), + (0 <= args.edge_definition <= 0.99), + (0 <= args.edge_fraction <= 0.99), + (args.min_MAD_one_strand >= 0), + (args.min_sd_one_strand >= 0), + (args.min_MAD_both_strand_weak >= 0), + (args.min_sd_both_strand_weak >= 0), + (args.min_MAD_both_strand_strong >= 0), + (args.min_sd_both_strand_strong >= 0)]): h.cleanup(msg='extended arg out of range, check helptext for ranges') try: @@ -523,8 +591,10 @@ def main_cli() -> None: msg='failed to read alignment file at {}, reporting: {}'.format(path, e)) # grab the sample name from first SM field # in header field RG - alignment_sample_name = alignment.header.to_dict()['RG'][0]['SM'] # type: ignore - program ensures not unbound - vcf_sample_to_alignment_map[alignment_sample_name] = alignment # type: ignore - program ensures not unbound + # type: 
ignore - program ensures not unbound
+        alignment_sample_name = alignment.header.to_dict()['RG'][0]['SM']
+        # type: ignore - program ensures not unbound
+        vcf_sample_to_alignment_map[alignment_sample_name] = alignment
     if args.name_mapping:
         if len(args.name_mapping) > len(args.alignments):
             h.cleanup(msg="more name mappings than alignments provided")
@@ -601,7 +671,15 @@
                         args.min_base_quality,
                         args.max_read_span,
                         args.al_filter_threshold,
-                        args.position_fraction
+                        args.edge_definition,
+                        args.edge_fraction,
+                        args.min_MAD_one_strand,
+                        args.min_sd_one_strand,
+                        args.min_MAD_both_strands_weak,
+                        args.min_sd_both_strands_weak,
+                        args.min_mad_both_strands_strong,
+                        args.min_sd_both_strands_strong,
+                        args.min_reads
                     )
                 except c.NoAlts:
                     logging.warning('{0: <7}:{1: >12} ¦ no alts for this record'.format(

From a0af8a6a140e1937d2a799925d24d08444219d92 Mon Sep 17 00:00:00 2001
From: ab63
Date: Fri, 1 Nov 2024 11:55:41 +0000
Subject: [PATCH 159/165] finish exposing params, README

---
 README.md             |  2 +-
 hairpin2/constants.py |  5 +++--
 hairpin2/main.py      | 11 ++++++-----
 3 files changed, 10 insertions(+), 8 deletions(-)

diff --git a/README.md b/README.md
index 20ac2ef..d112183 100644
--- a/README.md
+++ b/README.md
@@ -180,7 +180,7 @@ The code has been written with the intention of clarity and extensibility – ag
 
 ### TESTING
 
-A test suite has been provided to prove the validity of the algorithm. To run these tests run `pytest -m "validate"` from within the install directory. `hairpin2` must have been installed from that same directory, and be available on path. The tests can be found in the `test` directory. They are simple, and, once you have read them, it should be very easy to add your own further tests should you want to confirm any behaviour.
+A test suite has been provided to prove the validity of the algorithm, i.e. to prove that it does what it claims to do. To run these tests run `pytest -m "validate"` from within the install directory. 
`hairpin2` must have been installed from that same directory, and be available on path (for example in a virtual environment). The tests can be found in the `test` directory. The basic premise is that of basis path testing. This approach means the focus is on testing all nodes (statements) and edges (paths between statements), rather than trying every possible input combination. Since all input possibilities will pass via the same network/graph, if we prove that each part of that network functions correctly, we can be sure that the program functions as it claims to. The tests are simple, and, once you have read them, it should be very easy to add your own further tests should you feel the need to further confirm any behaviour. ### LICENCE diff --git a/hairpin2/constants.py b/hairpin2/constants.py index a2ad6e4..05907d0 100644 --- a/hairpin2/constants.py +++ b/hairpin2/constants.py @@ -35,9 +35,10 @@ ('min_MAD_one_strand', 0), ('min_sd_one_strand', 4), ('min_MAD_both_strand_weak', 2), - ('min_sd_both_strand_strong', 2), + ('min_sd_both_strand_weak', 2), ('min_MAD_both_strand_strong', 1), - ('min_sd_both_strand_strong', 10))) + ('min_sd_both_strand_strong', 10), + ('min_reads', 1))) FiltCodes = IntEnum('FiltCodes', ['SIXTYAI', diff --git a/hairpin2/main.py b/hairpin2/main.py index a7f81d1..96313c4 100644 --- a/hairpin2/main.py +++ b/hairpin2/main.py @@ -555,7 +555,8 @@ def main_cli() -> None: (args.min_MAD_both_strand_weak >= 0), (args.min_sd_both_strand_weak >= 0), (args.min_MAD_both_strand_strong >= 0), - (args.min_sd_both_strand_strong >= 0)]): + (args.min_sd_both_strand_strong >= 0), + (args.min_reads >= 0)]): h.cleanup(msg='extended arg out of range, check helptext for ranges') try: @@ -675,10 +676,10 @@ def main_cli() -> None: args.edge_fraction, args.min_MAD_one_strand, args.min_sd_one_strand, - args.min_MAD_both_strands_weak, - args.min_sd_both_strands_weak, - args.min_mad_both_strands_strong, - args.min_sd_both_strands_strong, + 
args.min_MAD_both_strand_weak,
+                        args.min_sd_both_strand_weak,
+                        args.min_MAD_both_strand_strong,
+                        args.min_sd_both_strand_strong,
                         args.min_reads
                     )
                 except c.NoAlts:

From 122c8c2913ce077456e9dacd938b0aa4062855a5 Mon Sep 17 00:00:00 2001
From: ab63
Date: Fri, 1 Nov 2024 13:12:48 +0000
Subject: [PATCH 160/165] broaden remit, fix DEL validation

---
 README.md                                     | 39 +++++------
 hairpin2/constants.py                         |  6 +-
 hairpin2/main.py                              | 69 +++++++++----------
 ...date.py => test_is_variant_AD_validate.py} | 30 ++++----
 4 files changed, 70 insertions(+), 74 deletions(-)
 rename test/{test_is_variant_HP_validate.py => test_is_variant_AD_validate.py} (70%)

diff --git a/README.md b/README.md
index d112183..6c97018 100644
--- a/README.md
+++ b/README.md
@@ -2,11 +2,11 @@
 
 `hairpin2` – CLI implementation of the hairpin detection algorithm concieved by [Ellis et al, 2020](https://www.nature.com/articles/s41596-020-00437-6).
 
-`hairpin2` is designed to flag variants as possible cruciform artefacts. It operates on a VCF file containing one or more samples, and alignment files for all samples to be tested.
+`hairpin2` is designed to flag variants with anomalous distributions indicating that they are artefactual. Initially, it was conceived to flag possible cruciform artefacts for LCM sequence data, but the concept extends to other artefacts including artefactual indels. It operates on a VCF file containing one or more samples, and alignment files for all samples to be tested.
 
-Given a VCF, and BAM files for the samples of that VCF, return a VCF with variants flagged with `HPF` if they are suspected cruciform artefacts, and `ALF` if relevant reads have lower median alignment score per base than a specified threshold.
+Given a VCF, and BAM files for the samples of that VCF, return a VCF with variants flagged with `ADF` if they have anomalous distributions, and `ALF` if relevant reads have lower median alignment score per base than a specified threshold. 
-The `ALF` filter indicates poor signal-to-noise, and provides additional confidence in the `HPF` filter – cruciform artefacts usually cause a marked decrease in alignment score. The `ALF` flag also may appear on variants without `HPF`, often indicating other artefacts associated with poor signal-to-noise. +The `ALF` filter indicates poor signal-to-noise, and provides additional confidence in the `ADF` filter – artefacts with anomalous distributions usually cause a marked decrease in alignment score. The `ALF` flag also may appear on variants without `ADF`, often indicating other artefacts associated with poor signal-to-noise. ### DEPENDENCIES @@ -93,45 +93,45 @@ filter conditions: variant is flagged as ALF - default: 0.93, range: 0-, inclusive -ed EDGE_DEFINITION, --edge-definition EDGE_DEFINITION - HPF; percentage of a read that is considered to be + ADF; percentage of a read that is considered to be "the edge" for the purposes of assessing variant location distribution - default: 0.15, range: 0-0.99, inclusive -ef EDGE_FRACTION, --edge-fraction EDGE_FRACTION - HPF; percentage of variants must occur within - EDGE_FRACTION of read edges to allow HPF flag - + ADF; percentage of variants must occur within + EDGE_FRACTION of read edges to allow ADF flag - default: 0.15, range: 0-0.99, exclusive -mos MIN_MAD_ONE_STRAND, --min-MAD-one-strand MIN_MAD_ONE_STRAND - HPF; min range of distances between variant position + ADF; min range of distances between variant position and read start for valid reads when only one strand has sufficient valid reads for testing - default: 0, range: 0-, exclusive -sos MIN_SD_ONE_STRAND, --min-sd-one-strand MIN_SD_ONE_STRAND - HPF; min stdev of variant position and read start for + ADF; min stdev of variant position and read start for valid reads when only one strand has sufficient valid reads for testing - default: 4, range: 0-, exclusive -mbsw MIN_MAD_BOTH_STRAND_WEAK, --min-MAD-both-strand-weak MIN_MAD_BOTH_STRAND_WEAK - HPF; min range of 
distances between variant position + ADF; min range of distances between variant position and read start for valid reads when both strands have sufficient valid reads for testing AND -sbsw is true - default: 2, range: 0-, exclusive -sbsw MIN_SD_BOTH_STRAND_WEAK, --min-sd-both-strand-weak MIN_SD_BOTH_STRAND_WEAK - HPF; min stdev of variant position and read start for + ADF; min stdev of variant position and read start for valid reads when both strands have sufficient valid reads for testing AND -mbsw is true- default: 2, range: 0-, exclusive -mbss MIN_MAD_BOTH_STRAND_STRONG, --min-mad-both-strand-strong MIN_MAD_BOTH_STRAND_STRONG - HPF; min range of distances between variant position + ADF; min range of distances between variant position and read start for valid reads when both strands have sufficient valid reads for testing AND -sbss is true - default: 1, range: 0-, exclusive -sbss MIN_SD_BOTH_STRAND_STRONG, --min-sd-both-strand-strong MIN_SD_BOTH_STRAND_STRONG - HPF; min stdev of variant position and read start for + ADF; min stdev of variant position and read start for valid reads when both strands have sufficient valid reads for testing AND -mbss is true - default: 10, range: 0-, exclusive -mr MIN_READS, --min-reads MIN_READS - HPF; number of reads at and below which the hairpin + ADF; number of reads at and below which the hairpin filtering logic considers a strand to have insufficient reads for testing - default: 1, range: 0- @@ -156,14 +156,13 @@ Parameters are hopefully mostly clear from the helptext, but some warrant furthe - --name-mapping – some variant callers, for example caveman, output sample names such as "TUMOUR" in VCF header columns. hairpin2 uses these column names to match to BAM samples via the SM tag - if these fields do not match, you'll need to provide a mapping here, for example "TUMOR:PD3738..." 
- --al-filter-threshold – the default value of 0.93 was arrived at by trial and error – since different aligners/platforms calculate alignment score differently, you may want to modify this value appropriately. In "Mathijs' Scripts", the default was set at 0.87 for filtering on ASRD. - --max-read-span – long homopolymer tracts can cause stuttering, where a PCR duplicate will have, for example, an additional A in a tract of As. These reads will align a base or two earlier on the reference genome than they should. As a result pcr duplicate flag machinery fails and they are not flagged as duplicates. `hairpin2` will attempt to filter out these duplicates, and MAX_READ_SPAN is then the maximum +- position to use during duplicate detection. -- --edge-fraction – cruciform artefacts usually contain segments that do not align to the reference genome, resulting in the segment being soft-clipped. The subsequent aligned portion will then contain false variants, which arise from the artefact. These false variants appear with anomalous regularity at alignment boundaries – unlike true variants. If, for a given variant, more than 90% of the variant bases are within EDGE_FRACTION of read edges, allow for calling HPF flag. -The parameters available for the HPF flag are probably best understood by reading the implementation of the function `is_variant_HP()` in `hairpin2/main.py`. +The parameters available for the ADF flag are probably best understood by reading the implementation of the function `is_variant_AD()` in `hairpin2/main.py`. -The tool tests records in a VCF file and applies the `HPF` and `ALF` filter flags as appropriate. Reasoning for decisions is recorded in the INFO field of the VCF records, in the form `HPF=||` and `ALF=|||`. The codes are as follows: +The tool tests records in a VCF file and applies the `ADF` and `ALF` filter flags as appropriate. Reasoning for decisions is recorded in the INFO field of the VCF records, in the form `ADF=||` and `ALF=|||`. 
The codes are as follows: -> **0** – passed/failed on condition 60A(i) of Ellis et al. (`HPF` only) -> **1** – passed/failed on condition 60B(i) of Ellis et al. (`HPF` only) +> **0** – passed/failed on condition 60A(i) of Ellis et al. (`ADF` only) +> **1** – passed/failed on condition 60B(i) of Ellis et al. (`ADF` only) > **2** – passed/failed on filter threshold (`ALF` only) > **3** – insufficient appropriate reads to support calling flag – this covers a lot of possiblities, if more granularity is desired, please request it > **4** – no samples have non 0,0 genotype for the record @@ -171,9 +170,9 @@ The tool tests records in a VCF file and applies the `HPF` and `ALF` filter flag The basic procedure of this implementation is as follows: > For each record in the VCF, test every alt for that record as follows: > 1. for samples exhibiting the mutation, retrieve reads covering the region -> 2. test each read for validity for use in hairpin testing (i.e. base quality, do they express the correct alt, and so on) +> 2. test each read for validity for use in distribution testing (i.e. base quality, do they express the correct alt, and so on) > 3. performing statistical analysis on aggregates of the position of the mutation relative to the start and end of the aligned portion of the reads -> 4. on the results of the statistical analysis, pass or fail the record for the filters `ALF` and `HPF`, and log a code and relevant info to the `INFO` field indicating the reason for the decision +> 4. on the results of the statistical analysis, pass or fail the record for the filters `ALF` and `ADF`, and log a code and relevant info to the `INFO` field indicating the reason for the decision The code has been written with the intention of clarity and extensibility – again, further understanding may be achieved by reading `hairpin2/main.py`. 
diff --git a/hairpin2/constants.py b/hairpin2/constants.py index 05907d0..661f398 100644 --- a/hairpin2/constants.py +++ b/hairpin2/constants.py @@ -96,8 +96,8 @@ def __iter__(self): @d.dataclass -class HPFilter(FilterData): - name: str = d.field(default='HPF') +class ADFilter(FilterData): + name: str = d.field(default='ADF') @d.dataclass @@ -109,7 +109,7 @@ class ALFilter(FilterData): @d.dataclass class Filters: AL: ALFilter - HP: HPFilter + HP: ADFilter def __iter__(self): return (getattr(self, field.name) for field in d.fields(self)) diff --git a/hairpin2/main.py b/hairpin2/main.py index 96313c4..e9c0d09 100644 --- a/hairpin2/main.py +++ b/hairpin2/main.py @@ -121,30 +121,27 @@ def flag_read_alt( # type: ignore - can't be none if read.query_sequence[mut_pos + 1:mut_pos + len(alt) + 1] != alt: invalid_flag |= c.ValidatorFlags.NOT_ALT.value - # DEL - doesn't check for matches before and after... + # DEL if mut_type == 'D': - rng = list(range(vcf_start, vcf_stop)) + rng = list(range(vcf_start - 1, vcf_stop + 1)) mut_alns = [q for q, r in read.get_aligned_pairs() if r in rng] if len(mut_alns) != len(rng): invalid_flag |= c.ValidatorFlags.SHORT.value - if any([x is not None for x in mut_alns]): - invalid_flag |= c.ValidatorFlags.BAD_OP.value + if (any([x is not None for x in mut_alns[1:-1]]) or + any([x is None for x in [mut_alns[0], mut_alns[-1]]])): + invalid_flag |= c.ValidatorFlags.BAD_OP.value return invalid_flag -# detect PCR duplicates previously missed due to (hairpin) artefacts +# detect PCR duplicates previously missed due to slippage # this implementation assumes that sorting on first element of each sublist # is appropriate, per Peter's initial implementation. # is an all against all comparison between all read lists more appropriate? # and between pairs of readlists, why is comparing sorted pairs most appropriate? -# again, does all against all make more sense? -# (if so, maybe two pointer comparison?) 
-# it bothers me that it matters where in the chain this occurs -# with more reads it's more likely they'll cluster as dupes right? def get_hidden_PCRdup_indices( readpair_ends: list[list[int]], max_span: int @@ -242,7 +239,7 @@ def is_variant_AL( # per Peter's implementation # can set hairpin for mutations nowhere near alignment start # expose more ellis conditions as parameters? -def is_variant_HP( +def is_variant_AD( vstart: int, mut_reads: Iterable[pysam.AlignedSegment], edge_definition: float = 0.15, @@ -254,9 +251,9 @@ def is_variant_HP( min_MAD_both_strand_strong: int = 1, min_sd_both_strand_strong: float = 10, min_reads: int = 1 # inclusive -) -> c.HPFilter: +) -> c.ADFilter: - hp_filt = c.HPFilter() + ad_filt = c.ADFilter() # *l*engths of *a*lignment starts *to* *m*utant query positions la2ms_f: list[int] = [] la2ms_r: list[int] = [] @@ -280,7 +277,7 @@ def is_variant_HP( # hairpin conditions from Ellis et al. 2020, Nature Protocols # sometimes reported as 2021 if len(la2ms_f) <= min_reads and len(la2ms_r) <= min_reads: - hp_filt.code = c.FiltCodes.INSUFFICIENT_READS.value + ad_filt.code = c.FiltCodes.INSUFFICIENT_READS.value else: if len(la2ms_f) > min_reads: mad_f = max(la2ms_f) - min(la2ms_f) @@ -289,10 +286,10 @@ def is_variant_HP( if (((sum(near_start_f) / len(near_start_f)) < edge_clustering_threshold) and mad_f > min_MAD_one_strand and sd_f > min_sd_one_strand): - hp_filt.code = c.FiltCodes.SIXTYAI.value # 60A(i) + ad_filt.code = c.FiltCodes.SIXTYAI.value # 60A(i) else: - hp_filt.code = c.FiltCodes.SIXTYAI.value - hp_filt.set() + ad_filt.code = c.FiltCodes.SIXTYAI.value + ad_filt.set() if len(la2ms_r) > min_reads: mad_r = max(la2ms_r) - min(la2ms_r) sd_r = stdev(la2ms_r) @@ -300,10 +297,10 @@ def is_variant_HP( if (((sum(near_start_r) / len(near_start_r)) < edge_clustering_threshold) and mad_r > min_MAD_one_strand and sd_r > min_sd_one_strand): - hp_filt.code = c.FiltCodes.SIXTYAI.value + ad_filt.code = c.FiltCodes.SIXTYAI.value else: - hp_filt.code 
= c.FiltCodes.SIXTYAI.value - hp_filt.set() + ad_filt.code = c.FiltCodes.SIXTYAI.value + ad_filt.set() if len(la2ms_f) > min_reads and len(la2ms_r) > min_reads: frac_lt_thresh = (sum(near_start_f + near_start_r) / (len(near_start_f) + len(near_start_r))) @@ -313,12 +310,12 @@ def is_variant_HP( # type: ignore (mad_f > min_MAD_both_strand_strong and sd_f > min_sd_both_strand_strong) or (mad_r > min_MAD_both_strand_strong and sd_r > min_sd_both_strand_strong)): # type: ignore - hp_filt.code = c.FiltCodes.SIXTYBI.value # 60B(i) + ad_filt.code = c.FiltCodes.SIXTYBI.value # 60B(i) else: - hp_filt.code = c.FiltCodes.SIXTYBI.value - hp_filt.set() + ad_filt.code = c.FiltCodes.SIXTYBI.value + ad_filt.set() - return hp_filt + return ad_filt def test_record_all_alts( @@ -393,11 +390,11 @@ def test_record_all_alts( min_basequal) if len(alt_filt_reads) == 0: filt_d[alt] = c.Filters(c.ALFilter(code=c.FiltCodes.INSUFFICIENT_READS.value), - c.HPFilter(code=c.FiltCodes.INSUFFICIENT_READS.value)) + c.ADFilter(code=c.FiltCodes.INSUFFICIENT_READS.value)) else: filt_d[alt] = c.Filters(is_variant_AL(alt_filt_reads, al_thresh), - is_variant_HP(vcf_rec.start, + is_variant_AD(vcf_rec.start, alt_filt_reads, edge_def, edge_frac, @@ -468,39 +465,39 @@ def main_cli() -> None: type=float) opt_fc.add_argument('-ed', '--edge-definition', - help='HPF; percentage of a read that is considered to be "the edge" for the purposes of assessing variant location distribution - default: 0.15, range: 0-0.99, inclusive', + help='ADF; percentage of a read that is considered to be "the edge" for the purposes of assessing variant location distribution - default: 0.15, range: 0-0.99, inclusive', type=float) opt_fc.add_argument('-ef', '--edge-fraction', - help='HPF; percentage of variants must occur within EDGE_FRACTION of read edges to allow HPF flag - default: 0.15, range: 0-0.99, exclusive', + help='ADF; percentage of variants must occur within EDGE_FRACTION of read edges to allow ADF flag - default: 0.15, 
range: 0-0.99, exclusive', type=float) opt_fc.add_argument('-mos', '--min-MAD-one-strand', - help='HPF; min range of distances between variant position and read start for valid reads when only one strand has sufficient valid reads for testing - default: 0, range: 0-, exclusive', + help='ADF; min range of distances between variant position and read start for valid reads when only one strand has sufficient valid reads for testing - default: 0, range: 0-, exclusive', type=int) opt_fc.add_argument('-sos', '--min-sd-one-strand', - help='HPF; min stdev of variant position and read start for valid reads when only one strand has sufficient valid reads for testing - default: 4, range: 0-, exclusive', + help='ADF; min stdev of variant position and read start for valid reads when only one strand has sufficient valid reads for testing - default: 4, range: 0-, exclusive', type=float) opt_fc.add_argument('-mbsw', '--min-MAD-both-strand-weak', - help='HPF; min range of distances between variant position and read start for valid reads when both strands have sufficient valid reads for testing AND -sbsw is true - default: 2, range: 0-, exclusive', + help='ADF; min range of distances between variant position and read start for valid reads when both strands have sufficient valid reads for testing AND -sbsw is true - default: 2, range: 0-, exclusive', type=int) opt_fc.add_argument('-sbsw', '--min-sd-both-strand-weak', - help='HPF; min stdev of variant position and read start for valid reads when both strands have sufficient valid reads for testing AND -mbsw is true- default: 2, range: 0-, exclusive', + help='ADF; min stdev of variant position and read start for valid reads when both strands have sufficient valid reads for testing AND -mbsw is true- default: 2, range: 0-, exclusive', type=float) opt_fc.add_argument('-mbss', '--min-MAD-both-strand-strong', - help='HPF; min range of distances between variant position and read start for valid reads when both strands have sufficient valid 
reads for testing AND -sbss is true - default: 1, range: 0-, exclusive', + help='ADF; min range of distances between variant position and read start for valid reads when both strands have sufficient valid reads for testing AND -sbss is true - default: 1, range: 0-, exclusive', type=int) opt_fc.add_argument('-sbss', '--min-sd-both-strand-strong', - help='HPF; min stdev of variant position and read start for valid reads when both strands have sufficient valid reads for testing AND -mbss is true - default: 10, range: 0-, exclusive', + help='ADF; min stdev of variant position and read start for valid reads when both strands have sufficient valid reads for testing AND -mbss is true - default: 10, range: 0-, exclusive', type=float) opt_fc.add_argument('-mr', '--min-reads', - help='HPF; number of reads at and below which the hairpin filtering logic considers a strand to have insufficient reads for testing - default: 1, range: 0-', + help='ADF; number of reads at and below which the hairpin filtering logic considers a strand to have insufficient reads for testing - default: 1, range: 0-', type=int) proc = parser.add_argument_group('procedural') proc.add_argument('-r', @@ -636,10 +633,10 @@ def main_cli() -> None: out_head = vcf_in_handle.header # type:ignore out_head.add_line("##FILTER=".format( args.al_filter_threshold, ', '.join(vcf_sample_to_alignment_map.keys()))) - out_head.add_line("##FILTER=".format( + out_head.add_line("##FILTER=".format( ', '.join(vcf_sample_to_alignment_map.keys()))) out_head.add_line( - "##INFO=") + "##INFO=") out_head.add_line( "##INFO=") diff --git a/test/test_is_variant_HP_validate.py b/test/test_is_variant_AD_validate.py similarity index 70% rename from test/test_is_variant_HP_validate.py rename to test/test_is_variant_AD_validate.py index 2a05da1..f7e3e1f 100644 --- a/test/test_is_variant_HP_validate.py +++ b/test/test_is_variant_AD_validate.py @@ -1,4 +1,4 @@ -from hairpin2.main import is_variant_HP +from hairpin2.main import is_variant_AD 
from hairpin2 import constants as c import pysam import pytest @@ -24,62 +24,62 @@ @pytest.mark.validate def test_path_insufficient_reads(): - expected = c.HPFilter(code=3) - result = is_variant_HP(0, []) + expected = c.ADFilter(code=3) + result = is_variant_AD(0, []) assert expected == result @pytest.mark.validate def test_path_f_60ai_set(): - expected = c.HPFilter(flag=True, code=0) - result = is_variant_HP(150, [r, r]) + expected = c.ADFilter(flag=True, code=0) + result = is_variant_AD(150, [r, r]) assert expected == result @pytest.mark.validate def test_path_f_60ai_noset(): - expected = c.HPFilter(code=0) + expected = c.ADFilter(code=0) r1 = copy.deepcopy(r) r1.reference_start = 90 - result = is_variant_HP(150, [r, r1]) + result = is_variant_AD(150, [r, r1]) assert expected == result @pytest.mark.validate def test_path_r_60ai_set(): - expected = c.HPFilter(flag=True, code=0) + expected = c.ADFilter(flag=True, code=0) rr = copy.deepcopy(r) rr.flag = rr.flag | 0x10 - result = is_variant_HP(150, [rr, rr]) + result = is_variant_AD(150, [rr, rr]) assert expected == result @pytest.mark.validate def test_path_r_60ai_noset(): - expected = c.HPFilter(code=0) + expected = c.ADFilter(code=0) rr = copy.deepcopy(r) rr.flag = rr.flag | 0x10 rr1 = copy.deepcopy(rr) rr1.reference_start = 90 - result = is_variant_HP(150, [rr, rr1]) + result = is_variant_AD(150, [rr, rr1]) assert expected == result @pytest.mark.validate def test_path_60bi_set(): - expected = c.HPFilter(flag=True, code=1) + expected = c.ADFilter(flag=True, code=1) r1 = copy.deepcopy(r) r1.reference_start = 190 rr = copy.deepcopy(r) rr.flag = rr.flag | 0x10 - result = is_variant_HP(198, [r1, r1, rr, rr]) + result = is_variant_AD(198, [r1, r1, rr, rr]) assert expected == result @pytest.mark.validate def test_path_60bi_noset(): - expected = c.HPFilter(code=1) + expected = c.ADFilter(code=1) rr = copy.deepcopy(r) rr.flag = rr.flag | 0x10 - result = is_variant_HP(150, [r, r, rr, rr]) + result = is_variant_AD(150, [r, 
r, rr, rr]) assert expected == result From e4dc8c42035ef0581004e0360c2c3ca84511fb19 Mon Sep 17 00:00:00 2001 From: ab63 Date: Fri, 1 Nov 2024 13:15:23 +0000 Subject: [PATCH 161/165] bump version --- pyproject.toml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 40b0030..3123603 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,7 +1,7 @@ [tool.poetry] name = "hairpin2" -version = "0.0.3rc" -description = "CLI implementation of the hairpin detection algorithm concieved by Ellis et al, 2020." +version = "1.0.0" +description = "CLI implementation of the artefact detection algorithm concieved by Ellis et al, 2020." authors = ["Alex Byrne "] license = "AGPL3" readme = "README.md" From f0bd511d37449f7b4ad6d7aa9b83b4393835baf8 Mon Sep 17 00:00:00 2001 From: ab63 Date: Fri, 1 Nov 2024 13:21:04 +0000 Subject: [PATCH 162/165] update README --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 6c97018..68c493b 100644 --- a/README.md +++ b/README.md @@ -6,7 +6,7 @@ Given a VCF, and BAM files for the samples of that VCF, return a VCF with variants flagged with `ADF` if they have anomalous distributions, and `ALF` if relevant reads have lower median alignment score per base than a specified threshold. -The `ALF` filter indicates poor signal-to-noise, and provides additional confidence in the `ADF` filter – artefacts with anomalous distributions usually cause a marked decrease in alignment score. The `ALF` flag also may appear on variants without `ADF`, often indicating other artefacts associated with poor signal-to-noise. +The `ALF` filter indicates variants which occur with poor signal-to-noise, and also provides additional confidence in the `ADF` filter – artefacts with anomalous distributions often cause a marked decrease in alignment score, as is the case for cruciform artefacts. 
### DEPENDENCIES From 1c2880f17caef8ee25006c1097a2e81370168540 Mon Sep 17 00:00:00 2001 From: ab63 Date: Fri, 1 Nov 2024 13:30:24 +0000 Subject: [PATCH 163/165] update README; helptext --- README.md | 6 +++--- hairpin2/main.py | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 68c493b..921658e 100644 --- a/README.md +++ b/README.md @@ -4,7 +4,7 @@ `hairpin2` is designed to flag variants with anomalous distributions indicating that they are artefactual. Initially, it was concieved to flag possible cruciform artefacts for LCM sequence data, but the concept extends to other artefacts including artefactual indels. It operates on a VCF file containing one or more samples, and alignment files for all samples to be tested. -Given a VCF, and BAM files for the samples of that VCF, return a VCF with variants flagged with `ADF` if they have anomalous distributions, and `ALF` if relevant reads have lower median alignment score per base than a specified threshold. +Given a VCF, and BAM files for the samples of that VCF, return a VCF with variants flagged with `ADF` if variants have anomalous distributions indicating that they are likely to be artefactual, and `ALF` if relevant reads have lower median alignment score per base than a specified threshold. The `ALF` filter indicates variants which occur with poor signal-to-noise, and also provides additional confidence in the `ADF` filter – artefacts with anomalous distributions often cause a marked decrease in alignment score, as is the case for cruciform artefacts. 
@@ -133,7 +133,7 @@ filter conditions: -mr MIN_READS, --min-reads MIN_READS ADF; number of reads at and below which the hairpin filtering logic considers a strand to have - insufficient reads for testing - default: 1, range: 0- + insufficient reads for testing - default: 1, range: 0-, inclusive procedural: -r CRAM_REFERENCE, --cram-reference CRAM_REFERENCE @@ -154,7 +154,7 @@ procedural: Parameters are hopefully mostly clear from the helptext, but some warrant further explanation: - --name-mapping – some variant callers, for example caveman, output sample names such as "TUMOUR" in VCF header columns. hairpin2 uses these column names to match to BAM samples via the SM tag - if these fields do not match, you'll need to provide a mapping here, for example "TUMOR:PD3738..." -- --al-filter-threshold – the default value of 0.93 was arrived at by trial and error – since different aligners/platforms calculate alignment score differently, you may want to modify this value appropriately. In "Mathijs' Scripts", the default was set at 0.87 for filtering on ASRD. +- --al-filter-threshold – the default value of 0.93 was arrived at by trial and error – since different aligners/platforms calculate alignment score differently, you may want to modify this value appropriately. In past implementations, where this value was known as `ASRD`, the default was set at 0.87. - --max-read-span – long homopolymer tracts can cause stuttering, where a PCR duplicate will have, for example, an additional A in a tract of As. These reads will align a base or two earlier on the reference genome than they should. As a result pcr duplicate flag machinery fails and they are not flagged as duplicates. `hairpin2` will attempt to filter out these duplicates, and MAX_READ_SPAN is then the maximum +- position to use during duplicate detection. The parameters available for the ADF flag are probably best understood by reading the implementation of the function `is_variant_AD()` in `hairpin2/main.py`. 
diff --git a/hairpin2/main.py b/hairpin2/main.py index e9c0d09..b58ec11 100644 --- a/hairpin2/main.py +++ b/hairpin2/main.py @@ -497,7 +497,7 @@ def main_cli() -> None: type=float) opt_fc.add_argument('-mr', '--min-reads', - help='ADF; number of reads at and below which the hairpin filtering logic considers a strand to have insufficient reads for testing - default: 1, range: 0-', + help='ADF; number of reads at and below which the hairpin filtering logic considers a strand to have insufficient reads for testing - default: 1, range: 0-, inclusive', type=int) proc = parser.add_argument_group('procedural') proc.add_argument('-r', From 94e5165e0d89dce6cab682c272009edcd80018c1 Mon Sep 17 00:00:00 2001 From: ab63 Date: Fri, 1 Nov 2024 13:46:20 +0000 Subject: [PATCH 164/165] fix license --- LICENSE | 675 ++---------------------------------------- README.md | 29 +- hairpin2/__init__.py | 20 -- hairpin2/constants.py | 20 -- hairpin2/helpers.py | 20 -- hairpin2/main.py | 29 +- hairpin2/ref2seq.py | 29 +- pyproject.toml | 2 +- 8 files changed, 70 insertions(+), 754 deletions(-) diff --git a/LICENSE b/LICENSE index fe6b903..3c5f78f 100644 --- a/LICENSE +++ b/LICENSE @@ -1,662 +1,23 @@ - GNU AFFERO GENERAL PUBLIC LICENSE - Version 3, 19 November 2007 +MIT License - Copyright (C) 2007 Free Software Foundation, Inc. - Everyone is permitted to copy and distribute verbatim copies - of this license document, but changing it is not allowed. +Copyright (C) 2024 Genome Research Ltd. - Preamble +Author: Alex Byrne - The GNU Affero General Public License is a free, copyleft license for -software and other kinds of works, specifically designed to ensure -cooperation with the community in the case of network server software. 
+Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: - The licenses for most software and other practical works are designed -to take away your freedom to share and change the works. By contrast, -our General Public Licenses are intended to guarantee your freedom to -share and change all versions of a program--to make sure it remains free -software for all its users. - - When we speak of free software, we are referring to freedom, not -price. Our General Public Licenses are designed to make sure that you -have the freedom to distribute copies of free software (and charge for -them if you wish), that you receive source code or can get it if you -want it, that you can change the software or use pieces of it in new -free programs, and that you know you can do these things. - - Developers that use our General Public Licenses protect your rights -with two steps: (1) assert copyright on the software, and (2) offer -you this License which gives you legal permission to copy, distribute -and/or modify the software. - - A secondary benefit of defending all users' freedom is that -improvements made in alternate versions of the program, if they -receive widespread use, become available for other developers to -incorporate. Many developers of free software are heartened and -encouraged by the resulting cooperation. However, in the case of -software used on network servers, this result may fail to come about. -The GNU General Public License permits making a modified version and -letting the public access it on a server without ever releasing its -source code to the public. 
- - The GNU Affero General Public License is designed specifically to -ensure that, in such cases, the modified source code becomes available -to the community. It requires the operator of a network server to -provide the source code of the modified version running there to the -users of that server. Therefore, public use of a modified version, on -a publicly accessible server, gives the public access to the source -code of the modified version. - - An older license, called the Affero General Public License and -published by Affero, was designed to accomplish similar goals. This is -a different license, not a version of the Affero GPL, but Affero has -released a new version of the Affero GPL which permits relicensing under -this license. - - The precise terms and conditions for copying, distribution and -modification follow. - - TERMS AND CONDITIONS - - 0. Definitions. - - "This License" refers to version 3 of the GNU Affero General Public License. - - "Copyright" also means copyright-like laws that apply to other kinds of -works, such as semiconductor masks. - - "The Program" refers to any copyrightable work licensed under this -License. Each licensee is addressed as "you". "Licensees" and -"recipients" may be individuals or organizations. - - To "modify" a work means to copy from or adapt all or part of the work -in a fashion requiring copyright permission, other than the making of an -exact copy. The resulting work is called a "modified version" of the -earlier work or a work "based on" the earlier work. - - A "covered work" means either the unmodified Program or a work based -on the Program. - - To "propagate" a work means to do anything with it that, without -permission, would make you directly or secondarily liable for -infringement under applicable copyright law, except executing it on a -computer or modifying a private copy. 
Propagation includes copying, -distribution (with or without modification), making available to the -public, and in some countries other activities as well. - - To "convey" a work means any kind of propagation that enables other -parties to make or receive copies. Mere interaction with a user through -a computer network, with no transfer of a copy, is not conveying. - - An interactive user interface displays "Appropriate Legal Notices" -to the extent that it includes a convenient and prominently visible -feature that (1) displays an appropriate copyright notice, and (2) -tells the user that there is no warranty for the work (except to the -extent that warranties are provided), that licensees may convey the -work under this License, and how to view a copy of this License. If -the interface presents a list of user commands or options, such as a -menu, a prominent item in the list meets this criterion. - - 1. Source Code. - - The "source code" for a work means the preferred form of the work -for making modifications to it. "Object code" means any non-source -form of a work. - - A "Standard Interface" means an interface that either is an official -standard defined by a recognized standards body, or, in the case of -interfaces specified for a particular programming language, one that -is widely used among developers working in that language. - - The "System Libraries" of an executable work include anything, other -than the work as a whole, that (a) is included in the normal form of -packaging a Major Component, but which is not part of that Major -Component, and (b) serves only to enable use of the work with that -Major Component, or to implement a Standard Interface for which an -implementation is available to the public in source code form. 
A -"Major Component", in this context, means a major essential component -(kernel, window system, and so on) of the specific operating system -(if any) on which the executable work runs, or a compiler used to -produce the work, or an object code interpreter used to run it. - - The "Corresponding Source" for a work in object code form means all -the source code needed to generate, install, and (for an executable -work) run the object code and to modify the work, including scripts to -control those activities. However, it does not include the work's -System Libraries, or general-purpose tools or generally available free -programs which are used unmodified in performing those activities but -which are not part of the work. For example, Corresponding Source -includes interface definition files associated with source files for -the work, and the source code for shared libraries and dynamically -linked subprograms that the work is specifically designed to require, -such as by intimate data communication or control flow between those -subprograms and other parts of the work. - - The Corresponding Source need not include anything that users -can regenerate automatically from other parts of the Corresponding -Source. - - The Corresponding Source for a work in source code form is that -same work. - - 2. Basic Permissions. - - All rights granted under this License are granted for the term of -copyright on the Program, and are irrevocable provided the stated -conditions are met. This License explicitly affirms your unlimited -permission to run the unmodified Program. The output from running a -covered work is covered by this License only if the output, given its -content, constitutes a covered work. This License acknowledges your -rights of fair use or other equivalent, as provided by copyright law. - - You may make, run and propagate covered works that you do not -convey, without conditions so long as your license otherwise remains -in force. 
You may convey covered works to others for the sole purpose -of having them make modifications exclusively for you, or provide you -with facilities for running those works, provided that you comply with -the terms of this License in conveying all material for which you do -not control copyright. Those thus making or running the covered works -for you must do so exclusively on your behalf, under your direction -and control, on terms that prohibit them from making any copies of -your copyrighted material outside their relationship with you. - - Conveying under any other circumstances is permitted solely under -the conditions stated below. Sublicensing is not allowed; section 10 -makes it unnecessary. - - 3. Protecting Users' Legal Rights From Anti-Circumvention Law. - - No covered work shall be deemed part of an effective technological -measure under any applicable law fulfilling obligations under article -11 of the WIPO copyright treaty adopted on 20 December 1996, or -similar laws prohibiting or restricting circumvention of such -measures. - - When you convey a covered work, you waive any legal power to forbid -circumvention of technological measures to the extent such circumvention -is effected by exercising rights under this License with respect to -the covered work, and you disclaim any intention to limit operation or -modification of the work as a means of enforcing, against the work's -users, your or third parties' legal rights to forbid circumvention of -technological measures. - - 4. Conveying Verbatim Copies. 
- - You may convey verbatim copies of the Program's source code as you -receive it, in any medium, provided that you conspicuously and -appropriately publish on each copy an appropriate copyright notice; -keep intact all notices stating that this License and any -non-permissive terms added in accord with section 7 apply to the code; -keep intact all notices of the absence of any warranty; and give all -recipients a copy of this License along with the Program. - - You may charge any price or no price for each copy that you convey, -and you may offer support or warranty protection for a fee. - - 5. Conveying Modified Source Versions. - - You may convey a work based on the Program, or the modifications to -produce it from the Program, in the form of source code under the -terms of section 4, provided that you also meet all of these conditions: - - a) The work must carry prominent notices stating that you modified - it, and giving a relevant date. - - b) The work must carry prominent notices stating that it is - released under this License and any conditions added under section - 7. This requirement modifies the requirement in section 4 to - "keep intact all notices". - - c) You must license the entire work, as a whole, under this - License to anyone who comes into possession of a copy. This - License will therefore apply, along with any applicable section 7 - additional terms, to the whole of the work, and all its parts, - regardless of how they are packaged. This License gives no - permission to license the work in any other way, but it does not - invalidate such permission if you have separately received it. - - d) If the work has interactive user interfaces, each must display - Appropriate Legal Notices; however, if the Program has interactive - interfaces that do not display Appropriate Legal Notices, your - work need not make them do so. 
- - A compilation of a covered work with other separate and independent -works, which are not by their nature extensions of the covered work, -and which are not combined with it such as to form a larger program, -in or on a volume of a storage or distribution medium, is called an -"aggregate" if the compilation and its resulting copyright are not -used to limit the access or legal rights of the compilation's users -beyond what the individual works permit. Inclusion of a covered work -in an aggregate does not cause this License to apply to the other -parts of the aggregate. - - 6. Conveying Non-Source Forms. - - You may convey a covered work in object code form under the terms -of sections 4 and 5, provided that you also convey the -machine-readable Corresponding Source under the terms of this License, -in one of these ways: - - a) Convey the object code in, or embodied in, a physical product - (including a physical distribution medium), accompanied by the - Corresponding Source fixed on a durable physical medium - customarily used for software interchange. - - b) Convey the object code in, or embodied in, a physical product - (including a physical distribution medium), accompanied by a - written offer, valid for at least three years and valid for as - long as you offer spare parts or customer support for that product - model, to give anyone who possesses the object code either (1) a - copy of the Corresponding Source for all the software in the - product that is covered by this License, on a durable physical - medium customarily used for software interchange, for a price no - more than your reasonable cost of physically performing this - conveying of source, or (2) access to copy the - Corresponding Source from a network server at no charge. - - c) Convey individual copies of the object code with a copy of the - written offer to provide the Corresponding Source. 
This - alternative is allowed only occasionally and noncommercially, and - only if you received the object code with such an offer, in accord - with subsection 6b. - - d) Convey the object code by offering access from a designated - place (gratis or for a charge), and offer equivalent access to the - Corresponding Source in the same way through the same place at no - further charge. You need not require recipients to copy the - Corresponding Source along with the object code. If the place to - copy the object code is a network server, the Corresponding Source - may be on a different server (operated by you or a third party) - that supports equivalent copying facilities, provided you maintain - clear directions next to the object code saying where to find the - Corresponding Source. Regardless of what server hosts the - Corresponding Source, you remain obligated to ensure that it is - available for as long as needed to satisfy these requirements. - - e) Convey the object code using peer-to-peer transmission, provided - you inform other peers where the object code and Corresponding - Source of the work are being offered to the general public at no - charge under subsection 6d. - - A separable portion of the object code, whose source code is excluded -from the Corresponding Source as a System Library, need not be -included in conveying the object code work. - - A "User Product" is either (1) a "consumer product", which means any -tangible personal property which is normally used for personal, family, -or household purposes, or (2) anything designed or sold for incorporation -into a dwelling. In determining whether a product is a consumer product, -doubtful cases shall be resolved in favor of coverage. 
For a particular -product received by a particular user, "normally used" refers to a -typical or common use of that class of product, regardless of the status -of the particular user or of the way in which the particular user -actually uses, or expects or is expected to use, the product. A product -is a consumer product regardless of whether the product has substantial -commercial, industrial or non-consumer uses, unless such uses represent -the only significant mode of use of the product. - - "Installation Information" for a User Product means any methods, -procedures, authorization keys, or other information required to install -and execute modified versions of a covered work in that User Product from -a modified version of its Corresponding Source. The information must -suffice to ensure that the continued functioning of the modified object -code is in no case prevented or interfered with solely because -modification has been made. - - If you convey an object code work under this section in, or with, or -specifically for use in, a User Product, and the conveying occurs as -part of a transaction in which the right of possession and use of the -User Product is transferred to the recipient in perpetuity or for a -fixed term (regardless of how the transaction is characterized), the -Corresponding Source conveyed under this section must be accompanied -by the Installation Information. But this requirement does not apply -if neither you nor any third party retains the ability to install -modified object code on the User Product (for example, the work has -been installed in ROM). - - The requirement to provide Installation Information does not include a -requirement to continue to provide support service, warranty, or updates -for a work that has been modified or installed by the recipient, or for -the User Product in which it has been modified or installed. 
Access to a -network may be denied when the modification itself materially and -adversely affects the operation of the network or violates the rules and -protocols for communication across the network. - - Corresponding Source conveyed, and Installation Information provided, -in accord with this section must be in a format that is publicly -documented (and with an implementation available to the public in -source code form), and must require no special password or key for -unpacking, reading or copying. - - 7. Additional Terms. - - "Additional permissions" are terms that supplement the terms of this -License by making exceptions from one or more of its conditions. -Additional permissions that are applicable to the entire Program shall -be treated as though they were included in this License, to the extent -that they are valid under applicable law. If additional permissions -apply only to part of the Program, that part may be used separately -under those permissions, but the entire Program remains governed by -this License without regard to the additional permissions. - - When you convey a copy of a covered work, you may at your option -remove any additional permissions from that copy, or from any part of -it. (Additional permissions may be written to require their own -removal in certain cases when you modify the work.) You may place -additional permissions on material, added by you to a covered work, -for which you have or can give appropriate copyright permission. 
- - Notwithstanding any other provision of this License, for material you -add to a covered work, you may (if authorized by the copyright holders of -that material) supplement the terms of this License with terms: - - a) Disclaiming warranty or limiting liability differently from the - terms of sections 15 and 16 of this License; or - - b) Requiring preservation of specified reasonable legal notices or - author attributions in that material or in the Appropriate Legal - Notices displayed by works containing it; or - - c) Prohibiting misrepresentation of the origin of that material, or - requiring that modified versions of such material be marked in - reasonable ways as different from the original version; or - - d) Limiting the use for publicity purposes of names of licensors or - authors of the material; or - - e) Declining to grant rights under trademark law for use of some - trade names, trademarks, or service marks; or - - f) Requiring indemnification of licensors and authors of that - material by anyone who conveys the material (or modified versions of - it) with contractual assumptions of liability to the recipient, for - any liability that these contractual assumptions directly impose on - those licensors and authors. - - All other non-permissive additional terms are considered "further -restrictions" within the meaning of section 10. If the Program as you -received it, or any part of it, contains a notice stating that it is -governed by this License along with a term that is a further -restriction, you may remove that term. If a license document contains -a further restriction but permits relicensing or conveying under this -License, you may add to a covered work material governed by the terms -of that license document, provided that the further restriction does -not survive such relicensing or conveying. 
- - If you add terms to a covered work in accord with this section, you -must place, in the relevant source files, a statement of the -additional terms that apply to those files, or a notice indicating -where to find the applicable terms. - - Additional terms, permissive or non-permissive, may be stated in the -form of a separately written license, or stated as exceptions; -the above requirements apply either way. - - 8. Termination. - - You may not propagate or modify a covered work except as expressly -provided under this License. Any attempt otherwise to propagate or -modify it is void, and will automatically terminate your rights under -this License (including any patent licenses granted under the third -paragraph of section 11). - - However, if you cease all violation of this License, then your -license from a particular copyright holder is reinstated (a) -provisionally, unless and until the copyright holder explicitly and -finally terminates your license, and (b) permanently, if the copyright -holder fails to notify you of the violation by some reasonable means -prior to 60 days after the cessation. - - Moreover, your license from a particular copyright holder is -reinstated permanently if the copyright holder notifies you of the -violation by some reasonable means, this is the first time you have -received notice of violation of this License (for any work) from that -copyright holder, and you cure the violation prior to 30 days after -your receipt of the notice. - - Termination of your rights under this section does not terminate the -licenses of parties who have received copies or rights from you under -this License. If your rights have been terminated and not permanently -reinstated, you do not qualify to receive new licenses for the same -material under section 10. - - 9. Acceptance Not Required for Having Copies. - - You are not required to accept this License in order to receive or -run a copy of the Program. 
Ancillary propagation of a covered work -occurring solely as a consequence of using peer-to-peer transmission -to receive a copy likewise does not require acceptance. However, -nothing other than this License grants you permission to propagate or -modify any covered work. These actions infringe copyright if you do -not accept this License. Therefore, by modifying or propagating a -covered work, you indicate your acceptance of this License to do so. - - 10. Automatic Licensing of Downstream Recipients. - - Each time you convey a covered work, the recipient automatically -receives a license from the original licensors, to run, modify and -propagate that work, subject to this License. You are not responsible -for enforcing compliance by third parties with this License. - - An "entity transaction" is a transaction transferring control of an -organization, or substantially all assets of one, or subdividing an -organization, or merging organizations. If propagation of a covered -work results from an entity transaction, each party to that -transaction who receives a copy of the work also receives whatever -licenses to the work the party's predecessor in interest had or could -give under the previous paragraph, plus a right to possession of the -Corresponding Source of the work from the predecessor in interest, if -the predecessor has it or can get it with reasonable efforts. - - You may not impose any further restrictions on the exercise of the -rights granted or affirmed under this License. For example, you may -not impose a license fee, royalty, or other charge for exercise of -rights granted under this License, and you may not initiate litigation -(including a cross-claim or counterclaim in a lawsuit) alleging that -any patent claim is infringed by making, using, selling, offering for -sale, or importing the Program or any portion of it. - - 11. Patents. 
- - A "contributor" is a copyright holder who authorizes use under this -License of the Program or a work on which the Program is based. The -work thus licensed is called the contributor's "contributor version". - - A contributor's "essential patent claims" are all patent claims -owned or controlled by the contributor, whether already acquired or -hereafter acquired, that would be infringed by some manner, permitted -by this License, of making, using, or selling its contributor version, -but do not include claims that would be infringed only as a -consequence of further modification of the contributor version. For -purposes of this definition, "control" includes the right to grant -patent sublicenses in a manner consistent with the requirements of -this License. - - Each contributor grants you a non-exclusive, worldwide, royalty-free -patent license under the contributor's essential patent claims, to -make, use, sell, offer for sale, import and otherwise run, modify and -propagate the contents of its contributor version. - - In the following three paragraphs, a "patent license" is any express -agreement or commitment, however denominated, not to enforce a patent -(such as an express permission to practice a patent or covenant not to -sue for patent infringement). To "grant" such a patent license to a -party means to make such an agreement or commitment not to enforce a -patent against the party. 
- - If you convey a covered work, knowingly relying on a patent license, -and the Corresponding Source of the work is not available for anyone -to copy, free of charge and under the terms of this License, through a -publicly available network server or other readily accessible means, -then you must either (1) cause the Corresponding Source to be so -available, or (2) arrange to deprive yourself of the benefit of the -patent license for this particular work, or (3) arrange, in a manner -consistent with the requirements of this License, to extend the patent -license to downstream recipients. "Knowingly relying" means you have -actual knowledge that, but for the patent license, your conveying the -covered work in a country, or your recipient's use of the covered work -in a country, would infringe one or more identifiable patents in that -country that you have reason to believe are valid. - - If, pursuant to or in connection with a single transaction or -arrangement, you convey, or propagate by procuring conveyance of, a -covered work, and grant a patent license to some of the parties -receiving the covered work authorizing them to use, propagate, modify -or convey a specific copy of the covered work, then the patent license -you grant is automatically extended to all recipients of the covered -work and works based on it. - - A patent license is "discriminatory" if it does not include within -the scope of its coverage, prohibits the exercise of, or is -conditioned on the non-exercise of one or more of the rights that are -specifically granted under this License. 
You may not convey a covered -work if you are a party to an arrangement with a third party that is -in the business of distributing software, under which you make payment -to the third party based on the extent of your activity of conveying -the work, and under which the third party grants, to any of the -parties who would receive the covered work from you, a discriminatory -patent license (a) in connection with copies of the covered work -conveyed by you (or copies made from those copies), or (b) primarily -for and in connection with specific products or compilations that -contain the covered work, unless you entered into that arrangement, -or that patent license was granted, prior to 28 March 2007. - - Nothing in this License shall be construed as excluding or limiting -any implied license or other defenses to infringement that may -otherwise be available to you under applicable patent law. - - 12. No Surrender of Others' Freedom. - - If conditions are imposed on you (whether by court order, agreement or -otherwise) that contradict the conditions of this License, they do not -excuse you from the conditions of this License. If you cannot convey a -covered work so as to satisfy simultaneously your obligations under this -License and any other pertinent obligations, then as a consequence you may -not convey it at all. For example, if you agree to terms that obligate you -to collect a royalty for further conveying from those to whom you convey -the Program, the only way you could satisfy both those terms and this -License would be to refrain entirely from conveying the Program. - - 13. Remote Network Interaction; Use with the GNU General Public License. 
- - Notwithstanding any other provision of this License, if you modify the -Program, your modified version must prominently offer all users -interacting with it remotely through a computer network (if your version -supports such interaction) an opportunity to receive the Corresponding -Source of your version by providing access to the Corresponding Source -from a network server at no charge, through some standard or customary -means of facilitating copying of software. This Corresponding Source -shall include the Corresponding Source for any work covered by version 3 -of the GNU General Public License that is incorporated pursuant to the -following paragraph. - - Notwithstanding any other provision of this License, you have -permission to link or combine any covered work with a work licensed -under version 3 of the GNU General Public License into a single -combined work, and to convey the resulting work. The terms of this -License will continue to apply to the part which is the covered work, -but the work with which it is combined will remain governed by version -3 of the GNU General Public License. - - 14. Revised Versions of this License. - - The Free Software Foundation may publish revised and/or new versions of -the GNU Affero General Public License from time to time. Such new versions -will be similar in spirit to the present version, but may differ in detail to -address new problems or concerns. - - Each version is given a distinguishing version number. If the -Program specifies that a certain numbered version of the GNU Affero General -Public License "or any later version" applies to it, you have the -option of following the terms and conditions either of that numbered -version or of any later version published by the Free Software -Foundation. If the Program does not specify a version number of the -GNU Affero General Public License, you may choose any version ever published -by the Free Software Foundation. 
- - If the Program specifies that a proxy can decide which future -versions of the GNU Affero General Public License can be used, that proxy's -public statement of acceptance of a version permanently authorizes you -to choose that version for the Program. - - Later license versions may give you additional or different -permissions. However, no additional obligations are imposed on any -author or copyright holder as a result of your choosing to follow a -later version. - - 15. Disclaimer of Warranty. - - THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY -APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT -HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY -OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, -THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM -IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF -ALL NECESSARY SERVICING, REPAIR OR CORRECTION. - - 16. Limitation of Liability. - - IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING -WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS -THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY -GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE -USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF -DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD -PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), -EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF -SUCH DAMAGES. - - 17. Interpretation of Sections 15 and 16. 
- - If the disclaimer of warranty and limitation of liability provided -above cannot be given local legal effect according to their terms, -reviewing courts shall apply local law that most closely approximates -an absolute waiver of all civil liability in connection with the -Program, unless a warranty or assumption of liability accompanies a -copy of the Program in return for a fee. - - END OF TERMS AND CONDITIONS - - How to Apply These Terms to Your New Programs - - If you develop a new program, and you want it to be of the greatest -possible use to the public, the best way to achieve this is to make it -free software which everyone can redistribute and change under these terms. - - To do so, attach the following notices to the program. It is safest -to attach them to the start of each source file to most effectively -state the exclusion of warranty; and each file should have at least -the "copyright" line and a pointer to where the full notice is found. - - - Copyright (C) - - This program is free software: you can redistribute it and/or modify - it under the terms of the GNU Affero General Public License as published - by the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU Affero General Public License for more details. - - You should have received a copy of the GNU Affero General Public License - along with this program. If not, see . - -Also add information on how to contact you by electronic and paper mail. - - If your software can interact with users remotely through a computer -network, you should also make sure that it provides a way for users to -get its source. For example, if your program is a web application, its -interface could display a "Source" link that leads users to an archive -of the code. 
There are many ways you could offer source, and different -solutions will be better for different programs; see section 13 for the -specific requirements. - - You should also get your employer (if you work as a programmer) or school, -if any, to sign a "copyright disclaimer" for the program, if necessary. -For more information on this, and how to apply and follow the GNU AGPL, see -. +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/README.md b/README.md index 921658e..a40d8fe 100644 --- a/README.md +++ b/README.md @@ -191,16 +191,21 @@ Copyright (C) 2024 Genome Research Ltd. Author: Alex Byrne -This program is free software: you can redistribute it and/or modify -it under the terms of the GNU Affero General Public License as published by -the Free Software Foundation, either version 3 of the License, or -(at your option) any later version. - -This program is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -GNU Affero General Public License for more details. - -You should have received a copy of the GNU Affero General Public License -along with this program. If not, see . 
+Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. ``` diff --git a/hairpin2/__init__.py b/hairpin2/__init__.py index c7fe64c..cd85363 100644 --- a/hairpin2/__init__.py +++ b/hairpin2/__init__.py @@ -1,23 +1,3 @@ -# hairpin2 -# -# Copyright (C) 2024 Genome Research Ltd. -# -# Author: Alex Byrne -# -# This program is free software: you can redistribute it and/or modify -# it under the terms of the GNU Affero General Public License as published by -# the Free Software Foundation, either version 3 of the License, or -# (at your option) any later version. - -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU Affero General Public License for more details. - -# You should have received a copy of the GNU Affero General Public License -# along with this program. If not, see . 
- - def _set_version() -> str: # noqa: C901 """Set the package version from the project metadata in pyproject.toml.""" from warnings import warn diff --git a/hairpin2/constants.py b/hairpin2/constants.py index 661f398..8154573 100644 --- a/hairpin2/constants.py +++ b/hairpin2/constants.py @@ -1,23 +1,3 @@ -# hairpin2 -# -# Copyright (C) 2024 Genome Research Ltd. -# -# Author: Alex Byrne -# -# This program is free software: you can redistribute it and/or modify -# it under the terms of the GNU Affero General Public License as published by -# the Free Software Foundation, either version 3 of the License, or -# (at your option) any later version. - -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU Affero General Public License for more details. - -# You should have received a copy of the GNU Affero General Public License -# along with this program. If not, see . - - from enum import IntEnum, Flag from typing import Callable import dataclasses as d diff --git a/hairpin2/helpers.py b/hairpin2/helpers.py index f5113ab..a644d87 100644 --- a/hairpin2/helpers.py +++ b/hairpin2/helpers.py @@ -1,23 +1,3 @@ -# hairpin2 -# -# Copyright (C) 2024 Genome Research Ltd. -# -# Author: Alex Byrne -# -# This program is free software: you can redistribute it and/or modify -# it under the terms of the GNU Affero General Public License as published by -# the Free Software Foundation, either version 3 of the License, or -# (at your option) any later version. - -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU Affero General Public License for more details. - -# You should have received a copy of the GNU Affero General Public License -# along with this program. If not, see . 
- - from enum import IntEnum, Flag import logging import sys diff --git a/hairpin2/main.py b/hairpin2/main.py index b58ec11..cd87fc3 100644 --- a/hairpin2/main.py +++ b/hairpin2/main.py @@ -4,18 +4,23 @@ # # Author: Alex Byrne # -# This program is free software: you can redistribute it and/or modify -# it under the terms of the GNU Affero General Public License as published by -# the Free Software Foundation, either version 3 of the License, or -# (at your option) any later version. - -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU Affero General Public License for more details. - -# You should have received a copy of the GNU Affero General Public License -# along with this program. If not, see . +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
import pysam diff --git a/hairpin2/ref2seq.py b/hairpin2/ref2seq.py index c923255..e514e4d 100644 --- a/hairpin2/ref2seq.py +++ b/hairpin2/ref2seq.py @@ -4,18 +4,23 @@ # # Author: Alex Byrne # -# This program is free software: you can redistribute it and/or modify -# it under the terms of the GNU Affero General Public License as published by -# the Free Software Foundation, either version 3 of the License, or -# (at your option) any later version. - -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU Affero General Public License for more details. - -# You should have received a copy of the GNU Affero General Public License -# along with this program. If not, see . +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
import pysam diff --git a/pyproject.toml b/pyproject.toml index 3123603..4c4977b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -3,7 +3,7 @@ name = "hairpin2" version = "1.0.0" description = "CLI implementation of the artefact detection algorithm concieved by Ellis et al, 2020." authors = ["Alex Byrne "] -license = "AGPL3" +license = "MIT" readme = "README.md" [tool.poetry.dependencies] From 93d3d30d2c1f5195b086de4f6668c6db6862471b Mon Sep 17 00:00:00 2001 From: ab63 Date: Mon, 4 Nov 2024 12:12:04 +0000 Subject: [PATCH 165/165] fix deletion assumption, licenses --- CHANGES.md | 10 -- README.md | 6 +- hairpin2/constants.py | 24 +++++ hairpin2/helpers.py | 24 +++++ hairpin2/main.py | 6 +- hairpin2/ref2seq.py | 1 - internal_doc.md | 131 ------------------------ test/test_flag_read_alt_validate.py | 32 +++++- test/test_flag_read_broad_valiate.py | 24 +++++ test/test_is_variant_AD_validate.py | 24 +++++ test/test_is_variant_AL_validate.py | 24 +++++ test/test_ref2querypos_validate.py | 24 +++++ test/test_ref_end_via_cigar_validate.py | 24 +++++ 13 files changed, 202 insertions(+), 152 deletions(-) delete mode 100644 internal_doc.md diff --git a/CHANGES.md b/CHANGES.md index c51e12a..8b13789 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -1,11 +1 @@ -### 0.0.2a -- CRAM/SAM support -- JSON argument input/output -- added ability to ignore samples in VCF (e.g. normal) by not providing alignments for them to the -a flag -- added `--name-mapping` flag to allow mapping VCF sample names to alignment SM tags when they do not match, as these are used to connect VCF samples with alignments -- improved doc, helptext, and argument clarity - -### 0.0.1a - -- first release diff --git a/README.md b/README.md index a40d8fe..fa0f761 100644 --- a/README.md +++ b/README.md @@ -2,7 +2,7 @@ `hairpin2` – CLI implementation of the hairpin detection algorithm concieved by [Ellis et al, 2020](https://www.nature.com/articles/s41596-020-00437-6). 
-`hairpin2` is designed to flag variants with anomalous distributions indicating that they are artefactual. Initially, it was concieved to flag possible cruciform artefacts for LCM sequence data, but the concept extends to other artefacts including artefactual indels. It operates on a VCF file containing one or more samples, and alignment files for all samples to be tested. +`hairpin2` is designed to flag variants with anomalous distributions indicating that they are artefactual. Initially, it was conceived to flag possible cruciform artefacts for LCM sequence data, but the concept has been extended to other artefacts including artefactual indels. It operates on a VCF file containing one or more samples, and alignment files for all samples to be tested. Given a VCF, and BAM files for the samples of that VCF, return a VCF with variants flagged with `ADF` if variants have anomalous distributions indicating that they are likely to be artefactual, and `ALF` if relevant reads have lower median alignment score per base than a specified threshold. @@ -35,9 +35,9 @@ export PATH=${PATH}:${INST_PATH}/bin hairpin -h ``` -### ASSUMPTIONS +### ASSUMPTIONS & LIMITATIONS -`hairpin2` is designed for paired data where alignment records have the `MC` tag and the complete CIGAR string is present in the `CIGAR` field (rather than the `CG:B,I` tag). If the `MC` tag is not present in your data, it can be added using `samtools fixmate` or `biobambam2 bamsormadup`. No further assumptions are made – other alignment tags and VCF fields are used, however they are mandatory per the relevant format specifications. +`hairpin2` is designed for paired data where alignment records have the `MC` tag and the complete CIGAR string is present in the `CIGAR` field (rather than the `CG:B,I` tag). If the `MC` tag is not present in your data, it can be added using `samtools fixmate` or `biobambam2 bamsormadup`. The tool can handle substitutions, insertions, and deletions formatted per the VCF specification. 
At this time, the tool will not investigate mutations notated with angle brackets, e.g. ``, complex mutations, or monomorphic reference. No further assumptions are made – other alignment tags and VCF fields are used, however they are mandatory per the relevant format specifications. If these requirements are limiting and you need the tool to be extended in some way, please request it. ### USAGE diff --git a/hairpin2/constants.py b/hairpin2/constants.py index 8154573..6dedcf0 100644 --- a/hairpin2/constants.py +++ b/hairpin2/constants.py @@ -1,3 +1,27 @@ +# hairpin2 +# +# Copyright (C) 2024 Genome Research Ltd. +# +# Author: Alex Byrne +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
+ from enum import IntEnum, Flag from typing import Callable import dataclasses as d diff --git a/hairpin2/helpers.py b/hairpin2/helpers.py index a644d87..42f3abf 100644 --- a/hairpin2/helpers.py +++ b/hairpin2/helpers.py @@ -1,3 +1,27 @@ +# hairpin2 +# +# Copyright (C) 2024 Genome Research Ltd. +# +# Author: Alex Byrne +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + from enum import IntEnum, Flag import logging import sys diff --git a/hairpin2/main.py b/hairpin2/main.py index cd87fc3..771d910 100644 --- a/hairpin2/main.py +++ b/hairpin2/main.py @@ -22,7 +22,6 @@ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. 
- import pysam from hairpin2 import ref2seq as r2s, constants as c, helpers as h import hairpin2 @@ -128,7 +127,7 @@ def flag_read_alt( invalid_flag |= c.ValidatorFlags.NOT_ALT.value # DEL if mut_type == 'D': - rng = list(range(vcf_start - 1, vcf_stop + 1)) + rng = list(range(vcf_start, vcf_stop + 1)) mut_alns = [q for q, r in read.get_aligned_pairs() @@ -377,7 +376,8 @@ def test_record_all_alts( if (vcf_rec.rlen == len(alt) and set(alt).issubset(set(['A', 'C', 'T', 'G', 'N', '*']))): mut_type = 'S' - elif len(alt) < vcf_rec.rlen or alt == '.': # DEL - DOES NOT SUPPORT TYPE IDS + elif (len(alt) < vcf_rec.rlen + and set(alt).issubset(set(['A', 'C', 'T', 'G', 'N', '*']))): # DEL - DOES NOT SUPPORT TYPE IDS OR . mut_type = 'D' elif (vcf_rec.rlen == 1 and set(alt).issubset(set(['A', 'C', 'T', 'G', 'N', '*']))): # INS - DOES NOT SUPPORT TYPE IDS diff --git a/hairpin2/ref2seq.py b/hairpin2/ref2seq.py index e514e4d..355dc02 100644 --- a/hairpin2/ref2seq.py +++ b/hairpin2/ref2seq.py @@ -22,7 +22,6 @@ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. - import pysam diff --git a/internal_doc.md b/internal_doc.md deleted file mode 100644 index f9a0ec3..0000000 --- a/internal_doc.md +++ /dev/null @@ -1,131 +0,0 @@ -### INTRODUCTION - -`hairpin2` - CLI implementation of the hairpin detection algorithm concieved by [Ellis et al, 2020](https://www.nature.com/articles/s41596-020-00437-6). Implemented by Peter Campbell and Alex Byrne (primary contact for this tool - ab63). - -For paired data, given a VCF, and BAM files for the samples of that VCF, return a VCF with variants flagged with `HPF` if they are suspected cruciform artefacts, and `ALF` if relevant reads have lower median alignment score per base than a specified threshold. The `ALF` filter indicates poor signal-to-noise, and provides additional confidence in the `HPF` filter – cruciform artefacts usually cause a marked decrease in alignment score. 
The `ALF` flag also may appear on variants without `HPF`, often indicating other artefacts associated with poor signal-to-noise. - -`hairpin2` has been designed to replace `AdditionalBamStatistics`, which forms a key part of the the LCM processing pipelines known as "Mathijs' Scripts" and "Tim Butler's scripts" (there may also be other names and other pipelines which incoroprate this tool). - -Improvements and differences to the original `AdditionalBamStatistics` implementation include: -- No more ambiguous/cryptic/unfixable errors – the tool should work on all appropriate data, and if it is unable to produce the expected output it will clearly inform the user (but see N.B. at end of this section) -- Transparency – reasoning for flagging decisions logged in VCF -- Centrally maintained and versioned – for reproducibility/citing/distribution -- Significant speedup (on testing data at least) – 50s runtime on 542-variant caveman VCF -- The module adds **filter flags**, `HPF` and `ALF`, to a VCF. It **does not** output into separate files containing passed and failed positions -- The `ALF` flag supersedes the `ASRD` info field - -**N.B.** this program is currently in an alpha/testing phase – it is available on the farm, but is likely to change, or have new features added, rapidly, per user responses. **It also may be broken in some way; if so please get in touch**. It is not currently publicly available – it will be made public as soon as it is out of this alpha phase. - - -### MODULE ACCESS - -For local or VM use, see README for install instructions. -For farm22 use, available as a module. -``` -module avail hairpin2 -module load -``` -**N.B. do not confuse with the module `hairpin` – this is `hairpin2`**. `hairpin` was a stopgap version of Mathijs' Scripts that relied on some of Mathijs' original code, and was unreliable and error prone. 
- - -### ASSUMPTIONS - -`hairpin2` is designed for paired data where alignment records have the `MC` tag and the complete CIGAR string is present in the `CIGAR` field (rather than the `CG:B,I` tag). If the `MC` tag is not present in your data, it can be added using `samtools fixmate` or `biobambam2 bamsormadup`. No further assumptions are made – other alignment tags and VCF fields are used, however they are mandatory per the relevant format specifications. - - -### USAGE - -``` -usage: hairpin2 [-h] [-v] -i VCF_IN -o VCF_OUT -a ALIGNMENTS [ALIGNMENTS ...] -f {s,b,c} [-al AL_FILTER_THRESHOLD] [-mc MIN_CLIP_QUALITY] [-mq MIN_MAPPING_QUALITY] - [-mb MIN_BASE_QUALITY] [-ms MAX_READ_SPAN] [-pf POSITION_FRACTION] [-r CRAM_REFERENCE] [-m VCF:aln [VCF:aln ...]] [-ji INPUT_JSON] [-jo OUTPUT_JSON] - -cruciform artefact flagging algorithm based on Ellis et al. 2020 (DOI: 10.1038/s41596-020-00437-6) - -info: - -h, --help show this help message and exit - -v, --version print version - -mandatory: - -i VCF_IN, --vcf-in VCF_IN - path to input VCF - -o VCF_OUT, --vcf-out VCF_OUT - path to write output VCF - -a ALIGNMENTS [ALIGNMENTS ...], --alignments ALIGNMENTS [ALIGNMENTS ...] 
- list of paths to (S/B/CR)AMs (indicated by --format) for samples in input VCF, whitespace separated - (s/b/cr)ai expected in same directories - -f {s,b,c}, --format {s,b,c} - format of alignment files; s indicates SAM, b indicates BAM, and c indicates CRAM - -extended: - -al AL_FILTER_THRESHOLD, --al-filter-threshold AL_FILTER_THRESHOLD - threshold for median of read alignment score per base of all relevant reads, below which a variant is flagged as ALF - default: 0.93 - -mc MIN_CLIP_QUALITY, --min-clip-quality MIN_CLIP_QUALITY - discard reads with mean base quality of aligned bases below this value, if they have soft-clipped bases - default: 35 - -mq MIN_MAPPING_QUALITY, --min-mapping-quality MIN_MAPPING_QUALITY - discard reads with mapping quality below this value - default: 11 - -mb MIN_BASE_QUALITY, --min-base-quality MIN_BASE_QUALITY - discard reads with base quality at variant position below this value - default: 25 - -ms MAX_READ_SPAN, --max-read-span MAX_READ_SPAN - maximum +- position to use when detecting PCR duplicates - default: 6 - -pf POSITION_FRACTION, --position-fraction POSITION_FRACTION - >90% of variant must occur within POSITION_FRACTION of read edges to allow HPF flag - default: 0.15 - -procedural: - -r CRAM_REFERENCE, --cram-reference CRAM_REFERENCE - path to FASTA format CRAM reference, overrides $REF_PATH and UR tags - ignored if --format is not CRAM - -m VCF:aln [VCF:aln ...], --name-mapping VCF:aln [VCF:aln ...] 
- map VCF sample names to alignment SM tags; useful if they differ - -ji INPUT_JSON, --input-json INPUT_JSON - path to JSON of input parameters, from which extended arguments will be loaded - overridden by arguments provided on command line - -jo OUTPUT_JSON, --output-json OUTPUT_JSON - log input arguments to JSON -``` - -**N.B.** the above usage block indicates the call for the tool is `hairpin2` – this is correct for local/vm installs, but for farm usage, for the time being, it is `hairpin2-alpha` - -Parameters are hopefully mostly clear from the helptext, but some warrant further explanation: - -- `--al-filter-threshold` – the default value of 0.93 was arrived at by trial and error – since different aligners/platforms calculate alignment score differently, you may want to modify this value appropriately. In "Mathijs' Scripts", the default was set at 0.87 for filtering on `ASRD`. -- `--max-read-span` – long homopolymer tracts can cause stuttering, where a PCR duplicate will have, for example, an additional A in a tract of As. These reads will align a base or two earlier on the reference genome than they should. As a result pcr duplicate flag machinery fails and they are not flagged as duplicates. `MAX_READ_SPAN` is then the maximum +- position to use when detecting PCR duplicates. -- `--position-fraction` – cruciform artefacts usually contain segments that do not align to the reference genome, resulting in the segment being soft-clipped. The subsequent aligned portion will then contain false variants, which arise from the artefact. These false variants appear with anomalous regularity at alignment boundaries – unlike true variants. If, for a given variant, more than 90% of the variant bases are within `POSITION_FRACTION` of read edges, allow for calling `HPF` flag. 
- -##### Usage in context of Mathijs' pipeline - -This section is under construction - if you have questions in the meantime please ask Rashesh - -> Mathijs' LCM filters include the following steps: -> 1. Preselect: Filters the CaVEMan calls for “PASS” && “CLPM=0” && “ASMD>=140” -> 2. Hairpin Filtering -> 3. Filtering based on fragment numbers. -> -> Which are split across the following steps: (As per his scripts) -> - preselect -> - imitateANNOVAR -> - annotateBAMStatistics -> - additionalBAMStatistics -> - filtering -> -> The `hairpin2` module replaces the “additionalBAMStatistics” and most of the “filtering” code. So [one may still need] to run the preselect [sans the ASMD filter] and fragment based filter. - -(pre)filtering is not performed by this module, as the filtering is not relevant to hairpin detection and should be performed separately. Filtering can be performed using the `vcfilter` or `bcftools` modules. - - - -### DETAILS - -The tool tests records in a VCF file and applies the `HPF` and `ALF` filter flags as appropriate. Reasoning for decisions is recorded in the INFO field of the VCF records, in the form `HPF=|` and `ALF=||`. The codes are as follows: - -**0** – passed/failed on condition 60A(i) of Ellis et al. (`HPF` only) -**1** – passed/failed on condition 60B(i) of Ellis et al. (`HPF` only) -**2** – passed/failed on filter threshold (`ALF` only) -**3** – insufficient appropriate reads to support calling flag – this covers a lot of possibilities, if more granularity is desired, please request it -**4** – no samples have non 0,0 genotype for the record - - -The basic procedure of this implementation is as follows. For each record in the VCF, test every alt for that record by: -1. retrieving reads from samples exhibiting the mutations -2. testing each read for validity for use in hairpin testing (i.e. base quality, do they express the correct alt, and so on) -3. 
performing statistical analysis on aggregates of the position of the mutation relative to the start and end of the aligned portion of the reads -4. on the results of the statistical analysis, pass or fail the record for the filters `ALF` and `HPF`, and log a code and relevant info to the `INFO` field indicating the reason for the decision - -The code has been written with the intention of clarity and extensibility – further understanding may be achieved by reading `hairpin2/main.py`. - diff --git a/test/test_flag_read_alt_validate.py b/test/test_flag_read_alt_validate.py index 0fb6c2a..c0ae24b 100644 --- a/test/test_flag_read_alt_validate.py +++ b/test/test_flag_read_alt_validate.py @@ -1,3 +1,27 @@ +# hairpin2 +# +# Copyright (C) 2024 Genome Research Ltd. +# +# Author: Alex Byrne +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
+ from hairpin2.main import flag_read_alt from hairpin2 import constants as c import pysam @@ -88,8 +112,8 @@ def test_path_del_bad_op(): expected = c.ValidatorFlags.BAD_OP.value result = flag_read_alt(read=r, vcf_start=99, - vcf_stop=100, - alt='.', + vcf_stop=101, + alt='C', mut_type='D', min_basequal=25) assert expected == result @@ -102,9 +126,9 @@ def test_path_good_del(): rc = copy.deepcopy(r) rc.cigarstring = '4M2D6M' result = flag_read_alt(read=rc, - vcf_start=99, + vcf_start=98, vcf_stop=101, - alt='.', + alt='CC', mut_type='D', min_basequal=25) assert expected == result diff --git a/test/test_flag_read_broad_valiate.py b/test/test_flag_read_broad_valiate.py index 48d51b2..93b548f 100644 --- a/test/test_flag_read_broad_valiate.py +++ b/test/test_flag_read_broad_valiate.py @@ -1,3 +1,27 @@ +# hairpin2 +# +# Copyright (C) 2024 Genome Research Ltd. +# +# Author: Alex Byrne +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
+ from hairpin2.main import flag_read_broad from hairpin2 import constants as c import pysam diff --git a/test/test_is_variant_AD_validate.py b/test/test_is_variant_AD_validate.py index f7e3e1f..c5a4621 100644 --- a/test/test_is_variant_AD_validate.py +++ b/test/test_is_variant_AD_validate.py @@ -1,3 +1,27 @@ +# hairpin2 +# +# Copyright (C) 2024 Genome Research Ltd. +# +# Author: Alex Byrne +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + from hairpin2.main import is_variant_AD from hairpin2 import constants as c import pysam diff --git a/test/test_is_variant_AL_validate.py b/test/test_is_variant_AL_validate.py index 7eed3e7..2197a1e 100644 --- a/test/test_is_variant_AL_validate.py +++ b/test/test_is_variant_AL_validate.py @@ -1,3 +1,27 @@ +# hairpin2 +# +# Copyright (C) 2024 Genome Research Ltd. 
+# +# Author: Alex Byrne +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + from hairpin2.main import is_variant_AL from hairpin2 import constants as c import pysam diff --git a/test/test_ref2querypos_validate.py b/test/test_ref2querypos_validate.py index a1d4b9c..a9eee2b 100644 --- a/test/test_ref2querypos_validate.py +++ b/test/test_ref2querypos_validate.py @@ -1,3 +1,27 @@ +# hairpin2 +# +# Copyright (C) 2024 Genome Research Ltd. 
+# +# Author: Alex Byrne +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + from hairpin2.ref2seq import ref2querypos import pytest import pysam diff --git a/test/test_ref_end_via_cigar_validate.py b/test/test_ref_end_via_cigar_validate.py index c2e48e5..28d3cb2 100644 --- a/test/test_ref_end_via_cigar_validate.py +++ b/test/test_ref_end_via_cigar_validate.py @@ -1,3 +1,27 @@ +# hairpin2 +# +# Copyright (C) 2024 Genome Research Ltd. 
+# +# Author: Alex Byrne +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + from hairpin2.ref2seq import ref_end_via_cigar import pytest