From d0f417c13da93a9bcbf2bedb5bc1062d3ccfce1f Mon Sep 17 00:00:00 2001
From: chaklim
Date: Tue, 14 Jan 2020 15:24:39 +0800
Subject: [PATCH 1/3] Fix AF bug

- no need to count insertions and deletions in the depth calculation
- refactor to remove the nested ifs

---
 dataPrepScripts/ExtractVariantCandidates.py | 70 +++++++++++----------
 1 file changed, 38 insertions(+), 32 deletions(-)

diff --git a/dataPrepScripts/ExtractVariantCandidates.py b/dataPrepScripts/ExtractVariantCandidates.py
index 3c743e6..4e3dc17 100644
--- a/dataPrepScripts/ExtractVariantCandidates.py
+++ b/dataPrepScripts/ExtractVariantCandidates.py
@@ -319,15 +319,17 @@ def make_candidates(args):
     positions = [x for x in pileup.keys() if x < POS] if not is_finish_reading_output else list(pileup.keys())
     positions.sort()
     for zero_based_position in positions:
-        baseCount = depth = reference_base = temp_key = None
+        base_count = depth = reference_base = temp_key = None
 
         # ctg and bed checking (region [ctg_start, ctg_end] is 1-based, inclusive start and end positions)
         pass_ctg = not is_ctg_range_given or ctg_start <= zero_based_position+1 <= ctg_end
         pass_bed = not is_bed_file_given or is_region_in(tree, ctg_name, zero_based_position)
+        if not pass_bed or not pass_ctg:
+            continue
 
         # output probability checking
         pass_output_probability = True
-        if pass_ctg and pass_bed and is_building_training_dataset and is_variant_file_given:
+        if is_building_training_dataset and is_variant_file_given:
             temp_key = ctg_name + ":" + str(zero_based_position+1)
             pass_output_probability = (
                 temp_key not in variants_map and (
@@ -335,43 +337,47 @@ def make_candidates(args):
                     (temp_key not in non_variants_map and random.uniform(0, 1) <= output_probability_outside_variant)
                 )
             )
-        elif pass_ctg and pass_bed and is_building_training_dataset:
+        elif is_building_training_dataset:
             pass_output_probability = random.uniform(0, 1) <= output_probability
+        if not pass_output_probability:
+            continue
 
-        # depth checking
-        pass_depth = False
-        if pass_ctg and pass_bed and pass_output_probability:
-            baseCount = list(pileup[zero_based_position].items())
-            depth = sum(x[1] for x in baseCount)
-            pass_depth = depth >= minimum_depth_for_candidate
-
-        # af checking
-        pass_af = False
-        if pass_ctg and pass_bed and pass_output_probability and pass_depth:
+        # for depth checking and af checking
+        try:
             reference_base = evc_base_from(reference_sequence[
                 zero_based_position - (0 if reference_start is None else (reference_start - 1))
             ])
-            denominator = depth if depth > 0 else 1
-            baseCount.sort(key=lambda x: -x[1])  # sort baseCount descendingly
-            p0, p1 = float(baseCount[0][1]) / denominator, float(baseCount[1][1]) / denominator
-            pass_af = (
-                (p0 <= 1.0 - minimum_af_for_candidate and p1 >= minimum_af_for_candidate) or
-                baseCount[0][0] != reference_base
-            )
+            position_dict = pileup[zero_based_position]
+        except:
+            continue
+
+        # depth checking
+        base_count = list(position_dict.items())
+        depth = sum(x[1] for x in base_count) - position_dict["I"] - position_dict["D"]
+        if depth < minimum_depth_for_candidate:
+            continue
+
+        # af checking
+        denominator = depth if depth > 0 else 1
+        base_count.sort(key=lambda x: -x[1])  # sort base_count in descending order
+        pass_af = (
+            base_count[0][0] != reference_base or
+            (float(base_count[1][1]) / denominator) >= minimum_af_for_candidate
+        )
+        if not pass_af:
+            continue
 
         # output 1-based candidate
-        need_output_candidate = pass_ctg and pass_bed and pass_output_probability and pass_depth and pass_af
-        if need_output_candidate:
-            if temp_key is not None and temp_key in non_variants_map:
-                no_of_candidates_near_variant += 1
-            elif temp_key is not None and temp_key not in non_variants_map:
-                no_of_candidates_outside_variant += 1
-
-            output = [ctg_name, zero_based_position+1, reference_base, depth]
-            output.extend(["%s %d" % x for x in baseCount])
-            output = " ".join([str(x) for x in output]) + "\n"
-
-            can_fp.stdin.write(output)
+        if temp_key is not None and temp_key in non_variants_map:
+            no_of_candidates_near_variant += 1
+        elif temp_key is not None and temp_key not in non_variants_map:
+            no_of_candidates_outside_variant += 1
+
+        output = [ctg_name, zero_based_position+1, reference_base, depth]
+        output.extend(["%s %d" % x for x in base_count])
+        output = " ".join([str(x) for x in output]) + "\n"
+
+        can_fp.stdin.write(output)
 
     for zero_based_position in positions:
         del pileup[zero_based_position]
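Annotation on PATCH 1/3: the sketch below isolates the corrected depth/AF logic so the behavioural change can be tested on its own. The function name, thresholds, and Counter-based pileup are illustrative assumptions for this note, not the real interface of ExtractVariantCandidates.py.

    # A minimal sketch of the corrected candidate check, under assumed names.
    from collections import Counter

    def passes_depth_and_af(pileup_counts, reference_base,
                            minimum_depth=10, minimum_af=0.3):
        base_count = list(pileup_counts.items())
        # The fix: "I" (insertion) and "D" (deletion) counts no longer inflate
        # the depth used for the depth filter and the AF denominator.
        depth = (sum(count for _, count in base_count)
                 - pileup_counts.get("I", 0) - pileup_counts.get("D", 0))
        if depth < minimum_depth:
            return False
        denominator = depth if depth > 0 else 1
        base_count.sort(key=lambda x: -x[1])  # most supported symbol first
        if len(base_count) < 2:
            return base_count[0][0] != reference_base
        # A candidate needs a non-reference top symbol, or a runner-up symbol
        # reaching the minimum allele frequency.
        return (base_count[0][0] != reference_base
                or float(base_count[1][1]) / denominator >= minimum_af)

    # 12 insertion-supporting reads: AF = 12/36 ~= 0.33 under the fix (True),
    # whereas the pre-fix denominator of 48 would give 0.25 (False).
    print(passes_depth_and_af(Counter({"A": 30, "C": 6, "I": 12}), "A"))

Presumably the point is that a read carrying an insertion is already counted once through its aligned base, so the extra "I" entry double-counted it and deflated the computed allele frequency; the commit message's "no need to count insertions and deletions" reads that way.
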
From 3aab91a8095c13364c98b97038ab2715d5a23848 Mon Sep 17 00:00:00 2001
From: chaklim
Date: Fri, 17 Jan 2020 01:07:33 +0800
Subject: [PATCH 2/3] Fix insertion bug

- remove SNP information before inferring insertion bases

---
 clair/call_var.py | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/clair/call_var.py b/clair/call_var.py
index 6dc09de..8a869bb 100644
--- a/clair/call_var.py
+++ b/clair/call_var.py
@@ -405,6 +405,10 @@ def inferred_insertion_bases_from(tensor_input):
         for base_index in range(0, 4):
             insertion_tensor[base_index] = insertion_tensor[base_index] + insertion_tensor[base_index + 4]
             insertion_tensor[base_index + 4] = 0
+            insertion_tensor[base_index] -= (
+                tensor_input[position, base_index, Channel.SNP] + tensor_input[position, base_index + 4, Channel.SNP]
+            )
+
         if (
             position < (flanking_base_number + minimum_variant_length_that_need_infer) or
             sum(insertion_tensor) >= inferred_indel_length_minimum_allele_frequency * sum(reference_tensor)
@@ -437,6 +441,10 @@ def insertion_bases_using_tensor(tensor_input, variant_length):
         for base_index in range(0, 4):
             insertion_tensor[base_index] = insertion_tensor[base_index] + insertion_tensor[base_index + 4]
             insertion_tensor[base_index + 4] = 0
+            insertion_tensor[base_index] -= (
+                tensor_input[position, base_index, Channel.SNP] + tensor_input[position, base_index + 4, Channel.SNP]
+            )
+
         insertion_bases += num2base[np.argmax(insertion_tensor) % 4]
     return insertion_bases

From fdeca05e83e02b855399b3ad010cc41346a4e611 Mon Sep 17 00:00:00 2001
From: LUO Ruibang
Date: Tue, 11 Feb 2020 00:02:43 +0800
Subject: [PATCH 3/3] update README

---
 README.md | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/README.md b/README.md
index 1285d05..1e1cc12 100644
--- a/README.md
+++ b/README.md
@@ -8,6 +8,11 @@ Single-molecule sequencing technologies have emerged in recent years and revolut
 
 This is the formal release of Clair (Clair v2, Dec 2019). You can find the experimental Clair v1 (Jan 2019) at [https://github.com/aquaskyline/Clair](https://github.com/aquaskyline/Clair). The preprint of Clair v2 is available in [bioRxiv](https://www.biorxiv.org/content/10.1101/865782v2).
 
+## What are we working on right now?
+* Testing small techniques to resolve complex variants, e.g., a deletion that spans a closely following SNP.
+* An ONT model that supports coverage up to 600x, using the HG002 datasets from "The Human Pangenome Reference Consortium".
+* A full-alignment representation for higher performance in low-complexity genomic regions.
+
 ---
 
 ## Contents
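
Annotation on PATCH 2/3: the toy below reproduces the SNP-subtraction step on a made-up count array, to show why the fix stops an adjacent SNP's signal from being read back as inserted bases. The (8, 2) layout, channel indices, and `num2base` string are assumptions for this demo only; they stand in for Clair's real tensor layout and Channel enum.

    # A toy reconstruction of the fix: rows (A, C, G, T, a, c, g, t),
    # columns (insertion, SNP) -- an assumed layout, not Clair's actual one.
    import numpy as np

    num2base = "ACGT"
    CHANNEL_INSERTION, CHANNEL_SNP = 0, 1  # assumed channel order

    def inferred_insertion_base(counts):
        insertion_tensor = counts[:, CHANNEL_INSERTION].astype(float)
        for base_index in range(4):
            # Merge forward- and reverse-strand counts, as in the original loop,
            insertion_tensor[base_index] += insertion_tensor[base_index + 4]
            insertion_tensor[base_index + 4] = 0
            # then subtract the SNP signal so a nearby SNP cannot masquerade
            # as insertion evidence (the actual fix).
            insertion_tensor[base_index] -= (
                counts[base_index, CHANNEL_SNP] + counts[base_index + 4, CHANNEL_SNP]
            )
        return num2base[int(np.argmax(insertion_tensor)) % 4]

    counts = np.zeros((8, 2))
    counts[0, CHANNEL_INSERTION] = 9  # apparent "A" insertion evidence...
    counts[0, CHANNEL_SNP] = 8        # ...mostly leakage from an adjacent A SNP
    counts[2, CHANNEL_INSERTION] = 6  # genuine "G" insertion evidence
    print(inferred_insertion_base(counts))  # "G"; without the fix, "A"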