Merge pull request #16 from HKU-BAL/develop
update README
aquaskyline authored Feb 10, 2020
2 parents a449c34 + fdeca05 commit 7ef7e72
Showing 3 changed files with 51 additions and 32 deletions.
5 changes: 5 additions & 0 deletions README.md
@@ -8,6 +8,11 @@ Single-molecule sequencing technologies have emerged in recent years and revolut

This is the formal release of Clair (Clair v2, Dec 2019). You can find the experimental Clair v1 (Jan 2019) at [https://github.com/aquaskyline/Clair](https://github.com/aquaskyline/Clair). The preprint of Clair v2 is available on [bioRxiv](https://www.biorxiv.org/content/10.1101/865782v2).

+## What are we working on right now?
+* Testing small techniques to resolve complex variants, e.g., a deletion closely followed by a SNP.
+* An ONT model that supports coverage up to 600x, using the HG002 datasets from the Human Pangenome Reference Consortium.
+* A full-alignment representation for higher performance in low-complexity genomic regions.
+
---

## Contents
8 changes: 8 additions & 0 deletions clair/call_var.py
@@ -405,6 +405,10 @@ def inferred_insertion_bases_from(tensor_input):
        for base_index in range(0, 4):
            insertion_tensor[base_index] = insertion_tensor[base_index] + insertion_tensor[base_index + 4]
            insertion_tensor[base_index + 4] = 0
+            insertion_tensor[base_index] -= (
+                tensor_input[position, base_index, Channel.SNP] + tensor_input[position, base_index + 4, Channel.SNP]
+            )
+
        if (
            position < (flanking_base_number + minimum_variant_length_that_need_infer) or
            sum(insertion_tensor) >= inferred_indel_length_minimum_allele_frequency * sum(reference_tensor)
@@ -437,6 +441,10 @@ def insertion_bases_using_tensor(tensor_input, variant_length):
        for base_index in range(0, 4):
            insertion_tensor[base_index] = insertion_tensor[base_index] + insertion_tensor[base_index + 4]
            insertion_tensor[base_index + 4] = 0
+            insertion_tensor[base_index] -= (
+                tensor_input[position, base_index, Channel.SNP] + tensor_input[position, base_index + 4, Channel.SNP]
+            )
+
        insertion_bases += num2base[np.argmax(insertion_tensor) % 4]
    return insertion_bases

70 changes: 38 additions & 32 deletions dataPrepScripts/ExtractVariantCandidates.py
@@ -319,59 +319,65 @@ def make_candidates(args):
        positions = [x for x in pileup.keys() if x < POS] if not is_finish_reading_output else list(pileup.keys())
        positions.sort()
        for zero_based_position in positions:
-            baseCount = depth = reference_base = temp_key = None
+            base_count = depth = reference_base = temp_key = None

            # ctg and bed checking (region [ctg_start, ctg_end] is 1-based, inclusive start and end positions)
            pass_ctg = not is_ctg_range_given or ctg_start <= zero_based_position+1 <= ctg_end
            pass_bed = not is_bed_file_given or is_region_in(tree, ctg_name, zero_based_position)
+            if not pass_bed or not pass_ctg:
+                continue

            # output probability checking
            pass_output_probability = True
-            if pass_ctg and pass_bed and is_building_training_dataset and is_variant_file_given:
+            if is_building_training_dataset and is_variant_file_given:
                temp_key = ctg_name + ":" + str(zero_based_position+1)
                pass_output_probability = (
                    temp_key not in variants_map and (
                        (temp_key in non_variants_map and random.uniform(0, 1) <= output_probability_near_variant) or
                        (temp_key not in non_variants_map and random.uniform(0, 1) <= output_probability_outside_variant)
                    )
                )
-            elif pass_ctg and pass_bed and is_building_training_dataset:
+            elif is_building_training_dataset:
                pass_output_probability = random.uniform(0, 1) <= output_probability
+            if not pass_output_probability:
+                continue

-            # depth checking
-            pass_depth = False
-            if pass_ctg and pass_bed and pass_output_probability:
-                baseCount = list(pileup[zero_based_position].items())
-                depth = sum(x[1] for x in baseCount)
-                pass_depth = depth >= minimum_depth_for_candidate
-
-            # af checking
-            pass_af = False
-            if pass_ctg and pass_bed and pass_output_probability and pass_depth:
+            # for depth checking and af checking
            try:
                reference_base = evc_base_from(reference_sequence[
                    zero_based_position - (0 if reference_start is None else (reference_start - 1))
                ])
-                denominator = depth if depth > 0 else 1
-                baseCount.sort(key=lambda x: -x[1])  # sort baseCount descendingly
-                p0, p1 = float(baseCount[0][1]) / denominator, float(baseCount[1][1]) / denominator
-                pass_af = (
-                    (p0 <= 1.0 - minimum_af_for_candidate and p1 >= minimum_af_for_candidate) or
-                    baseCount[0][0] != reference_base
-                )
+                position_dict = pileup[zero_based_position]
            except:
                continue

+            # depth checking
+            base_count = list(position_dict.items())
+            depth = sum(x[1] for x in base_count) - position_dict["I"] - position_dict["D"]
+            if depth < minimum_depth_for_candidate:
+                continue
+
+            # af checking
+            denominator = depth if depth > 0 else 1
+            base_count.sort(key=lambda x: -x[1])  # sort base_count descendingly
+            pass_af = (
+                base_count[0][0] != reference_base or
+                (float(base_count[1][1]) / denominator) >= minimum_af_for_candidate
+            )
+            if not pass_af:
+                continue

            # output 1-based candidate
-            need_output_candidate = pass_ctg and pass_bed and pass_output_probability and pass_depth and pass_af
-            if need_output_candidate:
-                if temp_key is not None and temp_key in non_variants_map:
-                    no_of_candidates_near_variant += 1
-                elif temp_key is not None and temp_key not in non_variants_map:
-                    no_of_candidates_outside_variant += 1
-
-                output = [ctg_name, zero_based_position+1, reference_base, depth]
-                output.extend(["%s %d" % x for x in baseCount])
-                output = " ".join([str(x) for x in output]) + "\n"
-
-                can_fp.stdin.write(output)
+            if temp_key is not None and temp_key in non_variants_map:
+                no_of_candidates_near_variant += 1
+            elif temp_key is not None and temp_key not in non_variants_map:
+                no_of_candidates_outside_variant += 1
+
+            output = [ctg_name, zero_based_position+1, reference_base, depth]
+            output.extend(["%s %d" % x for x in base_count])
+            output = " ".join([str(x) for x in output]) + "\n"
+
+            can_fp.stdin.write(output)

        for zero_based_position in positions:
            del pileup[zero_based_position]
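For the training-dataset branch above, the sampling decision reduces to: positions at known variants never pass, positions near a known variant are kept at one rate, and all other positions at a (typically much lower) rate. A minimal sketch of that decision follows; the function name and the default probabilities are assumptions for illustration, while the real values come from the script's options:

```python
import random

def keep_for_training(key, variants_map, non_variants_map,
                      p_near_variant=0.5, p_outside_variant=0.01):
    # sites listed in variants_map never pass this branch,
    # mirroring `temp_key not in variants_map` in the diff
    if key in variants_map:
        return False
    # keep near-variant positions at a higher rate than the rest of the genome
    p = p_near_variant if key in non_variants_map else p_outside_variant
    return random.uniform(0, 1) <= p

# a "ctg:position" key flagged as near a known variant is kept ~50% of the time here
print(keep_for_training("chr20:100005", {}, {"chr20:100005": True}))
```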
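The rewritten depth and allele-frequency checks are also simpler than the old p0/p1 test: read depth now excludes the insertion ("I") and deletion ("D") pileup columns, and a position qualifies when its most supported symbol disagrees with the reference or its second most supported symbol reaches the AF cutoff. A runnable sketch under assumed inputs; the plain-dict pileup and the default thresholds are illustrative, not the script's interface:

```python
def is_candidate(position_dict, reference_base,
                 minimum_depth_for_candidate=10,
                 minimum_af_for_candidate=0.125):
    """position_dict maps pileup symbols to counts, e.g. {"A": 16, "C": 4, "I": 0, "D": 0}.
    Illustrative stand-in for the checks above, with assumed default thresholds.
    """
    base_count = list(position_dict.items())
    # exclude the insertion ("I") and deletion ("D") columns from depth, as in the diff
    depth = (sum(count for _, count in base_count)
             - position_dict.get("I", 0) - position_dict.get("D", 0))
    if depth < minimum_depth_for_candidate:
        return False
    denominator = depth if depth > 0 else 1
    base_count.sort(key=lambda x: -x[1])  # most supported symbol first
    return (base_count[0][0] != reference_base or
            float(base_count[1][1]) / denominator >= minimum_af_for_candidate)

print(is_candidate({"A": 16, "C": 4, "I": 0, "D": 0}, "A"))  # True: alt AF 4/20 = 0.2
```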
