Merge pull request #16 from HKU-BAL/develop
update README
aquaskyline authored Feb 10, 2020
2 parents a449c34 + fdeca05 commit 7ef7e72
Showing 3 changed files with 51 additions and 32 deletions.
5 changes: 5 additions & 0 deletions README.md
@@ -8,6 +8,11 @@ Single-molecule sequencing technologies have emerged in recent years and revolut

This is the formal release of Clair (Clair v2, Dec 2019). You can find the experimental Clair v1 (Jan 2019) at [https://github.com/aquaskyline/Clair](https://github.com/aquaskyline/Clair). The preprint of Clair v2 is available on [bioRxiv](https://www.biorxiv.org/content/10.1101/865782v2).

+## What are we working on right now?
+* Testing small techniques to resolve complex variants, e.g., a deletion closely followed by a SNP.
+* An ONT model that supports coverage up to 600x, using the HG002 datasets from the Human Pangenome Reference Consortium.
+* A full-alignment representation for higher performance in low-complexity genomic regions.
+
---

## Contents
8 changes: 8 additions & 0 deletions clair/call_var.py
@@ -405,6 +405,10 @@ def inferred_insertion_bases_from(tensor_input):
        for base_index in range(0, 4):
            insertion_tensor[base_index] = insertion_tensor[base_index] + insertion_tensor[base_index + 4]
            insertion_tensor[base_index + 4] = 0
+            insertion_tensor[base_index] -= (
+                tensor_input[position, base_index, Channel.SNP] + tensor_input[position, base_index + 4, Channel.SNP]
+            )
+
        if (
            position < (flanking_base_number + minimum_variant_length_that_need_infer) or
            sum(insertion_tensor) >= inferred_indel_length_minimum_allele_frequency * sum(reference_tensor)
@@ -437,6 +441,10 @@ def insertion_bases_using_tensor(tensor_input, variant_length):
        for base_index in range(0, 4):
            insertion_tensor[base_index] = insertion_tensor[base_index] + insertion_tensor[base_index + 4]
            insertion_tensor[base_index + 4] = 0
+            insertion_tensor[base_index] -= (
+                tensor_input[position, base_index, Channel.SNP] + tensor_input[position, base_index + 4, Channel.SNP]
+            )
+
        insertion_bases += num2base[np.argmax(insertion_tensor) % 4]
    return insertion_bases

70 changes: 38 additions & 32 deletions dataPrepScripts/ExtractVariantCandidates.py
@@ -319,59 +319,65 @@ def make_candidates(args):
        positions = [x for x in pileup.keys() if x < POS] if not is_finish_reading_output else list(pileup.keys())
        positions.sort()
        for zero_based_position in positions:
-            baseCount = depth = reference_base = temp_key = None
+            base_count = depth = reference_base = temp_key = None

            # ctg and bed checking (region [ctg_start, ctg_end] is 1-based, inclusive start and end positions)
            pass_ctg = not is_ctg_range_given or ctg_start <= zero_based_position+1 <= ctg_end
            pass_bed = not is_bed_file_given or is_region_in(tree, ctg_name, zero_based_position)
+            if not pass_bed or not pass_ctg:
+                continue

            # output probability checking
            pass_output_probability = True
-            if pass_ctg and pass_bed and is_building_training_dataset and is_variant_file_given:
+            if is_building_training_dataset and is_variant_file_given:
                temp_key = ctg_name + ":" + str(zero_based_position+1)
                pass_output_probability = (
                    temp_key not in variants_map and (
                        (temp_key in non_variants_map and random.uniform(0, 1) <= output_probability_near_variant) or
                        (temp_key not in non_variants_map and random.uniform(0, 1) <= output_probability_outside_variant)
                    )
                )
-            elif pass_ctg and pass_bed and is_building_training_dataset:
+            elif is_building_training_dataset:
                pass_output_probability = random.uniform(0, 1) <= output_probability
+            if not pass_output_probability:
+                continue

-            # depth checking
-            pass_depth = False
-            if pass_ctg and pass_bed and pass_output_probability:
-                baseCount = list(pileup[zero_based_position].items())
-                depth = sum(x[1] for x in baseCount)
-                pass_depth = depth >= minimum_depth_for_candidate
-
-            # af checking
-            pass_af = False
-            if pass_ctg and pass_bed and pass_output_probability and pass_depth:
+            # for depth checking and af checking
            try:
                reference_base = evc_base_from(reference_sequence[
                    zero_based_position - (0 if reference_start is None else (reference_start - 1))
                ])
-                denominator = depth if depth > 0 else 1
-                baseCount.sort(key=lambda x: -x[1])  # sort baseCount descendingly
-                p0, p1 = float(baseCount[0][1]) / denominator, float(baseCount[1][1]) / denominator
-                pass_af = (
-                    (p0 <= 1.0 - minimum_af_for_candidate and p1 >= minimum_af_for_candidate) or
-                    baseCount[0][0] != reference_base
-                )
+                position_dict = pileup[zero_based_position]
            except:
                continue

+            # depth checking
+            base_count = list(position_dict.items())
+            depth = sum(x[1] for x in base_count) - position_dict["I"] - position_dict["D"]
+            if depth < minimum_depth_for_candidate:
+                continue
+
+            # af checking
+            denominator = depth if depth > 0 else 1
+            base_count.sort(key=lambda x: -x[1])  # sort base_count descendingly
+            pass_af = (
+                base_count[0][0] != reference_base or
+                (float(base_count[1][1]) / denominator) >= minimum_af_for_candidate
+            )
+            if not pass_af:
+                continue

            # output 1-based candidate
-            need_output_candidate = pass_ctg and pass_bed and pass_output_probability and pass_depth and pass_af
-            if need_output_candidate:
-                if temp_key is not None and temp_key in non_variants_map:
-                    no_of_candidates_near_variant += 1
-                elif temp_key is not None and temp_key not in non_variants_map:
-                    no_of_candidates_outside_variant += 1
-
-                output = [ctg_name, zero_based_position+1, reference_base, depth]
-                output.extend(["%s %d" % x for x in baseCount])
-                output = " ".join([str(x) for x in output]) + "\n"
-
-                can_fp.stdin.write(output)
+            if temp_key is not None and temp_key in non_variants_map:
+                no_of_candidates_near_variant += 1
+            elif temp_key is not None and temp_key not in non_variants_map:
+                no_of_candidates_outside_variant += 1
+
+            output = [ctg_name, zero_based_position+1, reference_base, depth]
+            output.extend(["%s %d" % x for x in base_count])
+            output = " ".join([str(x) for x in output]) + "\n"
+
+            can_fp.stdin.write(output)

        for zero_based_position in positions:
            del pileup[zero_based_position]
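For the training-dataset branch above, the sampling decision reduces to: positions at known variants never pass, positions near a known variant are kept at one rate, and all other positions at a (typically much lower) rate. A minimal sketch of that decision follows; the function name and the default probabilities are assumptions for illustration, while the real values come from the script's options:

```python
import random

def keep_for_training(key, variants_map, non_variants_map,
                      p_near_variant=0.5, p_outside_variant=0.01):
    # sites listed in variants_map never pass this branch,
    # mirroring `temp_key not in variants_map` in the diff
    if key in variants_map:
        return False
    # keep near-variant positions at a higher rate than the rest of the genome
    p = p_near_variant if key in non_variants_map else p_outside_variant
    return random.uniform(0, 1) <= p

# a "ctg:position" key flagged as near a known variant is kept ~50% of the time here
print(keep_for_training("chr20:100005", {}, {"chr20:100005": True}))
```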
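The rewritten depth and allele-frequency checks are also simpler than the old p0/p1 test: read depth now excludes the insertion ("I") and deletion ("D") pileup columns, and a position qualifies when its most supported symbol disagrees with the reference or its second most supported symbol reaches the AF cutoff. A runnable sketch under assumed inputs; the plain-dict pileup and the default thresholds are illustrative, not the script's interface:

```python
def is_candidate(position_dict, reference_base,
                 minimum_depth_for_candidate=10,
                 minimum_af_for_candidate=0.125):
    """position_dict maps pileup symbols to counts, e.g. {"A": 16, "C": 4, "I": 0, "D": 0}.
    Illustrative stand-in for the checks above, with assumed default thresholds.
    """
    base_count = list(position_dict.items())
    # exclude the insertion ("I") and deletion ("D") columns from depth, as in the diff
    depth = (sum(count for _, count in base_count)
             - position_dict.get("I", 0) - position_dict.get("D", 0))
    if depth < minimum_depth_for_candidate:
        return False
    denominator = depth if depth > 0 else 1
    base_count.sort(key=lambda x: -x[1])  # most supported symbol first
    return (base_count[0][0] != reference_base or
            float(base_count[1][1]) / denominator >= minimum_af_for_candidate)

print(is_candidate({"A": 16, "C": 4, "I": 0, "D": 0}, "A"))  # True: alt AF 4/20 = 0.2
```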
