Skip to content

Commit

Permalink
Fix format and annotation issues (#27)
Browse files Browse the repository at this point in the history
* fixed format issue

* fixed D19 issue

* Fixed D21 error

* fixed D21 typo

* fixed minor error in FGA

* updated STRait Razor format test data

* fixed partial sequences error

* updated flanking report test file

* Fixed D21 LUS error

* added tests for new formatting rules

* fixed style errors

Co-authored-by: Rebecca Mitchell <[email protected]>
  • Loading branch information
rnmitchell and Rebecca Mitchell authored Jun 15, 2020
1 parent 27ef0f6 commit 9ac308e
Show file tree
Hide file tree
Showing 8 changed files with 59,228 additions and 59,154 deletions.
28 changes: 24 additions & 4 deletions lusSTR/annot.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,15 @@
from lusSTR.repeat import collapse_all_repeats, collapse_repeats_by_length
from lusSTR.repeat import sequence_to_bracketed_form, split_by_n
from lusSTR.repeat import reverse_complement, reverse_complement_bracketed
from pkg_resources import resource_filename


def get_str_metadata_file():
    '''Return the filesystem path of the packaged STR marker metadata.

    The path is resolved through pkg_resources for the 'str_markers.json'
    resource of the 'lusSTR' package.
    '''
    return resource_filename('lusSTR', 'str_markers.json')


# Load the marker metadata once at import time so the rest of the module can
# look up per-locus settings in `str_marker_data` by locus name.
with open(get_str_metadata_file(), 'r') as fh:
    str_marker_data = json.load(fh)


def split_sequence_into_two_strings(sequence, repeat_for_split):
Expand Down Expand Up @@ -50,15 +59,25 @@ def main(args):
except IndexError:
project = 'NA'
analysis = 'NA'

metadata = str_marker_data[locus]
if (
len(sequence) <= (metadata['Foren_5'] + metadata['Foren_3']) and not args.uas
and args.kit == 'forenseq'
):
flank_summary = [
sampleid, project, analysis, locus, reads, 'NA', sequence, 'NA', 'NA', 'NA',
'Partial sequence'
]
flanks_list.append(flank_summary)
continue
marker = lusSTR.marker.STRMarkerObject(locus, sequence, uas=args.uas, kit=args.kit)
summary = [sampleid, project, analysis, locus] + marker.summary + [reads]
list_of_lists.append(summary)

if not args.uas and args.kit == 'forenseq':
flank_summary = [
sampleid, project, analysis, locus, reads, marker.canonical, marker.sequence,
marker.flank_5p, marker.annotation, marker.flank_3p
marker.flank_5p, marker.annotation, marker.flank_3p, "NA"
]
flanks_list.append(flank_summary)

Expand All @@ -71,8 +90,9 @@ def main(args):
name = os.path.splitext(args.out)[0]
if not args.uas:
flanks_columns = [
'SampleID', 'Project', 'Analysis', 'Locus', 'Reads', 'Length_Allele', 'Full_Sequence',
'5_Flank_Anno', 'UAS_Region_Anno', '3_Flank_Anno'
'SampleID', 'Project', 'Analysis', 'Locus', 'Reads', 'Length_Allele',
'Full_Sequence', '5_Flank_Anno', 'UAS_Region_Anno', '3_Flank_Anno',
'Potential_Issues'
]
final_flank_output = pd.DataFrame(flanks_list, columns=flanks_columns)
final_flank_output.to_csv(f'{name}_flanks_anno.txt', sep='\t', index=False)
Expand Down
2 changes: 1 addition & 1 deletion lusSTR/format.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,7 @@ def main(args):
path = args.input
analysisID = path.rstrip(os.sep)
analysisID_final = os.path.basename(analysisID)
results_final['Project'] = "NA"
results_final['Project'] = analysisID_final
results_final['Analysis'] = analysisID_final

output_file = sys.stdout
Expand Down
46 changes: 29 additions & 17 deletions lusSTR/marker.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,18 +9,13 @@

import json
import lusSTR
from lusSTR.annot import split_sequence_into_two_strings
from lusSTR.annot import get_str_metadata_file, split_sequence_into_two_strings
from lusSTR.repeat import collapse_repeats_by_length, sequence_to_bracketed_form
from lusSTR.repeat import reverse_complement, reverse_complement_bracketed
from lusSTR.repeat import repeat_copy_number, collapse_all_repeats, split_by_n
from pkg_resources import resource_filename
import re


def get_str_metadata_file():
return resource_filename('lusSTR', 'str_markers.json')


# Load the marker metadata once at import time; `str_marker_data` is the
# parsed content of the JSON file returned by get_str_metadata_file().
with open(get_str_metadata_file(), 'r') as fh:
    str_marker_data = json.load(fh)

Expand Down Expand Up @@ -146,6 +141,15 @@ def canonical(self):
canon_allele = f'{allele_int}.{allele_dec}'
return canon_allele

@property
def indel_flag(self):
    '''Check for potential indels within flanking regions.

    Returns 'Possible indel or partial sequence' when the string form of
    `self.canonical` is not listed in `self.data['Alleles']`; otherwise
    returns a single space.
    '''
    if str(self.canonical) not in self.data['Alleles']:
        return 'Possible indel or partial sequence'
    # The "no flag" value is a single space, not an empty string; callers
    # receive exactly this value, so it is kept as-is.
    return ' '

@property
def cannot_split(self):
return self.locus in [
Expand Down Expand Up @@ -454,7 +458,7 @@ def annotation(self):
annotation which is consistent with previously published annotation for this locus.
'''
sequence = self.forward_sequence
if len(sequence) % self.repeat_size == 0:
if len(sequence) % self.repeat_size == 0 or (not ('GGAA') in sequence):
return collapse_repeats_by_length(sequence, self.repeat_size)
else:
final = list()
Expand Down Expand Up @@ -565,7 +569,9 @@ def annotation(self):
prev = m.end()
if (
prev == (len(forward_strand_brack_form) - 1) or
prev == (len(forward_strand_brack_form) - 2)
prev == (len(forward_strand_brack_form) - 2) or
prev == (len(forward_strand_brack_form) - 4) or
prev == (len(forward_strand_brack_form) - 5)
):
return forward_strand_brack_form
else:
Expand Down Expand Up @@ -619,8 +625,12 @@ def designation(self):
repeats = repeat_copy_number(i, repeat)
lus_sec.append(repeats)
if lus_allele is None:
lus_allele = lus_sec[1]
sec_allele = lus_sec[0]
if len(lus_sec) == 2:
lus_allele = lus_sec[1]
sec_allele = lus_sec[0]
else:
lus_allele = 0
sec_allele = lus_sec[0]

finalcount = 0
for m in re.finditer(self.data['Tert'], self.annotation):
Expand Down Expand Up @@ -690,13 +700,15 @@ def annotation(self):
final.append(sequence_to_bracketed_form(first_string, 4, self.repeats))
else:
final.append(collapse_repeats_by_length(first_string, 4))
if (len(second_string) % 4 != 0):
third_string = second_string[:-6]
final.append(collapse_repeats_by_length(third_string, 4))
final.append(second_string[-6:-4])
final.append(second_string[-4:])
else:
final.append(collapse_repeats_by_length(second_string, 4))
if (second_string != ""):
if (len(second_string) % 4 != 0):
if (len(second_string) > 6):
third_string = second_string[:-6]
final.append(collapse_repeats_by_length(third_string, 4))
final.append(second_string[-6:-4])
final.append(second_string[-4:])
else:
final.append(collapse_repeats_by_length(second_string, 4))
final_string = ' '.join(final)
return re.sub(r' +', ' ', final_string)

Expand Down
Loading

0 comments on commit 9ac308e

Please sign in to comment.