-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
* Reimplement and rename get_annotation function * loci_need_split_anno --> collapse_repeats_by_length * Fix style violations * split_string --> sequence_to_bracketed_form * New repeat module
- Loading branch information
Showing
6 changed files
with
161 additions
and
140 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,102 @@ | ||
#!/usr/bin/env python | ||
# | ||
# ----------------------------------------------------------------------------- | ||
# Copyright (c) 2020, Battelle National Biodefense Institute. | ||
# | ||
# This file is part of lusSTR (http://github.com/bioforensics/lusSTR) | ||
# and is licensed under the BSD license: see LICENSE.txt. | ||
# ----------------------------------------------------------------------------- | ||
|
||
import re | ||
|
||
|
||
def collapse_tandem_repeat(fullseq, repeat): | ||
'''Collapse tandem stretches of the specified repeat sequence in a larger sequence. | ||
>>> collapse_tandem_repeat('TAGATTATTATTTAGTAGATTTAGTAG', 'ATT') | ||
'TAG [ATT]3 TAGTAG ATT TAGTAG' | ||
>>> collapse_tandem_repeat('TAGATTATTATTTAGTAGATTTAGTAG', 'TAG') | ||
'TAG ATTATTATT [TAG]2 ATT [TAG]2' | ||
''' | ||
if repeat not in fullseq: | ||
return fullseq | ||
i = fullseq.find(repeat) | ||
prefix = fullseq[:i] | ||
suffix = fullseq[i:] | ||
count = 0 | ||
while suffix.startswith(repeat): | ||
count += 1 | ||
suffix = suffix[len(repeat):] | ||
if count == 1: | ||
formatted = f' {repeat} ' | ||
else: | ||
formatted = f' [{repeat}]{count} ' | ||
final = prefix + formatted + collapse_tandem_repeat(suffix, repeat) | ||
final = final.strip() | ||
final = re.sub(r' +', ' ', final) | ||
return final | ||
|
||
|
||
def collapse_all_repeats(sequence, repeats): | ||
'''Convert a sequence to bracketed form by collapsing stretches of tandem repeats. | ||
>>> collapse_all_repeats('TAGATTATTATTTAGTAGATTTAGTAG', ['ATT', 'TAG']) | ||
'TAG [ATT]3 [TAG]2 ATT [TAG]2' | ||
''' | ||
collapsed_seq = sequence | ||
for repeat in repeats: | ||
collapsed_seq = collapse_tandem_repeat(collapsed_seq, repeat) | ||
return collapsed_seq | ||
|
||
|
||
def split_by_n(sequence, n): | ||
'''Split a sequence into non-overlapping chunks of length n.''' | ||
while sequence: | ||
yield sequence[:n] | ||
sequence = sequence[n:] | ||
|
||
|
||
def get_blocks(sequence, n): | ||
'''Split a sequence into chunks of length n, and count adjacent repeated chunks.''' | ||
count = 0 | ||
prev = None | ||
for unit in split_by_n(sequence, n): | ||
if unit != prev: | ||
if prev is not None: | ||
yield prev, count | ||
prev = unit | ||
count = 0 | ||
count += 1 | ||
yield prev, count | ||
|
||
|
||
def collapse_repeats_by_length(sequence, n): | ||
'''Convert to bracketed annotation form by splitting the sequence into blocks of size n.''' | ||
units = list() | ||
for unit, count in get_blocks(sequence, n): | ||
if count == 1: | ||
units.append(unit) | ||
else: | ||
units.append(f'[{unit}]{count}') | ||
result = ' '.join(units) | ||
result = re.sub(r' +', ' ', result) | ||
return result | ||
|
||
|
||
def sequence_to_bracketed_form(sequence, n, repeats): | ||
'''Convert sequence to bracketed annotation. | ||
Uses a combination of repeat-based and length-based methods to convert a sequence containing | ||
tandem repeats into a concise bracketed representation. | ||
''' | ||
collapsed = collapse_all_repeats(sequence, repeats) | ||
blocks = list() | ||
for unit in collapsed.split(' '): | ||
if len(unit) > n and '[' not in unit: | ||
for x in split_by_n(unit, n): | ||
blocks.append(x) | ||
else: | ||
blocks.append(unit) | ||
result = ' '.join(blocks) | ||
result = re.sub(r' +', ' ', result) | ||
return result |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,53 @@ | ||
#!/usr/bin/env python | ||
# | ||
# ----------------------------------------------------------------------------- | ||
# Copyright (c) 2020, Battelle National Biodefense Institute. | ||
# | ||
# This file is part of lusSTR (http://github.com/bioforensics/lusSTR) | ||
# and is licensed under the BSD license: see LICENSE.txt. | ||
# ----------------------------------------------------------------------------- | ||
|
||
import lusSTR | ||
from lusSTR.repeat import collapse_tandem_repeat, collapse_all_repeats | ||
from lusSTR.repeat import split_by_n, get_blocks | ||
from lusSTR.repeat import collapse_repeats_by_length, sequence_to_bracketed_form | ||
import pytest | ||
|
||
|
||
@pytest.mark.parametrize('sequence, repeat_list, output', [ | ||
( | ||
'AGACAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGACAGACAGACAGACAGACAGACAGAT', | ||
['AGAT', 'AGAC'], 'AGAC [AGAT]11 [AGAC]6 AGAT' | ||
), | ||
( | ||
'TAGATAGATAGATAGATGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGG', | ||
['TCTA', 'CATA', 'TCTG', 'CACA', 'CCTA'], | ||
'TAGATAGATAGATAGATGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGG' | ||
) | ||
]) | ||
def test_collapse_all_repeats(sequence, repeat_list, output): | ||
final_output = collapse_all_repeats(sequence, repeat_list) | ||
assert final_output == output | ||
|
||
|
||
def test_split_by_n(): | ||
sequence = 'AGGTAGGTAGGTCGAACGAATTGG' | ||
blocks = list(split_by_n(sequence, n=4)) | ||
assert blocks == [ | ||
'AGGT', 'AGGT', 'AGGT', 'CGAA', 'CGAA', 'TTGG' | ||
] | ||
|
||
|
||
def test_sequence_to_bracketed_form(): | ||
sequence = ( | ||
'TCTATCTATCTATCTGTCTGTCTGTCTGTCTGTCTGTCTATCTATCTATATCTATCTATCTATCATCTATCTATCCATATCTATCTATC' | ||
'TATCTATCTATCTATCTATCTATCTATCTATCTA' | ||
) | ||
repeats = ['TCTA', 'TCTG'] | ||
final_output = sequence_to_bracketed_form(sequence, 6, repeats) | ||
assert final_output == '[TCTA]3 [TCTG]6 [TCTA]3 TA [TCTA]3 TCA [TCTA]2 TCCATA [TCTA]11' | ||
|
||
|
||
def test_collapse_repeats_by_length(): | ||
sequence = 'TCTATCTATCTATCTATCTATCTATCTATATATCTATCTATCTATCTA' | ||
assert collapse_repeats_by_length(sequence, 4) == '[TCTA]7 TATA [TCTA]4' |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters