Skip to content

Commit

Permalink
New "repeat" module (#23)
Browse files Browse the repository at this point in the history
* Reimplement and rename get_annotation function

* loci_need_split_anno --> collapse_repeats_by_length

* Fix style violations

* split_string --> sequence_to_bracketed_form

* New repeat module
  • Loading branch information
standage authored May 27, 2020
1 parent 763b688 commit 0091ed3
Show file tree
Hide file tree
Showing 6 changed files with 161 additions and 140 deletions.
4 changes: 2 additions & 2 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -6,11 +6,11 @@ help: Makefile

## test: run the automated test suite and print coverage information
test:
pytest --cov=lusSTR --doctest-modules lusSTR/annot.py lusSTR/tests/test_suite.py
pytest --cov=lusSTR --doctest-modules lusSTR/annot.py lusSTR/tests/test_*.py

## style: check code style against PEP8
style:
pycodestyle --max-line-length=99 lusSTR/*.py lusSTR/tests/test_suite.py
pycodestyle --max-line-length=99 lusSTR/*.py lusSTR/tests/test_*.py

## devenv: configure a development environment
devenv:
Expand Down
1 change: 1 addition & 0 deletions lusSTR/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
# -----------------------------------------------------------------------------

from lusSTR import annot
from lusSTR import repeat
from lusSTR.annot import str_dict
from lusSTR import format
from lusSTR import cli
Expand Down
102 changes: 3 additions & 99 deletions lusSTR/annot.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,10 @@
from pkg_resources import resource_filename
import re
import sys

import lusSTR
from lusSTR.repeat import collapse_all_repeats, collapse_repeats_by_length
from lusSTR.repeat import sequence_to_bracketed_form, split_by_n


def get_str_metadata_file():
Expand All @@ -25,73 +28,6 @@ def get_str_metadata_file():
str_dict = json.load(fh)


def collapse_tandem_repeat(fullseq, repeat):
    '''Collapse tandem stretches of the specified repeat sequence in a larger sequence.

    Each maximal run of back-to-back copies of `repeat` is replaced by a space-delimited token:
    the bare repeat when there is a single copy, or `[repeat]count` for two or more copies. Runs
    are located left to right and do not overlap. The annotated string is stripped and runs of
    spaces are squeezed to a single space. When `repeat` is empty or does not occur in `fullseq`,
    `fullseq` is returned unchanged.

    >>> collapse_tandem_repeat('TAGATTATTATTTAGTAGATTTAGTAG', 'ATT')
    'TAG [ATT]3 TAGTAG ATT TAGTAG'
    >>> collapse_tandem_repeat('TAGATTATTATTTAGTAGATTTAGTAG', 'TAG')
    'TAG ATTATTATT [TAG]2 ATT [TAG]2'
    >>> collapse_tandem_repeat('TAGATT', '')
    'TAGATT'
    '''
    # An empty repeat "matches" everywhere without consuming anything, so there is nothing
    # meaningful to collapse; bail out instead of looping forever.
    if not repeat or repeat not in fullseq:
        return fullseq

    def _format_run(match):
        # The match is a whole number of copies of `repeat`, so the division is exact.
        count = len(match.group(0)) // len(repeat)
        if count == 1:
            return f' {repeat} '
        return f' [{repeat}]{count} '

    # A single left-to-right pass replaces every run; no recursion, so the number of runs in the
    # sequence is not limited by the interpreter's recursion depth.
    collapsed = re.sub(f'(?:{re.escape(repeat)})+', _format_run, fullseq)
    return re.sub(r' +', ' ', collapsed.strip())


def collapse_all_repeats(sequence, repeats):
    '''Collapse tandem stretches of each listed repeat, one repeat at a time.

    The repeats are applied in the order given. Each pass operates on the output of the previous
    pass, so annotations produced for earlier repeats are part of the text searched for later
    repeats.

    >>> collapse_all_repeats('TAGATTATTATTTAGTAGATTTAGTAG', ['ATT', 'TAG'])
    'TAG [ATT]3 [TAG]2 ATT [TAG]2'
    '''
    annotated = sequence
    for motif in repeats:
        annotated = collapse_tandem_repeat(annotated, motif)
    return annotated


def split_by_n(sequence, n):
    '''Divide a sequence into consecutive, non-overlapping chunks of length n.

    Chunks are yielded in order. The last chunk is shorter than n when the sequence length is not
    a multiple of n. An empty sequence yields nothing.

    Raises ValueError (when iteration starts) if n is less than 1, because such a chunk size can
    never consume the sequence.
    '''
    if n < 1:
        raise ValueError(f'chunk size must be a positive integer, got {n!r}')
    # Slice by index rather than repeatedly re-slicing the remainder, which copies the tail of
    # the sequence on every step.
    for start in range(0, len(sequence), n):
        yield sequence[start:start + n]


def sequence_to_bracketed_form(sequence, n, repeats):
    '''Convert sequence to bracketed annotation.

    Tandem runs of the given repeats are collapsed first. Any remaining space-delimited token
    that carries no bracket and is longer than n is then cut into chunks of at most n characters.
    The tokens are joined with single spaces.
    '''
    pieces = []
    for token in collapse_all_repeats(sequence, repeats).split(' '):
        if len(token) <= n or '[' in token:
            # Short leftovers and already-bracketed repeats are kept whole.
            pieces.append(token)
        else:
            pieces.extend(split_by_n(token, n))
    return re.sub(r' +', ' ', ' '.join(pieces))


def rev_complement_anno(sequence):
'''
Function creates reverse complement of sequence
Expand Down Expand Up @@ -173,38 +109,6 @@ def rev_comp_uas_output_bracket(forward_bracket, n):
return re.sub(' ', ' ', reverse_strand_bracketed_form)


def get_blocks(sequence, n):
    '''Split a sequence into blocks of size n and count adjacent identical blocks.

    This generator is used by collapse_repeats_by_length(). The sequence is cut into blocks of
    n bases (the original documentation notes n is specified in the str_markers.json file) and
    each run of consecutive identical blocks is reported once as a (block, count) tuple, in
    order. An empty sequence yields nothing.
    '''
    prev = None
    count = 0
    for unit in split_by_n(sequence, n):
        if unit == prev:
            count += 1
            continue
        if prev is not None:
            yield prev, count
        prev = unit
        count = 1
    # Flush the final run. Skipped when no block was produced at all, so that an empty sequence
    # does not emit a bogus (None, 0) entry.
    if prev is not None:
        yield prev, count


def collapse_repeats_by_length(sequence, n):
    '''Build a bracketed annotation from fixed-size blocks of the sequence.

    The sequence is divided into blocks of size n by get_blocks(). A block that occurs once in a
    row is written as-is; a block repeated consecutively is written as `[block]count`. The
    pieces are joined with single spaces.
    '''
    labels = [
        unit if count == 1 else f'[{unit}]{count}'
        for unit, count in get_blocks(sequence, n)
    ]
    return re.sub(r' +', ' ', ' '.join(labels))


def traditional_str_allele(sequence, n, n_sub_out):
'''
Function used to calculate the traditional STR allele designation
Expand Down
102 changes: 102 additions & 0 deletions lusSTR/repeat.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
#!/usr/bin/env python
#
# -----------------------------------------------------------------------------
# Copyright (c) 2020, Battelle National Biodefense Institute.
#
# This file is part of lusSTR (http://github.com/bioforensics/lusSTR)
# and is licensed under the BSD license: see LICENSE.txt.
# -----------------------------------------------------------------------------

import re


def collapse_tandem_repeat(fullseq, repeat):
    '''Collapse tandem stretches of the specified repeat sequence in a larger sequence.

    Each maximal run of back-to-back copies of `repeat` is replaced by a space-delimited token:
    the bare repeat when there is a single copy, or `[repeat]count` for two or more copies. Runs
    are located left to right and do not overlap. The annotated string is stripped and runs of
    spaces are squeezed to a single space. When `repeat` is empty or does not occur in `fullseq`,
    `fullseq` is returned unchanged.

    >>> collapse_tandem_repeat('TAGATTATTATTTAGTAGATTTAGTAG', 'ATT')
    'TAG [ATT]3 TAGTAG ATT TAGTAG'
    >>> collapse_tandem_repeat('TAGATTATTATTTAGTAGATTTAGTAG', 'TAG')
    'TAG ATTATTATT [TAG]2 ATT [TAG]2'
    >>> collapse_tandem_repeat('TAGATT', '')
    'TAGATT'
    '''
    # An empty repeat "matches" everywhere without consuming anything, so there is nothing
    # meaningful to collapse; bail out instead of looping forever.
    if not repeat or repeat not in fullseq:
        return fullseq

    def _format_run(match):
        # The match is a whole number of copies of `repeat`, so the division is exact.
        count = len(match.group(0)) // len(repeat)
        if count == 1:
            return f' {repeat} '
        return f' [{repeat}]{count} '

    # A single left-to-right pass replaces every run; no recursion, so the number of runs in the
    # sequence is not limited by the interpreter's recursion depth.
    collapsed = re.sub(f'(?:{re.escape(repeat)})+', _format_run, fullseq)
    return re.sub(r' +', ' ', collapsed.strip())


def collapse_all_repeats(sequence, repeats):
    '''Collapse tandem stretches of each listed repeat, one repeat at a time.

    The repeats are applied in the order given. Each pass operates on the output of the previous
    pass, so annotations produced for earlier repeats are part of the text searched for later
    repeats.

    >>> collapse_all_repeats('TAGATTATTATTTAGTAGATTTAGTAG', ['ATT', 'TAG'])
    'TAG [ATT]3 [TAG]2 ATT [TAG]2'
    '''
    annotated = sequence
    for motif in repeats:
        annotated = collapse_tandem_repeat(annotated, motif)
    return annotated


def split_by_n(sequence, n):
    '''Split a sequence into non-overlapping chunks of length n.

    Chunks are yielded in order. The last chunk is shorter than n when the sequence length is not
    a multiple of n. An empty sequence yields nothing.

    Raises ValueError (when iteration starts) if n is less than 1, because such a chunk size can
    never consume the sequence.
    '''
    if n < 1:
        raise ValueError(f'chunk size must be a positive integer, got {n!r}')
    # Slice by index rather than repeatedly re-slicing the remainder, which copies the tail of
    # the sequence on every step.
    for start in range(0, len(sequence), n):
        yield sequence[start:start + n]


def get_blocks(sequence, n):
    '''Split a sequence into chunks of length n, and count adjacent repeated chunks.

    Yields (chunk, count) tuples in order, where count is the number of consecutive identical
    chunks in that run. An empty sequence yields nothing.
    '''
    prev = None
    count = 0
    for unit in split_by_n(sequence, n):
        if unit == prev:
            count += 1
            continue
        if prev is not None:
            yield prev, count
        prev = unit
        count = 1
    # Flush the final run. Skipped when no chunk was produced at all, so that an empty sequence
    # does not emit a bogus (None, 0) entry.
    if prev is not None:
        yield prev, count


def collapse_repeats_by_length(sequence, n):
    '''Build a bracketed annotation from fixed-size blocks of the sequence.

    The sequence is divided into blocks of size n by get_blocks(). A block that occurs once in a
    row is written as-is; a block repeated consecutively is written as `[block]count`. The
    pieces are joined with single spaces.
    '''
    labels = [
        unit if count == 1 else f'[{unit}]{count}'
        for unit, count in get_blocks(sequence, n)
    ]
    return re.sub(r' +', ' ', ' '.join(labels))


def sequence_to_bracketed_form(sequence, n, repeats):
    '''Convert sequence to bracketed annotation.

    Tandem runs of the given repeats are collapsed first. Any remaining space-delimited token
    that carries no bracket and is longer than n is then cut into chunks of at most n characters.
    The tokens are joined with single spaces.
    '''
    pieces = []
    for token in collapse_all_repeats(sequence, repeats).split(' '):
        if len(token) <= n or '[' in token:
            # Short leftovers and already-bracketed repeats are kept whole.
            pieces.append(token)
        else:
            pieces.extend(split_by_n(token, n))
    return re.sub(r' +', ' ', ' '.join(pieces))
53 changes: 53 additions & 0 deletions lusSTR/tests/test_repeat.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
#!/usr/bin/env python
#
# -----------------------------------------------------------------------------
# Copyright (c) 2020, Battelle National Biodefense Institute.
#
# This file is part of lusSTR (http://github.com/bioforensics/lusSTR)
# and is licensed under the BSD license: see LICENSE.txt.
# -----------------------------------------------------------------------------

import lusSTR
from lusSTR.repeat import collapse_tandem_repeat, collapse_all_repeats
from lusSTR.repeat import split_by_n, get_blocks
from lusSTR.repeat import collapse_repeats_by_length, sequence_to_bracketed_form
import pytest


@pytest.mark.parametrize('sequence, repeat_list, output', [
    (
        'AGACAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGACAGACAGACAGACAGACAGACAGAT',
        ['AGAT', 'AGAC'],
        'AGAC [AGAT]11 [AGAC]6 AGAT',
    ),
    (
        'TAGATAGATAGATAGATGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGG',
        ['TCTA', 'CATA', 'TCTG', 'CACA', 'CCTA'],
        'TAGATAGATAGATAGATGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGG',
    ),
])
def test_collapse_all_repeats(sequence, repeat_list, output):
    '''Check collapse_all_repeats() on one sequence with matching repeats and one with none.

    In the second case the expected output is the input sequence unchanged.
    '''
    assert collapse_all_repeats(sequence, repeat_list) == output


def test_split_by_n():
    '''Check that split_by_n() cuts a 24-character sequence into six chunks of four.'''
    expected = ['AGGT', 'AGGT', 'AGGT', 'CGAA', 'CGAA', 'TTGG']
    observed = list(split_by_n('AGGTAGGTAGGTCGAACGAATTGG', n=4))
    assert observed == expected


def test_sequence_to_bracketed_form():
    '''Check sequence_to_bracketed_form() with two repeats and a length cutoff of 6.'''
    sequence = (
        'TCTATCTATCTATCTGTCTGTCTGTCTGTCTGTCTGTCTATCTATCTATATCTATCTATCTATCATCTATCTATCCATATCTATCTATC'
        'TATCTATCTATCTATCTATCTATCTATCTATCTA'
    )
    expected = '[TCTA]3 [TCTG]6 [TCTA]3 TA [TCTA]3 TCA [TCTA]2 TCCATA [TCTA]11'
    observed = sequence_to_bracketed_form(sequence, 6, ['TCTA', 'TCTG'])
    assert observed == expected


def test_collapse_repeats_by_length():
    '''Check collapse_repeats_by_length() with a block size of 4.'''
    expected = '[TCTA]7 TATA [TCTA]4'
    observed = collapse_repeats_by_length('TCTATCTATCTATCTATCTATCTATCTATATATCTATCTATCTATCTA', 4)
    assert observed == expected
39 changes: 0 additions & 39 deletions lusSTR/tests/test_suite.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,40 +27,6 @@ def test_format():
assert filecmp.cmp(formatoutput, outfile.name) is True


@pytest.mark.parametrize('sequence, repeat_list, output', [
    (
        'AGACAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGACAGACAGACAGACAGACAGACAGAT',
        ['AGAT', 'AGAC'],
        'AGAC [AGAT]11 [AGAC]6 AGAT',
    ),
    (
        'TAGATAGATAGATAGATGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGG',
        ['TCTA', 'CATA', 'TCTG', 'CACA', 'CCTA'],
        'TAGATAGATAGATAGATGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGG',
    ),
])
def test_collapse_all_repeats(sequence, repeat_list, output):
    '''Check lusSTR.annot.collapse_all_repeats() with matching repeats and with none.

    In the second case the expected output is the input sequence unchanged.
    '''
    assert lusSTR.annot.collapse_all_repeats(sequence, repeat_list) == output


def test_split_by_n():
    '''Check that lusSTR.annot.split_by_n() cuts a 24-character sequence into six chunks.'''
    expected = ['AGGT', 'AGGT', 'AGGT', 'CGAA', 'CGAA', 'TTGG']
    observed = list(lusSTR.annot.split_by_n('AGGTAGGTAGGTCGAACGAATTGG', n=4))
    assert observed == expected


def test_sequence_to_bracketed_form():
    '''Check lusSTR.annot.sequence_to_bracketed_form() with two repeats and a cutoff of 6.'''
    sequence = (
        'TCTATCTATCTATCTGTCTGTCTGTCTGTCTGTCTGTCTATCTATCTATATCTATCTATCTATCATCTATCTATCCATATCTATCTATC'
        'TATCTATCTATCTATCTATCTATCTATCTATCTA'
    )
    expected = '[TCTA]3 [TCTG]6 [TCTA]3 TA [TCTA]3 TCA [TCTA]2 TCCATA [TCTA]11'
    observed = lusSTR.annot.sequence_to_bracketed_form(sequence, 6, ['TCTA', 'TCTG'])
    assert observed == expected


def test_extract():
s = '[ATCT]3 ATGT [ATCT]12'
repeat = 'ATCT'
Expand Down Expand Up @@ -92,11 +58,6 @@ def test_rev_comp_uas_output_bracket():
assert rev_comp_bracket == 'CCAA [TTCG]2 [ACCT]3'


def test_collapse_repeats_by_length():
    '''Check lusSTR.annot.collapse_repeats_by_length() with a block size of 4.'''
    sequence = 'TCTATCTATCTATCTATCTATCTATCTATATATCTATCTATCTATCTA'
    expected = '[TCTA]7 TATA [TCTA]4'
    observed = lusSTR.annot.collapse_repeats_by_length(sequence, 4)
    assert observed == expected


@pytest.mark.parametrize('sequence, bracket_form', [
(
'TCTATCTATCTATCTATCTATCTGTCTGTCTGTCTGTCTGTCTATCTATCTATATCTATCTATCTATCATCTATCTATCCATATCTATC'
Expand Down

0 comments on commit 0091ed3

Please sign in to comment.