Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

New "repeat" module #23

Merged
merged 6 commits into from
May 27, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -6,11 +6,11 @@ help: Makefile

## test: run the automated test suite and print coverage information
test:
pytest --cov=lusSTR --doctest-modules lusSTR/annot.py lusSTR/tests/test_suite.py
pytest --cov=lusSTR --doctest-modules lusSTR/annot.py lusSTR/tests/test_*.py

## style: check code style against PEP8
style:
pycodestyle --max-line-length=99 lusSTR/*.py lusSTR/tests/test_suite.py
pycodestyle --max-line-length=99 lusSTR/*.py lusSTR/tests/test_*.py

## devenv: configure a development environment
devenv:
Expand Down
1 change: 1 addition & 0 deletions lusSTR/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
# -----------------------------------------------------------------------------

from lusSTR import annot
from lusSTR import repeat
from lusSTR.annot import str_dict
from lusSTR import format
from lusSTR import cli
Expand Down
102 changes: 3 additions & 99 deletions lusSTR/annot.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,10 @@
from pkg_resources import resource_filename
import re
import sys

import lusSTR
from lusSTR.repeat import collapse_all_repeats, collapse_repeats_by_length
from lusSTR.repeat import sequence_to_bracketed_form, split_by_n


def get_str_metadata_file():
Expand All @@ -25,73 +28,6 @@ def get_str_metadata_file():
str_dict = json.load(fh)


def collapse_tandem_repeat(fullseq, repeat):
    '''Collapse tandem stretches of the specified repeat sequence in a larger sequence.

    The first run of back-to-back copies of `repeat` is rewritten as `[repeat]N` (or as the bare
    repeat when it occurs only once), the rest of the sequence is processed recursively, and the
    pieces are joined with single spaces. A sequence that does not contain `repeat` is returned
    unchanged, as is any sequence when `repeat` is the empty string.

    >>> collapse_tandem_repeat('TAGATTATTATTTAGTAGATTTAGTAG', 'ATT')
    'TAG [ATT]3 TAGTAG ATT TAGTAG'
    >>> collapse_tandem_repeat('TAGATTATTATTTAGTAGATTTAGTAG', 'TAG')
    'TAG ATTATTATT [TAG]2 ATT [TAG]2'
    >>> collapse_tandem_repeat('TAGTAG', '')
    'TAGTAG'
    '''
    # An empty repeat matches at every position without consuming any sequence, which would make
    # the counting loop below spin forever; treat it like a repeat that is absent.
    if repeat == '' or repeat not in fullseq:
        return fullseq
    start = fullseq.find(repeat)
    end = start
    count = 0
    # Walk forward one repeat unit at a time for as long as the copies stay back-to-back.
    while fullseq.startswith(repeat, end):
        count += 1
        end += len(repeat)
    unit = repeat if count == 1 else f'[{repeat}]{count}'
    remainder = collapse_tandem_repeat(fullseq[end:], repeat)
    final = f'{fullseq[:start]} {unit} {remainder}'.strip()
    # Flanking pieces may be empty, leaving doubled spaces around the unit; squeeze them.
    return re.sub(r' +', ' ', final)


def collapse_all_repeats(sequence, repeats):
    '''Collapse tandem runs of every listed repeat unit, one repeat at a time.

    The repeats are applied in the order given; each pass operates on the output of the
    previous pass.

    >>> collapse_all_repeats('TAGATTATTATTTAGTAGATTTAGTAG', ['ATT', 'TAG'])
    'TAG [ATT]3 [TAG]2 ATT [TAG]2'
    '''
    for motif in repeats:
        sequence = collapse_tandem_repeat(sequence, motif)
    return sequence


def split_by_n(sequence, n):
    '''Split a sequence into non-overlapping chunks of length n.

    Chunks are yielded left to right; the final chunk is shorter than n when the sequence length
    is not a multiple of n. An empty sequence yields nothing.

    Raises ValueError (on first iteration) if n is less than 1, since such a chunk size can
    never consume the sequence.
    '''
    if n < 1:
        raise ValueError(f'chunk length must be a positive integer, got {n!r}')
    # Index into the original sequence rather than repeatedly re-slicing the remainder.
    for start in range(0, len(sequence), n):
        yield sequence[start:start + n]


def sequence_to_bracketed_form(sequence, n, repeats):
    '''Convert sequence to bracketed annotation.

    Known repeats are collapsed first; any leftover unbracketed stretch longer than n is then
    chopped into chunks of at most n characters. Tokens are joined with single spaces.
    '''
    blocks = []
    for token in collapse_all_repeats(sequence, repeats).split(' '):
        if len(token) <= n or '[' in token:
            # Short enough already, or a bracketed repeat that must stay intact.
            blocks.append(token)
        else:
            blocks.extend(split_by_n(token, n))
    return re.sub(r' +', ' ', ' '.join(blocks))


def rev_complement_anno(sequence):
'''
Function creates reverse complement of sequence
Expand Down Expand Up @@ -173,38 +109,6 @@ def rev_comp_uas_output_bracket(forward_bracket, n):
return re.sub(' ', ' ', reverse_strand_bracketed_form)


def get_blocks(sequence, n):
    '''Split a sequence into chunks of length n, and count adjacent repeated chunks.

    Yields one `(chunk, count)` tuple per run of identical consecutive chunks, in order of
    appearance. An empty sequence yields nothing. Used by collapse_repeats_by_length().
    '''
    prev = None
    count = 0
    for unit in split_by_n(sequence, n):
        if unit == prev:
            count += 1
            continue
        # A new chunk value starts here: flush the run that just ended, if there was one.
        if count:
            yield prev, count
        prev = unit
        count = 1
    # Flush the final run. Guarding on the count keeps an empty sequence from producing a
    # spurious (None, 0) entry, which callers would render as '[None]0'.
    if count:
        yield prev, count


def collapse_repeats_by_length(sequence, n):
    '''Build the bracketed annotation of a sequence from fixed-size blocks of n characters.

    Runs of identical adjacent blocks are written as `[block]count`; a block that occurs once
    is written bare. Tokens are separated by single spaces.
    '''
    tokens = [
        block if copies == 1 else f'[{block}]{copies}'
        for block, copies in get_blocks(sequence, n)
    ]
    return re.sub(r' +', ' ', ' '.join(tokens))


def traditional_str_allele(sequence, n, n_sub_out):
'''
Function used to calculate the traditional STR allele designation
Expand Down
102 changes: 102 additions & 0 deletions lusSTR/repeat.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
#!/usr/bin/env python
#
# -----------------------------------------------------------------------------
# Copyright (c) 2020, Battelle National Biodefense Institute.
#
# This file is part of lusSTR (http://github.com/bioforensics/lusSTR)
# and is licensed under the BSD license: see LICENSE.txt.
# -----------------------------------------------------------------------------

import re


def collapse_tandem_repeat(fullseq, repeat):
    '''Collapse tandem stretches of the specified repeat sequence in a larger sequence.

    The first run of back-to-back copies of `repeat` is rewritten as `[repeat]N` (or as the bare
    repeat when it occurs only once), the rest of the sequence is processed recursively, and the
    pieces are joined with single spaces. A sequence that does not contain `repeat` is returned
    unchanged, as is any sequence when `repeat` is the empty string.

    >>> collapse_tandem_repeat('TAGATTATTATTTAGTAGATTTAGTAG', 'ATT')
    'TAG [ATT]3 TAGTAG ATT TAGTAG'
    >>> collapse_tandem_repeat('TAGATTATTATTTAGTAGATTTAGTAG', 'TAG')
    'TAG ATTATTATT [TAG]2 ATT [TAG]2'
    >>> collapse_tandem_repeat('TAGTAG', '')
    'TAGTAG'
    '''
    # An empty repeat matches at every position without consuming any sequence, which would make
    # the counting loop below spin forever; treat it like a repeat that is absent.
    if repeat == '' or repeat not in fullseq:
        return fullseq
    start = fullseq.find(repeat)
    end = start
    count = 0
    # Walk forward one repeat unit at a time for as long as the copies stay back-to-back.
    while fullseq.startswith(repeat, end):
        count += 1
        end += len(repeat)
    unit = repeat if count == 1 else f'[{repeat}]{count}'
    remainder = collapse_tandem_repeat(fullseq[end:], repeat)
    final = f'{fullseq[:start]} {unit} {remainder}'.strip()
    # Flanking pieces may be empty, leaving doubled spaces around the unit; squeeze them.
    return re.sub(r' +', ' ', final)


def collapse_all_repeats(sequence, repeats):
    '''Collapse tandem runs of every listed repeat unit, one repeat at a time.

    The repeats are applied in the order given; each pass operates on the output of the
    previous pass.

    >>> collapse_all_repeats('TAGATTATTATTTAGTAGATTTAGTAG', ['ATT', 'TAG'])
    'TAG [ATT]3 [TAG]2 ATT [TAG]2'
    '''
    for motif in repeats:
        sequence = collapse_tandem_repeat(sequence, motif)
    return sequence


def split_by_n(sequence, n):
    '''Split a sequence into non-overlapping chunks of length n.

    Chunks are yielded left to right; the final chunk is shorter than n when the sequence length
    is not a multiple of n. An empty sequence yields nothing.

    Raises ValueError (on first iteration) if n is less than 1, since such a chunk size can
    never consume the sequence.
    '''
    if n < 1:
        raise ValueError(f'chunk length must be a positive integer, got {n!r}')
    # Index into the original sequence rather than repeatedly re-slicing the remainder.
    for start in range(0, len(sequence), n):
        yield sequence[start:start + n]


def get_blocks(sequence, n):
    '''Split a sequence into chunks of length n, and count adjacent repeated chunks.

    Yields one `(chunk, count)` tuple per run of identical consecutive chunks, in order of
    appearance. An empty sequence yields nothing.
    '''
    prev = None
    count = 0
    for unit in split_by_n(sequence, n):
        if unit == prev:
            count += 1
            continue
        # A new chunk value starts here: flush the run that just ended, if there was one.
        if count:
            yield prev, count
        prev = unit
        count = 1
    # Flush the final run. Guarding on the count keeps an empty sequence from producing a
    # spurious (None, 0) entry, which callers would render as '[None]0'.
    if count:
        yield prev, count


def collapse_repeats_by_length(sequence, n):
    '''Build the bracketed annotation of a sequence from fixed-size blocks of n characters.

    Runs of identical adjacent blocks are written as `[block]count`; a block that occurs once
    is written bare. Tokens are separated by single spaces.
    '''
    tokens = [
        block if copies == 1 else f'[{block}]{copies}'
        for block, copies in get_blocks(sequence, n)
    ]
    return re.sub(r' +', ' ', ' '.join(tokens))


def sequence_to_bracketed_form(sequence, n, repeats):
    '''Convert sequence to bracketed annotation.

    Known repeats are collapsed first; any leftover unbracketed stretch longer than n is then
    chopped into chunks of at most n characters. Tokens are joined with single spaces.
    '''
    blocks = []
    for token in collapse_all_repeats(sequence, repeats).split(' '):
        if len(token) <= n or '[' in token:
            # Short enough already, or a bracketed repeat that must stay intact.
            blocks.append(token)
        else:
            blocks.extend(split_by_n(token, n))
    return re.sub(r' +', ' ', ' '.join(blocks))
53 changes: 53 additions & 0 deletions lusSTR/tests/test_repeat.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
#!/usr/bin/env python
#
# -----------------------------------------------------------------------------
# Copyright (c) 2020, Battelle National Biodefense Institute.
#
# This file is part of lusSTR (http://github.com/bioforensics/lusSTR)
# and is licensed under the BSD license: see LICENSE.txt.
# -----------------------------------------------------------------------------

import lusSTR
from lusSTR.repeat import collapse_tandem_repeat, collapse_all_repeats
from lusSTR.repeat import split_by_n, get_blocks
from lusSTR.repeat import collapse_repeats_by_length, sequence_to_bracketed_form
import pytest


@pytest.mark.parametrize('sequence, repeat_list, output', [
    (
        'AGACAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGACAGACAGACAGACAGACAGACAGAT',
        ['AGAT', 'AGAC'],
        'AGAC [AGAT]11 [AGAC]6 AGAT',
    ),
    (
        'TAGATAGATAGATAGATGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGG',
        ['TCTA', 'CATA', 'TCTG', 'CACA', 'CCTA'],
        'TAGATAGATAGATAGATGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGG',
    ),
])
def test_collapse_all_repeats(sequence, repeat_list, output):
    '''Listed repeats are collapsed; a sequence containing none of them comes back unchanged.'''
    assert collapse_all_repeats(sequence, repeat_list) == output


def test_split_by_n():
    '''A 24-character sequence splits evenly into six 4-character chunks, in order.'''
    expected = ['AGGT', 'AGGT', 'AGGT', 'CGAA', 'CGAA', 'TTGG']
    assert list(split_by_n('AGGTAGGTAGGTCGAACGAATTGG', n=4)) == expected


def test_sequence_to_bracketed_form():
    '''Repeat-based collapsing combined with length-based splitting (n=6) of the leftovers.'''
    expected = '[TCTA]3 [TCTG]6 [TCTA]3 TA [TCTA]3 TCA [TCTA]2 TCCATA [TCTA]11'
    sequence = (
        'TCTATCTATCTATCTGTCTGTCTGTCTGTCTGTCTGTCTATCTATCTATATCTATCTATCTATCATCTATCTATCCATATCTATCTATC'
        'TATCTATCTATCTATCTATCTATCTATCTATCTA'
    )
    observed = sequence_to_bracketed_form(sequence, 6, ['TCTA', 'TCTG'])
    assert observed == expected


def test_collapse_repeats_by_length():
    '''Blocks of four collapse into runs of TCTA separated by a single TATA block.'''
    observed = collapse_repeats_by_length('TCTATCTATCTATCTATCTATCTATCTATATATCTATCTATCTATCTA', 4)
    assert observed == '[TCTA]7 TATA [TCTA]4'
39 changes: 0 additions & 39 deletions lusSTR/tests/test_suite.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,40 +27,6 @@ def test_format():
assert filecmp.cmp(formatoutput, outfile.name) is True


@pytest.mark.parametrize('sequence, repeat_list, output', [
    (
        'AGACAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGACAGACAGACAGACAGACAGACAGAT',
        ['AGAT', 'AGAC'],
        'AGAC [AGAT]11 [AGAC]6 AGAT',
    ),
    (
        'TAGATAGATAGATAGATGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGG',
        ['TCTA', 'CATA', 'TCTG', 'CACA', 'CCTA'],
        'TAGATAGATAGATAGATGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGG',
    ),
])
def test_collapse_all_repeats(sequence, repeat_list, output):
    '''Listed repeats are collapsed; a sequence containing none of them comes back unchanged.'''
    assert lusSTR.annot.collapse_all_repeats(sequence, repeat_list) == output


def test_split_by_n():
    '''A 24-character sequence splits evenly into six 4-character chunks, in order.'''
    expected = ['AGGT', 'AGGT', 'AGGT', 'CGAA', 'CGAA', 'TTGG']
    assert list(lusSTR.annot.split_by_n('AGGTAGGTAGGTCGAACGAATTGG', n=4)) == expected


def test_sequence_to_bracketed_form():
    '''Repeat-based collapsing combined with length-based splitting (n=6) of the leftovers.'''
    expected = '[TCTA]3 [TCTG]6 [TCTA]3 TA [TCTA]3 TCA [TCTA]2 TCCATA [TCTA]11'
    sequence = (
        'TCTATCTATCTATCTGTCTGTCTGTCTGTCTGTCTGTCTATCTATCTATATCTATCTATCTATCATCTATCTATCCATATCTATCTATC'
        'TATCTATCTATCTATCTATCTATCTATCTATCTA'
    )
    observed = lusSTR.annot.sequence_to_bracketed_form(sequence, 6, ['TCTA', 'TCTG'])
    assert observed == expected


def test_extract():
s = '[ATCT]3 ATGT [ATCT]12'
repeat = 'ATCT'
Expand Down Expand Up @@ -92,11 +58,6 @@ def test_rev_comp_uas_output_bracket():
assert rev_comp_bracket == 'CCAA [TTCG]2 [ACCT]3'


def test_collapse_repeats_by_length():
    '''Blocks of four collapse into runs of TCTA separated by a single TATA block.'''
    expected = '[TCTA]7 TATA [TCTA]4'
    observed = lusSTR.annot.collapse_repeats_by_length(
        'TCTATCTATCTATCTATCTATCTATCTATATATCTATCTATCTATCTA', 4
    )
    assert observed == expected


@pytest.mark.parametrize('sequence, bracket_form', [
(
'TCTATCTATCTATCTATCTATCTGTCTGTCTGTCTGTCTGTCTATCTATCTATATCTATCTATCTATCATCTATCTATCCATATCTATC'
Expand Down