New "repeat" module (#23)

* Reimplement and rename get_annotation function
* loci_need_split_anno --> collapse_repeats_by_length
* Fix style violations
* split_string --> sequence_to_bracketed_form
* New repeat module
bioforensics · May 27, 2020 · 0091ed3 · 0091ed3
1 parent 763b688
commit 0091ed3
Show file tree

Hide file tree

Showing 6 changed files with 161 additions and 140 deletions.
diff --git a/Makefile b/Makefile
@@ -6,11 +6,11 @@ help: Makefile
 
 ## test:      run the automated test suite and print coverage information
 test:
-	pytest --cov=lusSTR --doctest-modules lusSTR/annot.py lusSTR/tests/test_suite.py
+	pytest --cov=lusSTR --doctest-modules lusSTR/annot.py lusSTR/tests/test_*.py
 
 ## style:     check code style against PEP8
 style:
-	pycodestyle --max-line-length=99 lusSTR/*.py lusSTR/tests/test_suite.py
+	pycodestyle --max-line-length=99 lusSTR/*.py lusSTR/tests/test_*.py
 
 ## devenv:    configure a development environment
 devenv:

diff --git a/lusSTR/__init__.py b/lusSTR/__init__.py
@@ -8,6 +8,7 @@
 # -----------------------------------------------------------------------------
 
 from lusSTR import annot
+from lusSTR import repeat
 from lusSTR.annot import str_dict
 from lusSTR import format
 from lusSTR import cli

diff --git a/lusSTR/annot.py b/lusSTR/annot.py
@@ -14,7 +14,10 @@
 from pkg_resources import resource_filename
 import re
 import sys
+
 import lusSTR
+from lusSTR.repeat import collapse_all_repeats, collapse_repeats_by_length
+from lusSTR.repeat import sequence_to_bracketed_form, split_by_n
 
 
 def get_str_metadata_file():
@@ -25,73 +28,6 @@ def get_str_metadata_file():
     str_dict = json.load(fh)
 
 
-def collapse_tandem_repeat(fullseq, repeat):
-    '''Collapse tandem stretches of the specified repeat sequence in a larger sequence.
-
-    >>> collapse_tandem_repeat('TAGATTATTATTTAGTAGATTTAGTAG', 'ATT')
-    'TAG [ATT]3 TAGTAG ATT TAGTAG'
-    >>> collapse_tandem_repeat('TAGATTATTATTTAGTAGATTTAGTAG', 'TAG')
-    'TAG ATTATTATT [TAG]2 ATT [TAG]2'
-    '''
-    if repeat not in fullseq:
-        return fullseq
-    i = fullseq.find(repeat)
-    prefix = fullseq[:i]
-    suffix = fullseq[i:]
-    count = 0
-    while suffix.startswith(repeat):
-        count += 1
-        suffix = suffix[len(repeat):]
-    if count == 1:
-        formatted = f' {repeat} '
-    else:
-        formatted = f' [{repeat}]{count} '
-    final = prefix + formatted + collapse_tandem_repeat(suffix, repeat)
-    final = final.strip()
-    final = re.sub(r' +', ' ', final)
-    return final
-
-
-def collapse_all_repeats(sequence, repeats):
-    '''Convert a sequence to bracketed form by collapsing stretches of tandem repeats.
-
-    >>> collapse_all_repeats('TAGATTATTATTTAGTAGATTTAGTAG', ['ATT', 'TAG'])
-    'TAG [ATT]3 [TAG]2 ATT [TAG]2'
-    '''
-    collapsed_seq = sequence
-    for repeat in repeats:
-        collapsed_seq = collapse_tandem_repeat(collapsed_seq, repeat)
-    return collapsed_seq
-
-
-def split_by_n(sequence, n):
-    '''
-    Function to divide sequence into chunks of n
-    '''
-    while sequence:
-        yield sequence[:n]
-        sequence = sequence[n:]
-
-
-def sequence_to_bracketed_form(sequence, n, repeats):
-    '''Convert sequence to bracketed annotation.
-
-    Uses a combination of repeat-based and length-based methods to convert a sequence containing
-    tandem repeats into a concise bracketed representation.
-    '''
-    collapsed = collapse_all_repeats(sequence, repeats)
-    blocks = list()
-    for unit in collapsed.split(' '):
-        if len(unit) > n and '[' not in unit:
-            for x in split_by_n(unit, n):
-                blocks.append(x)
-        else:
-            blocks.append(unit)
-    result = ' '.join(blocks)
-    result = re.sub(r' +', ' ', result)
-    return result
-
-
 def rev_complement_anno(sequence):
     '''
     Function creates reverse complement of sequence
@@ -173,38 +109,6 @@ def rev_comp_uas_output_bracket(forward_bracket, n):
     return re.sub('  ', ' ', reverse_strand_bracketed_form)
 
 
-def get_blocks(sequence, n):
-    '''
-    Function to split a sequence into blocks of size n
-
-    This function is used as a part of the collapse_repeats_by_length() function. It splits the
-    sequence into blocks of size n bases (as specified in the str_markers.json file).
-    '''
-    count = 0
-    prev = None
-    for unit in split_by_n(sequence, n):
-        if unit != prev:
-            if prev is not None:
-                yield prev, count
-            prev = unit
-            count = 0
-        count += 1
-    yield prev, count
-
-
-def collapse_repeats_by_length(sequence, n):
-    '''Convert to bracketed annotation form by splitting the sequence into blocks of size n.'''
-    units = list()
-    for unit, count in get_blocks(sequence, n):
-        if count == 1:
-            units.append(unit)
-        else:
-            units.append(f'[{unit}]{count}')
-    result = '  '.join(units)
-    result = re.sub(r' +', ' ', result)
-    return result
-
-
 def traditional_str_allele(sequence, n, n_sub_out):
     '''
     Function used to calculate the traditional STR allele designation

diff --git a/lusSTR/repeat.py b/lusSTR/repeat.py
@@ -0,0 +1,102 @@
+#!/usr/bin/env python
+#
+# -----------------------------------------------------------------------------
+# Copyright (c) 2020, Battelle National Biodefense Institute.
+#
+# This file is part of lusSTR (http://github.com/bioforensics/lusSTR)
+# and is licensed under the BSD license: see LICENSE.txt.
+# -----------------------------------------------------------------------------
+
+import re
+
+
+def collapse_tandem_repeat(fullseq, repeat):
+    '''Collapse tandem stretches of the specified repeat sequence in a larger sequence.
+
+    >>> collapse_tandem_repeat('TAGATTATTATTTAGTAGATTTAGTAG', 'ATT')
+    'TAG [ATT]3 TAGTAG ATT TAGTAG'
+    >>> collapse_tandem_repeat('TAGATTATTATTTAGTAGATTTAGTAG', 'TAG')
+    'TAG ATTATTATT [TAG]2 ATT [TAG]2'
+    '''
+    if repeat not in fullseq:
+        return fullseq
+    i = fullseq.find(repeat)
+    prefix = fullseq[:i]
+    suffix = fullseq[i:]
+    count = 0
+    while suffix.startswith(repeat):
+        count += 1
+        suffix = suffix[len(repeat):]
+    if count == 1:
+        formatted = f' {repeat} '
+    else:
+        formatted = f' [{repeat}]{count} '
+    final = prefix + formatted + collapse_tandem_repeat(suffix, repeat)
+    final = final.strip()
+    final = re.sub(r' +', ' ', final)
+    return final
+
+
+def collapse_all_repeats(sequence, repeats):
+    '''Convert a sequence to bracketed form by collapsing stretches of tandem repeats.
+
+    >>> collapse_all_repeats('TAGATTATTATTTAGTAGATTTAGTAG', ['ATT', 'TAG'])
+    'TAG [ATT]3 [TAG]2 ATT [TAG]2'
+    '''
+    collapsed_seq = sequence
+    for repeat in repeats:
+        collapsed_seq = collapse_tandem_repeat(collapsed_seq, repeat)
+    return collapsed_seq
+
+
+def split_by_n(sequence, n):
+    '''Split a sequence into non-overlapping chunks of length n.'''
+    while sequence:
+        yield sequence[:n]
+        sequence = sequence[n:]
+
+
+def get_blocks(sequence, n):
+    '''Split a sequence into chunks of length n, and count adjacent repeated chunks.'''
+    count = 0
+    prev = None
+    for unit in split_by_n(sequence, n):
+        if unit != prev:
+            if prev is not None:
+                yield prev, count
+            prev = unit
+            count = 0
+        count += 1
+    yield prev, count
+
+
+def collapse_repeats_by_length(sequence, n):
+    '''Convert to bracketed annotation form by splitting the sequence into blocks of size n.'''
+    units = list()
+    for unit, count in get_blocks(sequence, n):
+        if count == 1:
+            units.append(unit)
+        else:
+            units.append(f'[{unit}]{count}')
+    result = '  '.join(units)
+    result = re.sub(r' +', ' ', result)
+    return result
+
+
+def sequence_to_bracketed_form(sequence, n, repeats):
+    '''Convert sequence to bracketed annotation.
+
+    Uses a combination of repeat-based and length-based methods to convert a sequence containing
+    tandem repeats into a concise bracketed representation.
+    '''
+    collapsed = collapse_all_repeats(sequence, repeats)
+    blocks = list()
+    for unit in collapsed.split(' '):
+        if len(unit) > n and '[' not in unit:
+            for x in split_by_n(unit, n):
+                blocks.append(x)
+        else:
+            blocks.append(unit)
+    result = ' '.join(blocks)
+    result = re.sub(r' +', ' ', result)
+    return result
diff --git a/lusSTR/tests/test_repeat.py b/lusSTR/tests/test_repeat.py
@@ -0,0 +1,53 @@
+#!/usr/bin/env python
+#
+# -----------------------------------------------------------------------------
+# Copyright (c) 2020, Battelle National Biodefense Institute.
+#
+# This file is part of lusSTR (http://github.com/bioforensics/lusSTR)
+# and is licensed under the BSD license: see LICENSE.txt.
+# -----------------------------------------------------------------------------
+
+import lusSTR
+from lusSTR.repeat import collapse_tandem_repeat, collapse_all_repeats
+from lusSTR.repeat import split_by_n, get_blocks
+from lusSTR.repeat import collapse_repeats_by_length, sequence_to_bracketed_form
+import pytest
+
+
+@pytest.mark.parametrize('sequence, repeat_list, output', [
+    (
+        'AGACAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGACAGACAGACAGACAGACAGACAGAT',
+        ['AGAT', 'AGAC'], 'AGAC [AGAT]11 [AGAC]6 AGAT'
+    ),
+    (
+        'TAGATAGATAGATAGATGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGG',
+        ['TCTA', 'CATA', 'TCTG', 'CACA', 'CCTA'],
+        'TAGATAGATAGATAGATGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGG'
+    )
+])
+def test_collapse_all_repeats(sequence, repeat_list, output):
+    final_output = collapse_all_repeats(sequence, repeat_list)
+    assert final_output == output
+
+
+def test_split_by_n():
+    sequence = 'AGGTAGGTAGGTCGAACGAATTGG'
+    blocks = list(split_by_n(sequence, n=4))
+    assert blocks == [
+        'AGGT', 'AGGT', 'AGGT', 'CGAA', 'CGAA', 'TTGG'
+    ]
+
+
+def test_sequence_to_bracketed_form():
+    sequence = (
+        'TCTATCTATCTATCTGTCTGTCTGTCTGTCTGTCTGTCTATCTATCTATATCTATCTATCTATCATCTATCTATCCATATCTATCTATC'
+        'TATCTATCTATCTATCTATCTATCTATCTATCTA'
+    )
+    repeats = ['TCTA', 'TCTG']
+    final_output = sequence_to_bracketed_form(sequence, 6, repeats)
+    assert final_output == '[TCTA]3 [TCTG]6 [TCTA]3 TA [TCTA]3 TCA [TCTA]2 TCCATA [TCTA]11'
+
+
+def test_collapse_repeats_by_length():
+    sequence = 'TCTATCTATCTATCTATCTATCTATCTATATATCTATCTATCTATCTA'
+    assert collapse_repeats_by_length(sequence, 4) == '[TCTA]7 TATA [TCTA]4'
diff --git a/lusSTR/tests/test_suite.py b/lusSTR/tests/test_suite.py
@@ -27,40 +27,6 @@ def test_format():
         assert filecmp.cmp(formatoutput, outfile.name) is True
 
 
-@pytest.mark.parametrize('sequence, repeat_list, output', [
-    (
-        'AGACAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGACAGACAGACAGACAGACAGACAGAT',
-        ['AGAT', 'AGAC'], 'AGAC [AGAT]11 [AGAC]6 AGAT'
-    ),
-    (
-        'TAGATAGATAGATAGATGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGG',
-        ['TCTA', 'CATA', 'TCTG', 'CACA', 'CCTA'],
-        'TAGATAGATAGATAGATGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGG'
-    )
-])
-def test_collapse_all_repeats(sequence, repeat_list, output):
-    final_output = lusSTR.annot.collapse_all_repeats(sequence, repeat_list)
-    assert final_output == output
-
-
-def test_split_by_n():
-    sequence = 'AGGTAGGTAGGTCGAACGAATTGG'
-    blocks = list(lusSTR.annot.split_by_n(sequence, n=4))
-    assert blocks == [
-        'AGGT', 'AGGT', 'AGGT', 'CGAA', 'CGAA', 'TTGG'
-    ]
-
-
-def test_sequence_to_bracketed_form():
-    sequence = (
-        'TCTATCTATCTATCTGTCTGTCTGTCTGTCTGTCTGTCTATCTATCTATATCTATCTATCTATCATCTATCTATCCATATCTATCTATC'
-        'TATCTATCTATCTATCTATCTATCTATCTATCTA'
-    )
-    repeats = ['TCTA', 'TCTG']
-    final_output = lusSTR.annot.sequence_to_bracketed_form(sequence, 6, repeats)
-    assert final_output == '[TCTA]3 [TCTG]6 [TCTA]3 TA [TCTA]3 TCA [TCTA]2 TCCATA [TCTA]11'
-
-
 def test_extract():
     s = '[ATCT]3 ATGT [ATCT]12'
     repeat = 'ATCT'
@@ -92,11 +58,6 @@ def test_rev_comp_uas_output_bracket():
     assert rev_comp_bracket == 'CCAA [TTCG]2 [ACCT]3'
 
 
-def test_collapse_repeats_by_length():
-    sequence = 'TCTATCTATCTATCTATCTATCTATCTATATATCTATCTATCTATCTA'
-    assert lusSTR.annot.collapse_repeats_by_length(sequence, 4) == '[TCTA]7 TATA [TCTA]4'
-
-
 @pytest.mark.parametrize('sequence, bracket_form', [
     (
         'TCTATCTATCTATCTATCTATCTGTCTGTCTGTCTGTCTGTCTATCTATCTATATCTATCTATCTATCATCTATCTATCCATATCTATC'