exclusion of positively selected genes implemented and tested

instituteofcancerresearch · Sep 1, 2023 · 2cc0b45 · 2cc0b45
1 parent 7bd7e1e
commit 2cc0b45
Show file tree

Hide file tree

Showing 4 changed files with 63 additions and 17 deletions.
diff --git a/src/SOPRANO/objects.py b/src/SOPRANO/objects.py
@@ -37,11 +37,11 @@ class _GenomicPaths:
 
 
 @dataclass(frozen=True)
-class _AuxiliaryPaths:
+class AuxiliaryPaths:
     genes_to_exclude: pathlib.Path
 
 
-AuxiliaryPaths = _AuxiliaryPaths(
+AuxiliaryFiles = AuxiliaryPaths(
     genes_to_exclude=_data_dir().joinpath("genes2exclude.txt")
 )
 

diff --git a/src/SOPRANO/prepare_coordinates.py b/src/SOPRANO/prepare_coordinates.py
@@ -336,11 +336,18 @@ def _exclude_positively_selected_genes_disabled(paths: AnalysisPaths):
     subprocess_pipes.pipe(["cp", paths.exclusions_shuffled, paths.epitopes])
 
 
-def _exclude_positively_selected_genes(paths: AnalysisPaths):
+def _exclude_positively_selected_genes(
+    paths: AnalysisPaths, aux_paths: AuxiliaryPaths
+):
     """
     Implement
     fgrep -w -v -f $SUPA/genes2exclude.txt $TMP/$NAME.epitopes.ori2 >
         $TMP/$NAME.epitopes.bed
+
+    -w : match only where the pattern forms a whole word
+    -v : invert selection, i.e. select non-matching lines
+    -f : read the patterns from a file, one per line
+
     :param paths:
     :return:
     """
@@ -351,14 +358,12 @@ def _exclude_positively_selected_genes(paths: AnalysisPaths):
             "-w",
             "-v",
             "-f",
-            AuxiliaryPaths.genes_to_exclude.as_posix(),
+            aux_paths.genes_to_exclude.as_posix(),
             paths.exclusions_shuffled.as_posix(),
         ],
         output_path=paths.epitopes,
     )
 
-    # TODO: Unit test
-
 
 def get_protein_complement(paths: AnalysisPaths):
     """

diff --git a/src/SOPRANO/sh_utils/subprocess_pipes.py b/src/SOPRANO/sh_utils/subprocess_pipes.py
@@ -52,6 +52,7 @@ def pipe(
     :return: string representation of stdout from cumulative piped processes
     """
 
+    # TODO: Should we be raising an error on non-zero exit status?
     ps = subprocess.run(args[0], input=_input, capture_output=True)
 
     if len(args) > 1:

diff --git a/tests/test_units/test_prepare_coordinates.py b/tests/test_units/test_prepare_coordinates.py
@@ -3,7 +3,7 @@
 import pytest
 
 import SOPRANO.prepare_coordinates as prep_coords
-from SOPRANO.objects import AnalysisPaths, TranscriptPaths
+from SOPRANO.objects import AnalysisPaths, AuxiliaryPaths, TranscriptPaths
 
 
 def tab_line(*args):
@@ -124,6 +124,9 @@ def tab_line(*args):
 # Fictitious target regions for randomization
 mock_target_regions = [tab_line("ENST00000000233", 500, 1000)]
 
+# Used to build mock aux files
+mock_genes2exclude = [tab_line("ENST00000001008")]
+
 
 def check_expected_content(
     expected_content: list, written_content_path: pathlib.Path
@@ -152,6 +155,7 @@ def test_files(tmp_path):
     anno_path = inputs_dir.joinpath("input.anno")
     bed_path = inputs_dir.joinpath("input.bed")
     targets_path = inputs_dir.joinpath("targets.bed")
+    genes2exclude_path = inputs_dir.joinpath("genes2exclude.txt")
 
     trans_path = transcripts_dir.joinpath("transcript_length.txt")
     trans_prot_path = transcripts_dir.joinpath(
@@ -162,26 +166,35 @@ def test_files(tmp_path):
         "test_data", bed_path, tmpdir, target_regions_path=targets_path
     )
     transcripts = TranscriptPaths(trans_path, trans_prot_path)
+    auxiliaries = AuxiliaryPaths(genes2exclude_path)
 
     for _input_path, _input_content in zip(
-        (anno_path, bed_path, trans_path, trans_prot_path, targets_path),
+        (
+            anno_path,
+            bed_path,
+            trans_path,
+            trans_prot_path,
+            targets_path,
+            genes2exclude_path,
+        ),
         (
             mock_input_content,
             mock_bed_content,
             mock_transcript_content,
             mock_protein_transcript_content,
             mock_target_regions,
+            mock_genes2exclude,
         ),
     ):
         with open(_input_path, "w") as f:
             f.writelines(_input_content)
 
-    return paths, transcripts
+    return paths, transcripts, auxiliaries
 
 
 @pytest.mark.dependency(name="_filter_transcript_file")
 def test__filter_transcript_file(test_files):
-    paths, transcripts = test_files
+    paths, transcripts, auxiliaries = test_files
 
     expected_content = [
         tab_line("ENST00000000233", 543),
@@ -202,7 +215,7 @@ def test__filter_transcript_file(test_files):
     name="filter_trans_files", depends=["_filter_transcript_file"]
 )
 def test_filter_transcript_files(test_files):
-    paths, transcripts = test_files
+    paths, transcripts, auxiliaries = test_files
 
     expected_trans_content = [
         tab_line("ENST00000000233", 543),
@@ -226,7 +239,7 @@ def test_filter_transcript_files(test_files):
 
 @pytest.mark.dependency(name="_define_excl_regs")
 def test__define_excluded_regions_for_randomization(test_files):
-    paths, transcripts = test_files
+    paths, transcripts, auxiliaries = test_files
 
     expected_content = mock_bed_content + [
         tab_line("ENST00000000233", 0, 2),
@@ -243,7 +256,7 @@ def test__define_excluded_regions_for_randomization(test_files):
 
 @pytest.mark.dependency(depends=["_define_excl_regs", "filter_trans_files"])
 def test__sort_excluded_regions_for_randomization(test_files):
-    paths, transcripts = test_files
+    paths, transcripts, auxiliaries = test_files
 
     # Every item has an "ENST<...>  0   2" pair
     # So after sorting, we expect that
@@ -288,7 +301,7 @@ def test__sort_excluded_regions_for_randomization(test_files):
 
 @pytest.mark.dependency(depends=["filter_trans_files"])
 def test__randomize_with_target_file(test_files):
-    paths, transcripts = test_files
+    paths, transcripts, auxiliaries = test_files
 
     prep_coords.filter_transcript_files(paths, transcripts)
 
@@ -311,7 +324,7 @@ def test__randomize_with_target_file(test_files):
 
 
 def test__non_randomized(test_files):
-    paths, transcripts = test_files
+    paths, transcripts, auxiliaries = test_files
     # expected_content = [
     #     tab_line("ENST00000000233", 115, 124),
     #     tab_line("ENST00000000233", 164, 177),
@@ -325,5 +338,32 @@ def test__non_randomized(test_files):
     # check_expected_content(expected_content, paths.exclusions_shuffled)
 
 
-def test__exclude_positively_selected_genes_disabled():
-    pass
+def test__exclude_positively_selected_genes_disabled(test_files):
+    paths, transcripts, auxiliaries = test_files
+
+    # Write dummy data to the shuffled exclusions file: with the exclusion of
+    # positively selected genes disabled, it should be copied to epitopes as-is
+    paths.exclusions_shuffled.write_text(tab_line("chr1", 123, 456))
+
+    prep_coords._exclude_positively_selected_genes_disabled(paths)
+
+    with open(paths.exclusions_shuffled, "r") as f:
+        expected_content = f.readlines()
+
+    check_expected_content(expected_content, paths.epitopes)
+
+
+def test__exclude_positively_selected_genes(test_files):
+    paths, transcripts, auxiliaries = test_files
+
+    # Write dummy data for "shuffle file"
+    paths.exclusions_shuffled.write_text(
+        tab_line("ENST00000000233", 164, 177)
+        + tab_line("ENST00000001008", 113, 124)
+    )
+
+    # In the exclusions aux file, we have ENST00000001008
+    expected_content = [tab_line("ENST00000000233", 164, 177)]
+    prep_coords._exclude_positively_selected_genes(paths, auxiliaries)
+
+    check_expected_content(expected_content, paths.epitopes)