diff --git a/src/SOPRANO/objects.py b/src/SOPRANO/objects.py index b6026fe..e15d33d 100755 --- a/src/SOPRANO/objects.py +++ b/src/SOPRANO/objects.py @@ -37,11 +37,11 @@ class _GenomicPaths: @dataclass(frozen=True) -class _AuxiliaryPaths: +class AuxiliaryPaths: genes_to_exclude: pathlib.Path -AuxiliaryPaths = _AuxiliaryPaths( +AuxiliaryFiles = AuxiliaryPaths( genes_to_exclude=_data_dir().joinpath("genes2exclude.txt") ) diff --git a/src/SOPRANO/prepare_coordinates.py b/src/SOPRANO/prepare_coordinates.py index 16f8428..5e97e35 100755 --- a/src/SOPRANO/prepare_coordinates.py +++ b/src/SOPRANO/prepare_coordinates.py @@ -336,11 +336,18 @@ def _exclude_positively_selected_genes_disabled(paths: AnalysisPaths): subprocess_pipes.pipe(["cp", paths.exclusions_shuffled, paths.epitopes]) -def _exclude_positively_selected_genes(paths: AnalysisPaths): +def _exclude_positively_selected_genes( + paths: AnalysisPaths, aux_paths: AuxiliaryPaths +): """ Implement fgrep -w -v -f $SUPA/genes2exclude.txt $TMP/$NAME.epitopes.ori2 > $TMP/$NAME.epitopes.bed + + -w : match only whole words between files + -v : invert selection, i.e. select non-matching lines + -f : takes pattern from file + :param paths: :return: """ @@ -351,14 +358,12 @@ def _exclude_positively_selected_genes(paths: AnalysisPaths): "-w", "-v", "-f", - AuxiliaryPaths.genes_to_exclude.as_posix(), + aux_paths.genes_to_exclude.as_posix(), paths.exclusions_shuffled.as_posix(), ], output_path=paths.epitopes, ) - # TODO: Unit test - def get_protein_complement(paths: AnalysisPaths): """ diff --git a/src/SOPRANO/sh_utils/subprocess_pipes.py b/src/SOPRANO/sh_utils/subprocess_pipes.py index 50d1ab1..fd6f55c 100755 --- a/src/SOPRANO/sh_utils/subprocess_pipes.py +++ b/src/SOPRANO/sh_utils/subprocess_pipes.py @@ -52,6 +52,7 @@ def pipe( :return: string representation of stdout from cumulative piped processes """ + # TODO: Should we be raising an error on non-zero exit status? ps = subprocess.run(args[0], input=_input, capture_output=True) if len(args) > 1: diff --git a/tests/test_units/test_prepare_coordinates.py b/tests/test_units/test_prepare_coordinates.py index f837aa5..a5d9eac 100755 --- a/tests/test_units/test_prepare_coordinates.py +++ b/tests/test_units/test_prepare_coordinates.py @@ -3,7 +3,7 @@ import pytest import SOPRANO.prepare_coordinates as prep_coords -from SOPRANO.objects import AnalysisPaths, TranscriptPaths +from SOPRANO.objects import AnalysisPaths, AuxiliaryPaths, TranscriptPaths def tab_line(*args): @@ -124,6 +124,9 @@ def tab_line(*args): # Fictitious target regions for randomization mock_target_regions = [tab_line("ENST00000000233", 500, 1000)] +# Used to build mock aux files +mock_genes2exclude = [tab_line("ENST00000001008")] + def check_expected_content( expected_content: list, written_content_path: pathlib.Path @@ -152,6 +155,7 @@ def test_files(tmp_path): anno_path = inputs_dir.joinpath("input.anno") bed_path = inputs_dir.joinpath("input.bed") targets_path = inputs_dir.joinpath("targets.bed") + genes2exclude_path = inputs_dir.joinpath("genes2exclude.txt") trans_path = transcripts_dir.joinpath("transcript_length.txt") trans_prot_path = transcripts_dir.joinpath( @@ -162,26 +166,35 @@ def test_files(tmp_path): "test_data", bed_path, tmpdir, target_regions_path=targets_path ) transcripts = TranscriptPaths(trans_path, trans_prot_path) + auxiliaries = AuxiliaryPaths(genes2exclude_path) for _input_path, _input_content in zip( - (anno_path, bed_path, trans_path, trans_prot_path, targets_path), + ( + anno_path, + bed_path, + trans_path, + trans_prot_path, + targets_path, + genes2exclude_path, + ), ( mock_input_content, mock_bed_content, mock_transcript_content, mock_protein_transcript_content, mock_target_regions, + mock_genes2exclude, ), ): with open(_input_path, "w") as f: f.writelines(_input_content) - return paths, transcripts + return paths, transcripts, auxiliaries @pytest.mark.dependency(name="_filter_transcript_file") def test__filter_transcript_file(test_files): - paths, transcripts = test_files + paths, transcripts, auxiliaries = test_files expected_content = [ tab_line("ENST00000000233", 543), @@ -202,7 +215,7 @@ def test__filter_transcript_file(test_files): name="filter_trans_files", depends=["_filter_transcript_file"] ) def test_filter_transcript_files(test_files): - paths, transcripts = test_files + paths, transcripts, auxiliaries = test_files expected_trans_content = [ tab_line("ENST00000000233", 543), @@ -226,7 +239,7 @@ def test_filter_transcript_files(test_files): @pytest.mark.dependency(name="_define_excl_regs") def test__define_excluded_regions_for_randomization(test_files): - paths, transcripts = test_files + paths, transcripts, auxiliaries = test_files expected_content = mock_bed_content + [ tab_line("ENST00000000233", 0, 2), @@ -243,7 +256,7 @@ def test__define_excluded_regions_for_randomization(test_files): @pytest.mark.dependency(depends=["_define_excl_regs", "filter_trans_files"]) def test__sort_excluded_regions_for_randomization(test_files): - paths, transcripts = test_files + paths, transcripts, auxiliaries = test_files # Every item has an "ENST<...> 0 2" pair # So after sorting, we expect that @@ -288,7 +301,7 @@ def test__sort_excluded_regions_for_randomization(test_files): @pytest.mark.dependency(depends=["filter_trans_files"]) def test__randomize_with_target_file(test_files): - paths, transcripts = test_files + paths, transcripts, auxiliaries = test_files prep_coords.filter_transcript_files(paths, transcripts) @@ -311,7 +324,7 @@ def test__randomize_with_target_file(test_files): def test__non_randomized(test_files): - paths, transcripts = test_files + paths, transcripts, auxiliaries = test_files # expected_content = [ # tab_line("ENST00000000233", 115, 124), # tab_line("ENST00000000233", 164, 177), @@ -325,5 +338,32 @@ def test__non_randomized(test_files): # check_expected_content(expected_content, paths.exclusions_shuffled) -def test__exclude_positively_selected_genes_disabled(): - pass +def test__exclude_positively_selected_genes_disabled(test_files): + paths, transcripts, auxiliaries = test_files + + # Dummy data written to shuffled exclusions file: + # When positively selected genes are disabled, should just copy this file + paths.exclusions_shuffled.write_text(tab_line("chr1", 123, 456)) + + prep_coords._exclude_positively_selected_genes_disabled(paths) + + with open(paths.exclusions_shuffled, "r") as f: + expected_content = f.readlines() + + check_expected_content(expected_content, paths.epitopes) + + +def test__exclude_positively_selected_genes(test_files): + paths, transcripts, auxiliaries = test_files + + # Write dummy data for "shuffle file" + paths.exclusions_shuffled.write_text( + tab_line("ENST00000000233", 164, 177) + + tab_line("ENST00000001008", 113, 124) + ) + + # In the exclusions aux file, we have ENST00000001008 + expected_content = [tab_line("ENST00000000233", 164, 177)] + prep_coords._exclude_positively_selected_genes(paths, auxiliaries) + + check_expected_content(expected_content, paths.epitopes)