From 9191e687b246f6758afd8349c871207130b7e1ee Mon Sep 17 00:00:00 2001 From: Cole Lyman Date: Fri, 22 Mar 2024 14:05:11 -0600 Subject: [PATCH 1/8] Move read filtering to after merging in CRISPResso (#39) * Move read filtering to after merging This is in an effort to be consistent with the behavior and results of CRISPRessoPooled. * Properly assign the correct file names for read filtering * Add space around operators --- CRISPResso2/CRISPRessoCORE.py | 64 ++++++++++++++--------------------- 1 file changed, 26 insertions(+), 38 deletions(-) diff --git a/CRISPResso2/CRISPRessoCORE.py b/CRISPResso2/CRISPRessoCORE.py index 9a7c278e..5ac57018 100644 --- a/CRISPResso2/CRISPRessoCORE.py +++ b/CRISPResso2/CRISPRessoCORE.py @@ -2117,44 +2117,6 @@ def get_prime_editing_guides(this_amp_seq, this_amp_name, ref0_seq, prime_edited info('Done!', {'percent_complete': 4}) - if args.min_average_read_quality>0 or args.min_single_bp_quality>0 or args.min_bp_quality_or_N>0: - if args.bam_input != '': - raise CRISPRessoShared.BadParameterException('The read filtering options are not available with bam input') - info('Filtering reads with average bp quality < %d and single bp quality < %d and replacing bases with quality < %d with N ...' % (args.min_average_read_quality, args.min_single_bp_quality, args.min_bp_quality_or_N)) - min_av_quality = None - if args.min_average_read_quality > 0: - min_av_quality = args.min_average_read_quality - - min_single_bp_quality = None - if args.min_single_bp_quality > 0: - min_single_bp_quality = args.min_single_bp_quality - - min_bp_quality_or_N = None - if args.min_bp_quality_or_N > 0: - min_bp_quality_or_N = args.min_bp_quality_or_N - - if args.fastq_r2!='': - output_filename_r1=_jp(os.path.basename(args.fastq_r1.replace('.fastq', '')).replace('.gz', '')+'_filtered.fastq.gz') - output_filename_r2=_jp(os.path.basename(args.fastq_r2.replace('.fastq', '')).replace('.gz', '')+'_filtered.fastq.gz') - - from CRISPResso2 import filterFastqs - filterFastqs.filterFastqs(fastq_r1=args.fastq_r1, fastq_r2=args.fastq_r2, fastq_r1_out=output_filename_r1, fastq_r2_out=output_filename_r2, min_bp_qual_in_read=min_single_bp_quality, min_av_read_qual=min_av_quality, min_bp_qual_or_N=min_bp_quality_or_N) - - args.fastq_r1 = output_filename_r1 - args.fastq_r2 = output_filename_r2 - files_to_remove += [output_filename_r1] - files_to_remove += [output_filename_r2] - - else: - output_filename_r1=_jp(os.path.basename(args.fastq_r1.replace('.fastq', '')).replace('.gz', '')+'_filtered.fastq.gz') - - from CRISPResso2 import filterFastqs - filterFastqs.filterFastqs(fastq_r1=args.fastq_r1, fastq_r1_out=output_filename_r1, min_bp_qual_in_read=min_single_bp_quality, min_av_read_qual=min_av_quality, min_bp_qual_or_N=min_bp_quality_or_N) - - args.fastq_r1 = output_filename_r1 - files_to_remove += [output_filename_r1] - - #Trim and merge reads if args.bam_input != '' and args.trim_sequences: raise CRISPRessoShared.BadParameterException('Read trimming options are not available with bam input') @@ -2292,7 +2254,33 @@ def get_prime_editing_guides(this_amp_seq, this_amp_name, ref0_seq, prime_edited info('Wrote force-merged reads to ' + new_merged_filename) info('Done!', {'percent_complete': 7}) + else: # single end reads with no trimming + processed_output_filename = args.fastq_r1 + + if args.min_average_read_quality > 0 or args.min_single_bp_quality > 0 or args.min_bp_quality_or_N > 0: + if args.bam_input != '': + raise CRISPRessoShared.BadParameterException('The read filtering options are not available with bam input') + info('Filtering reads with average bp quality < %d and single bp quality < %d and replacing bases with quality < %d with N ...' % (args.min_average_read_quality, args.min_single_bp_quality, args.min_bp_quality_or_N)) + min_av_quality = None + if args.min_average_read_quality > 0: + min_av_quality = args.min_average_read_quality + + min_single_bp_quality = None + if args.min_single_bp_quality > 0: + min_single_bp_quality = args.min_single_bp_quality + + min_bp_quality_or_N = None + if args.min_bp_quality_or_N > 0: + min_bp_quality_or_N = args.min_bp_quality_or_N + + output_filename_r1 = _jp(os.path.basename( + processed_output_filename.replace('.fastq', '')).replace('.gz', '') + '_filtered.fastq.gz', + ) + + from CRISPResso2 import filterFastqs + filterFastqs.filterFastqs(fastq_r1=processed_output_filename, fastq_r1_out=output_filename_r1, min_bp_qual_in_read=min_single_bp_quality, min_av_read_qual=min_av_quality, min_bp_qual_or_N=min_bp_quality_or_N) + processed_output_filename = output_filename_r1 #count reads N_READS_AFTER_PREPROCESSING = 0 From e30cec33d9f0ce867251e170ffa820a51d31724e Mon Sep 17 00:00:00 2001 From: Samuel Nichols Date: Fri, 22 Mar 2024 14:31:38 -0600 Subject: [PATCH 2/8] GitHub actions on pr (#51) * Run integration tests on pull_request * Run pytest on pull_request * Run pylint on pull_request --- .github/workflows/integration_tests.yml | 1 + .github/workflows/pylint.yml | 1 + .github/workflows/pytest.yml | 1 + 3 files changed, 3 insertions(+) diff --git a/.github/workflows/integration_tests.yml b/.github/workflows/integration_tests.yml index fae5a96b..cb9135b0 100644 --- a/.github/workflows/integration_tests.yml +++ b/.github/workflows/integration_tests.yml @@ -2,6 +2,7 @@ name: Run Integration Tests on: push: + pull_request: jobs: build: diff --git a/.github/workflows/pylint.yml b/.github/workflows/pylint.yml index 67b57b6f..ee421c7d 100644 --- a/.github/workflows/pylint.yml +++ b/.github/workflows/pylint.yml @@ -2,6 +2,7 @@ name: Pylint on: push: + pull_request: jobs: build: diff --git a/.github/workflows/pytest.yml b/.github/workflows/pytest.yml index fb2a86dc..c13c8e7c 100644 --- a/.github/workflows/pytest.yml +++ b/.github/workflows/pytest.yml @@ -2,6 +2,7 @@ name: Run Pytest on: push: + pull_request: jobs: build: From 42e59597039eb4580abdfb0a9beb636f404b0f7a Mon Sep 17 00:00:00 2001 From: Samuel Nichols Date: Fri, 22 Mar 2024 17:38:37 -0600 Subject: [PATCH 3/8] Run tests on PR only when opening PR (#53) --- .github/workflows/integration_tests.yml | 2 ++ .github/workflows/pylint.yml | 2 ++ .github/workflows/pytest.yml | 2 ++ 3 files changed, 6 insertions(+) diff --git a/.github/workflows/integration_tests.yml b/.github/workflows/integration_tests.yml index cb9135b0..a9f3442a 100644 --- a/.github/workflows/integration_tests.yml +++ b/.github/workflows/integration_tests.yml @@ -2,7 +2,9 @@ name: Run Integration Tests on: push: + pull_request: + types: [opened, reopened] jobs: build: diff --git a/.github/workflows/pylint.yml b/.github/workflows/pylint.yml index ee421c7d..564df075 100644 --- a/.github/workflows/pylint.yml +++ b/.github/workflows/pylint.yml @@ -2,7 +2,9 @@ name: Pylint on: push: + pull_request: + types: [opened, reopened] jobs: build: diff --git a/.github/workflows/pytest.yml b/.github/workflows/pytest.yml index c13c8e7c..5cec2da2 100644 --- a/.github/workflows/pytest.yml +++ b/.github/workflows/pytest.yml @@ -2,7 +2,9 @@ name: Run Pytest on: push: + pull_request: + types: [opened, reopened] jobs: build: From e87d92ebaf95a1147b657a4c356df137d9aae04e Mon Sep 17 00:00:00 2001 From: Cole Lyman Date: Mon, 25 Mar 2024 09:50:30 -0600 Subject: [PATCH 4/8] Update reports (#52) * Update report changes * Switch branch of integration test repo * Remove extraneous `crispresso_data_path` * Point integration tests back to master --- .github/workflows/integration_tests.yml | 2 +- CRISPResso2/CRISPRessoReports/README.md | 24 ++++++++++++++++--- .../templates/batchReport.html | 1 - .../templates/multiReport.html | 5 ++-- .../templates/pooledReport.html | 5 ++-- .../templates/wgsReport.html | 5 ++-- 6 files changed, 29 insertions(+), 13 deletions(-) diff --git a/.github/workflows/integration_tests.yml b/.github/workflows/integration_tests.yml index a9f3442a..16941ebd 100644 --- a/.github/workflows/integration_tests.yml +++ b/.github/workflows/integration_tests.yml @@ -40,7 +40,7 @@ jobs: run: | mkdir ../CRISPResso2_copy cp -r * ../CRISPResso2_copy - + - name: Copy C2_tests repo uses: actions/checkout@master with: diff --git a/CRISPResso2/CRISPRessoReports/README.md b/CRISPResso2/CRISPRessoReports/README.md index b31cff80..39487ca6 100644 --- a/CRISPResso2/CRISPRessoReports/README.md +++ b/CRISPResso2/CRISPRessoReports/README.md @@ -86,10 +86,16 @@ Also, note that the default commit message may have a summary of all commits, pl 1. In the parent repo, switch to (or create) the branch on `CRISPRessoReports` that will have the changes you push. -If you are creating a new branch based off of `CRISPRessoReports` master, run this: +If you are creating a new branch based off of `CRISPRessoReports` master, run this to switch to the reports master branch: ``` shell -git checkout -b -reports reports/master +git checkout reports/master +``` + +Then, run to actually create (and switch to) the branch that you will be working with: + +``` shell +git checkout -b -reports ``` Or if you would like to push to an existing branch on `CRISPRessoReports`, run this: @@ -106,6 +112,18 @@ git merge --squash -Xsubtree="CRISPResso2/CRISPRessoReports" --no-commit --allow *Note:* `` is the branch of the parent repo that contains the changes inside the `CRISPRessoReports` sub-directory. +3. Push to `CRISPRessoReports`. + +``` shell +git push +``` + +4. Switch back to your branch on `CRISPResso` or `C2Web`. + +``` shell +git checkout +``` + ### I am working on a feature that requires changing `CRISPRessoReports`, what do I do? If a feature that you are working on requires changes to CRISPRessoReports, you will need to perform a few steps to get setup. @@ -113,7 +131,7 @@ If a feature that you are working on requires changes to CRISPRessoReports, you 1. Create a feature branch in the parent repo, based on the parent repo master. ``` shell -git checkout -b origin/master +git checkout -b ``` 2. Create a feature branch on `CRISPRessoReports`. diff --git a/CRISPResso2/CRISPRessoReports/templates/batchReport.html b/CRISPResso2/CRISPRessoReports/templates/batchReport.html index 1255424c..49a6c9ea 100644 --- a/CRISPResso2/CRISPRessoReports/templates/batchReport.html +++ b/CRISPResso2/CRISPRessoReports/templates/batchReport.html @@ -45,7 +45,6 @@ {% endblock %} {% block content %} -
diff --git a/CRISPResso2/CRISPRessoReports/templates/multiReport.html b/CRISPResso2/CRISPRessoReports/templates/multiReport.html index 10d30063..2831d9fc 100644 --- a/CRISPResso2/CRISPRessoReports/templates/multiReport.html +++ b/CRISPResso2/CRISPRessoReports/templates/multiReport.html @@ -46,7 +46,7 @@ {% endblock %} {% block content %} - +
@@ -61,7 +61,7 @@
{{report_name}}
{% for run_name in run_names %} - {{run_name}} + {{run_name}} {% endfor %}
@@ -162,6 +162,7 @@
Summary Plots
{# column #}
+
{% endblock %} {% block foot %} diff --git a/CRISPResso2/CRISPRessoReports/templates/pooledReport.html b/CRISPResso2/CRISPRessoReports/templates/pooledReport.html index ba944d90..899f39c1 100644 --- a/CRISPResso2/CRISPRessoReports/templates/pooledReport.html +++ b/CRISPResso2/CRISPRessoReports/templates/pooledReport.html @@ -44,7 +44,6 @@ {% endblock %} {% block content %} -
@@ -60,8 +59,8 @@
{{report_name}}
{% for region_name in run_names %} - {{region_name}} - {% endfor %} + {{region_name}} + {% endfor %}
diff --git a/CRISPResso2/CRISPRessoReports/templates/wgsReport.html b/CRISPResso2/CRISPRessoReports/templates/wgsReport.html index 5f2008a4..47ab9002 100644 --- a/CRISPResso2/CRISPRessoReports/templates/wgsReport.html +++ b/CRISPResso2/CRISPRessoReports/templates/wgsReport.html @@ -45,7 +45,6 @@ {% endblock %} {% block content %} -
@@ -60,8 +59,8 @@
{{report_name}}
{% for region_name in run_names %} - {{region_name}} - {% endfor %} + {{region_name}} + {% endfor %}
From d171561b4a98f1f5b323b9f1b19c027ef40b059a Mon Sep 17 00:00:00 2001 From: Kendell Clement Date: Thu, 28 Mar 2024 14:33:57 -0600 Subject: [PATCH 5/8] Update integration_tests.yml --- .github/workflows/integration_tests.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/integration_tests.yml b/.github/workflows/integration_tests.yml index 16941ebd..bbab366f 100644 --- a/.github/workflows/integration_tests.yml +++ b/.github/workflows/integration_tests.yml @@ -42,7 +42,7 @@ jobs: cp -r * ../CRISPResso2_copy - name: Copy C2_tests repo - uses: actions/checkout@master + uses: actions/checkout@v3 with: repository: edilytics/CRISPResso2_tests token: ${{ secrets.ACCESS_CRISPRESSO2_TESTS }} From b2cfb911171dd8550f057e67239f51a8dfff91aa Mon Sep 17 00:00:00 2001 From: Cole Lyman Date: Thu, 28 Mar 2024 14:41:31 -0600 Subject: [PATCH 6/8] Fix the assignment of multiple quantification window coordinates (#38) (#403) * Mckay/pd warnings (#45) * refactor errors='ignore' to try except * refactored integer slice to iloc[] * moved to_numeric try except to function * Refactor to_numeric_ignore_errors to to_numeric_ignore_columns This change is slightly cleaner because it addresses the root issue that some columns are strings (and can therefore not be converted to numeric types). Now if an error does occur when converting the dfs to numeric types it won't be swallowed up. * Add documentation to to_numeric_ignore_columns --------- * Extract out quantification window coordinate function * Refactor get_quant_window_coordinates function into two The rationale behind this is that the behavior around the cloned amplicon is quite different than if the qwc are specified directly for the amplicon. * Handling qwc: add unit tests, refactor some more and add documentation * Extract out get_relative_coordinates function This function just computes the relative indexes without doing an alignment. * Add clarifying unit tests for `get_relative_coordinates` * Refactor cloned indexes to use ref_positions instead of s1inds * fixed function for getting cloned qwc idxs * added tests for cloned qwc function * Introduce pandas sorting in CRISPRessoCompare (#47) * Fix interleaved fastq input in CRISPRessoPooled and suppress CRISPRessoWGS params (#42) * Extract out split_interleaved_fastq function to CRISPRessoShared * Implement splitting interleaved fastq files in CRISPRessoPooled * Suppress split_interleaved_input from CRISPRessoWGS parameters * Suppress other parameters in CRISPRessoWGS * Move where interleaved fastq files are split to be trimmed properly * Bug Fix - 367 (#35) * - Fixed references to ref_names_for_pe * removed extra tabs * trying to match empty line, no tabs * - changed references to ref_names[0] * Mckay/pd warnings (#45) * refactor errors='ignore' to try except * refactored integer slice to iloc[] * moved to_numeric try except to function * Refactor to_numeric_ignore_errors to to_numeric_ignore_columns This change is slightly cleaner because it addresses the root issue that some columns are strings (and can therefore not be converted to numeric types). Now if an error does occur when converting the dfs to numeric types it won't be swallowed up. * Add documentation to to_numeric_ignore_columns --------- --------- * removed if check * implemented last test * changed NT to BadParameterException * changed tests, NT to BadParameter exceptions * Uncomment and correct tests for `get_relative_coordinates` * finished qwc tests * 0 is an acceptable qwc * new get_relative_coords function * added relative coordinate tests * removed unused functions * formatting * check for 0 qwc * remove test code * remove comment * Move read filtering to after merging in CRISPResso (#39) * Move read filtering to after merging This is in an effort to be consistent with the behavior and results of CRISPRessoPooled. * Properly assign the correct file names for read filtering * Add space around operators * GitHub actions on pr (#51) * Run integration tests on pull_request * Run pytest on pull_request * Run pylint on pull_request * Run tests on PR only when opening PR (#53) * Update reports (#52) * Update report changes * Switch branch of integration test repo * Remove extraneous `crispresso_data_path` * Point integration tests back to master --------- Co-authored-by: mbowcut2 <55161542+mbowcut2@users.noreply.github.com> Co-authored-by: McKay Co-authored-by: Samuel Nichols --- CRISPResso2/CRISPRessoCORE.py | 93 +++++++++++++++++--- CRISPResso2/CRISPRessoShared.py | 60 +++++++++---- tests/unit_tests/test_CRISPRessoCORE.py | 102 +++++++++++++++++++++- tests/unit_tests/test_CRISPRessoShared.py | 55 +++++++++++- 4 files changed, 277 insertions(+), 33 deletions(-) diff --git a/CRISPResso2/CRISPRessoCORE.py b/CRISPResso2/CRISPRessoCORE.py index 5ac57018..2531a0a7 100644 --- a/CRISPResso2/CRISPRessoCORE.py +++ b/CRISPResso2/CRISPRessoCORE.py @@ -136,6 +136,79 @@ def get_n_reads_bam(bam_filename,bam_chr_loc=""): ######################################### + +def split_quant_window_coordinates(quant_window_coordinates): + """Split the quantification window coordinates to be iterated over. + + Parameters + ---------- + quant_window_coordinates: str + The quantification window coordinates, in the form "5-10_100-101", where + the "_" delimits separate ranges and the "-" delimits the range itself. + + Returns + ------- + list of tuples + Where each element is a tuple and the first element of the tuple is the + start of the range and the second element is the end of the range. + """ + coord_re = re.compile(r'^(\d+)-(\d+)$') + try: + return [tuple(map(int, coord_re.match(c).groups())) for c in quant_window_coordinates.split('_')] + except: + raise CRISPRessoShared.BadParameterException("Cannot parse analysis window coordinate '" + str(quant_window_coordinates)) + + +def get_include_idxs_from_quant_window_coordinates(quant_window_coordinates): + """Get the include_idxs from the quantification window coordinates. + + Parameters + ---------- + quant_window_coordinates: str + The quantification window coordinates, in the form "5-10_100-101", where + the "_" delimits separate ranges and the "-" delimits the range itself. + + Returns + ------- + list of int + The include_idxs to be used for quantification, which is the quantification + window coordinates expanded to include all individual indexes contained therein. + """ + return [ + i + for coord in split_quant_window_coordinates(quant_window_coordinates) + for i in range(coord[0], coord[1] + 1) + ] + + +def get_cloned_include_idxs_from_quant_window_coordinates(quant_window_coordinates, idxs): + """Get the include_idxs from the quantification window coordinates, but adjusted according to s1ind. + + Parameters + ---------- + quant_window_coordinates: str + The quantification window coordinates, in the form "5-10_100-101", where + the "_" delimits separate ranges and the "-" delimits the range itself. + idxs: list of int + The index values mapped to the amplicon from which this is being cloned. + + Returns + ------- + list of int + The include_idxs to be used for quantification, which is the quantification + window coordinates expanded to include all individual indexes contained therein, + but adjusted according to s1inds. + """ + include_idxs = [] + for i in range(1, len(idxs)): + if abs(idxs[i-1]) == idxs[i]: + idxs[i] = -1 * abs(idxs[i]) + for coord in split_quant_window_coordinates(quant_window_coordinates): + include_idxs.extend(idxs[coord[0]:coord[1] + 1]) + + return list(filter(lambda x: x >= 0, include_idxs)) + + def get_pe_scaffold_search(prime_edited_ref_sequence, prime_editing_pegRNA_extension_seq, prime_editing_pegRNA_scaffold_seq, prime_editing_pegRNA_scaffold_min_match_length): """ For prime editing, determines the scaffold string to search for (the shortest substring of args.prime_editing_pegRNA_scaffold_seq not in the prime-edited reference sequence) @@ -1966,21 +2039,15 @@ def get_prime_editing_guides(this_amp_seq, this_amp_name, ref0_seq, prime_edited refs[ref_name]['contains_guide'] = refs[clone_ref_name]['contains_guide'] #quantification window coordinates override other options - if amplicon_quant_window_coordinates_arr[clone_ref_idx] != "": + if amplicon_quant_window_coordinates_arr[clone_ref_idx] != "" and amplicon_quant_window_coordinates_arr[this_ref_idx] != '0': if amplicon_quant_window_coordinates_arr[this_ref_idx] != "": - this_quant_window_coordinates = amplicon_quant_window_coordinates_arr[this_ref_idx] + this_include_idxs = get_include_idxs_from_quant_window_coordinates(amplicon_quant_window_coordinates_arr[this_ref_idx]) else: - this_quant_window_coordinates = amplicon_quant_window_coordinates_arr[clone_ref_idx] - this_include_idxs = [] - these_coords = this_quant_window_coordinates.split("_") - for coord in these_coords: - coordRE = re.match(r'^(\d+)-(\d+)$', coord) - if coordRE: - start = s1inds[int(coordRE.group(1))] - end = s1inds[int(coordRE.group(2)) + 1] - this_include_idxs.extend(range(start, end)) - else: - raise NTException("Cannot parse analysis window coordinate '" + str(coord)) + this_include_idxs = get_cloned_include_idxs_from_quant_window_coordinates( + amplicon_quant_window_coordinates_arr[clone_ref_idx], + s1inds.copy(), + ) + #subtract any indices in 'exclude_idxs' -- e.g. in case some of the cloned include_idxs were near the read ends (excluded) this_exclude_idxs = sorted(list(set(refs[ref_name]['exclude_idxs']))) this_include_idxs = sorted(list(set(np.setdiff1d(this_include_idxs, this_exclude_idxs)))) diff --git a/CRISPResso2/CRISPRessoShared.py b/CRISPResso2/CRISPRessoShared.py index 26a212a6..8e82c4e3 100644 --- a/CRISPResso2/CRISPRessoShared.py +++ b/CRISPResso2/CRISPRessoShared.py @@ -1677,7 +1677,47 @@ def set_guide_array(vals, guides, property_name): for idx, val in enumerate(vals_array): if val != '': ret_array[idx] = int(val) - return ret_array + return ret_array + + +def get_relative_coordinates(to_sequence, from_sequence): + """Given an alignment, get the relative coordinates of the second sequence to the first. + + For example, from_sequence[i] matches to to_sequence[inds[i]]. A `-1` + indicates a gap at the beginning of `to_sequence`. + + Parameters + ---------- + to_sequence : str + The alignment of the first sequence (where the coordinates are relative to) + from_sequence : str + The alignment of the second sequence + + Returns + ------- + s1inds_gap_left : list of int + The relative coordinates of the second sequence to the first, where gaps + in the first sequence are filled with the left value. + s1inds_gap_right : list of int + The relative coordinates of the second sequence to the first, where gaps + in the first sequence are filled with the right value. + """ + s1inds_gap_left = [] + s1inds_gap_right = [] + s1idx_left = -1 + s1idx_right = 0 + s2idx = -1 + for ix in range(len(to_sequence)): + if to_sequence[ix] != "-": + s1idx_left += 1 + if from_sequence[ix] != "-": + s2idx += 1 + s1inds_gap_left.append(s1idx_left) + s1inds_gap_right.append(s1idx_right) + if to_sequence[ix] != "-": + s1idx_right += 1 + + return s1inds_gap_left, s1inds_gap_right def get_alignment_coordinates(to_sequence, from_sequence, aln_matrix, needleman_wunsch_gap_open, @@ -1701,23 +1741,7 @@ def get_alignment_coordinates(to_sequence, from_sequence, aln_matrix, needleman_ gap_open=needleman_wunsch_gap_open, gap_extend=needleman_wunsch_gap_extend, gap_incentive=this_gap_incentive) - # print(fws1) - # print(fws2) - s1inds_l = [] - s1inds_r = [] - s1ix_l = -1 - s1ix_r = 0 - s2ix = -1 - for ix in range(len(fws1)): - if fws1[ix] != "-": - s1ix_l += 1 - if fws2[ix] != "-": - s2ix += 1 - s1inds_l.append(s1ix_l) - s1inds_r.append(s1ix_r) - if fws1[ix] != "-": - s1ix_r += 1 - return s1inds_l, s1inds_r + return get_relative_coordinates(fws1, fws2) def get_mismatches(seq_1, seq_2, aln_matrix, needleman_wunsch_gap_open, needleman_wunsch_gap_extend): diff --git a/tests/unit_tests/test_CRISPRessoCORE.py b/tests/unit_tests/test_CRISPRessoCORE.py index 268b6727..9a14549f 100644 --- a/tests/unit_tests/test_CRISPRessoCORE.py +++ b/tests/unit_tests/test_CRISPRessoCORE.py @@ -1,7 +1,7 @@ """Unit tests for CRISPResso2CORE.""" import pytest -from CRISPResso2 import CRISPRessoCORE +from CRISPResso2 import CRISPRessoCORE, CRISPRessoShared def test_get_consensus_alignment_from_pairs(): """Tests for generating consensus alignments from paired reads.""" @@ -93,6 +93,106 @@ def test_get_consensus_alignment_from_pairs(): assert ref_seq == "ATCGATCGAT" assert score == 50 #double check this score... should be 5/10 + +def test_split_quant_window_coordinates_single(): + assert [(5, 10)] == CRISPRessoCORE.split_quant_window_coordinates('5-10') + + +def test_split_quant_window_coordinates_multiple(): + assert CRISPRessoCORE.split_quant_window_coordinates('2-5_10-12') == [(2, 5), (10, 12)] + + +def test_split_quant_window_coordinates_error(): + with pytest.raises(CRISPRessoShared.BadParameterException): + CRISPRessoCORE.split_quant_window_coordinates('a-5') + + +def test_split_quant_window_coordinates_empty(): + with pytest.raises(CRISPRessoShared.BadParameterException): + CRISPRessoCORE.split_quant_window_coordinates('_') + + +def test_split_quant_window_coordinates_partially_empty(): + with pytest.raises(CRISPRessoShared.BadParameterException): + CRISPRessoCORE.split_quant_window_coordinates('1-3_') + + +def test_split_quant_window_coordinates_blank(): + with pytest.raises(CRISPRessoShared.BadParameterException): + CRISPRessoCORE.split_quant_window_coordinates('') + + +def test_get_include_idxs_from_quant_window_coordinates(): + quant_window_coordinates = '1-10_12-20' + assert CRISPRessoCORE.get_include_idxs_from_quant_window_coordinates(quant_window_coordinates) == [*list(range(1, 11)), *list(range(12, 21))] + + +def test_get_cloned_include_idxs_from_quant_window_coordinates(): + quant_window_coordinates = '1-10_12-20' + s1inds = list(range(22)) + assert CRISPRessoCORE.get_cloned_include_idxs_from_quant_window_coordinates(quant_window_coordinates, s1inds) == [*list(range(1, 11)), *list(range(12, 21))] + + +def test_get_cloned_include_idxs_from_quant_window_coordinates_insertion_beginning(): + quant_window_coordinates = '1-10_12-20' + # represents a 5bp insertion at the beginning (left) + s1inds = list(range(5, 27)) + assert CRISPRessoCORE.get_cloned_include_idxs_from_quant_window_coordinates(quant_window_coordinates, s1inds) == [*list(range(6, 16)), *list(range(17, 26))] + +def test_get_cloned_include_idxs_from_quant_window_coordinates_deletion_beginning(): + quant_window_coordinates = '1-10_12-20' + # represents a 5bp deletion at the beginning (left) + s1inds = [-1, -1, -1, -1, -1 ] + list(range(26)) + assert CRISPRessoCORE.get_cloned_include_idxs_from_quant_window_coordinates(quant_window_coordinates, s1inds) == [*list(range(0, 6)), *list(range(7, 16))] + +def test_get_cloned_include_idxs_from_quant_window_coordinates_deletion(): + quant_window_coordinates = '10-20_35-40' + # represents a 7bp deletion in the middle + s1inds = list(range(23)) + [22, 22, 22, 22, 22, 22, 22] + list(range(23, 34)) + assert CRISPRessoCORE.get_cloned_include_idxs_from_quant_window_coordinates(quant_window_coordinates, s1inds) == [*list(range(10, 21)), *list(range(35-7, 41-7))] + +def test_get_cloned_include_idxs_from_quant_window_coordinates_deletion_modified(): + quant_window_coordinates = '10-25_35-40' + # represents a 7bp deletion in the middle, where part of the QW is deleted + # [0, 1, 3, 4, ... , 21, 22, 22, 22, 22, 22, 22, 22, 22, 23, 24, ... , 33] + s1inds = list(range(23)) + [22, 22, 22, 22, 22, 22, 22] + list(range(23, 34)) + assert CRISPRessoCORE.get_cloned_include_idxs_from_quant_window_coordinates(quant_window_coordinates, s1inds) == [*list(range(10, 23)), *list(range(35-7, 41-7))] + + +def test_get_cloned_include_idxs_from_quant_window_coordinates_deletion_end_modified(): + # 5 bp deletion at end of 20 bp sequence + quant_window_coordinates = '1-5_10-20' + s1inds = [*list(range(16)), *[15, 15, 15, 15, 15]] + assert CRISPRessoCORE.get_cloned_include_idxs_from_quant_window_coordinates(quant_window_coordinates, s1inds) == [*list(range(1, 6)), *list(range(10, 16))] + +def test_get_cloned_include_idxs_from_quant_window_coordinates_insertion_and_deletion(): + # 5 bp deletion and 5 bp insertion + quant_window_coordinates = '1-5_10-20' + s1inds = [0, 1, 2, 3, 4, 5, 5, 5, 5, 5, 5, 6, 7, 8, 9, 15, 16, 17, 18, 19, 20] + assert CRISPRessoCORE.get_cloned_include_idxs_from_quant_window_coordinates(quant_window_coordinates, s1inds) == [*list(range(1, 6)), *[6, 7, 8, 9, 15, 16, 17, 18, 19, 20]] + +def test_get_cloned_include_idxs_from_quant_window_coordinates_insertion_and_deletion_modified(): + quant_window_coordinates = '1-5_10-20' + s1inds = [0, 1, 2, 2, 4, 5, 6, 7, 7, 7, 7, 7, 7, 8, 9, 10, 15, 16, 17, 18, 19] + assert CRISPRessoCORE.get_cloned_include_idxs_from_quant_window_coordinates(quant_window_coordinates, s1inds) == [*[1,2,4,5], *[8, 9, 10, 15, 16, 17, 18, 19]] + +def test_get_cloned_include_idxs_from_quant_window_coordinates_insertion_across_qw(): + # 6 bp insertion in middle of 4 bp sequence + quant_window_coordinates = '1-4' + s1inds = [0,1,2,9,10] + assert CRISPRessoCORE.get_cloned_include_idxs_from_quant_window_coordinates(quant_window_coordinates, s1inds) == [1,2,9,10] + +def test_get_cloned_include_idxs_from_quant_window_coordinates_deletion_entire_qw(): + # 5 bp deletion of entire qw + quant_window_coordinates = '1-4_7-10' + s1inds = [0, 1, 2, 3, 4, 5, 6, 6, 6, 6, 6] + assert CRISPRessoCORE.get_cloned_include_idxs_from_quant_window_coordinates(quant_window_coordinates, s1inds) == [1, 2, 3, 4] + +def test_get_cloned_include_idxs_from_quant_window_coordinates_include_zero(): + quant_window_coordinates = '0-5' + s1inds = [0, 1, 2, 3, 4, 5] + assert CRISPRessoCORE.get_cloned_include_idxs_from_quant_window_coordinates(quant_window_coordinates, s1inds) == [0, 1, 2, 3, 4, 5] + if __name__ == "__main__": # execute only if run as a script test_get_consensus_alignment_from_pairs() diff --git a/tests/unit_tests/test_CRISPRessoShared.py b/tests/unit_tests/test_CRISPRessoShared.py index da1bc479..f6ce67a0 100644 --- a/tests/unit_tests/test_CRISPRessoShared.py +++ b/tests/unit_tests/test_CRISPRessoShared.py @@ -20,4 +20,57 @@ def test_get_mismatches(): -5, -3, ) - assert len(mismatch_cords) == 6 + assert len(mismatch_cords) == 6 + +def test_get_relative_coordinates(): + s1inds_gap_left, s1inds_gap_right = CRISPRessoShared.get_relative_coordinates('ATCGT', 'TTCGT') + assert s1inds_gap_left == [0, 1, 2, 3, 4] + assert s1inds_gap_right == [0, 1, 2, 3, 4] + + +def test_get_relative_coordinates_to_gap(): + # unaligned sequences + seq_1 = 'TTCGT' + seq_2 = 'TTCT' + + # aligned_sequences + to_sequence = 'TTC-T' + from_sequence = 'TTCGT' + + s1inds_gap_left, s1inds_gap_right = CRISPRessoShared.get_relative_coordinates(to_sequence, from_sequence) + assert s1inds_gap_left == [0, 1, 2, 2, 3] + assert s1inds_gap_right == [0, 1, 2, 3, 3] + + + assert seq_1[0] == seq_2[s1inds_gap_left[0]] + assert seq_1[1] == seq_2[s1inds_gap_left[1]] + assert seq_1[2] == seq_2[s1inds_gap_left[2]] + assert seq_1[4] == seq_2[s1inds_gap_left[4]] + + +def test_get_relative_coordinates_start_gap(): + s1inds_gap_left, s1inds_gap_right = CRISPRessoShared.get_relative_coordinates('--CGT', 'TTCGT') + assert s1inds_gap_left == [-1, -1, 0, 1, 2] + assert s1inds_gap_right == [0, 0, 0, 1, 2] + + +def test_get_relative_coordinates_from_gap(): + s1inds_gap_left, s1inds_gap_right = CRISPRessoShared.get_relative_coordinates('ATCGT', 'ATC-T') + assert s1inds_gap_left == [0, 1, 2, 4] + assert s1inds_gap_right == [0, 1, 2, 4] + +def test_get_relative_coordinates_end_gap(): + s1inds_gap_left, s1inds_gap_right = CRISPRessoShared.get_relative_coordinates('ATC--', 'ATCGT') + assert s1inds_gap_left == [0, 1, 2, 2, 2] + assert s1inds_gap_right == [0, 1, 2, 3, 3] + +def test_get_relative_coordinates_multiple_gaps(): + s1inds_gap_left, s1inds_gap_right = CRISPRessoShared.get_relative_coordinates('AT--TC--G--CC', 'ATCGTCGCGTTCC') + assert s1inds_gap_left == [0, 1, 1, 1, 2, 3, 3, 3, 4, 4, 4, 5, 6] + assert s1inds_gap_right == [0, 1, 2, 2, 2, 3, 4, 4, 4, 5, 5, 5, 6] + +def test_get_relative_coordinates_ind_and_dels(): + s1inds_gap_left, s1inds_gap_right = CRISPRessoShared.get_relative_coordinates('ATG--C', 'A-GCTC') + assert s1inds_gap_left == [0, 2, 2, 2, 3] + assert s1inds_gap_right == [0, 2, 3, 3, 3] + From 074155bed8796d6b80ac7af6aeb976d2635699da Mon Sep 17 00:00:00 2001 From: Cole Lyman Date: Thu, 28 Mar 2024 15:00:55 -0600 Subject: [PATCH 7/8] Fix 'Prime-edited' key not found (#32) * Move 'Prime-edited' amplicon name check By moving this, it will check if there is an amplicon named 'Prime-edited' (which is a reserved name) even if the `prime_editing_pegRNA_extension_seq` parameter is empty. * Only search for scaffold integration when pegRNA extension seq is provided * Remove spaces at the end of lines --- CRISPResso2/CRISPRessoCORE.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/CRISPResso2/CRISPRessoCORE.py b/CRISPResso2/CRISPRessoCORE.py index 2531a0a7..99aa4838 100644 --- a/CRISPResso2/CRISPRessoCORE.py +++ b/CRISPResso2/CRISPRessoCORE.py @@ -480,7 +480,7 @@ def process_fastq(fastq_filename, variantCache, ref_names, refs, args): aln_matrix = CRISPResso2Align.read_matrix(aln_matrix_loc) pe_scaffold_dna_info = (0, None) #scaffold start loc, scaffold seq to search - if args.prime_editing_pegRNA_scaffold_seq != "": + if args.prime_editing_pegRNA_scaffold_seq != "" and args.prime_editing_pegRNA_extension_seq != "": pe_scaffold_dna_info = get_pe_scaffold_search(refs['Prime-edited']['sequence'], args.prime_editing_pegRNA_extension_seq, args.prime_editing_pegRNA_scaffold_seq, args.prime_editing_pegRNA_scaffold_min_match_length) not_aln = {} #cache for reads that don't align @@ -555,7 +555,7 @@ def process_bam(bam_filename, bam_chr_loc, output_bam, variantCache, ref_names, aln_matrix = CRISPResso2Align.read_matrix(aln_matrix_loc) pe_scaffold_dna_info = (0, None) #scaffold start loc, scaffold sequence - if args.prime_editing_pegRNA_scaffold_seq != "": + if args.prime_editing_pegRNA_scaffold_seq != "" and args.prime_editing_pegRNA_extension_seq != "": pe_scaffold_dna_info = get_pe_scaffold_search(refs['Prime-edited']['sequence'], args.prime_editing_pegRNA_extension_seq, args.prime_editing_pegRNA_scaffold_seq, args.prime_editing_pegRNA_scaffold_min_match_length) not_aln = {} #cache for reads that don't align @@ -694,7 +694,7 @@ def process_fastq_write_out(fastq_input, fastq_output, variantCache, ref_names, aln_matrix = CRISPResso2Align.read_matrix(aln_matrix_loc) pe_scaffold_dna_info = (0, None) #scaffold start loc, scaffold sequence - if args.prime_editing_pegRNA_scaffold_seq != "": + if args.prime_editing_pegRNA_scaffold_seq != "" and args.prime_editing_pegRNA_extension_seq != "": pe_scaffold_dna_info = get_pe_scaffold_search(refs['Prime-edited']['sequence'], args.prime_editing_pegRNA_extension_seq, args.prime_editing_pegRNA_scaffold_seq, args.prime_editing_pegRNA_scaffold_min_match_length) not_aln = {} #cache for reads that don't align not_aln[''] = "" #add empty sequence to the not_aln in case the fastq has an extra newline at the end @@ -823,7 +823,7 @@ def process_single_fastq_write_bam_out(fastq_input, bam_output, bam_header, vari aln_matrix = CRISPResso2Align.read_matrix(aln_matrix_loc) pe_scaffold_dna_info = (0, None) # scaffold start loc, scaffold sequence - if args.prime_editing_pegRNA_scaffold_seq != "": + if args.prime_editing_pegRNA_scaffold_seq != "" and args.prime_editing_pegRNA_extension_seq != "": pe_scaffold_dna_info = get_pe_scaffold_search(refs['Prime-edited']['sequence'], args.prime_editing_pegRNA_extension_seq, args.prime_editing_pegRNA_scaffold_seq, args.prime_editing_pegRNA_scaffold_min_match_length) not_aln = {} # cache for reads that don't align not_aln[''] = "" # add empty sequence to the not_aln in case the fastq has an extra newline at the end @@ -1428,6 +1428,8 @@ def rreplace(s, old, new): #Prime editing + if 'Prime-edited' in amplicon_name_arr: + raise CRISPRessoShared.BadParameterException("An amplicon named 'Prime-edited' must not be provided.") prime_editing_extension_seq_dna = "" #global var for the editing extension sequence for the scaffold quantification below prime_editing_edited_amp_seq = "" if args.prime_editing_pegRNA_extension_seq != "": @@ -1489,8 +1491,6 @@ def rreplace(s, old, new): if new_ref in amplicon_seq_arr: raise CRISPRessoShared.BadParameterException('The calculated prime-edited amplicon is the same as the reference sequence.') amplicon_seq_arr.append(new_ref) - if 'Prime-edited' in amplicon_name_arr: - raise CRISPRessoShared.BadParameterException("An amplicon named 'Prime-edited' must not be provided.") amplicon_name_arr.append('Prime-edited') amplicon_quant_window_coordinates_arr.append('') prime_editing_edited_amp_seq = new_ref @@ -2380,7 +2380,7 @@ def get_prime_editing_guides(this_amp_seq, this_amp_name, ref0_seq, prime_edited info('Done!', {'percent_complete': 20}) - if args.prime_editing_pegRNA_scaffold_seq != "": + if args.prime_editing_pegRNA_scaffold_seq != "" and args.prime_editing_pegRNA_extension_seq != "": #introduce a new ref (that we didn't align to) called 'Scaffold Incorporated' -- copy it from the ref called 'prime-edited' new_ref = deepcopy(refs['Prime-edited']) new_ref['name'] = "Scaffold-incorporated" From 47b3908b891f153d843c5baf386ecc94e7f749d0 Mon Sep 17 00:00:00 2001 From: Samuel Nichols Date: Thu, 28 Mar 2024 15:01:22 -0600 Subject: [PATCH 8/8] Docker size (#49) * Bug Fix - 367 (#35) * - Fixed references to ref_names_for_pe * removed extra tabs * trying to match empty line, no tabs * - changed references to ref_names[0] * Mckay/pd warnings (#45) * refactor errors='ignore' to try except * refactored integer slice to iloc[] * moved to_numeric try except to function * Refactor to_numeric_ignore_errors to to_numeric_ignore_columns This change is slightly cleaner because it addresses the root issue that some columns are strings (and can therefore not be converted to numeric types). Now if an error does occur when converting the dfs to numeric types it won't be swallowed up. * Add documentation to to_numeric_ignore_columns --------- Co-authored-by: Cole Lyman --------- Co-authored-by: Cole Lyman * GitHub actions integration tests (#48) * GitHub actions clean (#40) * Create pytest.yml * Create pylint.yml * Create .pylintrc * Create test_env.yml * Full path * Remove conda install * Replace path * Pytest tests * pip -e * Create integration_tests.yml * Simplify name * CRISPRESSO2_DIR environment variable * Up one dir * ls workspace * Install CRISPResso and ydiff * Clone repo instead of checkout * submodule * ls * CRISPResso2_copy * ls * Update env * Simplify * Pull from githubactions branch * Pull githubactions repo * Checkout githubactions * Mckay/pd warnings (#45) * refactor errors='ignore' to try except * refactored integer slice to iloc[] * moved to_numeric try except to function * Refactor to_numeric_ignore_errors to to_numeric_ignore_columns This change is slightly cleaner because it addresses the root issue that some columns are strings (and can therefore not be converted to numeric types). Now if an error does occur when converting the dfs to numeric types it won't be swallowed up. * Add documentation to to_numeric_ignore_columns --------- Co-authored-by: Cole Lyman * Run tests individually * Pin plotly version * Run all tests even if one fails * Test on another branch * Switch branch with token * Update integration_tests.yml * Introduce pandas sorting in CRISPRessoCompare (#47) * New makefile commands * Fix interleaved fastq input in CRISPRessoPooled and suppress CRISPRessoWGS params (#42) * Extract out split_interleaved_fastq function to CRISPRessoShared * Implement splitting interleaved fastq files in CRISPRessoPooled * Suppress split_interleaved_input from CRISPRessoWGS parameters * Suppress other parameters in CRISPRessoWGS * Move where interleaved fastq files are split to be trimmed properly * Bug Fix - 367 (#35) * - Fixed references to ref_names_for_pe * removed extra tabs * trying to match empty line, no tabs * - changed references to ref_names[0] * Mckay/pd warnings (#45) * refactor errors='ignore' to try except * refactored integer slice to iloc[] * moved to_numeric try except to function * Refactor to_numeric_ignore_errors to to_numeric_ignore_columns This change is slightly cleaner because it addresses the root issue that some columns are strings (and can therefore not be converted to numeric types). Now if an error does occur when converting the dfs to numeric types it won't be swallowed up. * Add documentation to to_numeric_ignore_columns --------- Co-authored-by: Cole Lyman --------- Co-authored-by: Cole Lyman * On push no branches * On push no branches * All in one file * Fix yml errors * Rename jobs * Remove old workflow files * Remove paths * Run jobs in parallel --------- Co-authored-by: mbowcut2 <55161542+mbowcut2@users.noreply.github.com> Co-authored-by: Cole Lyman * 3.4->2.08 * Put ttf-mscorefonts-installer back above apt-get clean * restore slash, replace fastp with trimmomatic and flash, add autoremove step --------- Co-authored-by: mbowcut2 <55161542+mbowcut2@users.noreply.github.com> Co-authored-by: Cole Lyman --- Dockerfile | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/Dockerfile b/Dockerfile index 3af1d19b..699d26d1 100644 --- a/Dockerfile +++ b/Dockerfile @@ -10,10 +10,11 @@ MAINTAINER Kendell Clement RUN apt-get update && apt-get install gcc g++ bowtie2 samtools libsys-hostname-long-perl \ -y --no-install-recommends \ && apt-get clean \ + && apt-get autoremove -y \ && rm -rf /var/lib/apt/lists/* \ && rm -rf /usr/share/man/* \ && rm -rf /usr/share/doc/* \ - && conda install -c defaults -c conda-forge -c bioconda -y -n base --debug -c bioconda trimmomatic flash numpy cython jinja2 tbb=2020.2 pyparsing=2.3.1 scipy matplotlib pandas plotly\ + && conda install -c defaults -c conda-forge -c bioconda -y -n base --debug trimmomatic flash numpy cython jinja2 tbb=2020.2 pyparsing=2.3.1 scipy matplotlib-base pandas plotly\ && conda clean --all --yes #install ms fonts @@ -40,4 +41,4 @@ RUN python setup.py install \ && CRISPRessoCompare -h -ENTRYPOINT ["python","/CRISPResso2/CRISPResso2_router.py"] +ENTRYPOINT ["python","/CRISPResso2/CRISPResso2_router.py"] \ No newline at end of file