diff --git a/Changes b/Changes index 63b1ca23..000e6a0e 100644 --- a/Changes +++ b/Changes @@ -1,7 +1,16 @@ LIST OF CHANGES FOR NPG-QC PACKAGE + +release 69.8.0 + - mqc skipper - a more generic way to disregard spiked-in controls when + counting the number of studies the samples in the run belong to. The change + is driven by the fact that spiked-in controls can now belong to a number of + different studies. + - replace fastx_toolkit use with seqtk + - npg_substitution_metrics.pl script clean-up + release 69.7.0 - - added new npg_subtitution_metrics.pl script + - added new npg_substitution_metrics.pl script - move from Travis CI to GitHub Actions - switch to node 14 for CI, using npm from default conda pkg - remove dependency on npg_common::sequence::reference::base_count, which diff --git a/bin/npg_mqc_skipper b/bin/npg_mqc_skipper index d095c28f..e12e08e5 100755 --- a/bin/npg_mqc_skipper +++ b/bin/npg_mqc_skipper @@ -19,7 +19,7 @@ use npg_qc::mqc::skipper; our $VERSION = '0'; -Readonly::Scalar my $EXCLUDE_STUDY_NAME => 'Illumina Controls'; +Readonly::Scalar my $EXCLUDE_ENTITY_TYPE => 'library_indexed_spike'; Readonly::Scalar my $NUM_DISTINCT_STUDIES => 1; Readonly::Scalar my $RUN_STATUS_FROM => 'qc review pending'; Readonly::Scalar my $RUN_STATUS_TO => 'archival pending'; @@ -39,7 +39,7 @@ GetOptions('dry_run|dry-run!' => \$dry_run, 'study_name=s' => \$study_name, 'deplexing_percent_threshold=i' => \$deplexing_threshold, 'qc_fails_percent_threshold=i' => \$qc_fails_threshold, - 'artic_qc_fails_percent_threshold=i' => \$artic_qc_fails_threshold, + 'artic_qc_fails_percent_threshold=i' => \$artic_qc_fails_threshold, 'help' => sub { pod2usage(-verbose => 2, -exitval => 0) @@ -71,7 +71,7 @@ my @rows = $tracking_schema->resultset('Run')->search( 'run_status_dict.description' => $RUN_STATUS_FROM, 'instrument_format.model' => $INSTRUMENT_MODEL}, {prefetch => [{'run_statuses' => 'run_status_dict'}, 'instrument_format']} - )->all(); + )->all(); @rows = grep { my $row = $_; none { $row->is_tag_set($_) } @NO_SKIPPING_TAGS } @rows; @@ -107,13 +107,13 @@ my $query = q[from iseq_product_metrics p ] . q[join iseq_flowcell f on p.id_iseq_flowcell_tmp=f.id_iseq_flowcell_tmp ] . q[join study s on s.id_study_tmp=f.id_study_tmp ] . - qq[where s.name != ? and p.id_run in (${placeholders}) ] . + qq[where f.entity_type != ? and p.id_run in (${placeholders}) ] . q[group by p.id_run having study_count = ?]; my $sth = $dbh->prepare($query) or ($logger->fatal("Failed to prepare statement: $DBI::errstr") and exit 1); # Run time database errors are thrown by the execute method, no need to # do anything special. -$sth->execute($EXCLUDE_STUDY_NAME, @run_ids, $NUM_DISTINCT_STUDIES); +$sth->execute($EXCLUDE_ENTITY_TYPE, @run_ids, $NUM_DISTINCT_STUDIES); @run_ids = $get_run_ids->($sth); if (@run_ids) { diff --git a/bin/npg_substitution_metrics.pl b/bin/npg_substitution_metrics.pl index 5dd15299..a90ff399 100755 --- a/bin/npg_substitution_metrics.pl +++ b/bin/npg_substitution_metrics.pl @@ -190,83 +190,6 @@ () return \%win; } -#%======================================================to process winRi structures -#function [new_win]=stripped_convert_new(win_R1) -#%coverts win_R1 (as in error quality) into converient format, new_win, -#%where each of 12 columns are counts of windows X.Y for each of 12 substitutions -# -#%INPUT input win_R1 (16x12) from error file see below: -# -#%# Effect of previous base and next base high predictor. Use `grep ^PNCRH | cut -f 2-` to extract this part -#%# 16 rows: one row per pair= previous/next base, columns (12 for each subs) is previous base+substitution+next base and count for 12 substitutions -# -#%PCRNH 1 caA bw AX AACA 6 AACC 10 AACG 6 AACT 2 AAGA 159 AAGC 125 AAGG 133 AAGT 102 AATA 45 AATC 51 AATG 52 AATT 65 -#%PCRNH 1 caC ACAA 11 ACAC 23 ACAG 10 ACAT 12 ACGA 2 ACGC 12 ACGG 8 ACGT 4 ACTA 129 ACTC 156 ACTG 276 ACTT 74 -#%PCRNH 1 caG AGAA 51 AGAC 179 AGAG 77 AGAT 53 AGCA 45 AGCC 32 AGCG 20 AGCT 13 AGTA 646 AGTC 701 AGTG 398 AGTT 209 -#%PCRNH 1 caT ATAA 49 ATAC 64 ATAG 49 ATAT 53 ATCA 116 ATCC 92 ATCG 137 ATCT 119 ATGA 2 ATGC 7 ATGG 8 ATGT 5 -# -#%PCRNH 1 caA bw CX CACA 6 CACC 4 CACG 14 CACT 12 CAGA 146 CAGC 65 CAGG 155 CAGT 124 CATA 60 CATC 39 CATG 60 CATT 72 -#%PCRNH 1 CCAA 56 CCAC 25 CCAG 22 CCAT 30 CCGA 13 CCGC 3 CCGG 7 CCGT 7 CCTA 175 CCTC 34 CCTG 138 CCTT 66 -#%PCRNH 1 CGAA 129 CGAC 363 CGAG 97 CGAT 244 CGCA 33 CGCC 37 CGCG 36 CGCT 22 CGTA 463 CGTC 509 CGTG 306 CGTT 251 -#%PCRNH 1 CTAA 23 CTAC 46 CTAG 37 CTAT 45 CTCA 178 CTCC 118 CTCG 204 CTCT 202 CTGA 3 CTGC 2 CTGG 9 CTGT 10 -# -#%PCRNH 1 caA bw GX GACA 3 GACC 7 GACG 6 GACT 4 GAGA 99 GAGC 78 GAGG 94 GAGT 83 GATA 60 GATC 58 GATG 46 GATT 72 -#%PCRNH 1 GCAA 97 GCAC 20 GCAG 20 GCAT 29 GCGA 11 GCGC 11 GCGG 10 GCGT 14 GCTA 201 GCTC 192 GCTG 307 GCTT 162 -#%PCRNH 1 GGAA 53 GGAC 214 GGAG 36 GGAT 94 GGCA 33 GGCC 21 GGCG 7 GGCT 13 GGTA 348 GGTC 388 GGTG 145 GGTT 184 -#%PCRNH 1 GTAA 26 GTAC 47 GTAG 33 GTAT 51 GTCA 3520 GTCC 96 GTCG 71 GTCT 104 GTGA 4 GTGC 5 GTGG 11 GTGT 22 -# -#%PCRNH 1 caA bw TX TACA 4 TACC 3 TACG 2 TACT 3 TAGA 96 TAGC 4020 TAGG 209 TAGT 90 TATA 63 TATC 46 TATG 16 TATT 49 -#%PCRNH 1 TCAA 28 TCAC 11 TCAG 19 TCAT 23 TCGA 15 TCGC 5 TCGG 7 TCGT 5 TCTA 186 TCTC 65 TCTG 141 TCTT 84 -#%PCRNH 1 TGAA 134 TGAC 272 TGAG 112 TGAT 86 TGCA 74 TGCC 47 TGCG 46 TGCT 14 TGTA 774 TGTC 822 TGTG 512 TGTT 211 -#%PCRNH 1 TTAA 45 TTAC 63 TTAG 31 TTAT 43 -# -#new_win=[]; -#N=4; -# -#a=1; -#for f=1:3, -# for k=1:4, -# j=a+(k-1)*N; -# for t1=1:4, -# new_win(j+t1-a,f)=win_R1(j,t1+(f-1)*N); -# end -# end -#end -# -# -#c=2; -#for f=1:3, -# for k=1:4, -# j=c+(k-1)*N; -# for t1=1:4, -# new_win(j+t1-c,f+2*c-1)=win_R1(j,t1+(f-1)*N); -# end -# end -#end -# -#g=3; -#for f=1:3, -# for k=1:4, -# j=g+(k-1)*N; -# for t=1:4, -# new_win(j+t-g,f+2*g)=win_R1(j,t+(f-1)*N); -# end -# end -#end -# -# -#t=4; -#for f=1:3, -# for k=1:4, -# j=t+(k-1)*N;% caG 3, -# for t1=1:4, -# new_win(j+t1-t,f+2*t+1)=win_R1(j,t1+(f-1)*N); -# end -# end -#end -#endfunction -# - # generate metrics sub metrics_gen_assym_prediction () { my ($err_data, $verbose) = @_; @@ -318,16 +241,6 @@ () my $as_ag_tc_R2 = ($mis2 > 0 ? (abs($coH2{AG} - $coH2{TC}) / $mis2) : 0); my $symm_ag_tc = max(($as_ag_tc_R1, $as_ag_tc_R2)); - #1.1.1.1--------remove strong assymetry if needed (re-define %separated pair - if ($as_ag_tc_R1 > $thr_ass) { - $coH1{AG} = $mis1; - $coH1{TC} = $mis1; - } - if ($as_ag_tc_R2 > $thr_ass) { - $coH2{AG} = $mis2; - $coH2{TC} = $mis2; - } - #1.1.2 ---- strand assymetry (sh be symmetry) mic=min 'close' Ti: CT GA my $mic1 = min(($coH1{CT}, $coH1{GA})); my $mic2 = min(($coH2{CT}, $coH2{GA})); @@ -335,16 +248,6 @@ () my $as_ct_ga_R2 = ($mic2 > 0 ? (abs($coH2{CT} - $coH2{GA}) / $mic2) : 0); my $symm_ct_ga = max(($as_ct_ga_R1, $as_ct_ga_R2)); - #1.1.2.1-remove close Ti starnd assymetry if needed - if ($as_ct_ga_R1 > $thr_ass) { - $coH1{CT} = $mic1; - $coH1{GA} = $mic1; - } - if ($as_ct_ga_R2 > $thr_ass) { - $coH2{CT} = $mic2; - $coH2{GA} = $mic2; - } - my @ti1 = ($coH1{AG}, $coH1{TC}, $coH1{CT}, $coH1{GA}); my $mean_ti1 = round(mean(@ti1)); my $sd_ti1 = round(stddev(@ti1)); @@ -394,18 +297,15 @@ () my $TiTv_meGT = max(($si_sv1, $si_sv2)); #3========metrics on oxiG and GT/CA artefact - #artefact C2A=art_ox=proportion of gt/ca to mean Ti, depends - #on Ti variability - my $thr_ti1 = $mean_ti1 - ($cvTi1 > $thr_cv ? 1 : 3) * $sd_ti1; - my $thr_ti2 = $mean_ti2 - ($cvTi2 > $thr_cv ? 1 : 3) * $sd_ti2; + #artefact C2A=art_ox=proportion of gt/ca to mean Ti, depends on Ti variability #===============art_ox=proportion of gt or ca to mean Ti my $ma1 = max(($coH1{CA}, $coH1{GT})); my $ma2 = max(($coH2{CA}, $coH2{GT})); # prop max(gt,ca) in Ti: the higher the more likely artC2A - my $artR1 = ($thr_ti1 > 0 ? ($ma1 / $thr_ti1) : 0); - my $artR2 = ($thr_ti2 > 0 ? ($ma2 / $thr_ti2) : 0); + my $artR1 = ($mean_ti1 > 0 ? ($ma1 / $mean_ti1) : 0); + my $artR2 = ($mean_ti2 > 0 ? ($ma2 / $mean_ti2) : 0); my $art_ox = max(($artR1, $artR2)); #3.2 ----------------------oxidation biasas BROAD @@ -476,12 +376,8 @@ () } #5.2--------compute probability/likelihood of C2A based on art_ox - my $likely = 0.25 * ($artR1 + $artR2); - my $new_likely = $likely; - if ($likely > 1) { - $new_likely = 1; - } - my $likelihood = $new_likely; + my $likely = $gt_nearTi; + my $likelihood = $likely; # update results $results{likelihood} = $likelihood; diff --git a/lib/npg_qc/autoqc/checks/insert_size.pm b/lib/npg_qc/autoqc/checks/insert_size.pm index 3ab20d34..8ccbc16d 100644 --- a/lib/npg_qc/autoqc/checks/insert_size.pm +++ b/lib/npg_qc/autoqc/checks/insert_size.pm @@ -20,10 +20,8 @@ use npg_common::Alignment; use npg_common::extractor::fastq qw(read_count); extends qw(npg_qc::autoqc::checks::check); -with qw( - npg_tracking::data::reference::find - npg_common::roles::software_location - ); +with qw(npg_tracking::data::reference::find), + qw(npg_common::roles::software_location) => { tools => [qw/bwa0_6 seqtk/] }; has '+aligner' => (default => 'bwa0_6', is => 'ro'); our $VERSION = '0'; @@ -433,10 +431,10 @@ sub _fastq_reverse_complement { my @reversed = (); foreach my $i (0 .. 1) { push @reversed, catfile($self->tmp_path, ($i+1) . q[r.fastq]); - my @args = (q[fastx_reverse_complement], q[-Q 33], q[-i], $sample_reads->[$i], q[-o], $reversed[$i]); - print 'Producing reverse complemented file: ' . (join q[ ], @args) . qq[\n] || carp 'Producing reverse complemented file: ' . (join q[ ], @args) . qq[\n]; - if (system(@args)) { - croak printf "Child %s exited with value %d\n", join(q[ ], @args), $CHILD_ERROR >> $CHILD_ERROR_SHIFT; + my $cmd = $self->seqtk_cmd . q[ seq -r -Q 33 ] . $sample_reads->[$i] . q[ > ] . $reversed[$i]; + print 'Producing reverse complemented file: ' . $cmd . qq[\n] || carp 'Producing reverse complemented file: ' . $cmd . qq[\n]; + if (system($cmd)) { + croak printf "Child %s exited with value %d\n", $cmd, $CHILD_ERROR >> $CHILD_ERROR_SHIFT; } } return \@reversed; @@ -455,25 +453,13 @@ sub _set_additional_modules_info { use strict; ## use critic if ($self->use_reverse_complemented) { - push @packages_info, q[FASTX Toolkit fastx_reverse_complement ] . $self->_fastx_version; + push @packages_info, q[seqtk ] . $self->current_version( $self->seqtk_cmd() ) } push @packages_info, join q[ ], $self->norm_fit_cmd, $VERSION; $self->result->set_info('Additional_Modules', ( join q[;], @packages_info ) ); return; } -sub _fastx_version { - my $self = shift; - my @lines = slurp 'fastx_reverse_complement -h |'; - foreach my $line (@lines) { - if ($line =~ /FASTX\ Toolkit/xms) { - my ($version) = $line =~ /\ (\d+\.\d+\.\d+)\ /xms; - if ($version) { return $version; } - } - } - return q[]; -} - sub _align { my ($self, $sample_reads, $prefix) = @_; diff --git a/t/60-autoqc-checks-insert_size.t b/t/60-autoqc-checks-insert_size.t index adb47de1..1c569134 100644 --- a/t/60-autoqc-checks-insert_size.t +++ b/t/60-autoqc-checks-insert_size.t @@ -30,7 +30,7 @@ use_ok('npg_qc::autoqc::results::insert_size'); use_ok('npg_qc::autoqc::checks::insert_size'); sub _additional_modules { - my $use_fastx = shift; + my $use_seqtk = shift; my @expected = (); require npg_qc::autoqc::parse::alignment; push @expected, join(q[ ], @@ -41,8 +41,8 @@ sub _additional_modules { require npg_common::Alignment; push @expected, join(q[ ], q[npg_common::Alignment], $npg_common::Alignment::VERSION); - if ($use_fastx) { - push @expected, q[FASTX Toolkit fastx_reverse_complement 0.0.12]; + if ($use_seqtk) { + push @expected, q[seqtk 1.3]; } push @expected, join(q[ ], $norm_fit, $npg_qc::autoqc::results::insert_size::VERSION); return @expected; @@ -572,7 +572,7 @@ sub _additional_modules { { my $dir = tempdir( CLEANUP => 1 ); - t::autoqc_util::write_fastx_script(catfile($dir, 'fastx_reverse_complement')); + t::autoqc_util::write_seqtk_script(catfile($dir, 'seqtk')); local $ENV{PATH} = join ':', $dir, $ENV{PATH}; my $qc = npg_qc::autoqc::checks::insert_size->new( @@ -584,15 +584,15 @@ sub _additional_modules { expected_size => [340,350], format => $format, ); - is ($qc->_fastx_version, '0.0.12', 'fastx version'); + is ($qc->current_version( $qc->seqtk_cmd() ), '1.3', 'seqtk version'); $qc->_set_additional_modules_info; is ( $qc->result->get_info('Additional_Modules'), - join(q[;], _additional_modules(1)), 'additional info with fastx'); + join(q[;], _additional_modules(1)), 'additional info with seqtk'); } { my $dir = tempdir( CLEANUP => 1 ); - t::autoqc_util::write_fastx_script(catfile($dir, 'fastx_reverse_complement'), 1); + t::autoqc_util::write_seqtk_script(catfile($dir, 'seqtk'), 1); my $s1 = catfile($current_dir, q[t/data/autoqc/alignment_few.sam]); t::autoqc_util::write_bwa_script(catfile($dir, 'bwa0_6'), $s1); local $ENV{PATH} = join ':', $dir, $ENV{PATH}; diff --git a/t/autoqc_util.pm b/t/autoqc_util.pm index 5a2f9ae8..ad20a362 100644 --- a/t/autoqc_util.pm +++ b/t/autoqc_util.pm @@ -50,10 +50,10 @@ sub find_or_save_composition { sub write_samtools_script { my ($script_path, $b1) = @_; - + open my $fh, '>', $script_path or croak "Cannot open $script_path for writing"; - print $fh '#!/usr/local/bin/bash'; + print $fh '#!/usr/bin/env bash'; print $fh "\n"; print $fh '# Script to fake samtools in testing. Automatically generated by npg at ' . time; print $fh "\n"; @@ -65,34 +65,31 @@ sub write_samtools_script { } close $fh or carp "Cannot close $script_path"; - + chmod 0775, $script_path; return 1; } -sub write_fastx_script { +sub write_seqtk_script { my ($script_path, $b1) = @_; - + my $fh; open($fh, '>', $script_path) or croak "Cannot open $script_path for writing"; print $fh '#!/usr/local/bin/bash'; print $fh "\n"; - print $fh '# Script to fake fastx in testing. Automatically generated by npg at ' . time; + print $fh '# Script to fake seqtk in testing. Automatically generated by npg at ' . time; print $fh "\n"; # echo the first argument $1; # echo number of arguments: $#; if (!defined $b1) { - my $out = q[usage: fastx_reverse_complement [-h] [-r] [-z] [-v] [-i INFILE] [-o OUTFILE] -Part of FASTX Toolkit 0.0.12 by A. Gordon (gordon@cshl.edu) - - [-h] = This helpful help screen. - [-z] = Compress output with GZIP. - [-i INFILE] = FASTA/Q input file. default is STDIN. - [-o OUTFILE] = FASTA/Q output file. default is STDOUT. + my $out = q[ +Usage: seqtk +Version: 1.3 +Command: seq common transformation of FASTA/Q ]; print $fh "echo '$out'"; @@ -107,7 +104,7 @@ Part of FASTX Toolkit 0.0.12 by A. Gordon (gordon@cshl.edu) sub write_bwa_script { my ($script_path, $s1, $bwa_version) = @_; - + my $fh; open($fh, '>', $script_path) or croak "Cannot open $script_path for writing"; @@ -133,7 +130,7 @@ sub write_bwa_script { } close $fh; - + chmod 0775, $script_path; return 1; }