Merge branch 'devel' of https://github.com/wtsi-npg/npg_qc into maste…

…r to create release 69.8.0
wtsi-npg · Aug 11, 2021 · 682c155 · 682c155
2 parents 1eda6c2 + 4b595b2
commit 682c155
Show file tree

Hide file tree

Showing 6 changed files with 46 additions and 158 deletions.
diff --git a/Changes b/Changes
@@ -1,7 +1,16 @@
 LIST OF CHANGES FOR NPG-QC PACKAGE
 
+
+release 69.8.0
+ - mqc skipper - a more generic way to disregard spiked-in controls when
+   counting the number of studies the samples in the run belong to. The change
+   is driven by the fact that spiked-in controls can now belong to a number of
+   different studies.
+ - replace fastx_toolkit use with seqtk
+ - npg_substitution_metrics.pl script clean-up
+
 release 69.7.0
- - added new npg_subtitution_metrics.pl script
+ - added new npg_substitution_metrics.pl script
  - move from Travis CI to GitHub Actions
  - switch to node 14 for CI, using npm from default conda pkg
  - remove dependency on npg_common::sequence::reference::base_count, which

diff --git a/bin/npg_mqc_skipper b/bin/npg_mqc_skipper
@@ -19,7 +19,7 @@ use npg_qc::mqc::skipper;
 
 our $VERSION = '0';
 
-Readonly::Scalar my $EXCLUDE_STUDY_NAME   => 'Illumina Controls';
+Readonly::Scalar my $EXCLUDE_ENTITY_TYPE  => 'library_indexed_spike';
 Readonly::Scalar my $NUM_DISTINCT_STUDIES => 1;
 Readonly::Scalar my $RUN_STATUS_FROM      => 'qc review pending';
 Readonly::Scalar my $RUN_STATUS_TO        => 'archival pending';
@@ -39,7 +39,7 @@ GetOptions('dry_run|dry-run!' => \$dry_run,
            'study_name=s'     => \$study_name,
            'deplexing_percent_threshold=i' => \$deplexing_threshold,
            'qc_fails_percent_threshold=i'  => \$qc_fails_threshold,
-	   'artic_qc_fails_percent_threshold=i'  => \$artic_qc_fails_threshold,
+           'artic_qc_fails_percent_threshold=i'  => \$artic_qc_fails_threshold,
            'help'             => sub {
              pod2usage(-verbose => 2,
                        -exitval => 0)
@@ -71,7 +71,7 @@ my @rows = $tracking_schema->resultset('Run')->search(
    'run_status_dict.description' => $RUN_STATUS_FROM,
    'instrument_format.model'     => $INSTRUMENT_MODEL},
   {prefetch => [{'run_statuses' => 'run_status_dict'}, 'instrument_format']}
-							   )->all();
+                                                     )->all();
 @rows = grep { my $row = $_; none { $row->is_tag_set($_) } @NO_SKIPPING_TAGS }
         @rows;
 
@@ -107,13 +107,13 @@ my $query =
    q[from iseq_product_metrics p ] .
    q[join iseq_flowcell f on p.id_iseq_flowcell_tmp=f.id_iseq_flowcell_tmp ] .
    q[join study s on s.id_study_tmp=f.id_study_tmp ] .
-  qq[where s.name != ? and p.id_run in (${placeholders}) ] .
+  qq[where f.entity_type != ? and p.id_run in (${placeholders}) ] .
    q[group by p.id_run having study_count = ?];
 my $sth = $dbh->prepare($query) or
   ($logger->fatal("Failed to prepare statement: $DBI::errstr") and exit 1);
 # Run time database errors are thrown by the execute method, no need to
 # do anything special.
-$sth->execute($EXCLUDE_STUDY_NAME, @run_ids, $NUM_DISTINCT_STUDIES);
+$sth->execute($EXCLUDE_ENTITY_TYPE, @run_ids, $NUM_DISTINCT_STUDIES);
 @run_ids = $get_run_ids->($sth);
 
 if (@run_ids) {

diff --git a/bin/npg_substitution_metrics.pl b/bin/npg_substitution_metrics.pl
@@ -190,83 +190,6 @@ ()
   return \%win;
 }
 
-#%======================================================to process winRi structures
-#function [new_win]=stripped_convert_new(win_R1)
-#%coverts win_R1 (as in error quality) into converient format, new_win,
-#%where each of 12 columns are counts of windows X.Y for each of 12 substitutions
-#
-#%INPUT input win_R1 (16x12) from error file  see below:
-#
-#%# Effect of previous base and next base high predictor. Use `grep ^PNCRH | cut -f 2-` to extract this part
-#%# 16 rows: one row per pair= previous/next base, columns (12 for each subs) is previous base+substitution+next base and count for 12 substitutions
-#
-#%PCRNH	1  caA bw AX	AACA	6	AACC	10	AACG	6	AACT	2	AAGA	159	AAGC	125	AAGG	133	AAGT	102	AATA	45	AATC	51	AATG	52	AATT	65
-#%PCRNH	1  caC	        ACAA	11	ACAC	23	ACAG	10	ACAT	12	ACGA	2	ACGC	12	ACGG	8	ACGT	4	ACTA	129	ACTC	156	ACTG	276	ACTT	74
-#%PCRNH	1  caG	        AGAA	51	AGAC	179	AGAG	77	AGAT	53	AGCA	45	AGCC	32	AGCG	20	AGCT	13	AGTA	646	AGTC	701	AGTG	398	AGTT	209
-#%PCRNH	1  caT	        ATAA	49	ATAC	64	ATAG	49	ATAT	53	ATCA	116	ATCC	92	ATCG	137	ATCT	119	ATGA	2	ATGC	7	ATGG	8	ATGT	5
-#
-#%PCRNH	1  caA bw CX	CACA	6	CACC	4	CACG	14	CACT	12	CAGA	146	CAGC	65	CAGG	155	CAGT	124	CATA	60	CATC	39	CATG	60	CATT	72
-#%PCRNH	1	CCAA	56	CCAC	25	CCAG	22	CCAT	30	CCGA	13	CCGC	3	CCGG	7	CCGT	7	CCTA	175	CCTC	34	CCTG	138	CCTT	66
-#%PCRNH	1	CGAA	129	CGAC	363	CGAG	97	CGAT	244	CGCA	33	CGCC	37	CGCG	36	CGCT	22	CGTA	463	CGTC	509	CGTG	306	CGTT	251
-#%PCRNH	1	CTAA	23	CTAC	46	CTAG	37	CTAT	45	CTCA	178	CTCC	118	CTCG	204	CTCT	202	CTGA	3	CTGC	2	CTGG	9	CTGT	10
-#
-#%PCRNH	1  caA bw GX	GACA	3	GACC	7	GACG	6	GACT	4	GAGA	99	GAGC	78	GAGG	94	GAGT	83	GATA	60	GATC	58	GATG	46	GATT	72
-#%PCRNH	1	GCAA	97	GCAC	20	GCAG	20	GCAT	29	GCGA	11	GCGC	11	GCGG	10	GCGT	14	GCTA	201	GCTC	192	GCTG	307	GCTT	162
-#%PCRNH	1	GGAA	53	GGAC	214	GGAG	36	GGAT	94	GGCA	33	GGCC	21	GGCG	7	GGCT	13	GGTA	348	GGTC	388	GGTG	145	GGTT	184
-#%PCRNH	1	GTAA	26	GTAC	47	GTAG	33	GTAT	51	GTCA	3520	GTCC	96	GTCG	71	GTCT	104	GTGA	4	GTGC	5	GTGG	11	GTGT	22
-#
-#%PCRNH	1 caA bw TX	TACA	4	TACC	3	TACG	2	TACT	3	TAGA	96	TAGC	4020	TAGG	209	TAGT	90	TATA	63	TATC	46	TATG	16	TATT	49
-#%PCRNH	1	TCAA	28	TCAC	11	TCAG	19	TCAT	23	TCGA	15	TCGC	5	TCGG	7	TCGT	5	TCTA	186	TCTC	65	TCTG	141	TCTT	84
-#%PCRNH	1	TGAA	134	TGAC	272	TGAG	112	TGAT	86	TGCA	74	TGCC	47	TGCG	46	TGCT	14	TGTA	774	TGTC	822	TGTG	512	TGTT	211
-#%PCRNH	1	TTAA	45	TTAC	63	TTAG	31	TTAT	43
-#
-#new_win=[];
-#N=4;
-#
-#a=1;
-#for f=1:3,
-# for k=1:4,
-#      j=a+(k-1)*N;
-#      for t1=1:4,
-#            new_win(j+t1-a,f)=win_R1(j,t1+(f-1)*N); 
-#      end
-#  end
-#end 
-#
-#
-#c=2;
-#for f=1:3,
-#  for k=1:4,
-#      j=c+(k-1)*N;
-#      for t1=1:4,
-#            new_win(j+t1-c,f+2*c-1)=win_R1(j,t1+(f-1)*N); 
-#      end
-#  end
-#end 
-#
-#g=3;
-#for f=1:3,
-#   for k=1:4,
-#      j=g+(k-1)*N;
-#      for t=1:4,
-#            new_win(j+t-g,f+2*g)=win_R1(j,t+(f-1)*N);
-#      end
-#   end
-#end 
-#
-#
-#t=4;
-#for f=1:3,
-#     for k=1:4,
-#      j=t+(k-1)*N;% caG  3,
-#      for t1=1:4,
-#            new_win(j+t1-t,f+2*t+1)=win_R1(j,t1+(f-1)*N); 
-#      end
-#  end
-#end
-#endfunction
-#
-
 # generate metrics
 sub metrics_gen_assym_prediction () {
   my ($err_data, $verbose) = @_;
@@ -318,33 +241,13 @@ ()
   my $as_ag_tc_R2 = ($mis2 > 0 ? (abs($coH2{AG} - $coH2{TC}) / $mis2) : 0);
   my $symm_ag_tc = max(($as_ag_tc_R1, $as_ag_tc_R2));
 
-  #1.1.1.1--------remove strong assymetry if needed (re-define %separated pair
-  if ($as_ag_tc_R1 > $thr_ass) {
-    $coH1{AG} = $mis1;
-    $coH1{TC} = $mis1;
-  }
-  if ($as_ag_tc_R2 > $thr_ass) {
-    $coH2{AG} = $mis2;
-    $coH2{TC} = $mis2;
-  }
-
   #1.1.2 ---- strand assymetry (sh be symmetry) mic=min 'close' Ti: CT GA
   my $mic1 = min(($coH1{CT}, $coH1{GA}));
   my $mic2 = min(($coH2{CT}, $coH2{GA}));
   my $as_ct_ga_R1 = ($mic1 > 0 ? (abs($coH1{CT} - $coH1{GA}) / $mic1) : 0);
   my $as_ct_ga_R2 = ($mic2 > 0 ? (abs($coH2{CT} - $coH2{GA}) / $mic2) : 0);
   my $symm_ct_ga = max(($as_ct_ga_R1, $as_ct_ga_R2));
 
-  #1.1.2.1-remove close Ti starnd assymetry if needed
-  if ($as_ct_ga_R1 > $thr_ass) {
-      $coH1{CT} = $mic1;
-      $coH1{GA} = $mic1;
-  }
-  if ($as_ct_ga_R2 > $thr_ass) {
-      $coH2{CT} = $mic2;
-      $coH2{GA} = $mic2;
-  }
-
   my @ti1 = ($coH1{AG}, $coH1{TC}, $coH1{CT}, $coH1{GA});
   my $mean_ti1 = round(mean(@ti1));
   my $sd_ti1 = round(stddev(@ti1));
@@ -394,18 +297,15 @@ ()
   my $TiTv_meGT = max(($si_sv1, $si_sv2));
 
   #3========metrics on oxiG and GT/CA artefact
-  #artefact C2A=art_ox=proportion of gt/ca to mean Ti, depends
-  #on Ti variability 
-  my $thr_ti1 = $mean_ti1 - ($cvTi1 > $thr_cv ? 1 : 3) * $sd_ti1;
-  my $thr_ti2 = $mean_ti2 - ($cvTi2 > $thr_cv ? 1 : 3) * $sd_ti2;
+  #artefact C2A=art_ox=proportion of gt/ca to mean Ti, depends on Ti variability 
 
   #===============art_ox=proportion of gt or ca to mean Ti
   my $ma1 = max(($coH1{CA}, $coH1{GT}));
   my $ma2 = max(($coH2{CA}, $coH2{GT}));
 
   # prop max(gt,ca) in Ti: the higher the more likely artC2A
-  my $artR1 = ($thr_ti1 > 0 ? ($ma1 / $thr_ti1) : 0);
-  my $artR2 = ($thr_ti2 > 0 ? ($ma2 / $thr_ti2) : 0);
+  my $artR1 = ($mean_ti1 > 0 ? ($ma1 / $mean_ti1) : 0);
+  my $artR2 = ($mean_ti2 > 0 ? ($ma2 / $mean_ti2) : 0);
   my $art_ox = max(($artR1, $artR2));
 
   #3.2 ----------------------oxidation biasas BROAD
@@ -476,12 +376,8 @@ ()
   }
 
   #5.2--------compute probability/likelihood of C2A based on art_ox
-  my $likely = 0.25 * ($artR1 + $artR2);
-  my $new_likely = $likely;
-  if ($likely > 1) {
-    $new_likely = 1;
-  }
-  my $likelihood = $new_likely;
+  my $likely = $gt_nearTi;
+  my $likelihood = $likely;
 
   # update results
   $results{likelihood} = $likelihood;

diff --git a/lib/npg_qc/autoqc/checks/insert_size.pm b/lib/npg_qc/autoqc/checks/insert_size.pm
@@ -20,10 +20,8 @@ use npg_common::Alignment;
 use npg_common::extractor::fastq qw(read_count);
 
 extends qw(npg_qc::autoqc::checks::check);
-with qw(
-  npg_tracking::data::reference::find
-  npg_common::roles::software_location
-       );
+with qw(npg_tracking::data::reference::find),
+     qw(npg_common::roles::software_location) => { tools => [qw/bwa0_6 seqtk/] };
 has '+aligner' => (default => 'bwa0_6', is => 'ro');
 
 our $VERSION = '0';
@@ -433,10 +431,10 @@ sub _fastq_reverse_complement {
     my @reversed = ();
     foreach my $i (0 .. 1) {
         push @reversed, catfile($self->tmp_path, ($i+1) . q[r.fastq]);
-        my @args = (q[fastx_reverse_complement], q[-Q 33], q[-i], $sample_reads->[$i], q[-o], $reversed[$i]);
-        print 'Producing reverse complemented file: ' . (join q[ ], @args) . qq[\n] || carp 'Producing reverse complemented file: ' . (join q[ ], @args) . qq[\n];
-        if (system(@args)) {
-            croak  printf "Child %s exited with value %d\n", join(q[ ], @args), $CHILD_ERROR >> $CHILD_ERROR_SHIFT;
+        my $cmd = $self->seqtk_cmd . q[ seq -r -Q 33 ] . $sample_reads->[$i] . q[ > ] . $reversed[$i];
+        print 'Producing reverse complemented file: ' . $cmd . qq[\n] || carp 'Producing reverse complemented file: ' . $cmd . qq[\n];
+        if (system($cmd)) {
+            croak  printf "Child %s exited with value %d\n", $cmd, $CHILD_ERROR >> $CHILD_ERROR_SHIFT;
         }
     }
     return \@reversed;
@@ -455,25 +453,13 @@ sub _set_additional_modules_info {
     use strict;
     ## use critic
     if ($self->use_reverse_complemented) {
-        push @packages_info, q[FASTX Toolkit fastx_reverse_complement ] . $self->_fastx_version;
+        push @packages_info, q[seqtk ] . $self->current_version( $self->seqtk_cmd() )
     }
     push @packages_info, join q[ ], $self->norm_fit_cmd, $VERSION;
     $self->result->set_info('Additional_Modules', ( join q[;], @packages_info ) );
     return;
 }
 
-sub _fastx_version {
-    my $self = shift;
-    my @lines = slurp 'fastx_reverse_complement -h |';
-    foreach my $line (@lines) {
-        if ($line =~ /FASTX\ Toolkit/xms) {
-            my ($version) = $line =~ /\ (\d+\.\d+\.\d+)\ /xms;
-            if ($version) { return $version; }
-        }
-    }
-    return q[];
-}
-
 sub _align {
     my ($self, $sample_reads, $prefix) = @_;
 

diff --git a/t/60-autoqc-checks-insert_size.t b/t/60-autoqc-checks-insert_size.t
@@ -30,7 +30,7 @@ use_ok('npg_qc::autoqc::results::insert_size');
 use_ok('npg_qc::autoqc::checks::insert_size');
 
 sub _additional_modules {
-  my $use_fastx = shift;
+  my $use_seqtk = shift;
   my @expected = ();
   require npg_qc::autoqc::parse::alignment;
   push @expected, join(q[ ],
@@ -41,8 +41,8 @@ sub _additional_modules {
   require npg_common::Alignment;
   push @expected, join(q[ ],
     q[npg_common::Alignment], $npg_common::Alignment::VERSION);
-  if ($use_fastx) {
-    push @expected, q[FASTX Toolkit fastx_reverse_complement 0.0.12];
+  if ($use_seqtk) {
+    push @expected, q[seqtk 1.3];
   }
   push @expected, join(q[ ], $norm_fit, $npg_qc::autoqc::results::insert_size::VERSION);
   return @expected;
@@ -572,7 +572,7 @@ sub _additional_modules {
 
 {
   my $dir = tempdir( CLEANUP => 1 );
-  t::autoqc_util::write_fastx_script(catfile($dir, 'fastx_reverse_complement'));
+  t::autoqc_util::write_seqtk_script(catfile($dir, 'seqtk'));
   local $ENV{PATH} = join ':', $dir,  $ENV{PATH};
 
   my $qc = npg_qc::autoqc::checks::insert_size->new(
@@ -584,15 +584,15 @@ sub _additional_modules {
                                               expected_size => [340,350],
                                               format   => $format,
                                                    );
-  is ($qc->_fastx_version, '0.0.12', 'fastx version');
+  is ($qc->current_version( $qc->seqtk_cmd() ), '1.3', 'seqtk version');
   $qc->_set_additional_modules_info;
   is ( $qc->result->get_info('Additional_Modules'),
-    join(q[;], _additional_modules(1)), 'additional info with fastx');
+    join(q[;], _additional_modules(1)), 'additional info with seqtk');
 }
 
 {
   my $dir = tempdir( CLEANUP => 1 );
-  t::autoqc_util::write_fastx_script(catfile($dir, 'fastx_reverse_complement'), 1);
+  t::autoqc_util::write_seqtk_script(catfile($dir, 'seqtk'), 1);
   my $s1 = catfile($current_dir, q[t/data/autoqc/alignment_few.sam]);
   t::autoqc_util::write_bwa_script(catfile($dir, 'bwa0_6'), $s1);
   local $ENV{PATH} = join ':', $dir,  $ENV{PATH};

diff --git a/t/autoqc_util.pm b/t/autoqc_util.pm
@@ -50,10 +50,10 @@ sub find_or_save_composition {
 
 sub write_samtools_script {
   my ($script_path, $b1) = @_;
-  
+
   open my $fh, '>', $script_path or croak "Cannot open $script_path for writing";
 
-  print $fh '#!/usr/local/bin/bash';
+  print $fh '#!/usr/bin/env bash';
   print $fh "\n";
   print $fh '# Script to fake samtools in testing. Automatically generated by npg at ' . time;
   print $fh "\n";
@@ -65,34 +65,31 @@ sub write_samtools_script {
   }
 
   close $fh or carp "Cannot close $script_path";
-    
+
   chmod 0775, $script_path;
   return 1;
 }
 
-sub write_fastx_script {
+sub write_seqtk_script {
 
   my ($script_path, $b1) = @_;
-  
+
   my $fh;
   open($fh, '>', $script_path) or croak "Cannot open $script_path for writing";
 
   print $fh '#!/usr/local/bin/bash';
   print $fh "\n";
-  print $fh '# Script to fake fastx in testing. Automatically generated by npg at ' . time;
+  print $fh '# Script to fake seqtk in testing. Automatically generated by npg at ' . time;
   print $fh "\n";
 
   # echo the first argument $1;
   # echo number of arguments: $#;
   if (!defined $b1) {
-    my $out = q[usage: fastx_reverse_complement [-h] [-r] [-z] [-v] [-i INFILE] [-o OUTFILE]
-Part of FASTX Toolkit 0.0.12 by A. Gordon ([email protected])
-
-   [-h]         = This helpful help screen.
-   [-z]         = Compress output with GZIP.
-   [-i INFILE]  = FASTA/Q input file. default is STDIN.
-   [-o OUTFILE] = FASTA/Q output file. default is STDOUT.
+    my $out = q[
+Usage:   seqtk <command> <arguments>
+Version: 1.3
 
+Command: seq       common transformation of FASTA/Q
 ];
 
     print $fh "echo '$out'";
@@ -107,7 +104,7 @@ Part of FASTX Toolkit 0.0.12 by A. Gordon ([email protected])
 sub write_bwa_script {
 
   my ($script_path, $s1, $bwa_version) = @_;
-  
+
   my $fh;
   open($fh, '>', $script_path) or croak "Cannot open $script_path for writing";
 
@@ -133,7 +130,7 @@ sub write_bwa_script {
   }
 
   close $fh;
-    
+
   chmod 0775, $script_path;
   return 1;
 }