From 58637bd4992b87cf484f0011fbb31c52bc3a2f1f Mon Sep 17 00:00:00 2001 From: Carol Scott Date: Mon, 13 Nov 2017 22:16:08 +0000 Subject: [PATCH 1/2] update bam_flagstats to support no markdups option --- Changes | 1 + MANIFEST | 2 + lib/npg_qc/autoqc/checks/bam_flagstats.pm | 62 ++++++++++++------- t/60-autoqc-checks-bam_flagstats.t | 45 +++++++++++++- .../24135_1#1.bam_flagstats.json | 1 + .../autoqc/bam_flagstats/24135_1#1.flagstat | 13 ++++ 6 files changed, 100 insertions(+), 24 deletions(-) create mode 100644 t/data/autoqc/bam_flagstats/24135_1#1.bam_flagstats.json create mode 100644 t/data/autoqc/bam_flagstats/24135_1#1.flagstat diff --git a/Changes b/Changes index a4060d4bd..971597962 100644 --- a/Changes +++ b/Changes @@ -2,6 +2,7 @@ LIST OF CHANGES FOR NPG-QC PACKAGE - utility QC feature: db tables, DBIx classes, retrieval - Add STAR aligner bams as a valid RNA alignments - also added tests for it + - update bam_flagstats to be able to run when no markdups metrics file exists release 64.3.1 - error in a statement for data update is fixed diff --git a/MANIFEST b/MANIFEST index 9e64aed96..86396a34f 100644 --- a/MANIFEST +++ b/MANIFEST @@ -310,6 +310,8 @@ t/data/api_dbic_fixtures/npg/RunStatus.yml t/data/api_dbic_fixtures/npg/Run.yml t/data/autoqc/bam_flagstats/16960_1_0.tar.gz t/data/autoqc/bam_flagstats/17448_1_9.tar.gz +t/data/autoqc/bam_flagstats/24135_1#1.bam_flagstats.json +t/data/autoqc/bam_flagstats/24135_1#1.flagstat t/data/autoqc/bam_flagstats/4783_5_bam_flagstats.json t/data/autoqc/bam_flagstats/4783_5_metrics_optical.txt t/data/autoqc/bam_flagstats/4783_5.flagstat diff --git a/lib/npg_qc/autoqc/checks/bam_flagstats.pm b/lib/npg_qc/autoqc/checks/bam_flagstats.pm index de328e9a8..95c03b60d 100644 --- a/lib/npg_qc/autoqc/checks/bam_flagstats.pm +++ b/lib/npg_qc/autoqc/checks/bam_flagstats.pm @@ -19,20 +19,21 @@ with qw( npg_tracking::glossary::subset ); our $VERSION = '0'; -Readonly::Scalar my $METRICS_FIELD_LIST => [qw( - library - unpaired_mapped_reads 
- paired_mapped_reads - unmapped_reads - unpaired_read_duplicates - paired_read_duplicates - read_pair_optical_duplicates - percent_duplicate - library_size)]; +Readonly::Hash my %METRICS_FIELD_MAPPING => { + 'LIBRARY' => 'library', + 'READ_PAIRS_EXAMINED' => 'read_pairs_examined', + 'UNPAIRED_READ_DUPLICATES' => 'unpaired_read_duplicates', + 'READ_PAIR_DUPLICATES' => 'paired_read_duplicates', + 'READ_PAIR_OPTICAL_DUPLICATES' => 'read_pair_optical_duplicates', + 'PERCENT_DUPLICATION' => 'percent_duplicate', + 'ESTIMATED_LIBRARY_SIZE' => 'library_size' +}; # picard and biobambam mark duplicates assign this # value for aligned data with no mapped paired reads Readonly::Scalar my $LIBRARY_SIZE_NOT_AVAILABLE => -1; +Readonly::Scalar my $METRICS_NUMBER => 9; +Readonly::Scalar my $PAIR_NUMBER => 2; Readonly::Scalar our $EXT => q[bam]; has '+subset' => ( isa => 'Str', ); @@ -54,13 +55,23 @@ sub _build__sequence_file { } sub _build_markdups_metrics_file { my $self = shift; - return join q[.], $self->_file_path_root, 'markdups_metrics.txt'; + my $metrics_file; + if( !$self->skip_markdups_metrics ){ + $metrics_file = join q[.], $self->_file_path_root, 'markdups_metrics.txt'; + } + return $metrics_file; } sub _build_flagstats_metrics_file { my $self = shift; return join q[.], $self->_file_path_root, 'flagstat'; } +has 'skip_markdups_metrics' => (is => 'ro', + isa => 'Bool', + required => 0, + default => 0, + ); + has '_file_path_root' => ( isa => 'Str', is => 'ro', lazy_build => 1, @@ -124,7 +135,9 @@ sub _build_related_results { override 'execute' => sub { my $self = shift; - $self->_parse_markdups_metrics(); + if( !$self->skip_markdups_metrics ){ + $self->_parse_markdups_metrics(); + } $self->_parse_flagstats(); for my $rr ( @{$self->related_results()} ) { $rr->execute(); @@ -142,20 +155,22 @@ sub _parse_markdups_metrics { chomp $header; $self->result()->set_info('markdups_metrics_header', $header); - my ($metrics_source) = $header =~ /(MarkDuplicates | 
EstimateLibraryComplexity | bam\S*markduplicates)/mxs; - my $metrics = $file_contents[1]; my $histogram = $file_contents[2]; my @metrics_lines = split /\n/mxs, $metrics; + my @metrics_header = split /\t/mxs, $metrics_lines[1]; my @metrics_numbers = split /\t/mxs, $metrics_lines[2]; - if (scalar @metrics_numbers > scalar @{$METRICS_FIELD_LIST} ) { + my %metrics; + @metrics{@metrics_header} = @metrics_numbers; + + if (scalar @metrics_numbers > $METRICS_NUMBER ) { croak 'MarkDuplicate metrics format is wrong'; } - foreach my $field (@{$METRICS_FIELD_LIST}){ - my $field_value = shift @metrics_numbers; + foreach my $field (keys %METRICS_FIELD_MAPPING){ + my $field_value = $metrics{$field}; if ($field_value) { if ($field_value =~/\?/mxs) { $field_value = undef; @@ -167,12 +182,7 @@ sub _parse_markdups_metrics { } } } - $self->result()->$field( $field_value ); - } - - $self->result()->read_pairs_examined( $self->result()->paired_mapped_reads() ); - if ($metrics_source eq 'EstimateLibraryComplexity') { - $self->result()->paired_mapped_reads(0); + $self->result()->${\$METRICS_FIELD_MAPPING{$field}}( $field_value ); } if ($histogram) { @@ -202,6 +212,12 @@ sub _parse_flagstats { ? $self->result()->mate_mapped_defferent_chr_5($number) :( $line =~ /in\ total/mxs ) ? $self->result()->num_total_reads($number) + : ( $line =~ /with\ itself\ and\ mate\ mapped/mxs ) + ? $self->result()->paired_mapped_reads($number/$PAIR_NUMBER) + : ( $line =~ /singletons\ \(/mxs ) + ? $self->result()->unpaired_mapped_reads($number) + : ( $line =~ /mapped\ \(/mxs ) + ? 
$self->result()->unmapped_reads($self->result()->num_total_reads() - $number) : next; } close $samtools_output_fh or carp "Warning: $OS_ERROR - failed to close filehandle to $fn"; diff --git a/t/60-autoqc-checks-bam_flagstats.t b/t/60-autoqc-checks-bam_flagstats.t index 10f425b2f..fc402124d 100644 --- a/t/60-autoqc-checks-bam_flagstats.t +++ b/t/60-autoqc-checks-bam_flagstats.t @@ -1,6 +1,6 @@ use strict; use warnings; -use Test::More tests => 8; +use Test::More tests => 9; use Test::Exception; use Test::Deep; use File::Temp qw( tempdir ); @@ -76,6 +76,49 @@ subtest 'high-level parsing' => sub { is($r->read_pairs_examined(), 15017382, 'read_pairs_examined'); }; +subtest 'high-level parsing, no markdup metrics' => sub { + plan tests => 10; + + my $fstat = 't/data/autoqc/bam_flagstats/24135_1#1.flagstat'; + + my $c = npg_qc::autoqc::checks::bam_flagstats->new( + id_run => 24135, + position => 1, + tag_index => 1, + flagstats_metrics_file => $fstat, + skip_markdups_metrics => 1, + related_results => [] + ); + + my $expected = from_json( + slurp q{t/data/autoqc/bam_flagstats/24135_1#1.bam_flagstats.json}, {chomp=>1}); + + lives_ok { $c->execute() } 'execute method is ok'; + my $r; + my $result_json; + lives_ok { + $r = $c->result(); + $result_json = $r->freeze(); + $r->store(qq{$tempdir/24135_1#1.bam_flagstats.json}); + } 'no error when serializing to json string and file'; + + my $from_json_hash = from_json($result_json); + delete $from_json_hash->{'__CLASS__'}; + delete $from_json_hash->{'composition'}; + delete $from_json_hash->{'info'}->{'Check'}; + delete $from_json_hash->{'info'}->{'Check_version'}; + + is_deeply($from_json_hash, $expected, 'correct json output'); + is($r->total_reads(), 66302 , 'total reads'); + is($r->total_mapped_reads(), '62526', 'total mapped reads'); + is($r->percent_mapped_reads, 94.304847515912, 'percent mapped reads'); + is($r->percent_duplicate_reads, undef, 'percent duplicate reads'); + is($r->percent_properly_paired ,90.7845917166903, 
'percent properly paired'); + is($r->percent_singletons,0.0995445084612832 , 'percent singletons'); + is($r->read_pairs_examined(), undef, 'read_pairs_examined'); +}; + + my $archive_16960 = '16960_1_0'; my $ae_16960 = Archive::Extract->new(archive => "t/data/autoqc/bam_flagstats/${archive_16960}.tar.gz"); $ae_16960->extract(to => $tempdir) or die $ae_16960->error; diff --git a/t/data/autoqc/bam_flagstats/24135_1#1.bam_flagstats.json b/t/data/autoqc/bam_flagstats/24135_1#1.bam_flagstats.json new file mode 100644 index 000000000..05cbb8aa6 --- /dev/null +++ b/t/data/autoqc/bam_flagstats/24135_1#1.bam_flagstats.json @@ -0,0 +1 @@ +{"histogram":{},"info":{},"id_run":24135,"mate_mapped_defferent_chr":6,"mate_mapped_defferent_chr_5":6,"num_total_reads":66302,"paired_mapped_reads":31230,"position":1,"proper_mapped_pair":60192,"tag_index":1,"unmapped_reads":3776,"unpaired_mapped_reads":66} diff --git a/t/data/autoqc/bam_flagstats/24135_1#1.flagstat b/t/data/autoqc/bam_flagstats/24135_1#1.flagstat new file mode 100644 index 000000000..841f62247 --- /dev/null +++ b/t/data/autoqc/bam_flagstats/24135_1#1.flagstat @@ -0,0 +1,13 @@ +66302 + 0 in total (QC-passed reads + QC-failed reads) +0 + 0 secondary +0 + 0 supplementary +0 + 0 duplicates +62526 + 0 mapped (94.30% : N/A) +66302 + 0 paired in sequencing +33151 + 0 read1 +33151 + 0 read2 +60192 + 0 properly paired (90.78% : N/A) +62460 + 0 with itself and mate mapped +66 + 0 singletons (0.10% : N/A) +6 + 0 with mate mapped to a different chr +6 + 0 with mate mapped to a different chr (mapQ>=5) From 9d5574c631539beb1eb183428ca977bf06fa823e Mon Sep 17 00:00:00 2001 From: Carol Scott Date: Tue, 14 Nov 2017 10:54:55 +0000 Subject: [PATCH 2/2] Fix test --- t/60-autoqc-db_loader.t | 1 + 1 file changed, 1 insertion(+) diff --git a/t/60-autoqc-db_loader.t b/t/60-autoqc-db_loader.t index 35e8337ab..053778b21 100644 --- a/t/60-autoqc-db_loader.t +++ b/t/60-autoqc-db_loader.t @@ -348,6 +348,7 @@ subtest 'loading bam_flagststs' => sub { 
path => ['t/data/autoqc/bam_flagstats'], ); warnings_like { $db_loader->load() } [ + qr/Skipped t\/data\/autoqc\/bam_flagstats\/24135_1#1.bam_flagstats\.json/, # no __CLASS__ key qr/Skipped t\/data\/autoqc\/bam_flagstats\/4783_5_bam_flagstats\.json/, # no __CLASS__ key qr/Loaded t\/data\/autoqc\/bam_flagstats\/4921_3_bam_flagstats\.json/, qr/1 json files have been loaded/