Skip to content

Commit

Permalink
Merge pull request #470 from ces/nomd
Browse files Browse the repository at this point in the history
 update bam_flagstats to support no markdups option
  • Loading branch information
srl147 authored Nov 23, 2017
2 parents 47130ae + 9d5574c commit e8caf55
Show file tree
Hide file tree
Showing 7 changed files with 101 additions and 24 deletions.
1 change: 1 addition & 0 deletions Changes
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ LIST OF CHANGES FOR NPG-QC PACKAGE

- utility QC feature: db tables, DBIx classes, retrieval
- Add STAR aligner bams as a valid RNA alignments - also added tests for it
- update bam_flagstats to be able run when no markdups metrics file exists

release 64.3.1
- error in a statement for data update is fixed
Expand Down
2 changes: 2 additions & 0 deletions MANIFEST
Original file line number Diff line number Diff line change
Expand Up @@ -310,6 +310,8 @@ t/data/api_dbic_fixtures/npg/RunStatus.yml
t/data/api_dbic_fixtures/npg/Run.yml
t/data/autoqc/bam_flagstats/16960_1_0.tar.gz
t/data/autoqc/bam_flagstats/17448_1_9.tar.gz
t/data/autoqc/bam_flagstats/24135_1#1.bam_flagstats.json
t/data/autoqc/bam_flagstats/24135_1#1.flagstat
t/data/autoqc/bam_flagstats/4783_5_bam_flagstats.json
t/data/autoqc/bam_flagstats/4783_5_metrics_optical.txt
t/data/autoqc/bam_flagstats/4783_5.flagstat
Expand Down
62 changes: 39 additions & 23 deletions lib/npg_qc/autoqc/checks/bam_flagstats.pm
Original file line number Diff line number Diff line change
Expand Up @@ -19,20 +19,21 @@ with qw( npg_tracking::glossary::subset );

our $VERSION = '0';

Readonly::Scalar my $METRICS_FIELD_LIST => [qw(
library
unpaired_mapped_reads
paired_mapped_reads
unmapped_reads
unpaired_read_duplicates
paired_read_duplicates
read_pair_optical_duplicates
percent_duplicate
library_size)];
Readonly::Hash my %METRICS_FIELD_MAPPING => {
'LIBRARY' => 'library',
'READ_PAIRS_EXAMINED' => 'read_pairs_examined',
'UNPAIRED_READ_DUPLICATES' => 'unpaired_read_duplicates',
'READ_PAIR_DUPLICATES' => 'paired_read_duplicates',
'READ_PAIR_OPTICAL_DUPLICATES' => 'read_pair_optical_duplicates',
'PERCENT_DUPLICATION' => 'percent_duplicate',
'ESTIMATED_LIBRARY_SIZE' => 'library_size'
};

# picard and biobambam mark duplicates assign this
# value for aligned data with no mapped paired reads
Readonly::Scalar my $LIBRARY_SIZE_NOT_AVAILABLE => -1;
Readonly::Scalar my $METRICS_NUMBER => 9;
Readonly::Scalar my $PAIR_NUMBER => 2;
Readonly::Scalar our $EXT => q[bam];

has '+subset' => ( isa => 'Str', );
Expand All @@ -54,13 +55,23 @@ sub _build__sequence_file {
}
sub _build_markdups_metrics_file {
my $self = shift;
return join q[.], $self->_file_path_root, 'markdups_metrics.txt';
my $metrics_file;
if( !$self->skip_markdups_metrics ){
$metrics_file = join q[.], $self->_file_path_root, 'markdups_metrics.txt';
}
return $metrics_file;
}
sub _build_flagstats_metrics_file {
my $self = shift;
return join q[.], $self->_file_path_root, 'flagstat';
}

has 'skip_markdups_metrics' => (is => 'ro',
isa => 'Bool',
required => 0,
default => 0,
);

has '_file_path_root' => ( isa => 'Str',
is => 'ro',
lazy_build => 1,
Expand Down Expand Up @@ -124,7 +135,9 @@ sub _build_related_results {
override 'execute' => sub {
my $self = shift;

$self->_parse_markdups_metrics();
if( !$self->skip_markdups_metrics ){
$self->_parse_markdups_metrics();
}
$self->_parse_flagstats();
for my $rr ( @{$self->related_results()} ) {
$rr->execute();
Expand All @@ -142,20 +155,22 @@ sub _parse_markdups_metrics {
chomp $header;
$self->result()->set_info('markdups_metrics_header', $header);

my ($metrics_source) = $header =~ /(MarkDuplicates | EstimateLibraryComplexity | bam\S*markduplicates)/mxs;

my $metrics = $file_contents[1];
my $histogram = $file_contents[2];

my @metrics_lines = split /\n/mxs, $metrics;
my @metrics_header = split /\t/mxs, $metrics_lines[1];
my @metrics_numbers = split /\t/mxs, $metrics_lines[2];

if (scalar @metrics_numbers > scalar @{$METRICS_FIELD_LIST} ) {
my %metrics;
@metrics{@metrics_header} = @metrics_numbers;

if (scalar @metrics_numbers > $METRICS_NUMBER ) {
croak 'MarkDuplicate metrics format is wrong';
}

foreach my $field (@{$METRICS_FIELD_LIST}){
my $field_value = shift @metrics_numbers;
foreach my $field (keys %METRICS_FIELD_MAPPING){
my $field_value = $metrics{$field};
if ($field_value) {
if ($field_value =~/\?/mxs) {
$field_value = undef;
Expand All @@ -167,12 +182,7 @@ sub _parse_markdups_metrics {
}
}
}
$self->result()->$field( $field_value );
}

$self->result()->read_pairs_examined( $self->result()->paired_mapped_reads() );
if ($metrics_source eq 'EstimateLibraryComplexity') {
$self->result()->paired_mapped_reads(0);
$self->result()->${\$METRICS_FIELD_MAPPING{$field}}( $field_value );
}

if ($histogram) {
Expand Down Expand Up @@ -202,6 +212,12 @@ sub _parse_flagstats {
? $self->result()->mate_mapped_defferent_chr_5($number)
:( $line =~ /in\ total/mxs )
? $self->result()->num_total_reads($number)
: ( $line =~ /with\ itself\ and\ mate\ mapped/mxs )
? $self->result()->paired_mapped_reads($number/$PAIR_NUMBER)
: ( $line =~ /singletons\ \(/mxs )
? $self->result()->unpaired_mapped_reads($number)
: ( $line =~ /mapped\ \(/mxs )
? $self->result()->unmapped_reads($self->result()->num_total_reads() - $number)
: next;
}
close $samtools_output_fh or carp "Warning: $OS_ERROR - failed to close filehandle to $fn";
Expand Down
45 changes: 44 additions & 1 deletion t/60-autoqc-checks-bam_flagstats.t
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
use strict;
use warnings;
use Test::More tests => 8;
use Test::More tests => 9;
use Test::Exception;
use Test::Deep;
use File::Temp qw( tempdir );
Expand Down Expand Up @@ -76,6 +76,49 @@ subtest 'high-level parsing' => sub {
is($r->read_pairs_examined(), 15017382, 'read_pairs_examined');
};

subtest 'high-level parsing, no markdup metrics' => sub {
plan tests => 10;

my $fstat = 't/data/autoqc/bam_flagstats/24135_1#1.flagstat';

my $c = npg_qc::autoqc::checks::bam_flagstats->new(
id_run => 24135,
position => 1,
tag_index => 1,
flagstats_metrics_file => $fstat,
skip_markdups_metrics => 1,
related_results => []
);

my $expected = from_json(
slurp q{t/data/autoqc/bam_flagstats/24135_1#1.bam_flagstats.json}, {chomp=>1});

lives_ok { $c->execute() } 'execute method is ok';
my $r;
my $result_json;
lives_ok {
$r = $c->result();
$result_json = $r->freeze();
$r->store(qq{$tempdir/24135_1#1.bam_flagstats.json});
} 'no error when serializing to json string and file';

my $from_json_hash = from_json($result_json);
delete $from_json_hash->{'__CLASS__'};
delete $from_json_hash->{'composition'};
delete $from_json_hash->{'info'}->{'Check'};
delete $from_json_hash->{'info'}->{'Check_version'};

is_deeply($from_json_hash, $expected, 'correct json output');
is($r->total_reads(), 66302 , 'total reads');
is($r->total_mapped_reads(), '62526', 'total mapped reads');
is($r->percent_mapped_reads, 94.304847515912, 'percent mapped reads');
is($r->percent_duplicate_reads, undef, 'percent duplicate reads');
is($r->percent_properly_paired ,90.7845917166903, 'percent properly paired');
is($r->percent_singletons,0.0995445084612832 , 'percent singletons');
is($r->read_pairs_examined(), undef, 'read_pairs_examined');
};


my $archive_16960 = '16960_1_0';
my $ae_16960 = Archive::Extract->new(archive => "t/data/autoqc/bam_flagstats/${archive_16960}.tar.gz");
$ae_16960->extract(to => $tempdir) or die $ae_16960->error;
Expand Down
1 change: 1 addition & 0 deletions t/60-autoqc-db_loader.t
Original file line number Diff line number Diff line change
Expand Up @@ -348,6 +348,7 @@ subtest 'loading bam_flagststs' => sub {
path => ['t/data/autoqc/bam_flagstats'],
);
warnings_like { $db_loader->load() } [
qr/Skipped t\/data\/autoqc\/bam_flagstats\/24135_1#1.bam_flagstats\.json/, # no __CLASS__ key
qr/Skipped t\/data\/autoqc\/bam_flagstats\/4783_5_bam_flagstats\.json/, # no __CLASS__ key
qr/Loaded t\/data\/autoqc\/bam_flagstats\/4921_3_bam_flagstats\.json/,
qr/1 json files have been loaded/
Expand Down
1 change: 1 addition & 0 deletions t/data/autoqc/bam_flagstats/24135_1#1.bam_flagstats.json
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"histogram":{},"info":{},"id_run":24135,"mate_mapped_defferent_chr":6,"mate_mapped_defferent_chr_5":6,"num_total_reads":66302,"paired_mapped_reads":31230,"position":1,"proper_mapped_pair":60192,"tag_index":1,"unmapped_reads":3776,"unpaired_mapped_reads":66}
13 changes: 13 additions & 0 deletions t/data/autoqc/bam_flagstats/24135_1#1.flagstat
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
66302 + 0 in total (QC-passed reads + QC-failed reads)
0 + 0 secondary
0 + 0 supplementary
0 + 0 duplicates
62526 + 0 mapped (94.30% : N/A)
66302 + 0 paired in sequencing
33151 + 0 read1
33151 + 0 read2
60192 + 0 properly paired (90.78% : N/A)
62460 + 0 with itself and mate mapped
66 + 0 singletons (0.10% : N/A)
6 + 0 with mate mapped to a different chr
6 + 0 with mate mapped to a different chr (mapQ>=5)

0 comments on commit e8caf55

Please sign in to comment.