diff --git a/CHANGES.md b/CHANGES.md index bdd1035..fae31a5 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -1,5 +1,9 @@ # CHANGES +## v4.3.0 + +* Add script to convert allelecount output to JSON + ## v4.2.1 * Update so docker and native install use same install scripts behind the scenes diff --git a/perl/Makefile.PL b/perl/Makefile.PL index e1099d2..92b5637 100644 --- a/perl/Makefile.PL +++ b/perl/Makefile.PL @@ -28,7 +28,7 @@ WriteMakefile( NAME => 'alleleCount', LICENSE => 'agpl_3', # http://search.cpan.org/~dagolden/CPAN-Meta-2.142690/lib/CPAN/Meta/Spec.pm#license VERSION_FROM => 'lib/Sanger/CGP/AlleleCount.pm', - EXE_FILES => [qw( bin/alleleCounter.pl )], + EXE_FILES => [qw( bin/alleleCounter.pl bin/alleleCounterToJson.pl )], PREREQ_PM => { 'Const::Fast' => 0.014, 'Try::Tiny' => 0.19, @@ -38,5 +38,6 @@ WriteMakefile( 'Devel::Cover' => 1.09, 'Pod::Coverage' => 0.23, 'IPC::System::Simple' => 1.25, + 'JSON' => 2.90, } ); diff --git a/perl/bin/alleleCounterToJson.pl b/perl/bin/alleleCounterToJson.pl new file mode 100755 index 0000000..16d9843 --- /dev/null +++ b/perl/bin/alleleCounterToJson.pl @@ -0,0 +1,99 @@ +#!/usr/bin/perl + +##########LICENCE########## +# Copyright (c) 2014-2021 Genome Research Ltd. +# +# Author: CASM/Cancer IT +# +# This file is part of alleleCount. +# +# alleleCount is free software: you can redistribute it and/or modify it under +# the terms of the GNU Affero General Public License as published by the Free +# Software Foundation; either version 3 of the License, or (at your option) any +# later version. +# +# This program is distributed in the hope that it will be useful, but WITHOUT +# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +# FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more +# details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . +##########LICENCE########## + +use strict; +use Carp; +use English qw( -no_match_vars ); +use warnings FATAL => 'all'; + +use Getopt::Long 'GetOptions'; +use Pod::Usage; + +use Sanger::CGP::AlleleCount; +use Sanger::CGP::AlleleCount::ToJson; + +{ + my $options = option_builder(); + $options->{'o'} = '/dev/stdout' unless(defined $options->{'o'}); + run($options); +} + +sub run { + my ($options) = @_; + my $json_string = Sanger::CGP::AlleleCount::ToJson::alleleCountToJson($options->{'a'}, $options->{'l'}); + my $OUT; + if($options->{'o'}){ + open($OUT, '>', $options->{'o'}) or croak("Error opening file for output: $!"); + } + print $OUT "$json_string"; + if($options->{'o'}){ + close($OUT) or croak("Error closing output file for JSON conversion: $!"); + } +} + + +sub option_builder { + my ($factory) = @_; + + my %opts; + + &GetOptions ( + 'h|help' => \$opts{'h'}, + 'l|locus-file=s' => \$opts{'l'}, + 'a|allelecount-file=s' => \$opts{'a'}, + 'o|output-file:s' => \$opts{'o'}, + 'v|version' => \$opts{'v'}, + ); + + pod2usage(0) if($opts{'h'}); + if($opts{'v'}){ + print Sanger::CGP::AlleleCount->VERSION."\n"; + exit; + } + pod2usage(1) if(!$opts{'l'} || !$opts{'a'}); + croak("Locus file ".$opts{'l'}." does not exist.") if(! -e $opts{'l'}); + croak("Allele count output file ".$opts{'a'}." does not exist.") if(! -e $opts{'a'}); + return \%opts; +} + +__END__ + +=head1 NAME + +alleleCounterToJson.pl - Generate JSON format file from the tab seperated format + +=head1 SYNOPSIS + +alleleCounterToJson.pl + + Required: + + -locus-file -l File containing SNP positions used for allelecounter + -allelecount-file -a Allelecounter output file + + Optional: + -output-file -o Output file (default: stdout) + -help -h This message + -version -v Version number + +=cut diff --git a/perl/lib/Sanger/CGP/AlleleCount.pm b/perl/lib/Sanger/CGP/AlleleCount.pm index d03b5f6..6f60069 100644 --- a/perl/lib/Sanger/CGP/AlleleCount.pm +++ b/perl/lib/Sanger/CGP/AlleleCount.pm @@ -25,7 +25,7 @@ package Sanger::CGP::AlleleCount; use strict; use base 'Exporter'; -our $VERSION = '4.2.1'; +our $VERSION = '4.3.0'; our @EXPORT = qw($VERSION); 1; diff --git a/perl/lib/Sanger/CGP/AlleleCount/ToJson.pm b/perl/lib/Sanger/CGP/AlleleCount/ToJson.pm new file mode 100644 index 0000000..9132be7 --- /dev/null +++ b/perl/lib/Sanger/CGP/AlleleCount/ToJson.pm @@ -0,0 +1,130 @@ +package Sanger::CGP::AlleleCount::ToJson; + +##########LICENCE########## +# Copyright (c) 2014-2021 Genome Research Ltd. +# +# Author: CASM/Cancer IT +# +# This file is part of alleleCount. +# +# alleleCount is free software: you can redistribute it and/or modify it under +# the terms of the GNU Affero General Public License as published by the Free +# Software Foundation; either version 3 of the License, or (at your option) any +# later version. +# +# This program is distributed in the hope that it will be useful, but WITHOUT +# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +# FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more +# details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . +##########LICENCE########## + + +use strict; + +use Carp; +use English qw( -no_match_vars ); +use warnings FATAL => 'all'; + +use JSON; +use IO::Zlib; + +use Sanger::CGP::AlleleCount; + +use Const::Fast qw(const); + +const my %ALLELECOUNT_CONST => ( + MIN_PROP => 0.21, + MIN_READS => 5, +); + +sub allelecount_val { + my ($item) = @_; + return $ALLELECOUNT_CONST{$item}; +} + +=item new + +Null constructor + +=cut + +sub new { + my ($class) = @_; + my $self = { }; + bless $self, $class; + return $self; +} + +=item alleleCountToJson + +Convert allele count file result format to JSON + +=cut + +sub alleleCountToJson{ + my ($countsfile, $snpsfile) = @_; + my $tmp; + my $SNPS; + my $snp_list; + + #TODO open gzipped file.... + open($SNPS, '<', $snpsfile) or croak("Error opening allele count locus file '$snpsfile' for JSON conversion: $!"); + while(<$SNPS>){ + my $line = $_; + next if($line =~ m/^\s*#/); + chomp($line); + my ($chr,$pos,$name,undef) = split(/\s+/,$line); + $snp_list->{$chr}->{$pos} = $name; + } + close($SNPS) or croak("Error closing allele count locus file '$snpsfile' for JSON conversion: $!"); + + my $fh = new IO::Zlib; + if($fh->open($countsfile, "rb")){ + while(<$fh>){ + my $line = $_; + next if($line =~ m/^\s*#/); + chomp($line); + my ($chr,$pos,$a,$c,$g,$t,$good) = split(/\s+/,$line); + my $nom = $snp_list->{$chr}->{$pos}; + my $genotype = _calculate_genotype_from_allele_count($a,$c,$g,$t,$good); + $tmp->{$nom} = $genotype; + } + $fh->close; + }else{ + croak("Error trying to open file for SNP locus loading '$countsfile': $!\n"); + } + my $jsonstr = encode_json($tmp); + return $jsonstr; +} + +sub _calculate_genotype_from_allele_count{ + my ($a_a,$a_c,$a_g,$a_t,$good) = @_; + my $geno; + return q{.} if($good < allelecount_val('MIN_READS')); + + my @counts; + push @counts, ['A', $a_a] if($a_a/$good >= allelecount_val('MIN_PROP')); + push @counts, ['C', $a_c] if($a_c/$good >= allelecount_val('MIN_PROP')); + push @counts, ['G', $a_g] if($a_g/$good >= allelecount_val('MIN_PROP')); + push @counts, ['T', $a_t] if($a_t/$good >= allelecount_val('MIN_PROP')); + + my $entries = scalar @counts; + if($entries == 0) { + $geno = q{.}; + } + elsif($entries == 1) { + $geno = $counts[0][0].$counts[0][0]; + } + else { + @counts = sort {$b->[1]<=>$a->[1]} @counts; # reverse sorts by the counts + $geno = join(q{}, sort {$a cmp $b} $counts[0][0], $counts[1][0]); # then sort the alleles into the string + } + croak("Error calculating genotype from allele counts $a_a,$a_c,$a_g,$a_t,$good.\n") if((length $geno)>2 || (length $geno) == 0); + return $geno; + +} + +1; diff --git a/perl/t/2_pl_compile.t b/perl/t/2_pl_compile.t index fce4f9e..5585ac4 100644 --- a/perl/t/2_pl_compile.t +++ b/perl/t/2_pl_compile.t @@ -49,16 +49,17 @@ for(@scripts) { next; } my $message = "Compilation check: $script"; + my $output = ""; my $command = "$perl -c $script"; my ($pid, $process); try { $pid = open $process, $command.' 2>&1 |'; - while(<$process>){}; + while(<$process>){ $output.=$_;}; close $process; pass($message); } catch { - fail($message); + fail($message."\n".$output); }; } diff --git a/perl/t/tojson.t b/perl/t/tojson.t new file mode 100644 index 0000000..5ffab7e --- /dev/null +++ b/perl/t/tojson.t @@ -0,0 +1,67 @@ +##########LICENCE########## +# Copyright (c) 2014-2020 Genome Research Ltd. +# +# Author: CASM/Cancer IT +# +# This file is part of alleleCount. +# +# alleleCount is free software: you can redistribute it and/or modify it under +# the terms of the GNU Affero General Public License as published by the Free +# Software Foundation; either version 3 of the License, or (at your option) any +# later version. +# +# This program is distributed in the hope that it will be useful, but WITHOUT +# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +# FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more +# details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . +##########LICENCE########## + +use strict; +use Test::More; +use Test::Fatal; +use Const::Fast qw(const); +use File::Temp qw(tempdir); +use File::Slurp; + +use Sanger::CGP::AlleleCount::ToJson; + + +const my $MOD => 'Sanger::CGP::AlleleCount::ToJson'; +const my $EXP_JSON => '{"rs2369898":"CT"}'; + +const my @DEFAULT_RESULT => ( ['#CHR',qw(POS Count_A Count_C Count_G Count_T Good_depth)], + [qw(22 16165776 1 12 0 0 13)] + ); +const my @PBQ20_RESULT => ( ['#CHR',qw(POS Count_A Count_C Count_G Count_T Good_depth)], + [qw(22 16165776 2 17 0 0 19)] + ); + +const my @MAPQ0_RESULT => ( ['#CHR',qw(POS Count_A Count_C Count_G Count_T Good_depth)], + [qw(22 16165776 1 17 0 0 18)] + ); + +const my @DEFAULT_RESULT_SNP6 => (['#CHR',qw(POS Count_Allele_A Count_Allele_B Good_depth)], + [qw(22 16165776 12 1 13)] + ); +const my @PBQ20_RESULT_SNP6 => (['#CHR',qw(POS Count_Allele_A Count_Allele_B Good_depth)], + [qw(22 16165776 17 2 19)] + ); +const my @MAPQ0_RESULT_SNP6 => (['#CHR',qw(POS Count_Allele_A Count_Allele_B Good_depth)], + [qw(22 16165776 17 1 18)] + ); + +use FindBin qw($Bin); +my $data_root = "$Bin/../../testData"; + +my $loci = "$data_root/test_loci.txt"; +my $ac_output = "$data_root/test_ac_out.txt"; + +my $obj = new_ok($MOD); # no options + +is($EXP_JSON, Sanger::CGP::AlleleCount::ToJson::alleleCountToJson($ac_output, $loci), "Check conversion to JSON"); + +done_testing(); + diff --git a/testData/test_ac_out.txt b/testData/test_ac_out.txt new file mode 100644 index 0000000..6ae7752 --- /dev/null +++ b/testData/test_ac_out.txt @@ -0,0 +1 @@ +chr14 96772879 0 31 0 31 62 diff --git a/testData/test_loci.txt b/testData/test_loci.txt new file mode 100644 index 0000000..eecb7f5 --- /dev/null +++ b/testData/test_loci.txt @@ -0,0 +1 @@ +chr14 96772879 rs2369898