-
Notifications
You must be signed in to change notification settings - Fork 8
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #70 from cancerit/feature/convert_to_json
Feature/convert to json
- Loading branch information
Showing
7 changed files
with
303 additions
and
3 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,99 @@ | ||
#!/usr/bin/perl | ||
|
||
##########LICENCE########## | ||
# Copyright (c) 2014-2021 Genome Research Ltd. | ||
# | ||
# Author: CASM/Cancer IT <[email protected]> | ||
# | ||
# This file is part of alleleCount. | ||
# | ||
# alleleCount is free software: you can redistribute it and/or modify it under | ||
# the terms of the GNU Affero General Public License as published by the Free | ||
# Software Foundation; either version 3 of the License, or (at your option) any | ||
# later version. | ||
# | ||
# This program is distributed in the hope that it will be useful, but WITHOUT | ||
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS | ||
# FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more | ||
# details. | ||
# | ||
# You should have received a copy of the GNU Affero General Public License | ||
# along with this program. If not, see <http://www.gnu.org/licenses/>. | ||
##########LICENCE########## | ||
|
||
use strict; | ||
use Carp; | ||
use English qw( -no_match_vars ); | ||
use warnings FATAL => 'all'; | ||
|
||
use Getopt::Long 'GetOptions'; | ||
use Pod::Usage; | ||
|
||
use Sanger::CGP::AlleleCount; | ||
use Sanger::CGP::AlleleCount::ToJson; | ||
|
||
{ | ||
my $options = option_builder(); | ||
$options->{'o'} = '/dev/stdout' unless(defined $options->{'o'}); | ||
run($options); | ||
} | ||
|
||
sub run { | ||
my ($options) = @_; | ||
my $json_string = Sanger::CGP::AlleleCount::ToJson::alleleCountToJson($options->{'a'}, $options->{'l'}); | ||
my $OUT; | ||
if($options->{'o'}){ | ||
open($OUT, '>', $options->{'o'}) or croak("Error opening file for output: $!"); | ||
} | ||
print $OUT "$json_string"; | ||
if($options->{'o'}){ | ||
close($OUT) or croak("Error closing output file for JSON conversion: $!"); | ||
} | ||
} | ||
|
||
|
||
sub option_builder { | ||
my ($factory) = @_; | ||
|
||
my %opts; | ||
|
||
&GetOptions ( | ||
'h|help' => \$opts{'h'}, | ||
'l|locus-file=s' => \$opts{'l'}, | ||
'a|allelecount-file=s' => \$opts{'a'}, | ||
'o|output-file:s' => \$opts{'o'}, | ||
'v|version' => \$opts{'v'}, | ||
); | ||
|
||
pod2usage(0) if($opts{'h'}); | ||
if($opts{'v'}){ | ||
print Sanger::CGP::AlleleCount->VERSION."\n"; | ||
exit; | ||
} | ||
pod2usage(1) if(!$opts{'l'} || !$opts{'a'}); | ||
croak("Locus file ".$opts{'l'}." does not exist.") if(! -e $opts{'l'}); | ||
croak("Allele count output file ".$opts{'a'}." does not exist.") if(! -e $opts{'a'}); | ||
return \%opts; | ||
} | ||
|
||
__END__ | ||
=head1 NAME | ||
alleleCounterToJson.pl - Generate JSON format file from the tab seperated format | ||
=head1 SYNOPSIS | ||
alleleCounterToJson.pl | ||
Required: | ||
-locus-file -l File containing SNP positions used for allelecounter | ||
-allelecount-file -a Allelecounter output file | ||
Optional: | ||
-output-file -o Output file (default: stdout) | ||
-help -h This message | ||
-version -v Version number | ||
=cut |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,130 @@ | ||
package Sanger::CGP::AlleleCount::ToJson; | ||
|
||
##########LICENCE########## | ||
# Copyright (c) 2014-2021 Genome Research Ltd. | ||
# | ||
# Author: CASM/Cancer IT <[email protected]> | ||
# | ||
# This file is part of alleleCount. | ||
# | ||
# alleleCount is free software: you can redistribute it and/or modify it under | ||
# the terms of the GNU Affero General Public License as published by the Free | ||
# Software Foundation; either version 3 of the License, or (at your option) any | ||
# later version. | ||
# | ||
# This program is distributed in the hope that it will be useful, but WITHOUT | ||
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS | ||
# FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more | ||
# details. | ||
# | ||
# You should have received a copy of the GNU Affero General Public License | ||
# along with this program. If not, see <http://www.gnu.org/licenses/>. | ||
##########LICENCE########## | ||
|
||
|
||
use strict; | ||
|
||
use Carp; | ||
use English qw( -no_match_vars ); | ||
use warnings FATAL => 'all'; | ||
|
||
use JSON; | ||
use IO::Zlib; | ||
|
||
use Sanger::CGP::AlleleCount; | ||
|
||
use Const::Fast qw(const); | ||
|
||
const my %ALLELECOUNT_CONST => ( | ||
MIN_PROP => 0.21, | ||
MIN_READS => 5, | ||
); | ||
|
||
sub allelecount_val { | ||
my ($item) = @_; | ||
return $ALLELECOUNT_CONST{$item}; | ||
} | ||
|
||
=item new | ||
Null constructor | ||
=cut | ||
|
||
sub new { | ||
my ($class) = @_; | ||
my $self = { }; | ||
bless $self, $class; | ||
return $self; | ||
} | ||
|
||
=item alleleCountToJson | ||
Convert allele count file result format to JSON | ||
=cut | ||
|
||
sub alleleCountToJson{ | ||
my ($countsfile, $snpsfile) = @_; | ||
my $tmp; | ||
my $SNPS; | ||
my $snp_list; | ||
|
||
#TODO open gzipped file.... | ||
open($SNPS, '<', $snpsfile) or croak("Error opening allele count locus file '$snpsfile' for JSON conversion: $!"); | ||
while(<$SNPS>){ | ||
my $line = $_; | ||
next if($line =~ m/^\s*#/); | ||
chomp($line); | ||
my ($chr,$pos,$name,undef) = split(/\s+/,$line); | ||
$snp_list->{$chr}->{$pos} = $name; | ||
} | ||
close($SNPS) or croak("Error closing allele count locus file '$snpsfile' for JSON conversion: $!"); | ||
|
||
my $fh = new IO::Zlib; | ||
if($fh->open($countsfile, "rb")){ | ||
while(<$fh>){ | ||
my $line = $_; | ||
next if($line =~ m/^\s*#/); | ||
chomp($line); | ||
my ($chr,$pos,$a,$c,$g,$t,$good) = split(/\s+/,$line); | ||
my $nom = $snp_list->{$chr}->{$pos}; | ||
my $genotype = _calculate_genotype_from_allele_count($a,$c,$g,$t,$good); | ||
$tmp->{$nom} = $genotype; | ||
} | ||
$fh->close; | ||
}else{ | ||
croak("Error trying to open file for SNP locus loading '$countsfile': $!\n"); | ||
} | ||
my $jsonstr = encode_json($tmp); | ||
return $jsonstr; | ||
} | ||
|
||
sub _calculate_genotype_from_allele_count{ | ||
my ($a_a,$a_c,$a_g,$a_t,$good) = @_; | ||
my $geno; | ||
return q{.} if($good < allelecount_val('MIN_READS')); | ||
|
||
my @counts; | ||
push @counts, ['A', $a_a] if($a_a/$good >= allelecount_val('MIN_PROP')); | ||
push @counts, ['C', $a_c] if($a_c/$good >= allelecount_val('MIN_PROP')); | ||
push @counts, ['G', $a_g] if($a_g/$good >= allelecount_val('MIN_PROP')); | ||
push @counts, ['T', $a_t] if($a_t/$good >= allelecount_val('MIN_PROP')); | ||
|
||
my $entries = scalar @counts; | ||
if($entries == 0) { | ||
$geno = q{.}; | ||
} | ||
elsif($entries == 1) { | ||
$geno = $counts[0][0].$counts[0][0]; | ||
} | ||
else { | ||
@counts = sort {$b->[1]<=>$a->[1]} @counts; # reverse sorts by the counts | ||
$geno = join(q{}, sort {$a cmp $b} $counts[0][0], $counts[1][0]); # then sort the alleles into the string | ||
} | ||
croak("Error calculating genotype from allele counts $a_a,$a_c,$a_g,$a_t,$good.\n") if((length $geno)>2 || (length $geno) == 0); | ||
return $geno; | ||
|
||
} | ||
|
||
1; |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,67 @@ | ||
##########LICENCE########## | ||
# Copyright (c) 2014-2020 Genome Research Ltd. | ||
# | ||
# Author: CASM/Cancer IT <[email protected]> | ||
# | ||
# This file is part of alleleCount. | ||
# | ||
# alleleCount is free software: you can redistribute it and/or modify it under | ||
# the terms of the GNU Affero General Public License as published by the Free | ||
# Software Foundation; either version 3 of the License, or (at your option) any | ||
# later version. | ||
# | ||
# This program is distributed in the hope that it will be useful, but WITHOUT | ||
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS | ||
# FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more | ||
# details. | ||
# | ||
# You should have received a copy of the GNU Affero General Public License | ||
# along with this program. If not, see <http://www.gnu.org/licenses/>. | ||
##########LICENCE########## | ||
|
||
use strict; | ||
use Test::More; | ||
use Test::Fatal; | ||
use Const::Fast qw(const); | ||
use File::Temp qw(tempdir); | ||
use File::Slurp; | ||
|
||
use Sanger::CGP::AlleleCount::ToJson; | ||
|
||
|
||
const my $MOD => 'Sanger::CGP::AlleleCount::ToJson'; | ||
const my $EXP_JSON => '{"rs2369898":"CT"}'; | ||
|
||
const my @DEFAULT_RESULT => ( ['#CHR',qw(POS Count_A Count_C Count_G Count_T Good_depth)], | ||
[qw(22 16165776 1 12 0 0 13)] | ||
); | ||
const my @PBQ20_RESULT => ( ['#CHR',qw(POS Count_A Count_C Count_G Count_T Good_depth)], | ||
[qw(22 16165776 2 17 0 0 19)] | ||
); | ||
|
||
const my @MAPQ0_RESULT => ( ['#CHR',qw(POS Count_A Count_C Count_G Count_T Good_depth)], | ||
[qw(22 16165776 1 17 0 0 18)] | ||
); | ||
|
||
const my @DEFAULT_RESULT_SNP6 => (['#CHR',qw(POS Count_Allele_A Count_Allele_B Good_depth)], | ||
[qw(22 16165776 12 1 13)] | ||
); | ||
const my @PBQ20_RESULT_SNP6 => (['#CHR',qw(POS Count_Allele_A Count_Allele_B Good_depth)], | ||
[qw(22 16165776 17 2 19)] | ||
); | ||
const my @MAPQ0_RESULT_SNP6 => (['#CHR',qw(POS Count_Allele_A Count_Allele_B Good_depth)], | ||
[qw(22 16165776 17 1 18)] | ||
); | ||
|
||
use FindBin qw($Bin); | ||
my $data_root = "$Bin/../../testData"; | ||
|
||
my $loci = "$data_root/test_loci.txt"; | ||
my $ac_output = "$data_root/test_ac_out.txt"; | ||
|
||
my $obj = new_ok($MOD); # no options | ||
|
||
is($EXP_JSON, Sanger::CGP::AlleleCount::ToJson::alleleCountToJson($ac_output, $loci), "Check conversion to JSON"); | ||
|
||
done_testing(); | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
chr14 96772879 0 31 0 31 62 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
chr14 96772879 rs2369898 |