Skip to content

Commit

Permalink
Merge branch 'release/v3.5.0'
Browse files Browse the repository at this point in the history
  • Loading branch information
keiranmraine committed Aug 7, 2019
2 parents 4a0c618 + 7eb5a48 commit 57915c0
Show file tree
Hide file tree
Showing 6 changed files with 80 additions and 22 deletions.
6 changes: 6 additions & 0 deletions CHANGES.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,11 @@
# CHANGES

## 3.5.0

* Loads vagrent cache into IntervalTree to speed up processing by:
* reducing redundant/random disk access
* reducing repeated decompression of same data when events are local to each other

## 3.4.0

* Add Dockerfile and supporting scripts
Expand Down
2 changes: 1 addition & 1 deletion Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ FROM ubuntu:16.04

LABEL maintainer="[email protected]" \
uk.ac.sanger.cgp="Cancer, Ageing and Somatic Mutation, Wellcome Trust Sanger Institute" \
version="3.4.0" \
version="3.5.0" \
description="VAGrENT genome annotation docker"

RUN apt-get -yq update
Expand Down
1 change: 1 addition & 0 deletions Makefile.PL
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,7 @@ WriteMakefile(
'Sort::Key' => '1.33',
'TAP::Harness' => '3.33',
'Try::Tiny' => '0.22',
'Set::IntervalTree' => '0.12',
}
);

Expand Down
8 changes: 6 additions & 2 deletions bin/AnnotateVcf.pl
Original file line number Diff line number Diff line change
Expand Up @@ -404,9 +404,11 @@ sub make_process_log {

sub get_annotator {
my $options = shift;
my $sorted = $options->{'sorted'};
$sorted = 1 if(-e $options->{'input'}.'.tbi');

# creating a FileBasedTranscriptSource from the cache file given in the options
my $ts = Sanger::CGP::Vagrent::TranscriptSource::FileBasedTranscriptSource->new('cache' => $options->{'cache'});
my $ts = Sanger::CGP::Vagrent::TranscriptSource::FileBasedTranscriptSource->new('cache' => $options->{'cache'}, 'sorted' => $sorted);

# creating an AnnotatorCollection
my $annotator = Sanger::CGP::Vagrent::Annotators::AnnotatorCollection->new(
Expand Down Expand Up @@ -473,7 +475,7 @@ sub option_builder {
'p|process=n' => \$opts{'process'},
'sp|species=s' => \$opts{'species'},
'as|assembly=s' => \$opts{'assembly'},

'u|sorted' => \$opts{'sorted'},
);

pod2usage() if($opts{'help'});
Expand Down Expand Up @@ -537,4 +539,6 @@ =head1 SYNOPSIS
--tabix (-t) bgzip and tabix index the output file (will generate the .gz version of the -o option)
--sorted (-u) Input is sorted - lower memory requirement, automatic if *.tbi found
=cut
2 changes: 1 addition & 1 deletion lib/Sanger/CGP/Vagrent.pm
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ use strict;
use Const::Fast qw(const);

use base 'Exporter';
our $VERSION = '3.4.0';
our $VERSION = '3.5.0';
our @EXPORT = qw($VERSION);

1;
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ use Const::Fast qw(const);

use Bio::DB::HTS;
use Bio::DB::HTS::Tabix;
use Set::IntervalTree;

use Sanger::CGP::Vagrent::Data::Transcript;
use Sanger::CGP::Vagrent::Data::Exon;
Expand Down Expand Up @@ -118,37 +119,83 @@ sub _generateLocationString {
return $gr->getChr.':'.$gr->getMinPos.'-'.$gr->getMaxPos;
}

# Loads every cache record for one chromosome into a Set::IntervalTree and
# stores it under $self->{_cache_iTree}->{$chr}.
#
# Args:
#   $chr - chromosome/contig name, passed unchanged to the tabix query.
#
# Returns: 1 in all cases (nothing to do, tree built, or no records found).
#
# Notes:
#   * If a tree for $chr is already held, nothing is reloaded.
#   * When $self->{_sorted} is true the whole tree cache is replaced, so only
#     the tree for the chromosome being loaded is retained.
#   * Records sharing the same start/end are collated into one array ref and
#     inserted as a single tree entry.
#   * If the tabix query yields no iterator (presumably the chromosome is not
#     in the index - confirm against Bio::DB::HTS::Tabix), an empty tree is
#     stored so callers can still call fetch() and the query is not repeated.
sub _tabix_to_interval_tree {
  my ($self, $chr) = @_;
  return 1 if defined $self->{_cache_iTree}->{$chr};

  my $full_tree = {};
  if ($self->{_sorted}) {
    # sorted input: drop trees of previously visited chromosomes
    $self->{_cache_iTree} = $full_tree;
  }
  else {
    $full_tree = $self->{_cache_iTree};
  }

  $self->{_cache_tbx} = Bio::DB::HTS::Tabix->new(filename => $self->{_cache}) unless defined $self->{_cache_tbx};

  my $chr_tree = Set::IntervalTree->new();
  my $iter = $self->{_cache_tbx}->query_full($chr);
  if(defined $iter) {
    # start => { end => [serialised objects] }; the coordinates are kept as
    # hash keys rather than packed into a 'chr:start:end' string, so contig
    # names containing ':' cannot corrupt them.
    my %collated;
    while(my $line = $iter->next) {
      my ($start, $end, $object) = (split /\t/, $line)[1,2,6];
      # +1 on end to convert to 1 based, the tabix module would normally handle this
      push @{$collated{int $start}{int($end) + 1}}, $object;
    }

    for my $start (keys %collated) {
      # delete as we go so the collated copy is released while the tree grows
      my $ends = delete $collated{$start};
      for my $end (keys %{$ends}) {
        $chr_tree->insert($ends->{$end}, $start, $end);
      }
    }
  }

  $full_tree->{$chr} = $chr_tree;
  return 1;
}

sub _getTranscriptsFromCache {
my ($self,$gp) = @_;
$self->{_cache_tbx} = Bio::DB::HTS::Tabix->new(filename => $self->{_cache}) unless defined $self->{_cache_tbx};
my $chr = $gp->getChr();
$self->_tabix_to_interval_tree($chr);
my $min;
my $max = $gp->getMaxPos + $SEARCH_BUFFER;
if($gp->getMinPos() < $SEARCH_BUFFER){
$min = 0;
} else {
$min = ($gp->getMinPos - $SEARCH_BUFFER);
}
my $iter = $self->{_cache_tbx}->query_full($gp->getChr(),$min,$max);
return undef unless defined $iter;
my $out = undef;
while(my $ret = $iter->next){
my $raw = (split("\t",$ret))[6];
my $VAR1;
eval $raw;
$VAR1->{_cdnaseq} = $self->_getTranscriptSeq($VAR1);
push @$out, $VAR1;
my @data = ();
@data = @{$self->{_cache_iTree}->{$chr}->fetch($min,$max)};
return undef unless(@data);
my @out;
for my $overlap(@data){
for my $item(@{$overlap}) {
unless(ref $item) { # turn string into object
my $VAR1;
eval $item;
$VAR1->{_cdnaseq} = $self->_getTranscriptSeq($VAR1);
$item = $VAR1;
}
push @out, $item;
}
}
return $out;
@out = sort _sort_itree @out;
return \@out;
}

sub _init {
my $self = shift;
my %vars = @_;
foreach my $k(keys(%vars)){
if($k eq 'cache'){
$self->_setCacheFile($vars{$k});
}
# Comparator for sort(): orders transcript hashes by ascending
# _genomicminpos, breaking ties on ascending _genomicmaxpos.
# Reads the operands from sort's $a and $b; returns -1, 0 or 1.
sub _sort_itree {
  my $by_min = $a->{_genomicminpos} <=> $b->{_genomicminpos};
  return $by_min if $by_min;
  my $by_max = $a->{_genomicmaxpos} <=> $b->{_genomicmaxpos};
  return $by_max if $by_max;
  return 0;
}

# Initialises the instance from the key/value arguments it is given.
#   cache  - passed to _setCacheFile()
#   sorted - stored as-is in $self->{_sorted}
sub _init {
  my $self = shift;
  my %args = @_;
  my ($cache_file, $is_sorted) = @args{qw(cache sorted)};
  $self->_setCacheFile($cache_file);
  $self->{_sorted} = $is_sorted;
}

sub _setCacheFile {
Expand Down

0 comments on commit 57915c0

Please sign in to comment.