Review notes (free text ahead of the first "diff --git" header):
- Line breaks and indentation were reconstructed from a whitespace-collapsed
  copy of this patch. Leading whitespace and blank-line placement are
  best-effort and should be confirmed against the tree before applying.
- convert_to_vcf hunk: the breakend branch compared $alt, which is already
  wrapped as "<...>", against "BND", so it could never match. It now tests
  $abbrev and keeps "<BND>" when the allele string has no "/"-separated ALT.
  The new-side line count of that hunk was updated to match (16 -> 22); the
  post-image blob id on the VEP.pm "index" line predates this change.

diff --git a/.travis.yml b/.travis.yml
index d7a012e8d6..a787b22b75 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -32,6 +32,7 @@ before_install:
 - git clone --branch $ENSEMBL_BRANCH --depth 1 https://github.com/Ensembl/ensembl.git
 - git clone --branch main --depth 1 https://github.com/Ensembl/ensembl-hive.git
 - git clone --branch $ENSEMBL_BRANCH --depth 1 https://github.com/Ensembl/ensembl-io.git
+- git clone --branch $ENSEMBL_BRANCH --depth 1 https://github.com/Ensembl/ensembl-vep.git
 - git clone --branch $ENSEMBL_BRANCH --depth 1 https://github.com/Ensembl/ensembl-funcgen.git
 - git clone --branch release-1-6-924 --depth 1 https://github.com/bioperl/bioperl-live.git
 - git clone --branch 1.9 --depth 1 https://github.com/samtools/htslib.git
diff --git a/modules/Bio/EnsEMBL/Variation/Utils/Config.pm b/modules/Bio/EnsEMBL/Variation/Utils/Config.pm
index 06175faed5..4fdf41f816 100644
--- a/modules/Bio/EnsEMBL/Variation/Utils/Config.pm
+++ b/modules/Bio/EnsEMBL/Variation/Utils/Config.pm
@@ -31,6 +31,7 @@ our @EXPORT_OK = qw(
     $OVERLAP_CONSEQUENCE_CLASS
     $MAX_ATTRIB_CODE_LENGTH
     %SO_ACC_MAPPER
+    %SO_TERMS
 );
 
 our $OVERLAP_CONSEQUENCE_CLASS = 'Bio::EnsEMBL::Variation::OverlapConsequence';
@@ -1382,5 +1383,29 @@ our %SO_ACC_MAPPER = (
   }
 );
 
+# Used to convert symbolic alternative allele to SO term
+our %SO_TERMS = (
+  INS       => 'insertion',
+  INS_ME    => 'mobile_element_insertion',
+  INS_ALU   => 'Alu_insertion',
+  INS_HERV  => 'HERV_insertion',
+  INS_LINE1 => 'LINE1_insertion',
+  INS_SVA   => 'SVA_insertion',
+
+  DEL       => 'deletion',
+  DEL_ME    => 'mobile_element_deletion',
+  DEL_ALU   => 'Alu_deletion',
+  DEL_HERV  => 'HERV_deletion',
+  DEL_LINE1 => 'LINE1_deletion',
+  DEL_SVA   => 'SVA_deletion',
+
+  TREP      => 'tandem_repeat',
+  TDUP      => 'tandem_duplication',
+  DUP       => 'duplication',
+  CNV       => 'copy_number_variation',
+  INV       => 'inversion',
+  BND       => 'chromosome_breakpoint'
+);
+
 1;
 
diff --git a/modules/Bio/EnsEMBL/Variation/Utils/VEP.pm b/modules/Bio/EnsEMBL/Variation/Utils/VEP.pm
index a91f2687c2..e467994c96 100644
--- a/modules/Bio/EnsEMBL/Variation/Utils/VEP.pm
+++ b/modules/Bio/EnsEMBL/Variation/Utils/VEP.pm
@@ -86,11 +86,13 @@ use Bio::EnsEMBL::Variation::DBSQL::VariationFeatureAdaptor;
 use Bio::EnsEMBL::Variation::Utils::VariationEffect qw(overlap);
 use Bio::EnsEMBL::Utils::Sequence qw(reverse_comp);
 use Bio::EnsEMBL::Variation::Utils::Sequence qw(unambiguity_code SO_variation_class);
+use Bio::EnsEMBL::Variation::Utils::Config qw(%SO_TERMS);
 use Bio::EnsEMBL::Variation::Utils::EnsEMBL2GFF3;
 use Bio::EnsEMBL::Variation::StructuralVariationFeature;
 use Bio::EnsEMBL::Variation::DBSQL::StructuralVariationFeatureAdaptor;
 use Bio::EnsEMBL::Variation::TranscriptStructuralVariation;
 use Bio::EnsEMBL::Variation::Source;
+use Bio::EnsEMBL::VEP::Parser qw(get_SO_term &check_format);
 
 # we need to manually include all these modules for caching to work
 use Bio::EnsEMBL::CoordSystem;
@@ -113,8 +115,6 @@ use vars qw(@ISA @EXPORT_OK);
 @ISA = qw(Exporter);
 
 @EXPORT_OK = qw(
-    &_valid_region_regex
-    &check_format
     &detect_format
     &parse_line
     &vf_to_consequences
@@ -384,68 +384,6 @@ sub parse_line {
     return $vfs;
 }
 
-sub _valid_region_regex {
-  return qr/^([^:]+):(\d+)-(\d+)(:[-\+]?1)?[\/:]([a-z0-9:]{3,}|[ACGTN-]+)$/i;
-}
-
-# sub-routine to check format of string
-sub check_format {
-    my @line = @_;
-    my $format;
-
-    # any changes here must be copied to the JavaScript file to run instant VEP:
-    # public-plugins/tools/htdocs/components/20_VEPForm.js
-
-    # region: chr21:10-10:1/A
-    if ( scalar @line == 1 && $line[0] =~ &_valid_region_regex() ) {
-        $format = 'region';
-    }
-
-    # SPDI: NC_000016.10:68684738:G:A
-    elsif ( scalar @line == 1 && $line[0] =~ /^(.*?\:){2}([^\:]+|)$/i ) {
-        $format = 'spdi';
-    }
-
-    # CAID: CA9985736
-    elsif ( scalar @line == 1 && $line[0] =~ /^CA\d{1,}$/i ) {
-        $format = 'caid';
-    }
-
-    # HGVS: ENST00000285667.3:c.1047_1048insC
-    elsif (
-        scalar @line == 1 &&
-        $line[0] =~ /^([^\:]+)\:.*?([cgmrp]?)\.?([\*\-0-9]+.*)$/i
-    ) {
-        $format = 'hgvs';
-    }
-
-    # variant identifier: rs123456
-    elsif ( scalar @line == 1 ) {
-        $format = 'id';
-    }
-
-    # VCF: 20 14370 rs6054257 G A 29 0 NS=58;DP=258;AF=0.786;DB;H2 GT:GQ:DP:HQ
-    elsif (
-        $line[0] =~ /(chr)?\w+/ &&
-        $line[1] =~ /^\d+$/ &&
-        exists $line[3] && $line[3] =~ /^[ACGTN\-\.]+$/i &&
-        exists $line[4]
-    ) {
-        $format = 'vcf';
-    }
-
-    # ensembl: 20 14370 14370 A/G +
-    elsif (
-        $line[0] =~ /\w+/ &&
-        $line[1] =~ /^\d+$/ &&
-        exists $line[2] && $line[2] =~ /^\d+$/ &&
-        exists $line[3] && $line[3] =~ /([a-z]{2,})|([ACGTN-]+\/[ACGTN-]+)/i
-    ) {
-        $format = 'ensembl';
-    }
-    return $format;
-}
-
 # sub-routine to detect format of input
 sub detect_format {
     my $line = shift;
@@ -478,12 +416,7 @@ sub parse_ensembl {
         my $so_term;
 
         # convert to SO term
-        my %terms = (
-            INS => 'insertion',
-            DEL => 'deletion',
-            TDUP => 'tandem_duplication',
-            DUP => 'duplication'
-        );
+        my %terms = %SO_TERMS;
 
         $so_term = defined $terms{$allele_string} ? $terms{$allele_string} : $allele_string;
 
@@ -631,14 +564,7 @@ sub parse_vcf {
 
         if(defined($type)) {
             # convert to SO term
-            my %terms = (
-                INS => 'insertion',
-                DEL => 'deletion',
-                TDUP => 'tandem_duplication',
-                DUP => 'duplication'
-            );
-
-            $so_term = defined $terms{$type} ? $terms{$type} : $type;
+            $so_term = get_SO_term(undef, $type) || $type;
         }
 
         my $svf = Bio::EnsEMBL::Variation::StructuralVariationFeature->new_fast({
@@ -933,14 +859,22 @@ sub convert_to_vcf {
     else {
 
         # convert to SO term
-        my %terms = (
-            'insertion' => 'INS',
-            'deletion' => 'DEL',
-            'tandem_duplication' => 'TDUP',
-            'duplication' => 'DUP'
-        );
+        my %terms = reverse %SO_TERMS;
+        my $abbrev = $terms{$vf->class_SO_term} || $vf->class_SO_term;
+
+        $abbrev = "DUP:TANDEM" if $abbrev eq "TDUP";
+        $abbrev = "CNV:TR" if $abbrev eq "TREP";
+        $abbrev =~ s/_/:/ if $abbrev =~ /^(INS|DEL)_ME$/;
+        $abbrev =~ s/_/:ME:/ if $abbrev =~ /^(INS|DEL)_[A-Z0-9]+$/;
 
-        my $alt = '<'.($terms{$vf->class_SO_term} || $vf->class_SO_term).'>';
+        my $alt = '<'.$abbrev.'>';
+
+        # breakends take their ALT from the part of the allele string after the
+        # first "/"; test the bare abbreviation, as $alt is already wrapped in <>
+        if ($abbrev eq "BND") {
+            my $bnd_alt = ( split(/\//, $vf->allele_string, 2) )[1];
+            $alt = $bnd_alt if defined $bnd_alt;
+        }
 
         return [
             $vf->{chr} || $vf->seq_region_name,
diff --git a/travisci/harness.sh b/travisci/harness.sh
index f0a1573d8e..f490dd1c58 100755
--- a/travisci/harness.sh
+++ b/travisci/harness.sh
@@ -1,6 +1,6 @@
 #!/bin/bash
 
-export PERL5LIB=$PWD/bioperl-live:$PWD/ensembl-test/modules:$PWD/ensembl/modules:$PWD/ensembl-hive/modules:$PWD/modules:$PWD/scripts/import/:$PWD/ensembl-io/modules:$PWD/ensembl-funcgen/modules
+export PERL5LIB=$PWD/bioperl-live:$PWD/ensembl-test/modules:$PWD/ensembl/modules:$PWD/ensembl-hive/modules:$PWD/modules:$PWD/scripts/import/:$PWD/ensembl-io/modules:$PWD/ensembl-vep/modules:$PWD/ensembl-funcgen/modules
 
 export PATH=$PATH:$PWD/C_code:$PWD/htslib
 