Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Remove redundant symbolic ALT to SO term hash #1643

Open
wants to merge 17 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 16 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
101 changes: 74 additions & 27 deletions modules/Bio/EnsEMBL/VEP/Parser.pm
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@ use Bio::EnsEMBL::Utils::Scalar qw(assert_ref);
use Bio::EnsEMBL::Utils::Exception qw(throw warning);
use Bio::EnsEMBL::VEP::Utils qw(get_compressed_filehandle);
use Bio::EnsEMBL::Variation::Utils::Sequence qw(trim_sequences);
use Bio::EnsEMBL::Variation::Utils::VEP qw(&check_format);
use Bio::EnsEMBL::Variation::Utils::Config qw(%SO_TERMS);

use Bio::EnsEMBL::VEP::Parser::VCF;
use Bio::EnsEMBL::VEP::Parser::VEP_input;
Expand All @@ -82,7 +82,11 @@ use FileHandle;

use base qw(Exporter);

our @EXPORT_OK = qw(get_SO_term);
our @EXPORT_OK = qw(
get_SO_term
&check_format
&_valid_region_regex
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There is no need to export _valid_region_regex; it was only used within check_format in ensembl-variation (please confirm).

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

it is used in -

);

my %FORMAT_MAP = (
'vcf' => 'VCF',
Expand Down Expand Up @@ -351,6 +355,69 @@ sub delimiter {
}


# Builds the case-insensitive pattern that recognises a region-style variant
# string such as "chr21:10-10:1/A".
#
# Capture groups of a successful match:
#   1 - the text before the first colon
#   2 - the digits before the dash
#   3 - the digits after the dash
#   4 - optional ":1", ":-1" or ":+1" suffix (undef when absent)
#   5 - the trailing part after "/" or ":": either three or more characters
#       from [a-z0-9:] or a run of A, C, G, T, N and "-"
#
# Returns a compiled regular expression (qr// object).
sub _valid_region_regex {
  my $region_pattern = qr/^([^:]+):(\d+)-(\d+)(:[-\+]?1)?[\/:]([a-z0-9:]{3,}|[ACGTN-]+)$/i;

  return $region_pattern;
}

=head2 check_format

  Arg 1      : list @line - the fields of a single input line
  Example    : my $format = check_format('20', '14370', 'rs6054257', 'G', 'A');
  Description: Works out which input format a line is written in. A line
               with a single field is tested, in this order, as region,
               SPDI, CAID and HGVS, and is otherwise taken to be a variant
               identifier. A line with several fields is tested as VCF and
               then as Ensembl default format.
  Returntype : string - one of 'region', 'spdi', 'caid', 'hgvs', 'id',
               'vcf' or 'ensembl'; undef if no fields are given or the
               fields match neither multi-field format
  Exceptions : none

=cut

sub check_format {
  my @line = @_;
  my $format;

  # any changes here must be copied to the JavaScript file to run instant VEP:
  # public-plugins/tools/htdocs/components/20_VEPForm.js

  # nothing to classify: hand back the still-undefined format rather than
  # matching against missing fields, which only raises "uninitialized" warnings
  return $format unless @line;

  if ( scalar @line == 1 ) {

    # an undefined lone field is classified exactly like an empty string
    my $input = defined $line[0] ? $line[0] : '';

    # region: chr21:10-10:1/A
    if ( $input =~ _valid_region_regex() ) {
      $format = 'region';
    }

    # SPDI: NC_000016.10:68684738:G:A
    elsif ( $input =~ /^(.*?\:){2}([^\:]+|)$/i ) {
      $format = 'spdi';
    }

    # CAID: CA9985736
    elsif ( $input =~ /^CA\d{1,}$/i ) {
      $format = 'caid';
    }

    # HGVS: ENST00000285667.3:c.1047_1048insC
    elsif ( $input =~ /^([^\:]+)\:.*?([cgmrp]?)\.?([\*\-0-9]+.*)$/i ) {
      $format = 'hgvs';
    }

    # variant identifier: rs123456
    else {
      $format = 'id';
    }

    return $format;
  }

  # slicing past the end of @line yields undef, so absent fields are simply
  # undefined here and are rejected by the defined() guards below
  my ($seq_name, $position, $third, $fourth) = @line[0 .. 3];

  # both multi-field formats need a defined first field and an all-digit second
  my $has_position = defined $seq_name
                  && defined $position
                  && $position =~ /^\d+$/;

  # VCF: 20 14370 rs6054257 G A 29 0 NS=58;DP=258;AF=0.786;DB;H2 GT:GQ:DP:HQ
  if (
    $has_position &&
    $seq_name =~ /(chr)?\w+/ &&
    defined $fourth && $fourth =~ /^[ACGTN\-\.]+$/i &&
    scalar @line > 4    # a fifth field must be present; its value is not inspected
  ) {
    $format = 'vcf';
  }

  # ensembl: 20 14370 14370 A/G +
  elsif (
    $has_position &&
    $seq_name =~ /\w+/ &&
    defined $third && $third =~ /^\d+$/ &&
    defined $fourth && $fourth =~ /([a-z]{2,})|([ACGTN-]+\/[ACGTN-]+)/i
  ) {
    $format = 'ensembl';
  }

  return $format;
}


=head2 detect_format

Example : $format = $parser->detect_format();
Expand Down Expand Up @@ -659,7 +726,7 @@ sub validate_vf {

sub get_SO_term {
my $self = shift;
nakib103 marked this conversation as resolved.
Show resolved Hide resolved
my $type = shift || join(",", @{ $self->get_alternatives });
my $type = shift || ( $self->can("get_alternatives") ? join(",", @{ $self->get_alternatives }) : '');
my $abbrev;

my @mobile_elements = ("ALU", "HERV", "LINE1", "SVA");
Expand Down Expand Up @@ -697,33 +764,13 @@ sub get_SO_term {
$abbrev = $type;
}

my %terms = (
INS => 'insertion',
INS_ME => 'mobile_element_insertion',
INS_ALU => 'Alu_insertion',
INS_HERV => 'HERV_insertion',
INS_LINE1 => 'LINE1_insertion',
INS_SVA => 'SVA_insertion',

DEL => 'deletion',
DEL_ME => 'mobile_element_deletion',
DEL_ALU => 'Alu_deletion',
DEL_HERV => 'HERV_deletion',
DEL_LINE1 => 'LINE1_deletion',
DEL_SVA => 'SVA_deletion',

TREP => 'tandem_repeat',
TDUP => 'tandem_duplication',
DUP => 'duplication',
CNV => 'copy_number_variation',
INV => 'inversion',
BND => 'chromosome_breakpoint'
);
my %terms = %SO_TERMS;

my $res = $terms{$abbrev};
## unsupported SV types
if ($self->isa('Bio::EnsEMBL::VEP::Parser')) {
$self->skipped_variant_msg("$abbrev is not a supported structural variant type") unless $res;
## $self can be an empty hash from Bio::EnsEMBL::Variation::Utils::VEP::parse_vcf
if (%{ $self } && $self->isa('Bio::EnsEMBL::VEP::Parser')) {
$self->skipped_variant_msg("$abbrev type is not supported") unless $res;
}
return $res;
}
Expand Down
4 changes: 1 addition & 3 deletions modules/Bio/EnsEMBL/VEP/Parser/Region.pm
Original file line number Diff line number Diff line change
Expand Up @@ -86,8 +86,6 @@ use Bio::EnsEMBL::Utils::Scalar qw(assert_ref);
use Bio::EnsEMBL::Utils::Exception qw(throw warning);
use Bio::EnsEMBL::IO::ListBasedParser;

use Bio::EnsEMBL::Variation::Utils::VEP qw(&_valid_region_regex);

=head2 new

Arg 1 : hashref $args
Expand Down Expand Up @@ -168,7 +166,7 @@ sub create_VariationFeatures {

my $region = $parser->get_value();

return [] unless $region =~ &_valid_region_regex();
return [] unless $region =~ $self->_valid_region_regex();
my ($chr, $start, $end) = ($1, $2, $3);

my ($strand, $allele);
Expand Down
2 changes: 1 addition & 1 deletion t/Parser_VCF.t
Original file line number Diff line number Diff line change
Expand Up @@ -628,7 +628,7 @@ is_deeply($cvf, bless( {
'Bio::EnsEMBL::Variation::StructuralVariationFeature' ) , 'StructuralVariationFeature - CPX skipped');


like($tmp, qr/CPX is not a supported structural variant type/, 'StructuralVariationFeature - skip CPX warning');
like($tmp, qr/CPX type is not supported/, 'StructuralVariationFeature - skip CPX warning');

open(STDERR, ">&SAVE") or die "Can't restore STDERR\n";

Expand Down