diff --git a/.dockerignore b/.dockerignore index 733769c..035e922 100644 --- a/.dockerignore +++ b/.dockerignore @@ -6,7 +6,12 @@ /CHANGES.md /.gitignore /.git +/perl/blib +/pm_to_blib /perl/docs /perl/docs.tar.gz /python/env /install_tmp +/.circleci +/*.code-workspace +/tmp diff --git a/.gitignore b/.gitignore index 4203ca7..67462a3 100644 --- a/.gitignore +++ b/.gitignore @@ -21,3 +21,5 @@ /perl/pm_to_blib .idea/* /python/env +/tmp +*.code-workspace diff --git a/CHANGES.md b/CHANGES.md index bd42643..c4c9af7 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -1,188 +1,22 @@ # CHANGES -## 3.6.0 -- Addition of `FF019` and `FF020` flags -- New flag rule set `pulldownFfpeRulesFragment.lst` including FF019 and FF020 made - -## 3.5.0 - -- Update to core pindel algorithm to allow complex DI events to have longer inserted sequence than deleted - - Masking real events - -## 3.4.1 - -- Updated Dockerfile to use pcap-core 5.4.0 - htslib/samtools 1.11 - -## 3.4.0 - -- Updated Dockerfile to use pcap-core 5.2.2 -- Modified setup script to use build/\*.sh - -## 3.3.0 - -- I/O hardening, see [milestone 3](https://github.com/cancerit/cgpPindel/milestone/3) - -## 3.2.2 - -- Handle Input files that may have no reads at all, specifically an issue when generating a normal panel. - -## 3.2.1 - -- Added Dockerfile and docker documentation - -## 3.2.0 - -- Tabix search for high depth/excluded regions now performed in memory using IntervalTrees - - Reduces runtime of input step by ~50% - - Improved disk access profile - - Zero impact on results - -## 3.1.2 - -- 3.0.5 introduced species parsing bug causing single word species names to be invalid. - -## 3.1.1 - -- Fix regression - ability to cope with chromosomes with no events. 
- -## 3.1.0 - -- Incorporates updated pindel which improves sensitivity -- Internally interpret QCFAIL to determine if whole pair fails - -## 3.0.6 - -- Fixed version tag - -## 3.0.5 - -- Handles species names with spaces in it -- modified checks for species,assembly and checksum - -## 3.0.4 - -- Output bug for pindel BAM/CRAM corrected. When more than 1 chr in output files had no reads. - -## 3.0.3 - -- Changes to how germline filter determined resulted in dummy germline bed file not being generated as previously. -- This release reinstates the old behaviour. - -## 3.0.2 - -- Correct example rule files for \*Fragment.lst files to use FFnnn filter types - -## 3.0.1 - -- Update tabix calls to directly use query_full (solves GRCh38 contig name issues). - -## 3.0.0 - -- Germline bed file is now merged for adjacent regions (#31) -- More compressed intermediate files (#55) -- Change to `Const::Fast` where appropriate (#41) -- Removed TG VG from genotype. - - Readgroups are always variable, often 1 in data from last few years - - Not used by our filters. -- Supports BAM/CRAM inputs -- Output will be aligned with inputs - - bam vs cram - - bai vs csi -- Although ground work for csi input/output has been done `Bio::DB::HTS` doesn't support csi indexed input yet. - - Created our own fork at [`cancerit/Bio::DB::HTS`][cancerit-biodbhts] so that this could be enabled. - - You will need to install this manually or use one of our images for this functionallity. - - [dockstore-cgpwxs][ds-cgpwxs-git] - - [dockstore-cgpwxs][ds-cgpwgs-git] - - - -## 2.2.5 - -- Update tabix->query to tabix->query_full - -## 2.2.4 - -- Force sorting of FILTER field to make records easier to diff. -- Fix sorting of final VCF to handle events with same start better when using comparison tools - -## 2.2.3 - -Correct read sorting during collection of DI events. 
Caused some events to be split into many and -others to be missed (Thanks to @liangkaiye for patch) - -## 2.2.3 - -Correct read sorting during collection of DI events. Caused some events to be split into many and -others to be missed (Thanks to @liangkaiye for patch) - -## 2.2.2 - -Correction to sorting of VCF files - -## 2.2.0 - -Reduces the amount of temporary space required and overall I/O - -To process 40 million readpairs (40x Tumour + 40x Normal, chr21, 100bp reads): - -Original time: - -``` -User time (seconds): 3553.88 -System time (seconds): 63.92 -Percent of CPU this job got: 159% -Elapsed (wall clock) time (h:mm:ss or m:ss): 37:51.63 -File system inputs: 64 -File system outputs: 1782080 -``` - -New time: - -``` -User time (seconds): 3572.21 -System time (seconds): 74.06 -Percent of CPU this job got: 167% -Elapsed (wall clock) time (h:mm:ss or m:ss): 36:15.01 -File system inputs: 0 -File system outputs: 1139128 -``` - -``` -Original peak size: 650MB - New peak size: 291MB -``` - -__~55%__ reduction in working space and about __40%__ fewer writes to the file system. 
- -Exactly the same results: - -```bash -$ diff old/f9c3bc8e-dbc4-1ed0-e040-11ac0d4803a9_vs_f9c3bc8e-dbc1-1ed0-e040-11ac0d4803a9.germline.bed new/f9c3bc8e-dbc4-1ed0-e040-11ac0d4803a9_vs_f9c3bc8e-dbc1-1ed0-e040-11ac0d4803a9.germline.bed - -$ diff_bams -a old/f9c3bc8e-dbc4-1ed0-e040-11ac0d4803a9_vs_f9c3bc8e-dbc1-1ed0-e040-11ac0d4803a9_wt.bam -b new/f9c3bc8e-dbc4-1ed0-e040-11ac0d4803a9_vs_f9c3bc8e-dbc1-1ed0-e040-11ac0d4803a9_wt.bam -Reference sequence count passed -Reference sequence order passed -Matching records: 194543 - -$ diff_bams -a old/f9c3bc8e-dbc4-1ed0-e040-11ac0d4803a9_vs_f9c3bc8e-dbc1-1ed0-e040-11ac0d4803a9_mt.bam -b new/f9c3bc8e-dbc4-1ed0-e040-11ac0d4803a9_vs_f9c3bc8e-dbc1-1ed0-e040-11ac0d4803a9_mt.bam -Reference sequence count passed -Reference sequence order passed -Matching records: 239737 - -$ /software/CGP/canpipe/live/bin/canpipe_live vcftools --gzvcf old/f9c3bc8e-dbc4-1ed0-e040-11ac0d4803a9_vs_f9c3bc8e-dbc1-1ed0-e040-11ac0d4803a9.flagged.vcf.gz --gzdiff new/f9c3bc8e-dbc4-1ed0-e040-11ac0d4803a9_vs_f9c3bc8e-dbc1-1ed0-e040-11ac0d4803a9.flagged.vcf.gz -... -Comparing individuals in VCF files... -N_combined_individuals: 2 -N_individuals_common_to_both_files: 2 -N_individuals_unique_to_file1: 0 -N_individuals_unique_to_file2: 0 -Comparing sites in VCF files... -Found 15321 SNPs common to both files. -Found 0 SNPs only in main file. -Found 0 SNPs only in second file. 
-After -``` - -[cancerit-biodbhts]: https://github.com/cancerit/Bio-DB-HTS/releases/tag/v2.10-rc1 -[ds-cgpwgs-git]: https://github.com/cancerit/dockstore-cgpwgs -[ds-cgpwxs-git]: https://github.com/cancerit/dockstore-cgpwxs +## 1.0.1 +- Added a file check after blat step + +## 1.0.0 +- Added filters to FlagVcf.pl to allow flagging of per-sample vcf outputs +- Fixed bugs in Implement.pm and pindelCohortVafSliceFill.pl +- Adds code to allow single sample processing with more accurate VAF calculations (via BLAT) +- Status of new scripts, "pre-release" indicates defaults and CLI may change: + - stable + - pindelCohort.pl + - pindel_blat_vaf.pl + - pre-release + - pindelCohort_to_vcf.pl + - pindel_vcfSortNsplit.pl + - pindelCohortMerge.pl + - pindelCohortVafFill.pl + - pindelCohortVafSplit.pl + - pindelCohortVafSliceFill.pl +- pinning to pindel v3.6.0 +- Switch license management to skywalking-eyes. diff --git a/Dockerfile b/Dockerfile index 2dee6db..b4b465b 100644 --- a/Dockerfile +++ b/Dockerfile @@ -6,7 +6,8 @@ USER root # ALL tool versions used by opt-build.sh # need to keep in sync with setup.sh ENV VER_CGPVCF="v2.2.1"\ - VER_VCFTOOLS="0.1.16" + VER_VCFTOOLS="0.1.16"\ + VER_BLAT="v385" # hadolint ignore=DL3008 RUN apt-get -yq update \ diff --git a/build/opt-build-local.sh b/build/opt-build-local.sh index 9cfb57b..a8d5ade 100755 --- a/build/opt-build-local.sh +++ b/build/opt-build-local.sh @@ -63,3 +63,5 @@ if [ ! -e $SETUP_DIR/cgpPindel.success ]; then cd $SETUP_DIR touch $SETUP_DIR/cgpPindel.success fi + +rm -rf $SETUP_DIR diff --git a/build/opt-build.sh b/build/opt-build.sh index 21b0ff4..79cbd4a 100644 --- a/build/opt-build.sh +++ b/build/opt-build.sh @@ -75,3 +75,10 @@ if [ ! -e $SETUP_DIR/cgpVcf.success ]; then rm -rf distro.* distro/* touch $SETUP_DIR/cgpVcf.success fi +set -x +if [ ! 
-e $SETUP_DIR/ucscTools.success ]; then + curl -sSL http://hgdownload.cse.ucsc.edu/admin/exe/linux.x86_64.${VER_BLAT}/blat/blat > $INST_PATH/bin/blat + curl -sSL http://hgdownload.cse.ucsc.edu/admin/exe/linux.x86_64.${VER_BLAT}/pslPretty > $INST_PATH/bin/pslPretty + chmod ugo+x $INST_PATH/bin/blat $INST_PATH/bin/pslPretty + touch $SETUP_DIR/ucscTools.success +fi diff --git a/perl/Makefile.PL b/perl/Makefile.PL index aa083b4..e49aa87 100755 --- a/perl/Makefile.PL +++ b/perl/Makefile.PL @@ -29,7 +29,6 @@ # 2009, 2010, 2011, 2012’. # - use ExtUtils::MakeMaker; WriteMakefile( @@ -42,7 +41,16 @@ WriteMakefile( bin/FlagVcf.pl bin/pindel_merge_vcf_bam.pl bin/pindel_np_from_vcf.pl - bin/pindel_germ_bed.pl)], + bin/pindel_germ_bed.pl + bin/pindelCohort.pl + bin/pindelCohort_to_vcf.pl + bin/pindel_vcfSortNsplit.pl + bin/pindel_blat_vaf.pl + bin/pindelCohortMerge.pl + bin/pindelCohortVafFill.pl + bin/pindelCohortVafSplit.pl + bin/pindelCohortVafSliceFill.pl + )], PREREQ_PM => { 'Const::Fast' => 0.014, 'Try::Tiny' => 0.19, diff --git a/perl/bin/pindel.pl b/perl/bin/pindel.pl index fe5f617..d0c491a 100755 --- a/perl/bin/pindel.pl +++ b/perl/bin/pindel.pl @@ -29,7 +29,6 @@ # 2009, 2010, 2011, 2012’. 
# - BEGIN { use Cwd qw(abs_path cwd); use File::Basename; @@ -41,9 +40,7 @@ BEGIN use autodie qw(:all); use File::Path qw(remove_tree make_path); -use Getopt::Long; use File::Spec; -use Pod::Usage qw(pod2usage); use List::Util qw(first); use Const::Fast qw(const); use File::Copy; @@ -101,125 +98,42 @@ sub cleanup { } sub setup { - my %opts; - pod2usage(-msg => "\nERROR: Option must be defined.\n", -verbose => 1, -output => \*STDERR) if(scalar @ARGV == 0); - $opts{'cmd'} = join " ", $0, @ARGV; - GetOptions( 'h|help' => \$opts{'h'}, - 'm|man' => \$opts{'m'}, - 'c|cpus=i' => \$opts{'threads'}, - 'r|reference=s' => \$opts{'reference'}, - 'o|outdir=s' => \$opts{'outdir'}, - 't|tumour=s' => \$opts{'tumour'}, - 'n|normal=s' => \$opts{'normal'}, - 'e|exclude=s' => \$opts{'exclude'}, - 'b|badloci=s' => \$opts{'badloci'}, - 'p|process=s' => \$opts{'process'}, - 'i|index=i' => \$opts{'index'}, - 'v|version' => \$opts{'version'}, - # these are specifically for pin2vcf - 'sp|species=s{0,}' => \@{$opts{'species'}}, - 'as|assembly=s' => \$opts{'assembly'}, - 'st|seqtype=s' => \$opts{'seqtype'}, - 'sg|skipgerm' => \$opts{'skipgerm'}, - # specifically for FlagVCF - 's|simrep=s' => \$opts{'simrep'}, - 'f|filters=s' => \$opts{'filters'}, - 'g|genes=s' => \$opts{'genes'}, - 'u|unmatched=s' => \$opts{'unmatched'}, - 'sf|softfil=s' => \$opts{'softfil'}, - 'l|limit=i' => \$opts{'limit'}, - 'd|debug' => \$opts{'debug'}, - 'a|apid:s' => \$opts{'apid'}, - ) or pod2usage(2); - - pod2usage(-verbose => 1) if(defined $opts{'h'}); - pod2usage(-verbose => 2) if(defined $opts{'m'}); - - if($opts{'version'}) { - print 'Version: ',Sanger::CGP::Pindel::Implement->VERSION,"\n"; - exit 0; - } - - PCAP::Cli::file_for_reading('reference', $opts{'reference'}); - PCAP::Cli::file_for_reading('tumour', $opts{'tumour'}); - PCAP::Cli::file_for_reading('normal', $opts{'normal'}); - PCAP::Cli::file_for_reading('simrep', $opts{'simrep'}); - PCAP::Cli::file_for_reading('filters', $opts{'filters'}); - 
PCAP::Cli::file_for_reading('genes', $opts{'genes'}); - PCAP::Cli::file_for_reading('unmatched', $opts{'unmatched'}); - PCAP::Cli::file_for_reading('softfil', $opts{'softfil'}) if(defined $opts{'softfil'}); - PCAP::Cli::out_dir_check('outdir', $opts{'outdir'}); - my $final_logs = File::Spec->catdir($opts{'outdir'}, 'logs'); - if(-e $final_logs) { - warn "NOTE: Presence of '$final_logs' directory suggests successful complete analysis, please delete to rerun\n"; - exit 0; - } - - - delete $opts{'process'} unless(defined $opts{'process'}); - delete $opts{'index'} unless(defined $opts{'index'}); - delete $opts{'limit'} unless(defined $opts{'limit'}); - - delete $opts{'exclude'} unless(defined $opts{'exclude'}); - delete $opts{'badloci'} unless(defined $opts{'badloci'}); - delete $opts{'apid'} unless(defined $opts{'apid'}); - - if(exists $opts{'process'}) { - PCAP::Cli::valid_process('process', $opts{'process'}, \@VALID_PROCESS); - if(exists $opts{'index'}) { - my @valid_seqs = Sanger::CGP::Pindel::Implement::valid_seqs(\%opts); + my $opts = Sanger::CGP::Pindel::Implement::shared_setup( + ['tumour', 'normal'], + {'t|tumour=s' => 'tumour', 'n|normal=s' => 'normal'} + ); + PCAP::Cli::file_for_reading('tumour', $opts->{'tumour'}); + PCAP::Cli::file_for_reading('normal', $opts->{'normal'}); + + if(exists $opts->{'process'}) { + PCAP::Cli::valid_process('process', $opts->{'process'}, \@VALID_PROCESS); + if(exists $opts->{'index'}) { + my @valid_seqs = Sanger::CGP::Pindel::Implement::valid_seqs($opts); my $refs = scalar @valid_seqs; - my $max = $index_max{$opts{'process'}}; + my $max = $index_max{$opts->{'process'}}; if($max==-1){ - if(exists $opts{'limit'}) { - $max = $opts{'limit'} > $refs ? $refs : $opts{'limit'}; + if(exists $opts->{'limit'}) { + $max = $opts->{'limit'} > $refs ? 
$refs : $opts->{'limit'}; } else { $max = $refs; } } - die "ERROR: based on reference and exclude option index must be between 1 and $refs\n" if($opts{'index'} < 1 || $opts{'index'} > $max); - PCAP::Cli::opt_requires_opts('index', \%opts, ['process']); + die "ERROR: based on reference and exclude option index must be between 1 and $refs\n" if($opts->{'index'} < 1 || $opts->{'index'} > $max); + PCAP::Cli::opt_requires_opts('index', $opts, ['process']); die "No max has been defined for this process type\n" if($max == 0); - PCAP::Cli::valid_index_by_factor('index', $opts{'index'}, $max, 1); + PCAP::Cli::valid_index_by_factor('index', $opts->{'index'}, $max, 1); } } - elsif(exists $opts{'index'}) { + elsif(exists $opts->{'index'}) { die "ERROR: -index cannot be defined without -process\n"; } - # now safe to apply defaults - $opts{'threads'} = 1 unless(defined $opts{'threads'}); - $opts{'seqtype'} = 'WGS' unless(defined $opts{'seqtype'}); - - - # make all things that appear to be paths complete (absolute not great if BAM/BAI in different locations) - for my $key (keys %opts) { - next unless( first {$key eq $_} qw(reference outdir tumour normal badloci simrep filters genes unmatched softfil)); - $opts{$key} = cwd().'/'.$opts{$key} if(defined $opts{$key} && -e $opts{$key} && $opts{$key} !~ m/^\//); - } - - my $tmpdir = File::Spec->catdir($opts{'outdir'}, 'tmpPindel'); - make_path($tmpdir) unless(-d $tmpdir); - my $progress = File::Spec->catdir($tmpdir, 'progress'); - make_path($progress) unless(-d $progress); - my $logs = File::Spec->catdir($tmpdir, 'logs'); - make_path($logs) unless(-d $logs); - - $opts{'tmp'} = $tmpdir; - - if(scalar @{$opts{'species'}} > 0 ){ - $opts{'species'}="@{$opts{'species'}}"; - } - else { - delete $opts{'species'}; - } - - return \%opts; + return $opts; } __END__ diff --git a/perl/bin/pindelCohort.pl b/perl/bin/pindelCohort.pl new file mode 100755 index 0000000..2b3802d --- /dev/null +++ b/perl/bin/pindelCohort.pl @@ -0,0 +1,221 @@ 
+#!/usr/bin/perl +# Copyright (c) 2014-2021 Genome Research Ltd +# +# Author: CASM/Cancer IT +# +# This file is part of cgpPindel. +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . +# +# 1. The usage of a range of years within a copyright statement contained within +# this distribution should be interpreted as being equivalent to a list of years +# including the first and last year specified and all consecutive years between +# them. For example, a copyright statement that reads ‘Copyright (c) 2005, 2007- +# 2009, 2011-2012’ should be interpreted as being identical to a statement that +# reads ‘Copyright (c) 2005, 2007, 2008, 2009, 2011, 2012’ and a copyright +# statement that reads ‘Copyright (c) 2005-2012’ should be interpreted as being +# identical to a statement that reads ‘Copyright (c) 2005, 2006, 2007, 2008, +# 2009, 2010, 2011, 2012’. 
+# +BEGIN { + use Cwd qw(abs_path cwd); + use File::Basename; + unshift (@INC,dirname(abs_path($0)).'/../lib'); +}; + +use strict; +use warnings FATAL => 'all'; +use autodie qw(:all); +use Const::Fast qw(const); +use File::Copy; +use File::Path qw(remove_tree make_path); + +use PCAP::Cli; +use Sanger::CGP::Pindel::Implement; + +const my %INDEX_MAX => ( + 'input' => 1, + 'pindel' => -1, + 'parse' => 1, # reads all pout, makes raw-vcf and splits to even sized for blat + 'blat' => -1, + 'concat' => 1, + ); +const my @VALID_PROCESS => keys %INDEX_MAX; + +{ + my $options = setup(); + my $threads = PCAP::Threaded->new($options->{'threads'}); + + # start processes here (in correct order obviously), add conditions for skipping based on 'process' option + if(!exists $options->{'process'} || $options->{'process'} eq 'input') { + Sanger::CGP::Pindel::Implement::input_cohort($options) + } + if(!exists $options->{'process'} || $options->{'process'} eq 'pindel') { + my $jobs = Sanger::CGP::Pindel::Implement::determine_jobs($options); # method still needed to populate info + $jobs = $options->{'limit'} if(exists $options->{'limit'} && defined $options->{'limit'}); + $threads->add_function('pindel', \&Sanger::CGP::Pindel::Implement::pindel); + $threads->run($jobs, 'pindel', $options); + } + if(!exists $options->{'process'} || $options->{'process'} eq 'parse') { + Sanger::CGP::Pindel::Implement::parse($options); + } + $options->{'split_files'} = Sanger::CGP::Pindel::Implement::split_files($options) unless(exists $options->{'split_files'}); + if(!exists $options->{'process'} || $options->{'process'} eq 'blat') { + my $jobs = scalar @{$options->{'split_files'}}; + $jobs = $options->{'limit'} if(exists $options->{'limit'} && defined $options->{'limit'}); + $threads->add_function('blat', \&Sanger::CGP::Pindel::Implement::blat); + $threads->run($jobs, 'blat', $options); + } + if(!exists $options->{'process'} || $options->{'process'} eq 'concat') { + 
Sanger::CGP::Pindel::Implement::concat($options); + cleanup($options) unless($options->{'debug'}); + } +} + +sub cleanup { + my $options = shift; + my $tmpdir = $options->{'tmp'}; + move(File::Spec->catdir($tmpdir, 'logs'), File::Spec->catdir($options->{'outdir'}, 'logs')) || die $!; + remove_tree $tmpdir if(-e $tmpdir); + return 0; +} + +sub index_check { # validate -process/-index against %INDEX_MAX; dies with a message on any invalid combination + my $opts = shift; + my $max_files = @{$opts->{'hts_files'}}; + if(exists $opts->{'process'}) { + PCAP::Cli::valid_process('process', $opts->{'process'}, \@VALID_PROCESS); + if(exists $opts->{'index'}) { + my @valid_seqs = Sanger::CGP::Pindel::Implement::valid_seqs($opts); + my $refs = scalar @valid_seqs; + + my $max = $INDEX_MAX{$opts->{'process'}}; + if($max == -1){ # -1 means the upper bound is data dependent and is resolved here + if(exists $opts->{'limit'}) { + $max = $opts->{'limit'} > $refs ? $refs : $opts->{'limit'}; + } else { + if($opts->{'process'} eq 'input') { + $max = $max_files; + } + elsif($opts->{'process'} eq 'blat') { + $opts->{'split_files'} = Sanger::CGP::Pindel::Implement::split_files($opts); + $max = scalar @{$opts->{'split_files'}}; + } + else { + $max = $refs; + } + } + } + if($opts->{'index'} < 1 || $opts->{'index'} > $max) { # both messages report $max, the bound actually enforced here + if($opts->{'process'} eq 'input') { + die "ERROR: based on number of inputs option -index must be between 1 and $max\n"; + } else { + die "ERROR: based on reference and exclude option -index must be between 1 and $max\n"; + } + } + PCAP::Cli::opt_requires_opts('index', $opts, ['process']); + die "No max has been defined for this process type\n" if($max == 0); + } + } + elsif(exists $opts->{'index'}) { + die "ERROR: -index cannot be defined without -process\n"; + } +} + +sub setup { + my $opts = Sanger::CGP::Pindel::Implement::shared_setup([],{}); + $opts->{pad} = 1 unless(exists $opts->{pad} && defined $opts->{pad}); + + # add hts_files from the remains of @ARGV + Sanger::CGP::Pindel::Implement::cohort_files($opts); + index_check($opts); + + return $opts; +} + +__END__ + +=head1 pindelCohort.pl + +Similar to pindel.pl
but processes 1 sample. References to BAM can be replaced with CRAM. + +=head1 SYNOPSIS + +pindelCohort.pl [options] sample1.bam + + Required parameters: + -outdir -o Folder to output result to. + -reference -r Path to reference genome file *.fa[.gz] + + Optional + -pad Multiples (>=1) of max readlength to pad blat target seq with [default 1] + -seqtype -st Sequencing protocol, expect all input to match [WGS] + -assembly -as Name of assembly in use + - when not available in BAM header SQ line. + -species -sp Species + - when not available in BAM header SQ line. + -exclude -e Exclude this list of ref sequences from processing, wildcard '%' + - comma separated, e.g. NC_007605,hs37d5,GL% + -badloci -b Tabix indexed BED file of locations to not accept as anchors or valid events + - e.g. hi-seq depth from UCSC + -cpus -c Number of cores to use. [1] + - recommend max 4 during 'input' process. + -limit -l When defined with '-cpus' internally thread concurrent processes. + - requires '-p', specifically for pindel/pin2vcf steps + -debug -d Don't cleanup workarea on completion. + -apid -a Analysis process ID (numeric) - for cgpAnalysisProc header info + - not necessary for external use + + Targeted processing (further detail under OPTIONS): + -process -p Only process this step then exit, optionally set -index + -index -i Optionally restrict '-p' to single job + + Other: + -help -h Brief help message. + -man -m Full documentation. + -version -v Version + + File list can be full file names or wildcard, e.g. + + pindelCohort.pl -c 4 -r some/genome.fa[.gz] -o myout sample1.bam sample2.bam + or + pindelCohort.pl -c 4 -r some/genome.fa[.gz] -o myout sample*.bam + or + pindelCohort.pl -c 4 -r some/genome.fa[.gz] -o myout sample*.cram + + Please note that colocated index and *.bas files are required. 
+=head1 OPTIONS + +=over 2 + +=item B<-process> + +Available processes for this tool are: + + input + pindel - index available + parse + blat - index available + concat + +=item B<-index> + +Possible index ranges for processes above are: + + pindel = 1..<total_refs_less_exclude>, blat = 1..<split_files_from_parse> + +If you want STDOUT/ERR to screen ensure index is set even for single job steps. + +=back diff --git a/perl/bin/pindelCohortMerge.pl b/perl/bin/pindelCohortMerge.pl new file mode 100755 index 0000000..dd9af59 --- /dev/null +++ b/perl/bin/pindelCohortMerge.pl @@ -0,0 +1,313 @@ +#!/usr/bin/env perl +# Copyright (c) 2014-2021 Genome Research Ltd +# +# Author: CASM/Cancer IT +# +# This file is part of cgpPindel. +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see <http://www.gnu.org/licenses/>. +# +# 1. The usage of a range of years within a copyright statement contained within +# this distribution should be interpreted as being equivalent to a list of years +# including the first and last year specified and all consecutive years between +# them. For example, a copyright statement that reads ‘Copyright (c) 2005, 2007- +# 2009, 2011-2012’ should be interpreted as being identical to a statement that +# reads ‘Copyright (c) 2005, 2007, 2008, 2009, 2011, 2012’ and a copyright +# statement that reads ‘Copyright (c) 2005-2012’ should be interpreted as being +# identical to a statement that reads ‘Copyright (c) 2005, 2006, 2007, 2008, +# 2009, 2010, 2011, 2012’.
+# + +use strict; +use warnings FATAL => 'all'; +use autodie qw(:all); +use Cwd qw(abs_path); +use Pod::Usage qw(pod2usage); +use FindBin qw($Bin); +use lib "$Bin/../lib"; +use Getopt::Long; +use Capture::Tiny qw(capture); +use IO::Uncompress::Gunzip qw(gunzip $GunzipError); +use Data::UUID; +use Set::IntervalTree; + +=head +# Build the new header +zgrep -B 1000000 -m 1 '^#CHROM' old_vcfs/PD37237b.pindel.vcf.gz | head -n -1 > new_header +zgrep -hm 1 '^##SAMPLE' old_vcfs/PD37237b*.pindel.vcf.gz >> new_header +zgrep -m 1 '^#CHROM' old_vcfs/PD37237b.pindel.vcf.gz >> new_header +=cut + +my $options = setup(); +my ($vcf_by_sample, $sample_head) = vcf_by_samples($options->{vcfs}); +my ($records, $sample_order) = collate_data($vcf_by_sample); + +# write stuff +header($options->{output}, $options->{vcfs}, $sample_head); +records($options->{output}, $records, $sample_order, $options->{min_vaf}, $options->{np}, $options->{control}, $options->{min_blat}); + +sub records { + my ($output, $records, $sample_order, $min_vaf, $np_tree, $control, $min_blat) = @_; + my %ds = %{$records}; + my $uuid_gen = Data::UUID->new; + my @samples = @{$sample_order}; + for my $chr(sort keys %ds) { + my $chr_tree; + if(defined $np_tree && exists $np_tree->{$chr}) { + $chr_tree = $np_tree->{$chr}; + } + else { + $chr_tree = Set::IntervalTree->new(); + } + for my $pos(sort {$a <=> $b} keys %{$ds{$chr}}) { + for my $seq_key(sort keys %{$ds{$chr}{$pos}}) { + next if($control && exists $ds{$chr}{$pos}{$seq_key}{$control}); + + my ($info, $format) = @{$ds{$chr}{$pos}{$seq_key}{_DATA_}}; + + if(defined $np_tree ) { + $_ = q{;}.$info; + my ($rs) = $_ =~ m/;RS=(\d+)/; + my ($re) = $_ =~ m/;RE=(\d+)/; + next if(@{$chr_tree->fetch($rs, $re)} > 0); + } + + my ($ref, $alt) = split ':', $seq_key; + my $row = join "\t", $chr, $pos, $uuid_gen->to_string($uuid_gen->create), $ref, $alt, q{.}, q{}, $info, $format; + my $samples_with_min_vaf = 0; + my $samples_with_min_blat = 0; + for my $s(@samples) { + if(exists 
$ds{$chr}{$pos}{$seq_key}{$s}) { + #GT:S1:S2:PP:NP:WTP:WTN:WTM:MTP:MTN:MTM:VAF + #./.:12:205.534:3:2:4:3:.:3:2:0.003:0.417 + my ($mtp, $mtn, $vaf) = (split /:/, $ds{$chr}{$pos}{$seq_key}{$s})[8,9,11]; + $row .= "\t".$ds{$chr}{$pos}{$seq_key}{$s}; + # need to review if these are all q{.} when one is + $vaf = 0 if($vaf eq q{.}); + $mtp = 0 if($mtp eq q{.}); + $mtn = 0 if($mtn eq q{.}); + $samples_with_min_vaf++ if($vaf >= $min_vaf); + # $samples_with_min_blat++ if($mtp >= $min_blat && $mtn >= $min_blat); + $samples_with_min_blat++ if($mtp >= 1 && $mtn >= 1 && $mtp + $mtn >= $min_blat); + } + else { + $row .= "\t."; + } + } + if($samples_with_min_vaf > 0 && $samples_with_min_blat > 0) { + print $output $row."\n"; + } + } + } + } +} + +sub collate_data { + my ($vcf_by_sample) = @_; + my %ds; + my @samples = sort keys %{$vcf_by_sample}; + for my $s(@samples) { + my $fh = IO::Uncompress::Gunzip->new($vcf_by_sample->{$s}, MultiStream => 1, AutoClose=> 1) or die "gunzip failed: $GunzipError\n";; + while (<$fh>) { + next if(m/^#/); + chomp; + my ($chr, $pos, undef, $ref, $alt, undef, undef, $info, $format, $data) = split /\t/, $_; + my $seq_key = sprintf '%s:%s', $ref, $alt; + if(! 
exists $ds{$chr}{$pos}{$seq_key}) { + $ds{$chr}{$pos}{$seq_key}{_DATA_} = [$info, $format]; + } + $ds{$chr}{$pos}{$seq_key}{$s} = $data; + } + } + return (\%ds, \@samples); +} + +sub header { + my ($output, $vcfs, $sample_head) = @_; + my $metadata_header = _command_output(sprintf q{zgrep -B 1000000 -m 1 '^#CHROM' %s | grep -v '^##SAMPLE=' | head -n -1}, $vcfs->[0]); + my $col_header = _command_output(sprintf q{zgrep -m 1 '^#CHROM' %s}, $vcfs->[0]); + # remove last col as has sample will be added back + ${$col_header} =~ s/\t[^\t]+$//; + my @s_keys = sort keys %{$sample_head}; + + print $output ${$metadata_header}."\n"; + for (@s_keys) { + print $output ${$sample_head->{$_}}."\n"; + } + print $output join "\t", ${$col_header}, @s_keys; + print $output "\n"; + + return 1; +} + +sub vcf_by_samples { + my ($vcfs) = @_; + my %vcfs; + my %samps; + for my $v(@{$vcfs}) { + my $samp_head = _command_output(sprintf q{zgrep -hm 1 '^##SAMPLE' %s}, $v); + my ($sample) = ${$samp_head} =~ m/ID=([^,]+)/; + $vcfs{$sample} = $v; + $samps{$sample} = $samp_head; + } + return (\%vcfs, \%samps); +} + +sub _command_output { + my $command = shift; + my ($c_out, $c_err, $c_exit) = capture { system($command); }; + if($c_exit) { + warn "An error occurred while executing $command\n"; + warn "\tERROR$c_err\n"; + exit $c_exit; + } + chomp $c_out; + return \$c_out; +} + +sub np_lookup { + my ($gff3, $min_samp) = @_; + my $interval_count = 0; + printf STDERR "Loading normal panel...\n"; + my %tree; + my $z = IO::Uncompress::Gunzip->new($gff3, MultiStream => 1, AutoClose=> 1) or die "gunzip failed: $GunzipError\n"; + my $value = 1; + while(my $line = <$z>) { + next if ($line =~ m/^#/); + chomp $line; + # simple hash look up as only check start coord. 
my ($chr, $start, $info) = (split /\t/, $line)[0,3,7]; + my ($samp_count) = $info =~ m/^SAMPLE_COUNT=(\d+)/; + next if($samp_count < $min_samp); + + $tree{$chr} = Set::IntervalTree->new() unless(exists $tree{$chr}); + $tree{$chr}->insert(\$value, $start, $start+1); # as half-open (i.e. last value is 1 past end) + $interval_count++; + } + printf STDERR "\tdone, $interval_count intervals loaded.\n"; + return \%tree; +} + +sub vcf_list { + my $list_file = shift; + my @vcfs; + open my $LIST, '<', $list_file; + map { chomp $_; push @vcfs, $_; } <$LIST>; + close $LIST; + return \@vcfs; +} + +sub setup{ # parse and validate CLI options, load the optional normal panel, open the output; returns a hashref of options + my %opts = ( + 'cmd' => join(" ", $0, @ARGV), + 'mnps' => 1, + 'min_vaf' => 0, + 'min_blat' => 4, + ); + GetOptions( 'h|help' => \$opts{h}, + 'm|man' => \$opts{m}, + 'v|version' => \$opts{v}, + 'o|output=s' => \$opts{output}, + 'n|np=s' => \$opts{np}, + 's|mnps=i' => \$opts{mnps}, + 'k|min:f' => \$opts{min_vaf}, + 'b|blat:i' => \$opts{min_blat}, + 'd|debug' => \$opts{debug}, + 'c|control=s' => \$opts{control}, + 'l|list:s' => \$opts{list}, + ) or pod2usage(2); # stop on unknown/malformed options instead of carrying on with a partial parse + + if(defined $opts{'v'}) { + require Sanger::CGP::Pindel::Implement; printf "Version: %s\n", Sanger::CGP::Pindel::Implement->VERSION; # module is not loaded at file level, so load it before asking for its version + exit; + } + + pod2usage(-verbose => 1) if(defined $opts{h}); + pod2usage(-verbose => 2) if(defined $opts{m}); + + if(defined $opts{min_vaf}) { + if($opts{min_vaf} < 0 || $opts{min_vaf} > 1) { + print STDERR "ERROR: -min is a fraction and must be between 0 and 1.\n"; + pod2usage(-verbose => 1); + } + $opts{min_vaf} = (sprintf '%.3f', $opts{min_vaf}) + 0; # force number to be stored + } + + my @vcfs; + if($opts{list}) { + @vcfs = @{vcf_list($opts{list})}; + } + else { + @vcfs = @ARGV; + } + $opts{vcfs} = \@vcfs; + + if(@vcfs < 2) { + print STDERR "ERROR: More than 1 VCF input is required\n"; + pod2usage(-verbose => 1); + } + + if(defined $opts{np}) { + $opts{np} = np_lookup($opts{np}, $opts{mnps}); + } + else { + delete $opts{np}; + } + + $opts{align} = $opts{output}.'.sam' unless(defined $opts{align}); + + open my $ofh, '>',
$opts{output}; + $opts{output} = $ofh; + + return \%opts; +} + +__END__ + +=head1 NAME + +pindelCohortMerge.pl - Takes outputs from pindelCohort.pl and merges into a single file, filtering events. + +=head1 SYNOPSIS + +pindelCohortMerge.pl [options] A.vcf.gz B.vcf.gz... +or +pindelCohortMerge.pl [options] -l vcf.list + + Required parameters: + -output -o File path for VCF output (not compressed) + + Optional parameters: + -list -l Text list of vcf files, 1 per line + -min -k Keep events VAF >= VALUE (3dp) for 1 or more samples + - default is to retain events even if VAF == 0/. for all samples + -np -n Normal panel gff3 file - omit if no filtering required. + -mnps -s Minimum normal panel samples required to exclude [default: 1] + -control -c Exclude events where this sample has calls. + -blat -b Exclude events where no sample has MTP+MTN >= N (at least 1 read must be present in each direction) [default: 4] + + Other: + -help -h Brief help message. + -man -m Full documentation. + -version -v Prints the version number. + +=head1 DESCRIPTION + +B<pindelCohortMerge.pl> generates a VCF with collated samples. + +For each common locus/REF/ALT merge the samples. + +Not vcf-merge (from vcftools) as that fails to retain GT ordering, even between rows. + +=cut diff --git a/perl/bin/pindelCohortVafFill.pl b/perl/bin/pindelCohortVafFill.pl new file mode 100755 index 0000000..d71a6c8 --- /dev/null +++ b/perl/bin/pindelCohortVafFill.pl @@ -0,0 +1,251 @@ +#!/usr/bin/env perl +# Copyright (c) 2014-2021 Genome Research Ltd +# +# Author: CASM/Cancer IT +# +# This file is part of cgpPindel. +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version.
+# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . +# +# 1. The usage of a range of years within a copyright statement contained within +# this distribution should be interpreted as being equivalent to a list of years +# including the first and last year specified and all consecutive years between +# them. For example, a copyright statement that reads ‘Copyright (c) 2005, 2007- +# 2009, 2011-2012’ should be interpreted as being identical to a statement that +# reads ‘Copyright (c) 2005, 2007, 2008, 2009, 2011, 2012’ and a copyright +# statement that reads ‘Copyright (c) 2005-2012’ should be interpreted as being +# identical to a statement that reads ‘Copyright (c) 2005, 2006, 2007, 2008, +# 2009, 2010, 2011, 2012’. 
# Steps that can be requested with -process.
const my %INDEX_MAX => (
  'split' => 1,
  'fill' => -1, # depends on number of splits
  'bams' => -1, # may want to make multi but probably not necessary
  'finalise' => 1, # merging of split files
  );
const my @VALID_PROCESS => keys %INDEX_MAX;

# Main flow: every step runs in order unless -process restricts to one.
{
  my $options = setup();
  my $threads = PCAP::Threaded->new($options->{threads});

  if(!exists $options->{process} || $options->{process} eq 'split') {
    Sanger::CGP::Pindel::Implement::cohort_split($options);
    vaf_fill_seqdata($options);
  }

  if(!exists $options->{process} || $options->{process} eq 'fill') {
    # make sure the list exists even when no split file is found, otherwise
    # the dereference for the job count below dies under strict refs
    $options->{split_files} = [] unless(defined $options->{split_files});
    for my $f(glob(catfile($options->{'split_dir'}, '*.vcf.gz'))) {
      # only purely numeric file names, dots escaped so they match literally
      push @{$options->{split_files}}, $f if($f =~ m{/\d+\.vcf\.gz$});
    }
    $threads->add_function('fill', \&Sanger::CGP::Pindel::Implement::fill_split_vaf);
    $threads->run(scalar @{$options->{split_files}}, 'fill', $options);
  }

  if(!exists $options->{process} || $options->{process} eq 'bams') {
    $threads->add_function('bams', \&Sanger::CGP::Pindel::Implement::merge_vaf_bams);
    $threads->run(scalar @{$options->{primary_hts}}, 'bams', $options);
  }

  if(!exists $options->{process} || $options->{process} eq 'finalise') {
    Sanger::CGP::Pindel::Implement::fill_vcf_merge($options);
    if(!$options->{debug}) {
      my $tmp_logs = catdir($options->{tmp}, 'logs');
      my $final_logs = catdir($options->{output}, 'logs');
      # File::Copy::move is not an autodie built-in, so check the result and
      # only discard the workspace once the logs are safely relocated
      if(move($tmp_logs, $final_logs)) {
        remove_tree($options->{tmp});
      }
      else {
        warn "WARN: Failed to move $tmp_logs to $final_logs ($!), retaining $options->{tmp}\n";
      }
    }
  }
}

# Write the list of primary (first column of -data) files, one per line,
# to $options->{bwa_file_list}. Does nothing when the file already exists.
sub vaf_fill_seqdata {
  my ($options) = @_;
  my $bwa_files = $options->{bwa_file_list};
  return if (-e $bwa_files);
  open my $FH, '>', $bwa_files;
  for my $f(@{$options->{primary_hts}}) {
    print $FH qq{$f\n};
  }
  close $FH;
}

# Parse and validate the command line, then create the workspace.
#  Returns : hash-ref of options, additionally populated with
#            primary_hts/secondary_hts => array-refs from the -data file
#            tmp, split_dir, fill_dir  => workspace directories (created)
#            bwa_file_list             => path used by vaf_fill_seqdata
sub setup {
  my %opts = (
    'size' => 10000,
    'name' => 'cohort',
    'primary_hts' => [],
    'secondary_hts' => [],
    'threads' => 1,
  );
  GetOptions( 'h|help' => \$opts{h},
              'm|man' => \$opts{'m'},
              'v|version' => \$opts{v},
              'f|input=s' => \$opts{input},
              'r|ref=s' => \$opts{ref},
              'o|output=s' => \$opts{output},
              's|size:i' => \$opts{size},
              'n|name:s' => \$opts{name},
              'c|cpus:i' => \$opts{threads},
              'p|process:s' => \$opts{process},
              'i|index:i' => \$opts{index},
              'l|limit:i' => \$opts{limit},
              'a|abort' => \$opts{abort},
              'd|data=s' => \$opts{data},
              'debug' => \$opts{debug},
              'sr|simple:s' => \$opts{simple},
  );

  if(defined $opts{v}) {
    printf "Version: %s\n", Sanger::CGP::Pindel::Implement->VERSION;
    exit;
  }
  pod2usage(-verbose => 1) if(defined $opts{h});
  pod2usage(-verbose => 2) if(defined $opts{'m'});

  # the main flow tests these with 'exists', so absent options are removed
  delete $opts{'process'} unless(defined $opts{'process'});
  delete $opts{'index'} unless(defined $opts{'index'});
  delete $opts{'limit'} unless(defined $opts{'limit'});

  # an unrecognised step would otherwise match nothing and exit silently
  if(exists $opts{'process'}) {
    my $requested = $opts{'process'};
    unless(grep { $_ eq $requested } @VALID_PROCESS) {
      pod2usage(-verbose => 1,
                -message => sprintf(q{ERROR: -process '%s' is not valid, expected one of: %s}, $requested, join(q{, }, sort @VALID_PROCESS)),
                -exit => 2);
    }
  }

  for my $param(qw(input ref output data)) {
    pod2usage(-verbose => 1, -message=> sprintf('ERROR: -%s must be defined', $param), -exit => 2) unless(defined $opts{$param});
  }

  # the logs directory is only moved to the output area on completion
  my $final_logs = catdir($opts{output}, 'logs');
  if(-e $final_logs) {
    warn "WARN: $final_logs directory exists suggesting completed analysis.\n";
    if($opts{abort}) {
      die "EXIT as -abort in effect.\n";
    }
    else {
      warn "WARN: Continuing, set -abort to prevent reprocessing of completed data.\n";
    }
  }

  PCAP::Cli::file_for_reading('input', $opts{input});
  PCAP::Cli::file_for_reading('data', $opts{data});
  PCAP::Cli::file_for_reading('ref', $opts{ref});
  PCAP::Cli::file_for_reading('simple', $opts{simple}) if(defined $opts{simple});

  if(@ARGV) {
    die "ERROR: No positional arguments expected."
  }

  # -data: exactly two tab separated paths per line, primary then secondary
  open my $D, '<', $opts{data};
  while(my $l = <$D>) {
    chomp $l;
    my ($primary, $secondary);
    if($l =~ m/^([^\t]+)\t([^\t]+)$/) {
      ($primary, $secondary) = ($1, $2);
    }
    else {
      die "ERROR: file '$opts{data}' is malformed\n";
    }
    PCAP::Cli::file_for_reading('bam/cram files', $primary);
    PCAP::Cli::file_for_reading('bam/cram files', $secondary);
    push @{$opts{primary_hts}}, $primary;
    push @{$opts{secondary_hts}}, $secondary;
  }
  close $D;

  # enforce a floor of 5 sample/event combinations per sample for -size
  my $sample_count = @{$opts{primary_hts}};
  if($opts{size} < $sample_count * 5) {
    $opts{size} = $sample_count * 5;
    warn sprintf "WARNING: -size has been automatically increased to %d (5x sample number), see '-help'\n", $opts{size};
  }

  $opts{tmp} = catdir($opts{output}, 'tmpCohortVafFill');
  make_path($opts{tmp});

  $opts{split_dir} = catdir($opts{tmp}, 'split');
  make_path($opts{split_dir});
  $opts{fill_dir} = catdir($opts{tmp}, 'fill');
  make_path($opts{fill_dir});
  $opts{bwa_file_list} = catfile($opts{tmp}, 'bwa_files.lst');

  make_path(catdir($opts{tmp}, 'logs'));

  return \%opts;
}
+ -abort -a Abort noisily if data appears to have been processed (warns and continues otherwise) + -cpus -c Number of cores to use. [1] + -simple -sr Simple repeats file - only applies when complete search space within a single repeat. + + + Targeted processing (further detail under OPTIONS): + -process -p Only process this step then exit, optionally set -index + -index -i Optionally restrict '-p' to single job + -limit -l Use '-p fill -i N -l M' - do not declare '-c' + M: maximum number of scattered processes on multiple hosts + N: This instance, 1..M + + Other: + -help -h Brief help message. + -man -m Full documentation. + -version -v Prints the version number. + -debug Don't cleanup anything + +=head1 DESCRIPTION + +B<pindelCohortVafFill.pl> takes merged cohort VCF and fills in gaps in farm friendly manner. + +One additional file is generated containing events where no fill-in is required. + +Each original sample BAM/CRAM (normally BWA mapping) needs to be paired with the one generated by pindelCohort.pl + +Each sample BAM/CRAM also needs colocated index and bas file. + +=cut diff --git a/perl/bin/pindelCohortVafSliceFill.pl b/perl/bin/pindelCohortVafSliceFill.pl new file mode 100755 index 0000000..70d8a52 --- /dev/null +++ b/perl/bin/pindelCohortVafSliceFill.pl @@ -0,0 +1,140 @@ +#!/usr/bin/env perl +# Copyright (c) 2014-2021 Genome Research Ltd +# +# Author: CASM/Cancer IT +# +# This file is part of cgpPindel. +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. 
# Main flow: augment a single VCF slice with fill-in enabled, then convert
# the SAM output to BAM.
{
  my $options = setup();
  my $augment = Sanger::CGP::Pindel::OutputGen::VcfBlatAugment->new(
    input => $options->{input},
    ref => $options->{ref},
    ofh => $options->{output},
    sam => $options->{align},
    hts_files => $options->{hts_files},
    outpath => $options->{outpath},
    fill_in => 1,
    simple_rpt => $options->{simple},
  );

  $augment->output_header;
  $augment->process_records;
  $augment->sam_to_bam;
}

# Parse and validate the command line.
#  Returns : hash-ref of options where
#            hts_files => array-ref of paths read from the -data file
#            outpath   => the -output directory (created when absent)
#            align     => <output>.fill.sam unless already set
#            output    => gzip write handle on <outpath>/slice.vcf.gz
sub setup {
  my %opts = (
  );
  GetOptions( 'h|help' => \$opts{h},
              'm|man' => \$opts{'m'},
              'v|version' => \$opts{v},
              'i|input=s' => \$opts{input},
              'r|ref=s' => \$opts{ref},
              'o|output=s' => \$opts{output},
              'd|data=s' => \$opts{data},
              'sr|simple:s' => \$opts{simple},
  );

  if(defined $opts{v}) {
    # this script does not otherwise load the module providing the version
    require Sanger::CGP::Pindel::Implement;
    printf "Version: %s\n", Sanger::CGP::Pindel::Implement->VERSION;
    exit;
  }
  pod2usage(-verbose => 1) if(defined $opts{h});
  pod2usage(-verbose => 2) if(defined $opts{'m'});

  # all four are consumed below, fail with usage rather than an undef warning
  for my $param(qw(input ref output data)) {
    pod2usage(-verbose => 1, -message=> sprintf('ERROR: -%s must be defined', $param), -exit => 2) unless(defined $opts{$param});
  }

  PCAP::Cli::file_for_reading('input', $opts{input});
  PCAP::Cli::file_for_reading('ref', $opts{ref});
  PCAP::Cli::file_for_reading('simple', $opts{simple}) if(defined $opts{simple});

  $opts{align} = $opts{output}.'.fill.sam' unless(defined $opts{align});

  # -data: one sequence file path per line
  my @htsfiles;
  open my $D, '<', $opts{data};
  while(my $hts_file = <$D>) {
    chomp $hts_file;
    PCAP::Cli::file_for_reading('bam/cram files', $hts_file);
    push @htsfiles, $hts_file;
  }
  close $D;

  $opts{hts_files} = \@htsfiles;

  $opts{outpath} = $opts{output};
  make_path($opts{outpath}) unless(-e $opts{outpath});

  # from here on 'output' is the compressed filehandle, not the directory
  my $ofh = IO::Compress::Gzip->new(catfile($opts{outpath}, 'slice.vcf.gz'), -Level => Z_BEST_SPEED)
    or die "IO::Compress::Gzip failed: $GzipError\n";
  $opts{output} = $ofh;

  return \%opts;
}
+ +=cut diff --git a/perl/bin/pindelCohortVafSplit.pl b/perl/bin/pindelCohortVafSplit.pl new file mode 100755 index 0000000..05c974b --- /dev/null +++ b/perl/bin/pindelCohortVafSplit.pl @@ -0,0 +1,156 @@ +#!/usr/bin/env perl +# Copyright (c) 2014-2021 Genome Research Ltd +# +# Author: CASM/Cancer IT +# +# This file is part of cgpPindel. +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . +# +# 1. The usage of a range of years within a copyright statement contained within +# this distribution should be interpreted as being equivalent to a list of years +# including the first and last year specified and all consecutive years between +# them. For example, a copyright statement that reads ‘Copyright (c) 2005, 2007- +# 2009, 2011-2012’ should be interpreted as being identical to a statement that +# reads ‘Copyright (c) 2005, 2007, 2008, 2009, 2011, 2012’ and a copyright +# statement that reads ‘Copyright (c) 2005-2012’ should be interpreted as being +# identical to a statement that reads ‘Copyright (c) 2005, 2006, 2007, 2008, +# 2009, 2010, 2011, 2012’. 
{
  my $options = setup();
  split_data($options->{input}, $options->{output}, $options->{size});
}

# Open a gzip compressed VCF for writing and emit the header lines to it.
#  Args    : $path   - file to create
#            $header - array-ref of header lines (newline terminated)
#  Returns : the write handle
sub _open_part {
  my ($path, $header) = @_;
  my $fh = IO::Compress::Gzip->new($path, -Level => Z_BEST_SPEED)
    or die "IO::Compress::Gzip failed: $GzipError\n";
  print $fh join q{}, @{$header};
  return $fh;
}

# Split a gzip compressed VCF into work units.
#  Args    : $input  - VCF to read
#            $outdir - directory for all outputs (created when absent)
#            $max_e  - number of sample/event gaps after which a new
#                      numbered file is started
#  Records where no sample column is '.' go to complete_rec.vaf.vcf.gz,
#  all others go to numbered files (0000.vcf.gz, 0001.vcf.gz ...).
#  Every output file starts with the header of the input.
sub split_data {
  my ($input, $outdir, $max_e) = @_;
  make_path($outdir);
  my $part_fmt = '%s/%04d.vcf.gz';
  my @header;
  my $no_vaf_count = 0;
  my $no_vaf_file_no = 0;

  my $complete_rec = catfile($outdir, 'complete_rec.vaf.vcf.gz');
  my $no_vaf_file = sprintf $part_fmt, $outdir, $no_vaf_file_no++;

  my $COMP = IO::Compress::Gzip->new($complete_rec, -Level => Z_BEST_SPEED)
    or die "IO::Compress::Gzip failed: $GzipError\n";
  my $NO_VAF;

  my $z = IO::Uncompress::Gunzip->new($input, MultiStream => 1) or die "gunzip failed: $GunzipError\n";
  while(my $line = <$z>) {
    if($line =~ m/^#/) {
      push @header, $line;
      print $COMP $line;
      next;
    }
    # first numbered file is created once the complete header is known
    unless($NO_VAF) {
      warn "Creating $no_vaf_file\n";
      $NO_VAF = _open_part($no_vaf_file, \@header);
    }
    chomp $line;
    # col 9+ containing '.' means work to be done
    my @cols = split /\t/, $line;
    my $n_vaf = grep { $_ eq q{.} } @cols[9..$#cols];
    if($n_vaf) {
      print $NO_VAF $line."\n";
      $no_vaf_count += $n_vaf;
      if($no_vaf_count >= $max_e) {
        close $NO_VAF;
        $no_vaf_file = sprintf $part_fmt, $outdir, $no_vaf_file_no++;
        warn "\nCreating $no_vaf_file\n";
        $NO_VAF = _open_part($no_vaf_file, \@header);
        $no_vaf_count = 0;
      }
    }
    else {
      print $COMP $line."\n";
    }
  }
  close $z;
  close $COMP;
  # never opened when the input holds header lines only
  close $NO_VAF if($NO_VAF);
}

# Parse and validate the command line.
#  Returns : hash-ref with input, output and size.
sub setup {
  my %opts = (
    'size' => 10000,
  );
  GetOptions( 'h|help' => \$opts{h},
              'm|man' => \$opts{'m'},
              'v|version' => \$opts{v},
              'i|input=s' => \$opts{input},
              'o|output=s' => \$opts{output},
              's|size:i' => \$opts{size},
  );

  if(defined $opts{v}) {
    # this script does not otherwise load the module providing the version
    require Sanger::CGP::Pindel::Implement;
    printf "Version: %s\n", Sanger::CGP::Pindel::Implement->VERSION;
    exit;
  }
  pod2usage(-verbose => 1) if(defined $opts{h});
  pod2usage(-verbose => 2) if(defined $opts{'m'});

  for my $param(qw(input output)) {
    pod2usage(-verbose => 1, -message=> sprintf('ERROR: -%s must be defined', $param), -exit => 2) unless(defined $opts{$param});
  }
  # '-s' without a value yields 0 which would start a new file per record
  pod2usage(-verbose => 1, -message=> 'ERROR: -size must be a positive integer', -exit => 2) if($opts{size} < 1);

  PCAP::Cli::file_for_reading('input', $opts{input});

  return \%opts;
}
+ +=cut diff --git a/perl/bin/pindelCohort_to_vcf.pl b/perl/bin/pindelCohort_to_vcf.pl new file mode 100755 index 0000000..d394a4a --- /dev/null +++ b/perl/bin/pindelCohort_to_vcf.pl @@ -0,0 +1,343 @@ +#!/usr/bin/perl +# Copyright (c) 2014-2021 Genome Research Ltd +# +# Author: CASM/Cancer IT +# +# This file is part of cgpPindel. +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . +# +# 1. The usage of a range of years within a copyright statement contained within +# this distribution should be interpreted as being equivalent to a list of years +# including the first and last year specified and all consecutive years between +# them. For example, a copyright statement that reads ‘Copyright (c) 2005, 2007- +# 2009, 2011-2012’ should be interpreted as being identical to a statement that +# reads ‘Copyright (c) 2005, 2007, 2008, 2009, 2011, 2012’ and a copyright +# statement that reads ‘Copyright (c) 2005-2012’ should be interpreted as being +# identical to a statement that reads ‘Copyright (c) 2005, 2006, 2007, 2008, +# 2009, 2010, 2011, 2012’. 
# Main flow: build the converter from the sequence file headers, write the
# VCF header, then append the records of each pindel input file.
{
  my $options = setup();
  my $contigs = contig_setup($options);
  my ($hts_by_sample, $bas_by_sample, $vcfsamp_by_sample) = hts_and_sample_parse($options);

  my @samples = sort keys %{$hts_by_sample};
  my $record_converter = Sanger::CGP::Pindel::OutputGen::VcfCohortConverter->new(
    -contigs => [values %$contigs],
    -samples => \@samples,
    -hts_set => $hts_by_sample,
    -bas_set => $bas_by_sample,
    -all     => $options->{'all'},
    -badloci => $options->{'badloci'},
  );
  my $input_source = basename($0). '_v'. Sanger::CGP::Pindel->VERSION;
  my $header = $record_converter->gen_header($options->{'reference'}, $input_source, $vcfsamp_by_sample, $options);
  my $out_fh = $options->{'output'};
  print $out_fh $header;
  for my $in_file(@{$options->{'input'}}) {
    process_pindel_file($options, $in_file, $out_fh, $record_converter);
  }
  close $options->{'output'} if($options->{'output'} ne \*STDOUT);
}

# Convert every record of one pindel file and print it to $out_fh.
# Each record is given a freshly generated UUID as its id.
sub process_pindel_file {
  my ($options, $pindel_file, $out_fh, $record_converter) = @_;
  my $prp = Sanger::CGP::Pindel::OutputGen::PindelRecordParser->new(
    -path => $pindel_file,
    -fai => Bio::DB::HTS::Fai->load($options->{'reference'}),
    -noreads => 1,
  );
  my $uuid_gen = Data::UUID->new;
  while(my $record = $prp->next_record) {
    $record->id($uuid_gen->to_string($uuid_gen->create));
    print $out_fh $record_converter->gen_record($record);
  }
}

# Inspect each sequence file in $options->{hts_files}.
#  Returns : three hash-refs keyed by sample name (SM of the first @RG line
#            carrying one): hts file path, PCAP::Bam::Bas object and
#            Sanger::CGP::Vcf::Sample object.
#  Dies when the colocated *.bas file is missing, when no sample name can be
#  found or when two files declare the same sample.
sub hts_and_sample_parse {
  my $options = shift;
  my (%hts_by_sample, %bas_by_sample, %vcf_samples);
  for my $hts_f(@{$options->{'hts_files'}}) {
    my $bas_f = $hts_f.'.bas';
    die "ERROR: Failed to find colocated bas file at: $bas_f" unless(-e $bas_f);
    my $hts = Bio::DB::HTS->new(-bam => $hts_f, -fasta => $options->{reference});
    my ($sample, $platform);
    foreach my $line (split(/\n/,$hts->header->text)) {
      next unless($line =~ m/^\@RG/);
      ($sample) = $line =~ m/SM:([^\t]+)/;
      ($platform) = $line =~ /PL:([^\t]+)/;
      last if(defined $sample);
    }
    # must be checked before the name is used as a hash key
    die sprintf "Failed to find sample name in \@RG header lines of %s", $hts->hts_path unless(defined $sample);
    # a repeated name would silently replace the earlier file
    if(exists $hts_by_sample{$sample}) {
      die sprintf "ERROR: Sample name '%s' found in both %s and %s\n", $sample, $hts_by_sample{$sample}, $hts_f;
    }
    $hts_by_sample{$sample} = $hts_f;
    $bas_by_sample{$sample} = PCAP::Bam::Bas->new($bas_f);
    $vcf_samples{$sample} = Sanger::CGP::Vcf::Sample->new(
      -name => $sample,
      -study => $options->{'project'},
      -platform => $platform,
      -seq_protocol => $options->{'protocol'},
      -description => $sample
    );
  }
  return (\%hts_by_sample, \%bas_by_sample, \%vcf_samples);
}

# Read the @SQ header lines of one sequence file.
#  Args    : $hts           - Bio::DB::HTS object
#            $hts_contigs   - hash-ref of contig name => length
#            $fixed_contigs - false: record contigs into $hts_contigs
#                             true : die on any contig not already recorded
#  Returns : (assembly, species) from the first @SQ lines carrying AS/SP,
#            either may be undef.
sub _contig_parse {
  my ($hts, $hts_contigs, $fixed_contigs) = @_;
  my ($assembly_out, $species_out);
  foreach my $line (split(/\n/,$hts->header->text)){
    next unless($line =~ /^\@SQ/);
    my ($name) = $line =~ /SN:([^\t]+)/;
    my ($length) = $line =~ /LN:([^\t]+)/;
    my ($assembly) = $line =~ /AS:([^\t]+)/;
    my ($species) = $line =~ /SP:([^\t]+)/;
    if(defined $assembly && !defined $assembly_out) {
      $assembly_out = $assembly;
    }
    if(defined $species && !defined $species_out) {
      $species_out = $species;
    }
    if($fixed_contigs) {
      if(!exists $hts_contigs->{$name}) {
        die sprintf "ERROR: Found contig %s in file %s but this is not consistent with other files.", $name, $hts->hts_path;
      }
    }
    else {
      $hts_contigs->{$name} = $length;
    }
  }
  return ($assembly_out, $species_out);
}

# Build the contig set shared by all sequence files.
# The first file defines the contigs, later files are checked against it.
# Assembly/species come from the command line when given, otherwise from the
# headers; any disagreement between sources is fatal.
#  Returns : hash-ref of contig name => Sanger::CGP::Vcf::Contig
sub contig_setup {
  my $options = shift;
  my $hts_contigs = {}; # to store contig list
  my ($assembly, $species);

  # the species option is always an array-ref, it only counts as supplied
  # when it actually holds values
  my $cli_species = (defined $options->{'species'} && @{$options->{'species'}} > 0) ? 1 : 0;

  $assembly = $options->{'assembly'} if(defined $options->{'assembly'});
  $species = join q{ }, @{$options->{'species'}} if($cli_species);

  my @paths;
  my $not_first = 0;
  for my $hts_f(@{$options->{'hts_files'}}) {
    my $hts = Bio::DB::HTS->new(-bam => $hts_f, -fasta => $options->{reference});
    my ($assembly_found, $species_found) = _contig_parse($hts, $hts_contigs, $not_first++);
    if(! defined $assembly && defined $assembly_found) {
      $assembly = $assembly_found;
    }
    if(defined $assembly && defined $assembly_found && $assembly ne $assembly_found) {
      if(defined $options->{'assembly'}){
        die sprintf "Assembly defined on command line (%s) doesn't match that found in %s (%s)", $options->{'assembly'}, $hts->hts_path, $assembly_found;
      }
      else {
        die sprintf "Assembly defined in %s (%s) doesn't match that found in previous files (%s):%s\n", $hts->hts_path, $assembly_found, $assembly, join(qq{\n\t}, @paths);
      }
    }
    if(! defined $species && defined $species_found) {
      $species = $species_found;
    }
    if(defined $species && defined $species_found && $species ne $species_found) {
      if($cli_species){
        # report the joined value, not the array-ref
        die sprintf "Species defined on command line (%s) doesn't match that found in %s (%s)", $species, $hts->hts_path, $species_found;
      }
      else {
        die sprintf "Species defined in %s (%s) doesn't match that found in previous files (%s):%s\n", $hts->hts_path, $species_found, $species, join(qq{\n\t}, @paths);
      }
    }
    push @paths, $hts->hts_path;
  }
  die "No assembly defined in BAM/CRAM headers please specify in command options." unless(defined $assembly);
  die "No species defined in BAM/CRAM headers please specify in command options." unless(defined $species);
  # swap the recorded lengths for full contig objects
  for my $name(keys %{$hts_contigs}) {
    my $contig = Sanger::CGP::Vcf::Contig->new(
      -name => $name,
      -length => $hts_contigs->{$name},
      -assembly => $assembly,
      -species => $species
    );
    $hts_contigs->{$name} = $contig;
  }
  return $hts_contigs;
}

# Parse and validate the command line.
#  Returns : hash-ref of options where
#            input  => array-ref of input files ('%' wildcards expanded,
#                      zero sized matches dropped)
#            output => write filehandle, STDOUT when -output not given
#            plus whatever Sanger::CGP::Pindel::Implement::cohort_files adds
sub setup{
  my %opts = (
    'cmd' => join(" ", $0, @ARGV),
    'all' => 0,
  );
  GetOptions( 'h|help' => \$opts{'h'},
              'm|man' => \$opts{'m'},
              'v|version' => \$opts{'v'},
              'o|output:s' => \$opts{'output'},
              'r|ref=s' => \$opts{'reference'},
              'i|input=s@' => \$opts{'input'},
              'prj|project:s' => \$opts{'project'},
              'p|prot:s' => \$opts{'protocol'},
              'as|assembly:s' => \$opts{'assembly'},
              'sp|species=s{0,}' => \@{$opts{'species'}},
              'pp|parent:s' => \$opts{'pp'},
              'a|all' => \$opts{'all'},
              'b|badloci:s' => \$opts{'badloci'},
  );

  if(defined $opts{'v'}){
    printf "Version: %s\n", Sanger::CGP::Pindel::Implement->VERSION;
    exit;
  }

  pod2usage(-verbose => 1) if(defined $opts{'h'});
  pod2usage(-verbose => 2) if(defined $opts{'m'});

  # -input is documented as required, without it only a header is written
  unless(defined $opts{'input'} && @{$opts{'input'}} > 0) {
    pod2usage(-verbose => 1, -message => 'ERROR: -input must be defined', -exit => 2);
  }

  PCAP::Cli::file_for_reading('ref', $opts{'reference'});
  my @full_inputs;
  for my $in(@{$opts{'input'}}) {
    # '%' stands in for the shell wildcard
    if($in =~ s/%/*/g) {
      for my $gf(glob $in) {
        push @full_inputs, $gf if(-s $gf);
      }
    }
    else {
      push @full_inputs, $in;
    }
  }
  for my $in(@full_inputs) {
    PCAP::Cli::file_for_reading('input (possibly expanded)', $in);
  }
  $opts{'input'} = \@full_inputs;

  if($opts{'output'}) {
    open my $fh, '>', $opts{'output'};
    $opts{'output'} = $fh;
  }
  else { $opts{'output'} = \*STDOUT; }

  # add hts_files from the remains of @ARGV
  Sanger::CGP::Pindel::Implement::cohort_files(\%opts);

  return \%opts;
}
genomic, targeted, RNA-seq). + +=item B<-assembly> + +Reference assembly name, used when not found in BAM headers. Validated against header if both are present. + +=item B<-species> + +Species name, used when not found in BAM headers. Validated against header if both are present. + +=item B<-help> + +Prints a brief help message and exits. + +=item B<-man> + +Prints the manual page and exits. + +=item B<-version> + +Prints the version number and exits. + +=back + +=head1 DESCRIPTION + +B<pindelCohort_to_vcf.pl> will attempt to generate a vcf file from a set of Pindel output files. + +=cut diff --git a/perl/bin/pindel_2_combined_vcf.pl b/perl/bin/pindel_2_combined_vcf.pl index 0e6a625..d19fab1 100755 --- a/perl/bin/pindel_2_combined_vcf.pl +++ b/perl/bin/pindel_2_combined_vcf.pl @@ -29,7 +29,6 @@ # 2009, 2010, 2011, 2012’. # - use strict; use warnings FATAL => 'all'; use autodie qw(:all); diff --git a/perl/bin/pindel_blat_vaf.pl b/perl/bin/pindel_blat_vaf.pl new file mode 100755 index 0000000..1006f5a --- /dev/null +++ b/perl/bin/pindel_blat_vaf.pl @@ -0,0 +1,141 @@ +#!/usr/bin/env perl +# Copyright (c) 2014-2021 Genome Research Ltd +# +# Author: CASM/Cancer IT +# +# This file is part of cgpPindel. +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see <http://www.gnu.org/licenses/>. +# +# 1. 
# Main flow: augment the input VCF using the supplied sequence files.
{
  my $options = setup();
  my $augment = Sanger::CGP::Pindel::OutputGen::VcfBlatAugment->new(
    input => $options->{input},
    ref => $options->{ref},
    ofh => $options->{output},
    sam => $options->{align},
    hts_files => $options->{hts},
    outpath => $options->{outpath},
    debug => $options->{debug},
  );

  $augment->output_header;
  $augment->process_records;
}

# Parse and validate the command line.
#  Returns : hash-ref of options where
#            hts     => array-ref of sequence files (-hts may be repeated
#                       and/or hold comma separated values)
#            outpath => the -output directory (created when absent)
#            align   => <outpath>/data.sam
#            output  => write filehandle on <outpath>/data.vcf
sub setup{
  my %opts = (
    'cmd' => join(" ", $0, @ARGV),
    'hts' => [],
  );
  my @hts_files;
  GetOptions( 'h|help' => \$opts{h},
              'm|man' => \$opts{m},
              'v|version' => \$opts{v},
              'o|output=s' => \$opts{output},
              'r|ref=s' => \$opts{ref},
              'i|input=s' => \$opts{input},
              'd|debug' => \$opts{debug},
              'hts=s@' => \@hts_files,
  );

  # flatten repeated and comma separated values into one list
  $opts{hts} = [split(/,/,join(',',@hts_files))];

  if(defined $opts{'v'}) {
    # this script does not otherwise load the module providing the version
    require Sanger::CGP::Pindel::Implement;
    printf "Version: %s\n", Sanger::CGP::Pindel::Implement->VERSION;
    exit;
  }

  pod2usage(-verbose => 1) if(defined $opts{h});
  pod2usage(-verbose => 2) if(defined $opts{m});

  # documented as required, fail with usage rather than an undef warning
  for my $param(qw(ref input output)) {
    pod2usage(-verbose => 1, -message=> sprintf('ERROR: -%s must be defined', $param), -exit => 2) unless(defined $opts{$param});
  }
  pod2usage(-verbose => 1, -message=> 'ERROR: -hts must be defined', -exit => 2) unless(@{$opts{hts}} > 0);

  PCAP::Cli::file_for_reading('ref', $opts{ref});
  PCAP::Cli::file_for_reading('input', $opts{input});
  for my $t(@{$opts{hts}}) {
    PCAP::Cli::file_for_reading('hts', $t);
    unless(-e $t.'.bai' || -e $t.'.csi' || -e $t.'.crai') {
      die "ERROR: Unable to find appropriate index file for $t\n";
    }
    unless(-e $t.'.bas') {
      die "ERROR: Unable to find *.bas file for $t\n";
    }
  }

  $opts{outpath} = $opts{output};
  make_path($opts{outpath}) unless(-e $opts{outpath});

  $opts{align} = catfile($opts{outpath}, 'data.sam');

  # from here on 'output' is the filehandle, not the directory
  open my $ofh, '>', catfile($opts{outpath}, 'data.vcf');
  $opts{output} = $ofh;

  return \%opts;
}
# - BEGIN { use Cwd qw(abs_path); use File::Basename; diff --git a/perl/bin/pindel_np_from_vcf.pl b/perl/bin/pindel_np_from_vcf.pl index c777467..fd80600 100755 --- a/perl/bin/pindel_np_from_vcf.pl +++ b/perl/bin/pindel_np_from_vcf.pl @@ -29,7 +29,6 @@ # 2009, 2010, 2011, 2012’. # - BEGIN { use Cwd qw(abs_path); use File::Basename; diff --git a/perl/bin/pindel_vcfSortNsplit.pl b/perl/bin/pindel_vcfSortNsplit.pl new file mode 100755 index 0000000..fcdee3d --- /dev/null +++ b/perl/bin/pindel_vcfSortNsplit.pl @@ -0,0 +1,101 @@ +#!/usr/bin/env perl +# Copyright (c) 2014-2021 Genome Research Ltd +# +# Author: CASM/Cancer IT +# +# This file is part of cgpPindel. +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . +# +# 1. The usage of a range of years within a copyright statement contained within +# this distribution should be interpreted as being equivalent to a list of years +# including the first and last year specified and all consecutive years between +# them. For example, a copyright statement that reads ‘Copyright (c) 2005, 2007- +# 2009, 2011-2012’ should be interpreted as being identical to a statement that +# reads ‘Copyright (c) 2005, 2007, 2008, 2009, 2011, 2012’ and a copyright +# statement that reads ‘Copyright (c) 2005-2012’ should be interpreted as being +# identical to a statement that reads ‘Copyright (c) 2005, 2006, 2007, 2008, +# 2009, 2010, 2011, 2012’. 
+# + +use strict; +use warnings FATAL => 'all'; +use autodie qw(:all); +use File::Basename; +use Capture::Tiny qw(capture); +use Const::Fast qw(const); +use File::Temp; +use File::Path qw(make_path); +use File::Spec::Functions; + +const my $USAGE => sprintf "USAGE: %s in.vcf \n", basename($0); +const my $SRT_COUNT => q{bash -c "set -o pipefail; (grep -B 100000 -m 1 '^#CHRO' %s && grep -v '^#' %s | sort -s -S 1G -k 1,1 -k 2,2n -k 4,4 -k 5,5) | tee %s | grep -cv '^#'"}; +const my $NO_HEAD_SPLIT => q{bash -c "set -o pipefail; grep -v '^#' %s | split -a 4 --additional-suffix=.vcf -l %d - %s"}; +const my $CAPTURE_HEADER => q{grep -B 100000 -m 1 '^#CHRO' %s}; + +if(@ARGV < 3) { + die $USAGE; +} + +my ($in_vcf, $split_lines, $outdir) = @ARGV; + +die "ERROR: Absent or Empty file: $in_vcf" unless(-e $in_vcf && -s _ > 0); + +make_path($outdir) unless(-e $outdir); + +my $tmp_dir = File::Temp->newdir(DIR=> $outdir, CLEANUP => 1); +my $srt_vcf = catfile($tmp_dir, 'srt.vcf'); + +# first we sort and capture the number of variants +my $c_srt_count = sprintf $SRT_COUNT, $in_vcf, $in_vcf, $srt_vcf; +my ($c_out, $c_err, $c_exit) = capture { system($c_srt_count); }; +if($c_exit > 1) { # allow 1 as could be 0 events to work with + warn "An error occurred while executing $c_srt_count\n"; + warn "\tERROR$c_err\n"; + exit $c_exit; +} +chomp $c_out; +my $events = $c_out; +die "ERROR: Did not get a count of events" if($events !~ m/^\d+$/); + +my $prefix = catfile($tmp_dir, 'split_'); +my $c_split = sprintf $NO_HEAD_SPLIT, $srt_vcf, $split_lines, $prefix; +($c_out, $c_err, $c_exit) = capture { system($c_split); }; +if($c_exit > 1) { # allow 1 as could be 0 events to work with + warn "An error occurred while executing $c_split\n"; + warn "\tERROR$c_err\n"; + exit $c_exit; +} + +my $c_header = sprintf $CAPTURE_HEADER, $in_vcf; +($c_out, $c_err, $c_exit) = capture { system($c_header); }; +if($c_exit > 0) { + warn "An error occurred while executing $c_split\n"; + warn "\tERROR$c_err\n"; + exit 
$c_exit; +} +my $vcf_head = $c_out; + +# now need to convert all the files to valid VCF: +opendir(my $dh, $tmp_dir) || die "Can't opendir $tmp_dir: $!"; +while (readdir $dh) { + next unless($_ =~ m/^split_[a-z]{4}\.vcf$/); + my $split_vcf = catfile($tmp_dir, $_); + my $vcf_out = catfile($outdir, $_); + open my $ofh, '>', $vcf_out; + print $ofh $vcf_head; + close $ofh; + system("cat $split_vcf >> $vcf_out"); +} +closedir $dh; diff --git a/perl/lib/Sanger/CGP/Pindel.pm b/perl/lib/Sanger/CGP/Pindel.pm index af28e50..671e16b 100644 --- a/perl/lib/Sanger/CGP/Pindel.pm +++ b/perl/lib/Sanger/CGP/Pindel.pm @@ -34,6 +34,7 @@ use Const::Fast qw(const); use base 'Exporter'; our $VERSION = '3.6.0'; -our @EXPORT = qw($VERSION); +our $COHORT_VERSION = '1.0.1'; +our @EXPORT = qw($VERSION $COHORT_VERSION); 1; diff --git a/perl/lib/Sanger/CGP/Pindel/Implement.pm b/perl/lib/Sanger/CGP/Pindel/Implement.pm index cccd3c0..a0ca73a 100644 --- a/perl/lib/Sanger/CGP/Pindel/Implement.pm +++ b/perl/lib/Sanger/CGP/Pindel/Implement.pm @@ -32,21 +32,23 @@ package Sanger::CGP::Pindel::Implement; use strict; use warnings FATAL => 'all'; use autodie qw(:all); +use Capture::Tiny; use Const::Fast qw(const); -use File::Spec; -use File::Which qw(which); +use Cwd qw(cwd); +use File::Basename; use File::Copy qw(copy); use File::Path qw(make_path remove_tree); +use File::Spec::Functions; use File::Temp qw(tempfile); -use Capture::Tiny; -use List::Util qw(first); +use File::Which qw(which); use FindBin qw($Bin); +use Getopt::Long; +use List::Util qw(first); +use Pod::Usage qw(pod2usage); -use Sanger::CGP::Pindel; - -use PCAP::Threaded; use PCAP::Bam; - +use PCAP::Threaded; +use Sanger::CGP::Pindel; use Sanger::CGP::Pindel::OutputGen::BamUtil; const my $PINDEL_GEN_COMM => q{ -b %s -o %s -t %s}; @@ -57,13 +59,39 @@ const my $PIN_MERGE => q{ -o %s -i %s -r %s}; const my $FLAG => q{ -a %s -u %s -s %s -i %s -o %s -r %s}; const my $PIN_GERM => q{ -f %s -i %s -o %s}; const my $BASE_GERM_RULE => 'F012'; # 
prefixed with additional F if fragment filtering. +const my $COHORT_2_VCF => q{ -r %s -i %s -o %s %s%s}; +const my $VCF_SPLIT_SIZE => 5_000; + +sub input_cohort{ + my ($options) = @_; + + my $tmp = $options->{'tmp'}; + return 1 if PCAP::Threaded::success_exists(catdir($tmp, 'progress'), 0); + + my $input = $options->{'hts_files'}->[0]; + my $max_threads = $options->{'threads'}; + + my $sample = sanitised_sample_from_bam($input); + my $gen_out = catdir($tmp, $sample); + make_path($gen_out) unless(-e $gen_out); + + my $command = "$^X "; + $command .= _which('pindel_input_gen.pl'); + $command .= sprintf $PINDEL_GEN_COMM, $input, $gen_out, $max_threads; + $command .= " -r $options->{reference}"; + $command .= " -e $options->{badloci}" if(exists $options->{'badloci'}); + + PCAP::Threaded::external_process_handler(catdir($tmp, 'logs'), $command, 0); + PCAP::Threaded::touch_success(catdir($tmp, 'progress'), 0); + return 1; +} sub input { my ($index, $options) = @_; return 1 if(exists $options->{'index'} && $index != $options->{'index'}); my $tmp = $options->{'tmp'}; - return 1 if PCAP::Threaded::success_exists(File::Spec->catdir($tmp, 'progress'), $index); + return 1 if PCAP::Threaded::success_exists(catdir($tmp, 'progress'), $index); my @inputs = ($options->{'tumour'}, $options->{'normal'}); my $iter = 1; @@ -80,7 +108,7 @@ sub input { $max_threads = 1 if($max_threads == 0); my $sample = sanitised_sample_from_bam($input); - my $gen_out = File::Spec->catdir($tmp, $sample); + my $gen_out = catdir($tmp, $sample); make_path($gen_out) unless(-e $gen_out); my $command = "$^X "; @@ -89,11 +117,11 @@ sub input { $command .= " -r $options->{reference}"; $command .= " -e $options->{badloci}" if(exists $options->{'badloci'}); - PCAP::Threaded::external_process_handler(File::Spec->catdir($tmp, 'logs'), $command, $index); + PCAP::Threaded::external_process_handler(catdir($tmp, 'logs'), $command, $index); # ## The rest is auto-magical - 
PCAP::Threaded::touch_success(File::Spec->catdir($tmp, 'progress'), $index); + PCAP::Threaded::touch_success(catdir($tmp, 'progress'), $index); } return 1; } @@ -107,7 +135,7 @@ sub pindel { my @seqs = sort keys %{$options->{'seqs'}}; my @indicies = limited_indicies($options, $index_in, scalar @seqs); for my $index(@indicies) { - next if PCAP::Threaded::success_exists(File::Spec->catdir($tmp, 'progress'), $index); + next if PCAP::Threaded::success_exists(catdir($tmp, 'progress'), $index); my $seq = $seqs[$index-1]; ## build commands for this index @@ -115,11 +143,12 @@ sub pindel { my @command_set; + push @command_set, "set -o pipefail"; # was split - my $refs = File::Spec->catdir($tmp, 'refs'); + my $refs = catdir($tmp, 'refs'); make_path($refs) unless(-e $refs); - my $refseq_file = File::Spec->catfile($refs, "$seq.fa"); + my $refseq_file = catfile($refs, "$seq.fa"); my $split_comm = _which('samtools'); $split_comm .= sprintf $SAMTOOLS_FAIDX, $options->{'reference'}, @@ -129,16 +158,16 @@ sub pindel { push @command_set, $split_comm; # was filter - my $filter_out = File::Spec->catdir($tmp, 'filter'); + my $filter_out = catdir($tmp, 'filter'); make_path($filter_out) unless(-e $filter_out); - my $filtered_seq = File::Spec->catfile($filter_out, $seq); + my $filtered_seq = catfile($filter_out, $seq); # pindel - my $gen_out = File::Spec->catdir($tmp, 'pout'); + my $gen_out = catdir($tmp, 'pout'); make_path($gen_out) unless(-e $gen_out); - my ($bd_fh, $bd_file) = tempfile(File::Spec->catfile($tmp, 'pindel_db_XXXX'), UNLINK => 0); + my ($bd_fh, $bd_file) = tempfile(catfile($tmp, 'pindel_db_XXXX'), UNLINK => 0); close $bd_fh; unlink $filtered_seq if(-e $filtered_seq); @@ -158,11 +187,11 @@ sub pindel { $bd_file, 5; - PCAP::Threaded::external_process_handler(File::Spec->catdir($tmp, 'logs'), \@command_set, $index); + PCAP::Threaded::external_process_handler(catdir($tmp, 'logs'), \@command_set, $index); # a little cleanup for my $ext((qw(BP INV LI TD))) { - unlink 
File::Spec->catfile($gen_out, (join '_', $seq, $seq, $ext)); + unlink catfile($gen_out, (join '_', $seq, $seq, $ext)); } unlink $bd_file; unlink $refseq_file; @@ -171,7 +200,125 @@ sub pindel { # ## The rest is auto-magical - PCAP::Threaded::touch_success(File::Spec->catdir($tmp, 'progress'), $index); + PCAP::Threaded::touch_success(catdir($tmp, 'progress'), $index); + } + return 1; +} + +sub parse { + my ($options) = @_; + my $tmp = $options->{'tmp'}; + + my $pout = catdir($tmp, 'pout'); + my $vcf = catdir($tmp, 'vcf'); + make_path($vcf) unless(-e $vcf); + my $collated_vcf = catfile($vcf, 'raw.vcf'); + + unless(PCAP::Threaded::success_exists(catdir($tmp, 'progress'), 'tovcf')) { + my $bad_loci = q{}; + if($options->{badloci}) { + $bad_loci = sprintf q{ -b %s }, $options->{badloci}; + } + my $command = $^X.' '._which('pindelCohort_to_vcf.pl'); + $command .= sprintf $COHORT_2_VCF, + $options->{'reference'}, + catfile($pout, '%_%_%'), + $collated_vcf, + $bad_loci, + join(q{ }, @{$options->{hts_files}}); + PCAP::Threaded::external_process_handler(catdir($tmp, 'logs'), $command, 'tovcf'); + PCAP::Threaded::touch_success(catdir($tmp, 'progress'), 'tovcf'); + } + unless(PCAP::Threaded::success_exists(catdir($tmp, 'progress'), 'split')) { + #perl pindel_vcfSortNsplit.pl run_PD26988a/tmpPindel/vcf/raw.vcf 10000 tsrt + my $command = $^X.' 
'._which('pindel_vcfSortNsplit.pl'); + $command .= sprintf q{ %s %d %s}, + $collated_vcf, + $VCF_SPLIT_SIZE, + $vcf; + PCAP::Threaded::external_process_handler(catdir($tmp, 'logs'), $command, 'split'); + PCAP::Threaded::touch_success(catdir($tmp, 'progress'), 'split'); + } + return 1; +} + +sub concat { + my ($options) = @_; + my $tmp = $options->{'tmp'}; + + my $hts_input = $options->{'hts_files'}->[0]; + my $sample_name = (PCAP::Bam::sample_name($hts_input))[0]; + my $vcf_gz = catfile($options->{'outdir'}, sprintf('%s.pindel.vcf.gz', $sample_name)); + my $bam = catfile($options->{'outdir'}, sprintf('%s.pindel.bam', $sample_name)); + unless(PCAP::Threaded::success_exists(catdir($tmp, 'progress'), 'concat')) { + #vcf-concat blat_*/data.vcf + my $command = _which('vcf-concat'); + $command .= sprintf q{ %s | bgzip -c > %s}, + catfile($tmp, 'blat_*/data.vcf'), + $vcf_gz; + PCAP::Threaded::external_process_handler(catdir($tmp, 'logs'), ['set -o pipefail', $command], 'concat'); + PCAP::Threaded::touch_success(catdir($tmp, 'progress'), 'concat'); + } + unless(PCAP::Threaded::success_exists(catdir($tmp, 'progress'), 'tabix')) { + my $command = _which('tabix'); + $command .= sprintf q{ -f -p vcf %s}, $vcf_gz; + PCAP::Threaded::external_process_handler(catdir($tmp, 'logs'), $command, 'tabix'); + PCAP::Threaded::touch_success(catdir($tmp, 'progress'), 'tabix'); + } + # now deal with the sam files + unless(PCAP::Threaded::success_exists(catdir($tmp, 'progress'), 'calmd')) { + my $samtools = _which('samtools'); + my $command = sprintf q{rm -f %s && (%s view -H %s | grep -P '^@(HD|SQ)' && zgrep -hvP '^@(HD|SQ)' %s | sort | uniq) | %s sort -l 0 -T %s - | %s calmd -b - %s > %s}, + catfile($tmp, 'srt.????.bam'), # cleanup anything that may be have been left by previous run + $samtools, $hts_input, + catfile($tmp, (sprintf 'blat_*/%s.sam.gz', $sample_name)), + $samtools, catfile($tmp, 'srt'), + $samtools, $options->{'reference'}, $bam; + 
PCAP::Threaded::external_process_handler(catdir($tmp, 'logs'), ['set -o pipefail', $command], 'calmd'); + PCAP::Threaded::touch_success(catdir($tmp, 'progress'), 'calmd'); + } + unless(PCAP::Threaded::success_exists(catdir($tmp, 'progress'), 'index')) { + my $command = _which('samtools'); + $command .= sprintf q{ index %s}, $bam; + PCAP::Threaded::external_process_handler(catdir($tmp, 'logs'), $command, 'index'); + PCAP::Threaded::touch_success(catdir($tmp, 'progress'), 'index'); + } + return 1; +} + +sub split_files { + my $options = shift; + my $vcf = catdir($options->{'tmp'}, 'vcf'); + my $patt = catfile($vcf, 'split_*.vcf'); + my @files = sort glob $patt; + return \@files; +} + +sub blat { + my ($index_in, $options) = @_; + my $tmp = $options->{'tmp'}; + my $vcf = catdir($tmp, 'vcf'); + # -i run_PD26988a/tmpPindel/vcf/raw.vcf + + return 1 if(exists $options->{index} && $index_in != $options->{index}); + + my @split_files = @{$options->{split_files}}; + my @indicies = limited_indicies($options, $index_in, scalar @split_files); + for my $index(@indicies) { + next if PCAP::Threaded::success_exists(catdir($tmp, 'progress'), $index); + + my $split_file = $split_files[$index-1]; + my $blat_file = fileparse($split_file, '.vcf'); + $blat_file =~ s/split_([a-z]+)/blat_$1/; + my $command = $^X.' 
'._which('pindel_blat_vaf.pl'); + $command .= sprintf q{ -r %s -hts %s -i %s -o %s ; gzip -t %s/*.gz }, + $options->{reference}, + $options->{hts_files}->[0], + $split_file, + catfile($tmp, $blat_file), + catfile($tmp, $blat_file); + PCAP::Threaded::external_process_handler(catdir($tmp, 'logs'), $command, $index); + PCAP::Threaded::touch_success(catdir($tmp, 'progress'), $index); } return 1; } @@ -185,18 +332,18 @@ sub pindel_to_vcf { my @seqs = sort keys %{$options->{'seqs'}}; my @indicies = limited_indicies($options, $index_in, scalar @seqs); for my $index(@indicies) { - next if PCAP::Threaded::success_exists(File::Spec->catdir($tmp, 'progress'), $index); + next if PCAP::Threaded::success_exists(catdir($tmp, 'progress'), $index); my $seq = $seqs[$index-1]; - my $pout = File::Spec->catdir($tmp, 'pout'); + my $pout = catdir($tmp, 'pout'); my @in_files; for my $type(qw(D SI)) { - my $in_file = File::Spec->catfile($pout, $seq.'_'.$seq.'_'.$type); + my $in_file = catfile($pout, $seq.'_'.$seq.'_'.$type); push @in_files, $in_file if(-e $in_file && -f $in_file); } if(scalar @in_files > 0) { - my $vcf = File::Spec->catdir($tmp, 'vcf'); + my $vcf = catdir($tmp, 'vcf'); make_path($vcf) unless(-e $vcf); my $pg = Sanger::CGP::Pindel::OutputGen::BamUtil::pg_from_caller('pindel', 'cgpPindel indel detection', $VERSION, $options->{'cmd'}); @@ -206,8 +353,8 @@ sub pindel_to_vcf { $command .= sprintf $PIN_2_VCF, $options->{'tumour'}, $options->{'normal'}, $options->{'reference'}, - File::Spec->catfile($vcf, $seq.'_pindel.vcf'), - File::Spec->catfile($vcf, $seq.'_pindel'), + catfile($vcf, $seq.'_pindel.vcf'), + catfile($vcf, $seq.'_pindel'), $options->{'seqtype'}, $options->{'seqtype'}, $pg, @@ -217,13 +364,13 @@ sub pindel_to_vcf { $command .= ' -s' if(defined $options->{'skipgerm'}); $command .= ' -as '.$options->{'assembly'} if(defined $options->{'assembly'}); $command .= ' -sp '.$options->{'species'} if(defined $options->{'species'}); - 
PCAP::Threaded::external_process_handler(File::Spec->catdir($tmp, 'logs'), $command, $index); + PCAP::Threaded::external_process_handler(catdir($tmp, 'logs'), $command, $index); } # ## The rest is auto-magical - PCAP::Threaded::touch_success(File::Spec->catdir($tmp, 'progress'), $index); + PCAP::Threaded::touch_success(catdir($tmp, 'progress'), $index); } return 1; } @@ -232,10 +379,10 @@ sub merge_and_bam { my $options = shift; my $tmp = $options->{'tmp'}; - return 1 if PCAP::Threaded::success_exists(File::Spec->catdir($tmp, 'progress'), 0); + return 1 if PCAP::Threaded::success_exists(catdir($tmp, 'progress'), 0); - my $vcf = File::Spec->catdir($tmp, 'vcf'); - my $outstub = File::Spec->catfile($options->{'outdir'}, $options->{'tumour_name'}.'_vs_'.$options->{'normal_name'}); + my $vcf = catdir($tmp, 'vcf'); + my $outstub = catfile($options->{'outdir'}, $options->{'tumour_name'}.'_vs_'.$options->{'normal_name'}); my $command = "$^X "; $command .= _which('pindel_merge_vcf_bam.pl'); $command .= sprintf $PIN_MERGE, $outstub, $vcf, $options->{'reference'}; @@ -246,27 +393,27 @@ sub merge_and_bam { $command .= ' -s'; } - PCAP::Threaded::external_process_handler(File::Spec->catdir($tmp, 'logs'), $command, 0); + PCAP::Threaded::external_process_handler(catdir($tmp, 'logs'), $command, 0); - PCAP::Threaded::touch_success(File::Spec->catdir($tmp, 'progress'), 0); + PCAP::Threaded::touch_success(catdir($tmp, 'progress'), 0); } sub flag { my $options = shift; my $tmp = $options->{'tmp'}; - return 1 if PCAP::Threaded::success_exists(File::Spec->catdir($tmp, 'progress'), 0); + return 1 if PCAP::Threaded::success_exists(catdir($tmp, 'progress'), 0); -# FlagVcf.pl -# -r ~kr2/GitHub/cgpPindel/perl/rules/genomicRules.lst -# -sr ~kr2/GitHub/cgpPindel/perl/rules/softRules.lst -# -a /lustre/scratch112/sanger/cgppipe/nst_pipe/test_ref/human/37/e58/vagrent/codingexon_regions.indel.bed.gz -# -u /lustre/scratch112/sanger/kr2/pan_cancer_test_sets/pindel_np_gen/huge_file.gff3.gz -# -s 
/lustre/scratch112/sanger/cgppipe/nst_pipe/test_ref/human/37/gsm_reference_repeat.gff.gz -# -i pindel_farm/PD13371a_vs_PD13371b.vcf.gz -# -o pindel_farm/PD13371a_vs_PD13371b.flag_new_np.github.vcf + # FlagVcf.pl + # -r ~kr2/GitHub/cgpPindel/perl/rules/genomicRules.lst + # -sr ~kr2/GitHub/cgpPindel/perl/rules/softRules.lst + # -a /lustre/scratch112/sanger/cgppipe/nst_pipe/test_ref/human/37/e58/vagrent/codingexon_regions.indel.bed.gz + # -u /lustre/scratch112/sanger/kr2/pan_cancer_test_sets/pindel_np_gen/huge_file.gff3.gz + # -s /lustre/scratch112/sanger/cgppipe/nst_pipe/test_ref/human/37/gsm_reference_repeat.gff.gz + # -i pindel_farm/PD13371a_vs_PD13371b.vcf.gz + # -o pindel_farm/PD13371a_vs_PD13371b.flag_new_np.github.vcf - my $stub = File::Spec->catfile($options->{'outdir'}, $options->{'tumour_name'}.'_vs_'.$options->{'normal_name'}); + my $stub = catfile($options->{'outdir'}, $options->{'tumour_name'}.'_vs_'.$options->{'normal_name'}); my $new_vcf = "$stub.flagged.vcf"; my $command = "$^X "; @@ -295,11 +442,11 @@ sub flag { $germ .= sprintf $PIN_GERM, find_germline_rule($options), $vcf_gz, $germ_bed; push @commands, $germ; - PCAP::Threaded::external_process_handler(File::Spec->catdir($tmp, 'logs'), \@commands, 0); + PCAP::Threaded::external_process_handler(catdir($tmp, 'logs'), \@commands, 0); unlink $new_vcf; - PCAP::Threaded::touch_success(File::Spec->catdir($tmp, 'progress'), 0); + PCAP::Threaded::touch_success(catdir($tmp, 'progress'), 0); } sub find_germline_rule { @@ -362,13 +509,23 @@ sub determine_jobs { my $tmp = $options->{'tmp'}; my @valid_seqs = valid_seqs($options); my %seqs; - for my $in_bam($options->{'tumour'}, $options->{'normal'}) { - my $samp_path = File::Spec->catdir($tmp, sanitised_sample_from_bam($in_bam)); + my @samples; + if(exists $options->{'tumour'} && exists $options->{'normal'}) { + push @samples, $options->{'tumour'}, $options->{'normal'}; + } + elsif(exists $options->{'hts_files'}) { + @samples = @{$options->{'hts_files'}}; + } + 
else { + die "ERROR: Unexpected combination of BAM/CRAM inputs"; + } + for my $in_bam(@samples) { + my $samp_path = catdir($tmp, sanitised_sample_from_bam($in_bam)); my @files = file_list($samp_path, qr/\.txt(:?\.gz)$/); for my $file(@files) { my ($seq) = $file =~ m/(.+)\.txt(:?\.gz)$/; if(first { $seq eq $_ } @valid_seqs) { - push @{$seqs{$seq}}, File::Spec->catfile($samp_path, $file); + push @{$seqs{$seq}}, catfile($samp_path, $file); } } } @@ -433,11 +590,247 @@ sub fragmented_files { sub _which { my $prog = shift; my $l_bin = $Bin; - my $path = File::Spec->catfile($l_bin, $prog); + my $path = catfile($l_bin, $prog); $path = which($prog) unless(-e $path); return $path; } +sub shared_setup { + my ($are_paths, $extra_opts) = @_; + my $script_name = basename($0); + my %opts; + pod2usage(-msg => "\nERROR: Option must be defined.\n", -verbose => 1, -output => \*STDERR) if(scalar @ARGV == 0); + $opts{'cmd'} = join " ", $script_name, @ARGV; + my %load_opts = ( + 'h|help' => \$opts{'h'}, + 'm|man' => \$opts{'m'}, + 'c|cpus=i' => \$opts{'threads'}, + 'r|reference=s' => \$opts{'reference'}, + 'o|outdir=s' => \$opts{'outdir'}, + 'e|exclude=s' => \$opts{'exclude'}, + 'b|badloci=s' => \$opts{'badloci'}, + 'p|process=s' => \$opts{'process'}, + 'i|index=i' => \$opts{'index'}, + 'v|version' => \$opts{'version'}, + # these are specifically for pin2vcf + 'sp|species=s{0,}' => \@{$opts{'species'}}, + 'as|assembly=s' => \$opts{'assembly'}, + 'st|seqtype=s' => \$opts{'seqtype'}, + 'sg|skipgerm' => \$opts{'skipgerm'}, + # specifically for FlagVCF + 's|simrep=s' => \$opts{'simrep'}, + 'f|filters=s' => \$opts{'filters'}, + 'g|genes=s' => \$opts{'genes'}, + 'u|unmatched=s' => \$opts{'unmatched'}, + 'sf|softfil=s' => \$opts{'softfil'}, + 'l|limit=i' => \$opts{'limit'}, + 'd|debug' => \$opts{'debug'}, + 'a|apid:s' => \$opts{'apid'}, + # specifically for cohort + 'pad:f' => \$opts{'pad'}, + ); + for my $opt_key(keys %{$extra_opts}) { + my $v = $extra_opts->{$opt_key}; + 
$load_opts{$opt_key} = \$opts{$v}; + } + GetOptions(%load_opts) or pod2usage(2); + + pod2usage(-verbose => 1) if(defined $opts{'h'}); + pod2usage(-verbose => 2) if(defined $opts{'m'}); + + if($opts{'version'}) { + print 'pindel version: ',Sanger::CGP::Pindel::Implement->VERSION,"\n"; + print 'cpindel version: ',$COHORT_VERSION,"\n"; + exit 0; + } + + if($script_name eq 'pindel.pl') { + PCAP::Cli::file_for_reading('simrep', $opts{'simrep'}); + PCAP::Cli::file_for_reading('filters', $opts{'filters'}); + PCAP::Cli::file_for_reading('genes', $opts{'genes'}); + PCAP::Cli::file_for_reading('unmatched', $opts{'unmatched'}); + PCAP::Cli::file_for_reading('softfil', $opts{'softfil'}) if(defined $opts{'softfil'}); + } + + PCAP::Cli::file_for_reading('reference', $opts{'reference'}); + PCAP::Cli::out_dir_check('outdir', $opts{'outdir'}); + + my $final_logs = catdir($opts{'outdir'}, 'logs'); + if(-e $final_logs) { + warn "NOTE: Presence of '$final_logs' directory suggests successful complete analysis, please delete to rerun\n"; + exit 0; + } + + delete $opts{'process'} unless(defined $opts{'process'}); + delete $opts{'index'} unless(defined $opts{'index'}); + delete $opts{'limit'} unless(defined $opts{'limit'}); + delete $opts{'exclude'} unless(defined $opts{'exclude'}); + delete $opts{'badloci'} unless(defined $opts{'badloci'}); + delete $opts{'apid'} unless(defined $opts{'apid'}); + + # now safe to apply defaults + $opts{'threads'} = 1 unless(defined $opts{'threads'}); + $opts{'seqtype'} = 'WGS' unless(defined $opts{'seqtype'}); + + # make all things that appear to be paths complete (absolute not great if BAM/BAI in different locations) + for my $key (keys %opts) { + next unless( first {$key eq $_} (qw(reference outdir badloci simrep filters genes unmatched softfil), @{$are_paths}) ); + $opts{$key} = cwd().'/'.$opts{$key} if(defined $opts{$key} && -e $opts{$key} && $opts{$key} !~ m/^\//); + } + + my $tmpdir = catdir($opts{'outdir'}, 'tmpPindel'); + make_path($tmpdir) 
unless(-d $tmpdir); + my $progress = catdir($tmpdir, 'progress'); + make_path($progress) unless(-d $progress); + my $logs = catdir($tmpdir, 'logs'); + make_path($logs) unless(-d $logs); + + $opts{'tmp'} = $tmpdir; + + if(scalar @{$opts{'species'}} > 0 ){ + $opts{'species'}="@{$opts{'species'}}"; + } + else { + delete $opts{'species'}; + } + + return \%opts; +} + +sub cohort_files { + my $opts = shift; + my @files; + my %uniq_chk; + for my $candidate(sort @ARGV) { + if(-e $candidate && -s _ && $candidate =~ m/[.](bam|cram)$/) { + $candidate = cwd().'/'.$candidate if($candidate !~ m{^/}); + if(exists $uniq_chk{$candidate}) { + die "Same file defined multiple times: $candidate"; + } + $uniq_chk{$candidate} = 1; + push @files, $candidate; + } + } + if(@files == 0) { + die "ERROR: Failed to find list of bam/cram files following arguments"; + } + for my $hts(@files) { + if(!-e "$hts.bai" && !-e "$hts.csi" && !-e "$hts.crai") { + die "ERROR: Failed to identify appropriate bam/cram index for $hts"; + } + unless(-e "$hts.bas") { + die "ERROR: Failed to identify appropriate *.bas file for $hts"; + } + } + $opts->{'hts_files'} = \@files; + return 1; +} + +sub cohort_split { + my $options = shift; + + my $tmp = $options->{'tmp'}; + return 1 if PCAP::Threaded::success_exists(catdir($tmp, 'progress'), 0); + + my $command = $^X.' 
'._which('pindelCohortVafSplit.pl'); + $command .= sprintf ' -i %s -o %s -s %d', $options->{input}, $options->{split_dir}, $options->{size}; + + PCAP::Threaded::external_process_handler(catdir($tmp, 'logs'), $command, 0); + + PCAP::Threaded::touch_success(catdir($tmp, 'progress'), 0); +} + +sub fill_split_vaf { + my ($index_in, $options) = @_; + my $tmp = $options->{tmp}; + + return 1 if(exists $options->{index} && $index_in != $options->{index}); + + my @split_files = @{$options->{split_files}}; + + my @indicies = limited_indicies($options, $index_in, scalar @split_files); + for my $index(@indicies) { + next if PCAP::Threaded::success_exists(catdir($tmp, 'progress'), $index); + my $split_file = $split_files[$index-1]; + my $fill_basename = fileparse($split_file, '.vcf.gz'); + my $fill_dir = catdir($options->{fill_dir}, $fill_basename); + my $command = $^X.' '._which('pindelCohortVafSliceFill.pl'); + $command .= sprintf ' -r %s -i %s -o %s -d %s', $options->{ref}, $split_file, $fill_dir, $options->{bwa_file_list}; + if(defined $options->{'simple'}) { + $command .= sprintf ' -sr %s', $options->{'simple'}; + } + + PCAP::Threaded::external_process_handler(catdir($tmp, 'logs'), $command, $index); + PCAP::Threaded::touch_success(catdir($tmp, 'progress'), $index); + } +} + +sub merge_vaf_bams { + my ($index_in, $options) = @_; + my $tmp = $options->{'tmp'}; + + return 1 if(exists $options->{'index'} && $index_in != $options->{'index'}); + my @indicies = limited_indicies($options, $index_in, scalar @{$options->{secondary_hts}}); + for my $index(@indicies) { + next if PCAP::Threaded::success_exists(catdir($tmp, 'progress'), $index); + my $arr_idx = $index-1; + my $sample = sanitised_sample_from_bam($options->{primary_hts}->[$arr_idx]); + my $sort_tmp = catdir($options->{'tmp'}, sprintf 'sort_tmp_%d', $index); + make_path($sort_tmp); + my $in_file_list = catfile($sort_tmp, 'merge_files'); + + my @split_bams; + for my $f(glob(catfile($options->{'fill_dir'}, '*', sprintf 
'%s.bam', $sample))) { + push @split_bams, $f if($f =~ m{/\d+/$sample\.bam$}); + } + # list of bams to merge + lines_to_file($in_file_list, [$options->{secondary_hts}->[$arr_idx], @split_bams]); + my $merged_bam = catfile($options->{output}, sprintf('%s.vaf.bam', $sample)); + my $sort_prefix = catfile($sort_tmp, 'samsort'); + my $sort_cleanup = sprintf 'rm -rf %s.*.bam', $sort_prefix; + my $sam_merge = _which('samtools'); + # needs to attempt to clean up duplicate reads + $sam_merge .= sprintf q{ merge --output-fmt SAM -b %s - | pee 'grep ^@' 'grep -v ^@ | sort -S 2G -T %s | uniq' | samtools view -u - | samtools sort -m 2G -T %s -o %s -}, + $in_file_list, # merge filelist + $sort_tmp, # sort tmptfile + $sort_prefix, + $merged_bam; # outfile + my $sam_idx = _which('samtools'); + $sam_idx .= sprintf q{ index %s}, $merged_bam; + + PCAP::Threaded::external_process_handler(catdir($tmp, 'logs'), [$sort_cleanup, $sam_merge, $sam_idx], $index); + remove_tree($sort_tmp); + PCAP::Threaded::touch_success(catdir($tmp, 'progress'), $index); + } +} + +sub lines_to_file { + my($file_name, $list_of_str) = @_; + open my $fh, '>', $file_name; + print $fh join "\n", @{$list_of_str}; + print $fh "\n"; # to make valid file + close $fh; +} + +sub fill_vcf_merge { + my $options = shift; + my $tmp = $options->{'tmp'}; + return if PCAP::Threaded::success_exists(catdir($tmp, 'progress'), 0); + my @split_filled_vcf; + for my $f(glob(catfile($options->{fill_dir}, '*', 'slice.vcf.gz'))) { + push @split_filled_vcf, $f if($f =~ m{/\d+/slice\.vcf\.gz$}); + } + my $complete_recs = catfile($options->{split_dir}, 'complete_rec.vaf.vcf.gz'); + my $final_vcf = catfile($options->{output}, sprintf '%s.vaf.vcf.gz', $options->{name}); + my $merge = sprintf q{(zgrep '^#' %s ; zgrep -vh '^#' %s | sort -k1,1 -k2,2n -k 4,4 -k5,5) | bgzip -c > %s}, + $complete_recs, join(q{ } , @split_filled_vcf, $complete_recs), + $final_vcf; + my $tabix = sprintf q{tabix -fp vcf %s}, $final_vcf; + + 
PCAP::Threaded::external_process_handler(catdir($tmp, 'logs'), [$merge, $tabix], 0); + PCAP::Threaded::touch_success(catdir($tmp, 'progress'), 0); +} + 1; __END__ diff --git a/perl/lib/Sanger/CGP/Pindel/InputGen.pm b/perl/lib/Sanger/CGP/Pindel/InputGen.pm index 191b8f2..b719abf 100644 --- a/perl/lib/Sanger/CGP/Pindel/InputGen.pm +++ b/perl/lib/Sanger/CGP/Pindel/InputGen.pm @@ -229,7 +229,7 @@ sub _process_set { } } -sub _tabix_to_interval_tree { +sub tabix_to_interval_tree { my $bed = shift; my %tree; my $z = IO::Uncompress::Gunzip->new($bed, MultiStream => 1) or die "gunzip failed: $GunzipError\n"; @@ -270,7 +270,7 @@ sub reads_to_pindel { my $tabix; if(defined $bed) { # was tabix, keeping name for consistency - $tabix = _tabix_to_interval_tree($bed); + $tabix = tabix_to_interval_tree($bed); } @reads = @{$reads[0]} if(ref $reads[0] eq 'ARRAY'); diff --git a/perl/lib/Sanger/CGP/Pindel/InputGen/Pair.pm b/perl/lib/Sanger/CGP/Pindel/InputGen/Pair.pm index cff68ca..59c7f4d 100644 --- a/perl/lib/Sanger/CGP/Pindel/InputGen/Pair.pm +++ b/perl/lib/Sanger/CGP/Pindel/InputGen/Pair.pm @@ -54,33 +54,34 @@ sub new { sub exact { my $self = shift; - return 1 if($self->{'r1'}->exact && $self->{'r2'}->exact); + # r2 least likely to be exact so test it first, short-circuit by not testing r1 if not exact + return 1 if($self->{'r2'}->exact && $self->{'r1'}->exact); return 0; } sub unmapped_pair { my $self = shift; + # r1 least likely to be unmapped so test it first, short-circuit by not testing r2 if unmapped return 1 if($self->{'r1'}->unmapped && $self->{'r2'}->unmapped); return 0; } sub qcfailed_pair { my $self = shift; + # r1 least likely to be qc_fail so test it first, short-circuit by not testing r2 if qc_fail return 1 if($self->{'r1'}->qc_failed && $self->{'r2'}->qc_failed); return 0; } sub has_good_anchor { my $self = shift; - my $r2_state = $self->{'r2'}->good_anchor; # to ensure both fully populates - return 1 if($self->{'r1'}->good_anchor || $r2_state); + return 1 
if($self->{'r1'}->good_anchor || $self->{'r2'}->good_anchor); return 0; } sub has_good_query { my $self = shift; - my $r2_state = $self->{'r2'}->good_query; # to ensure both fully populates - return 1 if($self->{'r1'}->good_query || $r2_state); + return 1 if($self->{'r1'}->good_query || $self->{'r2'}->good_query); return 0; } diff --git a/perl/lib/Sanger/CGP/Pindel/InputGen/Read.pm b/perl/lib/Sanger/CGP/Pindel/InputGen/Read.pm index 58e3d73..a1d9da7 100644 --- a/perl/lib/Sanger/CGP/Pindel/InputGen/Read.pm +++ b/perl/lib/Sanger/CGP/Pindel/InputGen/Read.pm @@ -62,11 +62,9 @@ const my $MAX_CIGAR_OPS_FOR_ANCHOR => 7*2; #cigar operations array both elements sub new { my ($class, $sam, $end, $tabix) = @_; -# my @elements = (split /\t/, ${$sam})[0,1,2,3,4,5,8,9,10]; my ($qname, $flag, $rname, $pos, $mapq, $cigar, $seq, $qual) = (split /\t/, ${$sam})[0,1,2,3,4,5,9,10]; # just clean this up as it is of no use $cigar =~ s/[[:digit:]]+H//g if(index($cigar, 'H') != -1); - my ($rg) = ${$sam} =~ m/\tRG:Z:([^\t]+)/; my $self = {'qname' => $qname, 'flag' => int $flag, 'rname' => $rname, @@ -76,7 +74,7 @@ sub new { 'seq' => $seq, 'qual' => $qual, 'end' => int $end, - 'rg' => defined $rg ? $rg : '.', + 'rg' => ${$sam} =~ m/\tRG:Z:([^\t]+)/ ? 
$1 : q{.}, }; $self->{'tabix'} = $tabix if(defined $tabix); bless $self, $class; diff --git a/perl/lib/Sanger/CGP/Pindel/OutputGen/PindelRecord.pm b/perl/lib/Sanger/CGP/Pindel/OutputGen/PindelRecord.pm index c085646..bbac5ea 100644 --- a/perl/lib/Sanger/CGP/Pindel/OutputGen/PindelRecord.pm +++ b/perl/lib/Sanger/CGP/Pindel/OutputGen/PindelRecord.pm @@ -62,6 +62,8 @@ sub new{ _repeats => $args{'-repeats'}, _num_samples => $args{'-num_samples'}, _sample_contrib => $args{'-sample_contrib'}, + _ref_left => $args{'-ref_left'}, + _ref_right => $args{'-ref_right'}, }; bless $self, $class; return $self; @@ -134,6 +136,14 @@ sub get_reads{ return $self->{_reads}->{$sample_name}->{$strand}; } +sub get_read_counts{ + my($self,$sample_name,$strand) = @_; + if(exists $self->{_reads}->{$sample_name}->{$strand}) { + return scalar @{$self->{_reads}->{$sample_name}->{$strand}}; + } + return 0; +} + =head samples Returns an array of UNORDERED sample names asscociated with the record. @@ -185,6 +195,18 @@ sub min_change{ return $self->{_min_change}; } +sub ref_left { + my($self,$value) = @_; + $self->{_ref_left} = $value if defined $value; + return $self->{_ref_left}; +} + +sub ref_right { + my($self,$value) = @_; + $self->{_ref_right} = $value if defined $value; + return $self->{_ref_right}; +} + sub lub{ my($self,$value) = @_; $self->{_lub} = $value if defined $value; @@ -244,3 +266,21 @@ sub repeats{ $self->{_repeats} = $value if defined $value; return $self->{_repeats}; } + +sub gc_5p{ + my($self,$value) = @_; + $self->{_gc_5p} = $value if defined $value; + return $self->{_gc_5p}; +} + +sub gc_3p{ + my($self,$value) = @_; + $self->{_gc_3p} = $value if defined $value; + return $self->{_gc_3p}; +} + +sub gc_rng{ + my($self,$value) = @_; + $self->{_gc_rng} = $value if defined $value; + return $self->{_gc_rng}; +} diff --git a/perl/lib/Sanger/CGP/Pindel/OutputGen/PindelRecordParser.pm b/perl/lib/Sanger/CGP/Pindel/OutputGen/PindelRecordParser.pm index 49c5dd3..57d725f 100644 --- 
a/perl/lib/Sanger/CGP/Pindel/OutputGen/PindelRecordParser.pm +++ b/perl/lib/Sanger/CGP/Pindel/OutputGen/PindelRecordParser.pm @@ -64,6 +64,7 @@ sub init{ $self->{_fh} = $fh; $self->{_fai} = $args{-fai}; + $self->{_noreads} = $args{-noreads} || 0; ## clear the first line of ##+.... @@ -133,6 +134,8 @@ sub _process_record{ _parse_header_v02($record,$record_header); } + + my $alignments = []; # Collect all the reads... @@ -145,6 +148,30 @@ sub _process_record{ $self->_parse_alignment($record, $alignments, \$ref_line); + # this is where we can add additional information about GC content + my $chr = $record->chro; + my $lhs_end = $record->range_start; + my $rhs_start = $record->range_end; + my $fai = $self->{_fai}; + my $r_type = $record->type; + + my $seq = $fai->fetch(sprintf '%s:%d-%d', $chr, $lhs_end - 199, $lhs_end); + my $lhs_gc = ($seq =~ tr/GCgc//)/200; + + $seq = $fai->fetch(sprintf '%s:%d-%d', $chr, $rhs_start, $rhs_start + 199); + my $rhs_gc = ($seq =~ tr/GCgc//)/200; + my $ref_tmp; + if($record->type eq 'D'){ + $ref_tmp = $record->ref_seq; + } + else { + $ref_tmp = $record->alt_seq; + } + my $rng_gc = ($ref_tmp =~ tr/GCgc//) / length $ref_tmp; + $record->gc_5p($lhs_gc); + $record->gc_3p($rhs_gc); + $record->gc_rng($rng_gc); + return $record; } @@ -307,6 +334,9 @@ sub _parse_alignment { my ($ref_left, $ref_change, $ref_right) = ($$ref_line =~ m/([A-Z]+)(\s+|[a-z]+|[a-z]+.*[a-z]+)([A-Z]+)/); + $record->ref_left($ref_left); + $record->ref_right($ref_right); + my $change_ref_offset = length $ref_left; my $change_ref_offset_end = $change_ref_offset + length $ref_change; my $record_type = $record->type(); @@ -360,7 +390,7 @@ sub _parse_alignment { foreach my $read(@{$alignment}) { $read =~ s/ ([+-])/\t$1/ if($record_type eq 'D'); ## correction for a bug in the pindel output layout.... 
this is done here to allow use to pass a ref of the read into _parse_read - _parse_read($record, $chr, $start_pos, \$read, $ref_seq_length, ($read_num++.$record_idx), $change_ref_offset, $change_ref_offset_end,\$_buffer_region,$_buffer_region_start); + _parse_read($record, $chr, $start_pos, \$read, $ref_seq_length, ($read_num++.$record_idx), $change_ref_offset, $change_ref_offset_end,\$_buffer_region,$_buffer_region_start, $self->{_noreads}); } ## This is not strictly read from the pindel input but is useful for woring out the number of repeats within the repeat-range. @@ -518,82 +548,81 @@ between bwa and pindel. is used to grab variant sequence from the read string. =cut sub _parse_read { - my ($record, $chr, $start_pos, $read, $ref_seq_length, $read_idx, $change_ref_start, $change_ref_end, $_buffer_region, $_buffer_region_start) = @_; + my ($record, $chr, $start_pos, $read, $ref_seq_length, $read_idx, $change_ref_start, $change_ref_end, $_buffer_region, $_buffer_region_start, $no_read_data) = @_; + $no_read_data ||= 0; my @bits = split /\t+/, ${$read}; - - ## This is a custom read name component added to the read name when it is put into Pindel. - ## As pindel currently does not preserve the read group, if we want to identify read group - ## specific errors we need to track the read groups from the reads.... 
- my ($read_group) = $bits[-1] =~ /\/[12]_RG(.+)$/; - $read_group = '' unless $read_group; - - my ($name, $rg_pair) = split /\//, $bits[-1]; - $name = substr($name,1) if substr($name,0,1) eq '@'; - - # need this to force uniqness in reads that have multiple events - # and make display in gbrowse work for overlapping reads - $name .= '_r'.$read_idx; - my $sample = $bits[-2]; - my $mapq = $bits[-3]; # mapq of anchor read, sensible to use this in the output - my $strand = $bits[-5]; # will need to be inverted as is the strand of the anchor - - # locate the left and right parts of the read string - my $read_seq = $bits[0]; - my $read_left = substr($read_seq, 0, $change_ref_start); - my $read_right = substr($read_seq, $change_ref_end); - my $event = substr($read_seq, $change_ref_start,$change_ref_end - $change_ref_start); - - ## we do this so that we can efficiently strip the space characters from the read components... - my $left_seq_length = $read_left =~ tr/ATCGN/ATCGN/;## the tr simply counts the number of atcgs in the string... v-efficient - my $right_seq_length = $read_right =~ tr/ATCGN/ATCGN/;## the tr simply counts the number of atcgs in the string... v-efficient - my $event_seq_length = $event =~ tr/ATCGN/ATCGN/; - my $event_length = length $event; - - ## create a read sequence without any spaces. This is MUCH MUCH faster than using s///. - my $space_stripped_read_seq = substr($read_left, (length($read_left) - $left_seq_length), $change_ref_start); - $space_stripped_read_seq .= $event if $event_seq_length; - $space_stripped_read_seq .= substr($read_right, 0,$right_seq_length); - - #$read_left =~ s/^ +//; - #$read_right =~ s/ //g; - #$read_seq =~ s/ //g; - - $start_pos -= $left_seq_length ; - - # Some data have -ve position starts so need to be corrected. i.e. the position starts before the beginning of the reference. - # This occurs with species like Devil that map to shattered contig sequences. 
- if($start_pos < 1) { - my $corr_size = (abs $start_pos)+1; - #$read_seq = substr($read_seq, $corr_size); - #$read_left = substr($read_left, $corr_size); - $read_seq = substr($space_stripped_read_seq, $corr_size); - $read_left = substr(substr($read_left, (length($read_left) - $left_seq_length),$change_ref_start), $corr_size); - - $left_seq_length = $read_left =~ tr/ATCGN/ATCGN/; - $start_pos = 1; - } + my $strand = $bits[-5]; + $strand =~ tr/\+\-/\-\+/; # need to invert as is strand of anchor -## Keep track of all the reads associated with a variant. -## These bits are for pindel_bam creation. -## These bam files only contain the reads identified from within pindel as having a variant -## These bam files are used in things like gbrowse/jbrowse for display + my $read_data; + if($no_read_data == 0) { + ## This is a custom read name component added to the read name when it is put into Pindel. + ## As pindel currently does not preserve the read group, if we want to identify read group + ## specific errors we need to track the read groups from the reads.... + my ($read_group) = $bits[-1] =~ /\/[12]_RG(.+)$/; + $read_group = '' unless $read_group; + + my ($name, $rg_pair) = split /\//, $bits[-1]; + $name = substr($name,1) if substr($name,0,1) eq '@'; + + # need this to force uniqness in reads that have multiple events + # and make display in gbrowse work for overlapping reads + $name .= '_r'.$read_idx; + + my $mapq = $bits[-3]; # mapq of anchor read, sensible to use this in the output + + # locate the left and right parts of the read string + my $read_seq = $bits[0]; + my $read_left = substr($read_seq, 0, $change_ref_start); + my $read_right = substr($read_seq, $change_ref_end); + my $event = substr($read_seq, $change_ref_start,$change_ref_end - $change_ref_start); + + ## we do this so that we can efficiently strip the space characters from the read components... 
+ my $left_seq_length = $read_left =~ tr/ATCGN/ATCGN/;## the tr simply counts the number of atcgs in the string... v-efficient + my $right_seq_length = $read_right =~ tr/ATCGN/ATCGN/;## the tr simply counts the number of atcgs in the string... v-efficient + my $event_seq_length = $event =~ tr/ATCGN/ATCGN/; + my $event_length = length $event; + + ## create a read sequence without any spaces. This is MUCH MUCH faster than using s///. + my $space_stripped_read_seq = substr($read_left, (length($read_left) - $left_seq_length), $change_ref_start); + $space_stripped_read_seq .= $event if $event_seq_length; + $space_stripped_read_seq .= substr($read_right, 0,$right_seq_length); + + $start_pos -= $left_seq_length ; + + # Some data have -ve position starts so need to be corrected. i.e. the position starts before the beginning of the reference. + # This occurs with species like Devil that map to shattered contig sequences. + if($start_pos < 1) { + my $corr_size = (abs $start_pos)+1; + $read_seq = substr($space_stripped_read_seq, $corr_size); + $read_left = substr(substr($read_left, (length($read_left) - $left_seq_length),$change_ref_start), $corr_size); + + $left_seq_length = $read_left =~ tr/ATCGN/ATCGN/; + $start_pos = 1; + } - my @cig_list = ($left_seq_length, 'M'); - push @cig_list, $ref_seq_length, 'D' if $ref_seq_length; - push @cig_list, $event_seq_length, 'I' if $event_seq_length; - push @cig_list, $right_seq_length, 'M'; + ## Keep track of all the reads associated with a variant. + ## These bits are for pindel_bam creation. + ## These bam files only contain the reads identified from within pindel as having a variant + ## These bam files are used in things like gbrowse/jbrowse for display - $strand =~ tr/\+\-/\-\+/; # need to invert as is strand of anchor - my $flag = $strand eq '-' ? 
16 : 0; # previously 1+8 but as not really paired anymore - my @tags = calmd($chr, $start_pos, \@cig_list, \$space_stripped_read_seq, $_buffer_region, $_buffer_region_start); + my @cig_list = ($left_seq_length, 'M'); + push @cig_list, $ref_seq_length, 'D' if $ref_seq_length; + push @cig_list, $event_seq_length, 'I' if $event_seq_length; + push @cig_list, $right_seq_length, 'M'; - pop @tags; # we dont need the last value.. at the moment... - unshift @tags, "RG:Z:$read_group" if($read_group); + my $flag = $strand eq '-' ? 16 : 0; # previously 1+8 but as not really paired anymore + my @tags = calmd($chr, $start_pos, \@cig_list, \$space_stripped_read_seq, $_buffer_region, $_buffer_region_start); + + pop @tags; # we dont need the last value.. at the moment... + unshift @tags, "RG:Z:$read_group" if($read_group); + $read_data = [$name,$flag,$chr,$start_pos,$mapq,join(q{},@cig_list),'*','0','0',$space_stripped_read_seq,'*',@tags]; + } # Create a basic sam line for the read and add it to the record. - $record->add_read($sample,$strand,[$name,$flag,$chr,$start_pos,$mapq,join(q{},@cig_list),'*','0','0',$space_stripped_read_seq,'*',@tags]); + $record->add_read($sample,$strand,$read_data); return 1; } diff --git a/perl/lib/Sanger/CGP/Pindel/OutputGen/VcfBlatAugment.pm b/perl/lib/Sanger/CGP/Pindel/OutputGen/VcfBlatAugment.pm new file mode 100644 index 0000000..138022d --- /dev/null +++ b/perl/lib/Sanger/CGP/Pindel/OutputGen/VcfBlatAugment.pm @@ -0,0 +1,793 @@ +# Copyright (c) 2014-2021 Genome Research Ltd +# +# Author: CASM/Cancer IT +# +# This file is part of cgpPindel. +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. 
+# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . +# +# 1. The usage of a range of years within a copyright statement contained within +# this distribution should be interpreted as being equivalent to a list of years +# including the first and last year specified and all consecutive years between +# them. For example, a copyright statement that reads ‘Copyright (c) 2005, 2007- +# 2009, 2011-2012’ should be interpreted as being identical to a statement that +# reads ‘Copyright (c) 2005, 2007, 2008, 2009, 2011, 2012’ and a copyright +# statement that reads ‘Copyright (c) 2005-2012’ should be interpreted as being +# identical to a statement that reads ‘Copyright (c) 2005, 2006, 2007, 2008, +# 2009, 2010, 2011, 2012’. 
+# +package Sanger::CGP::Pindel::OutputGen::VcfBlatAugment; +use strict; +use warnings FATAL => 'all'; +use autodie qw(:all); +use Capture::Tiny qw(capture); +use Const::Fast qw(const); +use File::Basename; +use File::Path qw(remove_tree); +use File::Spec::Functions; +use File::Temp qw(tempfile tempdir); +use IO::Compress::Gzip qw(:constants gzip $GzipError); +use List::Util qw(min max); +use Set::IntervalTree; +use IO::Uncompress::Gunzip qw(gunzip $GunzipError); + +use Bio::DB::HTS; +use Bio::DB::HTS::Faidx; +use Vcf; + +use PCAP::Bam::Bas; +use Sanger::CGP::Pindel; +use Sanger::CGP::PindelPostProcessing::VcfSoftFlagger; +use Sanger::CGP::Vcf::VcfProcessLog; +use Sanger::CGP::Vcf::VcfUtil; + +const my $SD_MULT => 2; +const my $V_INFO => 7; +const my $V_FMT => 8; +const my $V_GT_START => 9; +const my $READS_ONLY => q{bash -c 'set -o pipefail; (samtools view -H %s && samtools view -F 3840 %s %s | sort | uniq) | samtools fasta - > %s'}; +const my $BLAT_ONLY => q{bash -c 'blat -t=dna -q=dna -noTrimA -minIdentity=95 -noHead -out=psl %s %s %s && pslPretty -long -axt %s %s %s /dev/stdout'}; +# returns +ve and then -ve results +const my $LOCI_FMT => '%s:%d-%d'; +const my $PAD_EVENT => 3; +const my $LARGE_D => 100; +const my $MAPPED_RL_MULT => 0.9; + +1; + +=head new + +Basic code template for use: + + $augment = Sanger::CGP::Pindel::OutputGen::VcfBlatAugment->new( + input => $options->{input}, + ref => $options->{ref}, + ofh => $options->{output}, + sam => $options->{align}, + hts_files => $options->{hts}, + outpath => $options->{outpath}, + debug => $options->{debug}, + ); + $augment->output_header; + $augment->process_records; + +=cut + +sub new{ + my $proto = shift; + my (%args) = @_; + my $class = ref($proto) || $proto; + + my $self = { + input => $args{input}, + ref => $args{ref}, + ofh => $args{ofh}, # vcf + hts_files => $args{hts_files}, + fill_in => $args{fill_in}, + debug => $args{debug} || 0, + rpts => $args{simple_rpt}, + }; + bless $self, $class; + + 
$self->_init($args{outpath});
+
+  return $self;
+}
+
+# Parses the input VCF header and prepares all per-sample state.
+# Call order matters, see the inline notes on each step.
+sub _init {
+  my ($self, $outpath) = @_;
+  my $vcf = Vcf->new(file => $self->{input});
+  $vcf->parse_header;
+  $self->{vcf} = $vcf;
+  $self->_sample_order; # needs vcf
+  $self->_align_output($outpath);
+  $self->_add_headers unless($self->{fill_in});
+  $self->_hts; # has to go before _buffer_sizes
+  $self->_buffer_sizes; # has to go before _validate_sample and _populate_interval_tree
+  $self->_validate_samples;
+  if($self->{fill_in}) {
+    $self->_populate_interval_tree; # adds a header item to VCF
+  }
+  # load the fai
+  $self->{fai} = Bio::DB::HTS::Faidx->new($self->{ref});
+  return 1;
+}
+
+# Adds the PSRPT INFO header line and, when a simple repeat file was supplied,
+# loads its tab separated chr/start/end records into one Set::IntervalTree per
+# chromosome. Each stored value is [start, end, length].
+# Repeats shorter than twice target_pad are not loaded.
+sub _populate_interval_tree {
+  my $self = shift;
+  # even if we don't have a file we should add the header for consistency
+  $self->{vcf}->add_header_line({key => 'INFO', ID => 'PSRPT', Description => q{BLAT search space is >50% simple repeat}}, 'append' => 1);
+
+  my %tree;
+  if(defined $self->{rpts}) {
+    my $z = IO::Uncompress::Gunzip->new($self->{rpts}, MultiStream => 1) or die "gunzip failed: $GunzipError\n";
+    while(my $line = <$z>) {
+      next if ($line =~ m/^#/);
+      chomp $line;
+      my ($chr, $s, $e) = split /\t/, $line;
+      my $dist = ($e - $s) + 1;
+      next if($dist < $self->{target_pad} * 2);
+      $tree{$chr} = Set::IntervalTree->new() unless(exists $tree{$chr});
+      $tree{$chr}->insert([$s, $e, ($e - $s) + 1], $s, $e);
+    }
+    close $z;
+  }
+  $self->{repeat_tree} = \%tree;
+}
+
+# Returns 1 when a single loaded repeat covers more than 50% of the padded
+# search space around an event, otherwise 0.
+#   $chr    - chromosome/contig name
+#   $l_pos  - low coordinate of the event range
+#   $h_pos  - high coordinate of the event range
+# The search space is the event range extended by target_pad on both sides,
+# with the low end clamped at 0.
+sub _padded_interval_hit {
+  my ($self, $chr, $l_pos, $h_pos) = @_;
+  return 0 unless (exists $self->{repeat_tree}->{$chr});
+  my $l_pad = ($l_pos - $self->{target_pad}) - 1; # as completely 0-based
+  $l_pad = 0 if($l_pad < 0);
+  my $h_pad = $h_pos + $self->{target_pad};
+  my $search_len = ($h_pad - $l_pad) + 1;
+  for my $hit(@{$self->{repeat_tree}->{$chr}->fetch($l_pad, $h_pad)}) {
+    # Overlap is the intersection of the search space and the repeat.
+    # Taking min/max of the ends handles all layouts with one calculation:
+    # search inside repeat, overhang at either end, and repeat inside search.
+    # The last case was previously counted from the repeat start to the end of
+    # the search space, overstating the repeat fraction.
+    my $ovl_start = max($l_pad, $hit->[0]);
+    my $ovl_end = min($h_pad, $hit->[1]);
+    my $overlap = ($ovl_end - $ovl_start) + 1;
+    next if($overlap < 1); # touching but not overlapping
+    return 1 if($overlap / $search_len > 0.5);
+  }
+  return 0;
+}
+
+# Closes the per-sample SAM output handles opened by _align_output.
+sub _close_sams {
+  my $self = shift;
+  for my $sample(@{$self->{vcf_sample_order}}) {
+    close $self->{sfh}->{$sample};
+  }
+  return 1;
+}
+
+# Reads the '<ref>.dict' file into memory, one chomped line per element,
+# for use as the header of the SAM outputs.
+sub _fa_dict {
+  my $self = shift;
+  open my $D, '<', $self->{ref}.'.dict' or die "Failed to find $self->{ref}.dict, please generate with 'samtools dict'";
+  chomp(my @dict = <$D>);
+  close $D;
+  $self->{fa_dict} = \@dict;
+}
+
+# Opens one gzip compressed SAM file per VCF sample under $outpath, writes the
+# sequence dictionary as its header and records the sam/bam paths and handle.
+sub _align_output {
+  my ($self, $outpath) = @_;
+  $self->_fa_dict();
+  for my $sample(@{$self->{vcf_sample_order}}) {
+    my $sam_file = catfile($outpath, (sprintf '%s.sam.gz', $sample));
+    $self->{samfile}->{$sample} = $sam_file;
+    $self->{bamfile}->{$sample} = catfile($outpath, (sprintf '%s.bam', $sample));
+    unlink $sam_file if(-e $sam_file);
+
+    my $SAM = new IO::Compress::Gzip $sam_file, -Level => Z_BEST_SPEED or die "IO::Compress::Gzip failed: $GzipError\n";
+    print $SAM join "\n", @{$self->{fa_dict}};
+    print $SAM "\n";
+    $self->{sfh}->{$sample} = $SAM;
+  }
+  return 1;
+}
+
+# Records the VCF sample names in file order, their genotype column index
+# and the number of samples.
+sub _sample_order {
+  my $self = shift;
+  my $i = 9; # genotype sample col from 9
+  my %samp_pos;
+  my @ordered_samples = $self->{vcf}->get_samples;
+  for my $s(@ordered_samples) {
+    $samp_pos{$s} = $i++;
+  }
+  $self->{vcf_sample_pos} = \%samp_pos;
+  $self->{vcf_sample_order} = \@ordered_samples;
+  $self->{vcf_sample_count} = scalar @ordered_samples;
+  return 1;
+}
+
+sub _sample_from_hts {
+  my ($self, $hts) = @_;
+  my $hts_sample;
+  foreach my $line 
(split(/\n/,$hts->header->text)) {
+    next unless($line =~ m/^\@RG/);
+    chomp $line;
+    ($hts_sample) = $line =~ m/SM:([^\t]+)/;
+    last if(defined $hts_sample);
+  }
+  die sprintf "ERROR: Failed to find a SM tag in a readgroup header of %s\n", $hts->hts_path unless(defined $hts_sample);
+  return $hts_sample;
+}
+
+# Dies unless every sample named in the VCF has a BAM/CRAM handle in
+# $self->{hts} (populated by _hts).
+sub _validate_samples {
+  my $self = shift;
+  # check BAM/CRAM and VCF have same sample
+  my @samples = $self->{vcf}->get_samples();
+  for my $vcf_s(sort @samples) {
+    next if(exists $self->{hts}->{$vcf_s});
+    die sprintf "ERROR: Sample '%s' is not represented in the BAM/CRAM files provided.\n", $vcf_s;
+  }
+  return 1;
+}
+
+# Derives read length and insert size limits from the '<hts file>.bas'
+# statistics of every input file and stores them on the object:
+#   min_rl     - smallest read length over all read groups
+#   max_rl     - largest read length over all read groups
+#   max_insert - largest (mean insert size * 5) over all read groups
+#   target_pad - same value as max_insert
+# Dies when a bas file describes more than one sample, or when its sample
+# differs from the sample the BAM/CRAM file was registered under.
+sub _buffer_sizes {
+  my $self = shift;
+  my $max_ins = 0;
+  my $max_rl = 0;
+  my $min_rl = 1_000_000;
+  for my $hts_sample(keys %{$self->{hts}}) {
+    my $hts_path = $self->{hts}->{$hts_sample}->hts_path;
+    my $bas = PCAP::Bam::Bas->new($hts_path.'.bas');
+    my $sample;
+    for my $rg($bas->read_groups) {
+      # a fixed multiple of the mean is used in place of the mean + SD form below
+      #my $m_sd = int ($b->get($rg, 'mean_insert_size') + ($b->get($rg, 'insert_size_sd') * $SD_MULT));
+      my $m_sd = int ($bas->get($rg, 'mean_insert_size') * 5);
+      $max_ins = $m_sd if($m_sd > $max_ins);
+      my $tmp_max = max ($bas->get($rg, 'read_length_r1'), $bas->get($rg, 'read_length_r2'));
+      my $tmp_min = min ($bas->get($rg, 'read_length_r1'), $bas->get($rg, 'read_length_r2'));
+      $min_rl = $tmp_min if($tmp_min < $min_rl);
+      $max_rl = $tmp_max if($tmp_max > $max_rl);
+      my $s = $bas->get($rg, 'sample');
+      next unless(defined $s); # nothing to compare for this read group
+      # die takes a LIST and joins it, so the message has to go through sprintf
+      if(defined $sample && $sample ne $s) {
+        die sprintf "ERROR: Multiple samples found in %s.bas\n", $hts_path;
+      }
+      # checked for every read group so that single read group files are validated too
+      if($s ne $hts_sample) {
+        die sprintf "ERROR: Sample in bas file (%s) doesn't match bam/cram file (%s), %s vs %s.bas\n",
+                    $s, $hts_sample, $hts_path, $hts_path;
+      }
+      $sample = $s;
+    }
+  }
+  $self->{min_rl} = $min_rl;
+  $self->{max_insert} = $max_ins;
+  $self->{max_rl} = $max_rl;
+  $self->{target_pad} = $max_ins; # as expanded search space we need to expand the match space
+  return 1;
+}
+ +sub _hts { + my $self = shift; + for my $hts(@{$self->{hts_files}}) { + my $tmp = Bio::DB::HTS->new(-bam => $hts, -fasta => $self->{ref}); + my $sample = $self->_sample_from_hts($tmp); + if(exists $self->{hts}->{$sample}) { + die sprintf "ERROR: More than one BAM/CRAM file for sample %s, %s vs %s\n", $sample, $self->{hts}->{$sample}->hts_path, $hts; + } + $self->{hts}->{$sample} = $tmp; + } + return 1; +} + +sub output_header { + my $self = shift; + my $fh = $self->{ofh}; + print $fh Sanger::CGP::PindelPostProcessing::VcfSoftFlagger::reformat_header($self->{vcf}); +} + +sub _add_headers { + my $self = shift; + my $vcf = $self->{'vcf'}; + + my %options = ( + input => basename($self->{input}), + ref => basename($self->{ref}), + # probably more if we expose cutoffs + ); + Sanger::CGP::Vcf::VcfUtil::add_vcf_process_log($vcf, + Sanger::CGP::Vcf::VcfProcessLog->new( + -input_vcf_source => basename($0), + -input_vcf_ver => Sanger::CGP::Pindel->VERSION, + -input_vcf_param => \%options, + ) + ); + + $vcf->add_header_line({'key'=>'source', 'value' => basename($0)}, 'append' => 1); + + my @format = ( + {key => 'FORMAT', ID => 'WTP', Number => 1, Type => 'Integer', Description => q{+ve strand reads BLATed to reference sequence at this location, input alignment depth when WTM='.'}}, + {key => 'FORMAT', ID => 'WTN', Number => 1, Type => 'Integer', Description => q{-ve strand reads BLATed to reference sequence at this location, input alignment depth when WTM='.'}}, + {key => 'FORMAT', ID => 'WTM', Number => 1, Type => 'Float', Description => q{Mismatch fraction of reads BLATed to reference sequence at this location (3 d.p.), '.' 
when no reads found via BLAT}}, + {key => 'FORMAT', ID => 'MTP', Number => 1, Type => 'Integer', Description => q{+ve strand reads BLATed to alternate sequence at this location}}, + {key => 'FORMAT', ID => 'MTN', Number => 1, Type => 'Integer', Description => q{-ve strand reads BLATed to alternate sequence at this location}}, + {key => 'FORMAT', ID => 'MTM', Number => 1, Type => 'Float', Description => q{Mismatch fraction of reads BLATed to alternate sequence at this location (3 d.p.), '.' when no reads found via BLAT}}, + {key => 'FORMAT', ID => 'VAF', Number => 1, Type => 'Float', Description => q{Variant allele fraction using reads that unambiguously map to ref or alt seq (3 d.p.)'}}, + ); + $self->{fmt_ext} = q{}; + for my $f(@format) { + $vcf->add_header_line($f); + $self->{fmt_ext} .= q{:}.$f->{ID}; + } +} + +sub to_data_hash { + my ($self, $v_d) = @_; + my @items = @{$v_d}; + my %out; + # simplified version of Vcf->next_data_hash + # only stuff we need + $out{CHROM} = $items[0]; + $out{POS} = $items[1]; + # trim the first base from these + $out{REF} = substr $items[3], 1; + $out{ALT} = substr $items[4], 1; + + # parse the info block + for my $info (split(/;/,$items[7])) { + my ($key,$val) = split(/=/,$info); + # all the values we need are key/val + next unless(defined $val); + die "Clash between INFO and columns '$key'" if(exists $out{$key}); + $out{$key} = $val; + } + + # add END + $out{END} = $out{POS}; + if($out{PC} ne 'I') { # so D/DI + $out{END} += $out{LEN} + 1; + } + else { + $out{END} += 1; + } + + # skip FORMAT + # parse GT + my ($gt, $pp, $pn) = split /:/, $items[9]; + # put in top level as no clash + $out{PP} = $pp; + $out{PN} = $pn; + return \%out; +} + +=head process_records + +Primary entry point for action following new() + +=cut + +sub process_records { + my $self = shift; + my $fh = $self->{ofh}; + my $readtmp_dir = tempdir( CLEANUP => 0 ); # doesn't clear as you would expect + my $last_chr_pos = q{.}; + while(my $v_d = 
$self->{vcf}->next_data_array) { + my $this_c_p = sprintf "%s:%d", $v_d->[0], $v_d->[1]; + if($last_chr_pos ne $this_c_p) { + # we use very large search range, so if the start pos doesn't change don't want to reparse reads + remove_tree($readtmp_dir, { keep_root => 1 }); + $last_chr_pos = $this_c_p; + } + $v_d->[$V_FMT] .= $self->{fmt_ext} unless($self->{fill_in}); + $self->blat_record($v_d, $readtmp_dir); + # output the updated record + printf $fh "%s\n", join "\t", @{$v_d}; + } + if(-d $readtmp_dir) { + remove_tree($readtmp_dir); + } + $self->_close_sams +} + +sub blat_record { + my ($self, $v_d, $readtmp_dir) = @_; + my $v_h = $self->to_data_hash($v_d); + + # assess if this is completely embedded in a large simple repeat + my $simp_rep = $self->_padded_interval_hit($v_h->{CHROM}, $v_h->{RS}, $v_h->{RE}); + + if($simp_rep == 1) { + $v_d->[$V_INFO] .= ';PSRPT' + } + else { + # now attempt the blat stuff + my ($fh_target, $file_target) = tempfile( SUFFIX => '.fa', UNLINK => 1 ); + $self->blat_ref_alt($fh_target, $v_h); + close $fh_target or die "Failed to close blat ref temp file"; + + my $change_pos_low = $v_h->{change_pos}; + $change_pos_low++ if($v_h->{PC} eq 'I'); + my $range_l = ($v_h->{RE} - $v_h->{RS}) + 1; + my $change_pos_high = $change_pos_low + $range_l; # REF based range, adjusted in func + $v_h->{change_pos_low} = $change_pos_low; + $v_h->{change_pos_high} = $change_pos_high; + my $gt_pos = $V_GT_START-1; + for my $sample(@{$self->{vcf_sample_order}}) { + $gt_pos++; + if($self->{fill_in} && $v_d->[$gt_pos] ne q{.}) { + next; + } + my $gt_set = $self->blat_reads($v_h, $file_target, $readtmp_dir, $sample); + if($v_d->[$gt_pos] eq q{.}) { + $v_d->[$gt_pos] = join q{:}, './.:.:.:.:.', @{$gt_set}; + } + else { + $v_d->[$gt_pos] = join q{:}, $v_d->[$gt_pos], @{$gt_set}; + } + } + # tempfile unlink only does it on shutdown when used in this way + unlink $file_target; + } + return 1; +} + +sub read_ranges { + my ($self, $v_h, $sample) = @_; + # return a string 
of chr:s-e... if approprate. + my $read_buffer = $self->{max_insert}; + # but need to handle very large deletions + if($v_h->{q_end} - $v_h->{q_start} > $read_buffer) { + my $low = sprintf $LOCI_FMT, $v_h->{CHROM}, $v_h->{q_start} - $read_buffer, $v_h->{q_start} + $read_buffer; + my $high = sprintf $LOCI_FMT, $v_h->{CHROM}, $v_h->{q_end} - $read_buffer, $v_h->{q_end} + $read_buffer; + return "$low $high"; + } + return sprintf $LOCI_FMT, $v_h->{CHROM}, $v_h->{q_start} - $read_buffer, $v_h->{q_end} + $read_buffer; +} + +sub blat_reads { + my ($self, $v_h, $file_target, $readtmp_dir, $sample) = @_; + my $file_query = sprintf '%s/%s.fa', $readtmp_dir, $sample; + # setup the temp file + my ($fh_psl, $file_psl) = tempfile( SUFFIX => '.psl', UNLINK => 1); + close $fh_psl or die "Failed to close $file_psl (psl output)"; + + my $c_reads = sprintf $READS_ONLY, $self->{hts}->{$sample}->hts_path, $self->{hts}->{$sample}->hts_path, $self->read_ranges($v_h, $sample), $file_query; + if(! -e $file_query) { + my ($r_out, $r_err, $r_exit) = capture { system([0], $c_reads); }; + } + + my $c_blat = sprintf $BLAT_ONLY, $file_target, $file_query, $file_psl, $file_psl, $file_target, $file_query; + my ($c_out, $c_err, $c_exit) = capture { system([0,255], $c_blat); }; + if($c_exit == 255 && ($c_err =~ m/processed 0 reads/ms || $c_err =~ m/End of file reading 4 bytes/ms)) { + # No reads found + $c_exit = 0; + } + if($c_exit) { + warn "An error occurred while executing: $c_blat\n"; + warn "\tERROR: $c_err\n"; + warn "\tECODE: $c_exit\n"; + warn "Read command: $c_reads\n"; + warn "DATA BLOCK\n"; + warn "Target:\n"; + my ($t_out, $t_err, $t_exit) = capture { system("cat $file_target"); }; + warn $t_out; + warn "Query:\n"; + ($t_out, $t_err, $t_exit) = capture { system("cat $file_query"); }; + warn $t_out; + warn "PSL:\n"; + ($t_out, $t_err, $t_exit) = capture { system("cat $file_psl"); }; + warn $t_out; + exit $c_exit; + } + + # tempfile unlink only does it on shutdown when used in this way + 
unlink $file_psl; + + my ($wtp, $wtn, $mtp, $mtn, $wt_bmm, $mt_bmm) = $self->psl_axt_parser(\$c_out, $v_h, $sample); + my ($wtm, $mtm) = (q{.}, q{.}); + my $wtr = $wtp + $wtn; + if($wtr > 0) { + $wtm = sprintf("%.3f", $wt_bmm / $wtr); + } + my $mtr = $mtp+$mtn; + if($mtr > 0) { + $mtm = sprintf("%.3f", $mt_bmm / $mtr); + } + my $depth = $wtr+$mtr; + my $vaf = sprintf("%.3f", $depth ? $mtr/$depth : 0); + return [$wtp, $wtn, $wtm, $mtp, $mtn, $mtm, $vaf]; +} + +sub psl_axt_parser { + my ($self, $blat_axt, $v_h, $sample) = @_; + # collate the data by readname and order by score + my @lines = split /\n/, ${$blat_axt}; + my $line_c = @lines; + # group all reads and order by score + my %reads; + for(my $i = 0; $i<$line_c; $i+=4) { + my ($id, $t_name, $t_start, $t_end, $q_name, $q_start, $q_end, $strand, $score) = split q{ }, $lines[$i]; + next if($score < 0); # it happens + my $clean_qname = $q_name; + $clean_qname =~ s{/([12])$}{}; + my $q_seq = $lines[$i+2]; + next if(length $q_seq < int $self->{min_rl} * $MAPPED_RL_MULT); + push @{$reads{$clean_qname}{$score}}, [$t_name, $t_start, $t_end, $q_name, $q_start, $q_end, $strand, $score, $lines[$i+1], $q_seq]; + } + + my %type_strand; + my %bmm_sums; + my @ref_reads; + my $is_del = 0; + if(length $v_h->{change_ref} > length $v_h->{change_alt}) { + $is_del = 1; + } + # sort keys for consistency + READ: for my $read(sort keys %reads) { + # get the alignment with the highest score + for my $score(sort {$b<=>$a} keys %{$reads{$read}}) { + my @records = @{$reads{$read}{$score}}; + if(@records != 1) { # if best score has more than one alignment it is irrelevant + next READ; + } + my $record = $records[0]; + my $ref_or_alt = $record->[0]; + if($is_del == 1 && $ref_or_alt eq 'REF') { + # see block at end of this function + push @ref_reads, $record; + } + if($self->parse_axt_event($v_h, $record, $sample) == 1) { + $type_strand{$ref_or_alt.$record->[6]} += 1; + $bmm_sums{$ref_or_alt} += bmm($record->[8], $record->[9]); + } + next 
READ; # remaining items are worse alignments + } + } + my $wtp = $type_strand{'REF+'} || 0; + my $wtn = $type_strand{'REF-'} || 0; + my $mtp = $type_strand{'ALT+'} || 0; + my $mtn = $type_strand{'ALT-'} || 0; + + # only relevant for deletions + if($is_del == 1 && $wtp == 0 && $wtn == 0 && $v_h->{RE} - $v_h->{RS} > $LARGE_D) { + # if a deletion is large it can cause no REF depth as impossible for a read to span the ends of the event. + # rather than specifying a cutoff we rely on the data to drive this + my $add_bmb_sum; + ($wtp, $wtn, $add_bmb_sum) = $self->parse_axt_del_ref($v_h, \@ref_reads, $sample); + $bmm_sums{'REF'} += $add_bmb_sum; + } + + return ($wtp, $wtn, $mtp, $mtn, $bmm_sums{REF}, $bmm_sums{ALT}); +} + +sub parse_axt_del_ref { + my ($self, $v_h, $records, $sample) = @_; + # Need to return + # pos reads + # neg reads + # mismatch fractions (via bmm) + # Augment $self->sam_record($v_h, $rec, $sample); + + # some rules: + # - this shouldn't be getting called if the event spans both ends so can assume reads at each end can be saved without a check + # - require near perfect match to ref, but reads that get here are known NOT to have a better ALT mapping + + my ($wtp, $wtn, $bmm_sum) = (0,0,0); + my $change_len = $v_h->{change_pos_high} - $v_h->{change_pos_low}; + my $mid_point = $v_h->{change_pos_low} + int($change_len/2); + for my $rec(@{ $records }) { + my ($t_name, $t_start, $t_end, $q_name, $q_start, $q_end, $strand, $score, $t_seq, $q_seq) = @{ $rec }; + if( + ($t_start < $v_h->{change_pos_low} && $t_end > $v_h->{change_pos_low}) + || + ($t_start < $v_h->{change_pos_high} && $t_end > $v_h->{change_pos_high}) + #($t_start < $v_h->{change_pos_low} && $t_end > $v_h->{change_pos_low}) + ) { + if($strand eq '+') { + $wtp += 1; + } + else { + $wtn += 1; + } + $bmm_sum += bmm($t_seq, $q_seq); + $self->sam_record($v_h, $rec, $sample); + } + } + # we've counted 2 locations so can't just return the full value + # this is under review + if($wtp > 0) { + $wtp = 
int($wtp/2) + 1; + } + if($wtn > 0) { + $wtn = int($wtn/2) + 1; + } + return ($wtp, $wtn, $bmm_sum); +} + +sub parse_axt_event { + my ($self, $v_h, $rec, $sample) = @_; + my ($t_name, $t_start, $t_end, $q_name, $q_start, $q_end, $strand, $score, $t_seq, $q_seq) = @{$rec}; + + # specific to deletion class + my $change_seq = $v_h->{change_ref}; + my $change_pos_high = $v_h->{change_pos_high}; + if($t_name eq 'ALT') { + $change_pos_high -= $v_h->{LEN}; + $change_seq = $v_h->{change_alt}; + } + + # all the reads that span the range are kept + my $retval = 0; + if($t_start <= ($v_h->{change_pos_low} - $PAD_EVENT) && $t_end > ($change_pos_high + $PAD_EVENT)) { + # look for the change (or absence) where we expect it + my $exp_pos = $v_h->{change_pos_low} - $t_start; + if($exp_pos <= length $q_seq) { + my $sub_q_seq = substr($q_seq, $exp_pos, length $change_seq); + if(length $change_seq == length $sub_q_seq # same length + && index($t_seq, q{-}) == -1 # no gaps + && index($q_seq, q{-}) == -1 # no gaps + && substr($change_seq,0,1) eq substr($sub_q_seq,0,1) # matching first base + && substr($change_seq,-1,1) eq substr($sub_q_seq,-1,1) # matching last base + ) { + $retval = 1; + $self->sam_record($v_h, $rec, $sample); + } + } + } + return $retval; +} + +sub bmm { + my ($a, $b) = @_; # need a copy of the strings anyway + my $len = length $a; + my $diffs = 0; + for(0..($len-1)) { + $diffs++ if(chop $a ne chop $b); + } + return $diffs/$len; +} + +sub sam_record { + my($self, $v_h, $rec, $sample) = @_; + + my $qname = $rec->[3]; + my $seq = $rec->[9]; + my $flag = 0; # not paired + $flag += 16 if($rec->[6] eq '-'); + + # POS is the base preceeing any change, seq start it this - target_pad + my $pos = ($v_h->{POS} - $self->{target_pad}) + $rec->[1]; + my $cigar = q{}; + if($rec->[0] eq 'REF') { + $cigar = length($seq).'M'; + } + else { + my $m_c = ($v_h->{change_pos} - $rec->[1]) + 1; + $m_c += 1 if($v_h->{PC} eq 'I'); + $cigar = $m_c.'M'; + my $change_ref = length($v_h->{REF}); + 
my $change_alt = length($v_h->{ALT}); + if($change_ref) { + $cigar .= $change_ref.'D'; + } + if($change_alt) { + $cigar .= $change_alt.'I'; + $m_c += $change_alt; # as consumes read + } + $cigar .= (length($seq) - $m_c).'M'; + } + printf {$self->{sfh}->{$sample}} "%s\n", join "\t", $qname, $flag, $v_h->{CHROM}, $pos, 60, $cigar, '*', 0, 0, $seq, '*'; +} + +sub sam_to_bam { + my ($self) = @_; + for my $sample(@{$self->{vcf_sample_order}}) { + my $sam = $self->{samfile}->{$sample}; + my $bam = $self->{bamfile}->{$sample}; + my $tmp = $bam; + $tmp =~ s/bam$/tmp/; + my $command = sprintf q{bash -c 'set -o pipefail ; zcat %s | pee "grep ^@" "grep -v ^@ | sort | uniq" | samtools view -uT %s - | samtools sort -l 0 -T %s - | samtools calmd - %s > %s'}, + $sam, # zcat + $self->{ref}, # view + $tmp, # sort + $self->{'ref'}, $bam; # calmd + my ($c_out, $c_err, $c_exit) = capture { system($command); }; + if($c_exit) { + warn "An error occurred while executing $command\n"; + warn "\tERROR$c_err\n"; + exit $c_exit; + } + unlink(glob(sprintf '%s.*.bam', $tmp)); + unlink $sam; + } +} + +sub flanking_ref { + my ($self, $v_h) = @_; + my $ref_left = $self->{fai}->get_sequence_no_length( + sprintf $LOCI_FMT, + $v_h->{CHROM}, + ($v_h->{POS} - $self->{target_pad})+1, + $v_h->{POS}, + ); + + my $ref_right = $self->{fai}->get_sequence_no_length( + sprintf $LOCI_FMT, + $v_h->{CHROM}, + $v_h->{END}, + $v_h->{END} + $self->{target_pad}, + ); + + return [$ref_left, $ref_right] +} + +sub blat_ref_alt { + my ($self, $fh, $v_h) = @_; + my ($ref_left, $ref_right) = @{$self->flanking_ref($v_h)}; + my $ref = $v_h->{REF}; + my $alt = $v_h->{ALT}; + my $r_start = $v_h->{RS}; + my $r_end = $v_h->{RE}; + my $change_at = length $ref_left; + + my $call_type = $v_h->{PC}; + if($call_type eq 'I') { + $change_at -= 1; # force base before + } + + # used when getting reads from HTSfile + my $q_start = $v_h->{POS} - $change_at; # correcting for position handled in change_at + my $q_end = $v_h->{POS} + length 
$ref_right; + if($call_type eq 'D') { + $q_end += length $v_h->{REF}; + } + + print $fh sprintf ">REF\n%s%s%s\n", $ref_left, $ref, $ref_right or die "Failed to write REF to blat ref temp file"; + print $fh sprintf ">ALT\n%s%s%s\n", $ref_left, $alt, $ref_right or die "Failed to write ALT to blat ref temp file"; + + my $seq_left = substr($ref_left, -1); + + # -1 as includes the base before and after which would be -2 but need to correct for coord maths + # (for Del and Ins, unsure about DI at the moment) + my $seq_right; + if($call_type ne 'I') { + $seq_right = substr($ref_right, 0, ($r_end - $r_start) - $v_h->{LEN}); + } + else { + $seq_right = substr($ref_right, 0, ($r_end - $r_start)); + } + + my $change_ref = $seq_left.$ref.$seq_right; + my $change_alt = $seq_left.$alt.$seq_right; + + $v_h->{q_start} = $q_start; + $v_h->{q_end} = $q_end; + $v_h->{change_pos} = $change_at; + $v_h->{change_ref} = $change_ref; + $v_h->{change_alt} = $change_alt; + return 1; +} diff --git a/perl/lib/Sanger/CGP/Pindel/OutputGen/VcfCohortConverter.pm b/perl/lib/Sanger/CGP/Pindel/OutputGen/VcfCohortConverter.pm new file mode 100644 index 0000000..f29e974 --- /dev/null +++ b/perl/lib/Sanger/CGP/Pindel/OutputGen/VcfCohortConverter.pm @@ -0,0 +1,290 @@ +# Copyright (c) 2014-2021 Genome Research Ltd +# +# Author: CASM/Cancer IT +# +# This file is part of cgpPindel. +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . 
+# +# 1. The usage of a range of years within a copyright statement contained within +# this distribution should be interpreted as being equivalent to a list of years +# including the first and last year specified and all consecutive years between +# them. For example, a copyright statement that reads ‘Copyright (c) 2005, 2007- +# 2009, 2011-2012’ should be interpreted as being identical to a statement that +# reads ‘Copyright (c) 2005, 2007, 2008, 2009, 2011, 2012’ and a copyright +# statement that reads ‘Copyright (c) 2005-2012’ should be interpreted as being +# identical to a statement that reads ‘Copyright (c) 2005, 2006, 2007, 2008, +# 2009, 2010, 2011, 2012’. +# +package Sanger::CGP::Pindel::OutputGen::VcfCohortConverter; +use strict; +use File::Basename; +use File::Temp qw(tempfile); +use Capture::Tiny qw(capture); + +use Data::Dumper; + +use Sanger::CGP::Pindel; +use Sanger::CGP::Vcf::VcfUtil; +use Sanger::CGP::Vcf::VcfProcessLog; +use Const::Fast qw(const); +use Sanger::CGP::Pindel::InputGen; + +use Vcf; + +const my $SEP => "\t"; +const my $NL => "\n"; +const my $READS_AND_BLAT => q{bash -c 'set -o pipefail ; samtools view -uF 3840 %s %s:%d-%d | samtools fasta - > %s && blat -t=dna -q=dna -noTrimA -minIdentity=95 -noHead -out=psl %s %s %s && pslPretty -axt %s %s %s /dev/stdout'}; +# returns +ve and then -ve results +const my $SAM_DEPTH_PN => q{bash -c "set -o pipefail ; samtools view -uF 3844 %s %s:%d-%d | pee 'samtools view -c -F 16 -' 'samtools view -c -f 16 -'"}; + +const my $MATCH => 0; +const my $Q_GAP => 4; +const my $T_GAP => 6; +const my $T_BASES => 7; +const my $STRAND => 8; +const my $Q_NAME => 9; +const my $Q_SIZE => 10; +const my $Q_START => 11; +const my $T_NAME => 13; +const my $T_START => 15; +const my $T_END => 16; +const my $BLOCK_COUNT => 17; +const my $BLOCK_SIZES => 18; +const my $Q_STARTS => 19; +const my $T_STARTS => 20; +const my $Q_SEQ => 21; +const my $T_SEQ => 22; + +const my $SD_MULT => 2; + +1; + +sub new{ + my $proto = shift; + 
my (%args) = @_; + my $class = ref($proto) || $proto; + + my $self = {}; + bless $self, $class; + + $self->init(%args); + + return $self; +} + +sub init{ + my($self,%args) = @_; + $self->{_contigs} = $args{-contigs}; + $self->{_srt_samples} = $args{-samples}; + $self->{_hts_set} = $args{-hts_set}; + $self->{_bas_set} = $args{-bas_set}; + $self->_max_inserts() if(defined $self->{_bas_set}); + $self->{_all} = $args{-all}; + if(defined $args{-badloci}) { + $self->{_tabix} = Sanger::CGP::Pindel::InputGen::tabix_to_interval_tree($args{-badloci}); + } +} + +sub _interval_hit { + my ($self, $chr, $start, $stop) = @_; + return 0 unless (exists $self->{_tabix}->{$chr}); + return scalar @{$self->{_tabix}->{$chr}->fetch($start, $stop)}; +} + +sub _max_inserts { + my $self = shift; + my %bas = %{$self->{_bas_set}}; + my %ins_by_sample; + for my $s(keys %bas) { + my $max_ins = 0; + for my $rg($bas{$s}->read_groups) { + my $m_sd = $bas{$s}->get($rg, 'mean_insert_size') + ($bas{$s}->get($rg, 'insert_size_sd') * $SD_MULT); + $max_ins = $m_sd if($m_sd > $max_ins); + } + $ins_by_sample{$s} = $max_ins; + } + $self->{_ins_set} = \%ins_by_sample; +} + + +=head gen_header + +Generates a Vcf header String for NORMAL/TUMOUR comparisons. + +@param1 reference_path - a String containing the path to the reference used in the VCF. + +@param2 input_source - a String containing the name and version of the application or source of the VCF data. + +@param3 sample - hash-ref of a Sanger::CGP::Vcf::Sample objects representing samples to be included. 
+ +@param3 options - hash-ref of options passed to generating command + +=cut +sub gen_header{ + my($self, $reference_path, $input_source, $samples, $options) = @_; + + my @process_logs = ( + Sanger::CGP::Vcf::VcfProcessLog->new( + -input_vcf_source => 'Pindel', + -input_vcf_ver => 'v02', # always have S2 at this point + ), + Sanger::CGP::Vcf::VcfProcessLog->new(-input_vcf_source => basename($0), + -input_vcf_ver => Sanger::CGP::Pindel->VERSION, + -input_vcf_param => $options, + ), + ); + + my @info = ( + {key => 'INFO', ID => 'PC', Number => 1, Type => 'String', Description => 'Pindel call'}, + {key => 'INFO', ID => 'RS', Number => 1, Type => 'Integer', Description => 'Range start'}, + {key => 'INFO', ID => 'RE', Number => 1, Type => 'Integer', Description => 'Range end'}, + {key => 'INFO', ID => 'LEN', Number => 1, Type => 'Integer', Description => 'Length'}, + {key => 'INFO', ID => 'REP', Number => 1, Type => 'Integer', Description => 'Change repeat count within range'}, + {key => 'INFO', ID => 'GC5P', Number => 1, Type => 'Float', Description => 'GC content of 200 bp. 5 prime'}, + {key => 'INFO', ID => 'GCRNG', Number => 1, Type => 'Float', Description => 'GC content of deleted/inserted seq, including range'}, + {key => 'INFO', ID => 'GC3P', Number => 1, Type => 'Float', Description => 'GC content of 200 bp. 
3 prime'} + ); + + my @format = ( + {key => 'FORMAT', ID => 'GT', Number => 1, Type => 'String', Description => 'Genotype'}, + {key => 'FORMAT', ID => 'S1', Number => 1, Type => 'Integer', Description => 'Pindel S1 score'}, + {key => 'FORMAT', ID => 'S2', Number => 1, Type => 'Float', Description => 'Pindel S2 score, not present for all types'}, + {key => 'FORMAT', ID => 'PP', Number => 1, Type => 'Integer', Description => 'Pindel calls on the positive strand'}, + {key => 'FORMAT', ID => 'NP', Number => 1, Type => 'Integer', Description => 'Pindel calls on the negative strand'}, + ); + + my @blank_fmt = (q{.}) x (scalar @format -1); + $self->{_noread_gt} = join q{:}, './.', @blank_fmt; + + my $fmt_str = q{}; + for my $f(@format) { + $fmt_str .= q{:} if($fmt_str); + $fmt_str .= $f->{ID}; + } + $self->{_format} = $fmt_str; + + my $vcf = Vcf->new(version=>'4.1'); + my @timeData = localtime(time); + $vcf->add_header_line( { key => 'fileDate', value => sprintf '%d%02d%02d', 900 + $timeData[5], $timeData[4]+1, $timeData[3] } ); + $vcf->add_header_line( { key => 'source', value => $input_source }, 'append' => 1 ); + $vcf->add_header_line( { key => 'reference', value => $reference_path } ); + + for my $contig (@{$self->{_contigs}}){ + Sanger::CGP::Vcf::VcfUtil::add_vcf_contig($vcf,$contig) + } + + for my $inf (@info){ + $vcf->add_header_line($inf); + } + + for my $for (@format){ + $vcf->add_header_line($for); + } + + for my $process_log (@process_logs){ + Sanger::CGP::Vcf::VcfUtil::add_vcf_process_log($vcf,$process_log) + } + + for my $samp(sort keys %{$samples}) { + Sanger::CGP::Vcf::VcfUtil::add_vcf_sample($vcf, $samples->{$samp}, $samp); + } + + return $vcf->format_header(); +} + +sub gen_record{ + my($self, $record) = @_; + my $ret = q{}; + + # CHR POS ID REF ALT QUAL FILTER INFO FORMAT GENO GENO + + my $start = $record->start(); + $start-- if(substr($record->type(),0,1) eq 'D'); + + my $ref = uc ($record->lub . $record->ref_seq); + my $alt = uc ($record->lub . 
$record->alt_seq); + + if($self->_interval_hit($record->chro, $start, $start + length $ref)) { + return $ret; + } + + $ret .= $record->chro().$SEP; + $ret .= $start.$SEP; + $ret .= $record->id().$SEP; + + $ret .= $ref.$SEP; + $ret .= $alt.$SEP; + $ret .= $record->sum_ms().$SEP; + $ret .= '.'.$SEP; + + # INFO + $ret .= 'PC='.$record->type().';'; + $ret .= 'RS='.$record->range_start().';'; + $ret .= 'RE='.$record->range_end().';'; + $ret .= 'LEN='.$record->length().';'; + $ret .= 'REP='.$record->repeats().';'; + $ret .= sprintf 'GC5P=%.3f;', $record->gc_5p; + $ret .= sprintf 'GCRNG=%.3f;', $record->gc_rng; + $ret .= sprintf 'GC3P=%.3f', $record->gc_3p; + $ret .= $SEP; + + # FORMAT + $ret .= $self->{_format}; + + for my $samp(@{$self->{_srt_samples}}) { + $ret .= $SEP; + if($self->gen_all || exists $record->reads->{$samp}) { + $ret .= './.:'; + $ret .= $record->s1.q{:}; + $ret .= $record->s2 || '.'; + $ret .= q{:}; + $ret .= $record->get_read_counts($samp, '+').q{:}; + $ret .= $record->get_read_counts($samp, '-'); + } + else { + $ret .= $self->{_noread_gt}; + } + } + $ret .= $NL; + return $ret; +} + +sub gen_all { + return shift->{_all}; +} + +sub hts_file_by_sample { + my ($self, $sample) = @_; + return $self->{_hts_set}->{$sample}; +} + +sub bas_by_sample { + my ($self, $sample) = @_; + return $self->{_bas_set}->{$sample}; +} + +sub ins_by_sample { + my ($self, $sample) = @_; + return $self->{_ins_set}->{$sample}; +} + +sub _dump_rec_detail { + my $self = shift; + for my $k(sort qw(change_pos_low change_pos_high change_l change_ref change_alt q_start q_end type chr)) { + printf "%s: %s\n", $k, ref $self->{$k} ? 
Dumper($self->{$k}) : $self->{$k}; + } + print "\n"; + return 0; +} diff --git a/perl/lib/Sanger/CGP/PindelPostProcessing/MetanormFilterRules.pm b/perl/lib/Sanger/CGP/PindelPostProcessing/MetanormFilterRules.pm new file mode 100644 index 0000000..59b70fd --- /dev/null +++ b/perl/lib/Sanger/CGP/PindelPostProcessing/MetanormFilterRules.pm @@ -0,0 +1,123 @@ +# Copyright (c) 2014-2021 Genome Research Ltd +# +# Author: CASM/Cancer IT +# +# This file is part of cgpPindel. +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . +# +# 1. The usage of a range of years within a copyright statement contained within +# this distribution should be interpreted as being equivalent to a list of years +# including the first and last year specified and all consecutive years between +# them. For example, a copyright statement that reads ‘Copyright (c) 2005, 2007- +# 2009, 2011-2012’ should be interpreted as being identical to a statement that +# reads ‘Copyright (c) 2005, 2007, 2008, 2009, 2011, 2012’ and a copyright +# statement that reads ‘Copyright (c) 2005-2012’ should be interpreted as being +# identical to a statement that reads ‘Copyright (c) 2005, 2006, 2007, 2008, +# 2009, 2010, 2011, 2012’. 
+# +package Sanger::CGP::PindelPostProcessing::MetanormFilterRules; + +use strict; +use Bio::DB::HTS::Tabix; +use Sanger::CGP::Pindel; + +my %RULE_DESCS = ( + 'F006' => { 'tag' => 'INFO/LEN', + 'name' => 'F006', + 'desc' => 'Small call excessive repeat check: Fail if Length <= 4 and Repeats > 9', + 'test' => \&flag_006}, + 'F017' => { 'tag' => 'INFO/LEN', + 'name' => 'F017', + 'desc' => 'Variant must not overlap with a simple repeat', + 'test' => \&flag_017}, + 'LONG' => { 'tag' => 'INFO/LEN', + 'name' => 'LONG', + 'desc' => 'Event larger than 1kbp', + 'test' => \&long} + +); + +our $previous_format_hash; +our $previous_format_string = q{}; +our $vcf_flagging_repeats_tabix; + +sub rule { + my (undef, $rule) = @_; # being called like an object function so throw away first varaible + return $RULE_DESCS{$rule}; +} + +sub available_rules { + return sort keys %RULE_DESCS; +} + +sub use_prev { + my $format = shift; + ### HACK Dirty dirty dirty...... done to try and cut down the number of times I have to parse the FORMAT string I am storing it as a global variable. 
+ if($format ne $previous_format_string){ + my $i = 0; + map {$previous_format_hash->{$_} = $i++} split(':',$format); + $previous_format_string = $format; + } +} + +sub reuse_repeats_tabix { + unless(defined $vcf_flagging_repeats_tabix) { + $vcf_flagging_repeats_tabix = new Bio::DB::HTS::Tabix(filename=> $ENV{VCF_FLAGGING_REPEATS}); + } +} + +sub flag_006 { + my ($MATCH,$CHROM,$POS,$FAIL,$PASS,$RECORD,$VCF) = @_; + if($MATCH <= 4){ + my ($rep) = $$RECORD[7] =~ /REP=(\d+)/; + if($rep > 9) { + return $FAIL; + } + } + return $PASS; +} + +sub flag_017 { + my ($MATCH,$CHROM,$POS,$FAIL,$PASS,$RECORD,$VCF) = @_; + reuse_repeats_tabix(); + + my($from) = ";$$RECORD[7]" =~ m/;RS=(\d+)/; + my($to) = ";$$RECORD[7]" =~ m/;RE=(\d+)/; + + my $ret = eval{ + # as vcf POS for indels is the previous base pos is 0-based, but the new TABIX requires 1-based + my $iter = $vcf_flagging_repeats_tabix->query_full($CHROM,$from,$to); + return $PASS if(!defined $iter); # no valid entries (chromosome not in index) so must pass + while($iter->next){ + return $FAIL; + } + return $PASS; + }; + if($@) { + die $@; + } + return $ret; +} + +sub long { + my ($MATCH,$CHROM,$POS,$FAIL,$PASS,$RECORD,$VCF) = @_; + if($MATCH > 1000 ){ + return $FAIL + } + return $PASS; +} + + +1; diff --git a/perl/rules/metanormRules.lst b/perl/rules/metanormRules.lst new file mode 100644 index 0000000..5f61282 --- /dev/null +++ b/perl/rules/metanormRules.lst @@ -0,0 +1,4 @@ +Sanger::CGP::PindelPostProcessing::MetanormFilterRules +F006 +F017 +LONG diff --git a/perl/t/1_pm_compile.t b/perl/t/1_pm_compile.t index 5e55993..a351fc5 100644 --- a/perl/t/1_pm_compile.t +++ b/perl/t/1_pm_compile.t @@ -1,23 +1,4 @@ -########## LICENCE ########## -# Copyright (c) 2014-2021 Genome Research Ltd. -# -# Author: CASM/Cancer IT -# -# This file is part of cgpPindel. 
-# -# cgpPindel is free software: you can redistribute it and/or modify it under -# the terms of the GNU Affero General Public License as published by the Free -# Software Foundation; either version 3 of the License, or (at your option) any -# later version. -# -# This program is distributed in the hope that it will be useful, but WITHOUT -# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS -# FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more -# details. -# -# You should have received a copy of the GNU Affero General Public License -# along with this program. If not, see . -########## LICENCE ########## + # this is a catch all to ensure all modules do compile # added as lots of 'use' functionality is dynamic in pipeline @@ -38,7 +19,7 @@ my $lib_path = "$Bin/../lib"; # Add modules here that cannot be instantiated (should be extended and have no 'new') # or need a set of inputs - these should be tested in own test script -use constant MODULE_SKIP => qw(Sanger::CGP::Pindel::InputGen::Pair Sanger::CGP::Pindel::InputGen::Read Sanger::CGP::Pindel::OutputGen::CombinedRecordGenerator Sanger::CGP::Pindel::OutputGen::PindelRecordParser Sanger::CGP::Pindel::OutputGen::VcfConverter); +use constant MODULE_SKIP => qw(Sanger::CGP::Pindel::InputGen::Pair Sanger::CGP::Pindel::InputGen::Read Sanger::CGP::Pindel::OutputGen::CombinedRecordGenerator Sanger::CGP::Pindel::OutputGen::PindelRecordParser Sanger::CGP::Pindel::OutputGen::VcfConverter Sanger::CGP::Pindel::OutputGen::VcfBlatAugment); my $init_cwd = getcwd; diff --git a/perl/t/2_pl_compile.t b/perl/t/2_pl_compile.t index e76269c..c2d8bd5 100644 --- a/perl/t/2_pl_compile.t +++ b/perl/t/2_pl_compile.t @@ -1,23 +1,4 @@ -########## LICENCE ########## -# Copyright (c) 2014-2021 Genome Research Ltd. -# -# Author: CASM/Cancer IT -# -# This file is part of cgpPindel. 
-# -# cgpPindel is free software: you can redistribute it and/or modify it under -# the terms of the GNU Affero General Public License as published by the Free -# Software Foundation; either version 3 of the License, or (at your option) any -# later version. -# -# This program is distributed in the hope that it will be useful, but WITHOUT -# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS -# FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more -# details. -# -# You should have received a copy of the GNU Affero General Public License -# along with this program. If not, see . -########## LICENCE ########## + # this is a catch all to ensure all modules do compile # added as lots of 'use' functionality is dynamic in pipeline diff --git a/perl/t/data/blat/D/test.vcf b/perl/t/data/blat/D/test.vcf new file mode 100644 index 0000000..34c489f --- /dev/null +++ b/perl/t/data/blat/D/test.vcf @@ -0,0 +1,3385 @@ +##fileformat=VCFv4.1 +##fileDate=10200428 +##source_20200428.1=pindelCohort_to_vcf.pl_v3.3.0 +##reference=/lustre/scratch119/casm/team78pipelines/reference/human/GRCh38_full_analysis_set_plus_decoy_hla/genome.fa +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= 
+##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= 
+##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= 
+##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= 
+##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= 
+##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= 
+##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= 
+##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= 
+##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= 
+##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= 
+##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= 
+##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= 
+##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= 
+##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= 
+##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= 
+##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= 
+##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= 
+##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= 
+##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= 
+##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##FORMAT= +##FORMAT= +##FORMAT= +##vcfProcessLog_20200428.1=,InputVCFVer=> +##vcfProcessLog_20200428.2=,InputVCFVer=<3.3.0>> +##SAMPLE= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT PD26988a +chr10 11201 CA C 390 . PC=D;RS=11201;RE=11205;LEN=1;S1=11;S2=849.236;REP=3 GT:PP:NP ./.:10:0 diff --git a/perl/t/data/blat/DI/test.vcf b/perl/t/data/blat/DI/test.vcf new file mode 100644 index 0000000..fac9b07 --- /dev/null +++ b/perl/t/data/blat/DI/test.vcf @@ -0,0 +1,3385 @@ +##fileformat=VCFv4.1 +##fileDate=10200428 +##source_20200428.1=pindelCohort_to_vcf.pl_v3.3.0 +##reference=/lustre/scratch119/casm/team78pipelines/reference/human/GRCh38_full_analysis_set_plus_decoy_hla/genome.fa +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= 
+##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= 
+##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= 
+##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= 
+##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= 
+##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= 
+##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= 
+##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= 
+##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= 
+##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= 
+##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= 
+##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= 
+##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= 
+##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= 
+##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= 
+##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= 
+##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= 
+##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= 
+##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= 
+##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##FORMAT= +##FORMAT= +##FORMAT= +##vcfProcessLog_20200428.1=,InputVCFVer=> +##vcfProcessLog_20200428.2=,InputVCFVer=<3.3.0>> +##SAMPLE= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT PD26988a +chr10 22777 AGAAACTGTG ACTGTGAGATAGATATATATAGATAGATATAT 105 . PC=DI;RS=22777;RE=22787;LEN=9;S1=6;REP=0 GT:PP:NP ./.:0:5 diff --git a/perl/t/data/blat/SI/test.vcf b/perl/t/data/blat/SI/test.vcf new file mode 100644 index 0000000..f83e652 --- /dev/null +++ b/perl/t/data/blat/SI/test.vcf @@ -0,0 +1,3385 @@ +##fileformat=VCFv4.1 +##fileDate=10200428 +##source_20200428.1=pindelCohort_to_vcf.pl_v3.3.0 +##reference=/lustre/scratch119/casm/team78pipelines/reference/human/GRCh38_full_analysis_set_plus_decoy_hla/genome.fa +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= 
+##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= 
+##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= 
+##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= 
+##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= 
+##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= 
+##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= 
+##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= 
+##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= 
+##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= 
+##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= 
+##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= 
+##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= 
+##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= 
+##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= 
+##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= 
+##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= 
+##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= 
+##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= 
+##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##FORMAT= +##FORMAT= +##FORMAT= +##vcfProcessLog_20200428.1=,InputVCFVer=> +##vcfProcessLog_20200428.2=,InputVCFVer=<3.3.0>> +##SAMPLE= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT PD26988a +chr10 11643 C CG 150 . PC=I;RS=11643;RE=11649;LEN=1;S1=6;S2=421.908;REP=4 GT:PP:NP ./.:0:5 diff --git a/perl/t/data/blat/chr10_1-23700.fa b/perl/t/data/blat/chr10_1-23700.fa new file mode 100644 index 0000000..0f3edae --- /dev/null +++ b/perl/t/data/blat/chr10_1-23700.fa @@ -0,0 +1,396 @@ +>chrdiff --git a/perl/t/data/blat/chr10_1-23700.fa.dict b/perl/t/data/blat/chr10_1-23700.fa.dict new file mode 100644 index 0000000..25c6d07 --- /dev/null +++ b/perl/t/data/blat/chr10_1-23700.fa.dict @@ -0,0 +1,2 @@ +@HD VN:1.0 SO:unsorted +@SQ SN:chr10 LN:23700 M5:5d182da12dc542ac36636bab171df9fb UR:file:///home/kr2/GitHub/cancerit/cgpPindel/perl/t/data/blat/chr10_1-23700.fa diff --git a/perl/t/data/blat/chr10_1-23700.fa.fai b/perl/t/data/blat/chr10_1-23700.fa.fai new file mode 100644 index 0000000..d99b55d --- /dev/null +++ b/perl/t/data/blat/chr10_1-23700.fa.fai @@ -0,0 +1 @@ +chr10 23700 7 60 61 diff --git a/perl/t/data/blat/test.bam b/perl/t/data/blat/test.bam new file mode 100644 index 0000000..5bf9969 Binary files /dev/null and b/perl/t/data/blat/test.bam differ diff --git a/perl/t/data/blat/test.bam.bai b/perl/t/data/blat/test.bam.bai new file mode 100644 index 0000000..0905a97 Binary files /dev/null and b/perl/t/data/blat/test.bam.bai differ diff --git a/perl/t/data/blat/test.bam.bas b/perl/t/data/blat/test.bam.bas new file mode 100644 index 0000000..41e25ba --- /dev/null +++ b/perl/t/data/blat/test.bam.bas @@ -0,0 +1,2 @@ +bam_filename 
sample platform platform_unit library readgroup read_length_r1 read_length_r2 #_mapped_bases #_mapped_bases_r1 #_mapped_bases_r2 #_divergent_bases #_divergent_bases_r1 #_divergent_bases_r2 #_total_reads #_total_reads_r1 #_total_reads_r2 #_mapped_reads #_mapped_reads_r1 #_mapped_reads_r2 #_mapped_reads_properly_paired #_gc_bases_r1 #_gc_bases_r2 mean_insert_size insert_size_sd median_insert_size #_duplicate_reads #_mapped_pairs #_inter_chr_pairs #_qc_fail_r1 #_qc_fail_r2 +- PD26988a ILLUMINA 20016_2 91997 201373 151 151 110646158540 56887159078 53758999462 603934948 241629848 362305100 787310684 393655342 393655342 743181302 380223694 362957608 365546896 24044102821 24329729234 436.769 135.373 419.000 99915177 377442095 8795503 6805992 21985142 diff --git a/perl/t/inputGen.t b/perl/t/inputGen.t index a540e9e..9980942 100644 --- a/perl/t/inputGen.t +++ b/perl/t/inputGen.t @@ -1,23 +1,4 @@ -########## LICENCE ########## -# Copyright (c) 2014-2021 Genome Research Ltd. -# -# Author: CASM/Cancer IT -# -# This file is part of cgpPindel. -# -# cgpPindel is free software: you can redistribute it and/or modify it under -# the terms of the GNU Affero General Public License as published by the Free -# Software Foundation; either version 3 of the License, or (at your option) any -# later version. -# -# This program is distributed in the hope that it will be useful, but WITHOUT -# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS -# FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more -# details. -# -# You should have received a copy of the GNU Affero General Public License -# along with this program. If not, see . -########## LICENCE ########## + use strict; use File::Temp qw(tempdir); diff --git a/perl/t/inputGenRead.t b/perl/t/inputGenRead.t index 62309e8..c83f77a 100644 --- a/perl/t/inputGenRead.t +++ b/perl/t/inputGenRead.t @@ -1,23 +1,4 @@ -########## LICENCE ########## -# Copyright (c) 2014-2021 Genome Research Ltd. 
-# -# Author: CASM/Cancer IT -# -# This file is part of cgpPindel. -# -# cgpPindel is free software: you can redistribute it and/or modify it under -# the terms of the GNU Affero General Public License as published by the Free -# Software Foundation; either version 3 of the License, or (at your option) any -# later version. -# -# This program is distributed in the hope that it will be useful, but WITHOUT -# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS -# FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more -# details. -# -# You should have received a copy of the GNU Affero General Public License -# along with this program. If not, see . -########## LICENCE ########## + use strict; use Test::More; diff --git a/perl/t/outputGenCombinedRecordGenerator.t b/perl/t/outputGenCombinedRecordGenerator.t index 068e347..d1189ff 100644 --- a/perl/t/outputGenCombinedRecordGenerator.t +++ b/perl/t/outputGenCombinedRecordGenerator.t @@ -1,23 +1,4 @@ -########## LICENCE ########## -# Copyright (c) 2014-2021 Genome Research Ltd. -# -# Author: CASM/Cancer IT -# -# This file is part of cgpPindel. -# -# cgpPindel is free software: you can redistribute it and/or modify it under -# the terms of the GNU Affero General Public License as published by the Free -# Software Foundation; either version 3 of the License, or (at your option) any -# later version. -# -# This program is distributed in the hope that it will be useful, but WITHOUT -# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS -# FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more -# details. -# -# You should have received a copy of the GNU Affero General Public License -# along with this program. If not, see . 
-########## LICENCE ########## + use strict; use Test::More; diff --git a/perl/t/outputGenPindelRecordParser.t b/perl/t/outputGenPindelRecordParser.t index 35f4d8c..8a4f7a9 100644 --- a/perl/t/outputGenPindelRecordParser.t +++ b/perl/t/outputGenPindelRecordParser.t @@ -1,23 +1,4 @@ -########## LICENCE ########## -# Copyright (c) 2014-2021 Genome Research Ltd. -# -# Author: CASM/Cancer IT -# -# This file is part of cgpPindel. -# -# cgpPindel is free software: you can redistribute it and/or modify it under -# the terms of the GNU Affero General Public License as published by the Free -# Software Foundation; either version 3 of the License, or (at your option) any -# later version. -# -# This program is distributed in the hope that it will be useful, but WITHOUT -# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS -# FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more -# details. -# -# You should have received a copy of the GNU Affero General Public License -# along with this program. If not, see . 
-########## LICENCE ########## + use strict; use Test::More; @@ -470,7 +451,9 @@ subtest 'Object funcions' => sub { ['EAS139_64:1:55:1728:1427_r2_D0',16,22,16060468,29,'12M6D63M','*',0,0,'AGTTAACTCTCTTTTTTCTTTTTCTTTTTCTTTTTCTTTTTCTTTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTT','*','MD:Z:1A10^TTTTTC63','NM:i:7']]}}, -lub => 'T', -min_change => lc'TTTTTC', - -repeats => 7 + -repeats => 7, + -ref_left => 'GAGACCTCCCCAGAAATGGATGCCAGCATTATGCTTCCTATACAGCCTGCAGAACCATGAGCCAATTAACTCTCT', + -ref_right => 'TTTTTCTTTTTCTTTTTCTTTTTCTTTTTCTTTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCT', ); $obj->_parse_alignment($record_1,$alignments_1,\$header_string_1); @@ -514,7 +497,9 @@ subtest 'Object funcions' => sub { ['EAS131_6:8:80:742:1825_r2_D0',16,22,16060477,60,'45M30D30M','*',0,0,'TCTTTTTTCTTTTTCTTTTTCTTTTTCTTTTTCTTTTTCTTTTTCTCTTTCTTTCTTTCTTTCTTTCTTTCTTTC','*','MD:Z:45^TTTCTTTCTTTCTTTCTTTCTTTCTTTCTT29T0','NM:i:31']]}}, -lub => 'C', -min_change => 'TTTCTTTCTTTCTTTCTTTCTTTCTTTCTT', - -repeats => 1 # The minimum repeat is not actually repeated within the local vicinity of the event, despite the region being a repeat-region. + -repeats => 1, # The minimum repeat is not actually repeated within the local vicinity of the event, despite the region being a repeat-region. + -ref_left => 'CAGCCTGCAGAACCATGAGCCAATTAACTCTCTTTTTTCTTTTTCTTTTTCTTTTTCTTTTTCTTTTTCTTTTTC', + -ref_right => 'TCTTTCTTTCTTTCTTTCTTTCTTTCTTTTTGTTTTCTTTCATCTTTCCTTCTTCTTTTTT', ); $obj->_parse_alignment($record_2,$alignments_2,\$header_string_2); diff --git a/perl/t/outputGenVcfConverter.t b/perl/t/outputGenVcfConverter.t index 17d5499..b44036c 100644 --- a/perl/t/outputGenVcfConverter.t +++ b/perl/t/outputGenVcfConverter.t @@ -1,23 +1,4 @@ -########## LICENCE ########## -# Copyright (c) 2014-2021 Genome Research Ltd. -# -# Author: CASM/Cancer IT -# -# This file is part of cgpPindel. 
-# -# cgpPindel is free software: you can redistribute it and/or modify it under -# the terms of the GNU Affero General Public License as published by the Free -# Software Foundation; either version 3 of the License, or (at your option) any -# later version. -# -# This program is distributed in the hope that it will be useful, but WITHOUT -# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS -# FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more -# details. -# -# You should have received a copy of the GNU Affero General Public License -# along with this program. If not, see . -########## LICENCE ########## + use strict; use Test::More; diff --git a/perl/t/vcfBlatAugment.t b/perl/t/vcfBlatAugment.t new file mode 100644 index 0000000..1ca3fc5 --- /dev/null +++ b/perl/t/vcfBlatAugment.t @@ -0,0 +1,99 @@ +use strict; +use File::Temp qw(tempdir); +use File::Path qw(make_path); +use Test::More; +use Test::Fatal; +use File::Spec::Functions; +use Const::Fast qw(const); +use FindBin qw($Bin); + +const my $MODULE => 'Sanger::CGP::Pindel::OutputGen::VcfBlatAugment'; +const my $DATA => "$Bin/data/blat"; +const my @HEADER_ENDS => do { + no warnings 'qw'; + qw(#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT PD26988a); +}; +const my $HEADER_LINES => 3393; + +const my $DATA_ARR_D => [qw(chr10 11201 id CA C 390 . PC=D;RS=11201;RE=11205;LEN=1;S1=11;S2=849.236;REP=3 GT:PP:NP ./.:10:0)]; +const my $RES_ARR_D => [qw(chr10 11201 id CA C 390 . PC=D;RS=11201;RE=11205;LEN=1;S1=11;S2=849.236;REP=3 GT:PP:NP ./.:10:0:2:0:0.010:0:1:0.007:0.333)]; +const my $DATA_ARR_DI => [qw(chr10 22777 id AGAAACTGTG ACTGTGAGATAGATATATATAGATAGATATAT 105 . PC=DI;RS=22777;RE=22787;LEN=9;S1=6;REP=0 GT:PP:NP ./.:0:5)]; +const my $RES_ARR_DI => [qw(chr10 22777 id AGAAACTGTG ACTGTGAGATAGATATATATAGATAGATATAT 105 . PC=DI;RS=22777;RE=22787;LEN=9;S1=6;REP=0 GT:PP:NP ./.:0:5:1:2:0.002:0:1:0.007:0.250)]; +const my $DATA_ARR_SI => [qw(chr10 11643 id C CG 150 . 
PC=I;RS=11643;RE=11649;LEN=1;S1=6;S2=421.908;REP=4 GT:PP:NP ./.:0:5)]; +const my $RES_ARR_SI => [qw(chr10 11643 id C CG 150 . PC=I;RS=11643;RE=11649;LEN=1;S1=6;S2=421.908;REP=4 GT:PP:NP ./.:0:5:2:6:0.014:5:5:0.003:0.556)]; + +my ($stdout_fh, $buffer); +my ($sam_stdout_fh, $sam_buffer); + +subtest 'Initialisation checks' => sub { + use_ok($MODULE); + new_vba(catdir($DATA, 'D')); +}; + +subtest 'Header checks' => sub { + my $vba = new_vba(catdir($DATA, 'D')); + ok($vba->output_header); + my @lines = split /\n/, $buffer; + is(scalar @lines, $HEADER_LINES, 'Expected number of header lines'); + is($lines[-1], join("\t", @HEADER_ENDS), 'Expected final header line'); +}; + +subtest 'Simple Deletion checks' => sub { + my $vba = new_vba(catdir($DATA, 'D')); + my @tmp = @{$DATA_ARR_D}; + $vba->blat_record(\@tmp, tempdir(CLEANUP => 1)); + is_deeply(\@tmp, $RES_ARR_D); +}; + +subtest 'Simple Insertion checks' => sub { + my $vba = new_vba(catdir($DATA, 'SI')); + my @tmp = @{$DATA_ARR_SI}; + $vba->blat_record(\@tmp, tempdir(CLEANUP => 1)); + is_deeply(\@tmp, $RES_ARR_SI); +}; + +subtest 'Complex event checks' => sub { + my $vba = new_vba(catdir($DATA, 'DI')); + my @tmp = @{$DATA_ARR_DI}; + $vba->blat_record(\@tmp, tempdir(CLEANUP => 1)); + print join q{ }, @tmp; + is_deeply(\@tmp, $RES_ARR_DI); +}; + +done_testing(); + + +sub new_vba { + my $dir = shift; + my $tmp = '/tmp/pindel_test_stuff'; + make_path($tmp); + my $obj = new_ok($MODULE, [ + input => catfile($dir, 'test.vcf'), + ref => catfile($DATA, 'chr10_1-23700.fa'), + ofh => buffer_fh(), + outpath => $tmp, + hts_files => [catfile($DATA, 'test.bam')], + ]); + my $sample = $obj->{vcf_sample_order}->[0]; + $obj->{sfh}->{$sample} = sam_buffer_fh(); + unlink $tmp; + return $obj; +} + +sub buffer_fh { + if(defined $stdout_fh) { + close $stdout_fh; + } + $buffer = q{}; + open $stdout_fh, ">", \$buffer or die $!; + return $stdout_fh; +} + +sub sam_buffer_fh { + if(defined $sam_stdout_fh) { + close $sam_stdout_fh; + } + $sam_buffer 
= q{}; + open $sam_stdout_fh, ">", \$sam_buffer or die $!; + return $sam_stdout_fh; +} diff --git a/perl/t/vcfPindelFlagger.t b/perl/t/vcfPindelFlagger.t index b99f2bb..2387235 100644 --- a/perl/t/vcfPindelFlagger.t +++ b/perl/t/vcfPindelFlagger.t @@ -1,9 +1,3 @@ -#################################################### -# Copyright (c) 2014-2021 Genome Research Ltd. -# Author: CASM/Cancer IT, cgphelp@sanger.ac.uk -# See LICENCE for details -#################################################### - use strict; use warnings; use Cwd 'abs_path'; diff --git a/perl/util/README.md b/perl/util/README.md new file mode 100644 index 0000000..f170a28 --- /dev/null +++ b/perl/util/README.md @@ -0,0 +1,3 @@ +# perl/util + +Scripts to aid in R&D processes, these are not installed. diff --git a/perl/util/cohortVcfToGrids.pl b/perl/util/cohortVcfToGrids.pl new file mode 100755 index 0000000..3243693 --- /dev/null +++ b/perl/util/cohortVcfToGrids.pl @@ -0,0 +1,87 @@ +#!/usr/bin/perl +# Copyright (c) 2014-2021 Genome Research Ltd +# +# Author: CASM/Cancer IT +# +# This file is part of cgpPindel. +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . +# +# 1. The usage of a range of years within a copyright statement contained within +# this distribution should be interpreted as being equivalent to a list of years +# including the first and last year specified and all consecutive years between +# them. 
For example, a copyright statement that reads ‘Copyright (c) 2005, 2007- +# 2009, 2011-2012’ should be interpreted as being identical to a statement that +# reads ‘Copyright (c) 2005, 2007, 2008, 2009, 2011, 2012’ and a copyright +# statement that reads ‘Copyright (c) 2005-2012’ should be interpreted as being +# identical to a statement that reads ‘Copyright (c) 2005, 2006, 2007, 2008, +# 2009, 2010, 2011, 2012’. +# + +use strict; +use IO::Uncompress::Gunzip qw($GunzipError) ; +use List::Util qw(sum); + +my ($combined_vcf, $exclude_sample, $output) = @ARGV; +# use a basic bed style output so we can intersect etc + +my ($wtp, $wtn, $mtp, $mtn) = (5,6,8,9); + +my $z = new IO::Uncompress::Gunzip $combined_vcf, MultiStream => 1 or die "IO::Uncompress::Gunzip failed: $GunzipError\n"; +open my $O_DPTH, '>', $output.'.depth' or die "Failed to create $output.depth"; +open my $O_ALT, '>', $output.'.alt' or die "Failed to create $output.alt"; +my @sample_order; +while (my $l = <$z>) { + next if $l =~ m/^##/; + chomp $l; + my @data = split "\t", $l; + my ($chr, $pos, $ref, $alt, $gt) = @data[0,1,3,4,8]; + my @samples = @data[9..$#data]; + # header junk + if($chr eq '#CHROM'){ + @sample_order = @samples; + print $O_DPTH 'ID'; + print $O_ALT 'ID'; + for my $idx (0..$#samples) { + if($sample_order[$idx] eq $exclude_sample) { + next; + } + print $O_DPTH "\t".$sample_order[$idx]; + print $O_ALT "\t".$sample_order[$idx]; + } + print $O_DPTH "\n"; + print $O_ALT "\n"; + + next; + } + # body + $chr =~ s/^chr//; + my $id = sprintf '%s_%d_%s_%s', $chr, $pos, $ref, $alt; + my @total_depth = ( $id ); + my @alt_depth = ( $id ); + for my $idx (0..$#samples) { + if($sample_order[$idx] eq $exclude_sample) { + next; + } + my @gt_data = split ':', $samples[$idx]; + push @total_depth, sum (@gt_data[$wtp, $wtn, $mtp, $mtn]); + push @alt_depth, sum (@gt_data[$mtp, $mtn]); + #print "$gt : $samples[$idx] : $total_depth : $mut_depth\n"; + } + print $O_DPTH join("\t", @total_depth)."\n"; + print 
$O_ALT join("\t", @alt_depth)."\n"; +} +close $O_DPTH; +close $O_ALT; +close $z; diff --git a/perl/util/pairedSplit.pl b/perl/util/pairedSplit.pl new file mode 100755 index 0000000..e464c68 --- /dev/null +++ b/perl/util/pairedSplit.pl @@ -0,0 +1,95 @@ +#!/usr/bin/env perl +# Copyright (c) 2014-2021 Genome Research Ltd +# +# Author: CASM/Cancer IT +# +# This file is part of cgpPindel. +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . +# +# 1. The usage of a range of years within a copyright statement contained within +# this distribution should be interpreted as being equivalent to a list of years +# including the first and last year specified and all consecutive years between +# them. For example, a copyright statement that reads ‘Copyright (c) 2005, 2007- +# 2009, 2011-2012’ should be interpreted as being identical to a statement that +# reads ‘Copyright (c) 2005, 2007, 2008, 2009, 2011, 2012’ and a copyright +# statement that reads ‘Copyright (c) 2005-2012’ should be interpreted as being +# identical to a statement that reads ‘Copyright (c) 2005, 2006, 2007, 2008, +# 2009, 2010, 2011, 2012’. 
+# + +use strict; +use v5.16; +use Data::Dumper; + +my ($t_name, $n_name, $in) = @ARGV; +open my $IF, '<', $in or die "Failed to open $in: $!"; +my ($t_pos, $n_pos); +while(my $l = <$IF>) { + next if ($l =~ m/^##/); + chomp $l; + my @bits = split /\t/, $l; + if($l =~ m/^#CHRO/) { + ($t_pos, $n_pos) = find_samples(\@bits, $t_name, $n_name); + next; + } + process_record($bits[$t_pos], $bits[$n_pos], $l); +} +close $IF; + +sub process_record { + my ($t_geno, $n_geno, $record) = @_; + if($n_geno eq q{.}) { + return; + } + if($t_geno eq q{.}) { + return; + } + else { + my (undef, $s1, $s2, $pp, $np, $wp, $wn, $wm, $mp,$mn, $mm, $vaf) = split ':', $t_geno; + return if(($mp + $mn + $wp + $wn) > 60); # example is 60x + return if($mm >= 0.035); + return if($vaf < 0.05); + return if($mp == 0); + return if($mn == 0); + return if($wp == 0); + return if($wn == 0); + say $record; + #if($mm < 0.035 && $vaf >= 0.02 && $mp > 0 && $mn > 0 && ) { + # say $record; + #} + } + +} + +sub find_samples { + my ($bits, $t_name, $n_name) = @_; + my ($t_pos, $n_pos); + my $a_s = @{$bits} - 1; + for my $i(0..$a_s) { + if($bits->[$i] eq $t_name) { + $t_pos = $i; + } + if($bits->[$i] eq $n_name) { + $n_pos = $i; + } + } + unless(defined $t_pos) { + die "Failed to find $t_name\n" + } + unless(defined $n_pos) { + die "Failed to find $n_name\n" + } + return ($t_pos, $n_pos); +} diff --git a/perl/util/vafCorrectToGrids.pl b/perl/util/vafCorrectToGrids.pl new file mode 100755 index 0000000..e8f1e6b --- /dev/null +++ b/perl/util/vafCorrectToGrids.pl @@ -0,0 +1,85 @@ +#!/usr/bin/perl +# Copyright (c) 2014-2021 Genome Research Ltd +# +# Author: CASM/Cancer IT +# +# This file is part of cgpPindel. +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. 
+# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . +# +# 1. The usage of a range of years within a copyright statement contained within +# this distribution should be interpreted as being equivalent to a list of years +# including the first and last year specified and all consecutive years between +# them. For example, a copyright statement that reads ‘Copyright (c) 2005, 2007- +# 2009, 2011-2012’ should be interpreted as being identical to a statement that +# reads ‘Copyright (c) 2005, 2007, 2008, 2009, 2011, 2012’ and a copyright +# statement that reads ‘Copyright (c) 2005-2012’ should be interpreted as being +# identical to a statement that reads ‘Copyright (c) 2005, 2006, 2007, 2008, +# 2009, 2010, 2011, 2012’. 
+# + +use strict; + +my ($vaf_correct, $exclude_sample, $output) = @ARGV; + +open my $O_VC, '<', $vaf_correct; + +my $header = <$O_VC>; +chomp $header; +my @head_items = split "\t", $header; +my ($chr, $pos, $ref, $alt) = @head_items[2,3,4,5]; +my %mtr_by_sample; +for my $i(6..$#head_items) { + if($head_items[$i] =~ m/_MTR$/) { + my $sample = $head_items[$i]; + $sample =~ s/_MTR$//; + next if($sample eq $exclude_sample); + $mtr_by_sample{$sample} = $i; + } +} + +my @samples = sort keys %mtr_by_sample; +my (@mtr_cols, @wtr_cols); +for my $s(@samples) { + push @mtr_cols, $mtr_by_sample{$s}; + push @wtr_cols, $mtr_by_sample{$s} + 1; +} + + +open my $O_DPTH, '>', $output.'.depth' or die "Failed to create $output.depth"; +open my $O_ALT, '>', $output.'.alt' or die "Failed to create $output.alt"; + +print $O_DPTH join("\t", "ID", @samples)."\n"; +print $O_ALT join("\t", "ID", @samples)."\n"; + +while(my $l = <$O_VC>) { + chomp $l; + my @row = split "\t", $l; + my ($chr, $pos, $ref, $alt) = @row[2,3,4,5]; + $chr =~ s/^chr//; + my $id = sprintf '%s_%d_%s_%s', $chr, $pos, $ref, $alt; + my @total_depth; + my @alt_depth; + for my $i(0..$#mtr_cols) { + # same length (or broken) + push @total_depth, $row[$mtr_cols[$i]] + $row[$wtr_cols[$i]]; + push @alt_depth, $row[$mtr_cols[$i]]; + } + print $O_DPTH join("\t", $id, @total_depth)."\n"; + print $O_ALT join("\t", $id, @alt_depth)."\n"; +} + +close $O_DPTH; +close $O_ALT; +close $O_VC; diff --git a/python/pindelMmPlots.py b/python/pindelMmPlots.py new file mode 100755 index 0000000..efd41bb --- /dev/null +++ b/python/pindelMmPlots.py @@ -0,0 +1,131 @@ +#!/usr/bin/env python3 +# +# Copyright (c) 2014-2021 Genome Research Ltd +# +# Author: CASM/Cancer IT +# +# This file is part of cgpPindel. 
+# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . +# +# 1. The usage of a range of years within a copyright statement contained within +# this distribution should be interpreted as being equivalent to a list of years +# including the first and last year specified and all consecutive years between +# them. For example, a copyright statement that reads ‘Copyright (c) 2005, 2007- +# 2009, 2011-2012’ should be interpreted as being identical to a statement that +# reads ‘Copyright (c) 2005, 2007, 2008, 2009, 2011, 2012’ and a copyright +# statement that reads ‘Copyright (c) 2005-2012’ should be interpreted as being +# identical to a statement that reads ‘Copyright (c) 2005, 2006, 2007, 2008, +# 2009, 2010, 2011, 2012’. 
+ +# core libs +import os # for mkdir, path stuff +import sys +import argparse + +# requirements +import vcfpy +import pandas as pd +import seaborn as sns +import matplotlib.pyplot as plt + +def parse_vcf(df_list, vaf_counts, rl, vcfin, type): + reader = vcfpy.Reader.from_path(vcfin) + sample = reader.header.samples.names[0] + for record in reader: + if record.INFO['PC'] != type: + continue + call = record.call_for_sample[sample] + if call.data.get('MTP') is None and call.data.get('MTN'): + continue + + # mismatch data only on Pos + # diff data only on MT + df_list.append({'RL': rl, 'SAMPLE': 'MT', 'STRAND': 'POS', 'Reads': call.data.get('MTP'), 'Mismatch': call.data.get('WTM') or 0, 'Diff': call.data.get('MTP') - call.data.get('PP')}) + df_list.append({'RL': rl, 'SAMPLE': 'MT', 'STRAND': 'NEG', 'Reads': call.data.get('MTN'), 'Diff': call.data.get('MTN') - call.data.get('NP')}) + if(call.data.get('VAF') == 0.000): + if rl in vaf_counts: + vaf_counts[float(rl)] += 1 + else: + vaf_counts[float(rl)] = 1 + df_list.append({'RL': rl, 'SAMPLE': 'WT', 'STRAND': 'POS', 'Reads': call.data.get('WTP'), 'Mismatch': call.data.get('MTM') or 0}) + df_list.append({'RL': rl, 'SAMPLE': 'WT', 'STRAND': 'NEG', 'Reads': call.data.get('WTN')}) + +def process_vcfs(options): + ### split the labels and create map with vcfs + r_lengths = options.labels.split(',') + if len(r_lengths) != len(options.vcfs): + sys.exit('Error: "-labels" needs to have the same number of elements as the number of vcfs supplied.') + vcfs = {} + for l, v in zip(r_lengths, options.vcfs): + vcfs[float(l)] = v + + df_list = [] + vaf_counts = {} + for vcf_set in vcfs.items(): + print(f'Processing read length multiplier {vcf_set[0]} for type {options.type}') + parse_vcf(df_list, vaf_counts, vcf_set[0], vcf_set[1], options.type) + df = pd.DataFrame.from_records(df_list) + + facet_boxplot(df, 'SAMPLE', 'STRAND', 'RL', 'Reads', options.type+'_reads.png', title='BLAT read depth', ylim=(0, 50)) + facet_boxplot(df, 'SAMPLE', 
None, 'RL', 'Mismatch', options.type+'_mm.png', title='Mismatch fraction for BLAT reads', aspect=1.2, ylim=(0, 0.05)) + facet_boxplot(df, 'STRAND', None, 'RL', 'Diff', options.type+'_diff.png', title='BLAT reads - Pindel reads', aspect=1.2, ylim=(-20, 10)) + barchart(vaf_counts, 'RL', '0-VAF', options.type+'_0vaf.png', title='Events with VAF=0') + +def facet_boxplot(df, row: str, col: str, x_item: str, y_item: str, out_file: str, title=None, aspect=1, ylim=None): + sns.set() + grid = sns.FacetGrid( + df, + row=row, col=col, margin_titles=True, + ylim=ylim, aspect=aspect + ) + grid.map(sns.boxplot, x_item, y_item); + if title: + grid.fig.subplots_adjust(top=.9) + grid.fig.suptitle(title, size=14) + grid.set_xticklabels(rotation=80) + grid.savefig(out_file) + grid.fig.clf() # this clears the figure + +def barchart(data_dict, x_label: str, y_label: str, out_file: str, title=None): + sns.set() + x_items = [] + y_items = [] + for k in sorted(data_dict): + x_items.append(k) + y_items.append(data_dict[k]) + bp = sns.barplot(x=x_items, y=y_items) + if title: + bp.set_title(title) + bp.set_yscale('log') + bp.set_xlabel(x_label) + bp.set_ylabel(y_label) + bp.set_xticklabels(bp.get_xticklabels(), rotation=80) + plt.savefig(out_file) + + +parser = argparse.ArgumentParser(description='Generate wisker plots of mismatch rates for a set of vcfs') +parser.add_argument('-d', '--dir', dest='outDir', metavar='outDir', help='Directory for output', required=True) +parser.add_argument('-l', '-labels', dest='labels', metavar='1.0,...', help='CSV of readlength multipliers, same order as vcfs they apply to', required=True) +parser.add_argument('-t', '-type', dest='type', choices=['D','DI','I'], help='Pindel data type (PC=?)', required=True) +parser.add_argument('-f', '--format', dest='imgFormat', choices=['png','pdf','svg'], help='Format to save venn diagram', required=False, default='png') +parser.add_argument('vcfs', nargs='+') + +args = parser.parse_args() + +# build the output folder 
before starting work +if os.path.exists(args.outDir) is False: + os.mkdir(args.outDir, mode=0o700) # rwx owner only + +process_vcfs(args) diff --git a/setup.sh b/setup.sh index f744386..350285e 100755 --- a/setup.sh +++ b/setup.sh @@ -33,6 +33,7 @@ # need to keep in sync with Dockerfile export VER_CGPVCF="v2.2.1" export VER_VCFTOOLS="0.1.16" +export VER_BLAT="v385" get_file () { # output, source @@ -43,7 +44,6 @@ get_file () { fi } - if [[ ($# -ne 1 && $# -ne 2) ]] ; then echo "Please provide an installation path and optionally perl lib paths to allow, e.g." echo " ./setup.sh /opt/myBundle" @@ -58,6 +58,7 @@ if [[ $# -eq 2 ]] ; then CGP_PERLLIBS=$2 fi + # get current directory INIT_DIR=`pwd`