diff --git a/conda/linux-latest.txt b/conda/linux-latest.txt index a8cf8c0..b264af8 100644 --- a/conda/linux-latest.txt +++ b/conda/linux-latest.txt @@ -2,9 +2,8 @@ # $ conda create --name --file # platform: linux-64 @EXPLICIT -https://conda.anaconda.org/bioconda/linux-64/seqkit-2.8.0-h9ee0642_0.tar.bz2 https://conda.anaconda.org/conda-forge/linux-64/_libgcc_mutex-0.1-conda_forge.tar.bz2 -https://conda.anaconda.org/conda-forge/linux-64/ca-certificates-2024.2.2-hbcca054_0.conda +https://conda.anaconda.org/conda-forge/linux-64/ca-certificates-2024.6.2-hbcca054_0.conda https://conda.anaconda.org/t/no-15126f85-4dce-4462-b48d-9bc580ca6641/conda-forge/noarch/font-ttf-dejavu-sans-mono-2.37-hab24e00_0.tar.bz2 https://conda.anaconda.org/t/no-15126f85-4dce-4462-b48d-9bc580ca6641/conda-forge/noarch/font-ttf-inconsolata-3.000-h77eed37_0.tar.bz2 https://conda.anaconda.org/t/no-15126f85-4dce-4462-b48d-9bc580ca6641/conda-forge/noarch/font-ttf-source-code-pro-2.038-h77eed37_0.tar.bz2 @@ -49,7 +48,7 @@ https://conda.anaconda.org/conda-forge/linux-64/libxcrypt-4.4.36-hd590300_1.cond https://conda.anaconda.org/conda-forge/linux-64/libzlib-1.2.13-hd590300_5.conda https://conda.anaconda.org/conda-forge/linux-64/ncurses-6.4-h59595ed_2.conda https://conda.anaconda.org/conda-forge/linux-64/oniguruma-6.9.9-hd590300_0.conda -https://conda.anaconda.org/conda-forge/linux-64/openssl-3.2.1-hd590300_0.conda +https://conda.anaconda.org/conda-forge/linux-64/openssl-3.3.1-h4ab18f5_0.conda https://conda.anaconda.org/conda-forge/linux-64/pixman-0.43.2-h59595ed_0.conda https://conda.anaconda.org/t/no-15126f85-4dce-4462-b48d-9bc580ca6641/conda-forge/linux-64/pthread-stubs-0.4-h36c2ea0_1001.tar.bz2 https://conda.anaconda.org/t/no-15126f85-4dce-4462-b48d-9bc580ca6641/bioconda/linux-64/wfa2-lib-2.3.4-h4ac6f70_0.tar.bz2 @@ -115,7 +114,7 @@ https://conda.anaconda.org/conda-forge/noarch/bitstring-3.1.9-pyhd8ed1ab_0.tar.b https://conda.anaconda.org/conda-forge/linux-64/brotli-1.1.0-hd590300_1.conda https://conda.anaconda.org/conda-forge/linux-64/brotli-python-1.1.0-py310hc6cd4ac_1.conda https://conda.anaconda.org/conda-forge/noarch/cachetools-4.2.4-pyhd8ed1ab_0.tar.bz2 -https://conda.anaconda.org/conda-forge/noarch/certifi-2024.2.2-pyhd8ed1ab_0.conda +https://conda.anaconda.org/conda-forge/noarch/certifi-2024.6.2-pyhd8ed1ab_0.conda https://conda.anaconda.org/conda-forge/noarch/charset-normalizer-3.3.2-pyhd8ed1ab_0.conda https://conda.anaconda.org/conda-forge/noarch/colorama-0.4.6-pyhd8ed1ab_0.tar.bz2 https://conda.anaconda.org/conda-forge/noarch/cycler-0.12.1-pyhd8ed1ab_0.conda @@ -138,10 +137,8 @@ https://conda.anaconda.org/conda-forge/linux-64/openjpeg-2.5.0-h488ebb8_3.conda https://conda.anaconda.org/conda-forge/noarch/packaging-23.2-pyhd8ed1ab_0.conda https://conda.anaconda.org/conda-forge/noarch/pycparser-2.21-pyhd8ed1ab_0.tar.bz2 https://conda.anaconda.org/conda-forge/noarch/pygments-2.17.2-pyhd8ed1ab_0.conda -https://conda.anaconda.org/conda-forge/noarch/pyparsing-3.1.2-pyhd8ed1ab_0.conda +https://conda.anaconda.org/conda-forge/noarch/pyparsing-3.1.1-pyhd8ed1ab_0.conda https://conda.anaconda.org/conda-forge/noarch/pysocks-1.7.1-pyha2e5f31_6.tar.bz2 -https://conda.anaconda.org/conda-forge/noarch/python-tzdata-2023.4-pyhd8ed1ab_0.conda -https://conda.anaconda.org/conda-forge/noarch/pytz-2023.4-pyhd8ed1ab_0.conda https://conda.anaconda.org/conda-forge/noarch/setuptools-69.0.3-pyhd8ed1ab_0.conda https://conda.anaconda.org/t/no-15126f85-4dce-4462-b48d-9bc580ca6641/conda-forge/noarch/six-1.16.0-pyh6c4a22f_0.tar.bz2 https://conda.anaconda.org/conda-forge/noarch/tomli-2.0.1-pyhd8ed1ab_0.tar.bz2 @@ -155,7 +152,7 @@ https://conda.anaconda.org/t/no-15126f85-4dce-4462-b48d-9bc580ca6641/conda-forge https://conda.anaconda.org/t/no-15126f85-4dce-4462-b48d-9bc580ca6641/conda-forge/linux-64/cairo-1.18.0-h3faef2a_0.conda https://conda.anaconda.org/conda-forge/linux-64/cffi-1.16.0-py310h2fee648_0.conda https://conda.anaconda.org/conda-forge/noarch/deprecation-2.1.0-pyh9f0ad1d_0.tar.bz2 -https://conda.anaconda.org/conda-forge/linux-64/fonttools-4.49.0-py310h2372a71_0.conda +https://conda.anaconda.org/conda-forge/linux-64/fonttools-4.47.2-py310h2372a71_0.conda https://conda.anaconda.org/bioconda/linux-64/htslib-1.19.1-h81da01d_0.tar.bz2 https://conda.anaconda.org/bioconda/noarch/itol-config-0.1.0-pyhdfd78af_0.tar.bz2 https://conda.anaconda.org/conda-forge/noarch/jinja2-3.1.3-pyhd8ed1ab_0.conda @@ -172,7 +169,7 @@ https://conda.anaconda.org/conda-forge/noarch/python-docx-1.1.0-pyhd8ed1ab_0.con https://conda.anaconda.org/conda-forge/noarch/screed-1.1.3-pyhd8ed1ab_0.conda https://conda.anaconda.org/conda-forge/noarch/tqdm-4.66.1-pyhd8ed1ab_0.conda https://conda.anaconda.org/conda-forge/noarch/typing-extensions-4.9.0-hd8ed1ab_0.conda -https://conda.anaconda.org/conda-forge/noarch/urllib3-2.1.0-pyhd8ed1ab_0.conda +https://conda.anaconda.org/conda-forge/noarch/urllib3-2.2.0-pyhd8ed1ab_0.conda https://conda.anaconda.org/t/no-15126f85-4dce-4462-b48d-9bc580ca6641/conda-forge/linux-64/xorg-libxi-1.7.10-h7f98852_0.tar.bz2 https://conda.anaconda.org/conda-forge/noarch/annotated-types-0.6.0-pyhd8ed1ab_0.conda https://conda.anaconda.org/bioconda/linux-64/delly-1.2.6-hb7e2ac5_0.tar.bz2 @@ -191,17 +188,16 @@ https://conda.anaconda.org/conda-forge/linux-64/contourpy-1.2.0-py310hd41b1e2_0. https://conda.anaconda.org/t/no-15126f85-4dce-4462-b48d-9bc580ca6641/bioconda/linux-64/lofreq-2.1.5-py310h47ef89e_10.tar.bz2 https://conda.anaconda.org/bioconda/linux-64/mash-2.3-ha9a2dd8_3.tar.bz2 https://conda.anaconda.org/t/no-15126f85-4dce-4462-b48d-9bc580ca6641/conda-forge/linux-64/openjdk-17.0.8-hbf40d96_2.conda -https://conda.anaconda.org/conda-forge/linux-64/pandas-2.2.0-py310hcc13569_0.conda https://conda.anaconda.org/conda-forge/noarch/pydantic-2.6.0-pyhd8ed1ab_0.conda https://conda.anaconda.org/conda-forge/noarch/rich-argparse-1.4.0-pyhd8ed1ab_0.conda https://conda.anaconda.org/conda-forge/linux-64/scipy-1.12.0-py310hb13e2d6_2.conda https://conda.anaconda.org/t/no-15126f85-4dce-4462-b48d-9bc580ca6641/bioconda/linux-64/tabixpp-1.1.2-hd68fcf3_1.tar.bz2 https://conda.anaconda.org/t/no-15126f85-4dce-4462-b48d-9bc580ca6641/bioconda/noarch/gatk4-4.4.0.0-py36hdfd78af_0.tar.bz2 -https://conda.anaconda.org/conda-forge/linux-64/matplotlib-base-3.8.3-py310h62c0568_0.conda +https://conda.anaconda.org/conda-forge/linux-64/matplotlib-base-3.8.2-py310h62c0568_0.conda https://conda.anaconda.org/t/no-15126f85-4dce-4462-b48d-9bc580ca6641/bioconda/noarch/pilon-1.24-hdfd78af_0.tar.bz2 https://conda.anaconda.org/bioconda/noarch/snpeff-5.2-hdfd78af_0.tar.bz2 https://conda.anaconda.org/t/no-15126f85-4dce-4462-b48d-9bc580ca6641/bioconda/noarch/trimmomatic-0.39-hdfd78af_2.tar.bz2 https://conda.anaconda.org/bioconda/linux-64/vcflib-1.0.9-h146fbdb_4.tar.bz2 https://conda.anaconda.org/bioconda/linux-64/freebayes-1.3.6-hb0f3ef8_7.tar.bz2 -https://conda.anaconda.org/conda-forge/linux-64/sourmash-minimal-4.8.6-py310h373cc83_0.conda -https://conda.anaconda.org/bioconda/noarch/sourmash-4.8.6-hdfd78af_0.tar.bz2 \ No newline at end of file +https://conda.anaconda.org/conda-forge/linux-64/sourmash-minimal-4.8.5-py310h373cc83_0.conda +https://conda.anaconda.org/bioconda/noarch/sourmash-4.8.5-hdfd78af_0.tar.bz2 \ No newline at end of file diff --git a/conda/macos-latest.txt b/conda/macos-latest.txt index ef8b1e3..062b139 100644 --- a/conda/macos-latest.txt +++ b/conda/macos-latest.txt @@ -8,13 +8,14 @@ https://conda.anaconda.org/conda-forge/osx-64/tree-2.1.1-h0dc2134_0.conda https://conda.anaconda.org/conda-forge/osx-64/bc-1.07.1-h0d85af4_0.tar.bz2 https://conda.anaconda.org/conda-forge/osx-64/bzip2-1.0.8-h0d85af4_4.tar.bz2 https://conda.anaconda.org/conda-forge/osx-64/c-ares-1.19.1-h0dc2134_0.conda -https://conda.anaconda.org/conda-forge/osx-64/ca-certificates-2024.2.2-h8857fd0_0.conda +https://conda.anaconda.org/conda-forge/osx-64/ca-certificates-2024.6.2-h8857fd0_0.conda https://conda.anaconda.org/conda-forge/osx-64/isa-l-2.30.0-h0d85af4_4.tar.bz2 https://conda.anaconda.org/conda-forge/osx-64/jpeg-9e-hac89ed1_2.tar.bz2 https://conda.anaconda.org/conda-forge/osx-64/libbrotlicommon-1.0.9-hb7f2c08_9.conda https://conda.anaconda.org/conda-forge/osx-64/libcxx-16.0.6-hd57cbcb_0.conda https://conda.anaconda.org/conda-forge/osx-64/libdeflate-1.13-h775f41a_0.tar.bz2 https://conda.anaconda.org/conda-forge/osx-64/libev-4.33-haf1e3a3_1.tar.bz2 +https://conda.anaconda.org/conda-forge/osx-64/libexpat-2.6.2-h73e2aa4_0.conda https://conda.anaconda.org/conda-forge/osx-64/libffi-3.4.2-h0d85af4_5.tar.bz2 https://conda.anaconda.org/conda-forge/osx-64/libiconv-1.17-hac89ed1_0.tar.bz2 https://conda.anaconda.org/conda-forge/osx-64/libogg-1.3.4-h35c211d_1.tar.bz2 @@ -37,6 +38,7 @@ https://conda.anaconda.org/conda-forge/osx-64/xorg-libxdmcp-1.1.3-h35c211d_0.tar https://conda.anaconda.org/conda-forge/osx-64/xz-5.2.6-h775f41a_0.tar.bz2 https://conda.anaconda.org/conda-forge/osx-64/yaml-0.2.5-h0d85af4_2.tar.bz2 https://conda.anaconda.org/bioconda/osx-64/bedtools-2.31.0-h6372da2_2.tar.bz2 +https://conda.anaconda.org/conda-forge/osx-64/expat-2.6.2-h73e2aa4_0.conda https://conda.anaconda.org/conda-forge/osx-64/gettext-0.21.1-h8a4c099_0.tar.bz2 https://conda.anaconda.org/conda-forge/osx-64/htop-3.2.2-h908806b_0.conda https://conda.anaconda.org/conda-forge/osx-64/icu-70.1-h96cf925_0.tar.bz2 @@ -61,6 +63,7 @@ https://conda.anaconda.org/conda-forge/osx-64/openssl-1.1.1w-h8a1eda9_0.conda https://conda.anaconda.org/conda-forge/osx-64/pandoc-3.1.3-h9d075a6_0.conda https://conda.anaconda.org/conda-forge/osx-64/parallel-20230522-h694c41f_0.conda https://conda.anaconda.org/conda-forge/osx-64/pcre2-10.40-h1c4e4bc_0.tar.bz2 +https://conda.anaconda.org/conda-forge/osx-64/pigz-2.8-h92b6c6a_0.conda https://conda.anaconda.org/conda-forge/osx-64/readline-8.2-h9e318b2_1.conda https://conda.anaconda.org/bioconda/noarch/samclip-0.4.0-hdfd78af_1.tar.bz2 https://conda.anaconda.org/conda-forge/osx-64/tk-8.6.12-h5dbffcc_0.tar.bz2 @@ -97,7 +100,7 @@ https://conda.anaconda.org/conda-forge/noarch/bitstring-3.1.9-pyhd8ed1ab_0.tar.b https://conda.anaconda.org/conda-forge/osx-64/brotli-1.0.9-hb7f2c08_9.conda https://conda.anaconda.org/conda-forge/osx-64/brotli-python-1.0.9-py310h7a76584_9.conda https://conda.anaconda.org/conda-forge/noarch/cachetools-4.2.4-pyhd8ed1ab_0.tar.bz2 -https://conda.anaconda.org/conda-forge/noarch/certifi-2024.2.2-pyhd8ed1ab_0.conda +https://conda.anaconda.org/conda-forge/noarch/certifi-2024.6.2-pyhd8ed1ab_0.conda https://conda.anaconda.org/conda-forge/noarch/charset-normalizer-3.2.0-pyhd8ed1ab_0.conda https://conda.anaconda.org/conda-forge/noarch/colorama-0.4.6-pyhd8ed1ab_0.tar.bz2 https://conda.anaconda.org/conda-forge/noarch/cycler-0.11.0-pyhd8ed1ab_0.tar.bz2 @@ -224,6 +227,7 @@ https://conda.anaconda.org/bioconda/osx-64/usher-0.6.2-hdcdf62d_0.tar.bz2 https://conda.anaconda.org/conda-forge/noarch/annotated-types-0.6.0-pyhd8ed1ab_0.conda https://conda.anaconda.org/conda-forge/osx-64/argon2-cffi-bindings-21.2.0-py310h90acd4f_3.tar.bz2 https://conda.anaconda.org/bioconda/osx-64/delly-1.1.6-h69f7742_0.tar.bz2 +https://conda.anaconda.org/conda-forge/osx-64/git-2.39.1-pl5321h00ebd2c_0.conda https://conda.anaconda.org/conda-forge/osx-64/gstreamer-1.22.4-h840fbdc_1.conda https://conda.anaconda.org/conda-forge/noarch/importlib_metadata-6.8.0-hd8ed1ab_0.conda https://conda.anaconda.org/conda-forge/noarch/jsonschema-specifications-2023.6.1-pyhd8ed1ab_0.conda diff --git a/pathogenprofiler/__init__.py b/pathogenprofiler/__init__.py index 8553dbd..1dfacb7 100644 --- a/pathogenprofiler/__init__.py +++ b/pathogenprofiler/__init__.py @@ -10,4 +10,4 @@ from .cli import * from .rules import * from .models import * -__version__ = "4.2.0" \ No newline at end of file +__version__ = "4.3.0" \ No newline at end of file diff --git a/pathogenprofiler/bam.py b/pathogenprofiler/bam.py index e1a9047..46972d1 100644 --- a/pathogenprofiler/bam.py +++ b/pathogenprofiler/bam.py @@ -37,12 +37,7 @@ def __init__( filecheck(self.bam_file) index_bam(bam_file,threads=threads) self.filetype = "cram" if bam_file[-5:]==".cram" else "bam" - for l in cmd_out("samtools view -H %s" % (bam_file)): - if l[:3]=="@RG": - row = l.strip().split("\t") - for r in row: - if r.startswith("SM:"): - self.bam_sample_name = r.replace("SM:","") + def calculate_bed_depth(self,bed_file: str) -> List[GenomePositionDepth]: """ Calculate depth of BAM file in regions specified by a BED file @@ -67,12 +62,13 @@ def calculate_bed_depth(self,bed_file: str) -> List[GenomePositionDepth]: for l in cmd_out("samtools view -Mb -L %(bed_file)s %(bam_file)s | samtools depth - " % vars(self)): row = l.strip().split() p = GenomePosition(chrom=row[0],pos=int(row[1])) - position_depth[p] = int(row[2]) + if p in position_depth: + position_depth[p] = int(row[2]) self.position_depth = [GenomePositionDepth(chrom=p.chrom,pos=p.pos,depth=v) for p,v in position_depth.items()] return self.position_depth - def run_delly(self,bed_file: str) -> Vcf: + def run_delly(self,ref_file: str,bed_file: str) -> Vcf: """ Method to run delly and extract variants overlapping with a BED file @@ -87,6 +83,7 @@ def run_delly(self,bed_file: str) -> Vcf: """ logging.info("Running delly") self.bed_file = bed_file + self.ref_file = ref_file if self.platform=="illumina": run_cmd("delly call -t DEL -g %(ref_file)s %(bam_file)s -o %(prefix)s.delly.bcf" % vars(self)) else: @@ -97,8 +94,37 @@ def run_delly(self,bed_file: str) -> Vcf: run_cmd("bcftools view -R %(bed_file)s %(prefix)s.delly.vcf.gz -Oz -o %(prefix)s.delly.targets.vcf.gz" % vars(self)) return Vcf("%(prefix)s.delly.targets.vcf.gz" % vars(self)) - def call_variants( + self, + ref_file: str, + caller: str, + filters: dict, + bed_file: Optional[str] = None, + threads: int = 1, + calling_params: Optional[str] = None, + samclip: bool = False, + cli_args: dict = {} + ) -> Vcf: + from .variant_calling import VariantCaller + subclasses = {cls.__software__:cls for cls in VariantCaller.__subclasses__()} + chosen_class = subclasses[caller] + caller = chosen_class( + ref_file=ref_file, + bam_file=self.bam_file, + prefix=self.prefix, + bed_file=bed_file, + threads=threads, + samclip=samclip, + platform=self.platform, + calling_params=calling_params, + filters=filters, + cli_args=cli_args + ) + return caller.call_variants() + + + + def __call_variants__deprecated( self, ref_file: str, caller: str, @@ -125,23 +151,33 @@ def call_variants( self.vcf_file = "%s.short_variants.vcf.gz" % (self.prefix) if bed_file else "%s.vcf.gz" % (self.prefix) genome_chunks = [r.safe for r in get_genome_chunks(self.ref_file,self.threads)] - self.samclip_cmd = "| samclip --ref %(ref_file)s" % vars(self) if samclip else "" + if self.platform in ("illumina"): + self.samclip_cmd = "| samclip --ref %(ref_file)s" % vars(self) if samclip else "" + else: + self.samclip_cmd = "" + # Run through different options. + + # Nanopore if self.platform == "nanopore" and self.caller=="bcftools": self.calling_params = calling_params if calling_params else "" - self.calling_cmd = "bcftools mpileup -f %(ref_file)s %(calling_params)s -a DP,AD,ADF,ADR -r {region} %(bam_file)s | bcftools call -mv | annotate_maaf.py | bcftools +fill-tags | bcftools view -c 1 | bcftools norm -f %(ref_file)s | bcftools filter -e 'IMF < 0.7' -S 0 -Oz -o %(temp_file_prefix)s.{region_safe}.vcf.gz" % vars(self) + self.calling_cmd = "bcftools mpileup -f %(ref_file)s %(calling_params)s -a DP,AD,ADF,ADR -r {region} %(bam_file)s | bcftools call -mv | annotate_maaf.py | bcftools +fill-tags | bcftools norm -f %(ref_file)s | bcftools filter -e 'IMF < 0.7' -S 0 -Oz -o %(temp_file_prefix)s.{region_safe}.vcf.gz" % vars(self) elif self.platform == "nanopore" and self.caller=="freebayes": self.calling_params = calling_params if calling_params else "" - self.calling_cmd = "freebayes -f %(ref_file)s -F %(af_hard)s -r {region} --haplotype-length -1 %(calling_params)s %(bam_file)s | annotate_maaf.py | bcftools +fill-tags | bcftools view -c 1 | bcftools norm -f %(ref_file)s -Oz -o %(temp_file_prefix)s.{region_safe}.vcf.gz" % vars(self) + self.calling_cmd = "freebayes -f %(ref_file)s -F %(af_hard)s -r {region} --haplotype-length -1 %(calling_params)s %(bam_file)s | annotate_maaf.py | bcftools norm -f %(ref_file)s -Oz -o %(temp_file_prefix)s.{region_safe}.vcf.gz" % vars(self) elif self.platform=="nanopore" and self.caller == "pilon": self.calling_params = calling_params if calling_params else "" - self.calling_cmd = """pilon --genome %(ref_file)s --targets {region} %(calling_params)s --nanopore %(bam_file)s --variant --output %(temp_file_prefix)s.{region_safe} && bcftools view -i 'AF>0' -c 1 %(temp_file_prefix)s.{region_safe}.vcf | fix_pilon_headers.py --sample %(bam_sample_name)s| add_dummy_AD.py | bcftools norm -f %(ref_file)s -Oz -o %(temp_file_prefix)s.{region_safe}.vcf.gz""" % vars(self) + self.calling_cmd = """pilon --genome %(ref_file)s --targets {region} %(calling_params)s --nanopore %(bam_file)s --variant --output %(temp_file_prefix)s.{region_safe} && bcftools view -i 'AF>0' %(temp_file_prefix)s.{region_safe}.vcf | fix_pilon_headers.py --sample %(bam_sample_name)s| add_dummy_AD.py | bcftools norm -f %(ref_file)s -Oz -o %(temp_file_prefix)s.{region_safe}.vcf.gz""" % vars(self) + + # Pacbio elif self.platform == "pacbio" and self.caller=="freebayes": self.calling_params = calling_params if calling_params else "" - self.calling_cmd = "freebayes -f %(ref_file)s -r {region} --haplotype-length -1 %(calling_params)s %(bam_file)s | annotate_maaf.py | bcftools +fill-tags | bcftools view -c 1 | bcftools norm -f %(ref_file)s -Oz -o %(temp_file_prefix)s.{region_safe}.vcf.gz" % vars(self) + self.calling_cmd = "freebayes -f %(ref_file)s -r {region} --haplotype-length -1 %(calling_params)s %(bam_file)s | annotate_maaf.py | bcftools +fill-tags | bcftools norm -f %(ref_file)s -Oz -o %(temp_file_prefix)s.{region_safe}.vcf.gz" % vars(self) elif self.platform=="pacbio" and self.caller == "pilon": self.calling_params = calling_params if calling_params else "" - self.calling_cmd = """pilon --genome %(ref_file)s --targets {region} %(calling_params)s --pacbio %(bam_file)s --variant --output %(temp_file_prefix)s.{region_safe} && bcftools view -i 'AF>0' -c 1 %(temp_file_prefix)s.{region_safe}.vcf | fix_pilon_headers.py --sample %(bam_sample_name)s| add_dummy_AD.py | bcftools norm -f %(ref_file)s -Oz -o %(temp_file_prefix)s.{region_safe}.vcf.gz""" % vars(self) + self.calling_cmd = """pilon --genome %(ref_file)s --targets {region} %(calling_params)s --pacbio %(bam_file)s --variant --output %(temp_file_prefix)s.{region_safe} && bcftools view -i 'AF>0' %(temp_file_prefix)s.{region_safe}.vcf | fix_pilon_headers.py --sample %(bam_sample_name)s| add_dummy_AD.py | bcftools norm -f %(ref_file)s -Oz -o %(temp_file_prefix)s.{region_safe}.vcf.gz""" % vars(self) + + # Illumina elif self.platform=="illumina" and self.caller == "bcftools": self.calling_params = calling_params if calling_params else "" self.calling_cmd = "samtools view -T %(ref_file)s -h %(bam_file)s {region} %(samclip_cmd)s | samtools view -b > %(temp_file_prefix)s.{region_safe}.tmp.bam && samtools index %(temp_file_prefix)s.{region_safe}.tmp.bam && bcftools mpileup -f %(ref_file)s %(calling_params)s -a DP,AD,ADF,ADR -r {region} %(temp_file_prefix)s.{region_safe}.tmp.bam | bcftools call -mv | bcftools norm -f %(ref_file)s -Oz -o %(temp_file_prefix)s.{region_safe}.vcf.gz" % vars(self) @@ -150,13 +186,13 @@ def call_variants( self.calling_cmd = "samtools view -T %(ref_file)s -h %(bam_file)s {region} %(samclip_cmd)s | samtools view -b > %(temp_file_prefix)s.{region_safe}.tmp.bam && samtools index %(temp_file_prefix)s.{region_safe}.tmp.bam && gatk HaplotypeCaller -R %(ref_file)s -I %(temp_file_prefix)s.{region_safe}.tmp.bam -O /dev/stdout -L {region} %(calling_params)s -OVI false | bcftools norm -f %(ref_file)s -Oz -o %(temp_file_prefix)s.{region_safe}.vcf.gz" % vars(self) elif self.platform=="illumina" and self.caller == "freebayes": self.calling_params = calling_params if calling_params else "" - self.calling_cmd = "samtools view -T %(ref_file)s -h %(bam_file)s {region} %(samclip_cmd)s | samtools view -b > %(temp_file_prefix)s.{region_safe}.tmp.bam && samtools index %(temp_file_prefix)s.{region_safe}.tmp.bam && freebayes -f %(ref_file)s -r {region} --haplotype-length -1 %(calling_params)s %(temp_file_prefix)s.{region_safe}.tmp.bam | bcftools view -c 1 | bcftools norm -f %(ref_file)s -Oz -o %(temp_file_prefix)s.{region_safe}.vcf.gz" % vars(self) + self.calling_cmd = "samtools view -T %(ref_file)s -h %(bam_file)s {region} %(samclip_cmd)s | samtools view -b > %(temp_file_prefix)s.{region_safe}.tmp.bam && samtools index %(temp_file_prefix)s.{region_safe}.tmp.bam && freebayes -f %(ref_file)s -r {region} --haplotype-length -1 %(calling_params)s %(temp_file_prefix)s.{region_safe}.tmp.bam | bcftools norm -f %(ref_file)s -Oz -o %(temp_file_prefix)s.{region_safe}.vcf.gz" % vars(self) elif self.platform=="illumina" and self.caller == "pilon": self.calling_params = calling_params if calling_params else "" - self.calling_cmd = "samtools view -T %(ref_file)s -f 0x1 -h %(bam_file)s {region} %(samclip_cmd)s | samtools view -b > %(temp_file_prefix)s.{region_safe}.tmp.bam && samtools index %(temp_file_prefix)s.{region_safe}.tmp.bam && pilon --genome %(ref_file)s --targets {region} --diploid %(calling_params)s --frags %(temp_file_prefix)s.{region_safe}.tmp.bam --variant --output %(temp_file_prefix)s.{region_safe} && bcftools view -e " % vars(self) + r"""'ALT="."'""" + " -c 1 %(temp_file_prefix)s.{region_safe}.vcf | fix_pilon_headers.py --sample %(bam_sample_name)s| add_dummy_AD.py | bcftools norm -f %(ref_file)s -Oz -o %(temp_file_prefix)s.{region_safe}.vcf.gz" % vars(self) + self.calling_cmd = "samtools view -T %(ref_file)s -f 0x1 -h %(bam_file)s {region} %(samclip_cmd)s | samtools view -b > %(temp_file_prefix)s.{region_safe}.tmp.bam && samtools index %(temp_file_prefix)s.{region_safe}.tmp.bam && pilon --genome %(ref_file)s --targets {region} --diploid %(calling_params)s --frags %(temp_file_prefix)s.{region_safe}.tmp.bam --variant --output %(temp_file_prefix)s.{region_safe} && bcftools view -e " % vars(self) + r"""'ALT="."'""" + " %(temp_file_prefix)s.{region_safe}.vcf | fix_pilon_headers.py --sample %(bam_sample_name)s| add_dummy_AD.py | bcftools norm -f %(ref_file)s -Oz -o %(temp_file_prefix)s.{region_safe}.vcf.gz" % vars(self) elif self.platform=="illumina" and self.caller == "lofreq": self.calling_params = calling_params if calling_params else "" - self.calling_cmd = "samtools view -T %(ref_file)s -h %(bam_file)s {region} %(samclip_cmd)s | samtools view -b > %(temp_file_prefix)s.{region_safe}.tmp.bam && samtools index %(temp_file_prefix)s.{region_safe}.tmp.bam && lofreq call --call-indels -f %(ref_file)s -r {region} %(calling_params)s %(temp_file_prefix)s.{region_safe}.tmp.bam | modify_lofreq_vcf.py --sample %(bam_sample_name)s | add_dummy_AD.py --ref %(ref_file)s --add-dp | bcftools view -c 1 | bcftools norm -f %(ref_file)s -Oz -o %(temp_file_prefix)s.{region_safe}.vcf.gz" % vars(self) + self.calling_cmd = "samtools view -T %(ref_file)s -h %(bam_file)s {region} %(samclip_cmd)s | samtools view -b > %(temp_file_prefix)s.{region_safe}.tmp.bam && samtools index %(temp_file_prefix)s.{region_safe}.tmp.bam && lofreq call --call-indels -f %(ref_file)s -r {region} %(calling_params)s %(temp_file_prefix)s.{region_safe}.tmp.bam | modify_lofreq_vcf.py --sample %(bam_sample_name)s | add_dummy_AD.py --ref %(ref_file)s --add-dp | bcftools norm -f %(ref_file)s -Oz -o %(temp_file_prefix)s.{region_safe}.vcf.gz" % vars(self) else: logging.debug("Unknown combination %(platform)s + %(caller)s" % vars(self)) logging.info("Running variant calling") diff --git a/pathogenprofiler/cli.py b/pathogenprofiler/cli.py index 8b1599f..fe5f3b6 100644 --- a/pathogenprofiler/cli.py +++ b/pathogenprofiler/cli.py @@ -10,6 +10,7 @@ from .models import Variant, DrVariant, Gene, DrGene, SpeciesPrediction, Species, BarcodeResult from .mutation_db import MutationDB from .vcf import Vcf +from .sanity import check_bam_for_rg, check_vcf_chrom_match def get_variant_filters(args): filters = {} @@ -51,11 +52,12 @@ def run_bam_qc(args: argparse.Namespace): def run_fasta_qc(args: argparse.Namespace): fasta = Fasta(args.fasta) - paf = Paf(args.paf) qc = fasta.get_fasta_qc() - qc.target_qc = paf.get_target_qc( - bed_file=args.conf["bed"] - ) + if hasattr(args,'paf') and args.paf: + paf = Paf(args.paf) + qc.target_qc = paf.get_target_qc( + bed_file=args.conf["bed"] + ) return qc def run_vcf_qc(args: argparse.Namespace): @@ -66,7 +68,8 @@ def run_vcf_qc(args: argparse.Namespace): def get_resistance_db_from_species_prediction(args: argparse.Namespace,species_prediction:SpeciesPrediction): logging.debug("Attempting to load db with species prediction") - if len(species_prediction.species)==1: + number_of_species = len(set([s.species for s in species_prediction.species])) + if number_of_species==1: return get_db(args.software_name,species_prediction.species[0].species.replace(" ","_")) else: return None @@ -141,17 +144,17 @@ def get_vcf_from_bam(args: argparse.Namespace): ### Create bam object and call variants ### bam = Bam(args.bam, args.files_prefix, platform=args.platform, threads=args.threads) if args.call_whole_genome: - wg_vcf_obj = bam.call_variants(conf["ref"], caller=args.caller, filters = conf['variant_filters'], threads=args.threads, calling_params=args.calling_params, samclip = args.samclip) + wg_vcf_obj = bam.call_variants(conf["ref"], caller=args.caller, filters = conf['variant_filters'], threads=args.threads, calling_params=args.calling_params, samclip = args.samclip, cli_args=vars(args)) vcf_obj = wg_vcf_obj # TODO optional? # vcf_obj = wg_vcf_obj.view_regions(conf["bed"]) else: - vcf_obj = bam.call_variants(conf["ref"], caller=args.caller, filters = conf['variant_filters'], bed_file=conf["bed"], threads=args.threads, calling_params=args.calling_params, samclip = args.samclip) + vcf_obj = bam.call_variants(conf["ref"], caller=args.caller, filters = conf['variant_filters'], bed_file=conf["bed"], threads=args.threads, calling_params=args.calling_params, samclip = args.samclip, cli_args=vars(args)) ### Run delly if specified ### final_target_vcf_file = args.files_prefix+".targets.vcf.gz" if not args.no_delly: - delly_vcf_obj = bam.run_delly(conf['bed']) + delly_vcf_obj = bam.run_delly(conf['ref'],conf['bed']) if delly_vcf_obj is not None: run_cmd("bcftools index %s" % delly_vcf_obj.filename) run_cmd("bcftools concat %s %s | bcftools sort -Oz -o %s" % (vcf_obj.filename,delly_vcf_obj.filename,final_target_vcf_file)) @@ -200,7 +203,7 @@ def run_profiler(args) -> List[Union[Variant,DrVariant,Gene,DrGene]]: tmp_vcf_file = f"{args.files_prefix}.tmp.vcf.gz" run_cmd(f"bcftools view {args.vcf} | modify_lofreq_vcf.py --sample {args.prefix} | bcftools view -Oz -o {tmp_vcf_file}") args.vcf = tmp_vcf_file - + # check_vcf_chrom_match(args.vcf,args.conf["ref"]) annotated_variants = vcf_profiler(args) return annotated_variants @@ -258,6 +261,7 @@ def get_bam_file(args): ) bam_file = bam_obj.bam_file else: + check_bam_for_rg(args.bam) bam_file = args.bam return bam_file @@ -296,7 +300,7 @@ def get_sourmash_hit(args): fastq = Fastq(fq_file) sourmash_sig = fastq.sourmash_sketch(args.files_prefix) - sourmash_sig = sourmash_sig.gather(args.species_conf["sourmash_db"],args.species_conf["sourmash_db_info"],intersect_bp=2500000,f_match_threshold=0.1) + sourmash_sig = sourmash_sig.gather(args.species_conf["sourmash_db"],args.species_conf["sourmash_db_info"],intersect_bp=500000,f_match_threshold=0.1) result = [] if len(sourmash_sig)>0: diff --git a/pathogenprofiler/db.py b/pathogenprofiler/db.py index a916ee4..9b327cd 100644 --- a/pathogenprofiler/db.py +++ b/pathogenprofiler/db.py @@ -180,7 +180,7 @@ def so_term_in_mutation(mutation: str) -> bool: def get_snpeff_formated_mutation_list(hgvs_variants,ref,gff,snpEffDB): logging.debug("Converting HGVS to snpEff format") - genes = load_gff(gff,aslist=True) + genes = load_gff(gff) refseq = FastaFile(ref) converted_mutations = {} so_term_rows = [r for r in hgvs_variants if so_term_in_mutation(r['Mutation'])] @@ -421,8 +421,9 @@ def create_db(args,extra_files = None): O.write("\t".join(row)+"\n") genes = load_gff(gff_file) - gene_name2gene_id = {g.name:g.gene_id for g in genes.values()} - gene_name2gene_id.update({g.gene_id:g.gene_id for g in genes.values()}) + gene_name2gene_id = {g.name:g.gene_id for g in genes} + gene_name2gene_id.update({g.gene_id:g.gene_id for g in genes}) + gene_dict = {g.gene_id:g for g in genes} db = {} locus_tag_to_ann_dict = defaultdict(set) with open(args.prefix+".conversion.log","w") as L: @@ -449,8 +450,8 @@ def create_db(args,extra_files = None): tmp_annotation = annotation_info db[locus_tag][mut]["annotations"].append(tmp_annotation) - db[locus_tag][mut]["genome_positions"] = get_genome_position(genes[locus_tag],mut) if mut not in supported_so_terms else None - db[locus_tag][mut]["chromosome"] = genes[locus_tag].chrom + db[locus_tag][mut]["genome_positions"] = get_genome_position(gene_dict[locus_tag],mut) if mut not in supported_so_terms else None + db[locus_tag][mut]["chromosome"] = gene_dict[locus_tag].chrom @@ -501,13 +502,13 @@ def create_db(args,extra_files = None): O.write("\t".join(row)+"\n") if "amplicon_primers" in vars(args) and args.amplicon_primers: - write_amplicon_bed(genome_file,genes,db,args.amplicon_primers,bed_file) + write_amplicon_bed(genome_file,gene_dict,db,args.amplicon_primers,bed_file) variables['amplicon'] = True else: write_bed( db=db, gene_dict=locus_tag_to_ann_dict, - gene_info=genes, + gene_info=gene_dict, ref_file=genome_file, outfile=bed_file ) diff --git a/pathogenprofiler/gff.py b/pathogenprofiler/gff.py index 5e46ec9..258a273 100644 --- a/pathogenprofiler/gff.py +++ b/pathogenprofiler/gff.py @@ -1,6 +1,8 @@ import re from uuid import uuid4 from collections import defaultdict +from typing import List +from .models import GenomeRange, GenomePosition class Gene: def __init__(self,name,gene_id,strand,chrom,start,end,length): @@ -16,6 +18,11 @@ def __init__(self,name,gene_id,strand,chrom,start,end,length): self.transcripts = [] def __repr__(self) -> str: return f"Gene: {vars(self)}" + def __contains__(self,position: tuple[str,int]) -> bool: + chrom,pos = position + pos = GenomePosition(chrom=chrom,pos=pos) + range = GenomeRange(chrom=self.chrom,start=self.feature_start,end=self.feature_end) + return pos in range class Exon: def __init__(self, chrom, start, end, strand, phase): @@ -32,7 +39,7 @@ def __init__(self,name): self.name = name self.exons = [] -def load_gff(gff,aslist=False): +def load_gff(gff) -> List[Gene]: GFF = open(gff) genes = {} relationships = {} @@ -105,7 +112,5 @@ def load_gff(gff,aslist=False): if isinstance(items[item],Gene): genes[items[item].gene_id] = items[item] - if aslist: - return list(genes.values()) - else: - return genes \ No newline at end of file + + return list(genes.values()) \ No newline at end of file diff --git a/pathogenprofiler/profiler.py b/pathogenprofiler/profiler.py index 97a1a28..a4ef444 100644 --- a/pathogenprofiler/profiler.py +++ b/pathogenprofiler/profiler.py @@ -8,18 +8,23 @@ from .models import Variant from typing import List, Union import argparse +import logging def bam_barcoder(args: argparse.Namespace) -> List[BarcodeResult]: conf = args.conf + if 'barcode' not in conf: + return [] bam = Bam(args.bam, args.files_prefix, platform=args.platform, threads=args.threads) - barcode_mutations = bam.get_bed_gt(conf["barcode"],conf["ref"], caller=args.caller,platform=args.platform) if not hasattr(args,'barcode_snps'): args.barcode_snps = None + barcode_mutations = bam.get_bed_gt(conf["barcode"],conf["ref"], caller=args.caller,platform=args.platform) barcode_assignment = barcode(barcode_mutations,conf["barcode"],args.barcode_snps) return barcode_assignment def vcf_barcoder(args: argparse.Namespace) -> List[BarcodeResult]: conf = args.conf + if 'barcode' not in conf: + return [] vcf = Vcf(args.vcf) barcode_mutations = vcf.get_bed_gt(conf["barcode"],conf["ref"]) if not hasattr(args,'barcode_snps'): diff --git a/pathogenprofiler/sanity.py b/pathogenprofiler/sanity.py new file mode 100644 index 0000000..d8a495a --- /dev/null +++ b/pathogenprofiler/sanity.py @@ -0,0 +1,19 @@ +import pysam + +def check_bam_for_rg(bam_file) -> None: + """ + Check if the BAM file has a read group. + """ + bam = pysam.AlignmentFile(bam_file, "rb") + if 'RG' not in bam.header: + raise Exception(f"No read group found in {bam_file}. Please create create your bam with a read group or add using `samtools addreplacerg`.") + +def check_vcf_chrom_match(vcf_file, ref_file) -> None: + """ + Check if the chromosomes in the VCF file match the reference file. + """ + vcf = pysam.VariantFile(vcf_file) + ref = pysam.FastaFile(ref_file) + for chrom in vcf.header.contigs: + if chrom not in ref.references: + raise Exception(f"Chromosome {chrom} in VCF file {vcf_file} not found in reference file {ref_file}.") diff --git a/pathogenprofiler/utils.py b/pathogenprofiler/utils.py index a3e1ab4..b91c08e 100644 --- a/pathogenprofiler/utils.py +++ b/pathogenprofiler/utils.py @@ -33,6 +33,18 @@ def __exit__(self, exc_type, exc_value, traceback): for f in glob(self.prefix+"*"): os.remove(f) +class TempFolder(object): + """Create a temporary file prefix""" + def __init__(self): + self.prefix = str(uuid4()) + def __enter__(self): + os.mkdir(self.prefix) + return self.prefix + def __exit__(self, exc_type, exc_value, traceback): + """Remove temporary files""" + logging.debug("Removing temporary folder: %s" % self.prefix) + os.rmdir(self.prefix) + def get_tmp_file(prefix=None): """Get a temporary file""" if prefix: diff --git a/pathogenprofiler/variant_calling.py b/pathogenprofiler/variant_calling.py new file mode 100644 index 0000000..0b39629 --- /dev/null +++ b/pathogenprofiler/variant_calling.py @@ -0,0 +1,132 @@ +import logging +from abc import ABC, abstractmethod +from .utils import cmd_out, run_cmd, run_cmd_parallel_on_genome, load_bed_regions, get_genome_chunks, TempFilePrefix, shared_dict +from uuid import uuid4 +from glob import glob +import os +from .vcf import Vcf +from typing import Optional + + +class VariantCaller: + def __init__( + self, + ref_file: str, + bam_file: str, + prefix: str, + bed_file: str = None, + threads: int = 1, + samclip: bool = False, + platform: str = "illumina", + calling_params: Optional[str] = None, + filters: dict = {}, + cli_args: dict = {} + ): + self.temp_file_prefix = str(uuid4()) + self.ref_file = ref_file + self.bam_file = bam_file + self.prefix = prefix + self.bed_file = bed_file + self.threads = threads + self.samclip = samclip + self.platform = platform + self.calling_params = calling_params if calling_params else "" + self.af_hard = filters['af_hard'] if 'af_hard' in filters else 0 + + # set the samclip command based on the platform + if self.platform in ("illumina"): + self.samclip_cmd = "| samclip --ref %(ref_file)s" % vars(self) if self.samclip else "" + else: + self.samclip_cmd = "" + + # get the sample name from the bam file + for l in cmd_out("samtools view -H %s" % (bam_file)): + if l[:3]=="@RG": + row = l.strip().split("\t") + for r in row: + if r.startswith("SM:"): + self.bam_sample_name = r.replace("SM:","") + + # for each key in cli_args, set the value of the key as an attribute of the class + for k,v in cli_args.items(): + if not hasattr(self,k): + setattr(self,k,v) + + @abstractmethod + def call_variants(self): + pass + + def run_calling(self, cmd): + if self.bed_file: + self.vcf_file = "%s.short_variants.targets.vcf.gz" % (self.prefix) + genome_chunks = [r.safe for r in load_bed_regions(self.bed_file)] + else: + self.vcf_file = "%s.vcf.gz" % (self.prefix) + genome_chunks = [r.safe for r in get_genome_chunks(self.ref_file,self.threads)] + + + run_cmd_parallel_on_genome(self.calling_cmd,self.ref_file,bed_file = self.bed_file,threads=self.threads,desc="Calling variants") + cmd = "bcftools index %(temp_file_prefix)s.{region_safe}.vcf.gz" % vars(self) + run_cmd_parallel_on_genome(cmd,self.ref_file,bed_file = self.bed_file,threads=self.threads,desc="Indexing variants") + temp_vcf_files = ' '.join([f"{self.temp_file_prefix}.{r}.vcf.gz" for r in genome_chunks]) + run_cmd(f"bcftools concat -aD {temp_vcf_files} | bcftools view -Oz -o {self.vcf_file}") + for f in glob(self.temp_file_prefix+"*"): + os.remove(f) + + return Vcf(self.vcf_file) + +class BcftoolsCaller(VariantCaller): + __software__ = "bcftools" + def call_variants(self) -> Vcf: + if self.platform=="illumina": + self.calling_cmd = "samtools view -T %(ref_file)s -h %(bam_file)s {region} %(samclip_cmd)s | samtools view -b > %(temp_file_prefix)s.{region_safe}.tmp.bam && samtools index %(temp_file_prefix)s.{region_safe}.tmp.bam && bcftools mpileup -f %(ref_file)s %(calling_params)s -a DP,AD,ADF,ADR -r {region} %(temp_file_prefix)s.{region_safe}.tmp.bam | bcftools call -mv | bcftools norm -f %(ref_file)s -Oz -o %(temp_file_prefix)s.{region_safe}.vcf.gz" % vars(self) + elif self.platform=="nanopore": + self.calling_cmd = "bcftools mpileup -f %(ref_file)s %(calling_params)s -a DP,AD,ADF,ADR -r {region} %(bam_file)s | bcftools call -mv | annotate_maaf.py | bcftools +fill-tags | bcftools norm -f %(ref_file)s | bcftools filter -e 'IMF < 0.7' -S 0 -Oz -o %(temp_file_prefix)s.{region_safe}.vcf.gz" % vars(self) + else: + raise NotImplementedError("%s not implemented for %s platform" % (self.__software__,self.platform)) + return self.run_calling(self.calling_cmd) + +class FreebayesCaller(VariantCaller): + __software__ = "freebayes" + def call_variants(self) -> Vcf: + # Call variants using Freebayes + if self.platform=="illumina": + self.calling_cmd = "samtools view -T %(ref_file)s -h %(bam_file)s {region} %(samclip_cmd)s | samtools view -b > %(temp_file_prefix)s.{region_safe}.tmp.bam && samtools index %(temp_file_prefix)s.{region_safe}.tmp.bam && freebayes -f %(ref_file)s -r {region} --haplotype-length -1 %(calling_params)s %(temp_file_prefix)s.{region_safe}.tmp.bam | bcftools norm -f %(ref_file)s -Oz -o %(temp_file_prefix)s.{region_safe}.vcf.gz" % vars(self) + elif self.platform=="nanopore": + self.calling_cmd = "freebayes -f %(ref_file)s -F %(af_hard)s -r {region} --haplotype-length -1 %(calling_params)s %(bam_file)s | annotate_maaf.py | bcftools norm -f %(ref_file)s -Oz -o %(temp_file_prefix)s.{region_safe}.vcf.gz" % vars(self) + else: + raise NotImplementedError("%s not implemented for %s platform" % (self.__software__,self.platform)) + return self.run_calling(self.calling_cmd) + +class GatkCaller(VariantCaller): + __software__ = "gatk" + def call_variants(self) -> Vcf: + # Call variants using GATK + if self.platform=="illumina": + self.calling_cmd = "samtools view -T %(ref_file)s -h %(bam_file)s {region} %(samclip_cmd)s | samtools view -b > %(temp_file_prefix)s.{region_safe}.tmp.bam && samtools index %(temp_file_prefix)s.{region_safe}.tmp.bam && gatk HaplotypeCaller -R %(ref_file)s -I %(temp_file_prefix)s.{region_safe}.tmp.bam -O /dev/stdout -L {region} %(calling_params)s -OVI false | bcftools norm -f %(ref_file)s -Oz -o %(temp_file_prefix)s.{region_safe}.vcf.gz" % vars(self) + else: + raise NotImplementedError("%s not implemented for %s platform" % (self.__software__,self.platform)) + return self.run_calling(self.calling_cmd) + +class PilonCaller(VariantCaller): + __software__ = "pilon" + def call_variants(self) -> Vcf: + # Call variants using Pilon + if self.platform=="illumina": + self.calling_cmd = "samtools view -T %(ref_file)s -f 0x1 -h %(bam_file)s {region} %(samclip_cmd)s | samtools view -b > %(temp_file_prefix)s.{region_safe}.tmp.bam && samtools index %(temp_file_prefix)s.{region_safe}.tmp.bam && pilon --genome %(ref_file)s --targets {region} --diploid %(calling_params)s --frags %(temp_file_prefix)s.{region_safe}.tmp.bam --variant --output %(temp_file_prefix)s.{region_safe} && bcftools view -e " % vars(self) + r"""'ALT="."'""" + " %(temp_file_prefix)s.{region_safe}.vcf | fix_pilon_headers.py --sample %(bam_sample_name)s| add_dummy_AD.py | bcftools norm -f %(ref_file)s -Oz -o %(temp_file_prefix)s.{region_safe}.vcf.gz" % vars(self) + elif self.platform=="nanopore": + self.calling_cmd = """pilon --genome %(ref_file)s --targets {region} %(calling_params)s --nanopore %(bam_file)s --variant --output %(temp_file_prefix)s.{region_safe} && bcftools view -i 'AF>0' %(temp_file_prefix)s.{region_safe}.vcf | fix_pilon_headers.py --sample %(bam_sample_name)s| add_dummy_AD.py | bcftools norm -f %(ref_file)s -Oz -o %(temp_file_prefix)s.{region_safe}.vcf.gz""" % vars(self) + else: + raise NotImplementedError("%s not implemented for %s platform" % (self.__software__,self.platform)) + return self.run_calling(self.calling_cmd) + +class LofreqCaller(VariantCaller): + __software__ = "lofreq" + def call_variants(self) -> Vcf: + # Call variants using Lofreq + if self.platform=="illumina": + self.calling_cmd = "samtools view -T %(ref_file)s -h %(bam_file)s {region} %(samclip_cmd)s | samtools view -b > %(temp_file_prefix)s.{region_safe}.tmp.bam && samtools index %(temp_file_prefix)s.{region_safe}.tmp.bam && lofreq call --call-indels -f %(ref_file)s -r {region} %(calling_params)s %(temp_file_prefix)s.{region_safe}.tmp.bam | modify_lofreq_vcf.py --sample %(bam_sample_name)s | add_dummy_AD.py --ref %(ref_file)s --add-dp | bcftools norm -f %(ref_file)s -Oz -o %(temp_file_prefix)s.{region_safe}.vcf.gz" % vars(self) + else: + raise NotImplementedError("%s not implemented for %s platform" % (self.__software__,self.platform)) + return self.run_calling(self.calling_cmd) + diff --git a/pathogenprofiler/vcf.py b/pathogenprofiler/vcf.py index e199ce9..b5ba8cd 100644 --- a/pathogenprofiler/vcf.py +++ b/pathogenprofiler/vcf.py @@ -136,8 +136,8 @@ def run_snpeff(self,db,ref_file,gff_file,rename_chroms = None, split_indels=True else: self.phasing_bam = "" run_cmd("bcftools view -c 1 -a %(filename)s | bcftools view -v snps | combine_vcf_variants.py --ref %(ref_file)s --gff %(gff_file)s %(phasing_bam)s | %(rename_cmd)s snpEff ann %(snpeff_data_dir_opt)s -noLog -noStats %(db)s - %(re_rename_cmd)s | bcftools sort -Oz -o %(tmp_file1)s && bcftools index %(tmp_file1)s" % vars(self)) - run_cmd("bcftools view -c 1 -a %(filename)s | bcftools view -v indels | %(rename_cmd)s snpEff ann %(snpeff_data_dir_opt)s -noLog -noStats %(db)s - %(re_rename_cmd)s | bcftools sort -Oz -o %(tmp_file2)s && bcftools index %(tmp_file2)s" % vars(self)) - run_cmd("bcftools view -c 1 -a %(filename)s | bcftools view -v other | %(rename_cmd)s snpEff ann %(snpeff_data_dir_opt)s -noLog -noStats %(db)s - %(re_rename_cmd)s | bcftools sort -Oz -o %(tmp_file3)s && bcftools index %(tmp_file3)s" % vars(self)) + run_cmd("bcftools view -c 1 -a %(filename)s | bcftools view -v indels | realign_tandem_deletions.py - %(ref_file)s %(gff_file)s - | %(rename_cmd)s snpEff ann %(snpeff_data_dir_opt)s -noLog -noStats %(db)s - %(re_rename_cmd)s | bcftools sort -Oz -o %(tmp_file2)s && bcftools index %(tmp_file2)s" % vars(self)) + run_cmd("bcftools view -c 1 -a %(filename)s | bcftools view -v other | realign_tandem_deletions.py - %(ref_file)s %(gff_file)s - | %(rename_cmd)s snpEff ann %(snpeff_data_dir_opt)s -noLog -noStats %(db)s - %(re_rename_cmd)s | bcftools sort -Oz -o %(tmp_file3)s && bcftools index %(tmp_file3)s" % vars(self)) run_cmd("bcftools concat -a %(tmp_file1)s %(tmp_file2)s %(tmp_file3)s | bcftools sort -Oz -o %(vcf_csq_file)s" % vars(self)) else : run_cmd("bcftools view %(filename)s | %(rename_cmd)s snpEff ann %(snpeff_data_dir_opt)s -noLog -noStats %(db)s - %(re_rename_cmd)s | bcftools view -Oz -o %(vcf_csq_file)s" % vars(self)) @@ -186,12 +186,11 @@ def load_ann( variants = [] vcf = pysam.VariantFile(self.filename) for var in vcf: - logging.debug(var) + logging.debug(str(var)[:500]) chrom = var.chrom pos = var.pos ref = var.ref alleles = var.alleles - alt_str = list(var.alts)[0] if "SVTYPE" in var.info: if var.info['SVTYPE']!="DEL": continue @@ -343,19 +342,19 @@ def get_vcf_qc(self): num_variants = int(l.split("\t")[3].strip()) return VcfQC(total_variants=num_variants) -class DellyVcf(Vcf): - def __init__(self,filename): - Vcf.__init__(self,filename) - def get_robust_calls(self,prefix,bed_file = None): - self.tmpfile = f"{prefix}.tmp.delly.vcf.gz" - self.outfile = f"{prefix}.delly.vcf.gz" - run_cmd("bcftools view -c 2 %(filename)s | bcftools view -e '(INFO/END-POS)>=100000' -Oz -o %(tmpfile)s" % vars(self)) - if bed_file: - run_cmd(f"bcftools index {self.tmpfile}") - run_cmd(f"bcftools view -R {bed_file} {self.tmpfile} -Oz -o {self.outfile}") - else: - self.outfile = self.tmpfile - return DellyVcf(self.outfile) +# class DellyVcf(Vcf): +# def __init__(self,filename): +# Vcf.__init__(self,filename) +# def get_robust_calls(self,prefix,bed_file = None): +# self.tmpfile = f"{prefix}.tmp.delly.vcf.gz" +# self.outfile = f"{prefix}.delly.vcf.gz" +# run_cmd("bcftools view -c 2 %(filename)s | bcftools view -e '(INFO/END-POS)>=100000' -Oz -o %(tmpfile)s" % vars(self)) +# if bed_file: +# run_cmd(f"bcftools index {self.tmpfile}") +# run_cmd(f"bcftools view -R {bed_file} {self.tmpfile} -Oz -o {self.outfile}") +# else: +# self.outfile = self.tmpfile +# return DellyVcf(self.outfile) def uniqify_dict_list(data): s = [] @@ -439,4 +438,5 @@ def filter_variant(var,filter_params): qc = "hard_fail" elif var_qc_test(var,filter_params["depth_soft"],filter_params["af_soft"],filter_params["strand_soft"]): qc = "soft_fail" + logging.debug(qc) return qc diff --git a/scripts/add_dummy_AD.py b/scripts/add_dummy_AD.py index 89eb0d4..36b8109 100644 --- a/scripts/add_dummy_AD.py +++ b/scripts/add_dummy_AD.py @@ -41,16 +41,21 @@ def main(args): alt = dp4[2]+dp4[3] row[9]+=f":{ref},{alt}" elif "AF=" in row[7]: - r = re.search("AF=([0-9\.]+)",row[7]) - if r: - alt = int(100*float(r.group(1))) - ref = int(100-alt) + raf = re.search("AF=([0-9\.]+)",row[7]) + rdp = re.search("DP=([0-9]+)",row[7]) + if raf and rdp: + dp = int(rdp.group(1)) + alt = int(dp*float(raf.group(1))) + ref = int(dp-alt) row[9]+=f":{ref},{alt}" + else: + quit(f"AF and DP not found in {row[7]}") else: alt = 100 ref = 0 row[9]+=f":{ref},{alt}" + # quit(f"AD not found in {row[7]}") if args.add_dp: if "DP" not in row[8]: row[8]+=":DP" diff --git a/scripts/combine_vcf_variants.py b/scripts/combine_vcf_variants.py index cfa477c..07df736 100755 --- a/scripts/combine_vcf_variants.py +++ b/scripts/combine_vcf_variants.py @@ -69,7 +69,7 @@ def get_haplotype_counts( return counts -gff = load_gff(args.gff,aslist=True) +gff = load_gff(args.gff) exons = [] for gene in gff: for transcript in gene.transcripts: @@ -143,18 +143,18 @@ def has_md_tag(bam): v = variants[0] if 'DP4' in v.info: haplotypes_by_strand = { - ref_hap: v.info['DP4'][0]+v.info['DP4'][1] + ref_hap: int(v.info['DP4'][0])+int(v.info['DP4'][1]) } elif 'AD' in v.samples[0]: haplotypes_by_strand = { - ref_hap: v.samples[0]['AD'][0] + ref_hap: int(v.samples[0]['AD'][0]) } for alts in product(*ds.values()): hap = ''.join(alts) if 'DP4' in v.info: - haplotypes_by_strand[hap] = v.info['DP4'][2]+v.info['DP4'][3] + haplotypes_by_strand[hap] = int(v.info['DP4'][2])+int(v.info['DP4'][3]) elif 'AD' in v.samples[0]: - haplotypes_by_strand[hap] = v.samples[0]['AD'][1] + haplotypes_by_strand[hap] = int(v.samples[0]['AD'][1]) diff --git a/scripts/realign_tandem_deletions.py b/scripts/realign_tandem_deletions.py new file mode 100644 index 0000000..9883e98 --- /dev/null +++ b/scripts/realign_tandem_deletions.py @@ -0,0 +1,92 @@ +#! /usr/bin/env python +import pysam +import argparse +import logging +from pathogenprofiler.gff import load_gff +from collections import defaultdict +from tqdm import tqdm +import bisect + +logging.basicConfig(level=logging.INFO) + +parser = argparse.ArgumentParser(description='Right-align deletions in a VCF file') +parser.add_argument('input', help='VCF file with deletions to right-align') +parser.add_argument('ref', help='Reference genome in fasta format') +parser.add_argument('gff', help='Gff annotation file') +parser.add_argument('output', help='Output VCF file with right-aligned deletions') + +args = parser.parse_args() + +def check_coordinates(var: pysam.VariantRecord,refseq: pysam.FastaFile, direction: str) -> tuple[int,int,str,tuple]: + original_tuple = (var.start, var.stop, var.ref, var.alts) + if var.alts[0] != '' and len(var.ref)==len(var.alts[0]): + return original_tuple + start = var.start + 1 + if var.alts[0] == '': + end = var.stop + else: + end = var.start + len(var.ref) + 1 + svlen = end-start + if svlen<=1: + return original_tuple + deleted_seq = refseq.fetch(var.chrom, start, end-1) + if direction == "right": + nextseq = refseq.fetch(var.chrom, end-1, end+svlen-2) + else: + if start-svlen < 0: + return original_tuple + nextseq = refseq.fetch(var.chrom, start-svlen, start-1) + if deleted_seq == nextseq: + logging.info((svlen,deleted_seq,nextseq)) + logging.info(f"Deletion {var.chrom}:{start}-{end} is a tandem repeat") + if direction == "right": + last_del_nuc = deleted_seq[-1] + ref = last_del_nuc + deleted_seq + if var.alts[0] == '': + alt = var.alts[0] + ref = ref[0] + else: + alt = last_del_nuc + return end-2, end+svlen-1, ref, (alt,) + else: + first_del_nuc = deleted_seq[0] + ref = deleted_seq + first_del_nuc + if var.alts[0] == '': + alt = var.alts[0] + ref = ref[-1] + else: + alt = first_del_nuc + return start-svlen+1, start, ref, (alt,) + else: + logging.info(f"Deletion {var.chrom}:{start}-{end} is not a tandem repeat") + return original_tuple + +vcf_in = pysam.VariantFile(args.input) +header = vcf_in.header +refseq = pysam.FastaFile(args.ref) +genes = load_gff(args.gff) + +gene_ends = defaultdict(list) +gene_names = defaultdict(list) + +for chrom in refseq.references: + for gene in genes: + gene_ends[gene.chrom].append(gene.end) + gene_names[gene.chrom].append(gene.name) + + +vcf_out = pysam.VariantFile(args.output, "w", header=header) +for var in vcf_in: + # logging.info(f"Processing variant {var.chrom}:{var.start}-{var.stop}") + # find if the variant overlaps with a gene end using the bisect module + gene_ends_list = gene_ends[var.chrom] + gene_end_index = bisect.bisect_left(gene_ends_list, var.start) + if gene_ends_list[gene_end_index] > var.start and gene_ends_list[gene_end_index] < var.stop: + logging.info(f"Variant overlaps with gene end {genes[gene_end_index].name}") + direction = "right" if genes[gene_end_index].strand == "+" else "left" + start,end,ref,alts = check_coordinates(var,refseq,direction) + var.start = start + var.stop = end + var.ref = ref + var.alts = alts + vcf_out.write(var) \ No newline at end of file