diff --git a/pathogenprofiler/__init__.py b/pathogenprofiler/__init__.py
index 1dfacb7..0eec21e 100644
--- a/pathogenprofiler/__init__.py
+++ b/pathogenprofiler/__init__.py
@@ -10,4 +10,4 @@ from .cli import *
 from .rules import *
 from .models import *
 
-__version__ = "4.3.0"
\ No newline at end of file
+__version__ = "4.4.0"
\ No newline at end of file
diff --git a/pathogenprofiler/bam.py b/pathogenprofiler/bam.py
index 46972d1..1ed8ea1 100644
--- a/pathogenprofiler/bam.py
+++ b/pathogenprofiler/bam.py
@@ -85,9 +85,16 @@ def run_delly(self,ref_file: str,bed_file: str) -> Vcf:
         self.bed_file = bed_file
         self.ref_file = ref_file
         if self.platform=="illumina":
-            run_cmd("delly call -t DEL -g %(ref_file)s %(bam_file)s -o %(prefix)s.delly.bcf" % vars(self))
+            cmd = "delly call -t DEL -g %(ref_file)s %(bam_file)s -o %(prefix)s.delly.bcf" % vars(self)
+            result = run_cmd(cmd, exit_on_error=False)
         else:
-            run_cmd("delly lr -t DEL -g %(ref_file)s %(bam_file)s -o %(prefix)s.delly.bcf" % vars(self))
+            cmd = "delly lr -t DEL -g %(ref_file)s %(bam_file)s -o %(prefix)s.delly.bcf" % vars(self)
+            result = run_cmd(cmd, exit_on_error=False)
+
+        exitcode = result.returncode
+        if exitcode!=0:
+            logging.error("Delly failed, skipping")
+            return None
         run_cmd("bcftools view -c 2 %(prefix)s.delly.bcf | bcftools view -e '(INFO/END-POS)>=100000' -Oz -o %(prefix)s.delly.vcf.gz" % vars(self))
         run_cmd("bcftools index %(prefix)s.delly.vcf.gz" % vars(self))
 
diff --git a/pathogenprofiler/cli.py b/pathogenprofiler/cli.py
index fe5f3b6..9110161 100644
--- a/pathogenprofiler/cli.py
+++ b/pathogenprofiler/cli.py
@@ -1,7 +1,7 @@
 from .fastq import Fastq
 from .utils import run_cmd, cmd_out
 from .bam import Bam
-from .db import get_db
+from .db import get_db, check_db_exists
 from .fasta import Fasta, Paf
 from .profiler import vcf_profiler, bam_barcoder, vcf_barcoder
 import logging
@@ -335,6 +335,7 @@ def set_species(args: argparse.Namespace) -> SpeciesPrediction:
     >>> species_prediction.species
     [Species(species='Mycobacterium abscessus', prediction_info=None)]
     """
+    check_db_exists(args.software_name,args.resistance_db)
     conf = get_db(args.software_name,args.resistance_db)
     species = Species(
         species=conf["species"]
diff --git a/pathogenprofiler/db.py b/pathogenprofiler/db.py
index 3f5295c..0c875e9 100644
--- a/pathogenprofiler/db.py
+++ b/pathogenprofiler/db.py
@@ -126,7 +126,7 @@ def write_bed(db: dict,gene_dict: dict,gene_info: List[Gene],ref_file: str,outfi
         if len(drugs)==0:
             drugs = "None"
         else:
-            drugs = ",".join(drugs)
+            drugs = ",".join(sorted(list(drugs)))
         lines.append([
             gene_info[gene].chrom,
             str(genome_start),
@@ -574,12 +574,30 @@ def check_db_files(variables):
         if not os.path.isfile(val+".fai"):
             index_ref(val)
 
+def is_db_path(string):
+    if "/" in string:
+        return True
+    elif os.path.isfile(string+".variables.json"):
+        return True
+    return False
+
+def check_db_exists(software_name,db_name):
+    db = get_db(software_name=software_name,db_name=db_name,verbose=False)
+    if db is None:
+        share_path = f"{sys.base_prefix}/share/{software_name}/"
+        logging.error(f"DB {db_name} does not exist in the current directory or in {share_path}")
+        raise FileExistsError
+
 
-def get_db(software_name,db_name):
-    if "/" in db_name:
-        share_path = "/".join(db_name.split("/")[:-1])
-        db_name = db_name.split("/")[-1]
-        variable_file_name = f"{share_path}/{db_name}.variables.json"
+def get_db(software_name,db_name,verbose=True):
+    if is_db_path(db_name):
+        if "/" in db_name:
+            share_path = "/".join(db_name.split("/")[:-1])
+            db_name = db_name.split("/")[-1]
+            variable_file_name = f"{share_path}/{db_name}.variables.json"
+        else:
+            share_path = '.'
+            variable_file_name = f"{share_path}/{db_name}.variables.json"
     else:
         share_path = f"{sys.base_prefix}/share/{software_name}/"
         variable_file_name = get_variable_file_name(software_name,db_name)
@@ -588,7 +606,8 @@ def get_db(software_name,db_name):
         return None
     variables = json.load(open(variable_file_name))
     for key,val in variables['files'].items():
-        logging.info(f"Using {key} file: {share_path}/{val}")
+        if verbose:
+            logging.info(f"Using {key} file: {share_path}/{val}")
         if ".json" in val:
             variables[key] = json.load(open(f"{share_path}/{val}"))
         elif key=='rules':
@@ -608,9 +627,13 @@ def list_db(software_name):
 
 
 def create_species_db(args,extra_files = None):
+    variables = json.load(open("variables.json"))
     if not extra_files:
         extra_files = {}
-    version = {"name":args.prefix}
+    version = {
+        "db-schema-version":variables['db-schema-version'],
+        "name":args.prefix
+    }
     if os.path.isdir('.git'):
         for l in pp.cmd_out("git log | head -4"):
             row = l.strip().split()
@@ -626,12 +649,12 @@ def create_species_db(args,extra_files = None):
             shutil.copyfile(file,target)
 
     variables_file = args.prefix+".variables.json"
-    variables = {
+    variables.update({
         "version": version,
         "files":{
             "variables": variables_file
         }
-    }
+    })
 
     if extra_files:
         for key,val in extra_files.items():
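As an aside, a minimal sketch of how the reworked database lookup could be exercised from Python; the function names come from the diff above, while the software/DB names and the path are made-up placeholders:

    from pathogenprofiler.db import check_db_exists, get_db

    # A bare name is resolved under sys.base_prefix/share/<software_name>/, whereas
    # anything containing "/" (or with a local <name>.variables.json next to it) is
    # treated as a filesystem path by is_db_path(). "tbprofiler" and "tbdb" are
    # placeholder names, not part of the patch.
    check_db_exists("tbprofiler", "tbdb")                  # logs and raises if the DB is missing
    conf = get_db("tbprofiler", "tbdb")                    # logs each file it loads
    conf = get_db("tbprofiler", "results/custom_db", verbose=False)  # quiet path-based lookup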
db_name.split("/")[-1] + variable_file_name = f"{share_path}/{db_name}.variables.json" + else: + share_path = '.' + variable_file_name = f"{share_path}/{db_name}.variables.json" else: share_path = f"{sys.base_prefix}/share/{software_name}/" variable_file_name = get_variable_file_name(software_name,db_name) @@ -588,7 +606,8 @@ def get_db(software_name,db_name): return None variables = json.load(open(variable_file_name)) for key,val in variables['files'].items(): - logging.info(f"Using {key} file: {share_path}/{val}") + if verbose: + logging.info(f"Using {key} file: {share_path}/{val}") if ".json" in val: variables[key] = json.load(open(f"{share_path}/{val}")) elif key=='rules': @@ -608,9 +627,13 @@ def list_db(software_name): def create_species_db(args,extra_files = None): + variables = json.load(open("variables.json")) if not extra_files: extra_files = {} - version = {"name":args.prefix} + version = { + "db-schema-version":variables['db-schema-version'], + "name":args.prefix + } if os.path.isdir('.git'): for l in pp.cmd_out("git log | head -4"): row = l.strip().split() @@ -626,12 +649,12 @@ def create_species_db(args,extra_files = None): shutil.copyfile(file,target) variables_file = args.prefix+".variables.json" - variables = { + variables.update({ "version": version, "files":{ "variables": variables_file } - } + }) if extra_files: for key,val in extra_files.items(): diff --git a/pathogenprofiler/hgvs.py b/pathogenprofiler/hgvs.py index c3274c4..79d0c66 100644 --- a/pathogenprofiler/hgvs.py +++ b/pathogenprofiler/hgvs.py @@ -332,4 +332,30 @@ def verify_mutation_list(hgvs_mutations: List[dict], genes: List[Gene], refseq: converted_mutations[key] = mutation_conversion[key] logging.debug(converted_mutations) - return converted_mutations \ No newline at end of file + return converted_mutations + + +def split_protein_hgvs(hgvs) -> List[str]: + """ + Split a protein HGVS into its components. + + Parameters: + hgvs (str): The protein HGVS. + + Returns: + List[str]: A list of protein HGVS components. + + Example: + >>> split_protein_hgvs("p.MetAsnLys74IleGluThr") + ['p.Met74Ile', 'p.Asn74Glu', 'p.Lys74Thr'] + """ + # Remove the 'p.' 
diff --git a/pathogenprofiler/mutation_db.py b/pathogenprofiler/mutation_db.py
index ec4ad0e..01e9279 100644
--- a/pathogenprofiler/mutation_db.py
+++ b/pathogenprofiler/mutation_db.py
@@ -23,7 +23,7 @@ def add(self,data: Union[dict,list]) -> None:
         self.container.add(json.dumps(data))
 
     def to_dict_list(self) -> List[dict]:
-        return [json.loads(d) for d in self.container]
+        return [json.loads(d) for d in sorted(list(self.container))]
 
 
 
diff --git a/pathogenprofiler/profiler.py b/pathogenprofiler/profiler.py
index a4ef444..7dc7627 100644
--- a/pathogenprofiler/profiler.py
+++ b/pathogenprofiler/profiler.py
@@ -48,7 +48,8 @@ def vcf_profiler(args: argparse.Namespace) -> List[Union[Variant,DrVariant,Gene,
     vcf_targets_file = "%s.targets.vcf.gz" % args.files_prefix
     if not vcf_is_indexed(args.vcf):
         run_cmd("bcftools index %s" % args.vcf)
-    # run_cmd("bcftools view -R %s %s -Oz -o %s" % (conf["bed"],args.vcf,vcf_targets_file))
+    if args.caller == 'freebayes-haplotype':
+        args.supplementary_bam = None
     annotated_variants = vcf_variant_profiler(conf, args.files_prefix, args.vcf, bam_for_phasing=args.supplementary_bam)
     return annotated_variants
 
diff --git a/pathogenprofiler/utils.py b/pathogenprofiler/utils.py
index b91c08e..c02d672 100644
--- a/pathogenprofiler/utils.py
+++ b/pathogenprofiler/utils.py
@@ -20,6 +20,43 @@
 
 shared_dict = {}
 
+def get_version(tool):
+    cmds = {
+        'bcftools': 'bcftools --version',
+        'samtools': 'samtools --version',
+        'delly': 'delly --version',
+        'bwa': 'bwa',
+        'trimmomatic': 'trimmomatic -version',
+        'gatk': 'gatk -version',
+        'lofreq': 'lofreq version',
+        'bedtools': 'bedtools --version',
+        'minimap2': 'minimap2 --version',
+        'freebayes': 'freebayes --version',
+        'pilon': 'pilon --version',
+        'snpEff': 'snpEff -version',
+        'kmc': 'kmc',
+        'sourmash': 'sourmash --version',
+    }
+    regex = {
+        'bcftools': r'bcftools (\d+\.\d+\.?\d?)',
+        'samtools': r'samtools (\d+\.\d+\.?\d?)',
+        'delly': r'Delly version: v(\d+\.\d+\.?\d?)',
+        'bwa': r'Version: (\d+\.\d+\.?\d?)',
+        'trimmomatic': r'(\d+\.\d+)',
+        'gatk': r'The Genome Analysis Toolkit \(GATK\) v(\d+\.\d+\.?\d?)',
+        'lofreq': r'version: (\d+\.\d+\.?\d?)',
+        'bedtools': r'bedtools v(\d+\.\d+\.?\d?)',
+        'minimap2': r'(\d+\.\d+)',
+        'freebayes': r'version: v(\d+\.\d+\.?\d?)',
+        'pilon': r'Pilon version (\d+\.\d+\.?\d?)',
+        'snpEff': r'SnpEff\t(\d+\.\d+.?)',
+        'kmc': r'K-Mer Counter \(KMC\) ver. (\d+\.\d+\.\d+)',
+        'sourmash': r'sourmash (\d+\.\d+\.\d?)',
+    }
+    x = sp.run([cmds[tool]], shell=True, stdout=sp.PIPE, stderr=sp.PIPE)
+    text = x.stdout.decode('utf-8') + x.stderr.decode('utf-8')
+    version = re.search(regex[tool], text).group(1)
+    return version
 
 class TempFilePrefix(object):
     """Create a temporary file prefix"""
@@ -482,10 +519,10 @@ def add_arguments_to_self(self,args: dict) -> None:
 
 
 
-def run_cmd(cmd: str, desc=None, log: str=None) -> sp.CompletedProcess:
+def run_cmd(cmd: str, desc=None, log: str=None, exit_on_error: bool=True) -> sp.CompletedProcess:
     if desc:
         logging.info(desc)
-    programs = set([x.strip().split()[0] for x in re.split("[|&;]",cmd) if x!=""])
+    programs = set([x.strip().split()[0] for x in re.split("[|&;]",cmd.strip()) if x!=""])
     missing = [p for p in programs if which(p)==False]
     if len(missing)>0:
         raise ValueError("Cant find programs: %s\n" % (", ".join(missing)))
@@ -495,7 +532,8 @@ def run_cmd(cmd: str, desc=None, log: str=None) -> sp.CompletedProcess:
     result = sp.run(cmd,shell=True,stderr=output,stdout=output)
    if result.returncode != 0:
         logging.error(result.stderr.decode("utf-8"))
-        raise ValueError("Command Failed:\n%s\nstderr:\n%s" % (cmd,result.stderr.decode()))
+        if exit_on_error:
+            raise ValueError("Command Failed:\n%s\nstderr:\n%s" % (cmd,result.stderr.decode()))
     return result
 
 def cmd_out(cmd: str) -> Generator[str, None, None]:
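For orientation, a small sketch of how the two utils.py additions might be used together; the shell command and file names are placeholders, not part of the patch:

    from pathogenprofiler.utils import run_cmd, get_version

    # exit_on_error=False logs a non-zero exit instead of raising, so the caller can
    # inspect returncode itself (this is how run_delly now treats delly failures).
    result = run_cmd("delly call -g ref.fa sample.bam -o out.bcf", exit_on_error=False)
    if result.returncode != 0:
        print("delly failed; skipping structural variant calling")

    # get_version() runs the tool's version command and parses the number out of the
    # combined stdout/stderr with the per-tool regex table.
    print(get_version("bcftools"))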
diff --git a/pathogenprofiler/variant_calling.py b/pathogenprofiler/variant_calling.py
index 0b39629..6fe65cc 100644
--- a/pathogenprofiler/variant_calling.py
+++ b/pathogenprofiler/variant_calling.py
@@ -130,3 +130,19 @@ def call_variants(self) -> Vcf:
             raise NotImplementedError("%s not implemented for %s platform" % (self.__software__,self.platform))
         return self.run_calling(self.calling_cmd)
 
+class FreebayesHaplotypeCaller(VariantCaller):
+    __software__ = "freebayes-haplotype"
+    def call_variants(self) -> Vcf:
+        # Call variants using freebayes in two passes (the second pass uses the
+        # first-pass calls as haplotype basis alleles)
+
+        self.calling_cmd = """
+        samtools view -T %(ref_file)s -h %(bam_file)s {region} %(samclip_cmd)s | \
+        samtools view -b > %(temp_file_prefix)s.{region_safe}.tmp.bam && \
+        samtools index %(temp_file_prefix)s.{region_safe}.tmp.bam && \
+        freebayes -f %(ref_file)s -r {region} %(calling_params)s %(temp_file_prefix)s.{region_safe}.tmp.bam | \
+        bcftools view -Oz -o %(temp_file_prefix)s.{region_safe}.tmp.vcf.gz && \
+        tabix %(temp_file_prefix)s.{region_safe}.tmp.vcf.gz && \
+        freebayes -f %(ref_file)s -r {region} %(calling_params)s %(temp_file_prefix)s.{region_safe}.tmp.bam --haplotype-length 100 --haplotype-basis-alleles %(temp_file_prefix)s.{region_safe}.tmp.vcf.gz | bcftools norm -a -Oz -o %(temp_file_prefix)s.{region_safe}.vcf.gz
+        """ % vars(self)
+
+        return self.run_calling(self.calling_cmd)
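To make the two-layer templating in calling_cmd easier to follow, here is a toy expansion with invented attribute values; it assumes run_calling() fills {region} and {region_safe} per genomic chunk, as the template syntax shared with the other callers suggests:

    # First the %-placeholders are filled from vars(self); the {region}/{region_safe}
    # braces survive %-formatting and are assumed to be filled later per chunk.
    params = {
        "ref_file": "ref.fa",            # hypothetical values for illustration only
        "bam_file": "sample.bam",
        "samclip_cmd": "",
        "calling_params": "",
        "temp_file_prefix": "tmp/sample",
    }
    template = (
        "freebayes -f %(ref_file)s -r {region} %(calling_params)s "
        "%(temp_file_prefix)s.{region_safe}.tmp.bam --haplotype-length 100 "
        "--haplotype-basis-alleles %(temp_file_prefix)s.{region_safe}.tmp.vcf.gz"
    ) % params
    print(template.format(region="Chromosome:1-100000", region_safe="Chromosome_1_100000"))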
diff --git a/pathogenprofiler/vcf.py b/pathogenprofiler/vcf.py
index b5ba8cd..4047081 100644
--- a/pathogenprofiler/vcf.py
+++ b/pathogenprofiler/vcf.py
@@ -135,10 +135,11 @@ def run_snpeff(self,db,ref_file,gff_file,rename_chroms = None, split_indels=True
                 self.phasing_bam = f"--bam {bam_for_phasing}"
             else:
                 self.phasing_bam = ""
-            run_cmd("bcftools view -c 1 -a %(filename)s | bcftools view -v snps | combine_vcf_variants.py --ref %(ref_file)s --gff %(gff_file)s %(phasing_bam)s | %(rename_cmd)s snpEff ann %(snpeff_data_dir_opt)s -noLog -noStats %(db)s - %(re_rename_cmd)s | bcftools sort -Oz -o %(tmp_file1)s && bcftools index %(tmp_file1)s" % vars(self))
-            run_cmd("bcftools view -c 1 -a %(filename)s | bcftools view -v indels | realign_tandem_deletions.py - %(ref_file)s %(gff_file)s - | %(rename_cmd)s snpEff ann %(snpeff_data_dir_opt)s -noLog -noStats %(db)s - %(re_rename_cmd)s | bcftools sort -Oz -o %(tmp_file2)s && bcftools index %(tmp_file2)s" % vars(self))
-            run_cmd("bcftools view -c 1 -a %(filename)s | bcftools view -v other | realign_tandem_deletions.py - %(ref_file)s %(gff_file)s - | %(rename_cmd)s snpEff ann %(snpeff_data_dir_opt)s -noLog -noStats %(db)s - %(re_rename_cmd)s | bcftools sort -Oz -o %(tmp_file3)s && bcftools index %(tmp_file3)s" % vars(self))
-            run_cmd("bcftools concat -a %(tmp_file1)s %(tmp_file2)s %(tmp_file3)s | bcftools sort -Oz -o %(vcf_csq_file)s" % vars(self))
+            run_cmd("bcftools view -c 1 -a %(filename)s | realign_tandem_deletions.py - %(ref_file)s %(gff_file)s - | combine_vcf_variants.py --ref %(ref_file)s --gff %(gff_file)s %(phasing_bam)s | %(rename_cmd)s snpEff ann %(snpeff_data_dir_opt)s -noLog -noStats %(db)s - %(re_rename_cmd)s | bcftools sort -Oz -o %(vcf_csq_file)s" % vars(self))
+            # run_cmd("bcftools view -c 1 -a %(filename)s | bcftools view -v snps | combine_vcf_variants.py --ref %(ref_file)s --gff %(gff_file)s %(phasing_bam)s | %(rename_cmd)s snpEff ann %(snpeff_data_dir_opt)s -noLog -noStats %(db)s - %(re_rename_cmd)s | bcftools sort -Oz -o %(tmp_file1)s && bcftools index %(tmp_file1)s" % vars(self))
+            # run_cmd("bcftools view -c 1 -a %(filename)s | bcftools view -v indels | realign_tandem_deletions.py - %(ref_file)s %(gff_file)s - | %(rename_cmd)s snpEff ann %(snpeff_data_dir_opt)s -noLog -noStats %(db)s - %(re_rename_cmd)s | bcftools sort -Oz -o %(tmp_file2)s && bcftools index %(tmp_file2)s" % vars(self))
+            # run_cmd("bcftools view -c 1 -a %(filename)s | bcftools view -v other | realign_tandem_deletions.py - %(ref_file)s %(gff_file)s - | %(rename_cmd)s snpEff ann %(snpeff_data_dir_opt)s -noLog -noStats %(db)s - %(re_rename_cmd)s | bcftools sort -Oz -o %(tmp_file3)s && bcftools index %(tmp_file3)s" % vars(self))
+            # run_cmd("bcftools concat -a %(tmp_file1)s %(tmp_file2)s %(tmp_file3)s | bcftools sort -Oz -o %(vcf_csq_file)s" % vars(self))
         else :
             run_cmd("bcftools view %(filename)s | %(rename_cmd)s snpEff ann %(snpeff_data_dir_opt)s -noLog -noStats %(db)s - %(re_rename_cmd)s | bcftools view -Oz -o %(vcf_csq_file)s" % vars(self))
         return Vcf(self.vcf_csq_file,self.prefix)
@@ -186,7 +187,7 @@ def load_ann(
         variants = []
         vcf = pysam.VariantFile(self.filename)
         for var in vcf:
-            logging.debug(str(var)[:500])
+            logging.debug(str(var)[:5000])
             chrom = var.chrom
             pos = var.pos
             ref = var.ref
@@ -211,14 +212,23 @@ def load_ann(
             ann_list = [x.split("|") for x in ann_strs]
             for alt in alleles[1:]:
                 strand_support = get_stand_support(var,alt)
-
+                if strand_support[0] != None:
+                    freq = sum(strand_support)/sum(ad)
+                else:
+                    if 'QNAME' in var.info:
+                        freq = 1.0
+                    elif sv:
+                        freq = af_dict[alt]
+                    else:
+                        raise NotImplementedError
+
                 tmp_var = Variant(
                     chrom = chrom,
                     pos = int(pos),
                     ref = ref,
                     alt = alt,
                     depth = sum(ad),
-                    freq = af_dict[alt],
+                    freq = freq,
                     forward_reads = strand_support[0],
                     reverse_reads = strand_support[1],
                     sv = sv,
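A plain-Python restatement of the new per-allele frequency rule in load_ann(); the argument names are invented stand-ins for the pysam record fields used above:

    def allele_freq(strand_support, ad, qname_in_info, sv, af):
        # Prefer observed strand support over the caller-reported AF; records that
        # carry QNAME in INFO are assigned a frequency of 1.0, and SV records fall
        # back to the AF reported by the SV caller.
        if strand_support[0] is not None:
            return sum(strand_support) / sum(ad)
        if qname_in_info:
            return 1.0
        if sv:
            return af
        raise NotImplementedError

    print(allele_freq((30, 28), (12, 58), False, False, None))  # 58/70 ≈ 0.83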
diff --git a/scripts/combine_vcf_variants.py b/scripts/combine_vcf_variants.py
index 07df736..511705c 100755
--- a/scripts/combine_vcf_variants.py
+++ b/scripts/combine_vcf_variants.py
@@ -6,6 +6,7 @@
 from pathogenprofiler.models import GenomePosition
 from pathogenprofiler.gff import load_gff, Exon
 from itertools import product
+import logging
 
 parser = argparse.ArgumentParser(description='tbprofiler script',formatter_class=argparse.ArgumentDefaultsHelpFormatter)
 parser.add_argument('--vcf',type=str,help='')
@@ -13,8 +14,14 @@
 parser.add_argument('--ref',type=str,help='',required = True)
 parser.add_argument('--out',type=str,help='')
 parser.add_argument('--bam',type=str,help='')
+parser.add_argument('--debug',action='store_true',help='Log file')
 args = parser.parse_args()
 
+if args.debug:
+    logging.basicConfig(level=logging.DEBUG)
+else:
+    logging.basicConfig(level=logging.INFO)
+
 
 def get_alleles(
     read:pysam.libcalignedsegment.AlignedSegment,
@@ -54,6 +61,7 @@ def get_haplotype_counts(
     start = min(positions).pos
     end = max(positions).pos
     for read in bam.fetch(contig=chrom,start=start,end=end):
+        logging.debug((read.qname, read.mapping_quality<10, read.is_secondary, read.is_supplementary, read.is_unmapped))
         if read.mapping_quality < 10:
             continue
         if read.is_secondary:
@@ -63,19 +71,14 @@ def get_haplotype_counts(
         if read.is_unmapped:
             continue
         alleles = get_alleles(read,positions=positions,bystrand=bystrand)
+        logging.debug(alleles)
         if all(alleles):
             allele_combinations.append(''.join(alleles))
 
     counts = Counter(allele_combinations)
+    logging.debug(counts)
     return counts
 
-gff = load_gff(args.gff)
-exons = []
-for gene in gff:
-    for transcript in gene.transcripts:
-        for i,exon in enumerate(transcript.exons):
-            exon.id = f"{gene.gene_id}_exon{i+1}"
-            exons.append(exon)
 
 def get_overlapping_exons(chrom: str,pos: int,exons: List[Exon]):
     exons = [e for e in exons if e.start<=pos and e.end>=pos and chrom==e.chrom]
@@ -88,13 +91,21 @@ def get_codon_pos(chrom: str,pos: int,exons: List[Exon]):
     e = get_overlapping_exons(chrom,pos,exons)
     if e==None:
         return (None,None)
+    logging.debug(f"{chrom}:{pos} {vars(e)}")
     if e.strand=="+":
-        codon_pos = (pos-e.start)//3 + 1
+        codon_pos = (pos-e.start-e.phase)//3 + 1
     else:
-        codon_pos = (e.end - pos )//3 + 1
+        codon_pos = (e.end + e.phase - pos )//3 + 1
     return (e.id,codon_pos)
 
 
+gff = load_gff(args.gff)
+exons = []
+for gene in gff:
+    for transcript in gene.transcripts:
+        for i,exon in enumerate(transcript.exons):
+            exon.id = f"{gene.gene_id}_exon{i+1}"
+            exons.append(exon)
 
 ref = pysam.FastaFile(args.ref)
 
 coding_variants = defaultdict(list)
@@ -107,7 +118,9 @@ def get_codon_pos(chrom: str,pos: int,exons: List[Exon]):
 
 for var in vcf:
     gene,cpos = get_codon_pos(var.chrom,var.pos,exons)
-    if gene==None:
+    if var.rlen!=1 or len(var.alts[0])!=1:
+        other_variants.append(var)
+    elif gene==None:
         other_variants.append(var)
     else:
         coding_variants[(gene,cpos)].append(var)
@@ -149,6 +162,13 @@ def has_md_tag(bam):
         haplotypes_by_strand = {
             ref_hap: int(v.samples[0]['AD'][0])
         }
+        if len(ds)==2 and list(ds)[0]+2 == list(ds)[1]:
+            midpos = list(ds)[0]+1
+            # insert the ref allele in between
+            ds[midpos] = ref.fetch(v.chrom,midpos-1,midpos)
+            # sort
+            ds = dict(sorted(ds.items()))
+
         for alts in product(*ds.values()):
             hap = ''.join(alts)
             if 'DP4' in v.info:
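A toy illustration (made-up positions and alleles) of the gap-filling step added in the hunk above: when exactly two phased positions sit two bases apart, the reference base between them is inserted so the emitted haplotypes are contiguous:

    from itertools import product

    ds = {100: "AT", 102: "GC"}        # position -> candidate alleles (invented)
    if len(ds) == 2 and list(ds)[0] + 2 == list(ds)[1]:
        midpos = list(ds)[0] + 1
        ds[midpos] = "G"               # stands in for ref.fetch(chrom, midpos-1, midpos)
        ds = dict(sorted(ds.items()))
    print([''.join(alts) for alts in product(*ds.values())])
    # ['AGG', 'AGC', 'TGG', 'TGC']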
@@ -168,7 +188,7 @@ def has_md_tag(bam):
         for i,(hap,count) in enumerate(haplotypes.items()):
             hap_fwd = haplotypes_by_strand.get(hap.upper(),0)
             hap_rev = haplotypes_by_strand.get(hap.lower(),0)
-            if hap==ref_hap:
+            if hap.upper()==ref_hap.upper():
                 continue
             variant = variants[0].copy()
             variant.alts = (hap,)
@@ -176,6 +196,14 @@ def has_md_tag(bam):
             variant.info.update({'AF':count/dp})
             if 'DP4' in variant.info:
                 variant.info['DP4'] = [ref_fwd,ref_rev,hap_fwd,hap_rev]
+            if 'SAF' in variant.info:
+                variant.info['SAF'] = hap_fwd
+                variant.info['SAR'] = hap_rev
+            if 'ADF' in var.samples[0]:
+                var.samples[0]['ADF'] = hap_fwd
+                var.samples[0]['ADR'] = hap_rev
+            if 'AD' in variant.samples[0]:
+                variant.samples[0]['AD'] = [dp - count, hap_fwd + hap_rev]
             other_variants.append(variant)