diff --git a/pathogenprofiler/bam.py b/pathogenprofiler/bam.py index 84c8bb3..3c55906 100644 --- a/pathogenprofiler/bam.py +++ b/pathogenprofiler/bam.py @@ -301,20 +301,26 @@ def get_bed_gt(self,bed_file: str,ref_file: str,caller: str,platform: str): cmd = "freebayes -f %(ref_file)s -t %(bed_file)s %(prefix)s.tmp.bam --haplotype-length -1 | bcftools query -f '%%CHROM\\t%%POS\\t%%REF\\t%%ALT[\\t%%GT\\t%%AD]\\n'" % vars(self) elif caller == "bcftools": cmd = "bcftools mpileup -f %(ref_file)s -T %(bed_file)s %(prefix)s.tmp.bam -BI -a AD | bcftools call -mv | bcftools query -f '%%CHROM\\t%%POS\\t%%REF\\t%%ALT[\\t%%GT\\t%%AD]\\n'" % vars(self) + elif caller == "mpileup": + cmd = "bcftools mpileup -f %(ref_file)s -T %(bed_file)s %(prefix)s.tmp.bam -BI -a AD | bcftools query -f '%%CHROM\\t%%POS\\t%%REF\\t%%ALT[\\t%%AD]\\n'" % vars(self) else: cmd = "freebayes -f %(ref_file)s -t %(bed_file)s %(prefix)s.tmp.bam --haplotype-length -1 | bcftools query -f '%%CHROM\\t%%POS\\t%%REF\\t%%ALT[\\t%%GT\\t%%AD]\\n'" % vars(self) for l in cmd_out(cmd): # Chromosome 4348079 0/0 51 - chrom, pos, ref, alt, gt, ad = l.rstrip().split() + row = l.strip().split() + if len(row)==6: + chrom, pos, ref, alt, gt, ad = l.rstrip().split() + elif len(row)==5: + chrom, pos, ref, alt, ad = l.rstrip().split() + gt = None + p = GenomePosition(chrom=chrom,pos=int(pos)) d = {} alts = alt.split(",") ad = [int(x) for x in ad.split(",")] - # if gt == "0/0": - # d[ref] = ad[0] - # elif gt == "./.": - if gt == "./.": + + if gt and gt == "./.": d[ref] = 0 else: genotypes = list([ref]+alts) diff --git a/pathogenprofiler/barcode.py b/pathogenprofiler/barcode.py index c3107ed..239a29d 100644 --- a/pathogenprofiler/barcode.py +++ b/pathogenprofiler/barcode.py @@ -57,7 +57,9 @@ def get_barcoding_mutations(mutations: dict, barcode_bed: str) -> tuple[dict, Li return (barcode_support,snps_report) -def barcode(mutations,barcode_bed: str,snps_file=None) -> List[BarcodeResult]: +def barcode(mutations,barcode_bed: str,snps_file=None,stdev_cutoff=0.15) -> List[BarcodeResult]: + if stdev_cutoff is None: + stdev_cutoff = 0.15 bed_num_col = len(open(barcode_bed).readline().rstrip().split("\t")) # bed = [] lineage_info = {} @@ -75,24 +77,42 @@ def barcode(mutations,barcode_bed: str,snps_file=None) -> List[BarcodeResult]: barcode_frac = defaultdict(float) for l in barcode_support: - # If stdev of fraction across all barcoding positions > 0.15 + logging.debug("Processing %s" % l) + logging.debug(barcode_support[l]) + # If stdev of fraction across all barcoding positions > stdev_cutoff # Only look at positions with >5 reads tmp_allelic_dp = [x[1]/(x[0]+x[1]) for x in barcode_support[l] if sum(x)>5] + logging.debug(tmp_allelic_dp) + # remove positions with no SNP (fraction=0) tmp_allelic_dp = [x for x in tmp_allelic_dp if x>0] - if len(tmp_allelic_dp)==0: continue - if stdev(tmp_allelic_dp)>0.15: continue + logging.debug(tmp_allelic_dp) + + if len(tmp_allelic_dp)==0: + logging.debug("No SNPs found for %s" % l) + continue + if stdev(tmp_allelic_dp)>stdev_cutoff: + logging.debug(f"Stdev {stdev(tmp_allelic_dp)} > {stdev_cutoff} for {l}") + continue # if number of barcoding positions > 5 and only one shows alternate num_positions_with_alt = len([x for x in barcode_support[l] if (x[1]/(x[0]+x[1]))>0]) - if len(barcode_support[l])>5 and num_positions_with_alt<2: continue + logging.debug("Number of positions with alternate %s" % num_positions_with_alt) + if len(barcode_support[l])>5 and num_positions_with_alt<2: + logging.debug("Number of positions with alternate <2 for %s" % l) + continue # if number of barcoding positions > 5 and there are less than 5% of possible positions with alternate - if len(barcode_support[l])>5 and num_positions_with_alt<=0.10*len(barcode_support[l]): continue + if len(barcode_support[l])>5 and num_positions_with_alt<=0.10*len(barcode_support[l]): + logging.debug("Number of positions with alternate < 10 percent for %s" % l) + continue barcode_pos_reads = sum([x[1] for x in barcode_support[l]]) barcode_neg_reads = sum([x[0] for x in barcode_support[l]]) lf = barcode_pos_reads/(barcode_pos_reads+barcode_neg_reads) - if lf<0.05:continue + logging.debug("Barcode %s has frequency %s" % (l,lf)) + if lf<0.05: + logging.debug("Frequency < 0.05 for %s" % l) + continue barcode_frac[l] = lf final_results = [] diff --git a/pathogenprofiler/profiler.py b/pathogenprofiler/profiler.py index 7dc7627..7e36f70 100644 --- a/pathogenprofiler/profiler.py +++ b/pathogenprofiler/profiler.py @@ -17,8 +17,9 @@ def bam_barcoder(args: argparse.Namespace) -> List[BarcodeResult]: bam = Bam(args.bam, args.files_prefix, platform=args.platform, threads=args.threads) if not hasattr(args,'barcode_snps'): args.barcode_snps = None - barcode_mutations = bam.get_bed_gt(conf["barcode"],conf["ref"], caller=args.caller,platform=args.platform) - barcode_assignment = barcode(barcode_mutations,conf["barcode"],args.barcode_snps) + barcode_mutations = bam.get_bed_gt(conf["barcode"],conf["ref"], caller=args.caller,platform=args.platform) + stdev_cutoff = args.barcode_stdev if hasattr(args,'barcode_stdev') else None + barcode_assignment = barcode(barcode_mutations,conf["barcode"],args.barcode_snps,stdev_cutoff=stdev_cutoff) return barcode_assignment def vcf_barcoder(args: argparse.Namespace) -> List[BarcodeResult]: @@ -29,7 +30,8 @@ def vcf_barcoder(args: argparse.Namespace) -> List[BarcodeResult]: barcode_mutations = vcf.get_bed_gt(conf["barcode"],conf["ref"]) if not hasattr(args,'barcode_snps'): args.barcode_snps = None - barcode_assignment = barcode(barcode_mutations,conf["barcode"],args.barcode_snps) + stdev_cutoff = args.barcode_stdev if hasattr(args,'barcode_stdev') else None + barcode_assignment = barcode(barcode_mutations,conf["barcode"],args.barcode_snps,stdev_cutoff=stdev_cutoff) return barcode_assignment diff --git a/scripts/fa2vcf.py b/scripts/fa2vcf.py new file mode 100644 index 0000000..b5c4c77 --- /dev/null +++ b/scripts/fa2vcf.py @@ -0,0 +1,65 @@ +#! /usr/bin/env python +import argparse +import pysam + +parser = argparse.ArgumentParser(description='Convert fasta to vcf') +parser.add_argument('aln', help='Alignment file') +parser.add_argument('out', help='Output vcf file') + +args = parser.parse_args() + + + +aln = pysam.FastaFile(args.aln) +chrom = aln.references[0] +chrom_len = aln.get_reference_length(chrom) + +variants = [] +for i,(ref,alt) in enumerate(zip(aln.fetch(chrom),aln.fetch(aln.references[1]))): + if ref!=alt: + variants.append( + { + "chrom": aln.references[0], + "pos": i, + "ref": ref, + "alt": alt if alt!="N" else "*" + } + ) + +# create vcf file + +header = pysam.VariantHeader() +header.add_sample("sample1") +header.contigs.add(chrom, length=chrom_len) +header.add_line('##INFO=') +header.add_line('##INFO=') +header.add_line('##FORMAT=') + +vcf = pysam.VariantFile(args.out, "w", header=header) + +# add a variant +for variant in variants: + record = vcf.new_record( + contig=variant["chrom"], + start=variant["pos"], + alleles=[variant["ref"], variant["alt"]], + id=variant['ref'] + str(variant['pos']+1) + (variant['alt'] if variant["alt"]!="*" else variant['ref'] ) + ) + if variant["alt"]=="*": + record.info["AC"] = [0] + record.info["AN"] = 0 + record.samples["sample1"]["GT"] = (None) + + else: + record.info["AC"] = [1] + record.info["AN"] = 1 + record.samples["sample1"]["GT"] = (1) + + # write GT field + vcf.write(record) + +vcf.close() + + + +