Skip to content

Commit

Permalink
Merge pull request #57 from jodyphelan/dev
Browse files Browse the repository at this point in the history
Dev
  • Loading branch information
jodyphelan authored Dec 2, 2024
2 parents c3de437 + 552a04f commit 9fb0db7
Show file tree
Hide file tree
Showing 4 changed files with 108 additions and 15 deletions.
16 changes: 11 additions & 5 deletions pathogenprofiler/bam.py
Original file line number Diff line number Diff line change
Expand Up @@ -301,20 +301,26 @@ def get_bed_gt(self,bed_file: str,ref_file: str,caller: str,platform: str):
cmd = "freebayes -f %(ref_file)s -t %(bed_file)s %(prefix)s.tmp.bam --haplotype-length -1 | bcftools query -f '%%CHROM\\t%%POS\\t%%REF\\t%%ALT[\\t%%GT\\t%%AD]\\n'" % vars(self)
elif caller == "bcftools":
cmd = "bcftools mpileup -f %(ref_file)s -T %(bed_file)s %(prefix)s.tmp.bam -BI -a AD | bcftools call -mv | bcftools query -f '%%CHROM\\t%%POS\\t%%REF\\t%%ALT[\\t%%GT\\t%%AD]\\n'" % vars(self)
elif caller == "mpileup":
cmd = "bcftools mpileup -f %(ref_file)s -T %(bed_file)s %(prefix)s.tmp.bam -BI -a AD | bcftools query -f '%%CHROM\\t%%POS\\t%%REF\\t%%ALT[\\t%%AD]\\n'" % vars(self)
else:
cmd = "freebayes -f %(ref_file)s -t %(bed_file)s %(prefix)s.tmp.bam --haplotype-length -1 | bcftools query -f '%%CHROM\\t%%POS\\t%%REF\\t%%ALT[\\t%%GT\\t%%AD]\\n'" % vars(self)

for l in cmd_out(cmd):
# Chromosome 4348079 0/0 51
chrom, pos, ref, alt, gt, ad = l.rstrip().split()
row = l.strip().split()
if len(row)==6:
chrom, pos, ref, alt, gt, ad = l.rstrip().split()
elif len(row)==5:
chrom, pos, ref, alt, ad = l.rstrip().split()
gt = None

p = GenomePosition(chrom=chrom,pos=int(pos))
d = {}
alts = alt.split(",")
ad = [int(x) for x in ad.split(",")]
# if gt == "0/0":
# d[ref] = ad[0]
# elif gt == "./.":
if gt == "./.":

if gt and gt == "./.":
d[ref] = 0
else:
genotypes = list([ref]+alts)
Expand Down
34 changes: 27 additions & 7 deletions pathogenprofiler/barcode.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,9 @@ def get_barcoding_mutations(mutations: dict, barcode_bed: str) -> tuple[dict, Li
return (barcode_support,snps_report)


def barcode(mutations,barcode_bed: str,snps_file=None) -> List[BarcodeResult]:
def barcode(mutations,barcode_bed: str,snps_file=None,stdev_cutoff=0.15) -> List[BarcodeResult]:
if stdev_cutoff is None:
stdev_cutoff = 0.15
bed_num_col = len(open(barcode_bed).readline().rstrip().split("\t"))
# bed = []
lineage_info = {}
Expand All @@ -75,24 +77,42 @@ def barcode(mutations,barcode_bed: str,snps_file=None) -> List[BarcodeResult]:

barcode_frac = defaultdict(float)
for l in barcode_support:
# If stdev of fraction across all barcoding positions > 0.15
logging.debug("Processing %s" % l)
logging.debug(barcode_support[l])
# If stdev of fraction across all barcoding positions > stdev_cutoff
# Only look at positions with >5 reads
tmp_allelic_dp = [x[1]/(x[0]+x[1]) for x in barcode_support[l] if sum(x)>5]
logging.debug(tmp_allelic_dp)

# remove positions with no SNP (fraction=0)
tmp_allelic_dp = [x for x in tmp_allelic_dp if x>0]
if len(tmp_allelic_dp)==0: continue
if stdev(tmp_allelic_dp)>0.15: continue
logging.debug(tmp_allelic_dp)

if len(tmp_allelic_dp)==0:
logging.debug("No SNPs found for %s" % l)
continue
if stdev(tmp_allelic_dp)>stdev_cutoff:
logging.debug(f"Stdev {stdev(tmp_allelic_dp)} > {stdev_cutoff} for {l}")
continue

# if number of barcoding positions > 5 and only one shows alternate
num_positions_with_alt = len([x for x in barcode_support[l] if (x[1]/(x[0]+x[1]))>0])
if len(barcode_support[l])>5 and num_positions_with_alt<2: continue
logging.debug("Number of positions with alternate %s" % num_positions_with_alt)
if len(barcode_support[l])>5 and num_positions_with_alt<2:
logging.debug("Number of positions with alternate <2 for %s" % l)
continue
# if number of barcoding positions > 5 and there are less than 5% of possible positions with alternate
if len(barcode_support[l])>5 and num_positions_with_alt<=0.10*len(barcode_support[l]): continue
if len(barcode_support[l])>5 and num_positions_with_alt<=0.10*len(barcode_support[l]):
logging.debug("Number of positions with alternate < 10 percent for %s" % l)
continue

barcode_pos_reads = sum([x[1] for x in barcode_support[l]])
barcode_neg_reads = sum([x[0] for x in barcode_support[l]])
lf = barcode_pos_reads/(barcode_pos_reads+barcode_neg_reads)
if lf<0.05:continue
logging.debug("Barcode %s has frequency %s" % (l,lf))
if lf<0.05:
logging.debug("Frequency < 0.05 for %s" % l)
continue
barcode_frac[l] = lf
final_results = []

Expand Down
8 changes: 5 additions & 3 deletions pathogenprofiler/profiler.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,9 @@ def bam_barcoder(args: argparse.Namespace) -> List[BarcodeResult]:
bam = Bam(args.bam, args.files_prefix, platform=args.platform, threads=args.threads)
if not hasattr(args,'barcode_snps'):
args.barcode_snps = None
barcode_mutations = bam.get_bed_gt(conf["barcode"],conf["ref"], caller=args.caller,platform=args.platform)
barcode_assignment = barcode(barcode_mutations,conf["barcode"],args.barcode_snps)
barcode_mutations = bam.get_bed_gt(conf["barcode"],conf["ref"], caller=args.caller,platform=args.platform)
stdev_cutoff = args.barcode_stdev if hasattr(args,'barcode_stdev') else None
barcode_assignment = barcode(barcode_mutations,conf["barcode"],args.barcode_snps,stdev_cutoff=stdev_cutoff)
return barcode_assignment

def vcf_barcoder(args: argparse.Namespace) -> List[BarcodeResult]:
Expand All @@ -29,7 +30,8 @@ def vcf_barcoder(args: argparse.Namespace) -> List[BarcodeResult]:
barcode_mutations = vcf.get_bed_gt(conf["barcode"],conf["ref"])
if not hasattr(args,'barcode_snps'):
args.barcode_snps = None
barcode_assignment = barcode(barcode_mutations,conf["barcode"],args.barcode_snps)
stdev_cutoff = args.barcode_stdev if hasattr(args,'barcode_stdev') else None
barcode_assignment = barcode(barcode_mutations,conf["barcode"],args.barcode_snps,stdev_cutoff=stdev_cutoff)
return barcode_assignment


Expand Down
65 changes: 65 additions & 0 deletions scripts/fa2vcf.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
#! /usr/bin/env python
import argparse
import pysam

parser = argparse.ArgumentParser(description='Convert fasta to vcf')
parser.add_argument('aln', help='Alignment file')
parser.add_argument('out', help='Output vcf file')

args = parser.parse_args()



aln = pysam.FastaFile(args.aln)
chrom = aln.references[0]
chrom_len = aln.get_reference_length(chrom)

variants = []
for i,(ref,alt) in enumerate(zip(aln.fetch(chrom),aln.fetch(aln.references[1]))):
if ref!=alt:
variants.append(
{
"chrom": aln.references[0],
"pos": i,
"ref": ref,
"alt": alt if alt!="N" else "*"
}
)

# create vcf file

header = pysam.VariantHeader()
header.add_sample("sample1")
header.contigs.add(chrom, length=chrom_len)
header.add_line('##INFO=<ID=AC,Number=A,Type=Integer,Description="Allele count in genotypes">')
header.add_line('##INFO=<ID=AN,Number=1,Type=Integer,Description="Total number of alleles in called genotypes">')
header.add_line('##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">')

vcf = pysam.VariantFile(args.out, "w", header=header)

# add a variant
for variant in variants:
record = vcf.new_record(
contig=variant["chrom"],
start=variant["pos"],
alleles=[variant["ref"], variant["alt"]],
id=variant['ref'] + str(variant['pos']+1) + (variant['alt'] if variant["alt"]!="*" else variant['ref'] )
)
if variant["alt"]=="*":
record.info["AC"] = [0]
record.info["AN"] = 0
record.samples["sample1"]["GT"] = (None)

else:
record.info["AC"] = [1]
record.info["AN"] = 1
record.samples["sample1"]["GT"] = (1)

# write GT field
vcf.write(record)

vcf.close()




0 comments on commit 9fb0db7

Please sign in to comment.