diff --git a/scripts/combine_vcf_variants.py b/scripts/combine_vcf_variants.py index ee1a121..a5e925f 100755 --- a/scripts/combine_vcf_variants.py +++ b/scripts/combine_vcf_variants.py @@ -40,6 +40,7 @@ def get_alleles( continue if read.query_qualities[read_pos] < 12: continue + if read_nt.islower(): read_nt = read.query_sequence[read_pos].upper() @@ -107,6 +108,14 @@ def get_codon_pos(chrom: str,pos: int,exons: List[Exon]): exon.id = f"{gene.gene_id}_exon{i+1}" exons.append(exon) +def has_md_tag(bam): + for read in bam.fetch(): + break + if read.has_tag('MD'): + return True + else: + return False + ref = pysam.FastaFile(args.ref) coding_variants = defaultdict(list) other_variants = [] @@ -117,6 +126,7 @@ def get_codon_pos(chrom: str,pos: int,exons: List[Exon]): vcf.header.add_line('##INFO=') for var in vcf: + logging.debug(var) gene,cpos = get_codon_pos(var.chrom,var.pos,exons) if var.rlen!=1 or len(var.alts[0])!=1: other_variants.append(var) @@ -125,15 +135,12 @@ def get_codon_pos(chrom: str,pos: int,exons: List[Exon]): else: coding_variants[(gene,cpos)].append(var) -def has_md_tag(bam): - for read in bam.fetch(): - break - if read.has_tag('MD'): - return True - else: - return False + +logging.debug(coding_variants[('PVP01_1429500_exon2', 164)][0]) for key,variants in coding_variants.items(): + logging.debug((key,[str(v) for v in variants])) + logging.debug(coding_variants[('PVP01_1429500_exon2', 164)][0]) if len(variants)==1: other_variants.append(variants[0]) continue @@ -145,11 +152,11 @@ def has_md_tag(bam): ref_hap = ''.join([ref.fetch(p.chrom,p.pos-1,p.pos) for p in positions]) if args.bam and has_md_tag(pysam.AlignmentFile(args.bam)): haplotypes_by_strand = get_haplotype_counts(pysam.AlignmentFile(args.bam),positions,bystrand=True) - + logging.debug(haplotypes_by_strand) else: # alt is just a combination of all the alt alleles alt_hap = ''.join([v.alts[0] for v in variants]) - + logging.debug('adkasjdaosdjaoisd') ds = defaultdict(list) for v in variants: ds[v.pos].append(v.alts[0]) @@ -204,8 +211,8 @@ def has_md_tag(bam): if 'ADF' in var.samples[0]: print(var) print(hap_fwd,hap_rev) - var.samples[0]['ADF'] = (ref_fwd,hap_fwd) - var.samples[0]['ADR'] = (ref_rev,hap_rev) + variant.samples[0]['ADF'] = (ref_fwd,hap_fwd) + variant.samples[0]['ADR'] = (ref_rev,hap_rev) if 'AD' in variant.samples[0]: variant.samples[0]['AD'] = [dp - count, hap_fwd + hap_rev]