Skip to content

Commit

Permalink
Merge pull request #48 from jodyphelan/annotation_speedup
Browse files Browse the repository at this point in the history
Annotation speedup
  • Loading branch information
jodyphelan authored Jul 31, 2024
2 parents 3fa820c + ae854c7 commit dab2fb1
Show file tree
Hide file tree
Showing 4 changed files with 47 additions and 44 deletions.
2 changes: 1 addition & 1 deletion pathogenprofiler/mutation_db.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ def add(self,data: Union[dict,list]) -> None:
self.container.add(json.dumps(data))

def to_dict_list(self) -> List[dict]:
return [json.loads(d) for d in self.container]
return [json.loads(d) for d in sorted(list(self.container))]



Expand Down
62 changes: 31 additions & 31 deletions pathogenprofiler/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,37 +22,37 @@

def get_version(tool):
cmds = {
'bcftools': 'bcftools --version',
'samtools': 'samtools --version',
'delly': 'delly --version',
'bwa': 'bwa',
'trimmomatic': 'trimmomatic -version',
'gatk': 'gatk -version',
'lofreq': 'lofreq version',
'bedtools': 'bedtools --version',
'minimap2': 'minimap2 --version',
'freebayes': 'freebayes --version',
'pilon': 'pilon --version',
'snpEff': 'snpEff -version',
'kmc': 'kmc',
'sourmash': 'sourmash --version',
}
regex = {
'bcftools': r'bcftools (\d+\.\d+\.?\d?)',
'samtools': r'samtools (\d+\.\d+\.?\d?)',
'delly': r'Delly version: v(\d+\.\d+\.?\d?)',
'bwa': r'Version: (\d+\.\d+\.?\d?)',
'trimmomatic': r'(\d+\.\d+)',
'gatk': r'The Genome Analysis Toolkit \(GATK\) v(\d+\.\d+\.?\d?)',
'lofreq': r'version: (\d+\.\d+\.?\d?)',
'bedtools': r'bedtools v(\d+\.\d+\.?\d?)',
'minimap2': r'(\d+\.\d+)',
'freebayes': r'version: v(\d+\.\d+\.?\d?)',
'pilon': r'Pilon version (\d+\.\d+\.?\d?)',
'snpEff': r'SnpEff\t(\d+\.\d+.?)',
'kmc': r'K-Mer Counter \(KMC\) ver. (\d+\.\d+\.\d+)',
'sourmash': r'sourmash (\d+\.\d+\.\d?)',
}
'bcftools': 'bcftools --version',
'samtools': 'samtools --version',
'delly': 'delly --version',
'bwa': 'bwa',
'trimmomatic': 'trimmomatic -version',
'gatk': 'gatk -version',
'lofreq': 'lofreq version',
'bedtools': 'bedtools --version',
'minimap2': 'minimap2 --version',
'freebayes': 'freebayes --version',
'pilon': 'pilon --version',
'snpEff': 'snpEff -version',
'kmc': 'kmc',
'sourmash': 'sourmash --version',
}
regex = {
'bcftools': r'bcftools (\d+\.\d+\.?\d?)',
'samtools': r'samtools (\d+\.\d+\.?\d?)',
'delly': r'Delly version: v(\d+\.\d+\.?\d?)',
'bwa': r'Version: (\d+\.\d+\.?\d?)',
'trimmomatic': r'(\d+\.\d+)',
'gatk': r'The Genome Analysis Toolkit \(GATK\) v(\d+\.\d+\.?\d?)',
'lofreq': r'version: (\d+\.\d+\.?\d?)',
'bedtools': r'bedtools v(\d+\.\d+\.?\d?)',
'minimap2': r'(\d+\.\d+)',
'freebayes': r'version: v(\d+\.\d+\.?\d?)',
'pilon': r'Pilon version (\d+\.\d+\.?\d?)',
'snpEff': r'SnpEff\t(\d+\.\d+.?)',
'kmc': r'K-Mer Counter \(KMC\) ver. (\d+\.\d+\.\d+)',
'sourmash': r'sourmash (\d+\.\d+\.\d?)',
}
x = sp.run([cmds[tool]], shell=True, stdout=sp.PIPE, stderr=sp.PIPE)
text = x.stdout.decode('utf-8') + x.stderr.decode('utf-8')
version = re.search(regex[tool], text).group(1)
Expand Down
9 changes: 5 additions & 4 deletions pathogenprofiler/vcf.py
Original file line number Diff line number Diff line change
Expand Up @@ -135,10 +135,11 @@ def run_snpeff(self,db,ref_file,gff_file,rename_chroms = None, split_indels=True
self.phasing_bam = f"--bam {bam_for_phasing}"
else:
self.phasing_bam = ""
run_cmd("bcftools view -c 1 -a %(filename)s | bcftools view -v snps | combine_vcf_variants.py --ref %(ref_file)s --gff %(gff_file)s %(phasing_bam)s | %(rename_cmd)s snpEff ann %(snpeff_data_dir_opt)s -noLog -noStats %(db)s - %(re_rename_cmd)s | bcftools sort -Oz -o %(tmp_file1)s && bcftools index %(tmp_file1)s" % vars(self))
run_cmd("bcftools view -c 1 -a %(filename)s | bcftools view -v indels | realign_tandem_deletions.py - %(ref_file)s %(gff_file)s - | %(rename_cmd)s snpEff ann %(snpeff_data_dir_opt)s -noLog -noStats %(db)s - %(re_rename_cmd)s | bcftools sort -Oz -o %(tmp_file2)s && bcftools index %(tmp_file2)s" % vars(self))
run_cmd("bcftools view -c 1 -a %(filename)s | bcftools view -v other | realign_tandem_deletions.py - %(ref_file)s %(gff_file)s - | %(rename_cmd)s snpEff ann %(snpeff_data_dir_opt)s -noLog -noStats %(db)s - %(re_rename_cmd)s | bcftools sort -Oz -o %(tmp_file3)s && bcftools index %(tmp_file3)s" % vars(self))
run_cmd("bcftools concat -a %(tmp_file1)s %(tmp_file2)s %(tmp_file3)s | bcftools sort -Oz -o %(vcf_csq_file)s" % vars(self))
run_cmd("bcftools view -c 1 -a %(filename)s | realign_tandem_deletions.py - %(ref_file)s %(gff_file)s - | combine_vcf_variants.py --ref %(ref_file)s --gff %(gff_file)s %(phasing_bam)s | %(rename_cmd)s snpEff ann %(snpeff_data_dir_opt)s -noLog -noStats %(db)s - %(re_rename_cmd)s | bcftools sort -Oz -o %(vcf_csq_file)s" % vars(self))
# run_cmd("bcftools view -c 1 -a %(filename)s | bcftools view -v snps | combine_vcf_variants.py --ref %(ref_file)s --gff %(gff_file)s %(phasing_bam)s | %(rename_cmd)s snpEff ann %(snpeff_data_dir_opt)s -noLog -noStats %(db)s - %(re_rename_cmd)s | bcftools sort -Oz -o %(tmp_file1)s && bcftools index %(tmp_file1)s" % vars(self))
# run_cmd("bcftools view -c 1 -a %(filename)s | bcftools view -v indels | realign_tandem_deletions.py - %(ref_file)s %(gff_file)s - | %(rename_cmd)s snpEff ann %(snpeff_data_dir_opt)s -noLog -noStats %(db)s - %(re_rename_cmd)s | bcftools sort -Oz -o %(tmp_file2)s && bcftools index %(tmp_file2)s" % vars(self))
# run_cmd("bcftools view -c 1 -a %(filename)s | bcftools view -v other | realign_tandem_deletions.py - %(ref_file)s %(gff_file)s - | %(rename_cmd)s snpEff ann %(snpeff_data_dir_opt)s -noLog -noStats %(db)s - %(re_rename_cmd)s | bcftools sort -Oz -o %(tmp_file3)s && bcftools index %(tmp_file3)s" % vars(self))
# run_cmd("bcftools concat -a %(tmp_file1)s %(tmp_file2)s %(tmp_file3)s | bcftools sort -Oz -o %(vcf_csq_file)s" % vars(self))
else :
run_cmd("bcftools view %(filename)s | %(rename_cmd)s snpEff ann %(snpeff_data_dir_opt)s -noLog -noStats %(db)s - %(re_rename_cmd)s | bcftools view -Oz -o %(vcf_csq_file)s" % vars(self))
return Vcf(self.vcf_csq_file,self.prefix)
Expand Down
18 changes: 10 additions & 8 deletions scripts/combine_vcf_variants.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,13 +69,6 @@ def get_haplotype_counts(
return counts


gff = load_gff(args.gff)
exons = []
for gene in gff:
for transcript in gene.transcripts:
for i,exon in enumerate(transcript.exons):
exon.id = f"{gene.gene_id}_exon{i+1}"
exons.append(exon)

def get_overlapping_exons(chrom: str,pos: int,exons: List[Exon]):
exons = [e for e in exons if e.start<=pos and e.end>=pos and chrom==e.chrom]
Expand All @@ -95,6 +88,13 @@ def get_codon_pos(chrom: str,pos: int,exons: List[Exon]):
return (e.id,codon_pos)


gff = load_gff(args.gff)
exons = []
for gene in gff:
for transcript in gene.transcripts:
for i,exon in enumerate(transcript.exons):
exon.id = f"{gene.gene_id}_exon{i+1}"
exons.append(exon)

ref = pysam.FastaFile(args.ref)
coding_variants = defaultdict(list)
Expand All @@ -107,7 +107,9 @@ def get_codon_pos(chrom: str,pos: int,exons: List[Exon]):

for var in vcf:
gene,cpos = get_codon_pos(var.chrom,var.pos,exons)
if gene==None:
if var.rlen!=1 or len(var.alts[0])!=1:
other_variants.append(var)
elif gene==None:
other_variants.append(var)
else:
coding_variants[(gene,cpos)].append(var)
Expand Down

0 comments on commit dab2fb1

Please sign in to comment.