Skip to content

Commit

Permalink
Merge pull request #50 from jodyphelan/dev
Browse files Browse the repository at this point in the history
Dev
  • Loading branch information
jodyphelan authored Aug 29, 2024
2 parents b0ba6e2 + d9e0e96 commit 67cd363
Show file tree
Hide file tree
Showing 11 changed files with 188 additions and 38 deletions.
2 changes: 1 addition & 1 deletion pathogenprofiler/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,4 +10,4 @@
from .cli import *
from .rules import *
from .models import *
__version__ = "4.3.0"
__version__ = "4.4.0"
11 changes: 9 additions & 2 deletions pathogenprofiler/bam.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,9 +85,16 @@ def run_delly(self,ref_file: str,bed_file: str) -> Vcf:
self.bed_file = bed_file
self.ref_file = ref_file
if self.platform=="illumina":
run_cmd("delly call -t DEL -g %(ref_file)s %(bam_file)s -o %(prefix)s.delly.bcf" % vars(self))
cmd = "delly call -t DEL -g %(ref_file)s %(bam_file)s -o %(prefix)s.delly.bcf" % vars(self)
result = run_cmd(cmd, exit_on_error=False)
else:
run_cmd("delly lr -t DEL -g %(ref_file)s %(bam_file)s -o %(prefix)s.delly.bcf" % vars(self))
cmd = "delly lr -t DEL -g %(ref_file)s %(bam_file)s -o %(prefix)s.delly.bcf" % vars(self)
result = run_cmd(cmd, exit_on_error=False)

exitcode = result.returncode
if exitcode!=0:
logging.error("Delly failed, skipping")
return None

run_cmd("bcftools view -c 2 %(prefix)s.delly.bcf | bcftools view -e '(INFO/END-POS)>=100000' -Oz -o %(prefix)s.delly.vcf.gz" % vars(self))
run_cmd("bcftools index %(prefix)s.delly.vcf.gz" % vars(self))
Expand Down
3 changes: 2 additions & 1 deletion pathogenprofiler/cli.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from .fastq import Fastq
from .utils import run_cmd, cmd_out
from .bam import Bam
from .db import get_db
from .db import get_db, check_db_exists
from .fasta import Fasta, Paf
from .profiler import vcf_profiler, bam_barcoder, vcf_barcoder
import logging
Expand Down Expand Up @@ -335,6 +335,7 @@ def set_species(args: argparse.Namespace) -> SpeciesPrediction:
>>> species_prediction.species
[Species(species='Mycobacterium abscessus', prediction_info=None)]
"""
check_db_exists(args.software_name,args.resistance_db)
conf = get_db(args.software_name,args.resistance_db)
species = Species(
species=conf["species"]
Expand Down
43 changes: 33 additions & 10 deletions pathogenprofiler/db.py
Original file line number Diff line number Diff line change
Expand Up @@ -126,7 +126,7 @@ def write_bed(db: dict,gene_dict: dict,gene_info: List[Gene],ref_file: str,outfi
if len(drugs)==0:
drugs = "None"
else:
drugs = ",".join(drugs)
drugs = ",".join(sorted(list(drugs)))
lines.append([
gene_info[gene].chrom,
str(genome_start),
Expand Down Expand Up @@ -574,12 +574,30 @@ def check_db_files(variables):
if not os.path.isfile(val+".fai"):
index_ref(val)

def is_db_path(string):
if "/" in string:
return True
elif os.path.isfile(string+".variables.json"):
return True
return False

def check_db_exists(software_name,db_name):
db = get_db(software_name=software_name,db_name=db_name,verbose=False)
if db is None:
share_path = f"{sys.base_prefix}/share/{software_name}/"
logging.error(f"DB {db_name} does not exist in the current directory or in {share_path}")
raise FileExistsError


def get_db(software_name,db_name):
if "/" in db_name:
share_path = "/".join(db_name.split("/")[:-1])
db_name = db_name.split("/")[-1]
variable_file_name = f"{share_path}/{db_name}.variables.json"
def get_db(software_name,db_name,verbose=True):
if is_db_path(db_name):
if "/" in db_name:
share_path = "/".join(db_name.split("/")[:-1])
db_name = db_name.split("/")[-1]
variable_file_name = f"{share_path}/{db_name}.variables.json"
else:
share_path = '.'
variable_file_name = f"{share_path}/{db_name}.variables.json"
else:
share_path = f"{sys.base_prefix}/share/{software_name}/"
variable_file_name = get_variable_file_name(software_name,db_name)
Expand All @@ -588,7 +606,8 @@ def get_db(software_name,db_name):
return None
variables = json.load(open(variable_file_name))
for key,val in variables['files'].items():
logging.info(f"Using {key} file: {share_path}/{val}")
if verbose:
logging.info(f"Using {key} file: {share_path}/{val}")
if ".json" in val:
variables[key] = json.load(open(f"{share_path}/{val}"))
elif key=='rules':
Expand All @@ -608,9 +627,13 @@ def list_db(software_name):


def create_species_db(args,extra_files = None):
variables = json.load(open("variables.json"))
if not extra_files:
extra_files = {}
version = {"name":args.prefix}
version = {
"db-schema-version":variables['db-schema-version'],
"name":args.prefix
}
if os.path.isdir('.git'):
for l in pp.cmd_out("git log | head -4"):
row = l.strip().split()
Expand All @@ -626,12 +649,12 @@ def create_species_db(args,extra_files = None):
shutil.copyfile(file,target)

variables_file = args.prefix+".variables.json"
variables = {
variables.update({
"version": version,
"files":{
"variables": variables_file
}
}
})

if extra_files:
for key,val in extra_files.items():
Expand Down
28 changes: 27 additions & 1 deletion pathogenprofiler/hgvs.py
Original file line number Diff line number Diff line change
Expand Up @@ -332,4 +332,30 @@ def verify_mutation_list(hgvs_mutations: List[dict], genes: List[Gene], refseq:
converted_mutations[key] = mutation_conversion[key]

logging.debug(converted_mutations)
return converted_mutations
return converted_mutations


def split_protein_hgvs(hgvs) -> List[str]:
"""
Split a protein HGVS into its components.
Parameters:
hgvs (str): The protein HGVS.
Returns:
List[str]: A list of protein HGVS components.
Example:
>>> split_protein_hgvs("p.MetAsnLys74IleGluThr")
['p.Met74Ile', 'p.Asn74Glu', 'p.Lys74Thr']
"""
# Remove the 'p.' prefix
hgvs = hgvs[2:]
components = []
aa = re.findall(r'[A-Z][a-z][a-z]', hgvs)
refs = aa[:len(aa)//2]
alts = aa[len(aa)//2:]
start_pos = int(re.search(r'\d+', hgvs).group())
for i,(ref,alt) in enumerate(zip(refs, alts)):
components.append(f'p.{ref}{start_pos+i}{alt}')
return components
2 changes: 1 addition & 1 deletion pathogenprofiler/mutation_db.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ def add(self,data: Union[dict,list]) -> None:
self.container.add(json.dumps(data))

def to_dict_list(self) -> List[dict]:
return [json.loads(d) for d in self.container]
return [json.loads(d) for d in sorted(list(self.container))]



Expand Down
3 changes: 2 additions & 1 deletion pathogenprofiler/profiler.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,8 @@ def vcf_profiler(args: argparse.Namespace) -> List[Union[Variant,DrVariant,Gene,
vcf_targets_file = "%s.targets.vcf.gz" % args.files_prefix
if not vcf_is_indexed(args.vcf):
run_cmd("bcftools index %s" % args.vcf)
# run_cmd("bcftools view -R %s %s -Oz -o %s" % (conf["bed"],args.vcf,vcf_targets_file))
if args.caller == 'freebayes-haplotype':
args.supplementary_bam = None
annotated_variants = vcf_variant_profiler(conf, args.files_prefix, args.vcf, bam_for_phasing=args.supplementary_bam)
return annotated_variants

Expand Down
44 changes: 41 additions & 3 deletions pathogenprofiler/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,43 @@

shared_dict = {}

def get_version(tool):
cmds = {
'bcftools': 'bcftools --version',
'samtools': 'samtools --version',
'delly': 'delly --version',
'bwa': 'bwa',
'trimmomatic': 'trimmomatic -version',
'gatk': 'gatk -version',
'lofreq': 'lofreq version',
'bedtools': 'bedtools --version',
'minimap2': 'minimap2 --version',
'freebayes': 'freebayes --version',
'pilon': 'pilon --version',
'snpEff': 'snpEff -version',
'kmc': 'kmc',
'sourmash': 'sourmash --version',
}
regex = {
'bcftools': r'bcftools (\d+\.\d+\.?\d?)',
'samtools': r'samtools (\d+\.\d+\.?\d?)',
'delly': r'Delly version: v(\d+\.\d+\.?\d?)',
'bwa': r'Version: (\d+\.\d+\.?\d?)',
'trimmomatic': r'(\d+\.\d+)',
'gatk': r'The Genome Analysis Toolkit \(GATK\) v(\d+\.\d+\.?\d?)',
'lofreq': r'version: (\d+\.\d+\.?\d?)',
'bedtools': r'bedtools v(\d+\.\d+\.?\d?)',
'minimap2': r'(\d+\.\d+)',
'freebayes': r'version: v(\d+\.\d+\.?\d?)',
'pilon': r'Pilon version (\d+\.\d+\.?\d?)',
'snpEff': r'SnpEff\t(\d+\.\d+.?)',
'kmc': r'K-Mer Counter \(KMC\) ver. (\d+\.\d+\.\d+)',
'sourmash': r'sourmash (\d+\.\d+\.\d?)',
}
x = sp.run([cmds[tool]], shell=True, stdout=sp.PIPE, stderr=sp.PIPE)
text = x.stdout.decode('utf-8') + x.stderr.decode('utf-8')
version = re.search(regex[tool], text).group(1)
return version

class TempFilePrefix(object):
"""Create a temporary file prefix"""
Expand Down Expand Up @@ -482,10 +519,10 @@ def add_arguments_to_self(self,args: dict) -> None:



def run_cmd(cmd: str, desc=None, log: str=None) -> sp.CompletedProcess:
def run_cmd(cmd: str, desc=None, log: str=None, exit_on_error: bool=True) -> sp.CompletedProcess:
if desc:
logging.info(desc)
programs = set([x.strip().split()[0] for x in re.split("[|&;]",cmd) if x!=""])
programs = set([x.strip().split()[0] for x in re.split("[|&;]",cmd.strip()) if x!=""])
missing = [p for p in programs if which(p)==False]
if len(missing)>0:
raise ValueError("Cant find programs: %s\n" % (", ".join(missing)))
Expand All @@ -495,7 +532,8 @@ def run_cmd(cmd: str, desc=None, log: str=None) -> sp.CompletedProcess:
result = sp.run(cmd,shell=True,stderr=output,stdout=output)
if result.returncode != 0:
logging.error(result.stderr.decode("utf-8"))
raise ValueError("Command Failed:\n%s\nstderr:\n%s" % (cmd,result.stderr.decode()))
if exit_on_error:
raise ValueError("Command Failed:\n%s\nstderr:\n%s" % (cmd,result.stderr.decode()))
return result

def cmd_out(cmd: str) -> Generator[str, None, None]:
Expand Down
16 changes: 16 additions & 0 deletions pathogenprofiler/variant_calling.py
Original file line number Diff line number Diff line change
Expand Up @@ -130,3 +130,19 @@ def call_variants(self) -> Vcf:
raise NotImplementedError("%s not implemented for %s platform" % (self.__software__,self.platform))
return self.run_calling(self.calling_cmd)

class FreebayesHaplotypeCaller(VariantCaller):
__software__ = "freebayes-haplotype"
def call_variants(self) -> Vcf:
# Call variants using Lofreq

self.calling_cmd = """
samtools view -T %(ref_file)s -h %(bam_file)s {region} %(samclip_cmd)s | \
samtools view -b > %(temp_file_prefix)s.{region_safe}.tmp.bam && \
samtools index %(temp_file_prefix)s.{region_safe}.tmp.bam && \
freebayes -f %(ref_file)s -r {region} %(calling_params)s %(temp_file_prefix)s.{region_safe}.tmp.bam | \
bcftools view -Oz -o %(temp_file_prefix)s.{region_safe}.tmp.vcf.gz && \
tabix %(temp_file_prefix)s.{region_safe}.tmp.vcf.gz && \
freebayes -f %(ref_file)s -r {region} %(calling_params)s %(temp_file_prefix)s.{region_safe}.tmp.bam --haplotype-length 100 --haplotype-basis-alleles %(temp_file_prefix)s.{region_safe}.tmp.vcf.gz | bcftools norm -a -Oz -o %(temp_file_prefix)s.{region_safe}.vcf.gz
""" % vars(self)

return self.run_calling(self.calling_cmd)
24 changes: 17 additions & 7 deletions pathogenprofiler/vcf.py
Original file line number Diff line number Diff line change
Expand Up @@ -135,10 +135,11 @@ def run_snpeff(self,db,ref_file,gff_file,rename_chroms = None, split_indels=True
self.phasing_bam = f"--bam {bam_for_phasing}"
else:
self.phasing_bam = ""
run_cmd("bcftools view -c 1 -a %(filename)s | bcftools view -v snps | combine_vcf_variants.py --ref %(ref_file)s --gff %(gff_file)s %(phasing_bam)s | %(rename_cmd)s snpEff ann %(snpeff_data_dir_opt)s -noLog -noStats %(db)s - %(re_rename_cmd)s | bcftools sort -Oz -o %(tmp_file1)s && bcftools index %(tmp_file1)s" % vars(self))
run_cmd("bcftools view -c 1 -a %(filename)s | bcftools view -v indels | realign_tandem_deletions.py - %(ref_file)s %(gff_file)s - | %(rename_cmd)s snpEff ann %(snpeff_data_dir_opt)s -noLog -noStats %(db)s - %(re_rename_cmd)s | bcftools sort -Oz -o %(tmp_file2)s && bcftools index %(tmp_file2)s" % vars(self))
run_cmd("bcftools view -c 1 -a %(filename)s | bcftools view -v other | realign_tandem_deletions.py - %(ref_file)s %(gff_file)s - | %(rename_cmd)s snpEff ann %(snpeff_data_dir_opt)s -noLog -noStats %(db)s - %(re_rename_cmd)s | bcftools sort -Oz -o %(tmp_file3)s && bcftools index %(tmp_file3)s" % vars(self))
run_cmd("bcftools concat -a %(tmp_file1)s %(tmp_file2)s %(tmp_file3)s | bcftools sort -Oz -o %(vcf_csq_file)s" % vars(self))
run_cmd("bcftools view -c 1 -a %(filename)s | realign_tandem_deletions.py - %(ref_file)s %(gff_file)s - | combine_vcf_variants.py --ref %(ref_file)s --gff %(gff_file)s %(phasing_bam)s | %(rename_cmd)s snpEff ann %(snpeff_data_dir_opt)s -noLog -noStats %(db)s - %(re_rename_cmd)s | bcftools sort -Oz -o %(vcf_csq_file)s" % vars(self))
# run_cmd("bcftools view -c 1 -a %(filename)s | bcftools view -v snps | combine_vcf_variants.py --ref %(ref_file)s --gff %(gff_file)s %(phasing_bam)s | %(rename_cmd)s snpEff ann %(snpeff_data_dir_opt)s -noLog -noStats %(db)s - %(re_rename_cmd)s | bcftools sort -Oz -o %(tmp_file1)s && bcftools index %(tmp_file1)s" % vars(self))
# run_cmd("bcftools view -c 1 -a %(filename)s | bcftools view -v indels | realign_tandem_deletions.py - %(ref_file)s %(gff_file)s - | %(rename_cmd)s snpEff ann %(snpeff_data_dir_opt)s -noLog -noStats %(db)s - %(re_rename_cmd)s | bcftools sort -Oz -o %(tmp_file2)s && bcftools index %(tmp_file2)s" % vars(self))
# run_cmd("bcftools view -c 1 -a %(filename)s | bcftools view -v other | realign_tandem_deletions.py - %(ref_file)s %(gff_file)s - | %(rename_cmd)s snpEff ann %(snpeff_data_dir_opt)s -noLog -noStats %(db)s - %(re_rename_cmd)s | bcftools sort -Oz -o %(tmp_file3)s && bcftools index %(tmp_file3)s" % vars(self))
# run_cmd("bcftools concat -a %(tmp_file1)s %(tmp_file2)s %(tmp_file3)s | bcftools sort -Oz -o %(vcf_csq_file)s" % vars(self))
else :
run_cmd("bcftools view %(filename)s | %(rename_cmd)s snpEff ann %(snpeff_data_dir_opt)s -noLog -noStats %(db)s - %(re_rename_cmd)s | bcftools view -Oz -o %(vcf_csq_file)s" % vars(self))
return Vcf(self.vcf_csq_file,self.prefix)
Expand Down Expand Up @@ -186,7 +187,7 @@ def load_ann(
variants = []
vcf = pysam.VariantFile(self.filename)
for var in vcf:
logging.debug(str(var)[:500])
logging.debug(str(var)[:5000])
chrom = var.chrom
pos = var.pos
ref = var.ref
Expand All @@ -211,14 +212,23 @@ def load_ann(
ann_list = [x.split("|") for x in ann_strs]
for alt in alleles[1:]:
strand_support = get_stand_support(var,alt)

if strand_support[0] != None:
freq = sum(strand_support)/sum(ad)
else:
if 'QNAME' in var.info:
freq = 1.0
elif sv:
freq = af_dict[alt]
else:
raise NotImplementedError

tmp_var = Variant(
chrom = chrom,
pos = int(pos),
ref = ref,
alt = alt,
depth = sum(ad),
freq = af_dict[alt],
freq = freq,
forward_reads = strand_support[0],
reverse_reads = strand_support[1],
sv = sv,
Expand Down
Loading

0 comments on commit 67cd363

Please sign in to comment.