Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Dev #50

Merged
merged 22 commits into from
Aug 29, 2024
Merged

Dev #50

Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion pathogenprofiler/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,4 +10,4 @@
from .cli import *
from .rules import *
from .models import *
__version__ = "4.3.0"
__version__ = "4.4.0"
11 changes: 9 additions & 2 deletions pathogenprofiler/bam.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,9 +85,16 @@ def run_delly(self,ref_file: str,bed_file: str) -> Vcf:
self.bed_file = bed_file
self.ref_file = ref_file
if self.platform=="illumina":
run_cmd("delly call -t DEL -g %(ref_file)s %(bam_file)s -o %(prefix)s.delly.bcf" % vars(self))
cmd = "delly call -t DEL -g %(ref_file)s %(bam_file)s -o %(prefix)s.delly.bcf" % vars(self)
result = run_cmd(cmd, exit_on_error=False)
else:
run_cmd("delly lr -t DEL -g %(ref_file)s %(bam_file)s -o %(prefix)s.delly.bcf" % vars(self))
cmd = "delly lr -t DEL -g %(ref_file)s %(bam_file)s -o %(prefix)s.delly.bcf" % vars(self)
result = run_cmd(cmd, exit_on_error=False)

exitcode = result.returncode
if exitcode!=0:
logging.error("Delly failed, skipping")
return None

run_cmd("bcftools view -c 2 %(prefix)s.delly.bcf | bcftools view -e '(INFO/END-POS)>=100000' -Oz -o %(prefix)s.delly.vcf.gz" % vars(self))
run_cmd("bcftools index %(prefix)s.delly.vcf.gz" % vars(self))
Expand Down
3 changes: 2 additions & 1 deletion pathogenprofiler/cli.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from .fastq import Fastq
from .utils import run_cmd, cmd_out
from .bam import Bam
from .db import get_db
from .db import get_db, check_db_exists
from .fasta import Fasta, Paf
from .profiler import vcf_profiler, bam_barcoder, vcf_barcoder
import logging
Expand Down Expand Up @@ -335,6 +335,7 @@ def set_species(args: argparse.Namespace) -> SpeciesPrediction:
>>> species_prediction.species
[Species(species='Mycobacterium abscessus', prediction_info=None)]
"""
check_db_exists(args.software_name,args.resistance_db)
conf = get_db(args.software_name,args.resistance_db)
species = Species(
species=conf["species"]
Expand Down
43 changes: 33 additions & 10 deletions pathogenprofiler/db.py
Original file line number Diff line number Diff line change
Expand Up @@ -126,7 +126,7 @@ def write_bed(db: dict,gene_dict: dict,gene_info: List[Gene],ref_file: str,outfi
if len(drugs)==0:
drugs = "None"
else:
drugs = ",".join(drugs)
drugs = ",".join(sorted(list(drugs)))
lines.append([
gene_info[gene].chrom,
str(genome_start),
Expand Down Expand Up @@ -574,12 +574,30 @@ def check_db_files(variables):
if not os.path.isfile(val+".fai"):
index_ref(val)

def is_db_path(string):
if "/" in string:
return True
elif os.path.isfile(string+".variables.json"):
return True
return False

def check_db_exists(software_name,db_name):
db = get_db(software_name=software_name,db_name=db_name,verbose=False)
if db is None:
share_path = f"{sys.base_prefix}/share/{software_name}/"
logging.error(f"DB {db_name} does not exist in the current directory or in {share_path}")
raise FileExistsError


def get_db(software_name,db_name):
if "/" in db_name:
share_path = "/".join(db_name.split("/")[:-1])
db_name = db_name.split("/")[-1]
variable_file_name = f"{share_path}/{db_name}.variables.json"
def get_db(software_name,db_name,verbose=True):
if is_db_path(db_name):
if "/" in db_name:
share_path = "/".join(db_name.split("/")[:-1])
db_name = db_name.split("/")[-1]
variable_file_name = f"{share_path}/{db_name}.variables.json"
else:
share_path = '.'
variable_file_name = f"{share_path}/{db_name}.variables.json"
else:
share_path = f"{sys.base_prefix}/share/{software_name}/"
variable_file_name = get_variable_file_name(software_name,db_name)
Expand All @@ -588,7 +606,8 @@ def get_db(software_name,db_name):
return None
variables = json.load(open(variable_file_name))
for key,val in variables['files'].items():
logging.info(f"Using {key} file: {share_path}/{val}")
if verbose:
logging.info(f"Using {key} file: {share_path}/{val}")
if ".json" in val:
variables[key] = json.load(open(f"{share_path}/{val}"))
elif key=='rules':
Expand All @@ -608,9 +627,13 @@ def list_db(software_name):


def create_species_db(args,extra_files = None):
variables = json.load(open("variables.json"))
if not extra_files:
extra_files = {}
version = {"name":args.prefix}
version = {
"db-schema-version":variables['db-schema-version'],
"name":args.prefix
}
if os.path.isdir('.git'):
for l in pp.cmd_out("git log | head -4"):
row = l.strip().split()
Expand All @@ -626,12 +649,12 @@ def create_species_db(args,extra_files = None):
shutil.copyfile(file,target)

variables_file = args.prefix+".variables.json"
variables = {
variables.update({
"version": version,
"files":{
"variables": variables_file
}
}
})

if extra_files:
for key,val in extra_files.items():
Expand Down
28 changes: 27 additions & 1 deletion pathogenprofiler/hgvs.py
Original file line number Diff line number Diff line change
Expand Up @@ -332,4 +332,30 @@ def verify_mutation_list(hgvs_mutations: List[dict], genes: List[Gene], refseq:
converted_mutations[key] = mutation_conversion[key]

logging.debug(converted_mutations)
return converted_mutations
return converted_mutations


def split_protein_hgvs(hgvs) -> List[str]:
"""
Split a protein HGVS into its components.

Parameters:
hgvs (str): The protein HGVS.

Returns:
List[str]: A list of protein HGVS components.

Example:
>>> split_protein_hgvs("p.MetAsnLys74IleGluThr")
['p.Met74Ile', 'p.Asn74Glu', 'p.Lys74Thr']
"""
# Remove the 'p.' prefix
hgvs = hgvs[2:]
components = []
aa = re.findall(r'[A-Z][a-z][a-z]', hgvs)
refs = aa[:len(aa)//2]
alts = aa[len(aa)//2:]
start_pos = int(re.search(r'\d+', hgvs).group())
for i,(ref,alt) in enumerate(zip(refs, alts)):
components.append(f'p.{ref}{start_pos+i}{alt}')
return components
2 changes: 1 addition & 1 deletion pathogenprofiler/mutation_db.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ def add(self,data: Union[dict,list]) -> None:
self.container.add(json.dumps(data))

def to_dict_list(self) -> List[dict]:
return [json.loads(d) for d in self.container]
return [json.loads(d) for d in sorted(list(self.container))]



Expand Down
3 changes: 2 additions & 1 deletion pathogenprofiler/profiler.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,8 @@ def vcf_profiler(args: argparse.Namespace) -> List[Union[Variant,DrVariant,Gene,
vcf_targets_file = "%s.targets.vcf.gz" % args.files_prefix
if not vcf_is_indexed(args.vcf):
run_cmd("bcftools index %s" % args.vcf)
# run_cmd("bcftools view -R %s %s -Oz -o %s" % (conf["bed"],args.vcf,vcf_targets_file))
if args.caller == 'freebayes-haplotype':
args.supplementary_bam = None
annotated_variants = vcf_variant_profiler(conf, args.files_prefix, args.vcf, bam_for_phasing=args.supplementary_bam)
return annotated_variants

Expand Down
44 changes: 41 additions & 3 deletions pathogenprofiler/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,43 @@

shared_dict = {}

def get_version(tool):
cmds = {
'bcftools': 'bcftools --version',
'samtools': 'samtools --version',
'delly': 'delly --version',
'bwa': 'bwa',
'trimmomatic': 'trimmomatic -version',
'gatk': 'gatk -version',
'lofreq': 'lofreq version',
'bedtools': 'bedtools --version',
'minimap2': 'minimap2 --version',
'freebayes': 'freebayes --version',
'pilon': 'pilon --version',
'snpEff': 'snpEff -version',
'kmc': 'kmc',
'sourmash': 'sourmash --version',
}
regex = {
'bcftools': r'bcftools (\d+\.\d+\.?\d?)',
'samtools': r'samtools (\d+\.\d+\.?\d?)',
'delly': r'Delly version: v(\d+\.\d+\.?\d?)',
'bwa': r'Version: (\d+\.\d+\.?\d?)',
'trimmomatic': r'(\d+\.\d+)',
'gatk': r'The Genome Analysis Toolkit \(GATK\) v(\d+\.\d+\.?\d?)',
'lofreq': r'version: (\d+\.\d+\.?\d?)',
'bedtools': r'bedtools v(\d+\.\d+\.?\d?)',
'minimap2': r'(\d+\.\d+)',
'freebayes': r'version: v(\d+\.\d+\.?\d?)',
'pilon': r'Pilon version (\d+\.\d+\.?\d?)',
'snpEff': r'SnpEff\t(\d+\.\d+.?)',
'kmc': r'K-Mer Counter \(KMC\) ver. (\d+\.\d+\.\d+)',
'sourmash': r'sourmash (\d+\.\d+\.\d?)',
}
x = sp.run([cmds[tool]], shell=True, stdout=sp.PIPE, stderr=sp.PIPE)
text = x.stdout.decode('utf-8') + x.stderr.decode('utf-8')
version = re.search(regex[tool], text).group(1)
return version

class TempFilePrefix(object):
"""Create a temporary file prefix"""
Expand Down Expand Up @@ -482,10 +519,10 @@ def add_arguments_to_self(self,args: dict) -> None:



def run_cmd(cmd: str, desc=None, log: str=None) -> sp.CompletedProcess:
def run_cmd(cmd: str, desc=None, log: str=None, exit_on_error: bool=True) -> sp.CompletedProcess:
if desc:
logging.info(desc)
programs = set([x.strip().split()[0] for x in re.split("[|&;]",cmd) if x!=""])
programs = set([x.strip().split()[0] for x in re.split("[|&;]",cmd.strip()) if x!=""])
missing = [p for p in programs if which(p)==False]
if len(missing)>0:
raise ValueError("Cant find programs: %s\n" % (", ".join(missing)))
Expand All @@ -495,7 +532,8 @@ def run_cmd(cmd: str, desc=None, log: str=None) -> sp.CompletedProcess:
result = sp.run(cmd,shell=True,stderr=output,stdout=output)
if result.returncode != 0:
logging.error(result.stderr.decode("utf-8"))
raise ValueError("Command Failed:\n%s\nstderr:\n%s" % (cmd,result.stderr.decode()))
if exit_on_error:
raise ValueError("Command Failed:\n%s\nstderr:\n%s" % (cmd,result.stderr.decode()))
return result

def cmd_out(cmd: str) -> Generator[str, None, None]:
Expand Down
16 changes: 16 additions & 0 deletions pathogenprofiler/variant_calling.py
Original file line number Diff line number Diff line change
Expand Up @@ -130,3 +130,19 @@ def call_variants(self) -> Vcf:
raise NotImplementedError("%s not implemented for %s platform" % (self.__software__,self.platform))
return self.run_calling(self.calling_cmd)

class FreebayesHaplotypeCaller(VariantCaller):
__software__ = "freebayes-haplotype"
def call_variants(self) -> Vcf:
# Call variants using Lofreq

self.calling_cmd = """
samtools view -T %(ref_file)s -h %(bam_file)s {region} %(samclip_cmd)s | \
samtools view -b > %(temp_file_prefix)s.{region_safe}.tmp.bam && \
samtools index %(temp_file_prefix)s.{region_safe}.tmp.bam && \
freebayes -f %(ref_file)s -r {region} %(calling_params)s %(temp_file_prefix)s.{region_safe}.tmp.bam | \
bcftools view -Oz -o %(temp_file_prefix)s.{region_safe}.tmp.vcf.gz && \
tabix %(temp_file_prefix)s.{region_safe}.tmp.vcf.gz && \
freebayes -f %(ref_file)s -r {region} %(calling_params)s %(temp_file_prefix)s.{region_safe}.tmp.bam --haplotype-length 100 --haplotype-basis-alleles %(temp_file_prefix)s.{region_safe}.tmp.vcf.gz | bcftools norm -a -Oz -o %(temp_file_prefix)s.{region_safe}.vcf.gz
""" % vars(self)

return self.run_calling(self.calling_cmd)
24 changes: 17 additions & 7 deletions pathogenprofiler/vcf.py
Original file line number Diff line number Diff line change
Expand Up @@ -135,10 +135,11 @@ def run_snpeff(self,db,ref_file,gff_file,rename_chroms = None, split_indels=True
self.phasing_bam = f"--bam {bam_for_phasing}"
else:
self.phasing_bam = ""
run_cmd("bcftools view -c 1 -a %(filename)s | bcftools view -v snps | combine_vcf_variants.py --ref %(ref_file)s --gff %(gff_file)s %(phasing_bam)s | %(rename_cmd)s snpEff ann %(snpeff_data_dir_opt)s -noLog -noStats %(db)s - %(re_rename_cmd)s | bcftools sort -Oz -o %(tmp_file1)s && bcftools index %(tmp_file1)s" % vars(self))
run_cmd("bcftools view -c 1 -a %(filename)s | bcftools view -v indels | realign_tandem_deletions.py - %(ref_file)s %(gff_file)s - | %(rename_cmd)s snpEff ann %(snpeff_data_dir_opt)s -noLog -noStats %(db)s - %(re_rename_cmd)s | bcftools sort -Oz -o %(tmp_file2)s && bcftools index %(tmp_file2)s" % vars(self))
run_cmd("bcftools view -c 1 -a %(filename)s | bcftools view -v other | realign_tandem_deletions.py - %(ref_file)s %(gff_file)s - | %(rename_cmd)s snpEff ann %(snpeff_data_dir_opt)s -noLog -noStats %(db)s - %(re_rename_cmd)s | bcftools sort -Oz -o %(tmp_file3)s && bcftools index %(tmp_file3)s" % vars(self))
run_cmd("bcftools concat -a %(tmp_file1)s %(tmp_file2)s %(tmp_file3)s | bcftools sort -Oz -o %(vcf_csq_file)s" % vars(self))
run_cmd("bcftools view -c 1 -a %(filename)s | realign_tandem_deletions.py - %(ref_file)s %(gff_file)s - | combine_vcf_variants.py --ref %(ref_file)s --gff %(gff_file)s %(phasing_bam)s | %(rename_cmd)s snpEff ann %(snpeff_data_dir_opt)s -noLog -noStats %(db)s - %(re_rename_cmd)s | bcftools sort -Oz -o %(vcf_csq_file)s" % vars(self))
# run_cmd("bcftools view -c 1 -a %(filename)s | bcftools view -v snps | combine_vcf_variants.py --ref %(ref_file)s --gff %(gff_file)s %(phasing_bam)s | %(rename_cmd)s snpEff ann %(snpeff_data_dir_opt)s -noLog -noStats %(db)s - %(re_rename_cmd)s | bcftools sort -Oz -o %(tmp_file1)s && bcftools index %(tmp_file1)s" % vars(self))
# run_cmd("bcftools view -c 1 -a %(filename)s | bcftools view -v indels | realign_tandem_deletions.py - %(ref_file)s %(gff_file)s - | %(rename_cmd)s snpEff ann %(snpeff_data_dir_opt)s -noLog -noStats %(db)s - %(re_rename_cmd)s | bcftools sort -Oz -o %(tmp_file2)s && bcftools index %(tmp_file2)s" % vars(self))
# run_cmd("bcftools view -c 1 -a %(filename)s | bcftools view -v other | realign_tandem_deletions.py - %(ref_file)s %(gff_file)s - | %(rename_cmd)s snpEff ann %(snpeff_data_dir_opt)s -noLog -noStats %(db)s - %(re_rename_cmd)s | bcftools sort -Oz -o %(tmp_file3)s && bcftools index %(tmp_file3)s" % vars(self))
# run_cmd("bcftools concat -a %(tmp_file1)s %(tmp_file2)s %(tmp_file3)s | bcftools sort -Oz -o %(vcf_csq_file)s" % vars(self))
else :
run_cmd("bcftools view %(filename)s | %(rename_cmd)s snpEff ann %(snpeff_data_dir_opt)s -noLog -noStats %(db)s - %(re_rename_cmd)s | bcftools view -Oz -o %(vcf_csq_file)s" % vars(self))
return Vcf(self.vcf_csq_file,self.prefix)
Expand Down Expand Up @@ -186,7 +187,7 @@ def load_ann(
variants = []
vcf = pysam.VariantFile(self.filename)
for var in vcf:
logging.debug(str(var)[:500])
logging.debug(str(var)[:5000])
chrom = var.chrom
pos = var.pos
ref = var.ref
Expand All @@ -211,14 +212,23 @@ def load_ann(
ann_list = [x.split("|") for x in ann_strs]
for alt in alleles[1:]:
strand_support = get_stand_support(var,alt)

if strand_support[0] != None:
freq = sum(strand_support)/sum(ad)
else:
if 'QNAME' in var.info:
freq = 1.0
elif sv:
freq = af_dict[alt]
else:
raise NotImplementedError

tmp_var = Variant(
chrom = chrom,
pos = int(pos),
ref = ref,
alt = alt,
depth = sum(ad),
freq = af_dict[alt],
freq = freq,
forward_reads = strand_support[0],
reverse_reads = strand_support[1],
sv = sv,
Expand Down
Loading