Use pandas in the post treatment #59

Closed · wants to merge 17 commits
Changes from 3 commits
4 changes: 3 additions & 1 deletion defense_finder/__init__.py
@@ -4,7 +4,7 @@
from macsypy.scripts import macsyfinder


def run(protein_file_name, dbtype, workers, coverage, tmp_dir, models_dir, nocut_ga, loglevel):
def run(protein_file_name, dbtype, workers, coverage, tmp_dir, models_dir, nocut_ga, loglevel, index_dir):

gen_args = ['--db-type', dbtype, '--sequence-db', protein_file_name, '--models', 'defense-finder-models/DefenseFinder_{i}', 'all',
'--out-dir', os.path.join(tmp_dir, 'DF_{i}'), '--w', str(workers),
@@ -23,6 +23,8 @@ def run(protein_file_name, dbtype, workers, coverage, tmp_dir, models_dir, nocut
msf_cmd.append("--no-cut-ga")
if models_dir:
msf_cmd.extend(("--models-dir", models_dir))
if index_dir:
msf_cmd.extend(("--index-dir", index_dir))
if loglevel != "DEBUG":
msf_cmd.append("--mute")

7 changes: 5 additions & 2 deletions defense_finder_cli/main.py
@@ -67,9 +67,12 @@ def update(models_dir=None, force_reinstall: bool = False):
help='Advanced! Run macsyfinder in no-cut-ga mode. The validity of the genes and systems found is not guaranteed!')
@click.option('--log-level', 'loglevel', default="INFO",
help='set the logging level among DEBUG, [INFO], WARNING, ERROR, CRITICAL')
@click.option('--index-dir', 'index_dir', required=False, help='Specify a directory to write the index files required by macsyfinder when the input file is in a read-only folder')


def run(file: str, outdir: str, dbtype: str, workers: int, coverage: float, preserve_raw: bool, no_cut_ga: bool, models_dir: str = None, loglevel : str = "INFO"):
def run(file: str, outdir: str, dbtype: str, workers: int, coverage: float, preserve_raw: bool,
no_cut_ga: bool, models_dir: str = None, loglevel : str = "INFO",
index_dir: str = None):
"""
Search for all known anti-phage defense systems in the target fasta file.
"""
@@ -144,7 +147,7 @@ def run(file: str, outdir: str, dbtype: str, workers: int, coverage: float, pres
protein_file_name = filename

logger.info("Running DefenseFinder")
defense_finder.run(protein_file_name, dbtype, workers, coverage, tmp_dir, models_dir, no_cut_ga, loglevel)
defense_finder.run(protein_file_name, dbtype, workers, coverage, tmp_dir, models_dir, no_cut_ga, loglevel, index_dir)
logger.info("Post-treatment of the data")
defense_finder_posttreat.run(tmp_dir, outdir, os.path.splitext(os.path.basename(filename))[0])

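A usage sketch for the option added above (assuming the command-line entry point is defense-finder run; the input path and scratch directory are placeholders, other options omitted):

# Write macsyfinder's sequence index files to a writable scratch directory
# when the input FASTA sits in a read-only location.
defense-finder run /readonly/data/genome.faa --index-dir /tmp/df_index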
66 changes: 27 additions & 39 deletions defense_finder_posttreat/best_solution.py
@@ -1,47 +1,42 @@
import os
import csv
import pandas as pd

from macsypy.serialization import TsvSystemSerializer
from macsypy.registries import split_def_name

def get(tmp_dir):
results = os.listdir(tmp_dir)
acc = []

acc = pd.DataFrame()
for family_dir in results:
family_path = os.path.join(tmp_dir, family_dir)
acc = acc + parse_best_solution(family_path)


if is_file_empty(family_path+'/best_solution.tsv') == False :
print(parse_best_solution(family_path).dtypes)
acc = pd.concat([acc , parse_best_solution(family_path)])
if acc.empty == True :
acc = pd.DataFrame(columns=get_best_solution_keys('\t'))
print(acc.dtypes)
return format_best_solution(acc)

def uncomment(csvfile):
"""
generator which yield lines in macsyfinder/best_solution files but skip comments or empty lines

:param csvfile: the csv to parse
:type cvsfile: file object
"""
for line in csvfile:
uncommented = line.split('#')[0].strip()
if uncommented:
yield uncommented

def is_file_empty(path):
prev_line = ''
with open(path, 'r') as f:
for line in f:
if line.startswith('#'):
prev_line = line
else:
break
if prev_line.startswith('# No Systems found'):
return True
return False

def parse_best_solution(dir):
"""
:param dir: the macsyfinder result directory path
:type dir: str
"""
delimiter = '\t'
with open(os.path.join(dir, 'best_solution.tsv')) as tsv_file:
tsv = csv.DictReader(uncomment(tsv_file),
fieldnames=get_best_solution_keys(delimiter=delimiter),
delimiter=delimiter)
try:
next(tsv)
except StopIteration:
return []
data = list(tsv)
data=pd.read_table(dir+'/best_solution.tsv',sep=delimiter,comment='#')

return data


@@ -50,15 +45,8 @@ def get_best_solution_keys(delimiter='\t'):


def format_best_solution(p):
out = []
for l in p:
gene_ref = l['model_fqn']
gene_ref_elements = split_def_name(gene_ref)
type = gene_ref_elements[-2]
subtype = gene_ref_elements[-1]
native_keys = list(filter(lambda i: i not in ['type', 'subtype'], get_best_solution_keys()))
new_line = { key: l[key] for key in native_keys }
new_line['type'] = type
new_line['subtype'] = subtype
out.append(new_line)
return out
p['type'] = p.model_fqn.map(lambda x : x.split('/')[-2])
p['subtype'] = p.model_fqn.map(lambda x : x.split('/')[-1])
p=p.sort_values('hit_pos')

return(p)
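To make the type/subtype extraction above concrete, a small standalone sketch; the model_fqn value below is hypothetical, the real fully qualified names come from the DefenseFinder models:

import pandas as pd

# hypothetical fqn: the last two '/'-separated components are taken as type and subtype
p = pd.DataFrame({'model_fqn': ['defense-finder-models/DefenseFinder_1/RM/RM_Type_I'],
                  'hit_pos': [3]})
p['type'] = p.model_fqn.map(lambda x: x.split('/')[-2])     # 'RM'
p['subtype'] = p.model_fqn.map(lambda x: x.split('/')[-1])  # 'RM_Type_I'
p = p.sort_values('hit_pos')                                # order genes by position, as in the PR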
18 changes: 3 additions & 15 deletions defense_finder_posttreat/df_genes.py
@@ -1,23 +1,11 @@
import os
import csv
from defense_finder_posttreat import best_solution

def export_defense_finder_genes(defense_finder_genes, outdir, filename):
defense_finder_genes_list = defense_finder_genes_to_list(defense_finder_genes)
write_defense_finder_genes(defense_finder_genes_list, outdir, filename)
defense_finder_genes.to_csv(outdir+'/'+filename+'_defense_finder_genes.tsv',sep='\t',index=False)
[Review comment (Collaborator)]: idem os.path.join
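A minimal sketch of what the reviewer is asking for here, reusing the os.path.join construction from the removed helper below; the same change would apply to the systems and hmmer exports in the other files of this PR:

import os

def export_defense_finder_genes(defense_finder_genes, outdir, filename):
    # defense_finder_genes is the pandas DataFrame returned by best_solution.get()
    filepath = os.path.join(outdir, f'{filename}_defense_finder_genes.tsv')
    defense_finder_genes.to_csv(filepath, sep='\t', index=False)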


def defense_finder_genes_to_list(defense_finder_genes):
header = best_solution.get_best_solution_keys()
out = [header]
for g in defense_finder_genes:
l = []
for key in header:
l.append(g[key])
out.append(l)
return out

def write_defense_finder_genes(defense_finder_genes_list, outdir, filename):
filepath = os.path.join(outdir, f'{filename}_defense_finder_genes.tsv')
with open(filepath, 'w') as defense_finder_genes_file:
write = csv.writer(defense_finder_genes_file, delimiter='\t')
write.writerows(defense_finder_genes_list)
defense_finder_genes_list.to_csv('filepath',sep='\t',index=False)

66 changes: 21 additions & 45 deletions defense_finder_posttreat/df_hmmer.py
@@ -1,52 +1,38 @@
import os
import csv
import pandas as pd

def get_hit_sort_attr(hit):
return hit['hit_id']

def remove_duplicates(hmmer_hits):
temp_list = []
for i in hmmer_hits:
if i not in temp_list:
temp_list.append(i)
return temp_list

[jeanrjc marked this conversation as resolved.]
hmmer_hits=hmmer_hits.sort_values('hit_score',ascending=False).drop_duplicates(['hit_id','gene_name'])

return hmmer_hits
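As a quick standalone check of the dedup logic above (made-up hits): sorting by hit_score in descending order before drop_duplicates keeps the best-scoring row for each (hit_id, gene_name) pair.

import pandas as pd

hits = pd.DataFrame({'hit_id':    ['prot_1', 'prot_1', 'prot_2'],
                     'gene_name': ['geneA',  'geneA',  'geneB'],
                     'hit_score': [12.5,     40.0,     7.3]})
best = hits.sort_values('hit_score', ascending=False).drop_duplicates(['hit_id', 'gene_name'])
# keeps the prot_1/geneA row with score 40.0 plus the single prot_2/geneB row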

def export_defense_finder_hmmer_hits(tmp_dir, outdir, filename):
paths = get_hmmer_paths(tmp_dir)
hmmer_hits = []
hmmer_hits = pd.DataFrame()

for path in paths:
d = parse_hmmer_results_file(path)
hmmer_hits = hmmer_hits + remove_duplicates(d)
sorted_hmmer_hits = sorted(hmmer_hits, key=get_hit_sort_attr)
hmmer_hits_list = hmmer_to_list(sorted_hmmer_hits)
write_defense_finder_hmmer(hmmer_hits_list, outdir, filename)

def write_defense_finder_hmmer(hmmer_hits_list, outdir, filename):
filepath = os.path.join(outdir, f'{filename}_defense_finder_hmmer.tsv')
with open(filepath, 'w') as defense_finder_hmmer_file:
write = csv.writer(defense_finder_hmmer_file, delimiter='\t')
write.writerows(hmmer_hits_list)
defense_finder_hmmer_file.close()
if d.empty == False:
hmmer_hits = pd.concat([hmmer_hits , remove_duplicates(d)])
if hmmer_hits.empty == True:
hmmer_hits=pd.DataFrame(columns=get_hmmer_keys())

hmmer_hits=remove_duplicates(hmmer_hits)
hmmer_hits=hmmer_hits.sort_values('hit_score')

write_defense_finder_hmmer(hmmer_hits, outdir, filename)

def write_defense_finder_hmmer(hmmer_hits, outdir, filename):
hmmer_hits.to_csv(outdir+"/"+filename+"_defense_finder_hmmer.tsv",sep='\t',index=False)

def get_hmmer_keys():
return ['hit_id', 'replicon', 'hit_pos', 'hit_sequence_length', 'gene_name','i_eval','hit_score','hit_profile_cov','hit_seq_cov','hit_begin_match','hit_end_match']

def parse_hmmer_results_file(path):
tsv_file = open(path)
tsv = csv.reader(tsv_file, delimiter='\t')
data = []
for row in tsv:
if not row[0].startswith('#'):
data.append(row)
tsv_file.close()
out = []
for l in data:
if not l: continue
line_as_dict = {}
for idx, val in enumerate(get_hmmer_keys()):
line_as_dict[val] = l[idx]
out.append(line_as_dict)
return out
data = pd.read_table(path, sep='\t',comment='#',names=get_hmmer_keys())
return data

def get_hmmer_paths(results_dir):
family_dirs = os.listdir(results_dir)
@@ -59,13 +45,3 @@ def get_hmmer_paths(results_dir):
files.append(entry)
return list(map(lambda i: i.path, files))


def hmmer_to_list(hmmer_hits):
header = get_hmmer_keys()
out = [header]
for s in hmmer_hits:
l = []
for key in header:
l.append(s[key])
out.append(l)
return out
58 changes: 18 additions & 40 deletions defense_finder_posttreat/df_systems.py
@@ -1,49 +1,27 @@
import os
import csv
from itertools import groupby
from functools import reduce
import pandas as pd

def export_defense_finder_systems(defense_finder_genes, outdir, filename):
systems = build_defense_finder_systems(defense_finder_genes)
systems_list = systems_to_list(systems)
write_defense_finder_systems(systems_list, outdir, filename)
systems.to_csv(outdir+'/'+filename+'_defense_finder_systems.tsv',sep='\t',index=False)
[Review comment (Collaborator)]: use os.path.join


def systems_to_list(systems):
header = get_system_keys()
out = [header]
for s in systems:
l = []
for key in header:
l.append(s[key])
out.append(l)
return out

def write_defense_finder_systems(systems_list, outdir, filename):
filepath = os.path.join(outdir, f'{filename}_defense_finder_systems.tsv')
with open(filepath, 'w') as defense_finder_systems_file:
write = csv.writer(defense_finder_systems_file, delimiter='\t')
write.writerows(systems_list)
def build_defense_finder_systems(defense_finder_genes):
sys=defense_finder_genes.drop_duplicates('sys_id')[['sys_id' , 'type' , 'subtype']]

def get_system_keys():
return [ 'sys_id', 'type', 'subtype', 'sys_beg', 'sys_end', 'protein_in_syst', 'genes_count', 'name_of_profiles_in_sys' ]
sys_beg=defense_finder_genes.sort_values('hit_pos').drop_duplicates('sys_id').rename({'hit_id' : 'sys_beg'},axis=1)[['sys_id','sys_beg']]
sys_end=defense_finder_genes.sort_values('hit_pos' , ascending=False).drop_duplicates('sys_id').rename({'hit_id' : 'sys_end'},axis=1)[['sys_id','sys_end']]
protein_in_syst=defense_finder_genes.groupby('sys_id').hit_id.apply(lambda x: ",".join(x.sort_values())).reset_index().rename({'hit_id':'protein_in_syst'},axis=1)
[Review comment (Collaborator)]: simplify to get sys_beg and sys_end from protein in syst
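One possible reading of this suggestion, as a sketch rather than the PR's implementation: if the hits are sorted by hit_pos first, sys_beg and sys_end are simply the first and last hit_id of each system, so they can come out of the same groupby that builds protein_in_syst (here the proteins are joined in genomic order rather than alphabetically, which is what makes first/last meaningful):

def build_defense_finder_systems(defense_finder_genes):
    # defense_finder_genes: pandas DataFrame with sys_id, hit_id, gene_name, hit_pos columns, as in this PR
    ordered = defense_finder_genes.sort_values('hit_pos')
    out = ordered.groupby('sys_id').agg(
        sys_beg=('hit_id', 'first'),
        sys_end=('hit_id', 'last'),
        protein_in_syst=('hit_id', ','.join),
        genes_count=('hit_id', 'count'),
        name_of_profiles_in_sys=('gene_name', lambda x: ','.join(x.sort_values())),
    ).reset_index()
    # type/subtype would still be merged back in from the per-gene table, as the PR does
    return out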

name_of_profiles_in_sys=defense_finder_genes.groupby('sys_id').gene_name.apply(lambda x : ",".join(x.sort_values())).reset_index().rename({'hit_id' : 'protein_in_syst'},axis = 1)
genes_count=defense_finder_genes.sys_id.value_counts().reset_index()
genes_count.columns=['sys_id','genes_count']

def projection(val):
return val['sys_id']

def build_defense_finder_systems(defense_finder_genes):
system_goups = [list(it) for k, it in groupby(defense_finder_genes, projection)]
out = []
for system_group in system_goups:
item = {}
first_item = system_group[0]
last_item = system_group[-1]
item['sys_id'] = first_item['sys_id']
item['sys_beg'] = first_item['hit_id']
item['sys_end'] = last_item['hit_id']
item['type'] = first_item['type']
item['subtype'] = first_item['subtype']
item['protein_in_syst'] = reduce(lambda acc, s: acc + ',' + s['hit_id'], system_group, '')[1:]
item['genes_count'] = len(system_group)
item['name_of_profiles_in_sys'] = reduce(lambda acc, s: acc + ',' + s['gene_name'], system_group, '')[1:]
out.append(item)
return out
out=sys.merge(sys_beg,on = 'sys_id')
out=out.merge(sys_end,on = 'sys_id')
out=out.merge(protein_in_syst,on = 'sys_id')
out=out.merge(genes_count,on = 'sys_id')
out=out.merge(name_of_profiles_in_sys,on = 'sys_id')

return(out)
