Skip to content

Commit

Permalink
Add antidefense results parsing
Browse files Browse the repository at this point in the history
  • Loading branch information
FlorianTesson committed Jul 8, 2024
1 parent 5e44e3f commit f705f16
Show file tree
Hide file tree
Showing 3 changed files with 29 additions and 15 deletions.
10 changes: 6 additions & 4 deletions defense_finder_posttreat/best_solution.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,10 +49,12 @@ def format_best_solution(p):
p['type'] = p.model_fqn.map(lambda x: x.split('/')[-2])
p['subtype'] = p.model_fqn.map(lambda x: x.split('/')[-1])
p.loc[p['type'] == 'CasFinder', 'type'] = 'Cas'
p = p.sort_values('hit_pos')

p.loc[p.model_fqn.str.contains("ADF"),'activity']='Antidefense'
p.loc[~p.model_fqn.str.contains("ADF"),'activity']='Defense'
p = p.sort_values('hit_pos').reset_index(drop=True)
if len(p) > 0:
p.loc[p.model_fqn.str.contains("ADF"),'activity']='Antidefense'
p.loc[~p.model_fqn.str.contains("ADF"),'activity']='Defense'
else:
p['activity']=''
p=p.sort_values('hit_pos')

return p
19 changes: 13 additions & 6 deletions defense_finder_posttreat/df_hmmer.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import os
import pandas as pd
import shutil


def remove_duplicates(hmmer_hits):
Expand All @@ -14,12 +15,18 @@ def export_defense_finder_hmmer_hits(tmp_dir, outdir, filename):
paths = get_hmmer_paths(tmp_dir)
hmmer_hits = pd.DataFrame()

for path in paths:
d = parse_hmmer_results_file(path)
if d.empty is False:
hmmer_hits = pd.concat([hmmer_hits, remove_duplicates(d)])
if hmmer_hits.empty is True:
hmmer_hits = pd.DataFrame(columns=get_hmmer_keys())
concat = os.path.join(tmp_dir, 'hmm_extracts.concat')

# concatenate all hmm_extract before reading them
with open(concat,'wb') as wfd:
for f in paths:
with open(f,'rb') as fd:
# skip the 5 first lines which are comments
for _ in range(5):
_ = fd.readline()
shutil.copyfileobj(fd, wfd)

hmmer_hits = pd.read_table(concat, names=get_hmmer_keys())

hmmer_hits = remove_duplicates(hmmer_hits)
hmmer_hits = hmmer_hits.sort_values(['hit_pos', 'hit_score'])
Expand Down
15 changes: 10 additions & 5 deletions defense_finder_posttreat/df_systems.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,16 +4,16 @@

def export_defense_finder_systems(defense_finder_genes, outdir, filename):
systems = build_defense_finder_systems(defense_finder_genes)
print(systems)
systems.to_csv(os.path.join(outdir, filename + '_defense_finder_systems.tsv'), sep='\t', index=False)


def build_defense_finder_systems(defense_finder_genes):
if defense_finder_genes.empty is False:
out = defense_finder_genes.groupby(['sys_id', 'type', 'subtype'])[['hit_id', 'hit_pos']].apply(lambda x: ",".join(x.sort_values('hit_pos').hit_id.to_list())).reset_index()
out.columns = ['sys_id', 'type', 'subtype', 'protein_in_syst']
out = defense_finder_genes.groupby(['sys_id', 'type', 'subtype', 'activity'])[['hit_id', 'hit_pos']].apply(lambda x: ",".join(x.sort_values('hit_pos').hit_id.to_list())).reset_index()
out.columns = ['sys_id', 'type', 'subtype', 'activity', 'protein_in_syst']
out['sys_beg'] = out.protein_in_syst.map(lambda x: x.split(",")[0])
out['sys_end'] = out.protein_in_syst.map(lambda x: x.split(",")[-1])

genes_count = defense_finder_genes.sys_id.value_counts().reset_index()
genes_count.columns = ['sys_id', 'genes_count']
out = out.merge(genes_count, on='sys_id')
Expand All @@ -24,6 +24,11 @@ def build_defense_finder_systems(defense_finder_genes):
out = out.merge(defense_finder_genes[['hit_id', 'hit_pos']], left_on='sys_beg', right_on='hit_id').sort_values('hit_pos').reset_index(drop=True)
out = out.drop(columns=['hit_id', 'hit_pos'])
else:
out = pd.DataFrame(columns=['sys_id', 'type', 'subtype', 'sys_beg', 'sys_end', 'protein_in_syst', 'genes_count', 'name_of_profiles_in_sys'])
out = pd.DataFrame(columns=['sys_id', 'type', 'subtype', 'activity', 'sys_beg', 'sys_end', 'protein_in_syst', 'genes_count', 'name_of_profiles_in_sys'])

return out[['sys_id', 'type', 'subtype', 'activity', 'sys_beg', 'sys_end', 'protein_in_syst', 'genes_count', 'name_of_profiles_in_sys']]


return out[['sys_id', 'type', 'subtype', 'sys_beg', 'sys_end', 'protein_in_syst', 'genes_count', 'name_of_profiles_in_sys']]
# defense=pd.read_table('./Test_version/ESCO001.0722.00768.C001_defense_finder_genes.tsv',sep='\t')
# export_defense_finder_systems(defense,'./Test_version',
# 'Test.tsv')

0 comments on commit f705f16

Please sign in to comment.