From f705f164814d88fd002599ee6cd645c799d7e990 Mon Sep 17 00:00:00 2001 From: Florian Tesson Date: Mon, 8 Jul 2024 13:52:04 +0200 Subject: [PATCH] Add antidefense results parsing --- defense_finder_posttreat/best_solution.py | 10 ++++++---- defense_finder_posttreat/df_hmmer.py | 19 +++++++++++++------ defense_finder_posttreat/df_systems.py | 15 ++++++++++----- 3 files changed, 29 insertions(+), 15 deletions(-) diff --git a/defense_finder_posttreat/best_solution.py b/defense_finder_posttreat/best_solution.py index e7386e1..7abc050 100644 --- a/defense_finder_posttreat/best_solution.py +++ b/defense_finder_posttreat/best_solution.py @@ -49,10 +49,12 @@ def format_best_solution(p): p['type'] = p.model_fqn.map(lambda x: x.split('/')[-2]) p['subtype'] = p.model_fqn.map(lambda x: x.split('/')[-1]) p.loc[p['type'] == 'CasFinder', 'type'] = 'Cas' - p = p.sort_values('hit_pos') - - p.loc[p.model_fqn.str.contains("ADF"),'activity']='Antidefense' - p.loc[~p.model_fqn.str.contains("ADF"),'activity']='Defense' + p = p.sort_values('hit_pos').reset_index(drop=True) + if len(p) > 0: + p.loc[p.model_fqn.str.contains("ADF"),'activity']='Antidefense' + p.loc[~p.model_fqn.str.contains("ADF"),'activity']='Defense' + else: + p['activity']='' p=p.sort_values('hit_pos') return p diff --git a/defense_finder_posttreat/df_hmmer.py b/defense_finder_posttreat/df_hmmer.py index 1ae703c..8d7e056 100644 --- a/defense_finder_posttreat/df_hmmer.py +++ b/defense_finder_posttreat/df_hmmer.py @@ -1,5 +1,6 @@ import os import pandas as pd +import shutil def remove_duplicates(hmmer_hits): @@ -14,12 +15,18 @@ def export_defense_finder_hmmer_hits(tmp_dir, outdir, filename): paths = get_hmmer_paths(tmp_dir) hmmer_hits = pd.DataFrame() - for path in paths: - d = parse_hmmer_results_file(path) - if d.empty is False: - hmmer_hits = pd.concat([hmmer_hits, remove_duplicates(d)]) - if hmmer_hits.empty is True: - hmmer_hits = pd.DataFrame(columns=get_hmmer_keys()) + concat = os.path.join(tmp_dir, 'hmm_extracts.concat') + + # concatenate all hmm_extract before reading them + with open(concat,'wb') as wfd: + for f in paths: + with open(f,'rb') as fd: + # skip the 5 first lines which are comments + for _ in range(5): + _ = fd.readline() + shutil.copyfileobj(fd, wfd) + + hmmer_hits = pd.read_table(concat, names=get_hmmer_keys()) hmmer_hits = remove_duplicates(hmmer_hits) hmmer_hits = hmmer_hits.sort_values(['hit_pos', 'hit_score']) diff --git a/defense_finder_posttreat/df_systems.py b/defense_finder_posttreat/df_systems.py index ead4dcc..4edf654 100644 --- a/defense_finder_posttreat/df_systems.py +++ b/defense_finder_posttreat/df_systems.py @@ -4,16 +4,16 @@ def export_defense_finder_systems(defense_finder_genes, outdir, filename): systems = build_defense_finder_systems(defense_finder_genes) + print(systems) systems.to_csv(os.path.join(outdir, filename + '_defense_finder_systems.tsv'), sep='\t', index=False) def build_defense_finder_systems(defense_finder_genes): if defense_finder_genes.empty is False: - out = defense_finder_genes.groupby(['sys_id', 'type', 'subtype'])[['hit_id', 'hit_pos']].apply(lambda x: ",".join(x.sort_values('hit_pos').hit_id.to_list())).reset_index() - out.columns = ['sys_id', 'type', 'subtype', 'protein_in_syst'] + out = defense_finder_genes.groupby(['sys_id', 'type', 'subtype', 'activity'])[['hit_id', 'hit_pos']].apply(lambda x: ",".join(x.sort_values('hit_pos').hit_id.to_list())).reset_index() + out.columns = ['sys_id', 'type', 'subtype', 'activity', 'protein_in_syst'] out['sys_beg'] = out.protein_in_syst.map(lambda x: x.split(",")[0]) out['sys_end'] = out.protein_in_syst.map(lambda x: x.split(",")[-1]) - genes_count = defense_finder_genes.sys_id.value_counts().reset_index() genes_count.columns = ['sys_id', 'genes_count'] out = out.merge(genes_count, on='sys_id') @@ -24,6 +24,11 @@ def build_defense_finder_systems(defense_finder_genes): out = out.merge(defense_finder_genes[['hit_id', 'hit_pos']], left_on='sys_beg', right_on='hit_id').sort_values('hit_pos').reset_index(drop=True) out = out.drop(columns=['hit_id', 'hit_pos']) else: - out = pd.DataFrame(columns=['sys_id', 'type', 'subtype', 'sys_beg', 'sys_end', 'protein_in_syst', 'genes_count', 'name_of_profiles_in_sys']) + out = pd.DataFrame(columns=['sys_id', 'type', 'subtype', 'activity', 'sys_beg', 'sys_end', 'protein_in_syst', 'genes_count', 'name_of_profiles_in_sys']) + + return out[['sys_id', 'type', 'subtype', 'activity', 'sys_beg', 'sys_end', 'protein_in_syst', 'genes_count', 'name_of_profiles_in_sys']] + - return out[['sys_id', 'type', 'subtype', 'sys_beg', 'sys_end', 'protein_in_syst', 'genes_count', 'name_of_profiles_in_sys']] +# defense=pd.read_table('./Test_version/ESCO001.0722.00768.C001_defense_finder_genes.tsv',sep='\t') +# export_defense_finder_systems(defense,'./Test_version', +# 'Test.tsv') \ No newline at end of file