Skip to content

Commit

Permalink
Merge pull request #64 from mdmparis/add_adf
Browse files Browse the repository at this point in the history
Add ADF and pandas posttreatment
  • Loading branch information
FlorianTesson authored Jul 19, 2024
2 parents 879151e + d80d9e9 commit 6816b1b
Show file tree
Hide file tree
Showing 18 changed files with 2,154 additions and 3,423 deletions.
4 changes: 2 additions & 2 deletions CONTRIBUTING.md
Original file line number Diff line number Diff line change
Expand Up @@ -20,9 +20,9 @@ A [Github action](https://github.com/mdmparis/defense-finder/actions) is setup t
Note that you don't need to publish defense-finder everytime the models change: only changes in this repository are relevant.

Here are the steps to follow in order to publish defense-finder:
- find the current version in the `setup` function of `setup.py`.
- find the current version in `defense_finder_cli/_version.py`
- get a new version number according to [semantic versionning](https://semver.org/)
- update the version un `setup.py`
- update the version un `defense_finder_cli/_version.py`
- commit this change, and tag the commit with `git tag -a v<your version number> -m '<your version number> <an optional message>'`
- push your commits to master
- run `git push origin v<your version>` to sync the tags
Expand Down
851 changes: 0 additions & 851 deletions Liste_hmm_system.md

This file was deleted.

20 changes: 18 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,11 +8,27 @@ If you are using DefenseFinder please cite
- "Systematic and quantitative view of the antiviral arsenal of prokaryotes" [Nature Communication](https://www.nature.com/articles/s41467-022-30269-9.pdf), 2022, _Tesson F., Hervé A. , Mordret E., Touchon M., d’Humières C., Cury J., Bernheim A._
- "MacSyFinder: A Program to Mine Genomes for Molecular Systems with an Application to CRISPR-Cas Systems." PloS one 2014
_Abby S., Néron B.,Ménager H., Touchon M. Rocha EPC._
- "CRISPRCasFinder, an update of CRISRFinder, includes a portable version, enhanced performance and integrates search for Cas proteins." Nucleic Acids Research 2018 Couvin D. et al.

## DefenseFinder Models

This repository contains DefenseFinder a tool allowing for a systematic search of anti-phage systems.
The DefenseFinder models based on MacSyFinder architecture can be [here](https://github.com/mdmparis/defense-finder-models)
The DefenseFinder models based on MacSyFinder architecture can be [here](https://github.com/mdmparis/defense-finder-models).

The CRISPR-Cas models used in DefenseFinder come from the macsy models of CasFinder available [here](https://github.com/macsy-models/CasFinder).

## DefenseFinder website

### More information on defense systems on the DefenseFinder

To make DefenseFinder results more intuitive, we created a Wiki of defense systems to gather information on all the defense systems.
The defense wiki is available [here](https://defensefinder.mdmlab.fr/wiki/).

This website gather information on all defense systems detected by DefenseFinder as well as precomputed results and predicted structure of defense systems.

### DefenseFinder webserver

DefenseFinder is available as a [webservice](https://defensefinder.mdmlab.fr/).

# Installing DefenseFinder command-line interface

Expand Down Expand Up @@ -238,4 +254,4 @@ done
```
---
For questions: you can contact aude.bernheim@inserm.fr
For questions: you can contact aude.bernheim@pasteur.fr
37 changes: 24 additions & 13 deletions defense_finder/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,25 +4,36 @@
from macsypy.scripts import macsyfinder


def run(protein_file_name, dbtype, workers, coverage, tmp_dir, models_dir, nocut_ga, loglevel):

gen_args = ['--db-type', dbtype, '--sequence-db', protein_file_name, '--models', 'defense-finder-models/DefenseFinder_{i}', 'all',
'--out-dir', os.path.join(tmp_dir, 'DF_{i}'), '--w', str(workers),
'--coverage-profile', str(coverage), '--exchangeable-weight', '1']
scripts = [[f.format(i=i) for f in gen_args] for i in range(1, 6)]

scripts.append(['--db-type', dbtype, '--sequence-db',protein_file_name, '--models', 'defense-finder-models/RM', 'all',
'--out-dir', os.path.join(tmp_dir, 'RM'), '--w', str(workers),
'--coverage-profile', str(coverage), '--exchangeable-weight', '1'])

scripts.append(['--db-type', dbtype, '--sequence-db', protein_file_name, '--models', 'CasFinder', 'all',
'--out-dir', os.path.join(tmp_dir, 'Cas'), '-w', str(workers)])
def run(protein_file_name, dbtype, workers, coverage, adf, adf_only, tmp_dir, models_dir, nocut_ga, loglevel, index_dir):
scripts=[]

if adf_only == False:
gen_args = ['--db-type', dbtype, '--sequence-db', protein_file_name, '--models', 'defense-finder-models/DefenseFinder_{i}', 'all',
'--out-dir', os.path.join(tmp_dir, 'DF_{i}'), '--w', str(workers),
'--coverage-profile', str(coverage), '--exchangeable-weight', '1']
scripts = [[f.format(i=i) for f in gen_args] for i in range(1, 6)]

scripts.append(['--db-type', dbtype, '--sequence-db',protein_file_name, '--models', 'defense-finder-models/RM', 'all',
'--out-dir', os.path.join(tmp_dir, 'RM'), '--w', str(workers),
'--coverage-profile', str(coverage), '--exchangeable-weight', '1'])

scripts.append(['--db-type', dbtype, '--sequence-db', protein_file_name, '--models', 'CasFinder', 'all',
'--out-dir', os.path.join(tmp_dir, 'Cas'), '-w', str(workers)])


if (adf == True) or (adf_only == True):
scripts.append(['--db-type', dbtype, '--sequence-db', protein_file_name, '--models', 'defense-finder-models/ADF', 'all',
'--out-dir', os.path.join(tmp_dir, 'AntiDefenseFinder'), '-w', str(workers)])

for msf_cmd in scripts:
if nocut_ga:
msf_cmd.append("--no-cut-ga")
if models_dir:
msf_cmd.extend(("--models-dir", models_dir))
if index_dir:
if not os.path.exists(index_dir):
os.makedirs(index_dir)
msf_cmd.extend(("--index-dir", index_dir))
if loglevel != "DEBUG":
msf_cmd.append("--mute")

Expand Down
1 change: 1 addition & 0 deletions defense_finder_cli/_version.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
__version__ = "1.3.0"
29 changes: 25 additions & 4 deletions defense_finder_cli/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,12 @@ def cli():
"""
pass

@cli.command()
def version():
"""Get the version of DefenseFinder (software)
"""
print(f"Using DefenseFinder version {__version__}")


@cli.command()
@click.option('--models-dir', 'models_dir', required=False, help='Specify a directory containing your models.')
Expand Down Expand Up @@ -65,11 +71,18 @@ def update(models_dir=None, force_reinstall: bool = False):
@click.option('--models-dir', 'models_dir', required=False, help='Specify a directory containing your models.')
@click.option('--no-cut-ga', 'no_cut_ga', is_flag=True, default=False,
help='Advanced! Run macsyfinder in no-cut-ga mode. The validity of the genes and systems found is not guaranteed!')
@click.option('-a','--antidefensefinder', 'adf', is_flag=True, default=False,
help='Also run AntiDefenseFinder models to find antidefense systems.')
@click.option("-A",'--antidefensefinder-only', 'adf_only', is_flag=True, default=False,
help='Run only AntiDefenseFinder for antidefense system and not DefenseFinder')
@click.option('--log-level', 'loglevel', default="INFO",
help='set the logging level among DEBUG, [INFO], WARNING, ERROR, CRITICAL')
@click.option('--index-dir', 'index_dir', required=False, help='Specify a directory to write the index files required by macsyfinder when the input file is in a read-only folder')


def run(file: str, outdir: str, dbtype: str, workers: int, coverage: float, preserve_raw: bool, no_cut_ga: bool, models_dir: str = None, loglevel : str = "INFO"):
def run(file: str, outdir: str, dbtype: str, workers: int, coverage: float, preserve_raw: bool, adf: bool,
adf_only: bool, no_cut_ga: bool, models_dir: str = None, loglevel : str = "INFO",
index_dir: str = None):
"""
Search for all known anti-phage defense systems in the target fasta file.
"""
Expand Down Expand Up @@ -143,8 +156,8 @@ def run(file: str, outdir: str, dbtype: str, workers: int, coverage: float, pres
else:
protein_file_name = filename

logger.info("Running DefenseFinder")
defense_finder.run(protein_file_name, dbtype, workers, coverage, tmp_dir, models_dir, no_cut_ga, loglevel)
logger.info(f"Running DefenseFinder version {__version__}")
defense_finder.run(protein_file_name, dbtype, workers, coverage, adf,adf_only, tmp_dir, models_dir, no_cut_ga, loglevel, index_dir)
logger.info("Post-treatment of the data")
defense_finder_posttreat.run(tmp_dir, outdir, os.path.splitext(os.path.basename(filename))[0])

Expand All @@ -165,6 +178,8 @@ def run(file: str, outdir: str, dbtype: str, workers: int, coverage: float, pres
Tesson F., Hervé A. , Mordret E., Touchon M., d’Humières C., Cury J., Bernheim A., 2022, Nature Communication
Systematic and quantitative view of the antiviral arsenal of prokaryotes
Using DefenseFinder version {__version__}.
DefenseFinder relies on MacSyFinder :
{get_version_message().split("and don't")[0]}
Expand All @@ -173,4 +188,10 @@ def run(file: str, outdir: str, dbtype: str, workers: int, coverage: float, pres
{nl.join([f"{path+tab+version}" for path, version in versions_models])}
""")
""")

if __name__ == "__main__":
__version__ = "Version_from_the_command_line"
cli()
else:
from ._version import __version__
68 changes: 32 additions & 36 deletions defense_finder_posttreat/best_solution.py
Original file line number Diff line number Diff line change
@@ -1,30 +1,33 @@
import os
import csv
import pandas as pd

from macsypy.serialization import TsvSystemSerializer
from macsypy.registries import split_def_name


def get(tmp_dir):
results = os.listdir(tmp_dir)
acc = []

acc = pd.DataFrame()
for family_dir in results:
family_path = os.path.join(tmp_dir, family_dir)
acc = acc + parse_best_solution(family_path)

if is_file_empty(os.path.join(family_path, 'best_solution.tsv')) is False:
acc = pd.concat([acc, parse_best_solution(family_path)])
if acc.empty is True:
acc = pd.DataFrame(columns=get_best_solution_keys('\t'))
return format_best_solution(acc)

def uncomment(csvfile):
"""
generator which yield lines in macsyfinder/best_solution files but skip comments or empty lines

:param csvfile: the csv to parse
:type cvsfile: file object
"""
for line in csvfile:
uncommented = line.split('#')[0].strip()
if uncommented:
yield uncommented
def is_file_empty(path):
prev_line = ''
with open(path, 'r') as f:
for line in f:
if line.startswith('#'):
prev_line = line
else:
break
if prev_line.startswith('# No Systems found'):
return True
return False


def parse_best_solution(dir):
Expand All @@ -33,15 +36,8 @@ def parse_best_solution(dir):
:type dir: str
"""
delimiter = '\t'
with open(os.path.join(dir, 'best_solution.tsv')) as tsv_file:
tsv = csv.DictReader(uncomment(tsv_file),
fieldnames=get_best_solution_keys(delimiter=delimiter),
delimiter=delimiter)
try:
next(tsv)
except StopIteration:
return []
data = list(tsv)
data = pd.read_table(os.path.join(dir, 'best_solution.tsv'), sep=delimiter, comment='#')

return data


Expand All @@ -50,15 +46,15 @@ def get_best_solution_keys(delimiter='\t'):


def format_best_solution(p):
out = []
for l in p:
gene_ref = l['model_fqn']
gene_ref_elements = split_def_name(gene_ref)
type = gene_ref_elements[-2]
subtype = gene_ref_elements[-1]
native_keys = list(filter(lambda i: i not in ['type', 'subtype'], get_best_solution_keys()))
new_line = { key: l[key] for key in native_keys }
new_line['type'] = type
new_line['subtype'] = subtype
out.append(new_line)
return out
p['type'] = p.model_fqn.map(lambda x: x.split('/')[-2])
p['subtype'] = p.model_fqn.map(lambda x: x.split('/')[-1])
p.loc[p['type'] == 'CasFinder', 'type'] = 'Cas'
p = p.sort_values('hit_pos').reset_index(drop=True)
if len(p) > 0:
p.loc[p.model_fqn.str.contains("ADF"),'activity']='Antidefense'
p.loc[~p.model_fqn.str.contains("ADF"),'activity']='Defense'
else:
p['activity']=''
p=p.sort_values('hit_pos')

return p
22 changes: 2 additions & 20 deletions defense_finder_posttreat/df_genes.py
Original file line number Diff line number Diff line change
@@ -1,23 +1,5 @@
import os
import csv
from defense_finder_posttreat import best_solution

def export_defense_finder_genes(defense_finder_genes, outdir, filename):
defense_finder_genes_list = defense_finder_genes_to_list(defense_finder_genes)
write_defense_finder_genes(defense_finder_genes_list, outdir, filename)

def defense_finder_genes_to_list(defense_finder_genes):
header = best_solution.get_best_solution_keys()
out = [header]
for g in defense_finder_genes:
l = []
for key in header:
l.append(g[key])
out.append(l)
return out

def write_defense_finder_genes(defense_finder_genes_list, outdir, filename):
filepath = os.path.join(outdir, f'{filename}_defense_finder_genes.tsv')
with open(filepath, 'w') as defense_finder_genes_file:
write = csv.writer(defense_finder_genes_file, delimiter='\t')
write.writerows(defense_finder_genes_list)
def export_defense_finder_genes(defense_finder_genes, outdir, filename):
defense_finder_genes.to_csv(os.path.join(outdir, filename+'_defense_finder_genes.tsv'), sep='\t', index=False)
85 changes: 34 additions & 51 deletions defense_finder_posttreat/df_hmmer.py
Original file line number Diff line number Diff line change
@@ -1,52 +1,46 @@
import os
import csv
import pandas as pd
import shutil

def get_hit_sort_attr(hit):
return hit['hit_id']

def remove_duplicates(hmmer_hits):
temp_list = []
for i in hmmer_hits:
if i not in temp_list:
temp_list.append(i)
return temp_list
# Remove duplicates of hit_id if they have the same gene name
# For R-M and Retron (multi HMM) only one hit is conserved
hmmer_hits = hmmer_hits.sort_values('hit_score', ascending=False).drop_duplicates(['hit_id', 'gene_name'])

return hmmer_hits


def export_defense_finder_hmmer_hits(tmp_dir, outdir, filename):
paths = get_hmmer_paths(tmp_dir)
hmmer_hits = []
for path in paths:
d = parse_hmmer_results_file(path)
hmmer_hits = hmmer_hits + remove_duplicates(d)
sorted_hmmer_hits = sorted(hmmer_hits, key=get_hit_sort_attr)
hmmer_hits_list = hmmer_to_list(sorted_hmmer_hits)
write_defense_finder_hmmer(hmmer_hits_list, outdir, filename)

def write_defense_finder_hmmer(hmmer_hits_list, outdir, filename):
filepath = os.path.join(outdir, f'{filename}_defense_finder_hmmer.tsv')
with open(filepath, 'w') as defense_finder_hmmer_file:
write = csv.writer(defense_finder_hmmer_file, delimiter='\t')
write.writerows(hmmer_hits_list)
defense_finder_hmmer_file.close()
hmmer_hits = pd.DataFrame()

concat = os.path.join(tmp_dir, 'hmm_extracts.concat')

# concatenate all hmm_extract before reading them
with open(concat,'wb') as wfd:
for f in paths:
with open(f,'rb') as fd:
# skip the 5 first lines which are comments
for _ in range(5):
_ = fd.readline()
shutil.copyfileobj(fd, wfd)

hmmer_hits = pd.read_table(concat, names=get_hmmer_keys())

hmmer_hits = remove_duplicates(hmmer_hits)
hmmer_hits = hmmer_hits.sort_values(['hit_pos', 'hit_score'])

write_defense_finder_hmmer(hmmer_hits, outdir, filename)


def write_defense_finder_hmmer(hmmer_hits, outdir, filename):
hmmer_hits.to_csv(os.path.join(outdir, filename+"_defense_finder_hmmer.tsv"), sep='\t', index=False)


def get_hmmer_keys():
return ['hit_id', 'replicon', 'hit_pos', 'hit_sequence_length', 'gene_name','i_eval','hit_score','hit_profile_cov','hit_seq_cov','hit_begin_match','hit_end_match']

def parse_hmmer_results_file(path):
tsv_file = open(path)
tsv = csv.reader(tsv_file, delimiter='\t')
data = []
for row in tsv:
if not row[0].startswith('#'):
data.append(row)
tsv_file.close()
out = []
for l in data:
if not l: continue
line_as_dict = {}
for idx, val in enumerate(get_hmmer_keys()):
line_as_dict[val] = l[idx]
out.append(line_as_dict)
return out
return ['hit_id', 'replicon', 'hit_pos', 'hit_sequence_length', 'gene_name', 'i_eval', 'hit_score', 'hit_profile_cov', 'hit_seq_cov', 'hit_begin_match', 'hit_end_match']


def get_hmmer_paths(results_dir):
family_dirs = os.listdir(results_dir)
Expand All @@ -58,14 +52,3 @@ def get_hmmer_paths(results_dir):
if entry.name.endswith('extract') and entry.is_file():
files.append(entry)
return list(map(lambda i: i.path, files))


def hmmer_to_list(hmmer_hits):
header = get_hmmer_keys()
out = [header]
for s in hmmer_hits:
l = []
for key in header:
l.append(s[key])
out.append(l)
return out
Loading

0 comments on commit 6816b1b

Please sign in to comment.