Skip to content

Commit

Permalink
Merge pull request #2 from metagenlab/dev
Browse files Browse the repository at this point in the history
wrapper + n_by_rank
  • Loading branch information
idfarbanecha authored Jul 21, 2021
2 parents 52c178d + 89aaa05 commit 70f6630
Show file tree
Hide file tree
Showing 19 changed files with 928 additions and 14 deletions.
674 changes: 674 additions & 0 deletions LICENSE

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions MANIFEST.in
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
recursive-include assembly_finder *
3 changes: 1 addition & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -24,10 +24,9 @@ Assembly finder requires as input a tsv table containing queries, and a configur
### Input table example
Assembly finder searches NCBI's assembly database for a number of genomes at a given taxonomic rank or taxid inputted in a tsv table as shown below.
```
UserInputNames nb_genomes
TaxnonomyInput nb_genomes
1813735 1
114185 1
ATCC_13985 1
staphylococcus_aureus 1
```
The workflow accepts taxonomy identifiers, taxonomic ranks (like species names) and strain numbers as queries as long as they are found in NCBI's taxonomy database.
Expand Down
File renamed without changes.
209 changes: 209 additions & 0 deletions assembly_finder/af.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,209 @@

# Parts of this wrapper are inspired by atlas, an easy-to-use metagenomic pipeline based on snakemake.
# Go check it out on https://github.com/metagenome-atlas/atlas
import logging
import os
import sys
import click
import subprocess
from assembly_finder import __version__

logging.basicConfig(
level=logging.INFO,
datefmt="%Y-%m-%d %H:%M",
format="[%(asctime)s %(levelname)s] %(message)s",
)


def get_snakefile():
thisdir = os.path.abspath(os.path.dirname(__file__))
sf = os.path.join(thisdir, 'Snakefile')
if not os.path.exists(sf):
sys.exit(f"Unable to locate the Snakemake workflow file at {sf}")
return sf


@click.group(context_settings=dict(help_option_names=["-h", "--help"]))
@click.version_option(__version__)
@click.pass_context
def cli(obj):
"""
assembly_finder is a snakemake workflow used to download genome assemblies from RefSeq and Genbank
"""


@cli.command(
"run",
context_settings=dict(ignore_unknown_options=True),
short_help="run all assembly_finder pipeline steps"
)
@click.option(
"-i",
"--input-table",
type=click.Path(exists=True, resolve_path=True),
help="path to assembly_finder input_table_path",
)
@click.option(
"-p",
"--conda-prefix",
type=click.Path(exists=True, resolve_path=True),
help="path to conda environment",
)
@click.option(
"-n",
"--dryrun_status",
is_flag=True,
default=False,
show_default=True,
help="Snakemake dryrun to see the scheduling plan",
)
@click.option(
"-c",
"--cores",
type=int,
help="number of cores to allow for the workflow",
default=2,
)
@click.option(
"-nk",
"--ncbi_key",
type=str,
help="ncbi key for Entrez",
default="",
)
@click.option(
"-ne",
"--ncbi_email",
type=str,
help="ncbi email for Entrez",
default="",
)
@click.option(
"-gc",
"--complete_assemblies",
help="download only complete assemblies (default=False)",
default=True
)

@click.option(
"-gr",
"--reference_assemblies",
help="download only reference assemblies",
default=False
)
@click.option(
"-gre",
"--representative_assemblies",
help="download only representative assemblies",
default=False
)
@click.option(
"-gb",
"--genbank_assemblies",
help="download genbank assemblies (default True)",
is_flag=True
)
@click.option(
"-rs",
"--refseq_assemblies",
help="download refseq assemblies (default True)",
is_flag=True
)
@click.option(
"-rs",
"--exclude_from_metagenomes",
help="exclude metagnomes (default True)",
is_flag=True
)
@click.option(
"-f",
"--filter_rank",
help="Rank filter",
default=False,
is_flag=False
)
@click.option(
"-nr",
"--n_by_rank",
help="Max number of genome by target rank (eg 1/species)",
type=int,
default=1,
)

@click.argument("snakemake_args", nargs=-1, type=click.UNPROCESSED)

def run_workflow(conda_prefix,
input_table,
dryrun_status,
cores,
ncbi_key,
ncbi_email,
complete_assemblies,
reference_assemblies,
representative_assemblies,
exclude_from_metagenomes,
genbank_assemblies,
refseq_assemblies,
filter_rank,
n_by_rank,
snakemake_args):
"""
Runs assembly_finder pipeline with all steps
input_table_path: path/to/input_table
ncbi_key: your_ncbi_api_key
ncbi_email: your_ncbi_email
##Parameters for search_assemblies function
#This set of parameters is to search all possible assemblies
complete_assemblies: False
reference_assemblies: False
representative_assemblies: False
exclude_from_metagenomes: True
Genbank_assemblies: True
Refseq_assemblies: True
##Parameters for the filtering function
Rank_to_filter_by: False
"""
import datetime

today = datetime.datetime.today().strftime("%Y-%m-%d")

if dryrun_status:
dryrun = '-n'
else:
dryrun = ''
if conda_prefix is None:
conda_prefix = os.environ['CONDA_PREFIX']
if not os.path.exists(conda_prefix):
logging.critical(f"conda env path not found: {conda_prefix}")
sys.exit(1)
cmd = (
f"snakemake --snakefile {get_snakefile()} --use-conda --conda-prefix {conda_prefix} "
f" --cores {cores} "
f"all_download {dryrun} "
f"--config input_table_path={input_table} "
f"NCBI_key={ncbi_key} "
f"NCBI_email={ncbi_email} "
f"community_name={today} "
f"complete_assemblies={complete_assemblies} "
f"reference_assemblies={reference_assemblies} "
f"representative_assemblies={representative_assemblies} "
f"exclude_from_metagenomes={exclude_from_metagenomes} "
f"Genbank_assemblies={genbank_assemblies} "
f"Refseq_assemblies={refseq_assemblies} "
f"Rank_to_filter_by={filter_rank} "
f"n_by_rank={n_by_rank} "
f"{' '.join(snakemake_args)}")
logging.info("Executing: %s" % cmd)
try:
subprocess.check_call(cmd, shell=True)
except subprocess.CalledProcessError as e:
# removes the traceback
logging.critical(e)
exit(1)


if __name__ == "__main__":
cli()
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes
3 changes: 3 additions & 0 deletions assembly_finder/input_table.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
TaxnonomyInput nb_genomes
1813735 1
114185 1
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
class AssemblyFinder:
def __init__(self, name, isassembly=False, genbank=False, refseq=True, representative=True, reference=True,
complete=True,
exclude_metagenomes=True, nb=1, rank_to_select='None', outf='f.tsv', outnf='nf.tsv'):
exclude_metagenomes=True, nb=1, rank_to_select=False, outf='f.tsv', outnf='nf.tsv', n_by_rank=1):
self.name = name
self.assembly = isassembly
self.genbank = genbank
Expand All @@ -25,6 +25,7 @@ def __init__(self, name, isassembly=False, genbank=False, refseq=True, represent
self.nb = nb
self.outf = outf
self.outnf = outnf
self.n_by_rank = n_by_rank
logging.basicConfig(format='%(asctime)s %(levelname)s %(message)s', datefmt='%d %b %Y %H:%M:%S',
filename=snakemake.log[0], level=logging.DEBUG)

Expand Down Expand Up @@ -168,22 +169,29 @@ def select_assemblies(self, table):
fact_table = table.replace({'RefSeq_category': {'reference genome': 0, 'representative genome': 1, 'na': 6},
'AssemblyStatus': {'Complete Genome': 2, 'Chromosome': 3, 'Scaffold': 4,
'Contig': 5, 'na': 6}})
# make sure the path to Genbank ftp is available
fact_table = fact_table[fact_table.FtpPath_GenBank != '']
sorted_table = fact_table.sort_values(['RefSeq_category', 'AssemblyStatus', 'Contig_count',
'ScaffoldN50', 'ContigN50', 'AsmReleaseDate_GenBank'],
ascending=[True, True, True, False, False, False])
if self.rank_to_select != 'None':

if self.rank_to_select:
logging.info(f'Filtering according to {self.rank_to_select}, Refseq categories, assembly status, '
f'contig count and release date')
select_index = []
unique_list = list(set(sorted_table[self.rank_to_select]))
if len(unique_list) > 1:
if len(unique_list) > 0:
for i in unique_list:
select_index.append(sorted_table[sorted_table[self.rank_to_select] == i].sample(1).index[0])
# randomly select one assembly ID for each unique selected rank (species for example)
target = sorted_table[sorted_table[self.rank_to_select] == i]
if len(target) >= self.n_by_rank:
select_index += target.iloc[0:self.n_by_rank].index.to_list()
else:
select_index += target.index.to_list()
# randomly select self.n_by_rank assembly ID for each unique selected rank (species for example)
sorted_table = sorted_table.loc[select_index, :]
if len(unique_list) == 1:
logging.info(f'Same {self.rank_to_select} for all assemblies, Filtering according to Refseq '
f'categories, assembly status,contig count and release date')
#if len(unique_list) == 1:
# logging.info(f'Same {self.rank_to_select} for all assemblies, Filtering according to Refseq '
# f'categories, assembly status,contig count and release date')
if len(unique_list) == 0:
logging.error(f'{self.rank_to_select} is not a target rank')
else:
Expand Down Expand Up @@ -242,7 +250,8 @@ def run(self):
intb = pd.read_csv(snakemake.input[0], sep='\t', dtype={f'{column}': 'str'})
intb.set_index(f'{column}', inplace=True)
nb = int(intb.loc[entry]['nb_genomes'])
n_by_rank = snakemake.params['n_by_rank']
find_assemblies = AssemblyFinder(name=entry, isassembly=assembly, genbank=gb, refseq=rs, representative=rep,
reference=ref, complete=comp, exclude_metagenomes=met, nb=nb, rank_to_select=rank,
outnf=snakemake.output.all, outf=snakemake.output.filtered)
outnf=snakemake.output.all, outf=snakemake.output.filtered, n_by_rank=n_by_rank)
find_assemblies.run()
File renamed without changes.
File renamed without changes.
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ rule get_assembly_tables:
comp=config['complete_assemblies'], ref=config['reference_assemblies'],
rep=config['representative_assemblies'], met=config['exclude_from_metagenomes'],
gb=config['Genbank_assemblies'], rs=config['Refseq_assemblies'], rank_filter=config['Rank_to_filter_by'],
n_by_rank=config['n_by_rank'],
assembly = isassembly, column = col

resources: ncbi_requests=1
Expand Down
File renamed without changes.
3 changes: 0 additions & 3 deletions input_table.tsv

This file was deleted.

21 changes: 21 additions & 0 deletions setup.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
from setuptools import setup, find_packages
import glob
import os
import pkg_resources
from assembly_finder import __version__

setup(name='assembly_finder',
version=__version__,
packages=find_packages(),
package_data={"":["assembly_finder/*", ]},
description='Download assemblies from NCBI',
url='https://github.com/metagenlab/assembly_finder',
author='Farid Chaabane & Trestan Pillonel',
author_email='[email protected]',
entry_points="""
[console_scripts]
af = assembly_finder.af:cli
""",
include_package_data=True,
keywords=[],
zip_safe=False)

0 comments on commit 70f6630

Please sign in to comment.