Skip to content

Commit

Permalink
Merge pull request #24 from metagenlab/summary
Browse files Browse the repository at this point in the history
Add option to only download summary tables
  • Loading branch information
farchaab authored Oct 23, 2024
2 parents 7c6c236 + fefa389 commit 97c66ef
Show file tree
Hide file tree
Showing 4 changed files with 31 additions and 27 deletions.
20 changes: 16 additions & 4 deletions assembly_finder/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,11 @@ def common_options(func):
help="Custom config file [default: (outputDir)/config.yaml]",
),
click.option(
"--threads", help="Number of threads to use", default=1, show_default=True
"--threads",
"-t",
help="Number of threads to use",
default=1,
show_default=True,
),
click.option(
"--profile",
Expand Down Expand Up @@ -130,9 +134,10 @@ def common_options(func):
"options": [
"--input",
"--output",
"--taxonkit",
"--threads",
"--taxonkit",
"--taxon",
"--summary",
"--rank",
"--nrank",
"--print-versions",
Expand Down Expand Up @@ -204,6 +209,13 @@ def common_options(func):
type=str,
default=None,
)
@click.option(
"--summary/--all",
type=bool,
help="Download only summary tables or all files",
default=False,
show_default=True,
)
@click.option("--api-key", type=str, help="NCBI api-key", default=None)
@click.option(
"--compressed",
Expand All @@ -216,7 +228,7 @@ def common_options(func):
"--include",
type=str,
help="Comma separated files to download : genome,rna,protein,cds,gff3,gtf,gbff,seq-report",
default="genome,seq-report",
default="genome",
show_default=True,
)
@click.option(
Expand All @@ -228,7 +240,7 @@ def common_options(func):
)
@click.option(
"--taxon/--accession",
help="Are queries taxa names or accession",
help="Type of queries",
type=bool,
default=True,
show_default=True,
Expand Down
12 changes: 7 additions & 5 deletions assembly_finder/workflow/Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -46,12 +46,9 @@ API_KEY = config.args.api_key
LIMIT = config.args.limit
COMPRESSED = config.args.compressed
SOURCE = config.args.source
SUMMARY = config.args.summary
INCLUDE = config.args.include
INCLUDE_LIST = INCLUDE.split(",")
if "genome" not in INCLUDE_LIST or "seq-report" not in INCLUDE_LIST:
INCLUDE_LIST = ["genome", "seq-report"] + INCLUDE_LIST
INCLUDE = ",".join(INCLUDE_LIST)

TAXON = config.args.taxon
REFERENCE = config.args.reference
ASM_LVL = config.args.assembly_level
Expand All @@ -71,10 +68,15 @@ include: os.path.join("rules", "download.smk")

targets = [
os.path.join(dir.out.base, "assembly_summary.tsv"),
os.path.join(dir.out.base, "sequence_report.tsv"),
os.path.join(dir.out.base, "taxonomy.tsv"),
os.path.join(dir.out.base, "cleanup.flag"),
]
if SUMMARY:
targets = [
os.path.join(dir.out.base, "taxonomy.tsv"),
os.path.join(dir.out.base, "assembly_summary.tsv"),
os.path.join(dir.out.base, "accessions.txt"),
]

if PRINT_VERSIONS:

Expand Down
2 changes: 1 addition & 1 deletion assembly_finder/workflow/envs/datasets.yml
Original file line number Diff line number Diff line change
Expand Up @@ -3,4 +3,4 @@ channels:
- conda-forge
- defaults
dependencies:
- ncbi-datasets-cli =16.31.0
- ncbi-datasets-cli =16.32.0
24 changes: 7 additions & 17 deletions assembly_finder/workflow/rules/download.smk
Original file line number Diff line number Diff line change
Expand Up @@ -166,12 +166,18 @@ rule format_taxonkit_lineage:
"""


if SUMMARY:
asm_table = os.path.join(dir.out.base, "assembly_summary.tsv")
else:
asm_table = temp(os.path.join(dir.out.base, "assembly_summary.txt"))


rule filter_genome_summaries:
input:
summary=os.path.join(dir.out.base, "genome_summaries.json"),
lineage=os.path.join(dir.out.base, "lineage.tsv"),
output:
gen=temp(os.path.join(dir.out.base, "assembly_summary.txt")),
gen=asm_table,
tax=os.path.join(dir.out.base, "taxonomy.tsv"),
acc=temp(os.path.join(dir.out.base, "accessions.txt")),
params:
Expand Down Expand Up @@ -260,21 +266,6 @@ rule copy_files:
"""


rule cat_sequence_reports:
input:
os.path.join(dir.out.download),
output:
os.path.join(dir.out.base, "sequence_report.tsv"),
params:
jsonl=os.path.join(dir.out.base, "download", "*", "sequence_report.jsonl"),
conda:
os.path.join(dir.env, "datasets.yml")
shell:
"""
cat {params.jsonl} | dataformat tsv genome-seq > {output}
"""


rule add_genome_paths:
input:
dir=os.path.join(dir.out.download),
Expand All @@ -293,7 +284,6 @@ rule cleanup_files:
input:
os.path.join(dir.out.base, "archive"),
os.path.join(dir.out.base, "assembly_summary.tsv"),
os.path.join(dir.out.base, "sequence_report.tsv"),
os.path.join(dir.out.base, "taxonomy.tsv"),
output:
temp(os.path.join(dir.out.base, "cleanup.flag")),
Expand Down

0 comments on commit 97c66ef

Please sign in to comment.