diff --git a/.gitmodules b/.gitmodules
deleted file mode 100644
index 20ef50d..0000000
--- a/.gitmodules
+++ /dev/null
@@ -1,3 +0,0 @@
-[submodule "mess/assembly_finder"]
-    path = mess/assembly_finder
-    url = https://github.com/metagenlab/assembly_finder
diff --git a/mess/assembly_finder b/mess/assembly_finder
deleted file mode 160000
index 52c178d..0000000
--- a/mess/assembly_finder
+++ /dev/null
@@ -1 +0,0 @@
-Subproject commit 52c178d5a045178e9826a2fa403bcaa7eee48c60
diff --git a/mess/assembly_finder/Snakefile b/mess/assembly_finder/Snakefile
new file mode 100644
index 0000000..ff3e995
--- /dev/null
+++ b/mess/assembly_finder/Snakefile
@@ -0,0 +1,13 @@
import os

include: 'rules/find_assemblies.rules'

community_name = config['community_name']


def downloaded_list(wildcards):
    """Input function for rule all_download.

    Blocks until the download_assemblies checkpoint has run, then lists
    every *_genomic.fna.gz file that actually landed in the community's
    assembly_gz directory (the set is only known after the download).
    """
    checkpoint_output = checkpoints.download_assemblies.get(**wildcards).output[0]
    # The checkpoint flag file lives directly in 'assembly_gz/<community>/',
    # which is simply its dirname (clearer and equivalent to re-joining the
    # first two path components by hand).
    directory = os.path.dirname(checkpoint_output)
    return expand(f'assembly_gz/{community_name}/{{i}}_genomic.fna.gz',
                  i=glob_wildcards(os.path.join(directory, '{i}_genomic.fna.gz')).i)


# Target rule: summary table + every downloaded assembly + checkpoint flag.
rule all_download:
    input: f"{community_name}-assemblies-summary.tsv",
           downloaded_list,
           f"assembly_gz/{community_name}/{community_name}.done"
diff --git a/mess/assembly_finder/envs/Assembly_finder.yml b/mess/assembly_finder/envs/Assembly_finder.yml
new file mode 100644
index 0000000..336fe17
--- /dev/null
+++ b/mess/assembly_finder/envs/Assembly_finder.yml
@@ -0,0 +1,9 @@
name: Assembly_finder
channels:
  - bioconda
  - conda-forge

dependencies:
  - biopython = 1.78
  - pandas = 1.2.2
  - ete3 = 3.1.2
diff --git a/mess/assembly_finder/envs/download.yml b/mess/assembly_finder/envs/download.yml
new file mode 100644
index 0000000..ace0629
--- /dev/null
+++ b/mess/assembly_finder/envs/download.yml
@@ -0,0 +1,5 @@
name: download
channels:
  - hcc
dependencies:
  - aspera-cli = 3.9.1
diff --git a/mess/assembly_finder/envs/singularity/aspera.Dockerfile b/mess/assembly_finder/envs/singularity/aspera.Dockerfile
new file mode 100644
index 0000000..09ece05
--- /dev/null
+++ b/mess/assembly_finder/envs/singularity/aspera.Dockerfile
@@ -0,0 +1,27 @@
FROM continuumio/miniconda3:4.7.12


################## METADATA ######################

LABEL base.image="miniconda3:4.7.12"
LABEL version="v.1.0"
LABEL software="aspera"
LABEL software.version="3.9.1"
LABEL description="IBM aspera CLI (https://downloads.asperasoft.com/en/documentation/62) but install from a Conda environment "
LABEL tags="Genomics"


################## INSTALLATION ######################
ENV DEBIAN_FRONTEND noninteractive

COPY ./envs/download.yml ./download.yml

RUN conda update conda && \
    conda env create -f download.yml && \
    conda clean --all --yes


RUN conda init bash
ENTRYPOINT ["/bin/bash"]
ENV PATH /opt/conda/envs/download/bin:$PATH
ENV CONDA_PREFIX "/opt/conda/envs/download"
diff --git a/mess/assembly_finder/envs/singularity/assembly_finder.Dockerfile b/mess/assembly_finder/envs/singularity/assembly_finder.Dockerfile
new file mode 100644
index 0000000..d6bbbde
--- /dev/null
+++ b/mess/assembly_finder/envs/singularity/assembly_finder.Dockerfile
@@ -0,0 +1,30 @@
FROM continuumio/miniconda3:4.7.12


################## METADATA ######################

LABEL base.image="miniconda3:4.7.12"
LABEL version="v.1.1"
LABEL software="assembly_finder"
LABEL tags="Genomics"

################## MAINTAINER ######################

MAINTAINER Valentin Scherz

################## INSTALLATION ######################
ENV DEBIAN_FRONTEND noninteractive

COPY ./envs/Assembly_finder.yml ./Assembly_finder.yml

RUN conda config --add channels defaults && \
    conda config --add channels bioconda && \
    conda config --add channels conda-forge && \
    conda update conda && \
    conda env create -f Assembly_finder.yml && \
    conda clean --all \
--yes + + +RUN conda init bash +ENTRYPOINT ["/bin/bash"] +ENV PATH /opt/conda/envs/Assembly_finder/bin:$PATH \ No newline at end of file diff --git a/mess/assembly_finder/rules/assembly_table.py b/mess/assembly_finder/rules/assembly_table.py new file mode 100644 index 0000000..d99bc76 --- /dev/null +++ b/mess/assembly_finder/rules/assembly_table.py @@ -0,0 +1,248 @@ +from Bio import Entrez +import pandas as pd +import warnings +import numpy as np +import logging +from ete3 import NCBITaxa +ncbi = NCBITaxa() + + +class AssemblyFinder: + def __init__(self, name, isassembly=False, genbank=False, refseq=True, representative=True, reference=True, + complete=True, + exclude_metagenomes=True, nb=1, rank_to_select='None', outf='f.tsv', outnf='nf.tsv'): + self.name = name + self.assembly = isassembly + self.genbank = genbank + self.refseq = refseq + self.representative = representative + self.reference = reference + self.complete = complete + self.exclude_metagenomes = exclude_metagenomes + self.target_ranks = ['strain', 'species', 'genus', 'family', 'order', 'class', 'phylum', 'superkingdom'] + self.nchunks = 10000 + self.rank_to_select = rank_to_select + self.nb = nb + self.outf = outf + self.outnf = outnf + logging.basicConfig(format='%(asctime)s %(levelname)s %(message)s', datefmt='%d %b %Y %H:%M:%S', + filename=snakemake.log[0], level=logging.DEBUG) + + # Static methods to apply functions on assembly summary table + def get_stat(self, meta, stat): + """ + function to extract assembly Meta stats (contig count, assembly length) + """ + return meta.split(f' ')[1].split('')[0] + + def get_names(self, gbftp): + """ + function to extract assembly file names + """ + return gbftp.split('/')[-1] + + def get_lin_tax(self, lineages): + """ + function to get lineages from a list of taxids + """ + ranks = ncbi.get_rank(lineages).values() + ranknames = ncbi.get_taxid_translator(lineages).values() + return dict(zip(ranks, ranknames)) + + def replace_nans(self, tb): + """ + function 
to replace unknown taxonomic rank with placeholder names + """ + tb = tb.replace(np.nan, 'unknown') + for i in range(len(tb)): + for n, col in enumerate(tb.columns): + if tb.iloc[i, n] == 'unknown' and col != 'superkingdom': + tmpname = tb.iloc[i, n - 1] + '_' + col[0] + if col == 'species': + tmpname = tb.iloc[i, n - 1] + '_' + col[0:1] + tb.iloc[i, n] = tmpname + return tb + + def chunks(self, ls, n): + """ + function to split assembly list into chunks + """ + return [ls[i:i + n] for i in range(0, len(ls), n)] + + def taxid_find(self): + """ + Function to parse input_list to convert scientific names to taxid + returns dictionary with taxid found + """ + logging.info(f'> Searching for taxIDs {self.name} ...') + try: + int(self.name) + logging.info('Query is a taxID') + taxid = self.name + + except ValueError: + logging.warning('Query is not a taxID, enter taxID to be more precise') + logging.info(f'Search term: {self.name}[all Names]') + taxid_list = Entrez.read(Entrez.esearch(db='taxonomy', term=f'{self.name}[all Names]', retmax=100))[ + 'IdList'] + if len(taxid_list) == 1: + taxid = taxid_list[0] + logging.info(f'TaxID:{taxid} found') + if len(taxid_list) > 1: + taxid = taxid_list[0] + logging.warning(f'{len(taxid_list)} TaxIDs found, change query (taking first one : {taxid})') + if len(taxid_list) == 0: + raise Exception('TaxID not found! 
Change search term!') + return taxid + + def search_assemblies(self): + if self.assembly: # If the input is an assembly name or Gbuid use it as a search term + search_term = f'{self.name}' + else: # If not, search ncbi taxonomy for the taxid + taxid = self.taxid_find() + search_term = f'txid{taxid}[Organism:exp] ' + if self.refseq and not self.genbank: + search_term += 'AND ("latest refseq"[filter] ' + if self.genbank and not self.refseq: + search_term += 'AND ("latest genbank"[filter] ' + if self.genbank and self.refseq: + search_term += 'AND (latest[filter] ' + if self.complete and not self.representative and not self.reference: + search_term += 'AND "complete genome"[filter] ' + if self.complete and self.representative and not self.reference: + search_term += 'AND "complete genome"[filter] OR "representative genome"[filter] ' + if self.complete and self.representative and self.reference: + search_term += 'AND "complete genome"[filter] OR "representative genome"[filter] OR ' \ + '"reference genome"[filter] ' + if self.representative and not self.reference: + search_term += 'AND "representative genome"[filter] ' + if self.reference and not self.representative: + search_term += 'AND "reference genome"[filter] ' + if self.representative and self.reference: + search_term += 'AND "representative genome"[filter] OR "reference genome"[filter] ' + if self.exclude_metagenomes: + search_term += 'AND all[filter] NOT metagenome[filter])' + assembly_ids = Entrez.read(Entrez.esearch(db='assembly', term=search_term, retmax=500000))['IdList'] + logging.info(f'> Search term: {search_term}') + logging.info(f'found {len(assembly_ids)} assemblies') + if not assembly_ids: + raise Exception('No assemblies found ! 
Change search term!') + return assembly_ids + + def generate_assembly_table(self, assemblies): + assembly_list = ','.join(assemblies) + assembly_summary = Entrez.read(Entrez.esummary(db='assembly', id=assembly_list), validate=False) + summaries = assembly_summary['DocumentSummarySet']['DocumentSummary'] + tb = pd.DataFrame.from_records(summaries) + columns = ['GbUid', 'RefSeq_category', 'AssemblyStatus', 'FtpPath_GenBank', 'FtpPath_RefSeq', 'Meta', + 'AsmReleaseDate_GenBank', 'ContigN50', 'ScaffoldN50', 'Coverage', 'Taxid'] + subset = tb[columns] + lens = subset.apply(lambda x: self.get_stat(x['Meta'], stat='total_length'), axis=1) + contigs = subset.apply(lambda x: self.get_stat(x['Meta'], stat='contig_count'), axis=1) + subset.insert(loc=subset.shape[1] - 1, value=lens, column='Assembly_length') + subset.insert(loc=subset.shape[1] - 1, value=contigs, column='Contig_count') + subset.insert(loc=1, value=subset['FtpPath_GenBank'].apply(self.get_names), column='AssemblyNames') + subset = subset.rename(columns={'Coverage': 'Assembly_coverage'}) + subset = subset.drop('Meta', axis=1) + return subset + + def add_lineage(self, assembly_tb): + unique_taxids = list(set(assembly_tb['Taxid'])) + taxid2lineage = ncbi.get_lineage_translator(unique_taxids) + tax = {taxid: self.get_lin_tax(lineage) for taxid, lineage in taxid2lineage.items()} + lineage_tb = pd.DataFrame.from_dict(tax, orient='index') + lineage_tb.index.set_names('Taxid', inplace=True) + lineage_tb.reset_index(inplace=True) + ordered_ranks = self.target_ranks[::-1] + ordered_ranks.append('Taxid') + lin_cols = list(lineage_tb.columns) + all_cols = list(set().union(lin_cols, ordered_ranks)) + lineage_tb = lineage_tb.reindex(columns=all_cols, fill_value=np.nan) + lineage_tb = lineage_tb[ordered_ranks] + lineage_tb = self.replace_nans(lineage_tb) + lineage_tb = lineage_tb.astype({'Taxid': 'string'}) + merged_table = assembly_tb.merge(lineage_tb, on='Taxid') + return merged_table + + def select_assemblies(self, 
table): + fact_table = table.replace({'RefSeq_category': {'reference genome': 0, 'representative genome': 1, 'na': 6}, + 'AssemblyStatus': {'Complete Genome': 2, 'Chromosome': 3, 'Scaffold': 4, + 'Contig': 5, 'na': 6}}) + sorted_table = fact_table.sort_values(['RefSeq_category', 'AssemblyStatus', 'Contig_count', + 'ScaffoldN50', 'ContigN50', 'AsmReleaseDate_GenBank'], + ascending=[True, True, True, False, False, False]) + if self.rank_to_select != 'None': + logging.info(f'Filtering according to {self.rank_to_select}, Refseq categories, assembly status, ' + f'contig count and release date') + select_index = [] + unique_list = list(set(sorted_table[self.rank_to_select])) + if len(unique_list) > 1: + for i in unique_list: + select_index.append(sorted_table[sorted_table[self.rank_to_select] == i].sample(1).index[0]) + # randomly select one assembly ID for each unique selected rank (species for example) + sorted_table = sorted_table.loc[select_index, :] + if len(unique_list) == 1: + logging.info(f'Same {self.rank_to_select} for all assemblies, Filtering according to Refseq ' + f'categories, assembly status,contig count and release date') + if len(unique_list) == 0: + logging.error(f'{self.rank_to_select} is not a target rank') + else: + logging.info('No taxonomic rank specified, sorting according to Refseq category, ' + 'assembly status, contig count and release date') + if len(sorted_table) >= self.nb: + logging.info(f'Selecting {self.nb} sorted assemblies out of {len(sorted_table)}') + sorted_table = sorted_table[0:self.nb] + if len(sorted_table) < self.nb: + logging.warning(f'Found less than {self.nb} assemblies in total, returning {len(sorted_table)} instead') + sorted_table = sorted_table.replace({'RefSeq_category': {0: 'reference genome', 1: 'representative genome', + 6: 'na'}, + 'AssemblyStatus': {2: 'Complete Genome', 3: 'Chromosome', 4: 'Scaffold', + 5: 'Contig', 6: 'na'}}) + return sorted_table + + def run(self): + assemblies_found = self.search_assemblies() + 
if len(assemblies_found) > self.nchunks: + warnings.warn(f'{len(assemblies_found)} assemblies found, restrict search term to find less assemblies') + assemblies_chunks = self.chunks(assemblies_found, + self.nchunks) # Divide assembly lists by chunks of 10000 + logging.info(f'Parsing assemblies by chucks of {self.nchunks}') + table_chunks = [] + for n, chunk in enumerate(assemblies_chunks): + logging.info(f'chunk n°{n}') + assembly_tb = self.generate_assembly_table(chunk) + tb = self.add_lineage(assembly_tb) + table_chunks.append(tb) + non_filtered_tb = pd.concat(table_chunks, sort=False) + else: + assembly_tb = self.generate_assembly_table(assemblies_found) + non_filtered_tb = self.add_lineage(assembly_tb) + + non_filtered_tb.to_csv(self.outnf, sep='\t', index=None) + filtered_tb = self.select_assemblies(non_filtered_tb) + filtered_tb.to_csv(self.outf, sep='\t', index=None) + return filtered_tb + + +''' +Main +''' +Entrez.email = snakemake.params['ncbi_email'] +Entrez.api_key = snakemake.params['ncbi_key'] +comp = snakemake.params['comp'] +ref = snakemake.params['ref'] +rep = snakemake.params['rep'] +met = snakemake.params['met'] +gb = snakemake.params['gb'] +rs = snakemake.params['rs'] +entry = snakemake.wildcards.entry +assembly = snakemake.params['assembly'] +column = snakemake.params['column'] +rank = snakemake.params['rank_filter'] +intb = pd.read_csv(snakemake.input[0], sep='\t', dtype={f'{column}': 'str'}) +intb.set_index(f'{column}', inplace=True) +nb = int(intb.loc[entry]['nb_genomes']) +find_assemblies = AssemblyFinder(name=entry, isassembly=assembly, genbank=gb, refseq=rs, representative=rep, + reference=ref, complete=comp, exclude_metagenomes=met, nb=nb, rank_to_select=rank, + outnf=snakemake.output.all, outf=snakemake.output.filtered) +find_assemblies.run() diff --git a/mess/assembly_finder/rules/combine_tables.py b/mess/assembly_finder/rules/combine_tables.py new file mode 100644 index 0000000..2418704 --- /dev/null +++ 
b/mess/assembly_finder/rules/combine_tables.py @@ -0,0 +1,13 @@ +import pandas as pd +""" +Main +""" +column = snakemake.params['column'] +df_list = [] +for file in snakemake.input: + entry = file.split('/')[1].split('-filtered.tsv')[0] + tb = pd.read_csv(file, sep='\t') + tb.insert(loc=0, column=f'{column}', value=[entry]*len(tb)) + df_list.append(tb) +df = pd.concat(df_list, sort=False) +df.to_csv(snakemake.output[0], sep='\t', index=None) diff --git a/mess/assembly_finder/rules/concat-ftp-links.py b/mess/assembly_finder/rules/concat-ftp-links.py new file mode 100644 index 0000000..7b88b0e --- /dev/null +++ b/mess/assembly_finder/rules/concat-ftp-links.py @@ -0,0 +1,15 @@ +import pandas as pd +""" +Main +""" +ftplinks = pd.read_csv(snakemake.input[0], sep='\t')['FtpPath_GenBank'] +links = [] +for link in ftplinks: + link = link.replace('ftp://ftp.ncbi.nlm.nih.gov', '') + fna = '/' + link.split('/')[-1]+'_genomic.fna.gz\n' + link += fna + links.append(link) + +f = open(snakemake.output[0], "w") +f.writelines(links) +f.close() diff --git a/mess/assembly_finder/rules/find_assemblies.rules b/mess/assembly_finder/rules/find_assemblies.rules new file mode 100644 index 0000000..26ccad1 --- /dev/null +++ b/mess/assembly_finder/rules/find_assemblies.rules @@ -0,0 +1,95 @@ +import pandas as pd +community_name=config['community_name'] +try: + entries = list(pd.read_csv(config['input_table_path'],delimiter='\t')['TaxonomyInput']) + isassembly = False + col = 'TaxonomyInput' +except KeyError: + entries = list(pd.read_csv(config['input_table_path'],delimiter='\t')['AssemblyInput']) + isassembly = True + col = 'AssemblyInput' + +rule check_for_update_ete3: + conda: '../envs/Assembly_finder.yml' + + container: "docker://metagenlab/assemblyfinder:v.1.1" + + output: temp('ete3-update.txt') + + log: 'logs/ete3/update.log' + + script: 'update-ete3.py' + +rule get_assembly_tables: + conda: '../envs/Assembly_finder.yml' + + container: "docker://metagenlab/assemblyfinder:v.1.1" + + 
input: config["input_table_path"], + 'ete3-update.txt' + + output: all='tables/{entry}-all.tsv', + filtered='tables/{entry}-filtered.tsv' + + params: ncbi_key = config['NCBI_key'], ncbi_email = config['NCBI_email'], + comp=config['complete_assemblies'], ref=config['reference_assemblies'], + rep=config['representative_assemblies'], met=config['exclude_from_metagenomes'], + gb=config['Genbank_assemblies'], rs=config['Refseq_assemblies'], rank_filter=config['Rank_to_filter_by'], + assembly = isassembly, column = col + + resources: ncbi_requests=1 + + log: 'logs/find-assemblies/{entry}.log' + + benchmark: "benchmark/find-assemblies/{entry}.txt" + + script: 'assembly_table.py' + + +rule combine_assembly_tables: + conda: '../envs/Assembly_finder.yml' + + container: "docker://metagenlab/assemblyfinder:v.1.1" + + input: expand('tables/{entry}-filtered.tsv',entry=entries) + + output: f'{community_name}-assemblies-summary.tsv' + + params: column=col + + script: 'combine_tables.py' + + +rule get_ftp_links_list: + conda: '../envs/Assembly_finder.yml' + + container: "docker://metagenlab/assemblyfinder:v.1.1" + + input: f'{community_name}-assemblies-summary.tsv' + + output: temp(f"assembly_gz/{community_name}/{community_name}-ftp-links.txt") + + script: "concat-ftp-links.py" + + +checkpoint download_assemblies: + conda: "../envs/download.yml" + + container: "docker://metagenlab/aspera-cli-conda:v.1.0" + + input: f"assembly_gz/{community_name}/{community_name}-ftp-links.txt" + + output: f"assembly_gz/{community_name}/{community_name}.done" + + log: f"logs/downloads/{community_name}.log" + + benchmark: f"benchmark/downloads/{community_name}.txt" + + params: f"assembly_gz/{community_name}" + + shell: + """ + ascp -T -k 1 -i ${{CONDA_PREFIX}}/etc/asperaweb_id_dsa.openssh --mode=recv --user=anonftp \ + --host=ftp.ncbi.nlm.nih.gov --file-list={input} {params} &>> {log} + touch {output} + """ \ No newline at end of file diff --git a/mess/assembly_finder/rules/update-ete3.py 
b/mess/assembly_finder/rules/update-ete3.py new file mode 100644 index 0000000..11bce5b --- /dev/null +++ b/mess/assembly_finder/rules/update-ete3.py @@ -0,0 +1,24 @@ +import sys +import os +from datetime import datetime +from ete3 import NCBITaxa +with open(snakemake.log[0], "w") as f: + sys.stderr = sys.stdout = f + ncbi = NCBITaxa() + sqldb = os.path.join(os.environ['HOME'], '.etetoolkit', 'taxa.sqlite') # path to ete sql db + db_modification_time = datetime.fromtimestamp(os.path.getctime(sqldb)) + database_age_days = abs((db_modification_time-datetime.now()).days) + if database_age_days >= 10: + ncbi.update_taxonomy_database() + comment = f'taxa.sqlite is more than {database_age_days} days old, updating database' + else: + comment = 'taxa.sqlite is up to date' + file = open(snakemake.output[0], 'w') + file.write(comment) + file.close() + + + + + +