diff --git a/.github/workflows/build-push-busco-image.yml b/.github/workflows/build-push-busco-image.yml
new file mode 100644
index 00000000..241bce58
--- /dev/null
+++ b/.github/workflows/build-push-busco-image.yml
@@ -0,0 +1,31 @@
+name: Build & Push BUSCO Image to GHCR
+
+on:
+  pull_request:
+    types:
+      - opened
+      - reopened
+      - synchronize
+      - ready_for_review
+    paths:
+      - 'src/loaders/compute_tools/busco/versions.yaml'
+      - '.github/workflows/build-push-busco-image.yml'
+      - '.github/workflows/build-push-tool-images.yml'
+
+  push:
+    branches:
+      - main
+      - master
+      - develop
+    paths:
+      - 'src/loaders/compute_tools/busco/versions.yaml'
+      - '.github/workflows/build-push-busco-image.yml'
+      - '.github/workflows/build-push-tool-images.yml'
+
+jobs:
+  trigger-build-push:
+    uses: ./.github/workflows/build-push-tool-images.yml
+    with:
+      tool_name: busco
+      version_file: 'src/loaders/compute_tools/busco/versions.yaml'
+    secrets: inherit
\ No newline at end of file
diff --git a/RELEASE_NOTES.md b/RELEASE_NOTES.md
index 920d0d7a..2891730f 100644
--- a/RELEASE_NOTES.md
+++ b/RELEASE_NOTES.md
@@ -2,7 +2,7 @@
 
 ## 0.1.3
 
-* Added BBMap tool to the CDM pipeline.
+* Added BBMap and BUSCO tools to the CDM pipeline.
 * Included metadata file generation after each tool's execution.
 * Updated Python library dependencies to the latest versions.
 * Standardized thread management logic across all tools.
diff --git a/src/loaders/compute_tools/busco/Dockerfile b/src/loaders/compute_tools/busco/Dockerfile
new file mode 100644
index 00000000..33f4851b
--- /dev/null
+++ b/src/loaders/compute_tools/busco/Dockerfile
@@ -0,0 +1,35 @@
+FROM continuumio/miniconda3:24.5.0-0
+
+# NOTE: If the tool version changes, update the tool version recorded in the metadata saved by the _run_busco_single method
+ARG BUSCO_VER=5.7.1
+ENV CONDA_ENV=busco-$BUSCO_VER
+
+# Add Bioconda and Conda-Forge channels
+RUN conda config --add channels bioconda
+RUN conda config --add channels conda-forge
+
+# Install BUSCO
+# Certain dependencies (e.g., dendropy, sepp) are only compatible with Python versions up to 3.9.
+ARG PYTHON_VER=3.9
+RUN conda create -n $CONDA_ENV python=$PYTHON_VER
+RUN conda install -n $CONDA_ENV pandas=2.2.2 jsonlines=4.0.0 mamba=1.5.8 pyyaml=6.0.1
+# The BUSCO team suggests using mamba to speed up the installation process:
+# https://busco.ezlab.org/busco_userguide.html#installation-with-conda
+RUN conda run -n $CONDA_ENV mamba install -c bioconda -c conda-forge -y busco=$BUSCO_VER
+
+# Activate the environment
+RUN echo "source activate $CONDA_ENV" >> ~/.bashrc
+
+# Set up directories
+RUN mkdir -p /app
+COPY ./ /app/collections
+RUN rm -r /app/collections/.git
+
+ENV PYTHONPATH=/app/collections
+WORKDIR /app
+
+ENV PY_SCRIPT=/app/collections/src/loaders/compute_tools/busco/busco.py
+
+RUN chmod -R 777 /app/collections
+
+ENTRYPOINT ["/app/collections/src/loaders/compute_tools/entrypoint.sh"]
diff --git a/src/loaders/compute_tools/busco/busco.py b/src/loaders/compute_tools/busco/busco.py
new file mode 100644
index 00000000..0558fa82
--- /dev/null
+++ b/src/loaders/compute_tools/busco/busco.py
@@ -0,0 +1,83 @@
+"""
+Run the BUSCO tool on a set of .fna files.
+
+This tool serves a purpose distinct from the collection tools; it is intended for CDM work.
+Therefore, the parser program is not compatible with data generated by this tool.
+
+"""
+import os
+import time
+from pathlib import Path
+
+from src.loaders.common.loader_common_names import TOOL_METADATA
+from src.loaders.compute_tools.tool_common import ToolRunner, run_command, create_tool_metadata
+from src.loaders.compute_tools.tool_version import extract_latest_reference_db_version
+
+
+def _run_busco_single(
+        tool_safe_data_id: str,
+        data_id: str,
+        source_file: Path,
+        output_dir: Path,
+        threads_per_tool_run: int,
+        debug: bool) -> None:
+    start = time.time()
+    print(f'Start executing BUSCO for {data_id}')
+
+    metadata_file = output_dir / TOOL_METADATA
+    if metadata_file.exists():
+        print(f"Skipping {source_file} as it has already been processed.")
+        return
+
+    current_dir = os.path.dirname(os.path.abspath(__file__))
+    version_file = os.path.join(current_dir, 'versions.yaml')
+    ref_db_version = extract_latest_reference_db_version(version_file)
+
+    # Please refer to https://docs.google.com/document/d/15yV-S41Iqe20F-I2MRLWdzJwVdr8QKUfZPw7oq8WvB0/edit#heading=h.elgudks5mtxu
+    # for more information on the BUSCO command options we are using here.
+    command = [
+        'busco',
+        '-i', str(source_file),
+        '-o', data_id,
+        '--out_path', str(output_dir),
+        '--datasets_version', ref_db_version,
+        '--download_path', '/reference_data',
+        '-c', str(threads_per_tool_run),
+        '--auto-lineage-prok',
+        '-m', 'genome',
+        '-f',
+        '--augustus',
+    ]
+
+    run_command(command, output_dir if debug else None)
+
+    end_time = time.time()
+    run_time = end_time - start
+    print(
+        f'Used {round(run_time / 60, 2)} minutes to execute BUSCO for {data_id}')
+
+    # Save run info to a metadata file in the output directory for parsing later
+    additional_metadata = {
+        'source_file': str(source_file),
+        'data_id': data_id,
+        "reference_db": {
+            "version": ref_db_version,
+        },
+    }
+    create_tool_metadata(
+        output_dir,
+        tool_name="busco",
+        version="5.7.1",
+        command=command,
+        run_time=round(run_time, 2),
+        batch_size=1,
+        additional_metadata=additional_metadata)
+
+
+def main():
+    runner = ToolRunner("busco")
+    runner.parallel_single_execution(_run_busco_single, unzip=True)
+
+
+if __name__ == "__main__":
+    main()
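Note on the metadata record: the create_tool_metadata call above bundles the run details into a per-genome metadata file for downstream parsing. The sketch below shows the approximate shape of that record; the field names mirror the call in busco.py, but the on-disk file name (the TOOL_METADATA constant) and the serialization format live in tool_common.py, which is not part of this diff, so treat those as assumptions.

# Approximate shape of the per-genome run record assembled above (illustrative;
# the real file name and format are defined in tool_common.py, not in this diff).
example_record = {
    "tool_name": "busco",
    "version": "5.7.1",  # must track BUSCO_VER in the Dockerfile
    "command": ["busco", "-i", "<source_file>", "-o", "<data_id>", "..."],
    "run_time": 93.42,   # seconds, rounded to two decimals
    "batch_size": 1,
    "additional_metadata": {
        "source_file": "/path/to/<data_id>.fna",
        "data_id": "<data_id>",
        "reference_db": {"version": "odb10"},
    },
}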
+ +""" +import os +import time +from pathlib import Path + +from src.loaders.common.loader_common_names import TOOL_METADATA +from src.loaders.compute_tools.tool_common import ToolRunner, run_command, create_tool_metadata +from src.loaders.compute_tools.tool_version import extract_latest_reference_db_version + + +def _run_busco_single( + tool_safe_data_id: str, + data_id: str, + source_file: Path, + output_dir: Path, + threads_per_tool_run: int, + debug: bool) -> None: + start = time.time() + print(f'Start executing BUSCO for {data_id}') + + metadata_file = output_dir / TOOL_METADATA + if metadata_file.exists(): + print(f"Skipping {source_file} as it has already been processed.") + return + + current_dir = os.path.dirname(os.path.abspath(__file__)) + version_file = os.path.join(current_dir, 'versions.yaml') + ref_db_version = extract_latest_reference_db_version(version_file) + + # Please refer to https://docs.google.com/document/d/15yV-S41Iqe20F-I2MRLWdzJwVdr8QKUfZPw7oq8WvB0/edit#heading=h.elgudks5mtxu + # for more information on the BUSCO command options we are using here. + command = [ + 'busco', + '-i', str(source_file), + '-o', data_id, + '--out_path', str(output_dir), + '--datasets_version', ref_db_version, + '--download_path', '/reference_data', + '-c', str(threads_per_tool_run), + '--auto-lineage-prok', + '-m', 'genome', + '-f', + '--augustus', + ] + + run_command(command, output_dir if debug else None) + + end_time = time.time() + run_time = end_time - start + print( + f'Used {round(run_time / 60, 2)} minutes to execute BUSCO for {data_id}') + + # Save run info to a metadata file in the output directory for parsing later + additional_metadata = { + 'source_file': str(source_file), + 'data_id': data_id, + "reference_db": { + "version": ref_db_version, + }, + } + create_tool_metadata( + output_dir, + tool_name="busco", + version="5.7.1", + command=command, + run_time=round(run_time, 2), + batch_size=1, + additional_metadata=additional_metadata) + + +def main(): + runner = ToolRunner("busco") + runner.parallel_single_execution(_run_busco_single, unzip=True) + + +if __name__ == "__main__": + main() diff --git a/src/loaders/compute_tools/busco/versions.yaml b/src/loaders/compute_tools/busco/versions.yaml new file mode 100644 index 00000000..400bbdca --- /dev/null +++ b/src/loaders/compute_tools/busco/versions.yaml @@ -0,0 +1,9 @@ +# This tool serves a distinct purpose separate from collection tools; instead, it is suited for CDM work. +# Therefore, the parser program is not compatible with data generated by this tool. + +versions: + - version: 0.1.0 + date: 2024-08-22 + notes: | + - initial BUSCO implementation + reference_db_version: odb10 \ No newline at end of file diff --git a/src/loaders/jobs/taskfarmer/task_generator.py b/src/loaders/jobs/taskfarmer/task_generator.py index 35231696..df6a1926 100644 --- a/src/loaders/jobs/taskfarmer/task_generator.py +++ b/src/loaders/jobs/taskfarmer/task_generator.py @@ -38,10 +38,15 @@ --force Force overwrite of existing job directory --source_file_ext SOURCE_FILE_EXT Select files from source data directory that match the given extension. - + +TODO: The recommended approach by NERSC for running tasks with intensive I/O tools (most of our tools), is to utilize +the scratch directory. Before executing the task, source data and reference libraries should be copied to the scratch +directory. Soft links (such as for collection sources) should be created as needed. Once the task is complete, +the results should be copied back to the user's directory. 
diff --git a/src/loaders/jobs/taskfarmer/task_generator.py b/src/loaders/jobs/taskfarmer/task_generator.py
index 35231696..df6a1926 100644
--- a/src/loaders/jobs/taskfarmer/task_generator.py
+++ b/src/loaders/jobs/taskfarmer/task_generator.py
@@ -38,10 +38,15 @@
     --force               Force overwrite of existing job directory
     --source_file_ext SOURCE_FILE_EXT
                           Select files from source data directory that match the given extension.
-
+
+TODO: The approach recommended by NERSC for running tasks with I/O-intensive tools (most of our tools) is to use
+the scratch directory. Before executing the task, source data and reference libraries should be copied to the scratch
+directory. Soft links (such as for collection sources) should be created as needed. Once the task is complete,
+the results should be copied back to the user's directory. For more information, refer to the NERSC documentation:
+https://docs.nersc.gov/filesystems/perlmutter-scratch/
 '''
 
-TOOLS_AVAILABLE = ['gtdb_tk', 'checkm2', 'microtrait', 'mash', 'eggnog', 'bbmap']
+TOOLS_AVAILABLE = ['gtdb_tk', 'checkm2', 'microtrait', 'mash', 'eggnog', 'bbmap', 'busco']
 
 NODE_TIME_LIMIT_DEFAULT = 5  # hours
 # Used as THREADS variable in the batch script which controls the number of parallel tasks per node
@@ -56,16 +61,16 @@
 # for single genome tools, such as microtrait and mash, the chunk_size is the number of genomes to process in a
 # serial manner
 # exe_time is the estimated execution time for a single task (default is 60 minutes)
-# threads_per_tool_run is the number of threads to use for each tool execution (default is 32)
+# threads_per_tool_run is the number of threads to use for each tool execution (default is SYSTEM_CPU_CORES (256) / number of parallel tasks per node)
 # tasks_per_node is the number of parallel tasks to run on a node (default is 1)
 # node_time_limit is the time limit for the node we reserved for the task (default is 5 hours)
 # if no specific metadata is provided for a tool, the default values are used.
 TASK_META = {'gtdb_tk': {'chunk_size': 1000, 'exe_time': 65, 'tasks_per_node': 4, 'threads_per_tool_run': 32},
              'eggnog': {'chunk_size': 100, 'exe_time': 15, 'node_time_limit': 0.5},  # Memory intensive tool - reserve more nodes with less node reservation time
+             'busco': {'chunk_size': 50, 'exe_time': 90, 'node_time_limit': 1.5},  # ~1.5 minutes per genome with a single task per node on the user's drive. TODO: test multi-threading per node along with scratch execution, and adjust `tasks_per_node` accordingly.
              'default': {'chunk_size': 5000, 'exe_time': 60}}
 
 MAX_NODE_NUM = 100  # maximum number of nodes to use
-
 REGISTRY = 'ghcr.io/kbase/collections'
 VERSION_FILE = 'versions.yaml'
 COMPUTE_TOOLS_DIR = '../../compute_tools'  # relative to task_generator.py
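The comment block in the second hunk states the threads_per_tool_run default: SYSTEM_CPU_CORES (256) divided by the number of parallel tasks on the node, with an explicit per-tool value taking precedence. Below is a minimal sketch of that derivation, with TASK_META abridged from the hunk above; the helper itself is illustrative and not the repository's actual resolution code.

# Minimal sketch of the threads_per_tool_run default described above; TASK_META
# is abridged from the diff and the helper is illustrative only.
SYSTEM_CPU_CORES = 256

TASK_META = {
    'gtdb_tk': {'chunk_size': 1000, 'exe_time': 65, 'tasks_per_node': 4, 'threads_per_tool_run': 32},
    'busco': {'chunk_size': 50, 'exe_time': 90, 'node_time_limit': 1.5},
    'default': {'chunk_size': 5000, 'exe_time': 60},
}

def resolve_threads_per_tool_run(tool: str) -> int:
    meta = TASK_META.get(tool, TASK_META['default'])
    tasks_per_node = meta.get('tasks_per_node', 1)  # default: one task per node
    # An explicit threads_per_tool_run wins (gtdb_tk pins 32); otherwise split
    # the node's cores evenly across its parallel tasks.
    return meta.get('threads_per_tool_run', SYSTEM_CPU_CORES // tasks_per_node)

print(resolve_threads_per_tool_run('busco'))    # 256 -> one task gets the whole node
print(resolve_threads_per_tool_run('gtdb_tk'))  # 32  -> explicit per-tool setting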