Skip to content

Commit

Permalink
retrieve tool reference from yaml file
Browse files Browse the repository at this point in the history
  • Loading branch information
Tianhao-Gu committed Oct 23, 2023
1 parent adb032b commit 6f7d272
Show file tree
Hide file tree
Showing 4 changed files with 60 additions and 17 deletions.
7 changes: 6 additions & 1 deletion src/loaders/compute_tools/checkm2/versions.yaml
Original file line number Diff line number Diff line change
@@ -1,7 +1,12 @@
# reference_db_path: DIAMOND database CheckM2 relies on
# download data following the instructions provided on https://github.com/chklovski/CheckM2#database

versions:
- version: 0.1.0
date: 2023-07-19
reference_db_path: /global/cfs/cdirs/kbase/collections/libraries/CheckM2_database
- version: 0.1.1
date: 2023-08-04
notes: |
- install jsonlines, pandas to support parsing of CheckM2 output
- install jsonlines, pandas to support parsing of CheckM2 output
reference_db_path: /global/cfs/cdirs/kbase/collections/libraries/CheckM2_database
8 changes: 7 additions & 1 deletion src/loaders/compute_tools/gtdb_tk/versions.yaml
Original file line number Diff line number Diff line change
@@ -1,7 +1,13 @@
# reference_db_path: directory containing the unarchived GTDB-Tk reference data
# download data following the instructions provided on
# https://ecogenomics.github.io/GTDBTk/installing/index.html#gtdb-tk-reference-data

versions:
- version: 0.1.0
date: 2023-07-19
reference_db_path: /global/cfs/cdirs/kbase/collections/libraries/gtdb_tk/release207_v2
- version: 0.1.1
date: 2023-08-03
notes: |
- install jsonlines, pandas to support parsing of GTDB-Tk output
- install jsonlines, pandas to support parsing of GTDB-Tk output
reference_db_path: /global/cfs/cdirs/kbase/collections/libraries/gtdb_tk/release207_v2
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,32 @@ def extract_latest_version(file_path: str) -> str:
return latest_version['version']


def extract_latest_reference_db_path(file_path: str) -> str:
    """
    Look up the reference database path recorded for the latest version in a versions YAML file.

    The latest version is whatever extract_latest_version reports for the same file. The entry
    under the top-level 'versions' key whose 'version' equals it is located, and that entry's
    'reference_db_path' value is returned.

    Args:
        file_path (str): The path to the YAML file.

    Returns:
        str: The 'reference_db_path' value of the latest version entry.
        None: If that entry has no 'reference_db_path' key.

    Raises:
        ValueError: If no entry matches the version reported by extract_latest_version.
    """
    newest_version = extract_latest_version(file_path)

    with open(file_path, 'r') as yaml_file:
        version_entries = yaml.safe_load(yaml_file)['versions']

    # Keep the first entry whose version matches; later duplicates are ignored.
    matching_entry = None
    for entry in version_entries:
        if entry['version'] == newest_version:
            matching_entry = entry
            break

    if not matching_entry:
        raise ValueError(f'Unable to find version: {newest_version}')

    return matching_entry.get('reference_db_path')


def main():
if len(sys.argv) < 2:
print("Please provide the file path as an argument.")
Expand Down
36 changes: 21 additions & 15 deletions src/loaders/jobs/taskfarmer/task_generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
import src.loaders.jobs.taskfarmer.taskfarmer_common as tf_common
from src.loaders.common import loader_common_names
from src.loaders.common.loader_helper import make_collection_source_dir
from src.loaders.compute_tools.tool_version import extract_latest_version
from src.loaders.compute_tools.tool_yaml_reader import extract_latest_version, extract_latest_reference_db_path
from src.loaders.jobs.taskfarmer.taskfarmer_task_mgr import TFTaskManager, PreconditionError

'''
Expand Down Expand Up @@ -64,21 +64,27 @@
# File name of the per-tool version manifest; it is joined under <COMPUTE_TOOLS_DIR>/<tool>/
# when the latest reference database path is looked up.
VERSION_FILE = 'versions.yaml'
# Directory that holds one sub-directory per compute tool.
COMPUTE_TOOLS_DIR = '../../compute_tools' # relative to task_generator.py

# TODO GTDB update readme to specify to get correct version of data, not just latest
# TODO REPRODUCIBILITY need to version the databases and use the correct version with the
# correct tool version
# directory containing the unarchived GTDB-Tk reference data
# download data following the instructions provided on
# https://ecogenomics.github.io/GTDBTk/installing/index.html#gtdb-tk-reference-data
GTDBTK_DATA_PATH = '/global/cfs/cdirs/kbase/collections/libraries/gtdb_tk/release207_v2'
# volume name for the Docker containers, keyed by tool name.
# _retrieve_tool_volume uses the value as the target side of the volume mapping
# ({reference database path: volume name}); presumably this is the mount point inside the
# tool's container - confirm against the code that consumes the mapping.
# Tools missing from this dict are treated as needing no reference database.
TOOL_IMG_VOLUME_NAME = {'checkm2': '/CheckM2_database',
'gtdb_tk': '/gtdbtk_reference_data'}

# DIAMOND database CheckM2 relies on
# download data following the instructions provided on https://github.com/chklovski/CheckM2#database
CHECKM2_DB = '/global/cfs/cdirs/kbase/collections/libraries/CheckM2_database'

# volume mapping for the Docker containers
TOOL_VOLUME_MAP = {'checkm2': {CHECKM2_DB: '/CheckM2_database'},
'gtdb_tk': {GTDBTK_DATA_PATH: '/gtdbtk_reference_data'}}
def _retrieve_tool_volume(tool):
    """
    Retrieve the volume mapping for the specified tool.

    Args:
        tool: Name of the compute tool. It is also the name of the tool's sub-directory
            under COMPUTE_TOOLS_DIR, where its versions file is looked up.

    Returns:
        dict: {reference database path: TOOL_IMG_VOLUME_NAME[tool]} when the tool is listed in
        TOOL_IMG_VOLUME_NAME; an empty dict otherwise.

    Raises:
        ValueError: If the tool is listed in TOOL_IMG_VOLUME_NAME but the latest version in its
            versions file has no reference database path.
    """
    if tool not in TOOL_IMG_VOLUME_NAME:
        # No reference database path needed for the tool (microtrait, mash), so its versions
        # file is not located or parsed at all: a problem with that file must not block a
        # tool that mounts no volume.
        return {}

    current_dir = os.path.dirname(os.path.abspath(__file__))
    version_file = os.path.join(current_dir, COMPUTE_TOOLS_DIR, tool, VERSION_FILE)
    ref_db_path = extract_latest_reference_db_path(version_file)

    if not ref_db_path:
        raise ValueError(f'No reference database path found for tool {tool}.')

    return {ref_db_path: TOOL_IMG_VOLUME_NAME[tool]}


def _pull_image(image_str, job_dir):
Expand Down Expand Up @@ -212,7 +218,7 @@ def _create_task_list(
chunk_size = TASK_META.get(tool, TASK_META['default'])['chunk_size']
genome_ids_chunks = [genome_ids[i: i + chunk_size] for i in range(0, len(genome_ids), chunk_size)]

vol_mounts = TOOL_VOLUME_MAP.get(tool, {})
vol_mounts = _retrieve_tool_volume(tool)

task_list = '#!/usr/bin/env bash\n'
for idx, genome_ids_chunk in enumerate(genome_ids_chunks):
Expand Down

0 comments on commit 6f7d272

Please sign in to comment.