diff --git a/src/cubi_tk/exceptions.py b/src/cubi_tk/exceptions.py
index 1e896ac..5595b92 100644
--- a/src/cubi_tk/exceptions.py
+++ b/src/cubi_tk/exceptions.py
@@ -54,4 +54,8 @@ class InvalidReadmeException(CubiTkException):
 
 
 class FileMd5MismatchException(CubiTkException):
-    """Raised if the recored md5 sum for a file does not match the (re)computed value."""
+    """Raised if the recorded md5 sum for a file does not match the (re)computed value."""
+
+
+class SodarAPIException(CubiTkException):
+    """Raised when the SODAR API does not return success."""
diff --git a/src/cubi_tk/sodar/__init__.py b/src/cubi_tk/sodar/__init__.py
index 5e01ffa..40ccb46 100644
--- a/src/cubi_tk/sodar/__init__.py
+++ b/src/cubi_tk/sodar/__init__.py
@@ -30,6 +30,9 @@
 ``add-ped``
     Download sample sheet, add rows from PED file, and re-upload.
 
+``update-samplesheet``
+    Directly update the ISA sample sheet (without intermediate files), based on a PED file and/or data given on the command line.
+
 ``pull-data-collection``
     Download data collection from iRODS.
 
@@ -67,6 +70,7 @@ from .lz_validate import setup_argparse as setup_argparse_lz_validate
 from .pull_data_collection import setup_argparse as setup_argparse_pull_data_collection
 from .pull_raw_data import setup_argparse as setup_argparse_pull_raw_data
+from .update_samplesheet import setup_argparse as setup_argparse_update_samplesheet
 from .upload_sheet import setup_argparse as setup_argparse_upload_sheet
 
@@ -77,6 +81,9 @@ def setup_argparse(parser: argparse.ArgumentParser) -> None:
     setup_argparse_add_ped(
         subparsers.add_parser("add-ped", help="Augment sample sheet from PED file")
     )
+    setup_argparse_update_samplesheet(
+        subparsers.add_parser("update-samplesheet", help="Update sample sheet")
+    )
     setup_argparse_download_sheet(subparsers.add_parser("download-sheet", help="Download ISA-tab"))
     setup_argparse_upload_sheet(
         subparsers.add_parser("upload-sheet", help="Upload and replace ISA-tab")
diff --git a/src/cubi_tk/sodar/update_samplesheet.py b/src/cubi_tk/sodar/update_samplesheet.py
new file mode 100644
index 0000000..816d01a
--- /dev/null
+++ b/src/cubi_tk/sodar/update_samplesheet.py
@@ -0,0 +1,635 @@
+"""``cubi-tk sodar update-samplesheet``: update ISA-tab with new values and/or entries."""
+
+import argparse
+from collections import defaultdict
+from io import StringIO
+import re
+from typing import Iterable, Optional
+
+from logzero import logger
+import pandas as pd
+from ruamel.yaml import YAML
+
+from ..exceptions import ParameterException
+from ..parse_ped import parse_ped
+from ..sodar_api import SodarAPI
+
+REQUIRED_COLUMNS = ["Source Name", "Sample Name", "Extract Name"]
+REQUIRED_IF_EXISTING_COLUMNS = ["Library Name"]
+ISA_NON_SETTABLE = ["Term Source REF", "Term Accession Number", "Protocol REF"]
+
+# Type definition:
+IsaColumnDetails = dict[str, list[tuple[str, str]]]
+# dict of short or long column names to list of tuples of (original column name, table name)
+# long names include the column type, i.e. "Characteristics[organism]"
+# short names are without the type, i.e. "organism"
+# original column names are the names as they are returned by pandas.read_csv, this
+# may include numerical suffixes for duplicate column names (e.g. "Extract Name.1")
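+#
+# Illustrative sketch (column names are examples only) of such a mapping for a
+# sheet whose assay table contains a duplicated "Extract Name" column:
+#   {
+#       "organism": [("Characteristics[organism]", "study")],
+#       "Characteristics[organism]": [("Characteristics[organism]", "study")],
+#       "Extract Name": [("Extract Name", "assay"), ("Extract Name.1", "assay")],
+#   }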
"Extract Name.1") + + +sheet_default_config_yaml = """ +ped_defaults: &ped + family_id: Family-ID + name: Sample-ID + father_name: Paternal-ID + mother_name: Maternal-ID + sex: Sex + disease: Phenotype + +Modellvorhaben: &MV + sample_fields: + - Family-ID + - Analysis-ID + - Paternal-ID + - Maternal-ID + - Sex + - Phenotype + - Individual-ID + - Probe-ID + - Barcode + - Barcode-Name + field_column_mapping: + Family-ID: Family + Analysis-ID: Source Name + Paternal-ID: Father + Maternal-ID: Mother + Sex: Sex + Phenotype: Disease status + Individual-ID: Individual-ID + Probe-ID: Probe-ID + Barcode: Barcode sequence + Barcode-Name: Barcode name + dynamic_columns: + Sample Name: "{Analysis-ID}-N1" + Extract Name: "{Analysis-ID}-N1-DNA1" + Library Name: "{Analysis-ID}-N1-DNA1-WGS1" + ped_to_sampledata: + <<: *ped + name: Analysis-ID + +#alias +MV: + <<: *MV + +MV-barcodes: + <<: *MV + sample_fields: + - Individual-ID + - Probe-ID + - Analysis-ID + - Barcode + - Barcode-Name + +germline-sheet: + sample_fields: + - Family-ID + - Sample-ID + - Paternal-ID + - Maternal-ID + - Sex + - Phenotype + field_column_mapping: + Family-ID: Family + Sample-ID: Source Name + Paternal-ID: Father + Maternal-ID: Mother + Sex: Sex + Phenotype: "Disease status" + dynamic_columns: + Sample Name: "{Sample-ID}-N1" + Extract Name: "{Sample-ID}-N1-DNA1" + Library Name: "{Sample-ID}-N1-DNA1-WGS1" + ped_to_sampledata: + <<: *ped +""" + +# ruamel round-trip loader uses ordered dicts +yaml = YAML(typ="rt") +sheet_default_config = yaml.load(sheet_default_config_yaml) + + +def orig_col_name(col_name: str) -> str: + """Return the original column name without any suffixes.""" + # Suffixes are added to duplicate ISAtab column names by pandas.read_csv + return re.sub(r"\.[0-9]+$", "", col_name) + + +class UpdateSamplesheetCommand: + def __init__(self, args): + #: Command line arguments. + self.args = args + + @classmethod + def setup_argparse(cls, parser: argparse.ArgumentParser) -> None: + """Setup arguments for ``update-samplesheet`` command.""" + + SodarAPI.setup_argparse(parser) + + parser.add_argument( + "--hidden-cmd", dest="sodar_cmd", default=cls.run, help=argparse.SUPPRESS + ) + + sample_group = parser.add_argument_group( + "Sample Definitions", + ) + + sample_group.add_argument( + "-s", + "--sample-data", + nargs="+", + action="append", + help="Sample specific (meta)data to be added to the samplesheet. Each argument describes one sample. " + "The number of arguments needs to match the fields defined via `--defaults` or `--sample-fields`" + "Can be combined with `--ped` to give additional columns or additional samples. " + "After joining both sets all fields must have a defined value.", + ) + + sample_group.add_argument( + "-d", + "--defaults", + choices=sheet_default_config.keys(), + default="germline-sheet", + help="Choose a predefined default for field definitions (used by `--sample-data`) and `--ped-name-mapping` (used by `--ped`). " + "Defaults to 'germline-sheet'. " + "Field definitions are as follows:\n" + "\n".join( + [ + f"{default} ({len(settings['sample_fields'])} fields, ped-Name: {settings['ped_to_sampledata']['name']}):\n" + f"{', '.join(settings['sample_fields'])}" + for default, settings in sheet_default_config.items() + if default != "ped_defaults" + ] + ), + ) + sample_group.add_argument( + "--sample-fields", + nargs="+", + help="Manual definition of fields for `--sample-data`, overrides values set by `--sheet-type`. 
" + "The field names need to match column names in the samplesheet, overrides `--defaults`." + "If values are given as 'FieldName=ISAColumn', the Fields can be matched to ped columns, while also " + "being matched to a differently named column in the ISA samplesheet.", + ) + sample_group.add_argument( + "-p", + "--ped", + help="Ped file with sample data to be added to the samplesheet, the default ped columns are mapped to these " + "fields: Family-ID, Sample-ID(*), Paternal-ID, Maternal-ID, Sex, PhenoType. " + "This mapping can be changed via `--ped-field-mapping` or the `--defaults` option.\n" + "Can be extended by `--sample-data` to give additional columns or additional samples. " + "After joining both sets all fields must have a defined value.", + ) + sample_group.add_argument( + "--ped-field-mapping", + nargs=2, + action="append", + metavar=("PED_COLUMN", "SAMPLE_FIELD"), + help="Manually define how ped columns are mapped to sample data fields or ISA columns. " + "The following ped names columns should be used: " + ", ".join(sheet_default_config["ped_defaults"].keys()) + "\n" + "Overwrites value set for 'name' by `--defaults`. " + "SAMPLE_FIELD can also be a column name of the ISA samplesheet.", + ) + + sample_group.add_argument( + "--dynamic-column", + nargs=2, + action="append", + metavar=("COLUMN", "FORMAT_STR"), + help="Dyanmically fill columns in the ISA sheet based on other columns." + "Use this option if some columns with sample-sepcific Data can be derived from other columns." + "FORMAT_STR can contain other columns as placeholders, i.e.: '{Source Name}-N1' for a new Sample Name." + "Note: only columns from the ped/sampledata can be used as placeholders.", + ) + + parser.add_argument( + "-a", + "--metadata-all", + nargs=2, + action="append", + metavar=("COLUMN", "VALUE"), + help="Set metadata value for all samples added to the samplesheet. Specify column name and value. " + "Can be used multiple times.", + ) + + # FIXME: provide option to read ISA &/ tsv file (ignoring all other sample data & fields) + # group_samples.add_argument( + # "--tsv", help="Tabular file with sample data" + # ) + + parser.add_argument( + "--overwrite", + default=False, + action="store_true", + help="Allow overwriting of existing values for samples already defined in the samplesheet", + ) + + parser.add_argument( + "--no-autofill", + action="store_true", + help="Do not automatically fill values for non-specified metadata columns that have a single unique value " + "in the existing samplesheet. Note: ontology terms & references will never be autofilled.", + ) + + parser.add_argument( + "--snappy-compatible", + action="store_true", + help="Transform IDs so they are compatible with snappy processing " + "(replaces '-' with '_' in required ISA fields).", + ) + + @classmethod + def run( + cls, args, _parser: argparse.ArgumentParser, _subparser: argparse.ArgumentParser + ) -> Optional[int]: + """Entry point into the command.""" + return cls(args).execute() + + def execute(self) -> Optional[int]: + """Execute the command.""" + + if self.args.overwrite: + logger.warning( + "Existing values in the ISA samplesheet may get overwritten, there will be no checks or further warnings." 
+
+    @classmethod
+    def run(
+        cls, args, _parser: argparse.ArgumentParser, _subparser: argparse.ArgumentParser
+    ) -> Optional[int]:
+        """Entry point into the command."""
+        return cls(args).execute()
+
+    def execute(self) -> Optional[int]:
+        """Execute the command."""
+
+        if self.args.overwrite:
+            logger.warning(
+                "Existing values in the ISA samplesheet may get overwritten, there will be no checks or further warnings."
+            )
+
+        # Get samplesheet from SODAR API
+        sodar_api = SodarAPI(
+            sodar_url=self.args.sodar_url,
+            sodar_api_token=self.args.sodar_api_token,
+            project_uuid=self.args.project_uuid,
+        )
+        isa_data = sodar_api.get_ISA_samplesheet()
+        investigation = isa_data["investigation"]["content"]
+        study = pd.read_csv(StringIO(isa_data["study"]["content"]), sep="\t")
+        assay = pd.read_csv(StringIO(isa_data["assay"]["content"]), sep="\t")
+        isa_names = self.gather_ISA_column_names(study, assay)
+
+        # Check that given sample-data field names can be used
+        sample_fields_mapping = self.parse_sampledata_args(isa_names)
+
+        # Collect ped & sample data, check that they can be combined
+        samples = self.collect_sample_data(isa_names, sample_fields_mapping)
+
+        # add metadata values to samples
+        if self.args.metadata_all:
+            samples = samples.assign(**dict(self.args.metadata_all))
+
+        # Match sample data to ISA columns and get new study & assay dataframes
+        study_new, assay_new = self.match_sample_data_to_isa(
+            samples, isa_names, sample_fields_mapping
+        )
+        req_cols = set(REQUIRED_COLUMNS) | (
+            set(REQUIRED_IF_EXISTING_COLUMNS) & set(isa_names.keys())
+        )
+        # check the newly built tables (not the existing sheet) for the required columns
+        colset = set(study_new.columns.tolist() + assay_new.columns.tolist())
+        if not req_cols.issubset(colset):
+            missing_cols = req_cols - colset
+            raise ValueError(f"Missing required columns in sample data: {', '.join(missing_cols)}")
+        if self.args.snappy_compatible:
+            for col in study_new.columns:
+                if orig_col_name(col) in req_cols:
+                    study_new[col] = study_new[col].str.replace("-", "_")
+            for col in assay_new.columns:
+                if orig_col_name(col) in req_cols:
+                    assay_new[col] = assay_new[col].str.replace("-", "_")
+
+        # Update ISA tables with new data
+        study_final = self.update_isa_table(
+            study, study_new, self.args.overwrite, self.args.no_autofill
+        )
+        assay_final = self.update_isa_table(
+            assay, assay_new, self.args.overwrite, self.args.no_autofill
+        )
+
+        # Write new samplesheet to tsv strings, then upload via API
+        study_tsv = study_final.to_csv(
+            sep="\t", index=False, header=list(map(orig_col_name, study_final.columns))
+        )
+        assay_tsv = assay_final.to_csv(
+            sep="\t", index=False, header=list(map(orig_col_name, assay_final.columns))
+        )
+        ret = sodar_api.upload_ISA_samplesheet(
+            (isa_data["investigation"]["filename"], investigation),
+            (isa_data["study"]["filename"], study_tsv),
+            (isa_data["assay"]["filename"], assay_tsv),
+        )
+        return ret
+
+    def parse_sampledata_args(self, isa_names: IsaColumnDetails) -> dict[str, str]:
+        """Build a dict to collect and map the names for ped or sampledata [-s] fields to ISA column names."""
+
+        # Some samples are defined, either through ped or sample data
+        if not self.args.ped and not self.args.sample_data:
+            raise ValueError(
+                "No sample data provided. Please provide either a ped file (`--ped`) or sample data (`-s`)."
+            )
+
+        # Base field name mapping (also used by ped); copy it so the module-level
+        # defaults are not mutated by the updates below
+        sample_field_mapping = dict(sheet_default_config[self.args.defaults]["field_column_mapping"])
+
+        # Sample data, if given, needs to match the defined fields (from default or sample_fields)
+        if not self.args.sample_data:
+            return sample_field_mapping
+
+        if self.args.sample_fields:
+            sample_field_mapping.update(
+                {
+                    k: (v or k)
+                    for (k, _, v) in map(lambda s: s.partition("="), self.args.sample_fields)
+                }
+            )
+            n_fields = len(self.args.sample_fields)
+        else:
+            n_fields = len(sheet_default_config[self.args.defaults]["sample_fields"])
+
+        incorrect_n = [sample for sample in self.args.sample_data if len(sample) != n_fields]
+        if incorrect_n:
+            msg = (
+                f"The number of entries for some samples does not match the number of fields defined ({n_fields}):\n"
+                + "\n".join(
+                    [
+                        f"{len(sample)} instead of {n_fields} values: {', '.join(map(str, sample))}"
+                        for sample in incorrect_n
+                    ]
+                )
+            )
+            raise ValueError(msg)
+
+        unknown_fields = [
+            tup for tup in sample_field_mapping.items() if tup[1] not in isa_names.keys()
+        ]
+        if unknown_fields:
+            msg = (
+                "Some columns for sample field mapping are not defined in the ISA samplesheet: "
+                + ", ".join([f"{col} (from {field})" for field, col in unknown_fields])
+            )
+            raise NameError(msg)
+
+        return sample_field_mapping
+
+    def gather_ISA_column_names(self, study: pd.DataFrame, assay: pd.DataFrame) -> IsaColumnDetails:
+        isa_regex = re.compile(r"(Characteristics|Parameter Value|Comment)\[(.*?)]")
+        study_cols = study.columns.tolist()
+        assay_cols = assay.columns.tolist()
+
+        isa_short_names = list(
+            map(lambda x: orig_col_name(isa_regex.sub(r"\2", x)), study_cols + assay_cols)
+        )
+        isa_long_names = list(map(orig_col_name, study_cols + assay_cols))
+        isa_names_unique = study_cols + assay_cols
+        isa_table = ["study"] * len(study_cols) + ["assay"] * len(assay_cols)
+
+        out = defaultdict(list)
+        for short, long, uniq, table in zip(
+            isa_short_names, isa_long_names, isa_names_unique, isa_table, strict=True
+        ):
+            out[short].append((uniq, table))
+            if long != short:
+                # avoid duplicate entries for columns without a Characteristics/
+                # Parameter Value/Comment wrapper, where short and long name coincide
+                out[long].append((uniq, table))
+
+        for col in ISA_NON_SETTABLE:
+            if col in out:
+                del out[col]
+
+        # do NOT retain defaultdict class
+        return dict(out)
+
+    def get_dynamic_columns(
+        self,
+        existing_names: Iterable[str],
+        isa_names: Iterable[str],
+    ) -> dict[str, str]:
+        dynamic_cols = dict()
+        re_format_names = re.compile(r"\{(.*?)}")
+        if self.args.dynamic_column:
+            for col, format_str in self.args.dynamic_column:
+                missing_deps = (
+                    set(re_format_names.findall(format_str))
+                    - set(existing_names)
+                    - set(dynamic_cols)
+                )
+                if missing_deps:
+                    raise ValueError(
+                        f"Dynamic column '{col}' depends on non-existing columns: {', '.join(missing_deps)}"
+                    )
+                if col not in isa_names:
+                    raise ValueError(f"Column '{col}' is not defined in the ISA samplesheet.")
+                dynamic_cols[col] = format_str
+        elif sheet_default_config[self.args.defaults]["dynamic_columns"]:
+            # take a copy, so the deletion below cannot change the module-level defaults
+            dynamic_cols = dict(sheet_default_config[self.args.defaults]["dynamic_columns"])
+            # Hardcoded dep check for defaults: see if 'Library Name' is actually defined
+            if "Library Name" in dynamic_cols and "Library Name" not in isa_names:
+                logger.warning(
+                    'Skipping "Library Name" dynamic column, as it is not used in the ISA samplesheet.'
+                )
+                del dynamic_cols["Library Name"]
+
+        # #Sample Name needs to be set before other assay columns, so ensure it goes first
+        # if 'Sample Name' in dynamic_cols:
+        #     dynamic_cols.move_to_end('Sample Name', last=False)
+        return dynamic_cols
+
+    def collect_sample_data(
+        self,
+        isa_names: IsaColumnDetails,
+        sample_field_mapping: dict[str, str],
+    ) -> pd.DataFrame:
+        # copy the mapping, so command line arguments do not mutate the defaults
+        ped_mapping = dict(sheet_default_config[self.args.defaults]["ped_to_sampledata"])
+        if self.args.ped_field_mapping:
+            allowed_sample_col_values = sample_field_mapping.keys() | isa_names.keys()
+            for ped_col, sample_col in self.args.ped_field_mapping:
+                if ped_col not in ped_mapping:
+                    logger.warning(f"Ped column '{ped_col}' is unknown and will be ignored.")
+                    continue
+                if sample_col not in allowed_sample_col_values:
+                    raise ParameterException(
+                        f"'{sample_col}' is neither a known sample field nor a known column of the ISA samplesheet."
+                    )
+
+                ped_mapping[ped_col] = sample_col
+
+        if self.args.ped:
+            with open(self.args.ped, "rt") as inputf:
+                ped_dicts = map(
+                    lambda donor: {
+                        field: getattr(donor, attr_name) for attr_name, field in ped_mapping.items()
+                    },
+                    parse_ped(inputf),
+                )
+                ped_data = pd.DataFrame(ped_dicts)
+        else:
+            ped_data = pd.DataFrame()
+
+        if self.args.sample_data:
+            if self.args.sample_fields:
+                fields = self.args.sample_fields
+            else:
+                fields = sheet_default_config[self.args.defaults]["sample_fields"]
+            sample_data = pd.DataFrame(self.args.sample_data, columns=fields)
+            # FIXME: consistent conversion for Sex & Phenotype values? also empty parent values?
+            # ped outputs: male/female, unaffected/affected, 0
+        else:
+            sample_data = pd.DataFrame()
+
+        if self.args.ped and self.args.sample_data:
+            # Check for matching fields between ped and sample data, but keep original order
+            matching_fields = [col for col in ped_data.columns if col in sample_data.columns]
+            combined_fields = ped_data.columns.tolist()
+            combined_fields += [col for col in sample_data.columns if col not in combined_fields]
+            if not matching_fields:
+                raise ParameterException("No matching fields found between ped and sample data.")
+
+            # Combine the two sample sets, reorder based on ped & then sample data
+            samples = pd.merge_ordered(sample_data, ped_data, on=matching_fields)[combined_fields]
+            # Check that all values for all samples exist
+            if samples.isnull().values.any():
+                missing_data = samples.loc[samples.isnull().any(axis=1)]
+                raise ParameterException(
+                    "Combination of ped and sample data has missing values:\n"
+                    + missing_data.to_string(index=False, na_rep="")
+                )
+            # check that no different values are given for the same sample
+            if samples[ped_mapping["name"]].duplicated().any():
+                duplicated = samples.loc[samples[ped_mapping["name"]].duplicated(keep=False)]
+                raise ParameterException(
+                    "Sample with different values found in combination of ped and sample data:\n"
+                    + duplicated.to_string(index=False, na_rep="")
+                )
+        else:
+            samples = ped_data if self.args.ped else sample_data
+
+        dynamic_cols = self.get_dynamic_columns(samples.columns, isa_names)
+        for col_name, format_str in dynamic_cols.items():
+            # MAYBE: allow dynamic columns to change based on fill order?
+            # i.e. first Extract name = -DNA1, then -DNA1-WXS1
+            samples[col_name] = samples.apply(lambda row: format_str.format(**row), axis=1)
+
+        return samples
+
+    def match_sample_data_to_isa(
+        self,
+        samples: pd.DataFrame,
+        isa_names: IsaColumnDetails,
+        sample_field_mapping: dict[str, str],
+    ) -> tuple[pd.DataFrame, pd.DataFrame]:
+        """Take a df with sampledata and build study and assay dataframes with all corresponding ISA column names."""
+
+        # build new assay and study dataframes, with content from samples but names from isa_names
+        new_study_data = pd.DataFrame()
+        new_assay_data = pd.DataFrame()
+
+        failed_cols = []
+
+        for col_name in samples.columns:
+            if col_name in sample_field_mapping:
+                matched_cols = isa_names[sample_field_mapping[col_name]]
+            elif col_name in isa_names:
+                matched_cols = isa_names[col_name]
+            else:
+                failed_cols.append(col_name)
+                continue
+
+            for matched_isa_col, isa_table in matched_cols:
+                if matched_isa_col == "Sample Name":
+                    new_study_data[matched_isa_col] = samples[col_name]
+                    new_assay_data[matched_isa_col] = samples[col_name]
+                elif isa_table == "study":
+                    new_study_data[matched_isa_col] = samples[col_name]
+                else:
+                    new_assay_data[matched_isa_col] = samples[col_name]
+
+        if failed_cols:
+            raise ValueError(
+                f"Failed to match these column names/sample fields to ISA columns: {', '.join(failed_cols)}"
+            )
+
+        return new_study_data, new_assay_data
+
+    def update_isa_table(
+        self,
+        isa_table: pd.DataFrame,
+        update_table: pd.DataFrame,
+        overwrite: bool = False,
+        no_autofill: bool = False,
+    ):
+        if not all(update_table.columns.isin(isa_table.columns)):
+            raise ValueError(
+                "New ISA table has columns that are not present in the existing ISA table."
+            )
+
+        # reorder update_table columns to match isa_table
+        update_table = update_table[
+            [col for col in isa_table.columns if col in update_table.columns]
+        ]
+
+        # check for matches with existing data based on required cols (which should be the material cols)
+        # separate new data into updates (all mat_cols match) and additions
+        mat_cols = [
+            col
+            for col in isa_table.columns
+            if orig_col_name(col) in REQUIRED_COLUMNS + REQUIRED_IF_EXISTING_COLUMNS
+        ]
+        common_rows_update = (
+            update_table[mat_cols].isin(isa_table[mat_cols].to_dict(orient="list")).all(axis=1)
+        )
+        common_rows_isa = (
+            isa_table[mat_cols].isin(update_table[mat_cols].to_dict(orient="list")).all(axis=1)
+        )
+        updates = update_table.loc[common_rows_update].copy()
+        additions = update_table.loc[~common_rows_update].copy()
+
+        if overwrite:
+            # update all isa_table columns with new_data
+            isa_table.loc[common_rows_isa, updates.columns] = updates.values
+        else:
+            # Update only those values that are empty/falsy
+            # Give a warning if any existing values are different
+            for col in updates.columns:
+                isa_col = isa_table[col].loc[common_rows_isa]
+                equal_values = isa_col.reset_index(drop=True) == updates[col].reset_index(drop=True)
+                empty_values = isa_col.isnull().reset_index(drop=True) | isa_col.eq("").reset_index(
+                    drop=True
+                )
+                clash_rows = ~equal_values & ~empty_values
+
+                if equal_values.all():
+                    continue
+                elif clash_rows.any():
+                    clash = isa_table.loc[isa_col.index[clash_rows], mat_cols + [col]]
+                    # add the rejected new values as an extra column of the warning table
+                    clash["New: " + col] = updates[col].loc[clash_rows].values
+                    logger.warning(
+                        f"Given values for ISA column '{col}' have different existing values, "
+                        "these will not be updated. 
Use `--overwrite` to force update.\n" + + clash.to_string(index=False, na_rep="") + ) + + isa_table.loc[isa_col.index[empty_values], col] = ( + updates[col].loc[empty_values].values + ) + + # Check which cols should be autofilled ("Protocol REF" needs to be, for ISA to work) + autofill_cols = { + col: isa_table[col].unique()[0] + for col in isa_table.columns + if orig_col_name(col) == "Protocol REF" + } + if not no_autofill: + # Do not autofill ontology terms or references (an autofilled ontology reference without values + # would not pass altamisa validation) + autofill_cols.update( + { + col: isa_table[col].unique()[0] + for col in isa_table.columns + if isa_table[col].nunique() == 1 and orig_col_name(col) not in ISA_NON_SETTABLE + } + ) + + additions = additions.assign(**autofill_cols) + + isa_table = pd.concat([isa_table, additions], ignore_index=True).fillna("") + + return isa_table + + +def setup_argparse(parser: argparse.ArgumentParser) -> None: + """Setup argument parser for ``cubi-tk sodar update-samplesheet``.""" + return UpdateSamplesheetCommand.setup_argparse(parser) diff --git a/src/cubi_tk/sodar_api.py b/src/cubi_tk/sodar_api.py new file mode 100644 index 0000000..48ae682 --- /dev/null +++ b/src/cubi_tk/sodar_api.py @@ -0,0 +1,158 @@ +import argparse +from collections import namedtuple +from functools import reduce +import os +from typing import Literal +import urllib.parse as urlparse + +from logzero import logger +import requests + +from .common import is_uuid, load_toml_config +from .exceptions import ParameterException, SodarAPIException + + +class SodarAPI: + def __init__(self, sodar_url: str, sodar_api_token: str, project_uuid: str): + self.sodar_url = sodar_url + self.sodar_api_token = sodar_api_token + self.project_uuid = project_uuid + self.check_args() + + def check_args(self): + # toml_config needs an object with attribute named config + args = namedtuple("args", ["config"]) + toml_config = load_toml_config(args(config=None)) + if not self.sodar_url: + if not toml_config: + raise ParameterException( + "SODAR URL not given on command line and not found in toml config files." + ) + self.sodar_url = toml_config.get("global", {}).get("sodar_server_url") + if not self.sodar_url: + raise ParameterException( + "SODAR URL not found in config files. Please specify on command line." + ) + if not self.sodar_api_token: + if not toml_config: + raise ParameterException( + "SODAR API token not given on command line and not found in toml config files." + ) + self.sodar_api_token = toml_config.get("global", {}).get("sodar_api_token") + if not self.sodar_api_token: + raise ParameterException( + "SODAR API token not found in config files. Please specify on command line." + ) + if not is_uuid(self.project_uuid): + raise ParameterException("Sodar Project UUID is not a valid UUID.") + + @staticmethod + def setup_argparse(parser: argparse.ArgumentParser) -> None: + """Setup argument parser.""" + group_sodar = parser.add_argument_group("SODAR-related") + + # load default from toml file + # consider: mark token as sensitive + group_sodar.add_argument( + "--sodar-url", + default=os.environ.get("SODAR_URL", "https://sodar.bihealth.org/"), + help="URL to SODAR, defaults to SODAR_URL environment variable or fallback to https://sodar.bihealth.org/", + ) + group_sodar.add_argument( + "--sodar-api-token", + default=os.environ.get("SODAR_API_TOKEN", None), + help="Authentication token when talking to SODAR. 
Defaults to SODAR_API_TOKEN environment variable.", + ) + group_sodar.add_argument( + "project_uuid", + help="SODAR project UUID", + ) + + def _base_api_header(self) -> dict[str, str]: + # FIXME: only add versioning header once SODAR API v1.0 is released + # (this will introduce individual versioning for specific calls and break the general versioning) + sodar_headers = { + "Authorization": "token {}".format(self.sodar_api_token), + # 'Accept': f'application/vnd.bihealth.sodar+json; version={SODAR_API_VERSION}', + } + return sodar_headers + + def _api_call( + self, + api: Literal["samplesheets", "landingzones"], + action: str, + method: Literal["get", "post"] = "get", + params: dict = None, + data: dict = None, + files: dict = None, + ) -> dict: + # need to add trailing slashes to all parts of the URL for urljoin to work correctly + # afterward remove the final trailing slash from the joined URL + base_url_parts = [ + part if part.endswith("/") else f"{part}/" + for part in (self.sodar_url, "project", self.project_uuid, api, action) + ] + url = reduce(urlparse.urljoin, base_url_parts)[:-1] + if params: + url += "?" + urlparse.urlencode(params) + + if method == "get": + response = requests.get(url, headers=self._base_api_header()) + elif method == "post": + response = requests.post(url, headers=self._base_api_header(), files=files, data=data) + else: + raise ValueError("Unknown HTTP method.") + + if response.status_code != 200: + raise SodarAPIException(f"API response: {response.text}") + + return response.json() + + def get_ISA_samplesheet(self) -> dict[str, dict[str, str]]: + samplesheet = self._api_call("samplesheets", "export/json") + + # Consider: support multi-assay and multi-study projects? + # -> would require proper ISA parsing to handle assay<>study relations + if len(samplesheet["studies"]) > 1: + raise NotImplementedError("Only single-study projects are supported.") + study = list(samplesheet["studies"].keys())[0] + if len(samplesheet["assays"]) > 1: + raise NotImplementedError("Only single-assay projects are supported.") + assay = list(samplesheet["assays"].keys())[0] + + return { + "investigation": { + "filename": samplesheet["investigation"]["path"], + "content": samplesheet["investigation"]["tsv"], + }, + "study": {"filename": study, "content": samplesheet["studies"][study]["tsv"]}, + "assay": {"filename": assay, "content": samplesheet["assays"][assay]["tsv"]}, + } + + def upload_ISA_samplesheet( + self, + investigation: tuple[str, str], + study: tuple[str, str], + assay: tuple[str, str], + ): + try: + ret_val = self._api_call( + "samplesheets", + "import", + method="post", + files={ + "file_investigation": (*investigation, "text/plain"), + "file_study": (*study, "text/plain"), + "file_assay": (*assay, "text/plain"), + }, + ) + if "sodar_warnings" in ret_val: + logger.info("ISA-tab uploaded with warnings.") + for warning in ret_val["sodar_warnings"]: + logger.warning(f"SODAR warning: {warning}") + else: + logger.info("ISA-tab uploaded successfully.") + return 0 + except SodarAPIException as e: + logger.error(f"Failed to upload ISA-tab:\n{e}") + return 1 diff --git a/tests/data/isa_mv.json b/tests/data/isa_mv.json new file mode 100644 index 0000000..fbeb72e --- /dev/null +++ b/tests/data/isa_mv.json @@ -0,0 +1 @@ +{"investigation": {"path": "i_Investigation.txt", "tsv": "ONTOLOGY SOURCE REFERENCE\nTerm Source Name\tUO\tOBI\tNCBITAXON\tOMIM\tHP\tORPHA\tBTO\nTerm Source 
File\thttp://data.bioontology.org/ontologies/UO\thttp://data.bioontology.org/ontologies/OBI\thttp://data.bioontology.org/ontologies/NCBITAXON\thttp://data.bioontology.org/ontologies/OMIM\thttp://data.bioontology.org/ontologies/HP\thttp://data.bioontology.org/ontologies/ORDO\thttps://data.bioontology.org/ontologies/BTO\nTerm Source Version\t48\t35\t10\t12\t570\t2.8\t2021-10-26\nTerm Source Description\tUnits of Measurement Ontology\tOntology for Biomedical Investigations\tNational Center for Biotechnology Information (NCBI) Organismal Classification\tOnline Mendelian Inheritance in Man\tHuman Phenotype Ontology\tOrphanet Rare Disease Ontology\tThe BRENDA Tissue Ontology\nINVESTIGATION\nInvestigation Identifier\t\nInvestigation Title\tModellvorhaben Rare Diseases\nInvestigation Description\t\nInvestigation Submission Date\t\nInvestigation Public Release Date\t\nComment[Created With Configuration]\t/path/to/isa-configurations/bih_studies/bih_germline\nComment[Last Opened With Configuration]\tbih_germline\nINVESTIGATION PUBLICATIONS\nInvestigation PubMed ID\nInvestigation Publication DOI\nInvestigation Publication Author List\nInvestigation Publication Title\nInvestigation Publication Status\nInvestigation Publication Status Term Accession Number\nInvestigation Publication Status Term Source REF\nINVESTIGATION CONTACTS\nInvestigation Person Last Name\nInvestigation Person First Name\nInvestigation Person Mid Initials\nInvestigation Person Email\nInvestigation Person Phone\nInvestigation Person Fax\nInvestigation Person Address\nInvestigation Person Affiliation\nInvestigation Person Roles\nInvestigation Person Roles Term Accession Number\nInvestigation Person Roles Term Source REF\nSTUDY\nStudy Identifier\tmodellvorhaben_rare_diseases\nStudy Title\tModellvorhaben Rare Diseases\nStudy Description\t\nComment[Study Grant Number]\t\nComment[Study Funding Agency]\t\nStudy Submission Date\t\nStudy Public Release Date\t\nStudy File Name\ts_modellvorhaben_rare_diseases.txt\nSTUDY DESIGN DESCRIPTORS\nStudy Design Type\nStudy Design Type Term Accession Number\nStudy Design Type Term Source REF\nSTUDY PUBLICATIONS\nStudy PubMed ID\nStudy Publication DOI\nStudy Publication Author List\nStudy Publication Title\nStudy Publication Status\nStudy Publication Status Term Accession Number\nStudy Publication Status Term Source REF\nSTUDY FACTORS\nStudy Factor Name\nStudy Factor Type\nStudy Factor Type Term Accession Number\nStudy Factor Type Term Source REF\nSTUDY ASSAYS\nStudy Assay File Name\ta_modellvorhaben_rare_diseases_genome_sequencing.txt\nStudy Assay Measurement Type\tgenome sequencing\nStudy Assay Measurement Type Term Accession Number\t\nStudy Assay Measurement Type Term Source REF\t\nStudy Assay Technology Type\tnucleotide sequencing\nStudy Assay Technology Type Term Accession Number\thttp://purl.obolibrary.org/obo/OBI_0000626\nStudy Assay Technology Type Term Source REF\tOBI\nStudy Assay Technology Platform\tIllumina\nSTUDY PROTOCOLS\nStudy Protocol Name\tSample collection\tNucleic acid extraction WGS\tLibrary construction WGS\tNucleic acid sequencing WGS\nStudy Protocol Type\tSample collection\tNucleic acid extraction WGS\tLibrary construction WGS\tNucleic acid sequencing WGS\nStudy Protocol Type Term Accession Number\t\t\t\t\nStudy Protocol Type Term Source REF\t\t\t\t\nStudy Protocol Description\t\t\t\t\nStudy Protocol URI\t\t\t\t\nStudy Protocol Version\t\t\t\t\nStudy Protocol Parameters Name\t\tNucleic acid type;Fragmentation method;Concentration measurement\tLibrary source;Library 
strategy;Library selection;Library layout;Library kit;Library kit manufacturer;Target insert size;Wet-lab insert size;Barcode kit;Barcode kit manufacturer;Barcode name;Barcode sequence;Concentration measurement\tBase quality encoding;Platform;Center contact;Instrument model;Center name\nStudy Protocol Parameters Name Term Accession Number\t\t;;\t;;;;;;;;;;;;\t;;;;\nStudy Protocol Parameters Name Term Source REF\t\t;;\t;;;;;;;;;;;;\t;;;;\nStudy Protocol Components Name\t\t\t\t\nStudy Protocol Components Type\t\t\t\t\nStudy Protocol Components Type Term Accession Number\t\t\t\t\nStudy Protocol Components Type Term Source REF\t\t\t\t\nSTUDY CONTACTS\nStudy Person Last Name\nStudy Person First Name\nStudy Person Mid Initials\nStudy Person Email\nStudy Person Phone\nStudy Person Fax\nStudy Person Address\nStudy Person Affiliation\nStudy Person Roles\nStudy Person Roles Term Accession Number\nStudy Person Roles Term Source REF\n"}, "studies": {"s_modellvorhaben_rare_diseases.txt": {"tsv": "Source Name\tCharacteristics[UUID]\tCharacteristics[External links]\tCharacteristics[Individual-ID]\tCharacteristics[Probe-ID]\tCharacteristics[Batch]\tCharacteristics[Family]\tCharacteristics[Organism]\tTerm Source REF\tTerm Accession Number\tCharacteristics[Mother]\tCharacteristics[Father]\tComment[Family notes]\tCharacteristics[Sex]\tCharacteristics[Disease status]\tCharacteristics[Sample Conservation]\tCharacteristics[Tissue]\tTerm Source REF\tTerm Accession Number\tCharacteristics[OMIM disease]\tTerm Source REF\tTerm Accession Number\tCharacteristics[Orphanet disease]\tTerm Source REF\tTerm Accession Number\tCharacteristics[HPO terms]\tTerm Source REF\tTerm Accession Number\tComment[Disease notes]\tProtocol REF\tPerformer\tSample Name\tCharacteristics[Sequencing target]\tCharacteristics[External links]\tCharacteristics[Provider name]\tCharacteristics[Provider contact]\tCharacteristics[Provider project ID]\tCharacteristics[Provider sample ID]\nSRGS_00_0001\t\t\tMV-00-0001\tMV-00-0001-E1\t0\tMV-00-0001-FAM\tHomo sapiens\tNCBITAXON\thttp://purl.bioontology.org/ontology/NCBITAXON/9606\tSRGS_00_0002\tSRGS_00_0003\tindex\tUNKNOWN\tUNKNOWN\t\t\t\t\t\t\t\t\t\t\t\t\t\t\tSample collection\t\tSRGS_00_0001-N1\tgermline\t\tLabor Berlin – Charité Vivantes GmbH\t\t\t\nSRGS_00_0002\t\t\tMV-00-0002\tMV-00-0002-E1\t0\tMV-00-0001-FAM\tHomo sapiens\tNCBITAXON\thttp://purl.bioontology.org/ontology/NCBITAXON/9606\t0\t0\tmother\tUNKNOWN\tUNKNOWN\t\t\t\t\t\t\t\t\t\t\t\t\t\t\tSample collection\t\tSRGS_00_0002-N1\tgermline\t\tLabor Berlin – Charité Vivantes GmbH\t\t\t\nSRGS_00_0003\t\t\tMV-00-0003\tMV-00-0003-E1\t0\tMV-00-0001-FAM\tHomo sapiens\tNCBITAXON\thttp://purl.bioontology.org/ontology/NCBITAXON/9606\t0\t0\tfather\tUNKNOWN\tUNKNOWN\t\t\t\t\t\t\t\t\t\t\t\t\t\t\tSample collection\t\tSRGS_00_0003-N1\tgermline\t\tLabor Berlin – Charité Vivantes GmbH\t\t\t\n"}}, "assays": {"a_modellvorhaben_rare_diseases_genome_sequencing.txt": {"tsv": "Sample Name\tProtocol REF\tParameter Value[Nucleic acid type]\tParameter Value[Fragmentation method]\tParameter Value[Concentration measurement]\tPerformer\tDate\tExtract Name\tCharacteristics[Concentration]\tUnit\tTerm Source REF\tTerm Accession Number\tProtocol REF\tParameter Value[Library source]\tParameter Value[Library strategy]\tParameter Value[Library selection]\tParameter Value[Library layout]\tParameter Value[Library kit]\tParameter Value[Library kit manufacturer]\tComment[Library kit catalogue ID]\tParameter Value[Target insert size]\tParameter Value[Wet-lab insert size]\tParameter 
Value[Barcode kit]\tParameter Value[Barcode kit manufacturer]\tComment[Barcode kit catalogue ID]\tParameter Value[Barcode name]\tParameter Value[Barcode sequence]\tParameter Value[Concentration measurement]\tPerformer\tDate\tLibrary Name\tCharacteristics[Sequencing kit]\tCharacteristics[Sequencing kit manufacturer]\tCharacteristics[Folder name]\tCharacteristics[Concentration]\tUnit\tTerm Source REF\tTerm Accession Number\tProtocol REF\tParameter Value[Platform]\tParameter Value[Instrument model]\tParameter Value[Base quality encoding]\tParameter Value[Center name]\tParameter Value[Center contact]\tPerformer\tDate\tRaw Data File\nSRGS_00_0001-N1\tNucleic acid extraction WGS\tDNA\tenzymatic\t\t\t\tSRGS_00_0001-N1-DNA1\t\t\t\t\tLibrary construction WGS\tGENOMIC\tWGS\tRANDOM\tPAIRED\tGenomV2\tTwist-Biosciences\t\t\t\t\t\t\t\t\t\t\t\tSRGS_00_0001-N1-DNA1-WGS1\t\t\tindex\t\t\t\t\tNucleic acid sequencing WGS\tILLUMINA\tNovaSeq6000\tPhred+33\t\t\t\t\t\nSRGS_00_0002-N1\tNucleic acid extraction WGS\tDNA\tenzymatic\t\t\t\tSRGS_00_0002-N1-DNA1\t\t\t\t\tLibrary construction WGS\tGENOMIC\tWGS\tRANDOM\tPAIRED\tGenomV2\tTwist-Biosciences\t\t\t\t\t\t\t\t\t\t\t\tSRGS_00_0002-N1-DNA1-WGS1\t\t\tmother\t\t\t\t\tNucleic acid sequencing WGS\tILLUMINA\tNovaSeq6000\tPhred+33\t\t\t\t\t\nSRGS_00_0003-N1\tNucleic acid extraction WGS\tDNA\tenzymatic\t\t\t\tSRGS_00_0003-N1-DNA1\t\t\t\t\tLibrary construction WGS\tGENOMIC\tWGS\tRANDOM\tPAIRED\tGenomV2\tTwist-Biosciences\t\t\t\t\t\t\t\t\t\t\t\tSRGS_00_0003-N1-DNA1-WGS1\t\t\tfather\t\t\t\t\tNucleic acid sequencing WGS\tILLUMINA\tNovaSeq6000\tPhred+33\t\t\t\t\t\n"}}, "date_modified": "2025-01-09 14:47:04.051505+00:00"} diff --git a/tests/test_sodar_update_samplesheet.py b/tests/test_sodar_update_samplesheet.py new file mode 100644 index 0000000..2ec607c --- /dev/null +++ b/tests/test_sodar_update_samplesheet.py @@ -0,0 +1,619 @@ +import argparse +from io import StringIO +import json +import pathlib +import re +from unittest.mock import patch + +import pandas as pd +import pytest + +from cubi_tk.exceptions import ParameterException +from cubi_tk.sodar.update_samplesheet import UpdateSamplesheetCommand +from cubi_tk.sodar_api import SodarAPI + + +@pytest.fixture +def MV_isa_json(): + with open(pathlib.Path(__file__).resolve().parent / "data" / "isa_mv.json") as f: + return json.load(f) + + +@pytest.fixture +def MV_ped_extra_sample(): + return """FAM_03\tAna_04\t0\t0\t1\t2\n""" + + +@pytest.fixture +def MV_ped_samples(): + return """FAM_01\tAna_01\t0\t0\t1\t2\nFAM_02\tAna_02\t0\tAna_03\t2\t2\nFAM_02\tAna_03\t0\t0\t2\t2\n""" + + +@pytest.fixture +@patch("cubi_tk.sodar_api.SodarAPI._api_call") +def mock_isa_data(API_call, MV_isa_json): + API_call.return_value = MV_isa_json + api = SodarAPI("https://sodar.bihealth.org/", "1234", "123e4567-e89b-12d3-a456-426655440000") + isa_data = api.get_ISA_samplesheet() + investigation = isa_data["investigation"]["content"] + study = pd.read_csv(StringIO(isa_data["study"]["content"]), sep="\t") + assay = pd.read_csv(StringIO(isa_data["assay"]["content"]), sep="\t") + return investigation, study, assay + + +@pytest.fixture +def UCS_class_object(): + parser = argparse.ArgumentParser() + UpdateSamplesheetCommand.setup_argparse(parser) + args = parser.parse_args(["123e4567-e89b-12d3-a456-426655440000"]) + UCS = UpdateSamplesheetCommand(args) + return UCS + + +@pytest.fixture +def sample_df(): + return pd.DataFrame( + [ + [ + "Ana_01", + "FAM_01", + "0", + "0", + "male", + "affected", + "Ind_01", + "Probe_01", + "ATCG", + "A1", + "Ana_01-N1", + 
"Ana_01-N1-DNA1", + "Ana_01-N1-DNA1-WGS1", + ], + [ + "Ana_02", + "FAM_02", + "0", + "Ana_03", + "female", + "affected", + "Ind_02", + "Probe_02", + "ACTG", + "A2", + "Ana_02-N1", + "Ana_02-N1-DNA1", + "Ana_02-N1-DNA1-WGS1", + ], + [ + "Ana_03", + "FAM_02", + "0", + "0", + "female", + "affected", + "Ind_03", + "Probe_03", + "ATGC", + "A3", + "Ana_03-N1", + "Ana_03-N1-DNA1", + "Ana_03-N1-DNA1-WGS1", + ], + ], + columns=[ + "Analysis-ID", + "Family-ID", + "Paternal-ID", + "Maternal-ID", + "Sex", + "Phenotype", + "Individual-ID", + "Probe-ID", + "Barcode", + "Barcode-Name", + "Sample Name", + "Extract Name", + "Library Name", + ], + ) + + +def helper_update_UCS(arg_list, UCS): + parser = argparse.ArgumentParser() + UpdateSamplesheetCommand.setup_argparse(parser) + args = parser.parse_args(arg_list) + UCS.args = args + + return UCS + + +def test_gather_ISA_column_names(mock_isa_data, UCS_class_object): + from cubi_tk.sodar.update_samplesheet import ISA_NON_SETTABLE, REQUIRED_COLUMNS + + study = mock_isa_data[1] + assay = mock_isa_data[2] + + isa_names = UCS_class_object.gather_ISA_column_names(study, assay) + + assert not all(col in isa_names for col in ISA_NON_SETTABLE) + assert all(col in isa_names for col in REQUIRED_COLUMNS) + + isa_regex = re.compile(r"(Characteristics|Parameter Value|Comment)\[(.*?)]") + multi_colname_regex = re.compile(r"\.[0-9]+$") + for col in study.columns.tolist() + assay.columns.tolist(): + colname_long = multi_colname_regex.sub("", col) + colname_short = isa_regex.sub(r"\2", colname_long) + if colname_long in ISA_NON_SETTABLE: + continue + assert colname_long in isa_names + assert colname_short in isa_names + assert isa_names[colname_short] == isa_names[colname_long] + + +def test_parse_sampledata_args(mock_isa_data, UCS_class_object): + isa_names = UCS_class_object.gather_ISA_column_names(mock_isa_data[1], mock_isa_data[2]) + + # base mapping from default + arg_list = [ + "-s", + "Ind_01", + "Probe_01", + "Ana_01", + "ATCG", + "A1", + "-s", + "Ind_02", + "Probe_02", + "Ana_02", + "ACTG", + "A2", + "-s", + "Ind_03", + "Probe_03", + "Ana_03", + "ATGC", + "A3", + "-d", + "MV-barcodes", + "123e4567-e89b-12d3-a456-426655440000", + ] + expected = { + "Family-ID": "Family", + "Analysis-ID": "Source Name", + "Paternal-ID": "Father", + "Maternal-ID": "Mother", + "Sex": "Sex", + "Phenotype": "Disease status", + "Individual-ID": "Individual-ID", + "Probe-ID": "Probe-ID", + "Barcode": "Barcode sequence", + "Barcode-Name": "Barcode name", + } + USC = helper_update_UCS(arg_list, UCS_class_object) + assert USC.parse_sampledata_args(isa_names) == expected + + # manually defined mapping + arg_list = [ + "--sample-fields", + "Dummy-ID=Source Name", + "Sample Name", + "Extract Name", + "barcode=Barcode sequence", + "-s", + "Ind_01", + "Probe_01", + "Ana_01", + "ATCG", + "-s", + "Ind_02", + "Probe_02", + "Ana_02", + "ACTG", + "-s", + "Ind_03", + "Probe_03", + "Ana_03", + "ATGC", + "-d", + "MV-barcodes", + "123e4567-e89b-12d3-a456-426655440000", + ] + expected["Dummy-ID"] = "Source Name" + expected["Sample Name"] = "Sample Name" + expected["Extract Name"] = "Extract Name" + expected["barcode"] = "Barcode sequence" + + USC = helper_update_UCS(arg_list, UCS_class_object) + assert USC.parse_sampledata_args(isa_names) == expected + + # missing required fields (from default) + arg_list = [ + "-s", + "Ind_01", + "Probe_01", + "Ana_01", + "ATCG", + "-d", + "MV-barcodes", + "123e4567-e89b-12d3-a456-426655440000", + ] + USC = helper_update_UCS(arg_list, UCS_class_object) + with 
pytest.raises(ValueError):
+        USC.parse_sampledata_args(isa_names)
+
+    # missing sample data
+    arg_list = ["123e4567-e89b-12d3-a456-426655440000"]
+    USC = helper_update_UCS(arg_list, UCS_class_object)
+    with pytest.raises(ValueError):
+        USC.parse_sampledata_args(isa_names)
+
+    # only base ped mapping
+    arg_list = ["-p", "dummy-pedfile", "123e4567-e89b-12d3-a456-426655440000"]
+    expected = {
+        "Family-ID": "Family",
+        "Sample-ID": "Source Name",
+        "Paternal-ID": "Father",
+        "Maternal-ID": "Mother",
+        "Sex": "Sex",
+        "Phenotype": "Disease status",
+    }
+    USC = helper_update_UCS(arg_list, UCS_class_object)
+    assert USC.parse_sampledata_args(isa_names) == expected
+
+
+def test_get_dynamic_columns(UCS_class_object):
+    # Use this both for existing (sampledata) and for allowed (isa) columns
+    existing_names = (
+        "Sample-ID",
+        "Source Name",
+        "Sample Name",
+        "Extract Name",
+        "Library Name",
+        "Library Strategy",
+        "Dummy",
+    )
+
+    expected = {
+        "Sample Name": "{Sample-ID}-N1",
+        "Extract Name": "{Sample-ID}-N1-DNA1",
+        "Library Name": "{Sample-ID}-N1-DNA1-WGS1",
+    }
+    assert UCS_class_object.get_dynamic_columns(existing_names, existing_names) == expected
+
+    arg_list = [
+        "--dynamic-column", "Sample Name", "{Source Name}-N1",
+        "--dynamic-column", "Extract Name", "{Sample Name}-DNA1",
+        "--dynamic-column", "Dummy", "{Sample Name}-DNA1-{Library Strategy}1",
+        "123e4567-e89b-12d3-a456-426655440000",
+    ]
+    expected = {
+        "Sample Name": "{Source Name}-N1",
+        "Extract Name": "{Sample Name}-DNA1",
+        "Dummy": "{Sample Name}-DNA1-{Library Strategy}1",
+    }
+    UCS = helper_update_UCS(arg_list, UCS_class_object)
+    assert UCS.get_dynamic_columns(existing_names, existing_names) == expected
+
+    # FIXME: test auto-exclusion of Library name from defaults if not in existing_names
+
+
+def test_collect_sample_data(
+    mock_isa_data, UCS_class_object, MV_ped_samples, MV_ped_extra_sample, sample_df, fs, caplog
+):
+    fs.create_file("mv_samples.ped", contents=MV_ped_samples)
+    fs.create_file("mv_extra_sample.ped", contents=MV_ped_extra_sample)
+
+    expected = sample_df
+    arg_list = [
+        "-s", "FAM_01", "Ana_01", "0", "0", "male", "affected", "Ind_01", "Probe_01", "ATCG", "A1",
+        "-s", "FAM_02", "Ana_02", "0", "Ana_03", "female", "affected", "Ind_02", "Probe_02", "ACTG", "A2",
+        "-s", "FAM_02", "Ana_03", "0", "0", "female", "affected", "Ind_03", "Probe_03", "ATGC", "A3",
+        "-d", "Modellvorhaben",
+        "-p", "mv_samples.ped",
+        "123e4567-e89b-12d3-a456-426655440000",
+    ]
+
+    def run_usc_collect_sampledata(arg_list):
+        USC = helper_update_UCS(arg_list, UCS_class_object)
+        isa_names = USC.gather_ISA_column_names(mock_isa_data[1], mock_isa_data[2])
+        sampledata_fields = USC.parse_sampledata_args(isa_names)
+        return USC.collect_sample_data(isa_names, sampledata_fields)
+
+    # test merging of --ped & -s info (same samples)
+    pd.testing.assert_frame_equal(run_usc_collect_sampledata(arg_list), expected)
+
+    # incomplete info for sample only given via ped
+    # (the exception is raised, not logged, so check its message via `match`)
+    arg_list[36] = "mv_extra_sample.ped"
+    with pytest.raises(
+        ParameterException, match="Combination of ped and sample data has missing values"
+    ):
+        run_usc_collect_sampledata(arg_list)
+
+    # Test 'MV-barcodes' default, to allow specifying only the info missing from ped file (+ one column for merging)
+    arg_list = [
+        "-s", "Ind_01", "Probe_01", "Ana_01", "ATCG", "A1",
+        "-s", "Ind_02", "Probe_02", "Ana_02", "ACTG", "A2",
+        "-s", "Ind_03", "Probe_03", "Ana_03", "ATGC", "A3",
+        "-d", "MV-barcodes",
+        "-p", "mv_samples.ped",
+        "123e4567-e89b-12d3-a456-426655440000",
+    ]
+    pd.testing.assert_frame_equal(run_usc_collect_sampledata(arg_list), expected)
+
+    # Should still fail if ped has sample where additional info is missing
+    arg_list[21] = "mv_extra_sample.ped"
+    with pytest.raises(
+        ParameterException, match="Combination of ped and sample data has missing values"
+    ):
+        run_usc_collect_sampledata(arg_list)
+
+    # test germline-sheet default
+    # - only --ped
+    expected_cols = [
+        "Family-ID",
+        "Analysis-ID",
+        "Paternal-ID",
+        "Maternal-ID",
+        "Sex",
+        "Phenotype",
+        "Sample Name",
+        "Extract Name",
+        "Library Name",
+    ]
+    expected = expected.loc[:, expected_cols]
+    expected_cols[1] = "Sample-ID"
+    expected.columns = expected_cols
+    arg_list = [
+        "-p", "mv_samples.ped",
+        "123e4567-e89b-12d3-a456-426655440000",
+    ]
+    pd.testing.assert_frame_equal(run_usc_collect_sampledata(arg_list), expected)
+
+    # - only -s
+    arg_list = [
+        "123e4567-e89b-12d3-a456-426655440000",
+        "-s", "FAM_01", "Ana_01", "0", "0", "male", "affected",
+        "-s", "FAM_02", "Ana_02", "0", "Ana_03", "female", "affected",
+        "-s", "FAM_02", "Ana_03", "0", "0", "female", "affected",
+    ]
+    pd.testing.assert_frame_equal(run_usc_collect_sampledata(arg_list), expected)
+
+    # - --ped and -s (same samples)
+    arg_list += ["-p", "mv_samples.ped"]
+    pd.testing.assert_frame_equal(run_usc_collect_sampledata(arg_list), expected)
+
+    # - --ped and -s (different samples)
+    arg_list[-1] = "mv_extra_sample.ped"
+    expected2 = pd.concat(
+        [
+            expected,
+            pd.DataFrame(
+                [
+                    [
+                        "FAM_03",
+                        "Ana_04",
+                        "0",
+                        "0",
+                        "male",
+                        "affected",
+                        "Ana_04-N1",
+                        "Ana_04-N1-DNA1",
+                        "Ana_04-N1-DNA1-WGS1",
+                    ]
+                ],
+                columns=expected.columns,
+            ),
+        ],
+        ignore_index=True,
+    )
+    pd.testing.assert_frame_equal(run_usc_collect_sampledata(arg_list), expected2)
+
+    # - --ped and -s (mismatch in sample-info)
+    arg_list[-1] = "mv_samples.ped"
+    arg_list[6] = "female"
+    with pytest.raises(ParameterException, match="Sample with different values found"):
+        run_usc_collect_sampledata(arg_list)
+
+    # >> that one might only fail in ISA validation?
+ + +def test_match_sample_data_to_isa(mock_isa_data, UCS_class_object, sample_df): + arg_list = [ + "-s", + "Ind", + "Probe", + "Ana", + "ACTG", + "A1", + "-d", + "MV-barcodes", + "123e4567-e89b-12d3-a456-426655440000", + ] + UCS = helper_update_UCS(arg_list, UCS_class_object) + isa_names = UCS.gather_ISA_column_names(mock_isa_data[1], mock_isa_data[2]) + sampledata_fields = UCS.parse_sampledata_args(isa_names) + samples = sample_df + + expected_study = pd.DataFrame( + [ + ["Ana_01", "FAM_01", "0", "0", "male", "affected", "Ind_01", "Probe_01", "Ana_01-N1"], + [ + "Ana_02", + "FAM_02", + "0", + "Ana_03", + "female", + "affected", + "Ind_02", + "Probe_02", + "Ana_02-N1", + ], + ["Ana_03", "FAM_02", "0", "0", "female", "affected", "Ind_03", "Probe_03", "Ana_03-N1"], + ], + columns=[ + "Source Name", + "Characteristics[Family]", + "Characteristics[Father]", + "Characteristics[Mother]", + "Characteristics[Sex]", + "Characteristics[Disease status]", + "Characteristics[Individual-ID]", + "Characteristics[Probe-ID]", + "Sample Name", + ], + ) + # Order of columns here does not matter yet + expected_assay = pd.DataFrame( + [ + ["ATCG", "A1", "Ana_01-N1", "Ana_01-N1-DNA1", "Ana_01-N1-DNA1-WGS1"], + ["ACTG", "A2", "Ana_02-N1", "Ana_02-N1-DNA1", "Ana_02-N1-DNA1-WGS1"], + ["ATGC", "A3", "Ana_03-N1", "Ana_03-N1-DNA1", "Ana_03-N1-DNA1-WGS1"], + ], + columns=[ + "Parameter Value[Barcode sequence]", + "Parameter Value[Barcode name]", + "Sample Name", + "Extract Name", + "Library Name", + ], + ) + + study, assay = UCS.match_sample_data_to_isa(samples, isa_names, sampledata_fields) + pd.testing.assert_frame_equal(study, expected_study) + pd.testing.assert_frame_equal(assay, expected_assay) + + +def test_update_isa_table(UCS_class_object, caplog): + orig_isa = pd.DataFrame( + { + "Sample Name": ["Probe_01", "Probe_02", "Probe_03"], + "Extract Name": ["Ana_01", "Ana_02", "Ana_03"], + "Protocol REF": ["DNA extraction", "DNA extraction", "DNA extraction"], + "Parameter Value[Library layout]": ["paired", "paired", "paired"], + "Parameter Value[Barcode sequence]": ["ATCG", "ACTG", ""], + "Parameter Value[Barcode name]": ["A1", "A2", ""], + "Extract Name.1": ["Ana_01", "Ana_02", "Ana_03"], + "RawData File": ["", "", ""], + } + ) + parsed_assay = pd.DataFrame( + { + "Sample Name": ["Probe_02", "Probe_03", "Probe_04"], + "Extract Name": ["Ana_02", "Ana_03", "Ana_04"], + "Extract Name.1": ["Ana_02", "Ana_03", "Ana_04"], + "Parameter Value[Barcode sequence]": ["XXXX", "ATTT", "UUUU"], + "Parameter Value[Barcode name]": ["A2", "A3", "A4"], + } + ) + expected = pd.concat( + [ + orig_isa.loc[[True, True, False]], + pd.DataFrame( + [ + ["Probe_03", "Ana_03", "DNA extraction", "paired", "ATTT", "A3", "Ana_03", ""], + ["Probe_04", "Ana_04", "DNA extraction", "paired", "UUUU", "A4", "Ana_04", ""], + ], + columns=orig_isa.columns, + ), + ], + ignore_index=True, + ) + + # Default case, no overwriting of non-empty fields, autofilling of columns with only 1 value + actual = UCS_class_object.update_isa_table(orig_isa, parsed_assay) + pd.testing.assert_frame_equal(actual, expected) + + # Check for warning message regarding non overwrite of for XXXX + assert "XXXX" in caplog.records[-1].message and "Barcode sequence" in caplog.records[-1].message + + # no auto-filling + expected["Parameter Value[Library layout]"] = ["paired", "paired", "paired", ""] + actual = UCS_class_object.update_isa_table(orig_isa, parsed_assay, no_autofill=True) + pd.testing.assert_frame_equal(actual, expected) + + # allow overwriting + expected["Parameter 
Value[Barcode sequence]"] = ["ATCG", "XXXX", "ATTT", "UUUU"] + actual = UCS_class_object.update_isa_table( + orig_isa, parsed_assay, overwrite=True, no_autofill=True + ) + pd.testing.assert_frame_equal(actual, expected) + + +# FIXME: smoke test for execute (MV & germline-sheet) +# - test --snappy_compatible