From ce1c484d827b4b3c76b83f689b0ecdc9cf40d428 Mon Sep 17 00:00:00 2001 From: Mohamed Abuelanin Date: Sun, 20 Oct 2024 12:10:57 -0700 Subject: [PATCH] Add metadata annotation in snipe qc --- src/snipe/__init__.py | 2 +- src/snipe/cli/cli_qc.py | 134 +++++++++++++++++++++++++++++++--------- 2 files changed, 105 insertions(+), 31 deletions(-) diff --git a/src/snipe/__init__.py b/src/snipe/__init__.py index cf5e5d4..a09f420 100644 --- a/src/snipe/__init__.py +++ b/src/snipe/__init__.py @@ -3,4 +3,4 @@ from snipe.api.multisig_reference_QC import MultiSigReferenceQC from snipe.api.reference_QC import ReferenceQC -__version__ = '0.1.0' \ No newline at end of file +__version__ = '0.1.1' \ No newline at end of file diff --git a/src/snipe/cli/cli_qc.py b/src/snipe/cli/cli_qc.py index 93b5d9b..e2796cc 100644 --- a/src/snipe/cli/cli_qc.py +++ b/src/snipe/cli/cli_qc.py @@ -54,8 +54,8 @@ def process_subset( ref (str): Reference signature file path. amplicon (Optional[str]): Amplicon signature file path. ychr (Optional[str]): Y chromosome signature file path. - vars (List[str]): List of variable signature file paths. - export_var (bool): Flag to export variable signatures. + vars (List[str]): List of variance signature file paths. + export_var (bool): Flag to export variance signatures. roi (bool): Flag to calculate ROI. debug (bool): Flag to enable debug logging. @@ -104,20 +104,20 @@ def process_subset( subset_logger.error(f"Failed to load Y chromosome signature from {ychr}: {e}") return {}, subset # All samples in this subset fail - # Load variable signatures if provided + # Load variance signatures if provided vars_snipesigs = [] if vars: - subset_logger.info(f"Loading {len(vars)} variable signature(s).") + subset_logger.info(f"Loading {len(vars)} variance signature(s).") for path in vars: if not os.path.exists(path): - subset_logger.error(f"Variable signature file does not exist: {path}") + subset_logger.error(f"Variance signature file does not exist: {path}") return {}, subset # All samples in this subset fail try: var_sig = SnipeSig(sourmash_sig=path, sig_type=SigType.AMPLICON, enable_logging=debug) vars_snipesigs.append(var_sig) - subset_logger.debug(f"Loaded variable signature: {var_sig.name}") + subset_logger.debug(f"Loaded variance signature: {var_sig.name}") except Exception as e: - subset_logger.error(f"Failed to load variable signature from {path}: {e}") + subset_logger.error(f"Failed to load variance signature from {path}: {e}") return {}, subset # All samples in this subset fail # Initialize QC instance @@ -146,6 +146,8 @@ def process_subset( predict_extra_folds=predict_extra_folds if roi else None, advanced=True ) + #! override the internal sig file path with the actual used file path. + sample_stats["file_path"] = sample_path subset_stats[sample_sig.name] = sample_stats subset_logger.debug(f"Successfully processed sample: {sample_sig.name}") except Exception as e: @@ -159,7 +161,7 @@ def process_subset( @click.command() @click.option('--ref', type=click.Path(exists=True), required=True, help='Reference genome signature file (required).') -@click.option('--sample', type=click.Path(exists=True), callback=validate_sig_input, multiple=True, default = None, help='Sample signature file. Can be provided multiple times.') +@click.option('--sample', type=click.Path(exists=True), callback=validate_sig_input, multiple=True, default=None, help='Sample signature file. Can be provided multiple times.') @click.option('--samples-from-file', type=click.Path(exists=True), help='File containing sample paths (one per line).') @click.option('--amplicon', type=click.Path(exists=True), help='Amplicon signature file (optional).') @click.option('--roi', is_flag=True, default=False, help='Calculate ROI for 1,2,5,9 folds.') @@ -169,9 +171,13 @@ def process_subset( @click.option('-o', '--output', required=True, callback=validate_tsv_file, help='Output TSV file for QC results.') @click.option('--var', 'vars', multiple=True, type=click.Path(exists=True), help='Extra signature file path to study variance of non-reference k-mers.') @click.option('-c', '--cores', type=int, default=1, show_default=True, help='Number of CPU cores to use for parallel processing.') +@click.option('--metadata', type=str, default=None, help='Additional metadata in the format colname=value,colname=value,...') +@click.option('--metadata-from-file', type=click.Path(exists=True), help='File containing metadata information in TSV or CSV format.') def qc(ref: str, sample: List[str], samples_from_file: Optional[str], - amplicon: Optional[str], roi: bool, export_var: bool, - ychr: Optional[str], debug: bool, output: str, vars: List[str], cores: int): + amplicon: Optional[str], roi: bool, export_var: bool, + ychr: Optional[str], debug: bool, output: str, vars: List[str], cores: int, + metadata: Optional[str], metadata_from_file: Optional[str]): + """ Perform quality control (QC) on multiple samples against a reference genome. @@ -200,9 +206,6 @@ def qc(ref: str, sample: List[str], samples_from_file: Optional[str], - `--roi` Calculate ROI for 1x, 2x, 5x, and 9x coverage folds. - - `--advanced` - Include advanced QC metrics. - - `--ychr PATH` Y chromosome signature file (overrides the reference ychr). @@ -213,7 +216,19 @@ def qc(ref: str, sample: List[str], samples_from_file: Optional[str], Output TSV file for QC results. - `--var PATH` - Variable signature file path. Can be used multiple times. + Variance signature file path. Can be used multiple times. + + - `--export-var` + Export signatures for sample hashes found in the variance signature. + + - `-c`, `--cores INT` + Number of CPU cores to use for parallel processing. Default: 1. + + - `--metadata STR` + Additional metadata in the format `colname=value,colname=value,...`. Applies to all samples. + + - `--metadata-from-file PATH` + File containing metadata information in TSV or CSV format. Each row should have `sample_path,metadata_col,value`. ## Examples @@ -249,7 +264,7 @@ def qc(ref: str, sample: List[str], samples_from_file: Optional[str], snipe qc --ref reference.sig --sample sample1.sig --advanced --roi -o qc_results.tsv ``` - ### Using Multiple Variable Signatures + ### Using Multiple Variance Signatures ```bash snipe qc --ref reference.sig --sample sample1.sig --var var1.sig --var var2.sig -o qc_results.tsv @@ -310,9 +325,9 @@ def qc(ref: str, sample: List[str], samples_from_file: Optional[str], A TSV file named `qc_roi.tsv` containing QC metrics along with ROI predictions for `sample1.sig` and `sample2.sig`. - ### Use Case 3: Advanced QC with Amplicon and Variable Signatures + ### Use Case 3: Advanced QC with Amplicon and Variance Signatures - **Objective:** Perform advanced QC on a sample using an amplicon signature and multiple variable signatures. + **Objective:** Perform advanced QC on a sample using an amplicon signature and multiple variance signatures. **Command:** @@ -325,13 +340,15 @@ def qc(ref: str, sample: List[str], samples_from_file: Optional[str], - `--ref reference.sig`: Reference genome signature file. - `--amplicon amplicon.sig`: Amplicon signature file. - `--sample sample1.sig`: Sample signature file. - - `--var var1.sig` & `--var var2.sig`: Variable signature files. + - `--var var1.sig` & `--var var2.sig`: Variance signature files. - `--export-var`: Export signatures for variances. + - `--metadata`: Additional metadata in the format `metadata1=value,metadata2=value,...`. + - `--metadata-from-file`: File containing metadata information in TSV or CSV format per signature. - `-o qc_advanced.tsv`: Output file for QC results. **Expected Output:** - A TSV file named `qc_advanced.tsv` containing comprehensive QC metrics, including advanced metrics and analyses based on the amplicon and variable signatures for `sample1.sig`. + A TSV file named `qc_advanced.tsv` containing comprehensive QC metrics, including advanced metrics and analyses based on the amplicon and variance signatures for `sample1.sig`. ### Use Case 4: Overwriting Existing Output File @@ -414,18 +431,17 @@ def qc(ref: str, sample: List[str], samples_from_file: Optional[str], - `--ref reference.zip`: Reference genome signature file. - `--sample sample1.zip` & `--sample sample2.sig`: Multiple sample signature files. - `--amplicon amplicon.zip`: Amplicon signature file. - - `--var var1.zip` & `--var var2.zip`: Variable signature files. + - `--var var1.zip` & `--var var2.zip`: Variance signature files. - `--advanced`: Includes advanced QC metrics. - `--roi`: Enables ROI calculations. - `-o qc_comprehensive.tsv`: Output file for QC results. **Expected Output:** - A TSV file named `qc_comprehensive.tsv` containing comprehensive QC metrics, including advanced analyses, ROI predictions, and data from amplicon and variable signatures for both `sample1.sig` and `sample2.sig`. + A TSV file named `qc_comprehensive.tsv` containing comprehensive QC metrics, including advanced analyses, ROI predictions, and data from amplicon and variance signatures for both `sample1.sig` and `sample2.sig`. """ - start_time = time.time() # Configure logging @@ -504,24 +520,24 @@ def qc(ref: str, sample: List[str], samples_from_file: Optional[str], logger.error(f"Failed to load Y chromosome signature from {ychr}: {e}") sys.exit(1) - # Prepare variable signatures if provided + # Prepare variance signatures if provided vars_paths = [] vars_snipesigs = [] if vars: - logger.info(f"Loading {len(vars)} variable signature(s).") + logger.info(f"Loading {len(vars)} variance signature(s).") for path in vars: if not os.path.exists(path): - logger.error(f"Variable signature file does not exist: {path}") + logger.error(f"Variance signature file does not exist: {path}") sys.exit(1) vars_paths.append(os.path.abspath(path)) try: var_sig = SnipeSig(sourmash_sig=path, sig_type=SigType.AMPLICON, enable_logging=debug) vars_snipesigs.append(var_sig) - logger.debug(f"Loaded variable signature: {var_sig.name}") + logger.debug(f"Loaded variance signature: {var_sig.name}") except Exception as e: - logger.error(f"Failed to load variable signature from {path}: {e}") + logger.error(f"Failed to load variance signature from {path}: {e}") - logger.debug(f"Variable signature paths: {vars_paths}") + logger.debug(f"Variance signature paths: {vars_paths}") predict_extra_folds = [1, 2, 5, 9] @@ -608,7 +624,7 @@ def qc(ref: str, sample: List[str], samples_from_file: Optional[str], if len(succeeded) == 0: logger.error("All samples failed during QC processing. Output TSV will not be generated.") sys.exit(1) - + # write total success and failure logger.info("Successfully processed samples: %d", len(succeeded)) @@ -628,6 +644,64 @@ def qc(ref: str, sample: List[str], samples_from_file: Optional[str], reordered_cols += cols df = df[reordered_cols] + # Process --metadata option + metadata_dict = {} + if metadata: + try: + for item in metadata.split(','): + key, value = item.split('=', 1) + metadata_dict[key.strip()] = value.strip() + logger.debug(f"Parsed metadata from command line: {metadata_dict}") + except Exception as e: + logger.error(f"Failed to parse --metadata: {e}") + sys.exit(1) + + # Process --metadata-from-file option + if metadata_from_file: + if not os.path.exists(metadata_from_file): + logger.error(f"Metadata file does not exist: {metadata_from_file}") + sys.exit(1) + try: + # Read metadata file + metadata_df = pd.read_csv(metadata_from_file, sep=None, engine='python', header=None) + # Validate columns + if metadata_df.shape[1] != 3: + logger.error("Metadata file must have three columns: sample_path, metadata_col, value") + sys.exit(1) + metadata_samples = metadata_df.iloc[:, 0].tolist() + metadata_cols = metadata_df.iloc[:, 1].tolist() + metadata_values = metadata_df.iloc[:, 2].tolist() + # Build a nested dictionary: {sample_name: {col1: val1, col2: val2, ...}} + metadata_sample_dict = {} + for sample_path, col_name, value in zip(metadata_samples, metadata_cols, metadata_values): + sample_basename = os.path.basename(sample_path) + if sample_basename not in metadata_sample_dict: + metadata_sample_dict[sample_basename] = {} + metadata_sample_dict[sample_basename][col_name] = value + logger.debug(f"Parsed metadata from file: {metadata_sample_dict}") + except Exception as e: + logger.error(f"Failed to read or parse metadata file {metadata_from_file}: {e}") + sys.exit(1) + else: + metadata_sample_dict = {} + + # Apply metadata to DataFrame + if metadata_dict: + # Apply global metadata to all samples + for key, value in metadata_dict.items(): + df[key] = value + + if metadata_sample_dict: + # Apply per-sample metadata + df["tmp_basename"] = df["filename"].apply(os.path.basename) + # df.set_index("tmp_basename", inplace=True) + # map metadata to samples + for sample_basename, metadata_dict in metadata_sample_dict.items(): + for key, value in metadata_dict.items(): + df.loc[df["tmp_basename"] == sample_basename, key] = value + # df.reset_index(inplace=True) + df.drop(columns=["tmp_basename"], inplace=True) + # Export to TSV with comments try: with open(output, 'w', encoding='utf-8') as f: @@ -648,4 +722,4 @@ def qc(ref: str, sample: List[str], samples_from_file: Optional[str], end_time = time.time() elapsed_time = end_time - start_time - logger.info(f"QC process completed in {elapsed_time:.2f} seconds.") \ No newline at end of file + logger.info(f"QC process completed in {elapsed_time:.2f} seconds.")