From 6f8b0ae4e5095f2f3b2faaad38f8a859629e8441 Mon Sep 17 00:00:00 2001 From: Daniel Park Date: Thu, 5 Sep 2024 09:03:42 -0400 Subject: [PATCH 1/3] drop members from sample table if empty cleaned bam --- pipes/WDL/tasks/tasks_terra.wdl | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pipes/WDL/tasks/tasks_terra.wdl b/pipes/WDL/tasks/tasks_terra.wdl index 0fd8c4bb1..65e271bbb 100644 --- a/pipes/WDL/tasks/tasks_terra.wdl +++ b/pipes/WDL/tasks/tasks_terra.wdl @@ -434,6 +434,7 @@ task create_or_update_sample_tables { cleaned_bams_list = '~{sep="*" cleaned_reads_unaligned_bams}'.split('*') cleaned_library_id_list = [bam.split("/")[-1].replace(".bam", "").replace(".cleaned", "") for bam in cleaned_bams_list] df_library_table_clean_bams = pd.DataFrame({lib_col_name : cleaned_library_id_list, "cleaned_bam" : cleaned_bams_list}) + cleaned_bam_names = set(df_library_table_clean_bams[lib_col_name]) df_library_bams = pd.merge(df_library_table_raw_bams, df_library_table_clean_bams, on=lib_col_name, how="outer") library_bams_tsv = flowcell_data_id + "-all_bams.tsv" @@ -467,13 +468,13 @@ task create_or_update_sample_tables { writer.writerows(out_rows) # grab the meta_by_filename values to create new sample->library mappings - # restrict to libraries/samples that we actually have bam files for + # restrict to libraries/samples that we actually have cleaned bam files for sample_to_libraries = {} libraries_in_bams = set() for library_id, data in library_meta_dict.items(): sample_id = data['sample'] sample_to_libraries.setdefault(sample_id, []) - if library_id in library_bam_names: + if library_id in cleaned_bam_names: sample_to_libraries[sample_id].append(library_id) libraries_in_bams.add(library_id) else: From e39c7d1911d59f5597fe7a552b39f056bdd006b8 Mon Sep 17 00:00:00 2001 From: Danny Park Date: Tue, 17 Sep 2024 16:13:24 -0400 Subject: [PATCH 2/3] bugfix for when many previous libraries exist --- pipes/WDL/tasks/tasks_terra.wdl | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/pipes/WDL/tasks/tasks_terra.wdl b/pipes/WDL/tasks/tasks_terra.wdl index 65e271bbb..650f62805 100644 --- a/pipes/WDL/tasks/tasks_terra.wdl +++ b/pipes/WDL/tasks/tasks_terra.wdl @@ -503,12 +503,23 @@ task create_or_update_sample_tables { print(df_sample.index) # create tsv to populate sample table with new sample->library mappings + def test_non_empty_value(value): + # this function exists because pandas / numpy arrays don't behave like python lists with regards to coercion to truth values + # Check for numpy NaN (which coerces to python True!) + if isinstance(value, float) and pd.isna(value): + return False + # Check for numpy arrays (which refuse to coerce to logical values and throw ValueError instead!) + if isinstance(value, (np.ndarray, pd.Series, pd.DataFrame)): + return value.size > 0 and value.any() + # Default to normal python behavior + return bool(value) + sample_fname = 'sample_membership.tsv' with open(sample_fname, 'wt') as outf: outf.write('entity:~{sample_table_name}_id\tlibraries\n') merged_sample_ids = set() for sample_id, libraries in sample_to_libraries.items(): - if sample_id in df_sample.index and "libraries" in df_sample.columns and df_sample.libraries[sample_id] and pd.notna(df_sample.libraries[sample_id]): + if sample_id in df_sample.index and "libraries" in df_sample.columns and test_non_empty_value(df_sample.libraries[sample_id]): # merge in new sample->library mappings with any pre-existing sample->library mappings already_associated_libraries = [entity["entityName"] for entity in df_sample.libraries[sample_id] if entity.get("entityName")] libraries = list(set(libraries + already_associated_libraries)) From f6942c3c0efcc4fc70dfb5408c4e28dee7d4f3b7 Mon Sep 17 00:00:00 2001 From: Danny Park Date: Wed, 18 Sep 2024 09:14:54 -0400 Subject: [PATCH 3/3] missing python import --- pipes/WDL/tasks/tasks_terra.wdl | 1 + 1 file changed, 1 insertion(+) diff --git a/pipes/WDL/tasks/tasks_terra.wdl b/pipes/WDL/tasks/tasks_terra.wdl index 650f62805..17771230a 100644 --- a/pipes/WDL/tasks/tasks_terra.wdl +++ b/pipes/WDL/tasks/tasks_terra.wdl @@ -412,6 +412,7 @@ task create_or_update_sample_tables { import json import csv import pandas as pd + import numpy as np from firecloud import api as fapi print(workspace_project + "\n" + workspace_name)