From 2f80f4d4b45c0ebdd64d443e515c3c221f093fb6 Mon Sep 17 00:00:00 2001 From: Andre Masella Date: Fri, 6 Dec 2019 11:27:55 -0500 Subject: [PATCH] Finish pre-WGS report (#70) * Finish pre-WGS report - Use new graph generation functions - Add sliders - Use BamQC3 and IchorCNA data - Data table and failure info --- application/dash_application/plot_builder.py | 27 ++- .../dash_application/views/preqc_wgs.py | 169 ++++++++++++------ 2 files changed, 136 insertions(+), 60 deletions(-) diff --git a/application/dash_application/plot_builder.py b/application/dash_application/plot_builder.py index eb82c053..bb3397e8 100644 --- a/application/dash_application/plot_builder.py +++ b/application/dash_application/plot_builder.py @@ -1,4 +1,4 @@ -from typing import List +from typing import List, Tuple import pandas import plotly.graph_objects as go @@ -180,4 +180,27 @@ def get_shapes_for_values(shapeby: List[str]): return get_dict_wrapped(shapeby, ALL_SYMBOLS) def get_colours_for_values(colourby: List[str]): - return get_dict_wrapped(colourby, PLOTLY_DEFAULT_COLOURS) \ No newline at end of file + return get_dict_wrapped(colourby, PLOTLY_DEFAULT_COLOURS) + +def terminal_output(data:DataFrame, limits:List[Tuple[str, str, float]]) -> str: + if data.empty: + return "No data!" + + output = "" + + for (name, column, cutoff) in limits: + output += "$failed_%s\n" %name + newline = False + linenumber = 0 + for failed in data.loc[data[column] < cutoff][pinery.column.SampleProvenanceColumn.SampleName]: + if not newline: + output += "[{0}] ".format(linenumber) + output += "\"" + failed + "\"\t\t" + if newline: + output += "\n" + newline = not newline + linenumber += 1 + if output: + return output + else: + return "All samples within cutoffs" diff --git a/application/dash_application/views/preqc_wgs.py b/application/dash_application/views/preqc_wgs.py index ed73934b..bdf63b24 100644 --- a/application/dash_application/views/preqc_wgs.py +++ b/application/dash_application/views/preqc_wgs.py @@ -2,7 +2,6 @@ import dash_core_components as core import dash_html_components as html -import numpy from dash.dependencies import Input, Output, State import pandas as pd @@ -11,7 +10,7 @@ from gsiqcetl import QCETLCache from . import navbar from ..dash_id import init_ids -from ..plot_builder import get_shapes_for_values, fill_in_shape_col, fill_in_colour_col, generate +from ..plot_builder import terminal_output, fill_in_shape_col, fill_in_colour_col, generate from ..table_builder import build_table from ..utility import df_manipulation as util @@ -29,11 +28,14 @@ "second-sort", "colour-by", "shape-by", + "reads-per-start-point-slider", + "insert-mean-slider", "passed-filter-reads-slider", # Graphs "total-reads", "mean-insert", + "reads-per-start-point", "duplication", "purity", "ploidy", @@ -41,10 +43,11 @@ "non-primary-reads", "on-target-reads", + "terminal-output", "data-table", ]) -BAMQC_COL = gsiqcetl.column.BamQcColumn +BAMQC_COL = gsiqcetl.column.BamQc3Column ICHOR_COL = gsiqcetl.column.IchorCnaColumn PINERY_COL = pinery.column.SampleProvenanceColumn INSTRUMENT_COLS = pinery.column.InstrumentWithModelColumn @@ -67,26 +70,22 @@ special_cols["On-target Reads"], special_cols["Purity"] ] -most_bamqc_cols = [*BAMQC_COL.values()] -most_bamqc_cols.remove(BAMQC_COL.BamFile) later_col_set = [ PINERY_COL.PrepKit, PINERY_COL.TissuePreparation, PINERY_COL.LibrarySourceTemplateType, PINERY_COL.ExternalName, PINERY_COL.GroupID, PINERY_COL.TissueOrigin, PINERY_COL.TissueType, PINERY_COL.Institute, INSTRUMENT_COLS.ModelName ] -wgs_table_columns = [*first_col_set, *most_bamqc_cols, *later_col_set] +wgs_table_columns = [*first_col_set, *BAMQC_COL.values(), *ICHOR_COL.values(), *later_col_set] # Set initial values for dropdown menus initial_first_sort = PINERY_COL.StudyTitle initial_second_sort = BAMQC_COL.TotalReads initial_colour_col = PINERY_COL.StudyTitle initial_shape_col = PINERY_COL.PrepKit - -# Set initial points for graph cutoff lines -graph_cutoffs = { - "pf_reads": 0.01 -} +initial_cutoff_pf_reads = 0.01 +initial_cutoff_insert_mean = 150 +initial_cutoff_rpsp = 5 shape_or_colour_by = [ {"label": "Project", "value": PINERY_COL.StudyTitle}, @@ -106,17 +105,17 @@ def get_wgs_data(): """ # Get the BamQC data cache = QCETLCache() - if True: # DELETE(amasella): once IchorCNA data is available - wgs_df = cache.bamqc.bamqc - wgs_df[ICHOR_COL.Ploidy] = numpy.NaN - wgs_df[ICHOR_COL.TumorFraction] = numpy.NaN - # This doesn't seem to exist even though it should - wgs_df[BAMQC_COL.MarkDuplicates_PERCENT_DUPLICATION] = numpy.NaN - else: - wgs_df = cache.bamqc.bamqc.merge(cache.ichorcna.ichorcna[[ICHOR_COL.Ploidy, ICHOR_COL.TumorFraction]], - how="left", - left_on=[BAMQC_COL.Run, BAMQC_COL.Lane, BAMQC_COL.Barcodes], - right_on=[ICHOR_COL.Run, ICHOR_COL.Lane, ICHOR_COL.Barcodes]) + + ichorcna_df = cache.ichorcna.ichorcna[[ICHOR_COL.Run, + ICHOR_COL.Lane, + ICHOR_COL.Barcodes, + ICHOR_COL.Ploidy, + ICHOR_COL.TumorFraction]] + bamqc_df = cache.bamqc3.bamqc3 + wgs_df = bamqc_df.merge( + ichorcna_df, how="left", left_on=[ + BAMQC_COL.Run, BAMQC_COL.Lane, BAMQC_COL.Barcodes], right_on=[ + ICHOR_COL.Run, ICHOR_COL.Lane, ICHOR_COL.Barcodes]) # Cast the primary key/join columns to explicit types wgs_df = util.df_with_normalized_ius_columns( wgs_df, BAMQC_COL.Run, BAMQC_COL.Lane, BAMQC_COL.Barcodes) @@ -182,7 +181,7 @@ def get_wgs_data(): EMPTY_WGS = pd.DataFrame(columns=WGS_DF.columns) -def generate_total_reads(df, colour_by, shape_by): +def generate_total_reads(df, colour_by, shape_by, cutoff): return generate( "Total Reads (Passed Filter)", df, @@ -191,11 +190,12 @@ def generate_total_reads(df, colour_by, shape_by): "# Reads (10^6)", colour_by, shape_by, - "none" + "none", + cutoff ) -def generate_mean_insert_size(df, colour_by, shape_by): +def generate_mean_insert_size(df, colour_by, shape_by, cutoff): return generate( "Insert Mean", df, @@ -204,10 +204,18 @@ def generate_mean_insert_size(df, colour_by, shape_by): "Base Pairs", colour_by, shape_by, - "none" + "none", + cutoff ) +def generate_reads_per_start_point(df, colour_by, shape_by, cutoff): + return generate("Reads per Start Point", + df, lambda d: d[PINERY_COL.SampleName], + lambda d: d[BAMQC_COL.ReadsPerStartPoint], + "Reads", colour_by, shape_by, "none", cutoff) + + def generate_duplication(df, colour_by, shape_by): return generate( "Duplication", @@ -286,6 +294,18 @@ def generate_ploidy(df, colour_by, shape_by): ) +def generate_terminal_output( + data, + initial_cutoff_rpsp, + initial_cutoff_insert_mean, + initial_cutoff_pf_reads): + return terminal_output(data, [ + ('rpsp', BAMQC_COL.ReadsPerStartPoint, initial_cutoff_rpsp), + ('insert_mean', BAMQC_COL.InsertMean, initial_cutoff_insert_mean), + ('reads_pf', special_cols["Total Reads (Passed Filter)"], initial_cutoff_pf_reads), + ]) + + # Layout elements layout = core.Loading(fullscreen=True, type="cube", children=[ html.Div(className="body", children=[ @@ -377,7 +397,32 @@ def generate_ploidy(df, colour_by, shape_by): # TODO: add "Search Sample" input # TODO: add "Show Names" dropdown - # TODO: add cut-off sliders + html.Label([ + "Reads Per Start Point:", + core.Slider( + id=ids["reads-per-start-point-slider"], + min=0, + max=50, + step=1, + marks={i * 5: str(i * 5) for i in range(0, 10)}, + tooltip="always_visible", + value=initial_cutoff_rpsp + ) + ]), + html.Br(), + html.Label([ + "Insert Mean:", + core.Slider( + id=ids["insert-mean-slider"], + min=0, + max=500, + step=10, + marks={i * 50: str(i * 50) for i in range(0, 10)}, + tooltip="always_visible", + value=initial_cutoff_insert_mean + ) + ]), + html.Br(), html.Label([ "Passed Filter Reads:", @@ -386,21 +431,9 @@ def generate_ploidy(df, colour_by, shape_by): min=0, max=0.5, step=0.025, - marks={ - 0: "0", - 0.05: "0.05", - 0.1: "0.1", - 0.15: "0.15", - 0.2: "0.2", - 0.25: "0.25", - 0.3: "0.3", - 0.35: "0.35", - 0.4: "0.4", - 0.45: "0.45", - 0.5: "0.5" - }, + marks={i * 0.05: str(i * 0.05) for i in range(0, 10)}, tooltip="always_visible", - value=graph_cutoffs["pf_reads"] + value=initial_cutoff_pf_reads ) ]), html.Br(), @@ -408,14 +441,19 @@ def generate_ploidy(df, colour_by, shape_by): html.Div(className="seven columns", children=[ core.Graph( - id=ids["total-reads"], - figure=generate_total_reads(EMPTY_WGS, initial_colour_col, - initial_shape_col) - ), + id=ids["total-reads"], + figure=generate_total_reads(EMPTY_WGS, initial_colour_col, + initial_shape_col, initial_cutoff_pf_reads) + ), core.Graph( id=ids["mean-insert"], figure=generate_mean_insert_size( - EMPTY_WGS, initial_colour_col, initial_shape_col) + EMPTY_WGS, initial_colour_col, initial_shape_col, initial_cutoff_insert_mean) + ), + core.Graph( + id=ids["reads-per-start-point"], + figure=generate_reads_per_start_point( + EMPTY_WGS, initial_colour_col, initial_shape_col, initial_cutoff_rpsp) ), core.Graph( id=ids["duplication"], @@ -448,14 +486,19 @@ def generate_ploidy(df, colour_by, shape_by): EMPTY_WGS, initial_colour_col, initial_shape_col) ), ]), - - # DataTable for all samples info - html.Div(className="data-table", - children=[ - build_table(ids["data-table"], wgs_table_columns, WGS_DF, - BAMQC_COL.TotalReads) - ]) - ]) + ]), + html.Div(className='terminal-output', + children=[ + html.Pre(generate_terminal_output(EMPTY_WGS, initial_cutoff_rpsp, initial_cutoff_insert_mean, + initial_cutoff_pf_reads), + id=ids['terminal-output'], + ) + ]), + html.Div(className='data-table', + children=[ + build_table(ids["data-table"], wgs_table_columns, WGS_DF, + BAMQC_COL.TotalReads) + ]), ]) ]) @@ -465,12 +508,14 @@ def init_callbacks(dash_app): [ Output(ids["total-reads"], "figure"), Output(ids["mean-insert"], "figure"), + Output(ids["reads-per-start-point"], "figure"), Output(ids["duplication"], "figure"), Output(ids["purity"], "figure"), Output(ids["ploidy"], "figure"), Output(ids["unmapped-reads"], "figure"), Output(ids["non-primary-reads"], "figure"), Output(ids["on-target-reads"], "figure"), + Output(ids["terminal-output"], "value"), Output(ids["data-table"], "data"), ], [ @@ -482,6 +527,9 @@ def init_callbacks(dash_app): State(ids['second-sort'], 'value'), State(ids['colour-by'], 'value'), State(ids['shape-by'], 'value'), + State(ids["reads-per-start-point-slider"], 'value'), + State(ids["insert-mean-slider"], 'value'), + State(ids["passed-filter-reads-slider"], 'value'), ] ) def update_pressed(click, @@ -489,7 +537,10 @@ def update_pressed(click, first_sort, second_sort, colour_by, - shape_by): + shape_by, + total_reads_cutoff, + insert_mean_cutoff, + rpsp_cutoff): if not runs: df = pd.DataFrame(columns=WGS_DF.columns) else: @@ -501,15 +552,17 @@ def update_pressed(click, dd = defaultdict(list) return [ - generate_total_reads(df, colour_by, shape_by), - generate_mean_insert_size(df, colour_by, shape_by), + generate_total_reads(df, colour_by, shape_by, total_reads_cutoff), + generate_mean_insert_size(df, colour_by, shape_by, insert_mean_cutoff), + generate_reads_per_start_point(df, colour_by, shape_by, rpsp_cutoff), generate_duplication(df, colour_by, shape_by), generate_purity(df, colour_by, shape_by), generate_ploidy(df, colour_by, shape_by), generate_unmapped_reads(df, colour_by, shape_by), generate_non_primary(df, colour_by, shape_by), generate_on_target_reads(df, colour_by, shape_by), - df.to_dict("records", into=dd), + generate_terminal_output(df, rpsp_cutoff, insert_mean_cutoff, total_reads_cutoff), + df.to_dict('records', into=dd), ] @dash_app.callback(