From d0233ba69269464a92014bd40a871eb88ae86ead Mon Sep 17 00:00:00 2001 From: AAnzel <44969003+AAnzel@users.noreply.github.com> Date: Sat, 18 Dec 2021 18:22:59 +0100 Subject: [PATCH] Added: Implemented depth analysis workflow. --- Source/common.py | 134 +++++++++++++++++++++++++++++++++++--------- Source/visualize.py | 3 +- 2 files changed, 109 insertions(+), 28 deletions(-) diff --git a/Source/common.py b/Source/common.py index 1176c9b..645d511 100644 --- a/Source/common.py +++ b/Source/common.py @@ -971,17 +971,88 @@ def create_annotated_data_set(end, path_bins): return result_df +def create_aggregation_functions(column_names): + result_dict = {} + for one_column_name in column_names: + if one_column_name == 'Gene': + continue + else: + result_dict[one_column_name] = 'sum' + + return result_dict + + +def create_summary_df(df): + summary_dict = dict() + numerical_columns = df.select_dtypes(include=np.number).columns.tolist() + for one_column_name in numerical_columns: + q_1 = df[one_column_name].quantile(0.25) + q_3 = df[one_column_name].quantile(0.75) + mean = np.mean(df[one_column_name]) + iqr = q_3 - q_1 + lower_lim = 0 if (q_1 - 1.5*iqr) < 0 else q_1 - 1.5*iqr + upper_lim = q_3 + 1.5*iqr + summary_dict[one_column_name] = {'Q1': q_1, 'Q3': q_3, 'IQR': iqr, + 'LowerLimit': lower_lim, + 'UpperLimit': upper_lim, 'Mean': mean} + + summary_df = pd.DataFrame().from_dict(summary_dict).T.reset_index().rename( + columns={'index': 'DateTime'}) + + return summary_df, summary_dict + + +def create_outliers_df(df, summary_dict): + outliers_df = pd.DataFrame() + for one_column_name in list(summary_dict.keys()): + outliers_df[one_column_name] = df[one_column_name][ + (df[one_column_name] < + summary_dict[one_column_name]['LowerLimit']) | + (df[one_column_name] > + summary_dict[one_column_name]['UpperLimit'])] + + outliers_df = outliers_df.reset_index().melt('index').drop( + 'index', axis=1).rename(columns={'variable': 'DateTime'}) + + return outliers_df + + def create_depth_data_set(end, path_depths): print("Importing depth data set") depth_files = os.listdir(path_depths) depth_files.sort() - final_dict = {} + final_df = None + for i in depth_files: + # We have to initialize the first dataframe + tmp_df = pd.read_csv( + os.path.join(path_depths, i), sep='\t', + names=['Gene', os.path.splitext(i)[0]]) - result_df = pd.DataFrame.from_dict(final_dict).fillna(0).transpose() - print("Finished importing") + if final_df is None: + final_df = tmp_df + else: + final_df = pd.merge(final_df, tmp_df, how="outer", on=['Gene']) + + print("Finished importing\nCreating summary data set") + # This dataframe now has columns: 'Gene' and other columns are timestamps + # There migh be genes that are named the same, and we want to sum them + # Since they are not annotated + aggregation_functions = create_aggregation_functions( + final_df.columns.to_list()) + final_df = final_df.groupby(final_df['Gene']).aggregate( + aggregation_functions) + final_df.reset_index(inplace=True) + + # We now want to create summary dataframe that has columns + # DateTime, Q1, Q3, IQR, LowerLimit, UpperLimit, Mean + # This dataframe is relevant to us + summary_df, summary_dict = create_summary_df(final_df) + outliers_df = create_outliers_df(final_df, summary_dict) - return result_df + print("Finished creating") + + return summary_df, outliers_df def show_folder_structure(folder_path): @@ -1304,14 +1375,10 @@ def work_with_depth(data_set_type, folder_path, key_suffix): depth_files.sort() num_of_depth_files = len(depth_files) - df = create_depth_data_set( + summary_df, outliers_df = create_depth_data_set( end=num_of_depth_files, path_bins=folder_path) - list_of_dates = create_temporal_column( - depth_files, None, None, 'TIMESTAMP') - df.insert(0, 'DateTime', list_of_dates) - - return df + return summary_df, outliers_df def work_with_kegg(data_set_type, folder_path, key_suffix): @@ -1527,27 +1594,42 @@ def work_with_data_set(df, data_set_type, folder_path, recache, key_suffix): df, temporal_feature, feature_list, key_suffix) elif data_set_type == 'DEPTH': - DEPTH_DATA_SET_NAME = 'depth.pkl' - DEPTH_DATA_SET_PATH = os.path.join( - os.path.split(folder_path)[0], DEPTH_DATA_SET_NAME) - - if recache is False and os.path.exists(DEPTH_DATA_SET_PATH): - df = get_cached_dataframe(DEPTH_DATA_SET_PATH) + SUM_DEPTH_DATA_SET_NAME = 'sum_depth.pkl' + OUT_DEPTH_DATA_SET_NAME = 'out_depth.pkl' + SUM_DEPTH_DATA_SET_PATH = os.path.join( + os.path.split(folder_path)[0], SUM_DEPTH_DATA_SET_NAME) + OUT_DEPTH_DATA_SET_PATH = os.path.join( + os.path.split(folder_path)[0], OUT_DEPTH_DATA_SET_NAME) + + if recache is False and os.path.exists(SUM_DEPTH_DATA_SET_PATH)\ + and os.path.exists(OUT_DEPTH_DATA_SET_PATH): + summary_df = get_cached_dataframe(SUM_DEPTH_DATA_SET_PATH) + outliers_df = get_cached_dataframe(OUT_DEPTH_DATA_SET_PATH) else: with st.spinner('Creating depth-of-coverage data frame. ' + 'This might take some time.'): - df = work_with_depth( + summary_df, outliers_df = work_with_depth( data_set_type, folder_path, key_suffix) - cache_dataframe(df, DEPTH_DATA_SET_PATH) - - show_calculated_data_set(df, 'Imported depths') - df = fix_data_set(df) - temporal_feature, feature_list = find_temporal_feature(df) - df, feature_list = modify_data_set( - df, temporal_feature, feature_list, key_suffix) - chosen_charts = visualize_data_set( - df, temporal_feature, feature_list, key_suffix) + cache_dataframe(summary_df, SUM_DEPTH_DATA_SET_PATH) + + show_calculated_data_set(summary_df, 'Summary of imported depths') + show_calculated_data_set(outliers_df, 'Outliers of imported depths') + summary_df = fix_data_set(summary_df) + outliers_df = fix_data_set(outliers_df) + summary_temporal_feature, summary_feature_list =\ + find_temporal_feature(summary_df) + outliers_temporal_feature, outliers_feature_list =\ + find_temporal_feature(outliers_df) + # df, feature_list = modify_data_set( + # df, temporal_feature, feature_list, key_suffix) + chosen_charts = [] + chosen_charts += visualize_data_set( + summary_df, summary_temporal_feature, summary_feature_list, + key_suffix + 'sum') + chosen_charts += visualize_data_set( + outliers_df, outliers_temporal_feature, outliers_feature_list, + key_suffix + 'out') elif data_set_type == 'Calculated': CALCULATED_DATA_SET_NAME = 'calculated.pkl' diff --git a/Source/visualize.py b/Source/visualize.py index ae0d772..3cbc2ac 100644 --- a/Source/visualize.py +++ b/Source/visualize.py @@ -373,8 +373,7 @@ def whisker_chart(summary_data, temporal_column): ) final_chart = (whiskers_chart + bar_chart + mean_chart).configure_scale( - bandPaddingInner=0.2 - ) + bandPaddingInner=0.2) return final_chart.interactive()