From c80c6aed1b871b630dddd7ef7eef9722b5d26a90 Mon Sep 17 00:00:00 2001 From: Ritvik Vasan Date: Thu, 12 Dec 2024 09:36:36 -0800 Subject: [PATCH 1/4] update pointcloud preprocessing scripts --- .../preprocessing/pc_preprocessing/pcna.py | 48 ++++++++++++++--- .../pc_preprocessing/punctate_cyto.py | 46 +++++++++++++--- .../pc_preprocessing/punctate_nuc.py | 52 +++++++++++++++---- 3 files changed, 121 insertions(+), 25 deletions(-) diff --git a/src/br/data/preprocessing/pc_preprocessing/pcna.py b/src/br/data/preprocessing/pc_preprocessing/pcna.py index 15fb431..64fb235 100644 --- a/src/br/data/preprocessing/pc_preprocessing/pcna.py +++ b/src/br/data/preprocessing/pc_preprocessing/pcna.py @@ -1,5 +1,6 @@ from multiprocessing import Pool - +import argparse +from pathlib import Path import numpy as np import pandas as pd from pyntcloud import PyntCloud @@ -64,26 +65,35 @@ def compute_labels(row, save=True): cell_id = str(row["CellId"]) - save_path = path_prefix + cell_id + ".ply" + save_path = Path(path_prefix) / Path(cell_id + ".ply") new_cents = new_cents.astype(float) cloud = PyntCloud(new_cents) - cloud.to_file(save_path) + cloud.to_file(str(save_path)) def get_center_of_mass(img): center_of_mass = np.mean(np.stack(np.where(img > 0)), axis=1) return np.floor(center_of_mass + 0.5).astype(int) +def main(args): -if __name__ == "__main__": - df = pd.read_csv(PCNA_SINGLE_CELL_PATH) + # make save path directory + Path(args.save_path).mkdir(parents=True, exist_ok=True) + + df = pd.read_parquet(args.preprocessed_manifest) + + if args.global_path: + df["registered_path"] = df["registered_path"].apply( + lambda x: args.global_path + x + ) - path_prefix = SAVE_LOCATION + global path_prefix + path_prefix = args.save_path all_rows = [] - for ind, row in tqdm(df.iterrows(), total=len(df)): + for _, row in tqdm(df.iterrows(), total=len(df)): all_rows.append(row) with Pool(40) as p: @@ -97,3 +107,27 @@ def get_center_of_mass(img): desc="compute_everything", ) ) + + +if __name__ == 
"__main__": + parser = argparse.ArgumentParser(description="Script for computing point clouds for PCNA dataset") + parser.add_argument("--save_path", type=str, required=True, help="Path to save results.") + parser.add_argument( + "--global_path", + type=str, + default=None, + required=False, + help="Path to append to relative paths in preprocessed manifest", + ) + parser.add_argument( + "--preprocessed_manifest", + type=str, + required=True, + help="Path to processed single cell image manifest.", + ) + args = parser.parse_args() + main(args) + + """ + python pcna.py --save_path "./make_pcs_test" --preprocessed_manifest "/allen/aics/modeling/ritvik/projects/latest_clones/benchmarking_representations/subpackages/image_preprocessing/tmp_output_pcna/processed/manifest.parquet" --global_path "/allen/aics/modeling/ritvik/projects/latest_clones/benchmarking_representations/subpackages/image_preprocessing/ + """ \ No newline at end of file diff --git a/src/br/data/preprocessing/pc_preprocessing/punctate_cyto.py b/src/br/data/preprocessing/pc_preprocessing/punctate_cyto.py index f148ba1..71fe6b7 100644 --- a/src/br/data/preprocessing/pc_preprocessing/punctate_cyto.py +++ b/src/br/data/preprocessing/pc_preprocessing/punctate_cyto.py @@ -1,6 +1,7 @@ import warnings from multiprocessing import Pool - +import argparse +from pathlib import Path import numpy as np import pandas as pd from aicsimageio import AICSImage @@ -96,11 +97,11 @@ def compute_labels(row, save=True): cell_id = str(row["CellId"]) - save_path = path_prefix + cell_id + ".ply" + save_path = Path(path_prefix) / Path(cell_id + ".ply") new_cents = new_cents.astype(float) cloud = PyntCloud(new_cents) - cloud.to_file(save_path) + cloud.to_file(str(save_path)) def get_center_of_mass(img): @@ -108,13 +109,24 @@ def get_center_of_mass(img): return np.floor(center_of_mass + 0.5).astype(int) -if __name__ == "__main__": - df = pd.read_parquet(SINGLE_CELL_IMAGE_PATH) +def main(args): + + # make save path directory + 
Path(args.save_path).mkdir(parents=True, exist_ok=True) + + df = pd.read_parquet(args.preprocessed_manifest) + df = df.loc[df['structure_name'].isin(SKEW_EXP_DICT.keys())] + + if args.global_path: + df["registered_path"] = df["registered_path"].apply( + lambda x: args.global_path + x + ) - path_prefix = SAVE_LOCATION + global path_prefix + path_prefix = args.save_path all_rows = [] - for ind, row in tqdm(df.iterrows(), total=len(df)): + for _, row in tqdm(df.iterrows(), total=len(df)): all_rows.append(row) with Pool(40) as p: @@ -128,3 +140,23 @@ def get_center_of_mass(img): desc="compute_everything", ) ) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Script for computing point clouds for cytoplasmic structures from WTC-11 hIPS single cell image dataset") + parser.add_argument("--save_path", type=str, required=True, help="Path to save results.") + parser.add_argument( + "--global_path", + type=str, + default=None, + required=False, + help="Path to append to relative paths in preprocessed manifest", + ) + parser.add_argument( + "--preprocessed_manifest", + type=str, + required=True, + help="Path to processed single cell image manifest.", + ) + args = parser.parse_args() + main(args) \ No newline at end of file diff --git a/src/br/data/preprocessing/pc_preprocessing/punctate_nuc.py b/src/br/data/preprocessing/pc_preprocessing/punctate_nuc.py index 495b08c..e5237c1 100644 --- a/src/br/data/preprocessing/pc_preprocessing/punctate_nuc.py +++ b/src/br/data/preprocessing/pc_preprocessing/punctate_nuc.py @@ -4,7 +4,11 @@ from scipy.ndimage import binary_dilation from skimage.io import imread from tqdm import tqdm +import argparse +from pathlib import Path +from multiprocessing import Pool +STRUCTS = ['HIST1H2BJ', 'NUP153', 'SMC1A', 'SON'] def compute_labels(row, save=True): path = row["registered_path"] @@ -64,11 +68,11 @@ def compute_labels(row, save=True): cell_id = str(row["CellId"]) - save_path = path_prefix + cell_id + ".ply" + 
save_path = Path(path_prefix) / Path(cell_id + ".ply") new_cents = new_cents.astype(float) cloud = PyntCloud(new_cents) - cloud.to_file(save_path) + cloud.to_file(str(save_path)) def get_center_of_mass(img): @@ -76,19 +80,25 @@ def get_center_of_mass(img): return np.floor(center_of_mass + 0.5).astype(int) -if __name__ == "__main__": - df = pd.read_parquet(SINGLE_CELL_IMAGE_PATH) +def main(args): + + # make save path directory + Path(args.save_path).mkdir(parents=True, exist_ok=True) + + df = pd.read_parquet(args.preprocessed_manifest) + df = df.loc[df['structure_name'].isin(STRUCTS)] + + if args.global_path: + df["registered_path"] = df["registered_path"].apply( + lambda x: args.global_path + x + ) - path_prefix = SAVE_LOCATION + global path_prefix + path_prefix = args.save_path all_rows = [] - for ind, row in tqdm(df.iterrows(), total=len(df)): + for _, row in tqdm(df.iterrows(), total=len(df)): all_rows.append(row) - # if str(row['CellId']) == '660844': - # print('yes') - # compute_labels(row) - - from multiprocessing import Pool with Pool(40) as p: _ = tuple( @@ -101,3 +111,23 @@ def get_center_of_mass(img): desc="compute_everything", ) ) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Script for computing point clouds for nuclear structures from WTC-11 hIPS single cell image dataset") + parser.add_argument("--save_path", type=str, required=True, help="Path to save results.") + parser.add_argument( + "--global_path", + type=str, + default=None, + required=False, + help="Path to append to relative paths in preprocessed manifest", + ) + parser.add_argument( + "--preprocessed_manifest", + type=str, + required=True, + help="Path to processed single cell image manifest.", + ) + args = parser.parse_args() + main(args) \ No newline at end of file From 14aad9cf0d558a6c66900b1e8a435bf162d1b53d Mon Sep 17 00:00:00 2001 From: Ritvik Vasan Date: Thu, 12 Dec 2024 09:41:07 -0800 Subject: [PATCH 2/4] run pre-commit --- 
.../preprocessing/pc_preprocessing/pcna.py | 14 +++++++----- .../pc_preprocessing/punctate_cyto.py | 15 +++++++------ .../pc_preprocessing/punctate_nuc.py | 22 ++++++++++--------- 3 files changed, 28 insertions(+), 23 deletions(-) diff --git a/src/br/data/preprocessing/pc_preprocessing/pcna.py b/src/br/data/preprocessing/pc_preprocessing/pcna.py index 64fb235..bb9ddd4 100644 --- a/src/br/data/preprocessing/pc_preprocessing/pcna.py +++ b/src/br/data/preprocessing/pc_preprocessing/pcna.py @@ -1,6 +1,7 @@ -from multiprocessing import Pool import argparse +from multiprocessing import Pool from pathlib import Path + import numpy as np import pandas as pd from pyntcloud import PyntCloud @@ -77,6 +78,7 @@ def get_center_of_mass(img): center_of_mass = np.mean(np.stack(np.where(img > 0)), axis=1) return np.floor(center_of_mass + 0.5).astype(int) + def main(args): # make save path directory @@ -85,9 +87,7 @@ def main(args): df = pd.read_parquet(args.preprocessed_manifest) if args.global_path: - df["registered_path"] = df["registered_path"].apply( - lambda x: args.global_path + x - ) + df["registered_path"] = df["registered_path"].apply(lambda x: args.global_path + x) global path_prefix path_prefix = args.save_path @@ -110,7 +110,9 @@ def main(args): if __name__ == "__main__": - parser = argparse.ArgumentParser(description="Script for computing point clouds for PCNA dataset") + parser = argparse.ArgumentParser( + description="Script for computing point clouds for PCNA dataset" + ) parser.add_argument("--save_path", type=str, required=True, help="Path to save results.") parser.add_argument( "--global_path", @@ -130,4 +132,4 @@ def main(args): """ python pcna.py --save_path "./make_pcs_test" --preprocessed_manifest "/allen/aics/modeling/ritvik/projects/latest_clones/benchmarking_representations/subpackages/image_preprocessing/tmp_output_pcna/processed/manifest.parquet" --global_path 
"/allen/aics/modeling/ritvik/projects/latest_clones/benchmarking_representations/subpackages/image_preprocessing/ - """ \ No newline at end of file + """ diff --git a/src/br/data/preprocessing/pc_preprocessing/punctate_cyto.py b/src/br/data/preprocessing/pc_preprocessing/punctate_cyto.py index 71fe6b7..fbf2b8e 100644 --- a/src/br/data/preprocessing/pc_preprocessing/punctate_cyto.py +++ b/src/br/data/preprocessing/pc_preprocessing/punctate_cyto.py @@ -1,7 +1,8 @@ +import argparse import warnings from multiprocessing import Pool -import argparse from pathlib import Path + import numpy as np import pandas as pd from aicsimageio import AICSImage @@ -115,12 +116,10 @@ def main(args): Path(args.save_path).mkdir(parents=True, exist_ok=True) df = pd.read_parquet(args.preprocessed_manifest) - df = df.loc[df['structure_name'].isin(SKEW_EXP_DICT.keys())] + df = df.loc[df["structure_name"].isin(SKEW_EXP_DICT.keys())] if args.global_path: - df["registered_path"] = df["registered_path"].apply( - lambda x: args.global_path + x - ) + df["registered_path"] = df["registered_path"].apply(lambda x: args.global_path + x) global path_prefix path_prefix = args.save_path @@ -143,7 +142,9 @@ def main(args): if __name__ == "__main__": - parser = argparse.ArgumentParser(description="Script for computing point clouds for cytoplasmic structures from WTC-11 hIPS single cell image dataset") + parser = argparse.ArgumentParser( + description="Script for computing point clouds for cytoplasmic structures from WTC-11 hIPS single cell image dataset" + ) parser.add_argument("--save_path", type=str, required=True, help="Path to save results.") parser.add_argument( "--global_path", @@ -159,4 +160,4 @@ def main(args): help="Path to processed single cell image manifest.", ) args = parser.parse_args() - main(args) \ No newline at end of file + main(args) diff --git a/src/br/data/preprocessing/pc_preprocessing/punctate_nuc.py b/src/br/data/preprocessing/pc_preprocessing/punctate_nuc.py index e5237c1..feb34e1 
100644 --- a/src/br/data/preprocessing/pc_preprocessing/punctate_nuc.py +++ b/src/br/data/preprocessing/pc_preprocessing/punctate_nuc.py @@ -1,14 +1,16 @@ +import argparse +from multiprocessing import Pool +from pathlib import Path + import numpy as np import pandas as pd from pyntcloud import PyntCloud from scipy.ndimage import binary_dilation from skimage.io import imread from tqdm import tqdm -import argparse -from pathlib import Path -from multiprocessing import Pool -STRUCTS = ['HIST1H2BJ', 'NUP153', 'SMC1A', 'SON'] +STRUCTS = ["HIST1H2BJ", "NUP153", "SMC1A", "SON"] + def compute_labels(row, save=True): path = row["registered_path"] @@ -86,12 +88,10 @@ def main(args): Path(args.save_path).mkdir(parents=True, exist_ok=True) df = pd.read_parquet(args.preprocessed_manifest) - df = df.loc[df['structure_name'].isin(STRUCTS)] + df = df.loc[df["structure_name"].isin(STRUCTS)] if args.global_path: - df["registered_path"] = df["registered_path"].apply( - lambda x: args.global_path + x - ) + df["registered_path"] = df["registered_path"].apply(lambda x: args.global_path + x) global path_prefix path_prefix = args.save_path @@ -114,7 +114,9 @@ def main(args): if __name__ == "__main__": - parser = argparse.ArgumentParser(description="Script for computing point clouds for nuclear structures from WTC-11 hIPS single cell image dataset") + parser = argparse.ArgumentParser( + description="Script for computing point clouds for nuclear structures from WTC-11 hIPS single cell image dataset" + ) parser.add_argument("--save_path", type=str, required=True, help="Path to save results.") parser.add_argument( "--global_path", @@ -130,4 +132,4 @@ def main(args): help="Path to processed single cell image manifest.", ) args = parser.parse_args() - main(args) \ No newline at end of file + main(args) From 1d99d1387e238e7a3c1eadd9dc43512d06389798 Mon Sep 17 00:00:00 2001 From: Ritvik Vasan Date: Thu, 12 Dec 2024 09:41:50 -0800 Subject: [PATCH 3/4] doc changes + pre-commit --- 
docs/PREPROCESSING.md | 6 ++++-- src/br/analysis/visualize_pointclouds.py | 12 +++++++----- 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/docs/PREPROCESSING.md b/docs/PREPROCESSING.md index 4d74e0f..e529c11 100644 --- a/docs/PREPROCESSING.md +++ b/docs/PREPROCESSING.md @@ -37,7 +37,7 @@ Preprocessing is divided into three steps that use two different virtual environ # Punctate structures: Generate pointclouds -Edit the data paths in the following file to match the location of the outputs of the alignment, masking, and registration step, then run it. +Use the preprocessed data manifest generated via the alignment, masking, and registration steps from image as input to the pointcloud generation step ``` src @@ -45,7 +45,9 @@ src └── data    └── preprocessing       └── pc_preprocessing -          └── punctate_cyto.py <- Point cloud sampling from raw images for punctate structures here +          └── pcna.py <- Point cloud sampling from raw images for DNA replication foci dataset here +          └── punctate_nuc.py <- Point cloud sampling from raw images of nuclear structures from the WTC-11 hIPS single cell image dataset here +          └── punctate_cyto.py <- Point cloud sampling from raw images of cytoplasmic structures from the WTC-11 hIPS single cell image dataset here ``` # Polymorphic structures: Generate SDFs diff --git a/src/br/analysis/visualize_pointclouds.py b/src/br/analysis/visualize_pointclouds.py index 4a785db..4401dad 100644 --- a/src/br/analysis/visualize_pointclouds.py +++ b/src/br/analysis/visualize_pointclouds.py @@ -80,7 +80,7 @@ def main(args): for _, this_image in orig_image_df.iterrows(): cell_id = this_image["CellId"] if not strat: - strat_val = this_image['structure_name'] + strat_val = this_image["structure_name"] if args.dataset_name == "pcna": points_all, _, img, center = compute_labels_pcna(this_image, False) @@ -108,10 +108,12 @@ def main(args): if mem_ind is not None: img_mem = img[mem_ind] - if (args.dataset_name == 
'other_punctate') and (strat_val in ["CETN2", "RAB5A", "SLC25A17"]): - img_raw = np.where(img_mem, img_raw, 0) # mask by mem/nuc seg + if (args.dataset_name == "other_punctate") and ( + strat_val in ["CETN2", "RAB5A", "SLC25A17"] + ): + img_raw = np.where(img_mem, img_raw, 0) # mask by mem/nuc seg else: - img_raw = np.where(img_nuc, img_raw, 0) # mask by mem/nuc seg + img_raw = np.where(img_nuc, img_raw, 0) # mask by mem/nuc seg # Sample sparse point cloud and get images probs2 = points_all["s"].values @@ -168,7 +170,7 @@ def main(args): center_slice=center_slice, ) ax_array[2].set_title("Sampling sparse PC") - print(f'Saving {name}.png') + print(f"Saving {name}.png") fig.savefig(Path(args.save_path) / Path(f"{name}.png"), bbox_inches="tight", dpi=300) From e4e50182d2daad464906b2ae64da4293f737d079 Mon Sep 17 00:00:00 2001 From: Ritvik Vasan Date: Thu, 12 Dec 2024 09:47:29 -0800 Subject: [PATCH 4/4] add example run doc --- src/br/data/preprocessing/pc_preprocessing/pcna.py | 4 +++- src/br/data/preprocessing/pc_preprocessing/punctate_cyto.py | 6 ++++++ src/br/data/preprocessing/pc_preprocessing/punctate_nuc.py | 6 ++++++ 3 files changed, 15 insertions(+), 1 deletion(-) diff --git a/src/br/data/preprocessing/pc_preprocessing/pcna.py b/src/br/data/preprocessing/pc_preprocessing/pcna.py index bb9ddd4..5e1f080 100644 --- a/src/br/data/preprocessing/pc_preprocessing/pcna.py +++ b/src/br/data/preprocessing/pc_preprocessing/pcna.py @@ -131,5 +131,7 @@ def main(args): main(args) """ - python pcna.py --save_path "./make_pcs_test" --preprocessed_manifest "/allen/aics/modeling/ritvik/projects/latest_clones/benchmarking_representations/subpackages/image_preprocessing/tmp_output_pcna/processed/manifest.parquet" --global_path "/allen/aics/modeling/ritvik/projects/latest_clones/benchmarking_representations/subpackages/image_preprocessing/ + Example run: + + python src/br/data/preprocessing/pc_preprocessing/pcna.py --save_path "./make_pcs_test" --preprocessed_manifest
"./subpackages/image_preprocessing/tmp_output_pcna/processed/manifest.parquet" --global_path "./subpackages/image_preprocessing/" """ diff --git a/src/br/data/preprocessing/pc_preprocessing/punctate_cyto.py b/src/br/data/preprocessing/pc_preprocessing/punctate_cyto.py index fbf2b8e..d1a97ae 100644 --- a/src/br/data/preprocessing/pc_preprocessing/punctate_cyto.py +++ b/src/br/data/preprocessing/pc_preprocessing/punctate_cyto.py @@ -161,3 +161,9 @@ def main(args): ) args = parser.parse_args() main(args) + + """ + Example run: + + python src/br/data/preprocessing/pc_preprocessing/punctate_cyto.py --save_path "./make_pcs_test" --preprocessed_manifest "./subpackages/image_preprocessing/tmp_output_variance/processed/manifest.parquet" --global_path "./subpackages/image_preprocessing/" + """ diff --git a/src/br/data/preprocessing/pc_preprocessing/punctate_nuc.py b/src/br/data/preprocessing/pc_preprocessing/punctate_nuc.py index feb34e1..3c551c0 100644 --- a/src/br/data/preprocessing/pc_preprocessing/punctate_nuc.py +++ b/src/br/data/preprocessing/pc_preprocessing/punctate_nuc.py @@ -133,3 +133,9 @@ def main(args): ) args = parser.parse_args() main(args) + + """ + Example run: + + python src/br/data/preprocessing/pc_preprocessing/punctate_nuc.py --save_path "./make_pcs_test" --preprocessed_manifest "./subpackages/image_preprocessing/tmp_output_variance/processed/manifest.parquet" --global_path "./subpackages/image_preprocessing/" + """