diff --git a/docs/PREPROCESSING.md b/docs/PREPROCESSING.md index 4d74e0f..e529c11 100644 --- a/docs/PREPROCESSING.md +++ b/docs/PREPROCESSING.md @@ -37,7 +37,7 @@ Preprocessing is divided into three steps that use two different virtual environ # Punctate structures: Generate pointclouds -Edit the data paths in the following file to match the location of the outputs of the alignment, masking, and registration step, then run it. +Use the preprocessed data manifest generated by the image alignment, masking, and registration steps as input to the pointcloud generation step. ``` src @@ -45,7 +45,9 @@ src └── data    └── preprocessing       └── pc_preprocessing -          └── punctate_cyto.py <- Point cloud sampling from raw images for punctate structures here +          └── pcna.py <- Point cloud sampling from raw images for DNA replication foci dataset here +          └── punctate_nuc.py <- Point cloud sampling from raw images of nuclear structures from the WTC-11 hIPS single cell image dataset here +          └── punctate_cyto.py <- Point cloud sampling from raw images of cytoplasmic structures from the WTC-11 hIPS single cell image dataset here ``` # Polymorphic structures: Generate SDFs diff --git a/src/br/analysis/visualize_pointclouds.py b/src/br/analysis/visualize_pointclouds.py index 4a785db..4401dad 100644 --- a/src/br/analysis/visualize_pointclouds.py +++ b/src/br/analysis/visualize_pointclouds.py @@ -80,7 +80,7 @@ def main(args): for _, this_image in orig_image_df.iterrows(): cell_id = this_image["CellId"] if not strat: - strat_val = this_image['structure_name'] + strat_val = this_image["structure_name"] if args.dataset_name == "pcna": points_all, _, img, center = compute_labels_pcna(this_image, False) @@ -108,10 +108,12 @@ def main(args): if mem_ind is not None: img_mem = img[mem_ind] - if (args.dataset_name == 'other_punctate') and (strat_val in ["CETN2", "RAB5A", "SLC25A17"]): - img_raw = np.where(img_mem, img_raw, 0) # mask by mem/nuc seg + if 
(args.dataset_name == "other_punctate") and ( + strat_val in ["CETN2", "RAB5A", "SLC25A17"] + ): + img_raw = np.where(img_mem, img_raw, 0) # mask by mem/nuc seg else: - img_raw = np.where(img_nuc, img_raw, 0) # mask by mem/nuc seg + img_raw = np.where(img_nuc, img_raw, 0) # mask by mem/nuc seg # Sample sparse point cloud and get images probs2 = points_all["s"].values @@ -168,7 +170,7 @@ def main(args): center_slice=center_slice, ) ax_array[2].set_title("Sampling sparse PC") - print(f'Saving {name}.png') + print(f"Saving {name}.png") fig.savefig(Path(args.save_path) / Path(f"{name}.png"), bbox_inches="tight", dpi=300) diff --git a/src/br/data/preprocessing/pc_preprocessing/pcna.py b/src/br/data/preprocessing/pc_preprocessing/pcna.py index 15fb431..5e1f080 100644 --- a/src/br/data/preprocessing/pc_preprocessing/pcna.py +++ b/src/br/data/preprocessing/pc_preprocessing/pcna.py @@ -1,4 +1,6 @@ +import argparse from multiprocessing import Pool +from pathlib import Path import numpy as np import pandas as pd @@ -64,12 +66,12 @@ def compute_labels(row, save=True): cell_id = str(row["CellId"]) - save_path = path_prefix + cell_id + ".ply" + save_path = Path(path_prefix) / Path(cell_id + ".ply") new_cents = new_cents.astype(float) cloud = PyntCloud(new_cents) - cloud.to_file(save_path) + cloud.to_file(str(save_path)) def get_center_of_mass(img): @@ -77,13 +79,21 @@ def get_center_of_mass(img): return np.floor(center_of_mass + 0.5).astype(int) -if __name__ == "__main__": - df = pd.read_csv(PCNA_SINGLE_CELL_PATH) +def main(args): + + # make save path directory + Path(args.save_path).mkdir(parents=True, exist_ok=True) + + df = pd.read_parquet(args.preprocessed_manifest) + + if args.global_path: + df["registered_path"] = df["registered_path"].apply(lambda x: args.global_path + x) - path_prefix = SAVE_LOCATION + global path_prefix + path_prefix = args.save_path all_rows = [] - for ind, row in tqdm(df.iterrows(), total=len(df)): + for _, row in tqdm(df.iterrows(), total=len(df)): 
all_rows.append(row) with Pool(40) as p: @@ -97,3 +107,31 @@ def get_center_of_mass(img): desc="compute_everything", ) ) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description="Script for computing point clouds for PCNA dataset" + ) + parser.add_argument("--save_path", type=str, required=True, help="Path to save results.") + parser.add_argument( + "--global_path", + type=str, + default=None, + required=False, + help="Path to append to relative paths in preprocessed manifest", + ) + parser.add_argument( + "--preprocessed_manifest", + type=str, + required=True, + help="Path to processed single cell image manifest.", + ) + args = parser.parse_args() + main(args) + + """ + Example run: + + python src/br/data/preprocessing/pc_preprocessing/pcna.py --save_path "./make_pcs_test" --preprocessed_manifest "./subpackages/image_preprocessing/tmp_output_pcna/processed/manifest.parquet" --global_path "./subpackages/image_preprocessing/" + """ diff --git a/src/br/data/preprocessing/pc_preprocessing/punctate_cyto.py b/src/br/data/preprocessing/pc_preprocessing/punctate_cyto.py index f148ba1..d1a97ae 100644 --- a/src/br/data/preprocessing/pc_preprocessing/punctate_cyto.py +++ b/src/br/data/preprocessing/pc_preprocessing/punctate_cyto.py @@ -1,5 +1,7 @@ +import argparse import warnings from multiprocessing import Pool +from pathlib import Path import numpy as np import pandas as pd @@ -96,11 +98,11 @@ def compute_labels(row, save=True): cell_id = str(row["CellId"]) - save_path = path_prefix + cell_id + ".ply" + save_path = Path(path_prefix) / Path(cell_id + ".ply") new_cents = new_cents.astype(float) cloud = PyntCloud(new_cents) - cloud.to_file(save_path) + cloud.to_file(str(save_path)) def get_center_of_mass(img): @@ -108,13 +110,22 @@ def get_center_of_mass(img): return np.floor(center_of_mass + 0.5).astype(int) -if __name__ == "__main__": - df = pd.read_parquet(SINGLE_CELL_IMAGE_PATH) +def main(args): + + # make save path directory + 
Path(args.save_path).mkdir(parents=True, exist_ok=True) + + df = pd.read_parquet(args.preprocessed_manifest) + df = df.loc[df["structure_name"].isin(SKEW_EXP_DICT.keys())] - path_prefix = SAVE_LOCATION + if args.global_path: + df["registered_path"] = df["registered_path"].apply(lambda x: args.global_path + x) + + global path_prefix + path_prefix = args.save_path all_rows = [] - for ind, row in tqdm(df.iterrows(), total=len(df)): + for _, row in tqdm(df.iterrows(), total=len(df)): all_rows.append(row) with Pool(40) as p: @@ -128,3 +139,31 @@ def get_center_of_mass(img): desc="compute_everything", ) ) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description="Script for computing point clouds for cytoplasmic structures from WTC-11 hIPS single cell image dataset" + ) + parser.add_argument("--save_path", type=str, required=True, help="Path to save results.") + parser.add_argument( + "--global_path", + type=str, + default=None, + required=False, + help="Path to append to relative paths in preprocessed manifest", + ) + parser.add_argument( + "--preprocessed_manifest", + type=str, + required=True, + help="Path to processed single cell image manifest.", + ) + args = parser.parse_args() + main(args) + + """ + Example run: + + python src/br/data/preprocessing/pc_preprocessing/punctate_cyto.py --save_path "./make_pcs_test" --preprocessed_manifest "./subpackages/image_preprocessing/tmp_output_variance/processed/manifest.parquet" --global_path "./subpackages/image_preprocessing/" + """ diff --git a/src/br/data/preprocessing/pc_preprocessing/punctate_nuc.py b/src/br/data/preprocessing/pc_preprocessing/punctate_nuc.py index 495b08c..3c551c0 100644 --- a/src/br/data/preprocessing/pc_preprocessing/punctate_nuc.py +++ b/src/br/data/preprocessing/pc_preprocessing/punctate_nuc.py @@ -1,3 +1,7 @@ +import argparse +from multiprocessing import Pool +from pathlib import Path + import numpy as np import pandas as pd from pyntcloud import PyntCloud @@ -5,6 +9,8 @@ 
from skimage.io import imread from tqdm import tqdm +STRUCTS = ["HIST1H2BJ", "NUP153", "SMC1A", "SON"] + def compute_labels(row, save=True): path = row["registered_path"] @@ -64,11 +70,11 @@ def compute_labels(row, save=True): cell_id = str(row["CellId"]) - save_path = path_prefix + cell_id + ".ply" + save_path = Path(path_prefix) / Path(cell_id + ".ply") new_cents = new_cents.astype(float) cloud = PyntCloud(new_cents) - cloud.to_file(save_path) + cloud.to_file(str(save_path)) def get_center_of_mass(img): @@ -76,19 +82,23 @@ def get_center_of_mass(img): return np.floor(center_of_mass + 0.5).astype(int) -if __name__ == "__main__": - df = pd.read_parquet(SINGLE_CELL_IMAGE_PATH) +def main(args): + + # make save path directory + Path(args.save_path).mkdir(parents=True, exist_ok=True) + + df = pd.read_parquet(args.preprocessed_manifest) + df = df.loc[df["structure_name"].isin(STRUCTS)] - path_prefix = SAVE_LOCATION + if args.global_path: + df["registered_path"] = df["registered_path"].apply(lambda x: args.global_path + x) + + global path_prefix + path_prefix = args.save_path all_rows = [] - for ind, row in tqdm(df.iterrows(), total=len(df)): + for _, row in tqdm(df.iterrows(), total=len(df)): all_rows.append(row) - # if str(row['CellId']) == '660844': - # print('yes') - # compute_labels(row) - - from multiprocessing import Pool with Pool(40) as p: _ = tuple( @@ -101,3 +111,31 @@ def get_center_of_mass(img): desc="compute_everything", ) ) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description="Script for computing point clouds for nuclear structures from WTC-11 hIPS single cell image dataset" + ) + parser.add_argument("--save_path", type=str, required=True, help="Path to save results.") + parser.add_argument( + "--global_path", + type=str, + default=None, + required=False, + help="Path to append to relative paths in preprocessed manifest", + ) + parser.add_argument( + "--preprocessed_manifest", + type=str, + required=True, + help="Path to 
processed single cell image manifest.", + ) + args = parser.parse_args() + main(args) + + """ + Example run: + + python src/br/data/preprocessing/pc_preprocessing/punctate_nuc.py --save_path "./make_pcs_test" --preprocessed_manifest "./subpackages/image_preprocessing/tmp_output_variance/processed/manifest.parquet" --global_path "./subpackages/image_preprocessing/" + """