Skip to content

Commit

Permalink
drafted script for sourcing rat images
Browse files Browse the repository at this point in the history
  • Loading branch information
niksirbi committed Oct 21, 2024
1 parent e07cd0c commit ec73930
Showing 1 changed file with 281 additions and 0 deletions.
281 changes: 281 additions & 0 deletions examples/rat/1_source_images.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,281 @@
"""
Identify source images for the SWC rat template
===============================================
Set up project directory with all relevant data and identify source images
to be used for building the SWC rat brain template.
This scipt must be run on the SWC HPC cluster.
"""

# %%
# Imports
# -------
import os
import shutil
from datetime import date
from pathlib import Path

import pandas as pd
from loguru import logger
from tqdm import tqdm

from brainglobe_template_builder.io import ( # type: ignore
get_path_from_env_variable,
get_unique_folder_in_dir,
load_tiff,
)
from brainglobe_template_builder.plots import plot_grid, plot_orthographic

# Set up directories and logging
# ------------------------------

# Prepare directory structure
atlas_dir = get_path_from_env_variable(
"ATLAS_FORGE_DIR", "/ceph/neuroinformatics/neuroinformatics/atlas-forge"
)

species_id = "Rat" # the subfolder names within atlas_dir
species_dir = atlas_dir / species_id

species_dir.mkdir(parents=True, exist_ok=True)
# Make "rawdata", "derivatives", "templates", and "logs" directories
for folder in ["rawdata", "derivatives", "templates", "logs"]:
(species_dir / folder).mkdir(exist_ok=True)

# Directory where source images are stored for this species
# This must contain subfolders for each subject
source_dir = get_path_from_env_variable(
"ATLAS_SOURCE_DIR", "/ceph/akrami/capsid_testing/imaging/2p"
)

# Set up logging
today = date.today()
current_script_name = os.path.basename(__file__).replace(".py", "")
logger.add(species_dir / "logs" / f"{today}_{current_script_name}.log")
logger.info(f"Will save outputs to {species_dir}.")

# %%
# Load a dataframe with all SWC brains used for atlases
# ------------------------------------------------------

path_of_this_script = Path(__file__).resolve()
source_csv_dir = path_of_this_script.parent.parent.parent / "data"
source_csv_path = source_csv_dir / "SWC_brain-list_for-atlases_2024-04-15.csv"
df = pd.read_csv(source_csv_path)

# Strip trailing space from str columns and from column names
df = df.apply(lambda x: x.str.strip() if x.dtype == "object" else x)
df.columns = df.columns.str.strip()
# Select only data for the species of interest
species_common_name = "rat" # as in the source csv table
df = df[df["Common name"] == species_common_name]
logger.info(f"Found {len(df)} {species_id} subjects.")

# %%
# Exract subject IDs and paths
# ----------------------------

# Rename "Specimen ID" to "subject_id"
df["subject_id"] = df["Specimen ID"]
# Assert that subject IDs are unique
assert len(df["subject_id"].unique()) == len(df), "Non-unique subject IDs"


# Use the Ceph path as the source data directory for all subjects
# (All data have been migrated there, after the table was created)
data_path_col = "Data path (raw)" # column name in the source csv
df[data_path_col] = source_dir.as_posix()


# Find each subject's source data folder
df["subject_path"] = df.apply(
lambda x: get_unique_folder_in_dir(
Path(x[data_path_col]), x["subject_id"], str_position="end"
),
axis=1,
)

sub_ids = df["subject_id"].values
logger.info(f"Identified {sub_ids.size} unique subject IDs. {sub_ids}")
sub_paths_msg = "Subject paths:\n"
for idx, row in df.iterrows():
sub_id = row["subject_id"]
sub_path = row["subject_path"].as_posix()
sub_paths_msg += f" {sub_id}: {sub_path}\n"
logger.debug(sub_paths_msg)

# %%
# Identify all downsampled images
# -------------------------------
# We will identify all downsampled images for each subject, by iterating over
# the downsampled_stacks folders and the various channels.
# We will aggregate the image information and paths in a dataframe.
# We assume the standard naming conventions of SWC's serial two-photon output.

valid_colors = ["far_red", "red", "green", "blue"] # order matters

images_list_of_dicts = []
for _, row in df.iterrows():
sub = row["subject_id"]

# Path to the downsampled stacks (often, but not always, in a subfolder)
sub_path_down = row["subject_path"]
if "downsampled_stacks" in os.listdir(sub_path_down):
sub_path_down = sub_path_down / "downsampled_stacks"

# Stacks are stored for each resolution in a separate folder
# These folders are named like "010_micron", "025_micron", etc.
stacks = [f for f in os.listdir(sub_path_down) if f.endswith("micron")]

for stack in sorted(stacks):
microns = int(stack.split("_")[0])
stack_path = sub_path_down / stack

images = [
f
for f in os.listdir(stack_path)
if f.endswith(".tif") and "_ch0" in f and not f.startswith(".")
]
for img in sorted(images):
ch_num = int(img.split("_ch0")[1].split("_")[0])
ch_color = [
c for c in valid_colors if img.split(".tif")[0].endswith(c)
][0]
ch_color = ch_color if ch_color != "far_red" else "farred"
image_id = f"sub-{sub}_res-{microns}um_channel-{ch_color}"
images_list_of_dicts.append(
{
"subject_id": sub,
"microns": microns,
"channel": ch_num,
"color": ch_color,
"image_id": image_id,
"image_path": stack_path / img,
}
)

images_df = pd.DataFrame(images_list_of_dicts)
n_img = len(images_df)
logger.info(f"Found {n_img} images across subjects, resolutions and channels.")

# %%
# Check for missing downsampled stacks
# -------------------------------------
# Logs a warning if any of the expected resolutions are missing

expected_microns = [10, 25, 50]
for sub in df["subject_id"]:
for microns in expected_microns:
if not (
images_df[
(images_df["subject_id"] == sub)
& (images_df["microns"] == microns)
].shape[0]
):
logger.warning(f"Subject {sub} lacks {microns} micron stack")


# %%
# Save dataframes and images to rawdata
# -------------------------------------
# Save the dataframes to the species rawdata directory.

rawdata_dir = species_dir / "rawdata"
subjects_csv = rawdata_dir / f"{today}_subjects.csv"
df.to_csv(subjects_csv, index=False)
logger.info(f"Saved subject information to csv: {subjects_csv}")

images_csv = rawdata_dir / f"{today}_images.csv"
images_df.to_csv(images_csv, index=False)
logger.info(f"Saved image information to csv: {images_csv}")


# %%
# Save images to the species rawdata directory
# (doesn't overwrite existing images).
# High-resolution images (10um) are just symlinked to avoid duplication
# of large files.

n_copied, n_symlinked = 0, 0
for idx in tqdm(images_df.index):
row = images_df.loc[idx, :]
sub = row["subject_id"]
sub_dir = rawdata_dir / f"sub-{sub}"
sub_dir.mkdir(exist_ok=True)
microns = row["microns"]
image_id = row["image_id"]
img_source_path = row["image_path"]
img_dest_path = sub_dir / f"{image_id}.tif"

print(img_dest_path)
# if the destination path exists, skip
if img_dest_path.exists():
logger.debug(f"Skipping {img_dest_path} as it already exists.")
continue

# if image is 10 microns, symlink it (to avoid duplication of large files)
if microns == 10:
img_dest_path.symlink_to(img_source_path)
logger.debug(f"Symlinked {img_dest_path} to {img_source_path}")
n_symlinked += 1
# else copy the image
else:
shutil.copyfile(img_source_path, img_dest_path)
logger.debug(f"Copied {img_source_path} to {img_dest_path}")
n_copied += 1

logger.info(
f"Copied {n_copied} and symlinked {n_symlinked} "
f"images to {rawdata_dir}."
)


# %%
# Save diagnostic plots to get a quick overview of the images.
# Plots will only be generated for the low-resolution images (50um).
# Plots are saved in the rawdata/sub-<subject_id>/plots folder.

subjects = sorted([f for f in os.listdir(rawdata_dir) if f.startswith("sub-")])

for sub in tqdm(subjects):
sub_dir = rawdata_dir / sub

# Find all 50um images for the subject
images = [
f
for f in os.listdir(sub_dir)
if f.endswith(".tif") and "res-50um" in f
]

# Plots will be saved in a subfolder
if images:
sub_plot_dir = sub_dir / "plots"
sub_plot_dir.mkdir(exist_ok=True)
logger.debug(f"Saving plots to {sub_plot_dir}...")

for img in images:
# load the tiff image as numpy array
img_path = sub_dir / img
try:
img = load_tiff(img_path)
except Exception as e:
logger.error(f"Failed to load {img_path}: {e}")
continue

# Plot frontal (coronal) sections in a grid
fig, _ = plot_grid(
img,
anat_space="PSL",
section="frontal",
n_slices=12,
save_path=sub_plot_dir / f"{img_path.stem}_grid",
)
logger.debug(f"Saved grid plot for {img_path.stem}.")
# Plot the image in three orthogonal views + max intensity projection
fig, _ = plot_orthographic(
img,
anat_space="PSL",
save_path=sub_plot_dir / f"{img_path.stem}_orthographic",
mip_attenuation=0.02,
)
logger.debug(f"Saved orthographic plot for {img_path.stem}.")

0 comments on commit ec73930

Please sign in to comment.