diff --git a/src/datasets/loaders/vizgen_merscope/config.vsh.yaml b/src/datasets/loaders/vizgen_merscope/config.vsh.yaml
new file mode 100644
index 00000000..acdaf8cd
--- /dev/null
+++ b/src/datasets/loaders/vizgen_merscope/config.vsh.yaml
@@ -0,0 +1,77 @@
+name: vizgen_merscope
+namespace: datasets/loaders
+
+argument_groups:
+  - name: Inputs
+    arguments:
+      - type: string
+        name: --gcloud_bucket
+        default: "vz-ffpe-showcase"
+        description: "Google Cloud bucket name"
+      - type: string
+        name: --dataset_bucket_name
+        default: "HumanBreastCancerPatient1"
+        description: "Name of the dataset base directory in the Vizgen Google Cloud bucket"
+      - type: string
+        name: --segmentation_id
+        default: ["cell"]
+        description: The segmentation identifier. Currently only "cell" is supported.
+        multiple: true
+  - name: Metadata
+    arguments:
+      - type: string
+        name: --id
+        description: "A unique identifier for the dataset"
+        required: true
+      - name: --dataset_name
+        type: string
+        description: Nicely formatted name.
+        required: true
+      - type: string
+        name: --dataset_url
+        description: Link to the original source of the dataset.
+        required: false
+      - name: --dataset_reference
+        type: string
+        description: Bibtex reference of the paper in which the dataset was published.
+        required: false
+      - name: --dataset_summary
+        type: string
+        description: Short description of the dataset.
+        required: true
+      - name: --dataset_description
+        type: string
+        description: Long description of the dataset.
+        required: true
+      - name: --dataset_organism
+        type: string
+        description: The organism of the sample in the dataset.
+        required: false
+  - name: Outputs
+    arguments:
+      - name: "--output"
+        __merge__: /src/api/file_common_ist.yaml
+        direction: output
+        required: true
+
+resources:
+  - type: python_script
+    path: script.py
+  - path: download.sh
+
+engines:
+  - type: docker
+    image: openproblems/base_python:1.0.0
+    __merge__:
+      - /src/base/setup_spatialdata_partial.yaml
+    setup:
+      - type: python
+        pypi:
+          - spatialdata-io
+  - type: native
+
+runners:
+  - type: executable
+  - type: nextflow
+    directives:
+      label: [highmem, midcpu, midtime]
diff --git a/src/datasets/loaders/vizgen_merscope/download.sh b/src/datasets/loaders/vizgen_merscope/download.sh
new file mode 100755
index 00000000..07dfee2e
--- /dev/null
+++ b/src/datasets/loaders/vizgen_merscope/download.sh
@@ -0,0 +1,91 @@
+#!/bin/bash
+
+######################################################################
+# Requirements:
+# - Install google-cloud-sdk, e.g. mamba install -c conda-forge google-cloud-sdk
+# - Authenticate with your Google account: gcloud auth login --no-browser
+#
+# Usage: download.sh BUCKET_NAME DATASET OUT_DIR DRY_RUN
+# - DATASET refers to the naming in the Vizgen download page
+#
+# Vizgen cancer panel datasets download page: https://console.cloud.google.com/storage/browser/vz-ffpe-showcase
+# The script should also work for other Vizgen-style data buckets.
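+#
+# Example invocation (illustrative only; the bucket and dataset names below are the
+# defaults from config.vsh.yaml, and ./raw_data is an arbitrary output directory):
+#   ./download.sh vz-ffpe-showcase HumanBreastCancerPatient1 ./raw_data true    # dry-run: only list files
+#   ./download.sh vz-ffpe-showcase HumanBreastCancerPatient1 ./raw_data false   # download the data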
+
+BUCKET_NAME=$1
+DATASET=$2
+OUT_DIR=$3
+DRY_RUN=$4
+
+######################################################################
+
+# Suppress BrokenPipeError messages from Python
+export PYTHONWARNINGS="ignore:BrokenPipeError"
+
+LIST_LIMIT=10  # Maximum number of files to list in dry-run mode
+
+# List of files or directories to download
+FILES=(
+    "cell_by_gene.csv"
+    "cell_metadata.csv"
+    "detected_transcripts.csv"
+    "cell_boundaries/"  # This is a directory
+    "cell_boundaries.parquet"
+    "images/mosaic_DAPI_z3.tif"  # If you want all images, change this to "images/"
+)
+
+# Check if gsutil is installed
+if ! command -v gsutil &> /dev/null; then
+    echo "ERROR: gsutil is not installed. Please install the Google Cloud SDK."
+    exit 1
+fi
+
+# Check if user is authenticated
+if ! gcloud auth list 2>&1 | grep -q ACTIVE; then
+    echo "ERROR: No authenticated Google Cloud account found. Please authenticate using 'gcloud auth login'."
+    exit 1
+fi
+
+
+# Loop over each file or directory to download
+for FILE in "${FILES[@]}"; do
+    # Get the directory from the file path (remove the file name)
+    DIR=$(dirname "$FILE")
+
+    # Create the full target directory path
+    TARGET_DIR="$OUT_DIR/$DIR"
+
+    # If DIR is '.', set TARGET_DIR to OUT_DIR
+    if [ "$DIR" = "." ]; then
+        TARGET_DIR="$OUT_DIR"
+    fi
+
+    # Create the directory if it doesn't exist
+    mkdir -p "$TARGET_DIR"
+
+    # Check if the file or directory exists in the bucket
+    if gsutil ls -d "gs://$BUCKET_NAME/$DATASET/$FILE" > /dev/null 2>&1; then
+        if [ "$DRY_RUN" = true ]; then
+            # Dry-run: list files instead of downloading
+            echo -e "\nSimulating download for gs://$BUCKET_NAME/$DATASET/$FILE"
+            echo -e "Download to $TARGET_DIR"
+
+            if [[ "$FILE" == */ ]]; then
+                echo "Listing first $LIST_LIMIT files from gs://$BUCKET_NAME/$DATASET/$FILE"
+                gsutil ls "gs://$BUCKET_NAME/$DATASET/$FILE" 2>/dev/null | head -n $LIST_LIMIT
+            else
+                gsutil ls "gs://$BUCKET_NAME/$DATASET/$FILE"
+            fi
+        else
+            # If it's a directory, download recursively
+            if [[ "$FILE" == */ ]]; then
+                echo "Downloading directory gs://$BUCKET_NAME/$DATASET/$FILE to $TARGET_DIR/"
+                gsutil -m cp -r "gs://$BUCKET_NAME/$DATASET/$FILE" "$TARGET_DIR/"
+            else
+                # Download the file, maintaining directory structure under OUT_DIR
+                echo "Downloading file gs://$BUCKET_NAME/$DATASET/$FILE to $TARGET_DIR/"
+                gsutil cp "gs://$BUCKET_NAME/$DATASET/$FILE" "$TARGET_DIR/"
+            fi
+        fi
+    else
+        # Skip download if file or directory does not exist
+        echo "Skipping $FILE: does not exist in gs://$BUCKET_NAME/$DATASET/"
+    fi
+done
diff --git a/src/datasets/loaders/vizgen_merscope/script.py b/src/datasets/loaders/vizgen_merscope/script.py
new file mode 100644
index 00000000..fa5d25b2
--- /dev/null
+++ b/src/datasets/loaders/vizgen_merscope/script.py
@@ -0,0 +1,198 @@
+# Dataset source: https://vizgen.com/data-release-program/
+
+import os
+import stat
+from pathlib import Path
+import subprocess
+from datetime import datetime
+import spatialdata as sd
+import spatialdata_io as sdio
+
+## VIASH START
+par = {
+    "gcloud_bucket": "vz-ffpe-showcase",
+    "dataset_bucket_name": "HumanBreastCancerPatient1",
+    "segmentation_id": ["cell"],
+    "output": "output.zarr",
+    "id": "vizgen_merscope/2022_vizgen_human_breast_cancer_merfish/rep1",
+    # "dataset_id": "2022Vizgen_human_breast_cancer_Patient1",
+    "dataset_name": "value",
+    "dataset_url": "https://vizgen.com/data-release-program/",
+    "dataset_reference": "value",
+    "dataset_summary": "value",
+    "dataset_description": "value",
+    "dataset_organism": "human",
+}
+## VIASH END
+
+assert ("cell" in par["segmentation_id"]) and (len(par["segmentation_id"]) == 1), \
+    "Currently only the 'cell' segmentation is supported: cell labels are derived in this script, and MERSCOPE does not provide other segmentations."
+
+t0 = datetime.now()
+
+# Download raw data
+print(datetime.now() - t0, "Download raw data", flush=True)
+download_script = f"{meta['resources_dir']}/download.sh"  # https://viash.io/guide/component/variables.html#meta-variables-in-meta
+BUCKET_NAME = par["gcloud_bucket"]
+DATASET = par["dataset_bucket_name"]
+OUT_DIR = meta["temp_dir"]
+DRY_RUN = "false"
+
+# Change permissions of the download script to make it executable
+os.chmod(download_script, stat.S_IRUSR | stat.S_IWUSR | stat.S_IXUSR)  # grant read, write, and execute permissions to the user
+
+# Run the download script
+out = subprocess.run(
+    [download_script, BUCKET_NAME, DATASET, OUT_DIR, DRY_RUN],
+    check=True, capture_output=True, text=True,
+)
+print(out.stdout, flush=True)
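+
+# After a successful (non-dry-run) download, OUT_DIR is expected to contain the files
+# requested in download.sh's FILES list (illustrative overview, not enforced here):
+#   cell_by_gene.csv, cell_metadata.csv, detected_transcripts.csv,
+#   cell_boundaries.parquet (or legacy cell_boundaries/*.hdf5), images/mosaic_DAPI_z3.tif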
+
+# If the cell polygons are in the old format (cell_boundaries/*.hdf5 instead of cell_boundaries.parquet),
+# the raw data needs to be converted for the spatialdata-io loader
+# (see: https://github.com/scverse/spatialdata-io/issues/71#issuecomment-1741995582)
+import h5py
+import pandas as pd
+import geopandas as gpd
+from shapely.geometry import Polygon
+from shapely import MultiPolygon
+
+
+def read_boundary_hdf5(folder):
+    print("Convert boundary hdf5 to parquet", flush=True)
+    all_boundaries = {}
+    boundaries = None
+    hdf5_files = os.listdir(folder + "/cell_boundaries/")
+    n_files = len(hdf5_files)
+    incr = max(1, n_files // 15)  # print progress roughly 15 times; avoid modulo by zero for small datasets
+    for idx, fname in enumerate(hdf5_files):
+        if (idx % incr) == 0:
+            print(f"\tProcessed {idx}/{n_files}", flush=True)
+        with h5py.File(folder + "/cell_boundaries/" + fname, "r") as f:
+            for key in f["featuredata"].keys():
+                # It doesn't matter which zIndex we use; wrap in a MultiPolygon to work with the read function in spatialdata-io
+                polygon = MultiPolygon([Polygon(f["featuredata"][key]["zIndex_3"]["p_0"]["coordinates"][()][0])])
+                if boundaries is not None:
+                    boundaries.loc[key] = polygon
+                else:
+                    boundaries = gpd.GeoDataFrame(geometry=gpd.GeoSeries(polygon, index=[key]))
+        all_boundaries[fname] = boundaries
+        boundaries = None
+    all_concat = pd.concat(list(all_boundaries.values()))
+    all_concat = all_concat[~all_concat.index.duplicated(keep="first")]  # hdf5 can contain duplicates with the same cell_id and position, remove those
+    all_concat.rename_geometry("geometry_renamed", inplace=True)  # renaming to make it compatible with spatialdata-io
+    all_concat["EntityID"] = all_concat.index  # adding to make it compatible with spatialdata-io
+    all_concat["ZIndex"] = 0  # adding to make it compatible with spatialdata-io
+    all_concat.to_parquet(folder + "/cell_boundaries.parquet")
+
+    count_path = f"{folder}/cell_by_gene.csv"
+    obs_path = f"{folder}/cell_metadata.csv"
+
+    data = pd.read_csv(count_path, index_col=0)
+    obs = pd.read_csv(obs_path, index_col=0)
+
+    data.index = obs.index.astype(str)  # data index in the old software is range(n_obs)
+    data.index.name = "cell"  # renaming to make it compatible with spatialdata-io
+    obs.index.name = "EntityID"  # renaming to make it compatible with spatialdata-io
+    data.to_csv(count_path)
+    obs.to_csv(obs_path)
+
+
+if not (Path(OUT_DIR) / "cell_boundaries.parquet").exists():
+    read_boundary_hdf5(str(OUT_DIR))
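+
+# Optional sanity check (illustrative sketch, not part of the loader): the converted parquet
+# should expose the columns spatialdata-io expects, e.g.
+#   gdf = gpd.read_parquet(str(Path(OUT_DIR) / "cell_boundaries.parquet"))
+#   assert {"EntityID", "ZIndex"}.issubset(gdf.columns)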
+
+# Generate spatialdata.zarr
+RAW_DATA_DIR = Path(OUT_DIR)
+
+#########################################
+# Convert raw files to spatialdata zarr #
+#########################################
+print(datetime.now() - t0, "Convert raw files to spatialdata zarr", flush=True)
+
+slide_name = "slide"
+
+sdata = sdio.merscope(
+    RAW_DATA_DIR,
+    vpt_outputs=None,
+    z_layers=3,
+    region_name=None,
+    slide_name=slide_name,
+    backend=None,
+    transcripts=True,
+    cells_boundaries=True,
+    cells_table=True,
+    mosaic_images=True,
+    # imread_kwargs=mappingproxy({}),
+    # image_models_kwargs=mappingproxy({}),
+)
+
+###############
+# Rename keys #
+###############
+print(datetime.now() - t0, "Rename keys", flush=True)
+
+name = slide_name + "_" + RAW_DATA_DIR.name
+
+elements_renaming_map = {
+    f"{name}_z3": "morphology_mip",  # TODO: this is not actually a maximum-intensity projection; either rename the label later or actually project over z. We also want 3D at some point.
+    f"{name}_transcripts": "transcripts",
+    f"{name}_polygons": "cell_polygons",
+    "table": "metadata",
+}
+
+for old_key, new_key in elements_renaming_map.items():
+    sdata[new_key] = sdata[old_key]
+    del sdata[old_key]
+
+# Rename transcript column
+sdata["transcripts"] = sdata["transcripts"].rename(columns={"global_z": "z"})
+
+######################################
+# Throw out all channels except DAPI #
+######################################
+print(datetime.now() - t0, "Throw out all channels except DAPI", flush=True)
+
+# TODO: in the future we want to keep the PolyT and Cellbound1/2/3 stains. Note, however, that saving or
+# plotting the sdata somehow fails when these channels aren't excluded; the cause is unclear.
+sdata["morphology_mip"] = sdata["morphology_mip"].sel(c=["DAPI"])
+
+#################################
+# Get cell labels from polygons #
+#################################
+print(datetime.now() - t0, "Get cell labels from polygons", flush=True)
+
+img_extent = sd.get_extent(sdata["morphology_mip"])
+# TODO: the rasterize function currently has a bug, but the resulting error is small at the given spatial resolution.
+# Check https://github.com/scverse/spatialdata/issues/165 for updates on this bug.
+sdata["cell_labels"] = sd.rasterize(
+    sdata["cell_polygons"],
+    ["x", "y"],
+    min_coordinate=[int(img_extent["x"][0]), int(img_extent["y"][0])],
+    max_coordinate=[int(img_extent["x"][1]), int(img_extent["y"][1])],
+    target_coordinate_system="global",
+    target_unit_to_pixels=1,
+    return_regions_as_labels=True,
+)
+del sdata["cell_labels"].attrs["label_index_to_category"]
+
+##############################
+# Add info to metadata table #
+##############################
+print(datetime.now() - t0, "Add info to metadata table", flush=True)
+
+sdata["metadata"].uns["dataset_id"] = par["id"]
+sdata["metadata"].uns["dataset_name"] = par["dataset_name"]
+sdata["metadata"].uns["dataset_url"] = par["dataset_url"]
+sdata["metadata"].uns["dataset_reference"] = par["dataset_reference"]
+sdata["metadata"].uns["dataset_summary"] = par["dataset_summary"]
+sdata["metadata"].uns["dataset_description"] = par["dataset_description"]
+sdata["metadata"].uns["dataset_organism"] = par["dataset_organism"]
+sdata["metadata"].uns["segmentation_id"] = par["segmentation_id"]
+
+#########
+# Write #
+#########
+print(datetime.now() - t0, f"Writing to {par['output']}", flush=True)
+
+sdata.write(par["output"])
+
+print(datetime.now() - t0, "Done", flush=True)
\ No newline at end of file