Add dataloader for vizgen's merscope datasets
LouisK92 committed Sep 24, 2024
1 parent ffd4e8f commit e9842e5
Showing 3 changed files with 366 additions and 0 deletions.
77 changes: 77 additions & 0 deletions src/datasets/loaders/vizgen_merscope/config.vsh.yaml
@@ -0,0 +1,77 @@
name: vizgen_merscope
namespace: datasets/loaders

argument_groups:
  - name: Inputs
    arguments:
      - type: string
        name: --gcloud_bucket
        default: "vz-ffpe-showcase"
        description: "Google cloud bucket name"
      - type: string
        name: --dataset_bucket_name
        default: "HumanBreastCancerPatient1"
        description: "The name of the dataset base directory in the vizgen google cloud bucket"
      - type: string
        name: --segmentation_id
        default: ["cell"]
        description: The segmentation identifier
        multiple: true
  - name: Metadata
    arguments:
      - type: string
        name: --id
        description: "A unique identifier for the dataset"
        required: true
      - name: --dataset_name
        type: string
        description: Nicely formatted name.
        required: true
      - type: string
        name: --dataset_url
        description: Link to the original source of the dataset.
        required: false
      - name: --dataset_reference
        type: string
        description: Bibtex reference of the paper in which the dataset was published.
        required: false
      - name: --dataset_summary
        type: string
        description: Short description of the dataset.
        required: true
      - name: --dataset_description
        type: string
        description: Long description of the dataset.
        required: true
      - name: --dataset_organism
        type: string
        description: The organism of the sample in the dataset.
        required: false
  - name: Outputs
    arguments:
      - name: "--output"
        __merge__: /src/api/file_common_ist.yaml
        direction: output
        required: true

resources:
  - type: python_script
    path: script.py
  - path: download.sh

engines:
  - type: docker
    image: openproblems/base_python:1.0.0
    __merge__:
      - /src/base/setup_spatialdata_partial.yaml
    setup:
      - type: python
        pypi:
          - spatialdata-io
  - type: native

runners:
  - type: executable
  - type: nextflow
    directives:
      label: [highmem, midcpu, midtime]
91 changes: 91 additions & 0 deletions src/datasets/loaders/vizgen_merscope/download.sh
@@ -0,0 +1,91 @@
#!/bin/bash

######################################################################
# Requirements:
# - Install google-cloud-sdk, e.g. mamba install -c conda-forge google-cloud-sdk
# - Authenticate with your Google account: gcloud auth login --no-browser
#
# Usage: download.sh <BUCKET_NAME> <DATASET> <OUT_DIR> <DRY_RUN=true/false>
# - DATASET refers to the naming in the Vizgen download page
#
# Vizgen cancer panel datasets download page: https://console.cloud.google.com/storage/browser/vz-ffpe-showcase
# The script should also work for other Vizgen style data buckets.
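#
# Example invocation (a sketch using this component's default bucket and dataset; adjust the output directory as needed):
#   bash download.sh vz-ffpe-showcase HumanBreastCancerPatient1 ./raw_merscope_data true    # dry run, only lists files
#   bash download.sh vz-ffpe-showcase HumanBreastCancerPatient1 ./raw_merscope_data false   # actual download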

BUCKET_NAME=$1
DATASET=$2
OUT_DIR=$3
DRY_RUN=$4

######################################################################

# Suppress BrokenPipeError messages from Python
export PYTHONWARNINGS="ignore:BrokenPipeError"

LIST_LIMIT=10 # Maximum number of files to list in dry-run mode

# List of files or directories to download
FILES=(
"cell_by_gene.csv"
"cell_metadata.csv"
"detected_transcripts.csv"
"cell_boundaries/" # This is a directory
"cell_boundaries.parquet"
"images/mosaic_DAPI_z3.tif" # If you want all images, change this to "images/"
)
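
# With the FILES list above, the layout under OUT_DIR mirrors the bucket layout, e.g.:
#   OUT_DIR/cell_by_gene.csv
#   OUT_DIR/cell_metadata.csv
#   OUT_DIR/detected_transcripts.csv
#   OUT_DIR/cell_boundaries/           # only present for older runs that ship per-FOV hdf5 boundaries
#   OUT_DIR/cell_boundaries.parquet    # only present for newer runs
#   OUT_DIR/images/mosaic_DAPI_z3.tif
# Files missing in the bucket are skipped (see the existence check in the loop below).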

# Check if gsutil is installed
if ! command -v gsutil &> /dev/null; then
    echo "ERROR: gsutil is not installed. Please install the Google Cloud SDK."
    exit 1
fi

# Check if user is authenticated
if ! gcloud auth list 2>&1 | grep -q ACTIVE; then
    echo "ERROR: No authenticated Google Cloud account found. Please authenticate using 'gcloud auth login'."
    exit 1
fi


# Loop over each file or directory to download
for FILE in "${FILES[@]}"; do
    # Get the directory from the file path (remove the file name)
    DIR=$(dirname "$FILE")

    # Create the full target directory path
    TARGET_DIR="$OUT_DIR/$DIR"

    # If DIR is '.', set TARGET_DIR to OUT_DIR
    if [ "$DIR" = "." ]; then
        TARGET_DIR="$OUT_DIR"
    fi

    # Create the directory if it doesn't exist
    mkdir -p "$TARGET_DIR"

    # Check if the file or directory exists in the bucket
    if gsutil ls -d "gs://$BUCKET_NAME/$DATASET/$FILE" > /dev/null 2>&1; then
        if [ "$DRY_RUN" = true ]; then
            # Dry-run: list files instead of downloading
            echo -e "\nSimulating download for gs://$BUCKET_NAME/$DATASET/$FILE"
            echo -e "Download to $TARGET_DIR"

            if [[ "$FILE" == */ ]]; then
                echo "Listing first $LIST_LIMIT files from gs://$BUCKET_NAME/$DATASET/$FILE"
                gsutil ls "gs://$BUCKET_NAME/$DATASET/$FILE" 2>/dev/null | head -n $LIST_LIMIT
            else
                gsutil ls "gs://$BUCKET_NAME/$DATASET/$FILE"
            fi
        else
            # If it's a directory, download recursively
            if [[ "$FILE" == */ ]]; then
                echo "Downloading directory gs://$BUCKET_NAME/$DATASET/$FILE to $TARGET_DIR/"
                gsutil -m cp -r "gs://$BUCKET_NAME/$DATASET/$FILE" "$TARGET_DIR/"
            else
                # Download the file, maintaining directory structure under OUT_DIR
                echo "Downloading file gs://$BUCKET_NAME/$DATASET/$FILE to $TARGET_DIR/"
                gsutil cp "gs://$BUCKET_NAME/$DATASET/$FILE" "$TARGET_DIR/"
            fi
        fi
    else
        # Skip download if file or directory does not exist
        echo "Skipping $FILE: does not exist in gs://$BUCKET_NAME/$DATASET/"
    fi
done
198 changes: 198 additions & 0 deletions src/datasets/loaders/vizgen_merscope/script.py
@@ -0,0 +1,198 @@
# https://vizgen.com/data-release-program/

import os
import stat
from pathlib import Path
import subprocess
from datetime import datetime
import spatialdata as sd
import spatialdata_io as sdio

## VIASH START
par = {
"gcloud_bucket": "vz-ffpe-showcase",
"dataset_bucket_name": "HumanBreastCancerPatient1",
"segmentation_id": ["cell"],
"output": "output.zarr",
"id": "vizgen_merscope/2022_vizgen_human_breast_cancer_merfish/rep1",
#"dataset_id": "2022Vizgen_human_breast_cancer_Patient1",
"dataset_name": "value",
"dataset_url": "https://vizgen.com/data-release-program/",
"dataset_reference": "value",
"dataset_summary": "value",
"dataset_description": "value",
"dataset_organism": "human",
}


## VIASH END

assert ("cell" in par["segmentation_id"]) and (len(par["segmentation_id"]) == 1), "Currently only cell labels are assigned in this script, and MERSCOPE does not provide other segmentations."


t0 = datetime.now()

# Download raw data
print(datetime.now() - t0, "Download raw data", flush=True)
download_script = f"{meta['resources_dir']}/download.sh"  # https://viash.io/guide/component/variables.html#meta-variables-in-meta
BUCKET_NAME = par['gcloud_bucket']
DATASET = par["dataset_bucket_name"]
OUT_DIR = meta["temp_dir"]
DRY_RUN = "false"

# Change permissions of the download script to make it executable
os.chmod(download_script, stat.S_IRUSR | stat.S_IWUSR | stat.S_IXUSR) # Grant read, write, and execute permissions to the user

# Run the download script
out = subprocess.run([download_script, BUCKET_NAME, DATASET, OUT_DIR, DRY_RUN], check=True, capture_output=True, text=True)
print(out.stdout, flush=True)

# If the cell polygons are in the old format (cell_boundaries/*.hdf5 instead of cell_boundaries.parquet), the raw data
# needs to be modified for the spatialdata-io loader
# (see: https://github.com/scverse/spatialdata-io/issues/71#issuecomment-1741995582)
import h5py
import pandas as pd
import geopandas as gpd
from shapely.geometry import Polygon
from shapely import MultiPolygon

def read_boundary_hdf5(folder):
    print("Convert boundary hdf5 to parquet", flush=True)
    all_boundaries = {}
    boundaries = None
    hdf5_files = os.listdir(folder + '/cell_boundaries/')
    n_files = len(hdf5_files)
    incr = max(1, n_files // 15)  # avoid modulo-by-zero for very small datasets
    for file_idx, fname in enumerate(hdf5_files):
        if (file_idx % incr) == 0:
            print(f"\tProcessed {file_idx}/{n_files}", flush=True)
        with h5py.File(folder + '/cell_boundaries/' + fname, "r") as f:
            for key in f['featuredata'].keys():
                # It doesn't matter which zIndex we use; wrap in a MultiPolygon to work with the read function in spatialdata-io
                polygon = MultiPolygon([Polygon(f['featuredata'][key]['zIndex_3']['p_0']['coordinates'][()][0])])
                if boundaries is not None:
                    boundaries.loc[key] = polygon
                else:
                    boundaries = gpd.GeoDataFrame(geometry=gpd.GeoSeries(polygon, index=[key]))
        all_boundaries[fname] = boundaries
        boundaries = None
    all_concat = pd.concat(list(all_boundaries.values()))
    all_concat = all_concat[~all_concat.index.duplicated(keep='first')]  # hdf5 can contain duplicates with the same cell_id and position, remove those
    all_concat.rename_geometry('geometry_renamed', inplace=True)  # renaming to make it compatible with spatialdata-io
    all_concat["EntityID"] = all_concat.index  # renaming to make it compatible with spatialdata-io
    all_concat['ZIndex'] = 0  # adding to make it compatible with spatialdata-io
    all_concat.to_parquet(folder + '/cell_boundaries.parquet')

    count_path = f"{folder}/cell_by_gene.csv"
    obs_path = f"{folder}/cell_metadata.csv"

    data = pd.read_csv(count_path, index_col=0)
    obs = pd.read_csv(obs_path, index_col=0)

    data.index = obs.index.astype(str)  # data index in old software is range(n_obs)
    data.index.name = "cell"  # renaming to make it compatible with spatialdata-io
    obs.index.name = 'EntityID'  # renaming to make it compatible with spatialdata-io
    data.to_csv(count_path)
    obs.to_csv(obs_path)


if not (Path(OUT_DIR) / "cell_boundaries.parquet").exists():
    read_boundary_hdf5(str(OUT_DIR))


# Generate spatialdata.zarr
RAW_DATA_DIR = Path(OUT_DIR)

#########################################
# Convert raw files to spatialdata zarr #
#########################################
print(datetime.now() - t0, "Convert raw files to spatialdata zarr", flush=True)

slide_name = "slide"
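# Note: sdio.merscope (below) is expected to name the returned elements with the prefix
# f"{slide_name}_{RAW_DATA_DIR.name}" (e.g. "<prefix>_z3" for the image); they are renamed
# to standard keys in the "Rename keys" section further down.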

sdata = sdio.merscope(
    RAW_DATA_DIR,
    vpt_outputs=None,
    z_layers=3,
    region_name=None,
    slide_name=slide_name,
    backend=None,
    transcripts=True,
    cells_boundaries=True,
    cells_table=True,
    mosaic_images=True,
    # imread_kwargs=mappingproxy({}),
    # image_models_kwargs=mappingproxy({})
)

###############
# Rename keys #
###############
print(datetime.now() - t0, "Rename keys", flush=True)

name = slide_name + "_" + RAW_DATA_DIR.name

elements_renaming_map = {
    f"{name}_z3": "morphology_mip",  # TODO: this is not actually a maximum intensity projection; either rename the label later or actually project over z. We also want 3D at some point anyway.
    f"{name}_transcripts": "transcripts",
    f"{name}_polygons": "cell_polygons",
    "table": "metadata",
}

for old_key, new_key in elements_renaming_map.items():
    sdata[new_key] = sdata[old_key]
    del sdata[old_key]

# Rename transcript column
sdata['transcripts'] = sdata['transcripts'].rename(columns={"global_z":"z"})

#################################
# Drop all channels except DAPI #
#################################
print(datetime.now() - t0, "Drop all channels except DAPI", flush=True)

# TODO: in the future we want to keep the PolyT and Cellbound1/2/3 stains. Note, however, that saving or plotting the
# sdata fails unless these channels are excluded; the cause is unclear.
sdata["morphology_mip"] = sdata["morphology_mip"].sel(c=["DAPI"])

#################################
# Get cell labels from polygons #
#################################
print(datetime.now() - t0, "Get cell labels from polygons", flush=True)

img_extent = sd.get_extent(sdata['morphology_mip'])
# TODO: Note that the rasterize function currently has a bug; the resulting error is small at the given spatial resolution, though.
# Check https://github.com/scverse/spatialdata/issues/165 for updates on this bug.
sdata["cell_labels"] = sd.rasterize(
sdata["cell_polygons"],
["x", "y"],
min_coordinate=[int(img_extent["x"][0]), int(img_extent["y"][0])],
max_coordinate=[int(img_extent["x"][1]), int(img_extent["y"][1])],
target_coordinate_system="global",
target_unit_to_pixels=1,
return_regions_as_labels=True,
)
del sdata["cell_labels"].attrs['label_index_to_category']

##############################
# Add info to metadata table #
##############################
print(datetime.now() - t0, "Add info to metadata table", flush=True)

#TODO: values as input variables
sdata["metadata"].uns["dataset_id"] = par["id"]
sdata["metadata"].uns["dataset_name"] = par["dataset_name"]
sdata["metadata"].uns["dataset_url"] = par["dataset_url"]
sdata["metadata"].uns["dataset_reference"] = par["dataset_reference"]
sdata["metadata"].uns["dataset_summary"] = par["dataset_summary"]
sdata["metadata"].uns["dataset_description"] = par["dataset_description"]
sdata["metadata"].uns["dataset_organism"] = par["dataset_organism"]
sdata["metadata"].uns["segmentation_id"] = par["segmentation_id"]

#########
# Write #
#########
print(datetime.now() - t0, f"Writing to {par['output']}", flush=True)

sdata.write(par["output"])

print(datetime.now() - t0, "Done", flush=True)
