Skip to content

Commit

Permalink
fix data loader
Browse files (browse the repository at this point in the history)
Co-authored-by: LouisK92 <[email protected]>
  • Branch information:
rcannood and LouisK92 committed Sep 18, 2024
1 parent e12acb8 commit b00775b
Show file tree
Hide file tree
Showing 8 changed files with 142 additions and 252 deletions.
93 changes: 28 additions & 65 deletions scripts/test_resources/2023_10x_mouse_brain_xenium.sh
Original file line number Diff line number Diff line change
Expand Up @@ -8,73 +8,36 @@ cd "$REPO_ROOT"

set -e

DATASET_ID="2023_10x_mouse_brain_xenium"
TMP_DIR="temp/datasets/$DATASET_ID"
OUT_DIR="resources_test/common/2023_10x_mouse_brain_xenium"

# https://cf.10xgenomics.com/samples/xenium/1.0.2/Xenium_V1_FF_Mouse_Brain_MultiSection_1/Xenium_V1_FF_Mouse_Brain_MultiSection_1_outs.zip
# https://cf.10xgenomics.com/samples/xenium/1.0.2/Xenium_V1_FF_Mouse_Brain_MultiSection_2/Xenium_V1_FF_Mouse_Brain_MultiSection_2_outs.zip
# https://cf.10xgenomics.com/samples/xenium/1.0.2/Xenium_V1_FF_Mouse_Brain_MultiSection_3/Xenium_V1_FF_Mouse_Brain_MultiSection_3_outs.zip


# Expected extraction directories for the three Xenium replicate archives.
rep1="$TMP_DIR/Xenium_V1_FF_Mouse_Brain_MultiSection_1_outs"
rep2="$TMP_DIR/Xenium_V1_FF_Mouse_Brain_MultiSection_2_outs"
rep3="$TMP_DIR/Xenium_V1_FF_Mouse_Brain_MultiSection_3_outs"

# Download and extract replicate 1 only when its output folder is absent.
# NOTE(review): unzip targets ..._MultiSection_1 but the guard checks
# ..._MultiSection_1_outs — confirm the archive contains an *_outs subfolder.
if [ ! -d "$rep1" ]; then
wget https://cf.10xgenomics.com/samples/xenium/1.0.2/Xenium_V1_FF_Mouse_Brain_MultiSection_1/Xenium_V1_FF_Mouse_Brain_MultiSection_1_outs.zip \
-O $TMP_DIR/Xenium_V1_FF_Mouse_Brain_MultiSection_1_outs.zip
unzip $TMP_DIR/Xenium_V1_FF_Mouse_Brain_MultiSection_1_outs.zip -d $TMP_DIR/Xenium_V1_FF_Mouse_Brain_MultiSection_1
fi

# Same download-if-missing pattern for replicate 2.
if [ ! -d "$rep2" ]; then
wget https://cf.10xgenomics.com/samples/xenium/1.0.2/Xenium_V1_FF_Mouse_Brain_MultiSection_2/Xenium_V1_FF_Mouse_Brain_MultiSection_2_outs.zip \
-O $TMP_DIR/Xenium_V1_FF_Mouse_Brain_MultiSection_2_outs.zip
unzip $TMP_DIR/Xenium_V1_FF_Mouse_Brain_MultiSection_2_outs.zip -d $TMP_DIR/Xenium_V1_FF_Mouse_Brain_MultiSection_2
fi

# Same download-if-missing pattern for replicate 3.
if [ ! -d "$rep3" ]; then
wget https://cf.10xgenomics.com/samples/xenium/1.0.2/Xenium_V1_FF_Mouse_Brain_MultiSection_3/Xenium_V1_FF_Mouse_Brain_MultiSection_3_outs.zip \
-O $TMP_DIR/Xenium_V1_FF_Mouse_Brain_MultiSection_3_outs.zip
unzip $TMP_DIR/Xenium_V1_FF_Mouse_Brain_MultiSection_3_outs.zip -d $TMP_DIR/Xenium_V1_FF_Mouse_Brain_MultiSection_3
fi
# Write the workflow parameter file. The heredoc delimiter is unquoted, so
# the shell expands the body; `\$id` keeps a literal `$id` in the YAML for the
# downstream tool to resolve per dataset.
cat > /tmp/params.yaml << HERE
param_list:
- id: 2023_10x_mouse_brain_xenium_rep1
input: https://cf.10xgenomics.com/samples/xenium/1.0.2/Xenium_V1_FF_Mouse_Brain_MultiSection_1/Xenium_V1_FF_Mouse_Brain_MultiSection_1_outs.zip
replicate_id: rep1
segmentation_id:
- cell
- nucleus
dataset_id: 2023_10x_mouse_brain_xenium_rep1
dataset_name: Xenium V1 Fresh Frozen Mouse Brain rep1
dataset_url: https://www.10xgenomics.com/datasets/fresh-frozen-mouse-brain-replicates-1-standard
dataset_summary: Demonstration of gene expression profiling for fresh frozen mouse brain on the Xenium platform.
dataset_description: Demonstration of gene expression profiling for fresh frozen mouse brain on the Xenium platform using the pre-designed Mouse Brain Gene Expression Panel (v1).
dataset_organism: mus_musculus
crop_region_min_x: 10000
crop_region_max_x: 12000
crop_region_min_y: 10000
crop_region_max_y: 12000
publish_dir: resources_test/common
output: '\$id/dataset.h5ad'
output_state: '\$id/state.yaml'
HERE

# convert to zarr and concatenate
# Load all three replicates in a single call; the repeated --input and
# --replicate_id flags are positionally paired by the loader.
viash run src/data_loaders/download_10x_xenium/config.vsh.yaml -- \
--input "$rep1" \
--input "$rep2" \
--input "$rep3" \
--replicate_id rep1 \
--replicate_id rep2 \
--replicate_id rep3 \
--output $TMP_DIR/full_dataset.zarr \
--dataset_id "$DATASET_ID" \
--dataset_name "Xenium V1 Fresh Frozen Mouse Brain" \
--dataset_url "https://www.10xgenomics.com/datasets/fresh-frozen-mouse-brain-replicates-1-standard" \
--dataset_summary "Demonstration of gene expression profiling for fresh frozen mouse brain on the Xenium platform using the pre-designed Mouse Brain Gene Expression Panel (v1)." \
--dataset_description "Demonstration of gene expression profiling for fresh frozen mouse brain on the Xenium platform using the pre-designed Mouse Brain Gene Expression Panel (v1). Replicate results demonstrate the high reproducibility of data generated by the platform. 10x Genomics obtained tissue from a C57BL/6 mouse from Charles River Laboratories. Three adjacent 10µm sections were placed on the same slide. Tissues were prepared following the demonstrated protocols Xenium In Situ for Fresh Frozen Tissues - Tissue Preparation Guide (CG000579) and Xenium In Situ for Fresh Frozen Tissues - Fixation & Permeabilization (CG000581)." \
--dataset_organism "mus_musculus" \
--segmentation_id "cell;nucleus"

# crop the region
# Cut an identical 2000x2000 window (x,y in 10000..12000) out of each of the
# three replicates to produce a small test resource.
viash run src/data_processors/crop_region/config.vsh.yaml -- \
--input "$TMP_DIR/full_dataset.zarr" \
--output "$OUT_DIR/dataset.zarr" \
--replicate_id "rep1" \
--min_x 10000 \
--max_x 12000 \
--min_y 10000 \
--max_y 12000 \
--replicate_id rep2 \
--min_x 10000 \
--max_x 12000 \
--min_y 10000 \
--max_y 12000 \
--replicate_id rep3 \
--min_x 10000 \
--max_x 12000 \
--min_y 10000 \
--max_y 12000
# Run the Xenium processing workflow with the params file written above;
# -resume reuses cached task results from a previous run.
nextflow run . \
-main-script target/nextflow/datasets/workflows/process_tenx_xenium/main.nf \
-profile docker \
-resume \
-c common/nextflow_helpers/labels_ci.config \
-params-file /tmp/params.yaml

aws s3 sync --profile op \
"resources_test/common/2023_10x_mouse_brain_xenium" \
Expand Down
87 changes: 0 additions & 87 deletions src/data_processors/crop_region/script.py

This file was deleted.

14 changes: 6 additions & 8 deletions src/datasets/loaders/tenx_xenium/config.vsh.yaml
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
name: load_tenx_xenium
name: tenx_xenium
namespace: datasets/loaders

argument_groups:
Expand All @@ -7,13 +7,11 @@ argument_groups:
- type: file
name: --input
required: true
description: A 10x xenium directory
multiple: true
description: A 10x xenium directory or zip file
- type: string
name: --replicate_id
required: true
description: The replicate identifier
multiple: true
- type: string
name: --segmentation_id
required: true
Expand Down Expand Up @@ -51,10 +49,10 @@ argument_groups:
required: false
- name: Outputs
arguments:
- name: "--output"
__merge__: /src/api/file_common_spatialdata.yaml
direction: output
required: true
- name: "--output"
__merge__: /src/api/file_common_spatialdata.yaml
direction: output
required: true

resources:
- type: python_script
Expand Down
118 changes: 39 additions & 79 deletions src/datasets/loaders/tenx_xenium/script.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,16 +8,8 @@

## VIASH START
par = {
"input": [
"temp/datasets/10x_xenium/2023_10x_mouse_brain_xenium/Xenium_V1_FF_Mouse_Brain_MultiSection_1_outs",
"temp/datasets/10x_xenium/2023_10x_mouse_brain_xenium/Xenium_V1_FF_Mouse_Brain_MultiSection_2_outs",
"temp/datasets/10x_xenium/2023_10x_mouse_brain_xenium/Xenium_V1_FF_Mouse_Brain_MultiSection_3_outs",
],
"replicate_id": [
"rep1",
"rep2",
"rep3",
],
"input": "temp/datasets/10x_xenium/2023_10x_mouse_brain_xenium/Xenium_V1_FF_Mouse_Brain_MultiSection_1_outs",
"replicate_id": "rep1",
"segmentation_id": [
"cell",
"nucleus",
Expand All @@ -30,79 +22,47 @@
"dataset_summary": "value",
"dataset_description": "value",
"dataset_organism": "value",
"output": "temp/datasets/10x_xenium/2023_10x_mouse_brain_xenium/Xenium_V1_FF_Mouse_Brain_MultiSection_1_outs.zarr"
}
## VIASH END

# Inputs and replicate ids are positionally paired; enforce equal lengths.
assert len(par["input"]) == len(par["replicate_id"]), "Length of 'input' and 'replicate_id' must be the same."

# One SpatialData object per replicate, concatenated at the end.
sdatas = []

# NOTE(review): loop variable `input` shadows the `input` builtin.
# NOTE(review): indentation was stripped by the diff view; the statements
# below through `sdatas.append(sdata)` belong to this loop body — confirm
# against the original file.
for i, input in enumerate(par["input"]):
replicate_id = par["replicate_id"][i]
print(f"Processing replicate '{replicate_id}'", flush=True)

# if input is a zip, extract it to a temporary folder
with tempfile.TemporaryDirectory() as tmpdirname:
if zipfile.is_zipfile(input):
print("Extracting input zip", flush=True)
with zipfile.ZipFile(input, "r") as zip_ref:
zip_ref.extractall(tmpdirname)
input = tmpdirname

# read the data
sdata = xenium(
path=input,
n_jobs=8,
cells_boundaries=True,
nucleus_boundaries=True,
morphology_focus=True,
cells_as_circles=False,
)

# rename coordinate system
# Prefix every element with the replicate id so names stay unique after
# concatenation.
sdata.rename_coordinate_systems({"global": replicate_id + "_global"})

# rename images
sdata.images[replicate_id + "_image"] = sdata.images.pop("morphology_mip")

# remove morphology_focus
_ = sdata.images.pop("morphology_focus")

# rename labels
sdata.labels[replicate_id + "_cell"] = sdata.labels.pop("cell_labels")
sdata.labels[replicate_id + "_nucleus"] = sdata.labels.pop("nucleus_labels")

# rename points
sdata.points[replicate_id + "_transcripts"] = sdata.points.pop("transcripts")

# rename shapes
sdata.shapes[replicate_id + "_cell_boundaries"] = sdata.shapes.pop("cell_boundaries")
sdata.shapes[replicate_id + "_nucleus_boundaries"] = sdata.shapes.pop("nucleus_boundaries")

# rename tables
sdata.tables[replicate_id + "_cell_table"] = sdata.tables.pop("table")

sdatas.append(sdata)

print("Concatenate sdatas", flush=True)
sdata = sd.concatenate(sdatas)

print("Add metadata table", flush=True)
# Dataset-level metadata is stored in the `uns` of an otherwise empty
# AnnData attached as the "metadata" table.
sdata.tables["metadata"] = ad.AnnData(
uns={
"dataset_id": par["dataset_id"],
"dataset_name": par["dataset_name"],
"dataset_url": par["dataset_url"],
"dataset_reference": par["dataset_reference"],
"dataset_summary": par["dataset_summary"],
"dataset_description": par["dataset_description"],
"dataset_organism": par["dataset_organism"],
"variables": {
"replicate_id": par["replicate_id"],
"segmentation_id": par["segmentation_id"],
}
}
)
# if input is a zip, extract it to a temporary folder
# Single-replicate variant: `par["input"]` is one path, not a list.
par_input = par["input"]
with tempfile.TemporaryDirectory() as tmpdirname:
if zipfile.is_zipfile(par_input):
print("Extracting input zip", flush=True)
with zipfile.ZipFile(par_input, "r") as zip_ref:
zip_ref.extractall(tmpdirname)
par_input = tmpdirname

# read the data
# NOTE(review): the diff view stripped indentation — the xenium() read must
# occur inside the TemporaryDirectory context above, otherwise the extracted
# folder is deleted before it is read; confirm against the original file.
sdata = xenium(
path=par_input,
n_jobs=8,
cells_boundaries=True,
nucleus_boundaries=True,
morphology_focus=True,
cells_as_circles=False,
)

# remove morphology_focus
_ = sdata.images.pop("morphology_focus")

print("Add uns to table", flush=True)
# Copy dataset-level metadata into the existing "table" AnnData's uns
# (the old code instead created a separate "metadata" table).
new_uns = {
"dataset_id": par["dataset_id"],
"dataset_name": par["dataset_name"],
"dataset_url": par["dataset_url"],
"dataset_reference": par["dataset_reference"],
"dataset_summary": par["dataset_summary"],
"dataset_description": par["dataset_description"],
"dataset_organism": par["dataset_organism"],
"replicate_id": par["replicate_id"],
"segmentation_id": par["segmentation_id"],
}
for key, value in new_uns.items():
sdata.tables["table"].uns[key] = value

print(f"Output: {sdata}", flush=True)

Expand Down
Loading

0 comments on commit b00775b

Please sign in to comment.