# Image that packages rename_files.sh, an in-house script that extracts key
# result files from a Cell Ranger 'outs.zip' archive and renames them with a
# sample "linker" prefix.
FROM joshkeegan/zip:3.19.1

LABEL maintainer="David Williams <dnwilliams@rti.org>"
LABEL description="This Docker image contains an in-house script written to extract and rename files coming from the 'outs.zip' file from Cellranger"

# rename_files.sh has a '#!/bin/bash' shebang, so bash is the only extra
# package required. --no-cache refreshes the package index and discards it in
# the same layer, so no separate 'apk update' layer (which can go stale) is
# needed. Editors such as vim are deliberately not installed in this tool image.
RUN apk add --no-cache bash

# Install the script directly at an absolute destination so the result does not
# depend on the base image's working directory, and make it executable for any
# user the container may run as (not only the file owner).
COPY rename_files.sh /opt/rename_files.sh
RUN chmod 755 /opt/rename_files.sh

# Working directory for inputs and outputs; WORKDIR creates it if missing.
# Input ZIP files are supplied at run time by bind-mounting a host directory
# here (docker run -v "$PWD":/data ...). They are intentionally not copied in at
# build time: 'COPY ./*.zip /data' fails the build when the context contains no
# ZIP file (or more than one) and would bake large data files into the image.
WORKDIR /data

# Make rename_files.sh callable by name.
ENV PATH="${PATH}:/opt"
The input should be one or more ZIP files coming from a Cell Ranger run, and the output is a directory containing the renamed files and a renamed copy of the input 'outs.zip' file. The files extracted from 'outs.zip', with their paths relative to the archive root, are listed in the table below: + +| Directory | Filename | Description | Link | +| -- | -- | -- | -- | +| ./ | web_summary.html | Interactive summary HTML file that contains summary metrics and automated secondary analysis results. | https://www.10xgenomics.com/support/software/cell-ranger/analysis/outputs/cr-outputs-web-summary-count | +| ./ | metrics_summary.csv | The metrics_summary.csv is organized with each column specifying one metric name. The metric values are specified in a single row. | https://www.10xgenomics.com/support/software/cell-ranger/analysis/outputs/cr-3p-outputs-metrics-count | +| ./ | raw_feature_bc_matrix.h5 | Raw feature-barcode matrices describing the number of UMIs associated with a feature and a barcode. | https://www.10xgenomics.com/support/software/cell-ranger/analysis/outputs/cr-outputs-h5-matrices | +| ./ | possorted_genome_bam.bam | BAM file containing position-sorted reads aligned to the genome and transcriptome. | https://www.10xgenomics.com/support/software/cell-ranger/analysis/outputs/cr-outputs-bam | +| ./ | possorted_genome_bam.bam.bai | Index file associated with the BAM file. | https://www.10xgenomics.com/support/software/cell-ranger/analysis/outputs/cr-outputs-bam | +| ./ | filtered_feature_bc_matrix.h5 | Filtered feature-barcode matrices describing the number of UMIs associated with a feature and a barcode. | https://www.10xgenomics.com/support/software/cell-ranger/analysis/outputs/cr-outputs-h5-matrices | + +Each of these files is copied to an output directory, and its name is prefixed with a "linker" identifying the sample. Additionally, the 'outs.zip' file is copied to this directory and its name is prefixed with the same linker. 
The example linker `RMIP_001_002_A_001_A` can be separated into components delimited by `_`. Details on the linker and its format are below. + +| Name | Component | Description | +| -- | -- | -- | +| RMIP identifier | `RMIP` | Goes at the beginning of each linker | +| Project Identifier | `001` | Numeric only | +| Participant ID | `002` | Numeric only | +| Discriminator | `A` | Alphabetic only - combination with "Identifier" uniquely identifies every collection event | +| Identifier | `001` | Numeric only - combination with "Discriminator" uniquely identifies every collection event | +| Vial Identifier | `A` | Alphabetic only - identifies specific collection aliquot - optional if only one vial | + +## Sample usage + +This script is used to extract results from the large output ZIP file from Cell Ranger, called 'outs.zip'. + +Build +``` +docker build -t cellranger_extract_rename:v1 . +``` + +Run (from the directory that contains the ZIP file, which is mounted at /data) +``` +docker run -it -v "$PWD":/data cellranger_extract_rename:v1 rename_files.sh -z outs.zip -l RMIP_001_001_A_001_A +``` + +Usage info: +``` +Usage: ./rename_files.sh [OPTIONS] +Options: + -h, --help Display this help message + -v, --verbose Enable verbose mode + -l, --linker STRING Specify name of linker to prepend to extracted files (format 'RMIP_<project>_<participant>_<discriminator>_<identifier>_<vial>') - Required + e.g. linker='RMIP_001_001_A_001_A' + Note that the Vial Identifier (last letter) is optional + -z, --zip_file STRING/PATH Specify name and path of ZIP file to read, decompress, and rename - Required + -o, --output_dir STRING/PATH Specify directory where to put extracted files. Default = './<linker>_<ZIP file name without extension>' 
#!/bin/bash
#
# rename_files.sh
#
# Extracts a fixed set of result files from a Cell Ranger 'outs.zip' archive,
# prefixes each file name with a sample "linker", and writes them, together
# with a linker-prefixed copy of the archive itself, to an output directory.
#
# Exit status: 0 on success; 1 on invalid arguments, an invalid linker, or when
# any extraction, rename, or copy step fails.

# Option state. Initialised explicitly so values are never picked up by accident
# from identically named environment variables.
verbose_mode=false
linker=""
zip_file=""
output_dir=""

# Print the command-line help text to stdout.
usage() {
    echo "Usage: $0 [OPTIONS]"
    echo "Options:"
    echo " -h, --help                     Display this help message"
    echo " -v, --verbose                  Enable verbose mode"
    echo " -l, --linker STRING            Specify name of linker to prepend to extracted files (format 'RMIP_<project>_<participant>_<discriminator>_<identifier>_<vial>') - Required"
    echo "                                e.g. linker='RMIP_001_001_A_001_A'"
    echo "                                Note that the Vial Identifier (last letter) is optional"
    echo " -z, --zip_file STRING/PATH     Specify name and path of ZIP file to read, decompress, and rename - Required"
    echo " -o, --output_dir STRING/PATH   Specify directory where to put extracted files. Default = './<linker>_<ZIP file name without extension>'"
    echo ""
    echo "Example usage"
    echo " Required flags: ./rename_files.sh -z input_zip.zip -l RMIP_001_001_A_001_A"
    echo " Verbose mode: ./rename_files.sh -v -z input_zip.zip -l RMIP_001_001_A_246"
    echo " Writing to output directory: ./rename_files.sh -z input_zip.zip -l RMIP_001_001_A_246 -o test_output"
}

# Succeeds when option word $1 carries its value inline, i.e. '--opt=value'
# with a non-empty value.
has_inline_value() {
    [[ "$1" == *=* && -n "${1#*=}" ]]
}

# Succeeds when the option in $1 has a value: either inline ('--opt=value') or
# as the following word $2, provided that word does not look like an option.
has_argument() {
    has_inline_value "$1" || [[ -n "$2" && "$2" != -* ]]
}

# Print the value of the option in $1. The inline '--opt=value' form takes
# precedence; otherwise the following word $2 is the value. (Previously $2 won
# whenever it was non-empty, so '--linker=X -z f.zip' yielded '-z'.)
extract_argument() {
    if has_inline_value "$1"; then
        printf '%s\n' "${1#*=}"
    else
        printf '%s\n' "$2"
    fi
}

# Echo $1 only when verbose mode is enabled.
echo_verbose() {
    if [ "$verbose_mode" == true ]; then echo "$1"; fi
}

# Parse command-line options into verbose_mode, linker, zip_file, output_dir.
# Exits 0 after printing help, or 1 on an unknown option / missing value.
handle_options() {
    while [ $# -gt 0 ]; do
        case $1 in
            -h | --help)
                usage
                exit 0
                ;;
            -v | --verbose)
                verbose_mode=true
                ;;
            -l | --linker*)
                if ! has_argument "$@"; then
                    echo "ERROR: Linker not specified." >&2
                    usage
                    exit 1
                fi
                linker=$(extract_argument "$@")
                # Consume the next word only if the value actually came from it.
                has_inline_value "$1" || shift
                ;;
            -z | --zip_file*)
                if ! has_argument "$@"; then
                    echo "ERROR: Input ZIP file not specified." >&2
                    usage
                    exit 1
                fi
                zip_file=$(extract_argument "$@")
                has_inline_value "$1" || shift
                ;;
            -o | --output_dir*)
                if has_argument "$@"; then
                    output_dir=$(extract_argument "$@")
                    has_inline_value "$1" || shift
                else
                    # No value was consumed, so the following option (if any)
                    # must stay in place for the next iteration.
                    echo "Warning: Output directory flag given, but not specified." >&2
                    echo "Falling back to the default output directory." >&2
                    output_dir=""
                fi
                ;;
            *)
                echo "Invalid option: $1" >&2
                usage
                exit 1
                ;;
        esac
        shift
    done
}

# Validate linker $1 against the format RMIP_ddd_ddd_a_ddd[_a] (d = digit,
# a = letter). Prints diagnostics and returns 1 when the linker is invalid.
validate_linker() {
    local candidate=$1

    # Pattern for each '_'-separated part; each is matched against the whole part.
    local -a part_patterns=(
        "RMIP"
        "[[:digit:]]{3}"
        "[[:digit:]]{3}"
        "[[:alpha:]]"
        "[[:digit:]]{3}"
        "[[:alpha:]]"
    )

    # 'read' drops a trailing empty field, so a trailing '_' must be rejected
    # explicitly or it would end up doubled in the output file names.
    if [[ "$candidate" == *_ ]]; then
        echo "ERROR: Invalid linker format, exiting" >&2
        return 1
    fi

    local -a parts
    IFS="_" read -ra parts <<< "$candidate"

    echo_verbose ""
    echo_verbose "Checking validity of linker format..."

    local idx=0 part anchored
    for part in "${parts[@]}"; do
        if (( idx >= ${#part_patterns[@]} )); then
            echo "ERROR: Input linker too long, got more than ${#part_patterns[@]} parts" >&2
            echo "Expected length 5 or 6" >&2
            return 1
        fi

        echo_verbose "Linker part $idx: $part"
        echo_verbose "Linker regexp: ${part_patterns[$idx]}"
        anchored="^${part_patterns[$idx]}\$"
        if [[ "$part" =~ $anchored ]]; then
            echo_verbose "Regexp match!"
        else
            echo_verbose "Regexp not match"
            echo_verbose ""
            echo "ERROR: Invalid linker format, exiting" >&2
            return 1
        fi
        ((idx += 1))
        echo_verbose ""
    done

    # The sixth part (vial identifier) is optional; the first five are not.
    if (( idx < 5 )); then
        echo "ERROR: Input linker not long enough, got length $idx" >&2
        echo "Expected length 5 or 6" >&2
        return 1
    fi
}

# Main script execution
handle_options "$@"

echo_verbose "Verbose mode turned on"
echo_verbose ""

# QC checks of input parameters
if [[ -z "$linker" ]]; then
    echo "ERROR: Linker not supplied. Must specify linker" >&2
    usage
    exit 1
fi

if [[ -z "$zip_file" ]]; then
    echo "ERROR: ZIP file not supplied. Must specify ZIP file" >&2
    usage
    exit 1
fi

if [[ ! -f "$zip_file" ]]; then
    echo "ERROR: ZIP file not found. Please ensure path is correct and/or file exists" >&2
    echo "Given zip_file: $zip_file" >&2
    exit 1
fi

# Validate the linker before anything is created on disk, so a bad linker does
# not leave an empty, wrongly named output directory behind.
if ! validate_linker "$linker"; then
    usage
    exit 1
fi

# Extracting basename (without extension) from ZIP file name
zip_file_name=$(basename -- "$zip_file")
zip_file_name="${zip_file_name%.*}"
echo_verbose "ZIP file name extracted: $zip_file_name"

# If no output_dir is supplied, default to a linker-specific directory under
# the current working directory.
if [[ -z "$output_dir" ]]; then
    output_dir="./${linker}_${zip_file_name}"
fi

echo_verbose "Here is the directory to write to: ${output_dir}"

if [[ ! -d "$output_dir" ]]; then
    echo_verbose "Output directory '$output_dir' not found, creating..."
    if ! mkdir -p -- "$output_dir"; then
        echo "ERROR: Could not create output directory '$output_dir'" >&2
        exit 1
    fi
fi

file_list=(web_summary.html metrics_summary.csv raw_feature_bc_matrix.h5 possorted_genome_bam.bam possorted_genome_bam.bam.bai filtered_feature_bc_matrix.h5)
# file_list=(web_summary.html metrics_summary.csv raw_feature_bc_matrix.h5 possorted_genome_bam.bam possorted_genome_bam.bam.bai raw_feature_bc_matrix/matrix.mtx.gz raw_feature_bc_matrix/features.tsv.gz raw_feature_bc_matrix/barcodes.tsv.gz filtered_feature_bc_matrix/matrix.mtx.gz filtered_feature_bc_matrix/barcodes.tsv.gz filtered_feature_bc_matrix/features.tsv.gz)

# Extracting and renaming files. Failures are reported and remembered so the
# remaining files are still processed but the script exits non-zero.
failed=0
for file in "${file_list[@]}"; do
    # -j drops any directory component stored in the archive.
    if ! unzip -j "$zip_file" "$file" -d "$output_dir"; then
        echo "ERROR: Could not extract '$file' from '$zip_file'" >&2
        failed=1
        continue
    fi
    echo_verbose "$file"

    # Removing "_bam" from filename (a no-op for names that do not contain it)
    new_file=${file//_bam/}
    if [[ "$file" == *"_bam"* ]]; then
        echo_verbose "Found '_bam' in $file"
        echo_verbose "Removed '_bam': $new_file"
    fi

    echo_verbose "Moving '$output_dir/${file}' to '$output_dir/${linker}_${new_file}'"
    echo_verbose ""
    if ! mv -- "$output_dir/$file" "$output_dir/${linker}_${new_file}"; then
        echo "ERROR: Could not rename '$output_dir/$file'" >&2
        failed=1
    fi
done

zip_copy="${output_dir}/${linker}_${zip_file_name}.zip"
echo_verbose "Copying ZIP file to output directory: $zip_file -> $zip_copy"
if ! cp -- "$zip_file" "$zip_copy"; then
    echo "ERROR: Could not copy '$zip_file' to '$zip_copy'" >&2
    failed=1
fi

echo_verbose "Reached end of script"
exit "$failed"