Skip to content

Commit

Permalink
changed default ref to 93 + minor improvements
Browse files Browse the repository at this point in the history
  • Loading branch information
LisaSikkema committed Nov 4, 2020
1 parent f2d0441 commit 24b4230
Show file tree
Hide file tree
Showing 12 changed files with 62 additions and 66 deletions.
2 changes: 1 addition & 1 deletion LCA_pipeline_run.sh
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
# D: run Lung Cell Atlas cellranger pipeline

# LCA pipeline version:
pipeline_version="0.1.0"
pipeline_version="1.0.0"

# parameter defaults:

Expand Down
40 changes: 24 additions & 16 deletions LCA_pipeline_setup.sh
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,12 @@
# D: download files, set up conda environment and build reference genome for Lung Cell Atlas cellranger pipeline

# version of pipeline
pipeline_version="0.1.0"
pipeline_version="1.0.0"

# parameter defaults:

# Ensembl release
ensrel="99"
ensrel="93"
# Genome string
genomestring="GRCh38"
# species
Expand Down Expand Up @@ -86,7 +86,8 @@ usage() {
[work_dir]/refgenomes/ directory matching with the
specified species, ensembl_release and genome_release
will be removed. If set to false, the necessary
downloaded files should be present in your refgenomes
downloaded files should be present in your
refgenomes/${species}_${genomestring}_ensrel${ensrel}
folder. (default: ${download_ensembl_files})
Optional argument to include Sars-CoV2 in the reference genome:
Expand Down Expand Up @@ -277,7 +278,7 @@ fi
# and for build_ref_genome
if [ "$build_ref_genome" == "true" ]; then
echo "building of reference genome will be included" | tee -a ${LOGFILE}
if [ "$download_files" == "false" ]; then
if [ "$download_ensembl_files" == "false" ]; then
echo "download of files from ensembl needed for refgenome building will be skipped." | tee -a ${LOGFILE}
fi
if [ "$incl_sarscov2" == "true" ]; then
Expand Down Expand Up @@ -389,38 +390,45 @@ if [ "$build_ref_genome" == "true" ]; then
mkdir refgenomes
fi
cd refgenomes
# create/cd into folder that corresponds to our species and genome version, so that no mixing of versions is possible
genome_name=${species}_${genomestring}_ensrel${ensrel}
if ! [ -d $genome_name ]; then
mkdir $genome_name
fi
cd $genome_name

echo "Currently working in folder `pwd`" | tee -a ${LOGFILE}
# now run the script to build the genome:
echo "We will now start building the reference genome, using the script $script_dir/src/create_cellranger_ref_from_ensembl.sh" | tee -a ${LOGFILE}
echo "This might take a few hours. Start time: `date`" | tee -a ${LOGFILE}
echo "For a detailed log of the genome building, check out the logfile in your ${work_dir}/refgenomes folder!" | tee -a ${LOGFILE}
echo "For a detailed log of the genome building, check out the logfile in your ${work_dir}/refgenomes/${genome_name} folder!" | tee -a ${LOGFILE}
if [ "$incl_sarscov2" == "false" ]; then
if [ "$download_ensembl_files" == "true" ]; then
# -d default true, u true
${script_dir}/src/create_cellranger_ref_from_ensembl.sh -e ${ensrel} -g ${genomestring} -s ${species} -c ${expectedcrv} -t ${nthreads} -m ${memgb} -u true -o ${work_dir}/refgenomes | tee -a ${LOGFILE}
${script_dir}/src/create_cellranger_ref_from_ensembl.sh -e ${ensrel} -g ${genomestring} -s ${species} -c ${expectedcrv} -t ${nthreads} -m ${memgb} -u true -o ${work_dir}/refgenomes/${genome_name} | tee -a ${LOGFILE}
else
# -d to false
${script_dir}/src/create_cellranger_ref_from_ensembl.sh -e ${ensrel} -g ${genomestring} -s ${species} -c ${expectedcrv} -t ${nthreads} -m ${memgb} -d false -o ${work_dir}/refgenomes | tee -a ${LOGFILE}
${script_dir}/src/create_cellranger_ref_from_ensembl.sh -e ${ensrel} -g ${genomestring} -s ${species} -c ${expectedcrv} -t ${nthreads} -m ${memgb} -d false -o ${work_dir}/refgenomes/${genome_name} | tee -a ${LOGFILE}
fi
# check md5sum if default parameters were used:
if [ "${genomestring}" == "GRCh38" ] && [ "${ensrel}" == "99" ] && [ "${species}" == "homo_sapiens" ] && [ "${expectedcrv}" == "3.1.0" ]; then
if [ "${genomestring}" == "GRCh38" ] && [ "${ensrel}" == "93" ] && [ "${species}" == "homo_sapiens" ] && [ "${expectedcrv}" == "3.1.0" ]; then
echo "Checking md5sum of output folder..." | tee -a ${LOGFILE}
md5sum -c $script_dir/src/refgenomes_md5checks/homo_sapiens_GRCh38_ensrel99_cr3.1.0.md5 | tee -a ${LOGFILE}
md5sum -c $script_dir/src/refgenomes_md5checks/homo_sapiens_GRCh38_ensrel93_cr3.1.0.md5 | tee -a ${LOGFILE}
fi
elif [ "$incl_sarscov2" == "true" ]; then
echo "Including Sars-cov2 genome into the reference..." | tee -a ${LOGFILE}
if [ "$download_ensembl_files" == "true" ]; then
# -d default true, u true
$script_dir/src/create_cellranger_ref_from_ensembl.sh -e ${ensrel} -g ${genomestring} -s ${species} -c ${expectedcrv} -t ${nthreads} -m ${memgb} -u true -n sars_cov2 -f $script_dir/res/sars_cov2_genome/sars_cov2_genome.gtf -a $script_dir/res/sars_cov2_genome/sars_cov2.fasta -o ${work_dir}/refgenomes | tee -a ${LOGFILE}
$script_dir/src/create_cellranger_ref_from_ensembl.sh -e ${ensrel} -g ${genomestring} -s ${species} -c ${expectedcrv} -t ${nthreads} -m ${memgb} -u true -n sars_cov2 -f $script_dir/res/sars_cov2_genome/sars_cov2_genome.gtf -a $script_dir/res/sars_cov2_genome/sars_cov2.fasta -o ${work_dir}/refgenomes/${genome_name} | tee -a ${LOGFILE}
else
# -d to false
$script_dir/src/create_cellranger_ref_from_ensembl.sh -e ${ensrel} -g ${genomestring} -s ${species} -c ${expectedcrv} -t ${nthreads} -m ${memgb} -d false -n sars_cov2 -f $script_dir/res/sars_cov2_genome/sars_cov2_genome.gtf -a $script_dir/res/sars_cov2_genome/sars_cov2.fasta -o ${work_dir}/refgenomes | tee -a ${LOGFILE}
fi
# check md5sum if default parameters were used:
if [ ${genomestring} == "GRCh38" ] && [ "${ensrel}" == "99" ] && [ "${species}" == "homo_sapiens" ] && [ "${expectedcrv}" == "3.1.0" ]; then
echo "Checking md5sum of output folder..." | tee -a ${LOGFILE}
md5sum -c $script_dir/src/refgenomes_md5checks/homo_sapiens_GRCh38_ensrel99_cr3.1.0_sars_cov2.md5 | tee -a ${LOGFILE}
$script_dir/src/create_cellranger_ref_from_ensembl.sh -e ${ensrel} -g ${genomestring} -s ${species} -c ${expectedcrv} -t ${nthreads} -m ${memgb} -d false -n sars_cov2 -f $script_dir/res/sars_cov2_genome/sars_cov2_genome.gtf -a $script_dir/res/sars_cov2_genome/sars_cov2.fasta -o ${work_dir}/refgenomes/${genome_name} | tee -a ${LOGFILE}
fi
# # check md5sum if default parameters were used (the checksum file for Ensembl 93 has not been generated yet):
# if [ ${genomestring} == "GRCh38" ] && [ "${ensrel}" == "93" ] && [ "${species}" == "homo_sapiens" ] && [ "${expectedcrv}" == "3.1.0" ]; then
# echo "Checking md5sum of output folder..." | tee -a ${LOGFILE}
# md5sum -c $script_dir/src/refgenomes_md5checks/homo_sapiens_GRCh38_ensrel93_cr3.1.0_sars_cov2.md5 | tee -a ${LOGFILE}
# fi
fi
echo "End time: `date`" | tee -a ${LOGFILE}
fi
Expand Down
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,7 @@ $ ./LCA_pipeline_setup.sh -w /users/lisa.sikkema/LCA_scRNAseq_pipeline -u 1234:5
### testrun the installed pipeline:

We made a testrun script [LCA_pipeline_testrun.sh](test/LCA_pipeline_testrun.sh) with a small toy dataset from https://support.10xgenomics.com/single-cell-gene-expression/datasets/3.0.2/5k_pbmc_v3 (downloaded automatically during pipeline setup above) to make sure
that your pipeline works as expected. (Note that this testrun is designed for the pipeline __excluding__ SARS-cov2 in the reference genome.) The script can be run with only one line in terminal, and should take between half an hour and a few hours to run (depending on your computing resources).
that your pipeline works as expected. __Note__ that this testrun is designed for the pipeline with the default reference genome and Cell Ranger version, excluding SARS-cov2 in the reference genome. The script can be run with only one line in the terminal, and should take between half an hour and a few hours to run (depending on your computing resources).
It runs the entire pipeline on the toy data, and includes an option to transfer the output files, excluding .bam and .bai files, to a secure storage at the Helmholtz Center in Munich, Germany.

Detailed documentation on how to use the script is available under
Expand Down
2 changes: 1 addition & 1 deletion res/Samples_file_example.xls
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
Sample ncells fastqpath refindex refgtf chemistry
sub_5k_pbmc_v3 5000 /home/lisa/LCA/scRNAseq_pipelines/testdata/ /home/lisa/LCA/scRNAseq_pipelines/refgenomes/homo_sapiens_GRCh38_ensrel99_cr3.1.0 /home/lisa/LCA/scRNAseq_pipelines/refgenomes/homo_sapiens_GRCh38_ensrel99_cr3.1.0/genes/genes.gtf SC3Pv3
sub_5k_pbmc_v3 5000 /home/lisa/LCA/scRNAseq_pipelines/testdata/ /home/lisa/LCA/scRNAseq_pipelines/refgenomes/homo_sapiens_GRCh38_ensrel93/homo_sapiens_GRCh38_ensrel93_cr3.1.0 /home/lisa/LCA/scRNAseq_pipelines/refgenomes/homo_sapiens_GRCh38_ensrel93/homo_sapiens_GRCh38_ensrel93_cr3.1.0/genes/genes.gtf SC3Pv3
14 changes: 7 additions & 7 deletions src/create_cellranger_ref_from_ensembl.sh
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ VERSION="1.1"
# Parameters defaults

# Ensembl release
ensrel="99"
ensrel="93"
# Genome string
genomestring="GRCh38"
# species
Expand Down Expand Up @@ -115,7 +115,6 @@ done
shift $((OPTIND - 1))


# LISA, test this part:
if ! [ -d $outdir ]; then
echo "the provided outdir (-o flag) is not a directory. Exiting."
exit 1
Expand Down Expand Up @@ -420,7 +419,8 @@ eval ${mkgtfcmd} >> ${LOGFILE} 2>&1
# Check
if [ "$?" -ne "0" ]
then
echo "Error: There was an error running cellranger mkgtf" | tee -a ${LOGFILE}
echo "Error: There was an error running cellranger mkgtf. Exiting." | tee -a ${LOGFILE}
exit 1
fi


Expand Down Expand Up @@ -456,8 +456,8 @@ if [[ ${genomestring2} != "" ]]
# Check
if [ "$?" -ne "0" ]
then
echo "Error: There was an error running cellranger mkgtf" | tee -a ${LOGFILE}
#exit 1
echo "Error: There was an error running cellranger mkgtf. Exiting." | tee -a ${LOGFILE}
exit 1
fi

fi
Expand Down Expand Up @@ -495,8 +495,8 @@ eval ${mkrefcmd} >> ${LOGFILE} 2>&1
# Check
if [ "$?" -ne "0" ]
then
echo "Error: There was an error running cellranger mkref" | tee -a ${LOGFILE}
#exit 1
echo "Error: There was an error running cellranger mkref. Exiting." | tee -a ${LOGFILE}
exit 1
fi

# File cleanup
Expand Down
17 changes: 17 additions & 0 deletions src/refgenomes_md5checks/homo_sapiens_GRCh38_ensrel93_cr3.1.0.md5
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
9e6b9465dc708d92bf6d67e9c9fa9389 homo_sapiens_GRCh38_ensrel93_cr3.1.0/fasta/genome.fa
d527f3eb6b664020cf4d882b5820056f homo_sapiens_GRCh38_ensrel93_cr3.1.0/fasta/genome.fa.fai
c00e3fb210b085bbe04023cf1b405c56 homo_sapiens_GRCh38_ensrel93_cr3.1.0/genes/genes.gtf
d57b2ed04655d661b860b177875421d9 homo_sapiens_GRCh38_ensrel93_cr3.1.0/star/chrLength.txt
5bb25def58f5ab73e05711b51aee01df homo_sapiens_GRCh38_ensrel93_cr3.1.0/star/chrNameLength.txt
c93ac05176d615ddec18a1faac7ef355 homo_sapiens_GRCh38_ensrel93_cr3.1.0/star/chrName.txt
a117b5f64c14c8349efd5a01b67f89ba homo_sapiens_GRCh38_ensrel93_cr3.1.0/star/chrStart.txt
7e327f253f6873727fa774c1a5ae7b5e homo_sapiens_GRCh38_ensrel93_cr3.1.0/star/exonGeTrInfo.tab
b61bc3e34f80e199b7172ca03a7051a2 homo_sapiens_GRCh38_ensrel93_cr3.1.0/star/exonInfo.tab
e51a4ecfe91d613a113d8ddc8ccc4493 homo_sapiens_GRCh38_ensrel93_cr3.1.0/star/geneInfo.tab
6f778cf30bb4f654b42fa7b4c99652e2 homo_sapiens_GRCh38_ensrel93_cr3.1.0/star/Genome
6006baaae0e02e9b560f1ef01c19a526 homo_sapiens_GRCh38_ensrel93_cr3.1.0/star/SA
bec9c7c1fee71a3312dda37550b8d3a3 homo_sapiens_GRCh38_ensrel93_cr3.1.0/star/SAindex
c9ca1c60b312577956b8957fd9470e93 homo_sapiens_GRCh38_ensrel93_cr3.1.0/star/sjdbInfo.txt
c5ea59b7e9fd0f8f52e58e039f5b6706 homo_sapiens_GRCh38_ensrel93_cr3.1.0/star/sjdbList.fromGTF.out.tab
2ae522d7bd0ffe27cc23130142b74879 homo_sapiens_GRCh38_ensrel93_cr3.1.0/star/sjdbList.out.tab
11609f77698eedce75570aa761e4ed6c homo_sapiens_GRCh38_ensrel93_cr3.1.0/star/transcriptInfo.tab
17 changes: 0 additions & 17 deletions src/refgenomes_md5checks/homo_sapiens_GRCh38_ensrel99_cr3.1.0.md5

This file was deleted.

This file was deleted.

7 changes: 7 additions & 0 deletions test/CHECKSUM_testrun.md5
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# this checksum was made based on ensembl93 refgenome
a7d8bbb138de738184c7748079059403 ../cellranger/sub_5k_pbmc_v3/count_matrices/filtered_feature_bc_matrix/barcodes.tsv
6010c970f5df1e21e388c916cd6c36e9 ../cellranger/sub_5k_pbmc_v3/count_matrices/filtered_feature_bc_matrix/features.tsv
89628f995fc2c18a8fd0af94e519e372 ../cellranger/sub_5k_pbmc_v3/count_matrices/filtered_feature_bc_matrix/matrix.mtx
bf994f994561412303520a2fb3ac87f8 ../cellranger/sub_5k_pbmc_v3/count_matrices/raw_feature_bc_matrix/barcodes.tsv
6010c970f5df1e21e388c916cd6c36e9 ../cellranger/sub_5k_pbmc_v3/count_matrices/raw_feature_bc_matrix/features.tsv
003531bd569c9238a7b54b69bc6585d7 ../cellranger/sub_5k_pbmc_v3/count_matrices/raw_feature_bc_matrix/matrix.mtx
4 changes: 2 additions & 2 deletions test/LCA_pipeline_testrun.sh
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
# D: testrun Lung Cell Atlas cellranger pipeline

# LCA pipeline version:
pipeline_version="0.1.0"
pipeline_version="1.0.0"

# parameter defaults:

Expand Down Expand Up @@ -372,7 +372,7 @@ fi
# check md5sum of output .mtx, and barcodes and features .tsvs. (h5 and h5ad have timestamps and therefore
# cannot be used for checksums. Loom files also get prefix in indices)
echo "We will now do an md5sum check on cellranger output:" | tee -a ${LOGFILE}
md5sum -c $work_dir/testdata/CHECKSUM_testrun | tee -a ${LOGFILE}
md5sum -c $script_dir/test/CHECKSUM_testrun.md5 | tee -a ${LOGFILE}

# move back to out_dir
cd $out_dir
Expand Down
2 changes: 1 addition & 1 deletion test/Samples_testdata_sarscov2_template.xls
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
Sample ncells fastqpath refindex refgtf chemistry
sub_5k_pbmc_v3 5000 {workdir}/testdata/ {workdir}/refgenomes/homo_sapiens_GRCh38_ensrel99_cr3.1.0_sars_cov2 {workdir}/refgenomes/homo_sapiens_GRCh38_ensrel99_cr3.1.0_sars_cov2/genes/genes.gtf SC3Pv3
sub_5k_pbmc_v3 5000 {workdir}/testdata/ {workdir}/refgenomes/homo_sapiens_GRCh38_ensrel93/homo_sapiens_GRCh38_ensrel93_cr3.1.0_sars_cov2 {workdir}/refgenomes/homo_sapiens_GRCh38_ensrel93/homo_sapiens_GRCh38_ensrel93_cr3.1.0_sars_cov2/genes/genes.gtf SC3Pv3
2 changes: 1 addition & 1 deletion test/Samples_testdata_template.xls
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
Sample ncells fastqpath refindex refgtf chemistry
sub_5k_pbmc_v3 5000 {workdir}/testdata/ {workdir}/refgenomes/homo_sapiens_GRCh38_ensrel99_cr3.1.0 {workdir}/refgenomes/homo_sapiens_GRCh38_ensrel99_cr3.1.0/genes/genes.gtf SC3Pv3
sub_5k_pbmc_v3 5000 {workdir}/testdata/ {workdir}/refgenomes/homo_sapiens_GRCh38_ensrel93/homo_sapiens_GRCh38_ensrel93_cr3.1.0 {workdir}/refgenomes/homo_sapiens_GRCh38_ensrel93/homo_sapiens_GRCh38_ensrel93_cr3.1.0/genes/genes.gtf SC3Pv3

0 comments on commit 24b4230

Please sign in to comment.