From b1a402399e70655582afcf259a5233f9ebad3f13 Mon Sep 17 00:00:00 2001 From: Acribbs Date: Thu, 11 Apr 2024 15:58:21 +0100 Subject: [PATCH] added pipeline for lrvelocity --- tallytrin/pipeline_singlecell_lrvelocyto.py | 274 ++++++++++++++++++ .../pipeline.yml | 54 ++++ ...lor.py => pipeline_singlecell_scolorv2.py} | 0 .../pipeline.yml | 0 4 files changed, 328 insertions(+) create mode 100644 tallytrin/pipeline_singlecell_lrvelocyto.py create mode 100644 tallytrin/pipeline_singlecell_lrvelocyto/pipeline.yml rename tallytrin/{pipeline_singlecell_scolor.py => pipeline_singlecell_scolorv2.py} (100%) rename tallytrin/{pipeline_singlecell_scolor => pipeline_singlecell_scolorv2}/pipeline.yml (100%) diff --git a/tallytrin/pipeline_singlecell_lrvelocyto.py b/tallytrin/pipeline_singlecell_lrvelocyto.py new file mode 100644 index 0000000..ee6b6ec --- /dev/null +++ b/tallytrin/pipeline_singlecell_lrvelocyto.py @@ -0,0 +1,274 @@ +""" +============================ +Pipeline single-cell_macosko +============================ + +Overview +================== + +This code is for a Python script that processes single-cell sequencing +data and generated veloyto output for running downstream analysis at the gene and +transcript level. It requires you to run the singlecell pipeline and +generate the mapping_genome.dir/ sorted bam file. + +The pipeline uses the CGAT-core library for pipeline management and the +Ruffus library for task management. + + +Input steps +=========== + +The pipeline performs the following steps: + +Reads the pipeline configuration from the pipeline.yml file. +Splits the input fastq file into smaller pieces. + + +Input files +=========== + +To run the pipeline, you will need the following input files: + +A genome mapped sorted bam file from running the singlecell pipleine +A minimap2 junction bed generated following minimap2 instructions. +A BED file. + +The main outputs of the pipeline include: + +Output files +============ + +A loom file for import into tools such as scVelo + +Usage +===== + +The pipeline requires a configured pipeline.yml file, +which contains various settings and parameters required for +the pipeline to run. + +To generate the config file to change the running of the pipeline you need to +run: + +tallytrin singlecell_levelocyto config + +This will generate a pipeline.yml file that the user can modify to change the +output of the pipeline. Once the user has modified the pipeline.yml file the +pipeline can then be ran using the following commandline command: + +tallytrin singlecell_levelocyto make full -v5 + +You can run the pipeline locally (without a cluster) using --local + +tallytrin singlecell_levelocyto make full -v5 --local + + +Code +================== + +""" +from ruffus import * + +import sys +import os +import re +import sqlite3 +import glob + +import cgatcore.pipeline as P +import cgatcore.experiment as E +import cgatcore.database as database + +# Load options from the config file + +PARAMS = P.get_parameters( + ["%s/pipeline.yml" % os.path.splitext(__file__)[0], + "../pipeline.yml", + "pipeline.yml"]) + + +# Determine the location of the input fastq files + +try: + PARAMS['data'] +except NameError: + DATADIR = "." +else: + if PARAMS['data'] == 0: + DATADIR = "." + elif PARAMS['data'] == 1: + DATADIR = "data" + else: + DATADIR = PARAMS['data'] + + + +SEQUENCESUFFIXES = ("*.bam") +SEQUENCEFILES = tuple([os.path.join(DATADIR, suffix_name) + for suffix_name in SEQUENCESUFFIXES]) + + +@follows(mkdir("unspliced.dir")) +@transform(DATADIR + '/*.bam', + regex(DATADIR + '/(\S+).bam'), + r"unspliced.dir/intron_overlapping.bam") +def split_unspliced(infile, outfile): + ''' + Splits the input BAM file into reads that overlap a intron so + data can be parsed faster and easier. + ''' + + infile = "".join(infile) + + statement = ''' + bedtools intersect -a %(infile)s -b %(intron_bed)s -f 0.8 -wa -u > %(outfile)s''' + + P.run(statement) + + +@follows(mkdir("spliced.dir")) +@transform(DATADIR + '/*.bam', + regex(DATADIR + '/(\S+).bam'), + r"spliced.dir/intron_nonoverlapping.bam") +def split_spliced(infile, outfile): + ''' + Splits the input BAM file into reads that overlap a intron so + data can be parsed faster and easier. + ''' + + infile = "".join(infile) + + statement = ''' + bedtools intersect -a %(infile)s -b %(intron_bed)s -f 0.8 -wa -v > %(outfile)s''' + + P.run(statement) + + +@transform(split_spliced, + regex('spliced.dir/(\S+).bam'), + r"spliced.dir/\1.fastq.gz") +def spliced_fastq(infile, outfile): + ''' + ''' + + infile = "".join(infile) + + statement = ''' + bedtools bamtofastq -i %(infile)s -fq /dev/stdout | gzip > %(outfile)s + ''' + + P.run(statement) + + +@transform(split_unspliced, + regex('unspliced.dir/(\S+).bam'), + r"unspliced.dir/\1.fastq.gz") +def unspliced_fastq(infile, outfile): + ''' + ''' + + infile = "".join(infile) + + statement = ''' + bedtools bamtofastq -i %(infile)s -fq /dev/stdout | gzip > %(outfile)s + ''' + + P.run(statement) + + +@transform(spliced_fastq, + regex("spliced.dir/(\S+).fastq.gz"), + r"spliced.dir/\1.sam") +def mapping_spliced(infile, outfile): + ''' + Maps the reads from the spliced FASTQ file to a transcriptome genome using + the minimap2 aligner. This function takes an input merged FASTQ file + and generates a SAM (Sequence Alignment/Map) file containing the + read alignments. + ''' + + + cdna = PARAMS['minimap2_fasta_cdna'] + options = PARAMS['minimap2_options'] + + statement = '''minimap2 %(options)s %(cdna)s %(infile)s > %(outfile)s 2> %(outfile)s.log''' + + P.run(statement, job_options='-t 24:00:00') + + + +@transform(unspliced_fastq, + regex("unspliced.dir/(\S+).fastq.gz"), + r"unspliced.dir/\1.sam") +def mapping_unspliced(infile, outfile): + ''' + Maps the reads from the unspliced FASTQ file to a transcriptome genome using + the minimap2 aligner. This function takes an input merged FASTQ file + and generates a SAM (Sequence Alignment/Map) file containing the + read alignments. + ''' + + + cdna = PARAMS['minimap2_fasta_cdna'] + options = PARAMS['minimap2_options'] + + statement = '''minimap2 %(options)s %(cdna)s %(infile)s > %(outfile)s 2> %(outfile)s.log''' + + P.run(statement, job_options='-t 24:00:00') + + +@transform(mapping_spliced, + regex("(\S+).sam"), + r"spliced.dir/final_trans_sorted.bam") +def run_samtools_spliced_trans(infile, outfile): + '''convert sam to bam and sort -F 272''' + + statement = '''samtools view -bS %(infile)s > spliced.dir/final_trans.bam && + samtools sort spliced.dir/final_trans.bam -o spliced.dir/final_trans_sorted.bam && + samtools index spliced.dir/final_trans_sorted.bam''' + + P.run(statement, job_options='-t 24:00:00') + + +@transform(mapping_unspliced, + regex("(\S+).sam"), + r"unspliced.dir/final_trans_sorted.bam") +def run_samtools_unspliced_trans(infile, outfile): + '''convert sam to bam and sort -F 272''' + + statement = '''samtools view -bS %(infile)s > unspliced.dir/final_trans.bam && + samtools sort unspliced.dir/final_trans.bam -o unspliced.dir/final_trans_sorted.bam && + samtools index unspliced.dir/final_trans_sorted.bam''' + + P.run(statement, job_options='-t 24:00:00') + + +@follows(run_samtools_spliced_trans, run_samtools_unspliced_trans) +def full(): + ''' + A placeholder function that serves as a checkpoint + to run all previous ruffus tasks and ensure that all + previous tasks are completed. + ''' + pass + + +def main(argv=None): + ''' + The main function that runs the pipeline using the cgatcore.pipeline module. + Takes an optional argument list (default is sys.argv). + + Please note that some of these functions use external Python scripts or + tools. For a complete understanding of their functionality, it is + necessary to examine the code of those scripts as well. + ''' + if argv is None: + argv = sys.argv + P.main(argv) + +if __name__ == "__main__": + sys.exit(P.main(sys.argv)) + + + + diff --git a/tallytrin/pipeline_singlecell_lrvelocyto/pipeline.yml b/tallytrin/pipeline_singlecell_lrvelocyto/pipeline.yml new file mode 100644 index 0000000..f56bcc6 --- /dev/null +++ b/tallytrin/pipeline_singlecell_lrvelocyto/pipeline.yml @@ -0,0 +1,54 @@ +################################################################ +# +# +# Pipeline pipeline_nanopore.py configuration file for sphinxreport +# +# This pipeline.ini file lists some configuration options that you might +# want a user to set automatically. +# +# Add pipeline specific options into separate sections +# +################################################################ +## general options + + +# Project name +projectname: CGATProject + +# Copyright statement +copyright: CGAT (2010-2014) + +# The short X.Y version. +version: 0.1 + +# The full version, including alpha/beta/rc tags. +release: 0.1 + +# mapped bam file (sorted) data from output of mapped_genome.dir after running pipeline_ +data: data.dir + +# location of the intron bed file +intron_bed: intron.bed + + +# minimap2 mapping options +minimap2: + options: -ax map-ont -p 0.9 --end-bonus 10 -N 3 + + fasta_cdna: /media/caeruleus/cg1/resources/annotations/hg38_ensembl98/Homo_sapiens.GRCh38.cdna.all.fa + + fasta_genome: /media/caeruleus/cg1/resources/index/hg38.fa + + junc_bed: --junc-bed /media/caeruleus/cg1/resources/annotations/hg38_ensembl98/ensembl.dir/geneset_all.bed + + gene_options: '' + +featurecounts: + + gtf: /media/caeruleus/cg1/resources/annotations/hg38_ensembl98/ensembl.dir/geneset_all.gtf.gz + +database: + url: sqlite:///./csvdb + +job_options: -t 48:00:00 + diff --git a/tallytrin/pipeline_singlecell_scolor.py b/tallytrin/pipeline_singlecell_scolorv2.py similarity index 100% rename from tallytrin/pipeline_singlecell_scolor.py rename to tallytrin/pipeline_singlecell_scolorv2.py diff --git a/tallytrin/pipeline_singlecell_scolor/pipeline.yml b/tallytrin/pipeline_singlecell_scolorv2/pipeline.yml similarity index 100% rename from tallytrin/pipeline_singlecell_scolor/pipeline.yml rename to tallytrin/pipeline_singlecell_scolorv2/pipeline.yml