Skip to content

Commit

Permalink
added pipeline for lrvelocity
Browse files Browse the repository at this point in the history
  • Loading branch information
Acribbs committed Apr 11, 2024
1 parent 3b4187b commit b1a4023
Show file tree
Hide file tree
Showing 4 changed files with 328 additions and 0 deletions.
274 changes: 274 additions & 0 deletions tallytrin/pipeline_singlecell_lrvelocyto.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,274 @@
"""
============================
Pipeline single-cell_macosko
============================
Overview
==================
This code is for a Python script that processes single-cell sequencing
data and generated veloyto output for running downstream analysis at the gene and
transcript level. It requires you to run the singlecell pipeline and
generate the mapping_genome.dir/ sorted bam file.
The pipeline uses the CGAT-core library for pipeline management and the
Ruffus library for task management.
Input steps
===========
The pipeline performs the following steps:
Reads the pipeline configuration from the pipeline.yml file.
Splits the input fastq file into smaller pieces.
Input files
===========
To run the pipeline, you will need the following input files:
A genome mapped sorted bam file from running the singlecell pipleine
A minimap2 junction bed generated following minimap2 instructions.
A BED file.
The main outputs of the pipeline include:
Output files
============
A loom file for import into tools such as scVelo
Usage
=====
The pipeline requires a configured pipeline.yml file,
which contains various settings and parameters required for
the pipeline to run.
To generate the config file to change the running of the pipeline you need to
run:
tallytrin singlecell_levelocyto config
This will generate a pipeline.yml file that the user can modify to change the
output of the pipeline. Once the user has modified the pipeline.yml file the
pipeline can then be ran using the following commandline command:
tallytrin singlecell_levelocyto make full -v5
You can run the pipeline locally (without a cluster) using --local
tallytrin singlecell_levelocyto make full -v5 --local
Code
==================
"""
from ruffus import *

import sys
import os
import re
import sqlite3
import glob

import cgatcore.pipeline as P
import cgatcore.experiment as E
import cgatcore.database as database

# Load options from the config file

PARAMS = P.get_parameters(
["%s/pipeline.yml" % os.path.splitext(__file__)[0],
"../pipeline.yml",
"pipeline.yml"])


# Determine the location of the input fastq files

try:
PARAMS['data']
except NameError:
DATADIR = "."
else:
if PARAMS['data'] == 0:
DATADIR = "."
elif PARAMS['data'] == 1:
DATADIR = "data"
else:
DATADIR = PARAMS['data']



SEQUENCESUFFIXES = ("*.bam")
SEQUENCEFILES = tuple([os.path.join(DATADIR, suffix_name)
for suffix_name in SEQUENCESUFFIXES])


@follows(mkdir("unspliced.dir"))
@transform(DATADIR + '/*.bam',
regex(DATADIR + '/(\S+).bam'),
r"unspliced.dir/intron_overlapping.bam")
def split_unspliced(infile, outfile):
'''
Splits the input BAM file into reads that overlap a intron so
data can be parsed faster and easier.
'''

infile = "".join(infile)

statement = '''
bedtools intersect -a %(infile)s -b %(intron_bed)s -f 0.8 -wa -u > %(outfile)s'''

P.run(statement)


@follows(mkdir("spliced.dir"))
@transform(DATADIR + '/*.bam',
regex(DATADIR + '/(\S+).bam'),
r"spliced.dir/intron_nonoverlapping.bam")
def split_spliced(infile, outfile):
'''
Splits the input BAM file into reads that overlap a intron so
data can be parsed faster and easier.
'''

infile = "".join(infile)

statement = '''
bedtools intersect -a %(infile)s -b %(intron_bed)s -f 0.8 -wa -v > %(outfile)s'''

P.run(statement)


@transform(split_spliced,
regex('spliced.dir/(\S+).bam'),
r"spliced.dir/\1.fastq.gz")
def spliced_fastq(infile, outfile):
'''
'''

infile = "".join(infile)

statement = '''
bedtools bamtofastq -i %(infile)s -fq /dev/stdout | gzip > %(outfile)s
'''

P.run(statement)


@transform(split_unspliced,
regex('unspliced.dir/(\S+).bam'),
r"unspliced.dir/\1.fastq.gz")
def unspliced_fastq(infile, outfile):
'''
'''

infile = "".join(infile)

statement = '''
bedtools bamtofastq -i %(infile)s -fq /dev/stdout | gzip > %(outfile)s
'''

P.run(statement)


@transform(spliced_fastq,
regex("spliced.dir/(\S+).fastq.gz"),
r"spliced.dir/\1.sam")
def mapping_spliced(infile, outfile):
'''
Maps the reads from the spliced FASTQ file to a transcriptome genome using
the minimap2 aligner. This function takes an input merged FASTQ file
and generates a SAM (Sequence Alignment/Map) file containing the
read alignments.
'''


cdna = PARAMS['minimap2_fasta_cdna']
options = PARAMS['minimap2_options']

statement = '''minimap2 %(options)s %(cdna)s %(infile)s > %(outfile)s 2> %(outfile)s.log'''

P.run(statement, job_options='-t 24:00:00')



@transform(unspliced_fastq,
regex("unspliced.dir/(\S+).fastq.gz"),
r"unspliced.dir/\1.sam")
def mapping_unspliced(infile, outfile):
'''
Maps the reads from the unspliced FASTQ file to a transcriptome genome using
the minimap2 aligner. This function takes an input merged FASTQ file
and generates a SAM (Sequence Alignment/Map) file containing the
read alignments.
'''


cdna = PARAMS['minimap2_fasta_cdna']
options = PARAMS['minimap2_options']

statement = '''minimap2 %(options)s %(cdna)s %(infile)s > %(outfile)s 2> %(outfile)s.log'''

P.run(statement, job_options='-t 24:00:00')


@transform(mapping_spliced,
regex("(\S+).sam"),
r"spliced.dir/final_trans_sorted.bam")
def run_samtools_spliced_trans(infile, outfile):
'''convert sam to bam and sort -F 272'''

statement = '''samtools view -bS %(infile)s > spliced.dir/final_trans.bam &&
samtools sort spliced.dir/final_trans.bam -o spliced.dir/final_trans_sorted.bam &&
samtools index spliced.dir/final_trans_sorted.bam'''

P.run(statement, job_options='-t 24:00:00')


@transform(mapping_unspliced,
regex("(\S+).sam"),
r"unspliced.dir/final_trans_sorted.bam")
def run_samtools_unspliced_trans(infile, outfile):
'''convert sam to bam and sort -F 272'''

statement = '''samtools view -bS %(infile)s > unspliced.dir/final_trans.bam &&
samtools sort unspliced.dir/final_trans.bam -o unspliced.dir/final_trans_sorted.bam &&
samtools index unspliced.dir/final_trans_sorted.bam'''

P.run(statement, job_options='-t 24:00:00')


@follows(run_samtools_spliced_trans, run_samtools_unspliced_trans)
def full():
'''
A placeholder function that serves as a checkpoint
to run all previous ruffus tasks and ensure that all
previous tasks are completed.
'''
pass


def main(argv=None):
'''
The main function that runs the pipeline using the cgatcore.pipeline module.
Takes an optional argument list (default is sys.argv).
Please note that some of these functions use external Python scripts or
tools. For a complete understanding of their functionality, it is
necessary to examine the code of those scripts as well.
'''
if argv is None:
argv = sys.argv
P.main(argv)

if __name__ == "__main__":
sys.exit(P.main(sys.argv))




54 changes: 54 additions & 0 deletions tallytrin/pipeline_singlecell_lrvelocyto/pipeline.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
################################################################
#
#
# Pipeline pipeline_nanopore.py configuration file for sphinxreport
#
# This pipeline.ini file lists some configuration options that you might
# want a user to set automatically.
#
# Add pipeline specific options into separate sections
#
################################################################
## general options


# Project name
projectname: CGATProject

# Copyright statement
copyright: CGAT (2010-2014)

# The short X.Y version.
version: 0.1

# The full version, including alpha/beta/rc tags.
release: 0.1

# mapped bam file (sorted) data from output of mapped_genome.dir after running pipeline_
data: data.dir

# location of the intron bed file
intron_bed: intron.bed


# minimap2 mapping options
minimap2:
options: -ax map-ont -p 0.9 --end-bonus 10 -N 3

fasta_cdna: /media/caeruleus/cg1/resources/annotations/hg38_ensembl98/Homo_sapiens.GRCh38.cdna.all.fa

fasta_genome: /media/caeruleus/cg1/resources/index/hg38.fa

junc_bed: --junc-bed /media/caeruleus/cg1/resources/annotations/hg38_ensembl98/ensembl.dir/geneset_all.bed

gene_options: ''

featurecounts:

gtf: /media/caeruleus/cg1/resources/annotations/hg38_ensembl98/ensembl.dir/geneset_all.gtf.gz

database:
url: sqlite:///./csvdb

job_options: -t 48:00:00

File renamed without changes.

0 comments on commit b1a4023

Please sign in to comment.