Skip to content

Commit

Permalink
Merge pull request #9 from simonlabcode/featurecount
Browse files Browse the repository at this point in the history
bam2bakR version 3.0.0
  • Loading branch information
isaacvock authored Feb 1, 2024
2 parents c02775c + 0aa3b4d commit 80fc96d
Show file tree
Hide file tree
Showing 55 changed files with 783,024 additions and 1,182 deletions.
57 changes: 57 additions & 0 deletions .github/workflows/main.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
# CI workflow: check formatting, lint the Snakemake workflow, then run it
# against the two test configs in .test/configs.
name: Tests

on:
  push:
    branches:
      - main
      - featurecount
  pull_request:
    # Empty ignore list, i.e. no target branch is excluded. The documented
    # filter key is hyphenated ("branches-ignore").
    branches-ignore: []

jobs:
  formatting:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout with submodules
        uses: actions/checkout@v3
        with:
          submodules: recursive
          fetch-depth: 0
      - name: Formatting
        uses: github/super-linter@v5
        env:
          VALIDATE_ALL_CODEBASE: false
          # NOTE(review): both main and featurecount trigger this workflow;
          # confirm featurecount is still the branch to use here.
          DEFAULT_BRANCH: featurecount
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          VALIDATE_SNAKEMAKE_SNAKEFMT: true

  linting:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v3
      - name: Linting
        # NOTE(review): the version pin of this action was not readable in the
        # reviewed copy; v1.24.0 is an assumption -- confirm the intended tag.
        uses: snakemake/snakemake-github-action@v1.24.0
        with:
          directory: .test
          snakefile: workflow/Snakefile
          args: "--configfile .test/configs/star_config.yaml --lint"

  run-workflow:
    runs-on: ubuntu-latest
    # Only run the (slower) workflow tests once both checks have passed.
    needs:
      - linting
      - formatting
    steps:
      - name: Checkout repository
        uses: actions/checkout@v3
      - name: Test workflow (star)
        # NOTE(review): version pin assumed, see the linting job -- confirm.
        uses: snakemake/snakemake-github-action@v1.24.0
        with:
          directory: .test
          snakefile: workflow/Snakefile
          args: "--configfile .test/configs/star_config.yaml --use-conda --show-failed-logs --cores 2 --conda-cleanup-pkgs cache"
      - name: Test workflow (bam)
        # NOTE(review): version pin assumed, see the linting job -- confirm.
        uses: snakemake/snakemake-github-action@v1.24.0
        with:
          directory: .test
          snakefile: workflow/Snakefile
          args: "--configfile .test/configs/bam2bakR.yaml --use-conda --show-failed-logs --cores 2 --conda-cleanup-pkgs cache"
15 changes: 0 additions & 15 deletions .readthedocs.yaml

This file was deleted.

9 changes: 9 additions & 0 deletions .snakemake-workflow-catalog.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
# Usage metadata for the Snakemake workflow catalog.
usage:
  mandatory-flags:  # optional definition of additional flags
    desc: null  # describe your flags here in a few sentences (they will be inserted below the example commands)
    flags: null  # put your flags here
  software-stack-deployment:  # definition of software deployment method (at least one of conda, singularity, or singularity+conda)
    conda: true  # whether pipeline works with --use-conda
    singularity: true  # whether pipeline works with --use-singularity
    singularity+conda: true  # whether pipeline works with --use-singularity --use-conda
  report: false  # add this to confirm that the workflow allows to use 'snakemake --report report.zip' to generate a report containing all results and explanations
113 changes: 113 additions & 0 deletions .test/configs/bam2bakR.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,113 @@
##### Parameters you will likely have to change #####

# Run bam2bakR only? If true, paths to bam files are expected as input and the
# alignment steps are skipped. If false, paths to directories containing fastq
# files are expected as input.
bam2bakr: True

# Mapping of sample ID to input path:
#   - path to directory containing fastq files if bam2bakr is False
#   - path to bam file if bam2bakr is True
## example of what samples could look like for bam input
samples:
  WT_1: data/bams/WT_replicate_1.bam
  WT_2: data/bams/WT_replicate_2.bam
  WT_ctl: data/bams/WT_nos4U.bam


## example of what samples will look like for fastq input
# samples:
#   WT_1: data/fastq/WT_1
#   WT_2: data/fastq/WT_2
#   WT_ctl: data/fastq/WT_ctl
#   KO_1: data/fastq/KO_1
#   KO_2: data/fastq/KO_2
#   KO_ctl: data/fastq/KO_ctl

# location of annotation gtf file
annotation: data/annotation/GRCh38.gtf

# -s4U control sample IDs (only IDs that are defined under `samples` above;
# add 'KO_ctl' here if a KO_ctl sample is added)
control_samples: ['WT_ctl']

# location of genome fasta file
genome_fasta: data/genome/GRCh38.fa

##### Parameters that are always relevant #####

# Format of reads; one of PE, SE, NU
#   SE - single end reads
#   NU - including non-unique (not tested)
FORMAT: 'PE'


# Strandedness of reads; one of F, R
#   F means read 1 represents original RNA sequence (vs. its reverse complement).
strandedness: 'R'


# Type of browser tracks to generate; one of "TC", "GA", "TC,GA"
mut_tracks: 'TC'

# Minimum base quality to call mutation
minqual: 40

# String common to spike-in gene_ids in annotation gtf.
# If you have no spike-ins, then this should be "\"\"", i.e., an empty string ("")
spikename: "\"\""

# If true, tracks will be normalized
normalize: true

# Are you using the Windows subsystem for linux? 0 = Yes, 1 = No
WSL: 1

# Are there jI and jM tags in your bam file?
remove_tags: false


##### Parameters that are only relevant if bam2bakr is False #####

# Directory containing the hisat2 indices
HISAT2: data/hisat2/grch38_tran/

# hisat3n indices: path to and common prefix of all .ht2 files
HISAT_3N: data/hisat_3n/GRCh38

# Location of the STAR indices
STAR_index: data/star/

# If true, hisat-3n will be used for alignment
use_hisat3n: false

# If true, STAR will be used for alignment; if use_hisat3n is also true,
# hisat-3n will be used and not STAR
use_star: false

# If true, an index will be built for STAR and a directory will be created at
# the path set in the STAR_index parameter
build_star: false

# If use_hisat3n is true, this specifies the path to the hisat_3n executable.
# If hisat-3n is on your PATH, this will just be hisat-3n
hisat3n_path: hisat-3n

# Add a 'chr' to each chromosome number during alignment.
# [Useful when aligner index is number-based, but GTF annotation is chr-based]
chr_tag: true

# Are you on Yale's HPC where hisat-3n can be loaded as a module using Lmod system?
Yale: false

# Are you using a flattened annotation from FlatStacks?
flattened: false

# Code specifying adapters to be trimmed; passed to cutadapt
adapter: '-a AGATCGGAAGAGC -A AGATCGGAAGAGC'

# Optional code to pass to cutadapt.
# No additional parameters are passed, so this can include any of cutadapt's optional parameters
cutadapt_extra: '--minimum-length 20'

# Extra parameters to be passed to STAR.
# Already passing: "--outSAMtype BAM SortedByCoordinate --outSAMattributes NH HI AS NM MD --quantMode TranscriptomeSAM GeneCounts --sjdbGTFfile" followed by config["annotation"]
# Including any of the already set parameters will yield an error.
star_extra: '--outFilterMismatchNmax 20'

# Extra parameters to be passed to Hisat2.
# No additional parameters are passed, so this can include any of hisat2's optional parameters
hisat2_extra: '--mp 1,0'
126 changes: 126 additions & 0 deletions .test/configs/star_config.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,126 @@
####### GENERAL INFORMATION ABOUT THIS CONFIG #######
#
# This config file allows you to specify a number of important pieces of information that
# the fastq2EZbakR pipeline will require to run. It also allows you to set optional parameters
# for all tools that fastq2EZbakR makes use of.
#
# File paths can either be absolute (e.g., /path/to/file/or/directory) or relative
# to the directory from which you call the pipeline (e.g., data/fastq/WT_1 in the
# example samples entry means to look in the data directory inside the directory
# where you called `snakemake` to run the pipeline).
#
####### PARAMETERS YOU NEED TO SET #######

# Run bam2bakR only? If true, paths to bam files are expected as input and the
# alignment steps are skipped. If false, paths to directories containing fastq
# files are expected as input.
bam2bakr: False

## Paths to data to process (mapping of sample ID to input path)
# If bam2bakr is False: path to directory containing fastq files
#   - fastq files can be either gzipped or unzipped
#   - each set of fastq files must be in a different directory
# If bam2bakr is True: path to bam file
samples:
  WT_1: data/WT1
  WT_2: data/WT2
  WT_ctl: data/WTctl

## example of what samples will look like for fastq input
# samples:
#   WT_1: data/fastq/WT_1
#   WT_2: data/fastq/WT_2
#   WT_ctl: data/fastq/WT_ctl
#   KO_1: data/fastq/KO_1
#   KO_2: data/fastq/KO_2
#   KO_ctl: data/fastq/KO_ctl

# location of annotation gtf file
annotation: data/annotation/genome.gtf

# -s4U control sample IDs
control_samples: ['WT_ctl']

# location of genome fasta file
genome_fasta: data/genome/genome.fasta

##### Parameters that are always relevant #####

# Format of reads; one of PE, SE, NU
#   SE - single end reads
#   NU - including non-unique (not tested)
FORMAT: 'PE'


# Strandedness of reads; one of F, R
#   F means read 1 represents original RNA sequence (vs. its reverse complement).
strandedness: 'R'


# Type of browser tracks to generate; one of "TC", "GA", "TC,GA"
mut_tracks: 'TC'

# Minimum base quality to call mutation
minqual: 40

# String common to spike-in gene_ids in annotation gtf.
# If you have no spike-ins, then this should be "\"\"", i.e., an empty string ("")
spikename: "\"\""

# If true, tracks will be normalized
normalize: true

# Are you using the Windows subsystem for linux? 0 = Yes, 1 = No
WSL: 1

# Are there jI and jM tags in your bam file?
remove_tags: false


##### Parameters that are only relevant if bam2bakr is False #####

# Directory containing the hisat2 indices
HISAT2: data/hisat2/

# hisat3n indices: path to and common prefix of all .ht2 files
HISAT_3N: data/hisat_3n/GRCh38

# Location of the STAR indices
STAR_index: data/star/

# If true, hisat-3n will be used for alignment
use_hisat3n: false

# If true, STAR will be used for alignment; if use_hisat3n is also true,
# hisat-3n will be used and not STAR
use_star: true

# If true, an index will be built for STAR and a directory will be created at
# the path set in the STAR_index parameter
build_star: true

# If use_hisat3n is true, this specifies the path to the hisat_3n executable.
# If hisat-3n is on your PATH, this will just be hisat-3n
hisat3n_path: hisat-3n

# Add a 'chr' to each chromosome number during alignment.
# [Useful when aligner index is number-based, but GTF annotation is chr-based]
chr_tag: true

# Are you on Yale's HPC where hisat-3n can be loaded as a module using Lmod system?
Yale: false

# Are you using a flattened annotation from FlatStacks?
flattened: false

# Code specifying adapters to be trimmed; passed to cutadapt
adapter: '-a AGATCGGAAGAGC -A AGATCGGAAGAGC'

# Optional code to pass to cutadapt.
# No additional parameters are passed, so this can include any of cutadapt's optional parameters
cutadapt_extra: '--minimum-length 20'

# Extra parameters to be passed to STAR.
# Already passing: "--outSAMtype BAM SortedByCoordinate --outSAMattributes NH HI AS NM MD --quantMode TranscriptomeSAM GeneCounts --sjdbGTFfile" followed by config["annotation"]
# Including any of the already set parameters will yield an error.
star_extra: '--outFilterMismatchNmax 20'

# Extra parameters to be passed to Hisat2.
# No additional parameters are passed, so this can include any of hisat2's optional parameters
hisat2_extra: '--mp 1,0'

Binary file added .test/data/WT1/WT1_test_R1.fastq.gz
Binary file not shown.
Binary file added .test/data/WT1/WT1_test_R2.fastq.gz
Binary file not shown.
Binary file added .test/data/WT2/WT2_test_R1.fastq.gz
Binary file not shown.
Binary file added .test/data/WT2/WT2_test_R2.fastq.gz
Binary file not shown.
Binary file added .test/data/WTctl/WTctl_test_R1.fastq.gz
Binary file not shown.
Binary file added .test/data/WTctl/WTctl_test_R2.fastq.gz
Binary file not shown.
Loading

0 comments on commit 80fc96d

Please sign in to comment.