Merge pull request #9 from simonlabcode/featurecount

bam2bakR version 3.0.0
simonlabcode · Feb 1, 2024 · 80fc96d · 80fc96d
2 parents c02775c + 0aa3b4d
commit 80fc96d
Show file tree

Hide file tree

Showing 55 changed files with 783,024 additions and 1,182 deletions.
diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
@@ -0,0 +1,57 @@
+name: Tests
+
+# Triggers: pushes to the two maintained branches, plus every pull request.
+on:
+  push:
+    branches:
+      - main
+      - featurecount
+  # No branch filter: run for pull requests targeting any branch.
+  # (GitHub's filter keys are `branches` / `branches-ignore`; an empty
+  # ignore list is equivalent to having no filter at all.)
+  pull_request:
+
+jobs:
+  # Style gate: runs super-linter with the snakefmt validator switched on.
+  formatting:
+    runs-on: ubuntu-latest
+    steps:
+    - name: Checkout with submodules
+      uses: actions/checkout@v3
+      with:
+        # fetch-depth 0 requests the full history rather than a shallow clone.
+        fetch-depth: 0
+        submodules: recursive
+    - name: Formatting
+      uses: github/super-linter@v5
+      env:
+        # NOTE(review): with VALIDATE_ALL_CODEBASE off, files to lint are
+        # presumably selected relative to DEFAULT_BRANCH -- confirm that
+        # `featurecount` is still the intended base branch.
+        DEFAULT_BRANCH: featurecount
+        GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+        VALIDATE_ALL_CODEBASE: false
+        VALIDATE_SNAKEMAKE_SNAKEFMT: true
+  # Static check: runs `snakemake --lint` on the workflow with the STAR test config.
+  linting:
+    runs-on: ubuntu-latest
+    steps:
+    - uses: actions/checkout@v3
+    - name: Linting
+      # NOTE(review): action reference restored from an obfuscated
+      # "[email protected]" placeholder; the exact version pin could not be
+      # recovered -- confirm v1.24.0 against the repository.
+      uses: snakemake/snakemake-github-action@v1.24.0
+      with:
+        directory: .test
+        snakefile: workflow/Snakefile
+        args: "--configfile .test/configs/star_config.yaml --lint"
+
+  # End-to-end runs of the pipeline from the .test directory; starts only
+  # after both the `linting` and `formatting` jobs have succeeded.
+  run-workflow:
+    runs-on: ubuntu-latest
+    needs:
+      - linting
+      - formatting
+    steps:
+    - name: Checkout repository
+      uses: actions/checkout@v3
+    # NOTE(review): both action references below were restored from an
+    # obfuscated "[email protected]" placeholder; the exact version pin could
+    # not be recovered -- confirm v1.24.0 against the repository.
+    # Fastq input, aligned with STAR (star_config.yaml).
+    - name: Test workflow (star)
+      uses: snakemake/snakemake-github-action@v1.24.0
+      with:
+        directory: .test
+        snakefile: workflow/Snakefile
+        args: "--configfile .test/configs/star_config.yaml --use-conda --show-failed-logs --cores 2 --conda-cleanup-pkgs cache"
+    # BAM input, alignment skipped (bam2bakR.yaml).
+    - name: Test workflow (bam)
+      uses: snakemake/snakemake-github-action@v1.24.0
+      with:
+        directory: .test
+        snakefile: workflow/Snakefile
+        args: "--configfile .test/configs/bam2bakR.yaml --use-conda --show-failed-logs --cores 2 --conda-cleanup-pkgs cache"
diff --git a/.readthedocs.yaml b/.readthedocs.yaml
diff --git a/.snakemake-workflow-catalog.yml b/.snakemake-workflow-catalog.yml
@@ -0,0 +1,9 @@
+# Metadata read by the Snakemake workflow catalog.
+usage:
+  # Optional: additional flags users must pass. `desc` is a few sentences
+  # describing the flags (inserted below the example commands); `flags`
+  # holds the flags themselves. Both are currently unset.
+  mandatory-flags:
+    desc: null
+    flags: null
+  # Software deployment methods (at least one of conda, singularity, or
+  # singularity+conda).
+  software-stack-deployment:
+    conda: true  # pipeline works with --use-conda
+    singularity: true  # pipeline works with --use-singularity
+    singularity+conda: true  # pipeline works with --use-singularity --use-conda
+  # Set to true to confirm that 'snakemake --report report.zip' generates a
+  # report containing all results and explanations.
+  report: false
diff --git a/.test/configs/bam2bakR.yaml b/.test/configs/bam2bakR.yaml
@@ -0,0 +1,113 @@
+##### Parameters you will likely have to change #####
+
+# Run bam2bakR only? If true, will expect paths to bam files as input and alignment steps will be skipped. If false,
+# paths to directories containing fastq files will be expected as input.
+bam2bakr: True
+
+# path to directory containing fastq files if bam2bakr is False
+# path to bam files if bam2bakr is True
+## example of what samples could look like for bam input
+samples:
+  WT_1: data/bams/WT_replicate_1.bam
+  WT_2: data/bams/WT_replicate_2.bam
+  WT_ctl: data/bams/WT_nos4U.bam
+
+
+## example of what samples will look like for fastq input
+# samples:
+#   WT_1: data/fastq/WT_1
+#   WT_2: data/fastq/WT_2
+#   WT_ctl: data/fastq/WT_ctl
+#   KO_1: data/fastq/KO_1
+#   KO_2: data/fastq/KO_2
+#   KO_ctl: data/fastq/KO_ctl
+
+# location of annotation gtf file
+annotation: data/annotation/GRCh38.gtf
+
+# -s4U control sample IDs. Only WT_ctl is listed because it is the only
+# control defined in the active `samples` mapping above (KO_ctl appears
+# only in the commented-out fastq example).
+control_samples: ['WT_ctl']
+
+# location of genome fasta file
+genome_fasta: data/genome/GRCh38.fa
+
+##### Parameters that are always relevant #####
+
+# Format of reads
+FORMAT: "PE" # (PE, SE, NU)
+                    # [SE - single end reads]
+                    # [NU - including non-unique] (not tested)
+
+
+# Strandedness of reads
+strandedness: "R" # (F, R); F means read 1 represents original RNA sequence (vs. its reverse complement).
+
+
+# Type of browser tracks to generate
+mut_tracks: "TC" # ("TC", "GA", "TC,GA")
+
+# Minimum base quality to call mutation
+minqual: 40
+
+# String common to spike-in gene_ids in annotation gtf
+  # If you have no spike-ins, then this should be "\"\"", i.e., an empty string ("")
+spikename: "\"\""
+
+# If True, tracks will be normalized
+normalize: True
+
+# Are you using the Windows subsystem for linux? 0 = Yes, 1 = No
+WSL: 1
+
+# Are there jI and jM tags in your bam file?
+remove_tags: False
+
+
+##### Parameters that are only relevant if bam2bakr is False #####
+
+# location of hisat2 indices (directory containing indices)
+HISAT2: data/hisat2/grch38_tran/
+
+# location of hisat3n indices (path to and common prefix of all .ht2 files)
+HISAT_3N: data/hisat_3n/GRCh38
+
+# location of STAR indices
+STAR_index: data/star/
+
+# If TRUE, hisat-3n will be used for alignment
+use_hisat3n: False
+
+# If TRUE, STAR will be used for alignment; if use_hisat3n is also TRUE, hisat-3n will be used instead of STAR
+use_star: False
+
+# If TRUE, index will be built for star and directory will be created at path set in STAR_index parameter
+build_star: False
+
+# If use_hisat3n TRUE, then this specifies path to hisat_3n executable
+hisat3n_path: hisat-3n
+  # If hisat-3n is on your PATH, this will just be hisat-3n
+
+# Add a 'chr' to each chromosome number during alignment. [Useful when aligner index is number-based, but GTF annotation is chr-based]
+chr_tag: True
+
+# Are you on Yale's HPC where hisat-3n can be loaded as a module using Lmod system?
+Yale: False
+
+# Are you using a flattened annotation from FlatStacks?
+flattened: False
+
+# code specifying adapters to be trimmed that will be passed to cutadapt
+adapter: "-a AGATCGGAAGAGC -A AGATCGGAAGAGC"
+
+# Optional code to pass to cutadapt
+cutadapt_extra: "--minimum-length 20"
+  # No additional parameters are passed, so this can include any of cutadapt's optional parameters
+
+# Extra parameters to be passed to STAR
+star_extra: "--outFilterMismatchNmax 20"
+  # Already passing: "--outSAMtype BAM SortedByCoordinate --outSAMattributes NH HI AS NM MD --quantMode TranscriptomeSAM GeneCounts --sjdbGTFfile <annotation>", where <annotation> is config["annotation"]
+  # Including any of the already set parameters will yield an error.
+
+# Extra parameters to be passed to Hisat2
+hisat2_extra: "--mp 1,0"
+  # No additional parameters are passed, so this can include any of hisat2's optional parameters
diff --git a/.test/configs/star_config.yaml b/.test/configs/star_config.yaml
@@ -0,0 +1,126 @@
+####### GENERAL INFORMATION ABOUT THIS CONFIG #######
+#
+# This config file allows you to specify a number of important pieces of information that
+# the fastq2EZbakR pipeline will require to run. It also allows you to set optional parameters
+# for all tools that fastq2EZbakR makes use of.
+#
+# File paths can be either absolute (e.g., ~/path/to/file/or/directory) or relative
+# to the directory from which you call the pipeline (e.g., data/fastq/WT_1 in the
+# example samples entry refers to the data directory inside the directory
+# where you ran `snakemake`).
+#
+####### PARAMETERS YOU NEED TO SET #######
+
+# Run bam2bakR only? If true, will expect paths to bam files as input and alignment steps will be skipped. If false,
+# paths to directories containing fastq files will be expected as input.
+bam2bakr: False
+
+## Paths to data to process
+# path to directory containing fastq files if bam2bakr is False
+  # fastq files can be either gzipped or unzipped
+  # Each set of fastq files must be in a different directory
+# path to bam files if bam2bakr is True
+samples:
+  WT_1: data/WT1
+  WT_2: data/WT2
+  WT_ctl: data/WTctl
+
+## example of what samples will look like for fastq input
+# samples:
+#   WT_1: data/fastq/WT_1
+#   WT_2: data/fastq/WT_2
+#   WT_ctl: data/fastq/WT_ctl
+#   KO_1: data/fastq/KO_1
+#   KO_2: data/fastq/KO_2
+#   KO_ctl: data/fastq/KO_ctl
+
+# location of annotation gtf file
+annotation: data/annotation/genome.gtf
+
+# -s4U control sample IDs
+control_samples: ['WT_ctl']
+
+# location of genome fasta file
+genome_fasta: data/genome/genome.fasta
+
+##### Parameters that are always relevant #####
+
+# Format of reads
+FORMAT: "PE" # (PE, SE, NU)
+                    # [SE - single end reads]
+                    # [NU - including non-unique] (not tested)
+
+
+# Strandedness of reads
+strandedness: "R" # (F, R); F means read 1 represents original RNA sequence (vs. its reverse complement).
+
+
+# Type of browser tracks to generate
+mut_tracks: "TC" # ("TC", "GA", "TC,GA")
+
+# Minimum base quality to call mutation
+minqual: 40
+
+# String common to spike-in gene_ids in annotation gtf
+  # If you have no spike-ins, then this should be "\"\"", i.e., an empty string ("")
+spikename: "\"\""
+
+# If True, tracks will be normalized
+normalize: True
+
+# Are you using the Windows subsystem for linux? 0 = Yes, 1 = No
+WSL: 1
+
+# Are there jI and jM tags in your bam file?
+remove_tags: False
+
+
+##### Parameters that are only relevant if bam2bakr is False #####
+
+# location of hisat2 indices (directory containing indices)
+HISAT2: data/hisat2/
+
+# location of hisat3n indices (path to and common prefix of all .ht2 files)
+HISAT_3N: data/hisat_3n/GRCh38
+
+# location of STAR indices
+STAR_index: data/star/
+
+# If TRUE, hisat-3n will be used for alignment
+use_hisat3n: False
+
+# If TRUE, STAR will be used for alignment; if use_hisat3n is also TRUE, hisat-3n will be used instead of STAR
+use_star: True
+
+# If TRUE, index will be built for star and directory will be created at path set in STAR_index parameter
+build_star: True
+
+# If use_hisat3n TRUE, then this specifies path to hisat_3n executable
+hisat3n_path: hisat-3n
+  # If hisat-3n is on your PATH, this will just be hisat-3n
+
+# Add a 'chr' to each chromosome number during alignment. [Useful when aligner index is number-based, but GTF annotation is chr-based]
+chr_tag: True
+
+# Are you on Yale's HPC where hisat-3n can be loaded as a module using Lmod system?
+Yale: False
+
+# Are you using a flattened annotation from FlatStacks?
+flattened: False
+
+# code specifying adapters to be trimmed that will be passed to cutadapt
+adapter: "-a AGATCGGAAGAGC -A AGATCGGAAGAGC"
+
+# Optional code to pass to cutadapt
+cutadapt_extra: "--minimum-length 20"
+  # No additional parameters are passed, so this can include any of cutadapt's optional parameters
+
+# Extra parameters to be passed to STAR
+star_extra: "--outFilterMismatchNmax 20"
+  # Already passing: "--outSAMtype BAM SortedByCoordinate --outSAMattributes NH HI AS NM MD --quantMode TranscriptomeSAM GeneCounts --sjdbGTFfile <annotation>", where <annotation> is config["annotation"]
+  # Including any of the already set parameters will yield an error.
+
+# Extra parameters to be passed to Hisat2
+hisat2_extra: "--mp 1,0"
+  # No additional parameters are passed, so this can include any of hisat2's optional parameters
+
diff --git a/.test/data/WT1/WT1_test_R1.fastq.gz b/.test/data/WT1/WT1_test_R1.fastq.gz
diff --git a/.test/data/WT1/WT1_test_R2.fastq.gz b/.test/data/WT1/WT1_test_R2.fastq.gz
diff --git a/.test/data/WT2/WT2_test_R1.fastq.gz b/.test/data/WT2/WT2_test_R1.fastq.gz
diff --git a/.test/data/WT2/WT2_test_R2.fastq.gz b/.test/data/WT2/WT2_test_R2.fastq.gz
diff --git a/.test/data/WTctl/WTctl_test_R1.fastq.gz b/.test/data/WTctl/WTctl_test_R1.fastq.gz
diff --git a/.test/data/WTctl/WTctl_test_R2.fastq.gz b/.test/data/WTctl/WTctl_test_R2.fastq.gz