Merge pull request #23 from instituteofcancerresearch/R-parser

Annotated input from VCF files
instituteofcancerresearch · Nov 9, 2023 · 2719fe4 · 2719fe4
2 parents 7ee4a04 + 1426efb
commit 2719fe4
Show file tree

Hide file tree

Showing 51 changed files with 254,841 additions and 493 deletions.
diff --git a/.github/workflows/dev_tests.yml b/.github/workflows/dev_tests.yml
@@ -31,8 +31,8 @@ jobs:
       - name: Test conda environment
         run: |
           conda activate soprano-dev
-          pytest -s tests/test_configuration
+          pytest -s tests/test_installation.py
       - name: Test units
         run: |
           conda activate soprano-dev
-          pytest -s tests/test_units
+          pytest -s tests/units
diff --git a/.github/workflows/main_tests.yml b/.github/workflows/main_tests.yml
@@ -35,7 +35,7 @@ jobs:
       - name: Test conda environment
         run: |
           conda activate soprano-dev
-          pytest -s tests/test_configuration
+          pytest -s tests/test_installation.py
       - name: Test units
         run: |
           conda activate soprano-dev

diff --git a/pyproject.toml b/pyproject.toml
@@ -18,7 +18,7 @@ requires-python = ">=3.10"
 dependencies = [
     "pandas",
     "numpy",
-    "streamlit == 1.27.0",
+    "streamlit >= 1.27.0",
     "requests",
     "types-requests",
     "clint"
@@ -30,6 +30,7 @@ classifiers = [
     "Programming Language :: Python :: 3.10",
     "Programming Language :: Python :: 3.11",
     "Programming Language :: Perl",
+    "Programming Language :: R",
     "Topic :: Scientific/Engineering :: Bio-Informatics",
     "Intended Audience :: Science/Research"
 ]
@@ -64,6 +65,7 @@ soprano-app = "SOPRANO.run:run_app"
 soprano-get-genome = "SOPRANO.run:download_genome"
 soprano-link-vep = "SOPRANO.run:link_vep_cache"
 soprano-hla2ip = "SOPRANO.run:hla2pip"
+soprano-annotate = "SOPRANO.run:annotate_vcfs"
 
 [tool.hatch]
 version.source = "vcs"

diff --git a/setup.sh b/setup.sh
@@ -116,9 +116,14 @@ function activate_env() {
   fi
   if [ $? == 0 ]
   then
-    echo "Installing SOPRANO python package"
+    echo "Installing SOPRANO Python packages"
     eval "$_PIP_CMD"
   fi
+  if [ $? == 0 ]
+  then
+    echo "Installing SOPRANO R packages"
+    Rscript "src/SOPRANO/R/pkgs.R"
+  fi
 }
 
 if command -v conda &> /dev/null
@@ -145,7 +150,6 @@ then
     fi
   fi
 
-
 else
   echo "Fatal: Conda not detected."
 fi

diff --git a/src/SOPRANO/R/parse_vcf.R b/src/SOPRANO/R/parse_vcf.R
@@ -0,0 +1,192 @@
+library(magrittr)
+
+# Command-line interface ----
+# Three character options are declared: -d/--dir, -o/--out and -t/--trans.
+# Every option defaults to NULL, so a flag that is not supplied is NULL in
+# `args`; the script checks for that after parsing.
+opt_parser <- optparse::OptionParser()
+
+# Data directory containing vcf.gz files
+opt_parser <- optparse::add_option(
+  opt_parser, c("-d", "--dir"),
+  type = "character", default = NULL,
+  help = "Directory containing .vcf.gz files.", metavar = "character"
+)
+
+# Output filename
+opt_parser <- optparse::add_option(
+  opt_parser, c("-o", "--out"),
+  type = "character", default = NULL,
+  help = "Output file path.", metavar = "character"
+)
+
+# Location of sources for translating ensp and ref
+opt_parser <- optparse::add_option(
+  opt_parser, c("-t", "--trans"),
+  type = "character", default = NULL,
+  help = "Location of translator files.", metavar = "character"
+)
+
+# Parse inputs
+args <- optparse::parse_args(opt_parser)
+
+# Validate inputs ----
+
+# Directory holding the input .vcf.gz files (required).
+data_dir <- args$dir
+
+if (is.null(data_dir)) {
+  stop("Data directory not defined. Flag -d | --dir <directory path> required")
+}
+if (!dir.exists(data_dir)) {
+  stop(paste("Data directory does not exist:", data_dir))
+}
+
+# Directory holding the translator / reference files (required).
+trans_dir <- args$trans
+
+if (is.null(trans_dir)) {
+  stop(
+    "Translator directory not defined. ",
+    "Flag -t | --trans <dir path> required"
+  )
+}
+if (!dir.exists(trans_dir)) {
+  stop(paste("Translator directory does not exist:", trans_dir))
+}
+
+# Auxiliary files expected inside the translator directory.
+ensp_2_enst_path <- file.path(trans_dir, "ENSP2ENST.txt")
+variant_list_path <- file.path(trans_dir, "REF2VEP.txt")
+covs_path <- file.path(trans_dir, "covariates_hg19_hg38_epigenome_pcawg.rda")
+refdb_path <- file.path(
+  trans_dir, "RefCDS_human_GRCh38_GencodeV18_recommended.rda"
+)
+
+# Fail early and list every missing auxiliary file in one message, so the
+# user does not have to fix them one run at a time.
+aux_paths <- c(ensp_2_enst_path, variant_list_path, covs_path, refdb_path)
+missing_aux <- aux_paths[!file.exists(aux_paths)]
+if (length(missing_aux) > 0) {
+  stop(paste("Auxiliary file not found:", paste(missing_aux, collapse = ", ")))
+}
+
+# Resolve the output path: an explicit -o/--out value is used as given;
+# otherwise the file goes inside the data directory as "<dir name>.anno".
+out_name <- if (!is.null(args$out)) {
+  args$out
+} else {
+  file.path(data_dir, paste0(basename(data_dir), ".anno"))
+}
+
+# Read mutations from VCF files ----
+vcf_files <- Sys.glob(file.path(data_dir, "*.vcf.gz"))
+if (length(vcf_files) == 0) {
+  stop(paste("No .vcf.gz files found in data directory:", data_dir))
+}
+
+# Sample IDs are the file names with the snpEff pipeline suffix removed.
+# fixed = TRUE stops the dots in the suffix acting as regex wildcards, and
+# basename() keeps directory components out of the ID.
+vcf_names <- sub(
+  "_TAIL_somatic_snvs_snpEff.ann.vcf.gz", "", basename(vcf_files),
+  fixed = TRUE
+)
+
+# Per file: read, pass through extract.indels() (called with its default
+# arguments, which return the object without indel records - see the vcfR
+# docs), then keep only the biallelic records.
+vcf_data <- lapply(vcf_files, function(vcf_path) {
+  vcf <- vcfR::read.vcfR(vcf_path)
+  vcf <- vcfR::extract.indels(vcf)
+  vcf[vcfR::is.biallelic(vcf), ]
+})
+names(vcf_data) <- vcf_names
+
+# Create table with basic data from VCF. Only the fixed fields are used, so
+# the genotype section is not extracted. All columns stay character, as read
+# from the VCF.
+mutation_tables <- lapply(vcf_data, function(vcf) {
+  dplyr::select(
+    tibble::as_tibble(vcf@fix),
+    chr = CHROM, position = POS, ref_allele = REF, alt_allele = ALT
+  )
+})
+
+# Stack all samples; the list names become the leading sampleID column.
+df_all_with_vaf <- dplyr::bind_rows(mutation_tables, .id = "sampleID")
+
+# Read transcript and annotation info ----
+# Both translator files are tab-delimited with a header row.
+transcriptlist <- readr::read_delim(
+  ensp_2_enst_path, delim = "\t", col_names = TRUE
+)
+variantlist <- readr::read_delim(
+  variant_list_path, delim = "\t", col_names = TRUE
+)
+
+# See discussion:
+# https://github.com/im3sanger/dndscv/issues/30#issuecomment-1000868593
+
+# load() returns the names of the objects it restored; make sure the
+# covariates object used below is really among them.
+loaded_objects <- load(covs_path)
+if (!"covs" %in% loaded_objects) {
+  stop(paste("Covariates file does not define a 'covs' object:", covs_path))
+}
+
+# dndscv bit...
+
+# Run dndscv with GRCh38 defs... required a bit of tinkering
+# Other genome need to download Rda file-> see dndscv tutorial website.
+
+# Drop a leading "chr" from chromosome names (e.g. "chr1" -> "1"). The
+# pattern is anchored so only the prefix is ever removed.
+# NOTE(review): presumably so names match the RefCDS database - confirm.
+df_all_with_vaf$chr <- sub("^chr", "", as.vector(df_all_with_vaf$chr))
+
+# Per-sample mutation caps are disabled (Inf). Only `annotmuts` from the
+# result is used further down in this script.
+res1dnds <- dndscv::dndscv(
+  mutations = df_all_with_vaf,
+  outmats = TRUE,
+  max_muts_per_gene_per_sample = Inf,
+  max_coding_muts_per_sample = Inf,
+  outp = 2,
+  use_indel_sites = TRUE,
+  min_indels = 1,
+  refdb = refdb_path,
+  cv = covs
+)
+
+
+## Get table for annotated mutations
+annotation <- dplyr::as_tibble(res1dnds$annotmuts)
+
+## Join protein ID with transcript ID, and `impact` with its `VEP` entry
+annot1 <- annotation %>%
+  dplyr::left_join(transcriptlist, by = c("pid" = "ProteinstableID")) %>%
+  dplyr::left_join(variantlist, by = c("impact" = "REF")) %>%
+  dplyr::select(
+    chr, pos, ref, mut, TranscriptstableID, VEP, aachange, ntchange, codonsub
+  )
+
+## Parse data
+number <- "[0-9]+"
+
+# Derived columns are computed from the piped table's own columns rather
+# than from a separate copy, so they cannot fall out of row alignment.
+#   codonsub2 / aachange2: ">" or the run of digits replaced by "/"
+#   protpos / cdspos:      first run of digits in aachange / ntchange
+#   id:                    "<chr>_<pos>_<ref>/<mut>"
+annot2 <- annot1 %>%
+  dplyr::mutate(
+    codonsub2 = gsub(">", "/", codonsub),
+    aachange2 = gsub(number, "/", aachange),
+    protpos = stringr::str_extract(aachange, number),
+    cdspos = stringr::str_extract(ntchange, number)
+  ) %>%
+  tidyr::unite(idtmp, c("chr", "pos"), sep = "_") %>%
+  tidyr::unite(change, c("ref", "mut"), sep = "/") %>%
+  tidyr::unite(id, c("idtmp", "change"), sep = "_")
+
+# Lay out the 12 output columns. col1-col3 and col5 are literal "NA"
+# placeholders and col4 is the constant "Transcript". Rows with no matching
+# transcript ID are dropped.
+df_ssb <- annot2 %>%
+  dplyr::filter(!is.na(TranscriptstableID)) %>%
+  dplyr::mutate(
+    col1 = "NA", col2 = "NA", col3 = "NA", col4 = "Transcript", col5 = "NA"
+  ) %>%
+  dplyr::select(
+    id, col1, col2, col3, TranscriptstableID,
+    col4, VEP, col5, cdspos, protpos, aachange2, codonsub2
+  ) %>%
+  as.data.frame()
+
+# Tab-separated, no header, no row names, no quoting.
+write.table(
+  df_ssb,
+  file = out_name, quote = FALSE, sep = "\t",
+  col.names = FALSE, row.names = FALSE
+)
diff --git a/src/SOPRANO/R/pkgs.R b/src/SOPRANO/R/pkgs.R
@@ -0,0 +1,29 @@
+# Install (when missing) and load the SOPRANO R packages.
+# library() is called for every package, so a package that is still
+# unavailable after the install attempt stops the script with an error.
+
+# Packages installed from the CRAN mirror below.
+package_list <- c(
+  "devtools", "vcfR", "readr", "stringr", "tidyr", "optparse", "poilog",
+  "plyr", "dplyr", "tibble", "magrittr", "MASS"
+)
+
+# Packages installed from GitHub, given as "<owner>/<repo>".
+gh_package_list <- c("im3sanger/dndscv")
+
+# TRUE when `pkg` is found on the current library paths. system.file()
+# returns "" for a missing package and avoids scanning every installed
+# package the way installed.packages() does.
+is_installed <- function(pkg) {
+  nzchar(system.file(package = pkg))
+}
+
+for (pkg in package_list) {
+  if (!is_installed(pkg)) {
+    install.packages(pkg, repos = "https://cloud.r-project.org/")
+  }
+  library(pkg, character.only = TRUE)
+}
+
+for (gh_pkg in gh_package_list) {
+  pkg <- basename(gh_pkg) # repository name, e.g. "dndscv"
+  if (!is_installed(pkg)) {
+    devtools::install_github(gh_pkg, dependencies = FALSE)
+  }
+  library(pkg, character.only = TRUE)
+}
diff --git a/src/SOPRANO/app.py b/src/SOPRANO/app.py
@@ -1,10 +1,13 @@
 import os
+import pathlib
 
 import streamlit as st
 from streamlit.delta_generator import DeltaGenerator
 
 from SOPRANO.core import objects
 from SOPRANO.utils.app_utils import (
+    AnnotatorUIOptions,
+    AnnotatorUIProcessing,
     DownloaderUIOptions,
     DownloaderUIProcessing,
     ImmunopeptidomesUIOptions,
@@ -187,13 +190,37 @@ def with_tab_genomes(tab: DeltaGenerator):
 def with_tab_annotator(tab: DeltaGenerator):
     with tab:
         st.title("Annotate VCF File")
-        st.caption(
-            "Upload a VCF file to annotate for use in the SOPRANO pipeline."
+        st.markdown(
+            "Generate an annotated mutation file from a directory containing "
+            "VCF files suitable for consumption by SOPRANO."
         )
-        st.file_uploader("Select a VCF file:", key="vcf_selection")
 
-        if st.button("Annotate", disabled=True):
-            RunTab.annotate()
+        vcf_dir_selection = st.text_input(
+            "Directory containing VCF files:", value=pathlib.Path.home()
+        )
+        vcf_dir_ready, vcf_dir_processed = AnnotatorUIProcessing.vcf_dir(
+            vcf_dir_selection
+        )
+
+        assembly_selection = st.selectbox(
+            "Genome reference assembly:",
+            options=AnnotatorUIOptions.genome_assembly(),
+        )
+        (
+            assembly_ready,
+            assembly_processed,
+        ) = AnnotatorUIProcessing.genome_assembly(assembly_selection)
+
+        name_selection = st.text_input(
+            "Choose a name for the annotated output:"
+        )
+        name_ready, name = AnnotatorUIProcessing.output_name(name_selection)
+
+        if st.button(
+            "Annotate",
+            disabled=not (vcf_dir_ready and assembly_ready and name_ready),
+        ):
+            RunTab.annotate(sources_dir=vcf_dir_processed, output_name=name)
 
 
 def with_tab_info(tab: DeltaGenerator):
@@ -290,8 +317,6 @@ def with_tab_immunopeptidome(tab: DeltaGenerator):
             transcripts_processed, subset_method_selected
         )
 
-        st.header("Ensembl transcript selections")
-
         st.markdown(
             "Once you are happy with your immunopeptidome choices, "
             "provide a name for the corresponding file, then click "

diff --git a/src/SOPRANO/ci_linux.yml b/src/SOPRANO/ci_linux.yml
@@ -4,16 +4,17 @@ channels:
   - anaconda
 dependencies:
   - perl
-  - bedtools=2.31.0
+  - bedtools>=2.31.0
   - python>=3.10,<3.12
   - pip
   - pandas
   - numpy
-  - streamlit==1.27.0
+  - streamlit>=1.27.0
   - requests
   - types-requests
   - clint
   - black
   - pytest
   - pytest-dependency
-  - ruff
+  - ruff
+  - r-base>=4.0.0