Skip to content

Commit

Permalink
Merge pull request #23 from instituteofcancerresearch/R-parser
Browse files Browse the repository at this point in the history
Annotated input from VCF files
  • Loading branch information
bkmarzouk authored Nov 9, 2023
2 parents 7ee4a04 + 1426efb commit 2719fe4
Show file tree
Hide file tree
Showing 51 changed files with 254,841 additions and 493 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/dev_tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -31,8 +31,8 @@ jobs:
- name: Test conda environment
run: |
conda activate soprano-dev
pytest -s tests/test_configuration
pytest -s tests/test_installation.py
- name: Test units
run: |
conda activate soprano-dev
pytest -s tests/test_units
pytest -s tests/units
2 changes: 1 addition & 1 deletion .github/workflows/main_tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ jobs:
- name: Test conda environment
run: |
conda activate soprano-dev
pytest -s tests/test_configuration
pytest -s tests/test_installation.py
- name: Test units
run: |
conda activate soprano-dev
Expand Down
4 changes: 3 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ requires-python = ">=3.10"
dependencies = [
"pandas",
"numpy",
"streamlit == 1.27.0",
"streamlit >= 1.27.0",
"requests",
"types-requests",
"clint"
Expand All @@ -30,6 +30,7 @@ classifiers = [
"Programming Language :: Python :: 3.10",
"Programming Language :: Python :: 3.11",
"Programming Language :: Perl",
"Programming Language :: R",
"Topic :: Scientific/Engineering :: Bio-Informatics",
"Intended Audience :: Science/Research"
]
Expand Down Expand Up @@ -64,6 +65,7 @@ soprano-app = "SOPRANO.run:run_app"
soprano-get-genome = "SOPRANO.run:download_genome"
soprano-link-vep = "SOPRANO.run:link_vep_cache"
soprano-hla2ip = "SOPRANO.run:hla2pip"
soprano-annotate = "SOPRANO.run:annotate_vcfs"

[tool.hatch]
version.source = "vcs"
Expand Down
8 changes: 6 additions & 2 deletions setup.sh
Original file line number Diff line number Diff line change
Expand Up @@ -116,9 +116,14 @@ function activate_env() {
fi
if [ $? == 0 ]
then
echo "Installing SOPRANO python package"
echo "Installing SOPRANO Python packages"
eval "$_PIP_CMD"
fi
if [ $? == 0 ]
then
echo "Installing SOPRANO R packages"
Rscript "src/SOPRANO/R/pkgs.R"
fi
}

if command -v conda &> /dev/null
Expand All @@ -145,7 +150,6 @@ then
fi
fi


else
echo "Fatal: Conda not detected."
fi
Expand Down
192 changes: 192 additions & 0 deletions src/SOPRANO/R/parse_vcf.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,192 @@
library(magrittr)

# Command-line interface ----
# Build the option parser for this script. All three options take a single
# character value and are NULL when the flag is not supplied on the command
# line.
opt_parser <- optparse::OptionParser()

# Data directory containing vcf.gz files
opt_parser <- optparse::add_option(
  opt_parser, c("-d", "--dir"),
  type = "character", default = NULL,
  help = "Directory containing .vcf.gz files.", metavar = "character"
)

# Output filename
opt_parser <- optparse::add_option(
  opt_parser, c("-o", "--out"),
  type = "character", default = NULL,
  help = "Output file path.", metavar = "character"
)

# Location of sources for translating ensp and ref
opt_parser <- optparse::add_option(
  opt_parser, c("-t", "--trans"),
  type = "character", default = NULL,
  help = "Location of translator files.", metavar = "character"
)

# Parse inputs
args <- optparse::parse_args(opt_parser)

# Input validation ----
# Resolve and validate every path the script depends on before any heavy
# work starts, so that a bad invocation fails immediately.

# Directory holding the input *.vcf.gz files (required).
data_dir <- args$dir

if (is.null(data_dir)) {
  stop(
    "Data directory not defined. Flag -d | --dir <directory path> required"
  )
}
if (!dir.exists(data_dir)) {
  stop(paste("Data directory does not exist:", data_dir))
}

# Directory holding the auxiliary translator / reference files (required).
trans_dir <- args$trans

if (is.null(trans_dir)) {
  stop(
    "Translator directory not defined. Flag -t | --trans <dir path> required"
  )
}
if (!dir.exists(trans_dir)) {
  stop(paste("Translator directory does not exist:", trans_dir))
}

# Auxiliary files expected inside the translator directory.
ensp_2_enst_path <- file.path(trans_dir, "ENSP2ENST.txt")
variant_list_path <- file.path(trans_dir, "REF2VEP.txt")
covs_path <- file.path(trans_dir, "covariates_hg19_hg38_epigenome_pcawg.rda")
refdb_path <- file.path(
  trans_dir, "RefCDS_human_GRCh38_GencodeV18_recommended.rda"
)

# Report every missing auxiliary file at once rather than only the first.
aux_paths <- c(ensp_2_enst_path, variant_list_path, covs_path, refdb_path)
missing_aux <- aux_paths[!file.exists(aux_paths)]
if (length(missing_aux) > 0) {
  stop(
    paste("Auxiliary file not found:", paste(missing_aux, collapse = ", "))
  )
}

# Output file: defaults to "<data_dir>/<name of data_dir>.anno" when -o/--out
# is not given.
if (is.null(args$out)) {
  out_name <- file.path(data_dir, paste0(basename(data_dir), ".anno"))
} else {
  out_name <- args$out
}

# Read mutations from VCF files ----
vcf_files <- Sys.glob(file.path(data_dir, "*.vcf.gz"))

# Fail early with a clear message; otherwise an empty file list only surfaces
# later as an obscure error when the column names are assigned.
if (length(vcf_files) == 0) {
  stop(paste("No .vcf.gz files found in data directory:", data_dir))
}

# Sample IDs are the file names with the known pipeline suffix removed.
# basename() keeps the IDs independent of how the directory was specified,
# and fixed = TRUE matches the suffix literally (the dots are not wildcards).
vcf_names <- sub(
  "_TAIL_somatic_snvs_snpEff.ann.vcf.gz", "", basename(vcf_files),
  fixed = TRUE
)

# Load each file, apply vcfR::extract.indels() with its default arguments
# (which drops indel records) and keep only biallelic records.
vcf_data <- lapply(vcf_files, function(vcf) {
  v <- vcfR::read.vcfR(vcf)
  v <- vcfR::extract.indels(v)
  v[vcfR::is.biallelic(v), ]
})
names(vcf_data) <- vcf_names

# Create table with basic data from VCF.
# Only the fixed fields CHROM, POS, REF and ALT are needed downstream. The
# genotype fields were previously extracted as well but never used, so that
# step is omitted. All columns stay character, exactly as stored in the VCF
# fixed-field matrix.
vcf_columns <- c("CHROM", "POS", "REF", "ALT")
all_with_vaf <- lapply(vcf_data, function(vcf) {
  tibble::as_tibble(vcf@fix[, vcf_columns, drop = FALSE])
})

# Stack all samples; the list names become the sample identifier column.
df_all_with_vaf <- dplyr::bind_rows(all_with_vaf, .id = "name")
names(df_all_with_vaf) <- c(
  "sampleID", "chr", "position", "ref_allele", "alt_allele"
)

# Read transcript and annotation info ----
# Both translator tables are tab-delimited text files with a header row.
transcriptlist <- readr::read_delim(
  ensp_2_enst_path, delim = "\t", col_names = TRUE
)
variantlist <- readr::read_delim(
  variant_list_path, delim = "\t", col_names = TRUE
)

# The annotation step joins and selects on these columns; check for them here
# so that a malformed translator file is reported by name.
missing_cols <- setdiff(
  c("ProteinstableID", "TranscriptstableID"), names(transcriptlist)
)
if (length(missing_cols) > 0) {
  stop(paste(
    "Missing column(s) in", ensp_2_enst_path, ":",
    paste(missing_cols, collapse = ", ")
  ))
}
missing_cols <- setdiff(c("REF", "VEP"), names(variantlist))
if (length(missing_cols) > 0) {
  stop(paste(
    "Missing column(s) in", variant_list_path, ":",
    paste(missing_cols, collapse = ", ")
  ))
}

# See discussion:
# https://github.com/im3sanger/dndscv/issues/30#issuecomment-1000868593

# load() returns the names of the objects it created; make sure the expected
# `covs` object is among them rather than failing later with
# "object 'covs' not found".
loaded_objects <- load(covs_path)
if (!"covs" %in% loaded_objects) {
  stop(paste("Covariates file does not define a 'covs' object:", covs_path))
}


# dndscv bit... ----

# Run dndscv with GRCh38 defs... required a bit of tinkering
# Other genome need to download Rda file-> see dndscv tutorial website.

# Strip a leading "chr" from the chromosome names (e.g. "chr1" -> "1").
# NOTE(review): presumably the RefCDS reference uses unprefixed chromosome
# names -- confirm against the reference file. The pattern is anchored so
# that only the prefix is removed, never a "chr" occurring inside a name.
df_all_with_vaf$chr <- sub("^chr", "", df_all_with_vaf$chr)

# Only `annotmuts` from the result is used further down in this script.
# The per-gene and per-sample mutation limits are set to Inf, i.e. no
# mutations are excluded on the basis of those limits.
res1dnds <- dndscv::dndscv(
  mutations = df_all_with_vaf,
  outmats = TRUE,
  max_muts_per_gene_per_sample = Inf,
  max_coding_muts_per_sample = Inf,
  outp = 2,
  use_indel_sites = TRUE,
  min_indels = 1,
  refdb = refdb_path,
  cv = covs
)


## Build and write the annotated mutation table ----

# A run of one or more digits; used to pull the first number out of the
# amino-acid / nucleotide change strings and to blank it out with "/".
digit_run <- "[0-9]+"

# Annotated mutations joined with the transcript ID (via protein ID) and
# the VEP term (via the dndscv impact), reduced to the columns needed below.
annotated <- res1dnds$annotmuts %>%
  dplyr::as_tibble() %>%
  dplyr::left_join(transcriptlist, by = c("pid" = "ProteinstableID")) %>%
  dplyr::left_join(variantlist, by = c("impact" = "REF")) %>%
  dplyr::select(
    chr, pos, ref, mut, TranscriptstableID, VEP, aachange, ntchange, codonsub
  )

# Assemble the output columns:
#   id        "<chr>_<pos>_<ref>/<mut>"
#   col1-col5 constant placeholder columns ("NA" strings / "Transcript")
#   cdspos    first number found in ntchange
#   protpos   first number found in aachange
#   aachange2 aachange with every digit run replaced by "/"
#   codonsub2 codonsub with ">" replaced by "/"
# Rows without a transcript ID are dropped before writing.
df_ssb <- annotated %>%
  dplyr::mutate(
    id = paste(
      paste(chr, pos, sep = "_"),
      paste(ref, mut, sep = "/"),
      sep = "_"
    ),
    col1 = "NA",
    col2 = "NA",
    col3 = "NA",
    col4 = "Transcript",
    col5 = "NA",
    cdspos = stringr::str_extract(ntchange, digit_run),
    protpos = stringr::str_extract(aachange, digit_run),
    aachange2 = gsub(digit_run, "/", aachange),
    codonsub2 = gsub(">", "/", codonsub)
  ) %>%
  dplyr::select(
    id, col1, col2, col3, TranscriptstableID,
    col4, VEP, col5, cdspos, protpos, aachange2, codonsub2
  ) %>%
  dplyr::filter(!is.na(TranscriptstableID)) %>%
  as.data.frame()

# Tab-separated, no header, no row names, no quoting.
write.table(
  df_ssb,
  file = out_name, quote = FALSE, sep = "\t",
  col.names = FALSE, row.names = FALSE
)
29 changes: 29 additions & 0 deletions src/SOPRANO/R/pkgs.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
# CRAN packages to install (if missing) and attach.
package_list <- c(
  "devtools", "vcfR", "readr", "stringr", "tidyr", "optparse", "poilog",
  "plyr", "dplyr", "tibble", "magrittr", "MASS"
)

# Return the names of all packages found by installed.packages().
get_installed <- function() {
  installed.packages()[, "Package"]
}

# Install each missing package from CRAN, then attach it. library() raises an
# error when the package is still unavailable, so a failed installation stops
# the script instead of passing silently.
# The installed set is re-read on every iteration so that packages pulled in
# as dependencies of an earlier install are not installed a second time.
for (pkg in package_list) {
  if (!pkg %in% get_installed()) {
    # `character.only` is an argument of library(), not of install.packages(),
    # so it is not passed here.
    install.packages(pkg, repos = "https://cloud.r-project.org/")
  }
  library(pkg, character.only = TRUE)
}

# Packages installed from GitHub, given as "<owner>/<repository>".
gh_package_list <- c("im3sanger/dndscv")

for (gh_pkg in gh_package_list) {
  # The package name is taken to be the repository part of the spec.
  pkg <- strsplit(gh_pkg, "/", fixed = TRUE)[[1]][2]
  if (!pkg %in% get_installed()) {
    # dependencies = FALSE: the package's own dependencies are not installed
    # by this call. Fully qualified so the call does not rely on devtools
    # having been attached beforehand.
    devtools::install_github(gh_pkg, dependencies = FALSE)
  }
  # Errors if the package could not be installed or loaded.
  library(pkg, character.only = TRUE)
}
39 changes: 32 additions & 7 deletions src/SOPRANO/app.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,13 @@
import os
import pathlib

import streamlit as st
from streamlit.delta_generator import DeltaGenerator

from SOPRANO.core import objects
from SOPRANO.utils.app_utils import (
AnnotatorUIOptions,
AnnotatorUIProcessing,
DownloaderUIOptions,
DownloaderUIProcessing,
ImmunopeptidomesUIOptions,
Expand Down Expand Up @@ -187,13 +190,37 @@ def with_tab_genomes(tab: DeltaGenerator):
def with_tab_annotator(tab: DeltaGenerator):
with tab:
st.title("Annotate VCF File")
st.caption(
"Upload a VCF file to annotate for use in the SOPRANO pipeline."
st.markdown(
"Generate an annotated mutation file from a directory containing "
"VCF files suitable for consumption by SOPRANO."
)
st.file_uploader("Select a VCF file:", key="vcf_selection")

if st.button("Annotate", disabled=True):
RunTab.annotate()
vcf_dir_selection = st.text_input(
"Directory containing VCF files:", value=pathlib.Path.home()
)
vcf_dir_ready, vcf_dir_processed = AnnotatorUIProcessing.vcf_dir(
vcf_dir_selection
)

assembly_selection = st.selectbox(
"Genome reference assembly:",
options=AnnotatorUIOptions.genome_assembly(),
)
(
assembly_ready,
assembly_processed,
) = AnnotatorUIProcessing.genome_assembly(assembly_selection)

name_selection = st.text_input(
"Choose a name for the annotated output:"
)
name_ready, name = AnnotatorUIProcessing.output_name(name_selection)

if st.button(
"Annotate",
disabled=not (vcf_dir_ready and assembly_ready and name_ready),
):
RunTab.annotate(sources_dir=vcf_dir_processed, output_name=name)


def with_tab_info(tab: DeltaGenerator):
Expand Down Expand Up @@ -290,8 +317,6 @@ def with_tab_immunopeptidome(tab: DeltaGenerator):
transcripts_processed, subset_method_selected
)

st.header("Ensembl transcript selections")

st.markdown(
"Once you are happy with your immunopeptidome choices, "
"provide a name for the corresponding file, then click "
Expand Down
7 changes: 4 additions & 3 deletions src/SOPRANO/ci_linux.yml
Original file line number Diff line number Diff line change
Expand Up @@ -4,16 +4,17 @@ channels:
- anaconda
dependencies:
- perl
- bedtools=2.31.0
- bedtools>=2.31.0
- python>=3.10,<3.12
- pip
- pandas
- numpy
- streamlit==1.27.0
- streamlit>=1.27.0
- requests
- types-requests
- clint
- black
- pytest
- pytest-dependency
- ruff
- ruff
- r-base>=4.0.0
Loading

0 comments on commit 2719fe4

Please sign in to comment.