Skip to content

Commit

Permalink
fix bug causing hlaToAAVariation infer 2-digit alleles amino acid seq…
Browse files Browse the repository at this point in the history
…uence

now inferrence missing alleles is done at the pre-parsing step
  • Loading branch information
migdal committed Apr 29, 2021
1 parent da0d5f2 commit ec29296
Show file tree
Hide file tree
Showing 37 changed files with 78 additions and 82 deletions.
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
Package: midasHLA
Title: R package for immunogenomics data handling and association analysis
Version: 0.99.31
Version: 0.99.33
Authors@R: c(
person("Christian", "Hammer", email = "[email protected]", role = "aut"),
person("Maciej", "Migdał", email = "[email protected]", role = c("aut", "cre")))
Expand Down
4 changes: 4 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
# 0.99.33 - 28/04/21
+ fixed bug causing hlaToAAVariation infer 2-digit alleles amino acid sequences.
Now 2-digit alleles are treated as NA.

# 0.99.31 - 30/03/21
+ removed unneeded dependencies from package tutorial
+ in the tutorial, external packages are now called explicitly
Expand Down
26 changes: 11 additions & 15 deletions R/parsingFunctions.R
Original file line number Diff line number Diff line change
Expand Up @@ -91,9 +91,15 @@ readHlaCalls <- function(file,
#' set to \code{NULL}, \code{file} parameter is used instead and alignment is
#' read from the provided file. In EBI database alignments for DRB1, DRB3, DRB4
#' and DRB5 genes are provided as a single file, here they are separated.
#' Additionally, were possible sequences for alleles not present in the alignments have been
#' inferred based on higher resolution alleles. To this end we have reduced
#' alleles to 6 and 4 digit resolution and took consensus sequence to represent
#' missing alleles. Positions for which there was no full agreement were marked
#' as unknown.
#'
#'
#'
#' @inheritParams readHlaCalls
#' @inheritParams reduceAlleleResolution
#' @param gene Character vector of length one specifying the name of a gene for
#' which alignment is required. See details for further explanations.
#' @param trim Logical indicating if alignment should be trimmed to start codon
Expand All @@ -112,19 +118,17 @@ readHlaCalls <- function(file,
#' @examples
#' hla_alignments <- readHlaAlignments(gene = "A")
#'
#' @importFrom assertthat assert_that is.count is.readable is.string
#' @importFrom assertthat assert_that is.count is.readable is.string see_if
#' @importFrom stringi stri_flatten stri_split_regex stri_sub
#' @importFrom stringi stri_subset_fixed stri_read_lines stri_detect_regex
#' @export
readHlaAlignments <- function(file,
gene = NULL,
trim = FALSE,
unkchar = "",
resolution = 8) {
unkchar = "") {
assert_that(
isTRUEorFALSE(trim),
is.string(unkchar),
is.count(resolution)
is.string(unkchar)
)

if (is.null(gene)) {
Expand Down Expand Up @@ -238,7 +242,7 @@ readHlaAlignments <- function(file,
)
)

cached_aln_obj <- readRDS(file) # list(readHlaAlignments(file, trim = FALSE, unkchar = "*", resolution = 8), first_codon_idx)
cached_aln_obj <- readRDS(file) # list(readHlaAlignments(file, trim = FALSE, unkchar = "*"), first_codon_idx)
aln <- cached_aln_obj[[1]]

# discard aa '5 to start codon of mature protein
Expand All @@ -251,14 +255,6 @@ readHlaAlignments <- function(file,
# substitute unkchar
aln[aln == "*"] <- unkchar

# reduce alignment matrix to selected resolution
allele_numbers <- reduceAlleleResolution(rownames(aln),
resolution = resolution
)
unique_numbers <- ! duplicated(allele_numbers)
aln <- aln[unique_numbers, , drop = FALSE]
rownames(aln) <- allele_numbers[unique_numbers]

return(aln)
}

Expand Down
1 change: 0 additions & 1 deletion R/summarise.R
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,6 @@ summariseAAPosition <- function(hla_calls,
alleles_wo_na <- reduceAlleleResolution(alleles_wo_na, hla_resolution)
aln <- readHlaAlignments(
gene = gsub("_", "", gene),
resolution = hla_resolution,
unkchar = "*"
)

Expand Down
40 changes: 19 additions & 21 deletions R/transformationFunctions.R
Original file line number Diff line number Diff line change
Expand Up @@ -75,26 +75,25 @@ hlaToAAVariation <- function(hla_calls,
gene_names_uniq <- gene_names_uniq[av_genes_idx]
}

# read alignment matrices in all resolutions
hla_aln <- lapply(X = gene_names_uniq,
FUN = function(x) {
alns <- lapply(
X = c(2, 4, 6, 8),
FUN = function(res) {
readHlaAlignments(
gene = x,
resolution = res,
unkchar = "*"
)
}
)
aln <- do.call(rbind, alns)
aln <- aln[! duplicated(rownames(aln)), ]

return(aln)
}
# read alignment matrices
hla_aln <- lapply(
X = gene_names_uniq,
FUN = function(x) {
readHlaAlignments(gene = x,
unkchar = "*")
}
)

# helper function
truncVector <- function(vec, len, append) {
if (length(vec) > len) {
vec <- vec[1:len]
vec <- c(vec, append)
}

return(vec)
}

# get aa variations for each gene
aa_variation <- list()
for (i in seq_along(gene_names_uniq)) {
Expand All @@ -107,7 +106,7 @@ hlaToAAVariation <- function(hla_calls,
if (any(mask_alleles_wo_ref[! is.na(x_calls_unlist)], na.rm = TRUE)) {
warn(sprintf(
"Alignments for alleles %s are not available and will be omitted.",
paste(x_calls_unlist[mask_alleles_wo_ref], collapse = ", ")
paste(truncVector(x_calls_unlist[mask_alleles_wo_ref], 10, "..."), collapse = ", ")
))
x_calls_unlist[mask_alleles_wo_ref] <- NA
}
Expand Down Expand Up @@ -157,8 +156,7 @@ hlaToAAVariation <- function(hla_calls,
if (as_df) {
aa_variation <- as.data.frame(aa_variation,
optional = TRUE,
stringsAsFactors = FALSE
)
stringsAsFactors = FALSE)
aa_variation <- cbind(ID = ids, aa_variation, stringsAsFactors = FALSE)
rownames(aa_variation) <- NULL
}
Expand Down
22 changes: 10 additions & 12 deletions R/utils.R
Original file line number Diff line number Diff line change
Expand Up @@ -613,22 +613,22 @@ hlaCallsGranthamDistance <- function(hla_calls,

sel <- paste0(gene, c("_1", "_2"))
pairs <- hla_calls[, sel]
resolution <- getAlleleResolution(unlist(pairs)) %>%
na.omit()
assert_that( # this should become obsolete
see_if(
all(resolution == resolution[1]),
msg = sprintf("Allele resolutions for gene %s are not equal", gene)
)
)
# resolution <- getAlleleResolution(unlist(pairs)) %>%
# na.omit()
# assert_that( # this should become obsolete
# see_if(
# all(resolution == resolution[1]),
# msg = sprintf("Allele resolutions for gene %s are not equal", gene)
# )
# )

# process alignment
aa_sel <- list(
binding_groove = 2:182,
B_pocket = c(7, 9, 24, 25, 34, 45, 63, 66, 67, 70, 99),
F_pocket = c(77, 80, 81, 84, 95, 116, 123, 143, 146, 147)
)
alignment <- hlaAlignmentGrantham(gene, resolution[1], aa_sel[[aa_selection]])
alignment <- hlaAlignmentGrantham(gene, aa_sel[[aa_selection]])

allele_numbers <- rownames(alignment)
d[[gene]] <- vapply(
Expand Down Expand Up @@ -662,17 +662,15 @@ hlaCallsGranthamDistance <- function(hla_calls,
#' Helper function returning alignment for Grantham distance calculations
#'
#' @param gene Character vector specifying HLA gene.
#' @param resolution Number giving allele resolution.
#' @param aa_sel Numeric vector specifying amino acids that should be extracted.
#'
#' @return HLA alignment processed for grantham distance calculation. Processing
#' includes extracting specific amino acids, masking indels, gaps and stop
#' codons.
#'
hlaAlignmentGrantham <- function(gene, resolution, aa_sel = 2:182) {
hlaAlignmentGrantham <- function(gene, aa_sel = 2:182) {
alignment <- readHlaAlignments(
gene = gene,
resolution = resolution,
trim = TRUE
)
alignment <- alignment[, aa_sel] # select amino acids
Expand Down
Binary file modified inst/extdata/A_prot.Rdata
Binary file not shown.
Binary file modified inst/extdata/B_prot.Rdata
Binary file not shown.
Binary file modified inst/extdata/C_prot.Rdata
Binary file not shown.
Binary file modified inst/extdata/DMA_prot.Rdata
Binary file not shown.
Binary file modified inst/extdata/DMB_prot.Rdata
Binary file not shown.
Binary file modified inst/extdata/DOA_prot.Rdata
Binary file not shown.
Binary file modified inst/extdata/DOB_prot.Rdata
Binary file not shown.
Binary file modified inst/extdata/DPA1_prot.Rdata
Binary file not shown.
Binary file modified inst/extdata/DPB1_prot.Rdata
Binary file not shown.
Binary file modified inst/extdata/DQA1_prot.Rdata
Binary file not shown.
Binary file modified inst/extdata/DQB1_prot.Rdata
Binary file not shown.
Binary file modified inst/extdata/DRA_prot.Rdata
Binary file not shown.
Binary file modified inst/extdata/DRB1_prot.Rdata
Binary file not shown.
Binary file modified inst/extdata/DRB3_prot.Rdata
Binary file not shown.
Binary file modified inst/extdata/DRB4_prot.Rdata
Binary file not shown.
Binary file modified inst/extdata/DRB5_prot.Rdata
Binary file not shown.
Binary file modified inst/extdata/E_prot.Rdata
Binary file not shown.
Binary file modified inst/extdata/F_prot.Rdata
Binary file not shown.
Binary file modified inst/extdata/G_prot.Rdata
Binary file not shown.
Binary file modified inst/extdata/HFE_prot.Rdata
Binary file not shown.
Binary file modified inst/extdata/MICA_prot.Rdata
Binary file not shown.
Binary file modified inst/extdata/MICB_prot.Rdata
Binary file not shown.
Binary file modified inst/extdata/TAP1_prot.Rdata
Binary file not shown.
Binary file modified inst/extdata/TAP2_prot.Rdata
Binary file not shown.
Binary file modified inst/extdata/test_aa_variation.Rdata
Binary file not shown.
15 changes: 15 additions & 0 deletions inst/scripts/parse_alignments.R
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,21 @@ for (file in alignment_files) {
trim = FALSE,
unkchar = "*",
resolution = 8)
# infer missing lower resolution alleles
for (res in c(6, 4)) {
allele_numbers <- reduceAlleleResolution(rownames(alignment), resolution = res)
missing_alleles <- unique(allele_numbers[! allele_numbers %in% rownames(alignment)])
missing_aln <- list()
for (allele in missing_alleles) {
i <- allele_numbers == allele
missing_aln[[allele]] <- apply(alignment[i, , drop=FALSE], 2, function(col) {
if (all(col == col[1])) col[1]
else "*"
})
}
missing_aln <- do.call(rbind, missing_aln)
alignment <- rbind(alignment, missing_aln)
}

# find first codon idx
aln_raw <- stri_read_lines(file)
Expand Down
4 changes: 1 addition & 3 deletions man/hlaAlignmentGrantham.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

14 changes: 5 additions & 9 deletions man/readHlaAlignments.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

15 changes: 7 additions & 8 deletions tests/testthat/test_parsing.R
Original file line number Diff line number Diff line change
Expand Up @@ -57,15 +57,14 @@ test_that("readHlaCalls", {

test_that("readHlaAlignments", {
file <- system.file("extdata", "TAP1_prot.txt", package = "midasHLA")
hla_alignments <- readHlaAlignments(file)
test_hla_alignments <- readHlaAlignments(gene = "TAP1")
# our alignment files also contain infered alignment sequences
m <- c("TAP1*01:02N", "TAP1*03:01", "TAP1*04:01", "TAP1*05:01", "TAP1*06:01",
"TAP1*02:01:01", "TAP1*02:01:02", "TAP1*01:01:01:01", "TAP1*01:01:01:02",
"TAP1*01:01:01:03", "TAP1*01:01:01:04", "TAP1*01:01:01:05")
hla_alignments <- readHlaAlignments(file)[m, ]
test_hla_alignments <- readHlaAlignments(gene = "TAP1")[m, ]
expect_equal(hla_alignments, test_hla_alignments)

hla_alignments_res2 <- readHlaAlignments(gene = "TAP1", resolution = 2)
res2 <- getAlleleResolution(rownames(hla_alignments_res2))
test_res2 <- c(2, 4, 2, 2, 2, 2, 2)
expect_equal(res2, test_res2)

expect_error(readHlaAlignments(file.path("path", "to", "nonexisting", "file")),
sprintf("Path '%s' does not exist",
file.path("path", "to", "nonexisting", "file")
Expand Down Expand Up @@ -97,7 +96,7 @@ test_that("readHlaAlignments", {
fasta_file <- system.file("extdata", "TAP1_prot.fasta", package = "midasHLA")
fasta <- seqinr::read.alignment(fasta_file, format = "fasta")
fasta <- fasta$seq[[1]]
expect_equal(paste(hla_alignments[1, ], collapse = ""),
expect_equal(paste(hla_alignments["TAP1*01:01:01:01", ], collapse = ""),
toupper(fasta)
) # check sequence with sequence from fasta

Expand Down
6 changes: 3 additions & 3 deletions tests/testthat/test_transformation.R
Original file line number Diff line number Diff line change
Expand Up @@ -165,10 +165,10 @@ test_that("hlaToAAVariation", {
aa_counts <- aaVariationToCounts(aa_var)
test_aa_counts <- data.frame(
ID = c("P1", "P2"),
`A_-15_L` = c(1, 1),
`A_-15_V` = c(1, 1),
A_44_K = c(1, 1),
A_44_R = c(1, 1),
A_62_Q = c(1, 1),
A_62_G = c(1, 1),
stringsAsFactors = FALSE, check.names = FALSE
)
expect_equal(aa_counts, test_aa_counts)
Expand All @@ -194,7 +194,7 @@ test_that("getAAFrequencies", {
aa_var <- hlaToAAVariation(minimal_hla_calls)[, 1:5]
aa_freq <- getAAFrequencies(aa_var)
test_aa_freq <- data.frame(
aa_pos = c("A_-15_L", "A_-15_V", "A_44_K", "A_44_R"),
aa_pos = c("A_44_K", "A_44_R", "A_62_Q", "A_62_G"),
Counts = c(2, 2, 2, 2),
Freq = c(0.5, 0.5, 0.5, 0.5),
row.names = NULL,
Expand Down
11 changes: 2 additions & 9 deletions tests/testthat/test_utils.R
Original file line number Diff line number Diff line change
Expand Up @@ -269,20 +269,13 @@ test_that("hlaCallsGranthamDistance", {
expect_error(hlaCallsGranthamDistance(MiDAS_tut_HLA, genes = c("A", NA)),
"genes contains 1 missing values")

hla_calls_bad <- MiDAS_tut_HLA
hla_calls_bad[2, 2] <- "A*01"
expect_error(
hlaCallsGranthamDistance(hla_calls_bad, genes = "A"),
"Allele resolutions for gene A are not equal"
)

expect_error(hlaCallsGranthamDistance(MiDAS_tut_HLA, genes = "A", aa_selection = "Z"),
"aa_selection should be one of \"binding_groove\", \"B_pocket\", \"F_pocket\".")
})

test_that("hlaAlignmentGrantham", {
aln <- hlaAlignmentGrantham("TAP1", 2, 2:182)
aln_test <- readHlaAlignments(gene = "TAP1", resolution = 2)
aln <- hlaAlignmentGrantham("TAP1", 2:182)
aln_test <- readHlaAlignments(gene = "TAP1")
aln_test <- aln_test[, 2:182]
mask <- apply(aln_test, 1, function(x) any(x == "" | x == "X" | x == "."))
aln_test <- aln_test[! mask, ]
Expand Down

0 comments on commit ec29296

Please sign in to comment.