Skip to content

Commit

Permalink
Fixed bug in assign_ancestry_mahalanobis.R for cases when there are n…
Browse files Browse the repository at this point in the history
…o ancestry assignments
  • Loading branch information
ngaddis committed Oct 19, 2023
1 parent 24efd0b commit ffe28e4
Showing 1 changed file with 33 additions and 7 deletions.
40 changes: 33 additions & 7 deletions assign_ancestry_mahalanobis/v1/assign_ancestry_mahalanobis.R
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,12 @@ option_list = list(
default=3,
type='integer',
help="StdDev threshold for filtering ancestries"
),
make_option(
c('--scale-to-ref'),
action='store_true',
default=FALSE,
help="Scale Mahalanobis distance to reference for SD cutoffs rather than within scaling within dataset"
)
)

Expand Down Expand Up @@ -323,9 +329,23 @@ scaled = data.frame(matrix(ncol = 2, nrow = 0))
colnames(scaled) = c("ID","SCALED_MAHAL")
for (ancestry in ancestries) {
tempScaled = dataset_samples[dataset_samples$ANCESTRY == ancestry, c("ID", ancestry)]
tempScaled$SCALED_MAHAL = abs(scale(tempScaled[, ancestry], center=median(tempScaled[, ancestry]), scale=sd(tempScaled[, ancestry])))
tempScaled = tempScaled[, c("ID", "SCALED_MAHAL")]
scaled = rbind(scaled, tempScaled)
if (nrow(tempScaled) > 0) {
if (get_arg(args, 'scale-to-ref')) {
tempScaled$SCALED_MAHAL = abs(scale(
tempScaled[, ancestry],
center=median(filtered_pcs[filtered_pcs$POP == ancestry, ancestry]),
scale=sd(filtered_pcs[filtered_pcs$POP == ancestry, ancestry])
))
} else {
tempScaled$SCALED_MAHAL = abs(scale(
tempScaled[, ancestry],
center=median(tempScaled[, ancestry]),
scale=sd(tempScaled[, ancestry])
))
}
tempScaled = tempScaled[, c("ID", "SCALED_MAHAL")]
scaled = rbind(scaled, tempScaled)
}
}
dataset_samples = merge(dataset_samples, scaled, sort=FALSE)
dataset_samples = dataset_samples[order(dataset_samples$ANCESTRY,dataset_samples$SCALED_MAHAL),]
Expand Down Expand Up @@ -380,16 +400,21 @@ for (sd in sd_cutoffs) {
}
# Get summary of ancestry assignments
out = dataset_samples[dataset_samples$SCALED_MAHAL <= sd,]
newSummary = as.data.frame.matrix(table(out$ANCESTRY, out$POP))
colnames(newSummary) = c(paste0(sd, '_StdDev'))
newSummary$ANCESTRY = rownames(newSummary)
if (nrow(out) == 0) {
newSummary = data.frame(ancestries, rep(0, length(ancestries)))
colnames(newSummary) = c('ANCESTRY', paste0(sd, '_StdDev'))
} else {
newSummary = as.data.frame.matrix(table(out$ANCESTRY, out$POP))
colnames(newSummary) = c(paste0(sd, '_StdDev'))
newSummary$ANCESTRY = rownames(newSummary)
}
summary = merge(summary, newSummary, by='ANCESTRY', all.x=TRUE)
# Generate plots
plot_data = data.frame(dataset_samples)
plot_data = plot_data[!is.null(plot_data$SCALED_MAHAL),]
plot_data[plot_data$SCALED_MAHAL > sd, 'color'] = 'black'
generate_pc_plot(
out,
plot_data,
'PC1',
'PC2',
'PC3',
Expand All @@ -415,6 +440,7 @@ write.table(
# Generate plots of ancestry outliers using different stddev thresholds
for (ancestry in ancestries) {
plot_data = dataset_samples[dataset_samples$ANCESTRY == ancestry,]
print(plot_data)
if (nrow(plot_data) > 0) {
plot_data$color = "green"
plot_data[plot_data$SCALED_MAHAL > 2,'color'] = "yellow"
Expand Down

0 comments on commit ffe28e4

Please sign in to comment.