forked from joyceyiyiwang/Portability_Questions
-
Notifications
You must be signed in to change notification settings - Fork 0
/
07a_group_level_pred.R
170 lines (148 loc) · 6.9 KB
/
07a_group_level_pred.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
library(dplyr)
library(tidyr)
library(data.table)
library(readr)
library(stringr)
partial.R2.helper <- function(nested.lm, ref.lm){
a <- anova(nested.lm)
b <- anova(ref.lm)
length.ref <- length(attributes(ref.lm$terms)$"dataClasses")
length.nested <- length(attributes(nested.lm$terms)$"dataClasses")
if(length.nested > length.ref) stop("Specify nested model first in arguements")
if(length.ref - length.nested > 1) stop("Reference and nested model should only differ with repsect to the presence/absence of one predictor")
SSE.wo <- tail(a$"Sum Sq", 1)
SSE.with <- tail(b$"Sum Sq", 1)
P.R2<-(SSE.wo-SSE.with)/SSE.wo
return(P.R2)
}
partial.R2 <- function(nested.lm.list, ref.lm.list){
res = c()
for(i in 1:length(nested.lm.list)){
res = c(res, partial.R2.helper(nested.lm.list[[i]], ref.lm.list[[i]]))
}
return(res)
}
load_non_pgs_df <- function(num_bins) {
# Load covariates (age, sex, age-sex interactions, PC1, ..., PC20)
covar_df <- read_tsv('data/ukb_merged/covar.tsv')
covar_df <- covar_df %>% select(-c(8:47))
# Load array type
array_df <- read_table("data/extracted_data_fields/array_type.txt")
colnames(array_df) = c("IID", "array_batch")
array_df$array_type = ifelse(array_df$array_batch < 0, "BiLEVE",
ifelse(array_df$array_batch > 0 & array_df$array_batch < 100, "Axiom",
ifelse(array_df$array_batch == 1000, "BiLEVE",
ifelse(array_df$array_batch == 2000, "Axiom", array_df$array_batch))))
covar_df <- covar_df %>% filter(pop != "WB_GWAS")
phenotypes_df <- read_tsv('data/phenotypes/phenotypes.tsv')
# Load the individuals
population_files <- c('data/ukb_populations/nwb_all_id.txt',
'data/ukb_populations/wb_gwas_id.txt',
'data/ukb_populations/wb_pred_id.txt')
populations_df <- data.frame()
for (file in population_files) {
pop_df <- read_delim(file, delim = ' ', trim_ws = T,
col_types = c('#FID' = col_integer(), 'IID' = col_integer())) %>%
mutate(population = str_extract(file, '(?<=data/ukb_populations/)[a-z]+_[a-z]+'))
populations_df <- bind_rows(populations_df, pop_df)
}
pc_values <- read_tsv('data/pca/pc_dist_best_pred_std.tsv')
pc_values <- pc_values[, c(2:3)]
# Combine all the above tables into a table that will be joined with PGS information for
# phenotype-threshold combinations.
phenotypes_df <- phenotypes_df %>%
inner_join(covar_df, by = c('#FID', 'IID')) %>%
inner_join(array_df, by = "IID") %>%
inner_join(populations_df, by = c('#FID', 'IID')) %>%
left_join(pc_values, by = c("IID")) %>%
pivot_longer(Height:LDL, names_to = 'phenotype', values_to = 'phenotype_value')
# Divides into equally-sized components
phenotypes_df <- phenotypes_df %>%
mutate(weighted_pc_groups = pc_dist %>% ntile(num_bins))
return(phenotypes_df)
}
get_r2_values <- function(df, group_var) {
df$group <- df %>% select(any_of(group_var)) %>% as.data.frame()
df1 = data.frame()
for(g in unique(df$group$weighted_pc_groups)){
for(p in unique(df$phenotype)){
for(t in unique(df$threshold)){
temp <- df %>% filter(group == g & phenotype == p & threshold == t)
meta <- as.data.frame(t(c(g, p, t)))
colnames(meta) <- c("weighted_pc_groups", "phenotype", "threshold")
meta$weighted_pc_groups <- as.numeric(meta$weighted_pc_groups)
meta$threshold <- as.numeric(meta$threshold)
# Use the array type as a covariate only when both types have more than 1 samples
if(length(temp$array_type[temp$array_type == "Axiom"]) > 1 & length(temp$array_type[temp$array_type == "BiLEVE"]) > 1){
meta_2 <- meta %>%
do(nested = lm(phenotype_value ~ sex + age + age_sq + age_sex + age_sq_sex + array_type, data = temp),
full = lm(phenotype_value ~ pgs + sex + age + age_sq + age_sex + age_sq_sex + array_type, data = temp))
} else{
meta_2 <- meta %>%
do(nested = lm(phenotype_value ~ sex + age + age_sq + age_sex + age_sq_sex, data = temp),
full = lm(phenotype_value ~ pgs + sex + age + age_sq + age_sex + age_sq_sex, data = temp))
}
df1 <- rbind.data.frame(df1, cbind.data.frame(meta, meta_2))
}
}
}
df1 <- df1 %>%
mutate(partial = partial.R2(nested, full)) %>%
select(-nested, -full)
df1$group <- 1:nrow(df1)
colnames(df1)[1] <- group_var
return(df1)
}
# Calculate PGS prediction accuracy
make_pgs_evaluation_df <- function(non_pgs_df, group_var_as_string) {
# Combine partial R^2 values for each phenotype and threshold
col_types = c('#FID' = col_integer(), 'IID' = col_integer(), 'ALLELE_CT' = col_integer(),
'NAMED_ALLELE_DOSAGE_SUM' = col_double(), 'SCORE1_AVG' = col_double())
pgs_df <- data.frame()
for (file in list.files(path = 'data/pgs',
pattern = '[a-zA-Z]+_[0-9]_scores.sscore', full.names = T)) {
print(file)
this_df <- read_tsv(file, col_types = col_types) %>%
filter(IID > 0) %>%
select('#FID', 'IID', pgs = 'SCORE1_AVG') %>%
mutate(
phenotype = str_extract(string = file, pattern = '(?<=data/pgs/)[A-Za-z_]+(?=_)'),
threshold = str_extract_all(string = file, pattern = '[0-4]')[[1]] %>% as.integer
) %>%
inner_join(non_pgs_df, by = c('#FID', 'IID','phenotype')) %>%
get_r2_values(., group_var = group_var_as_string)
pgs_df <- bind_rows(pgs_df, this_df)
}
return(pgs_df)
}
# Data frame without PGS information
non_pgs_df <- load_non_pgs_df(num_bins = 500)
non_pgs_df <- non_pgs_df %>%
mutate(array_type = as.factor(array_type))
# Data frame with PGS information
pgs_df <- make_pgs_evaluation_df(non_pgs_df, "weighted_pc_groups")
pgs_df <- pgs_df %>%
arrange(phenotype, weighted_pc_groups, threshold) %>%
mutate(group_number = weighted_pc_groups)
# Calculate the median PC distance for each group
get_median_pc <- function(file){
median_pc_values <- file %>%
filter(phenotype =="Height") %>%
select(weighted_pc_groups, pc_dist) %>%
group_by(weighted_pc_groups) %>%
dplyr::summarize(median_pc = median(pc_dist),
count = n()) %>%
na.omit() %>%
as.data.frame()
return(median_pc_values)
}
# Order the bins by how close to genetic distance = 1 (mean distance to the GWAS centroidof the GWAS group)
median_pc <- get_median_pc(non_pgs_df)
median_pc$group_close_to_gwas <- abs(median_pc$median_pc - 1)
median_pc <- median_pc %>% arrange(group_close_to_gwas)
median_pc$group_close_to_gwas <- 1:500
median_pc <- median_pc %>% arrange(weighted_pc_groups)
non_pgs_df <- non_pgs_df %>% left_join(median_pc[, c(1:2, 4)], by = "weighted_pc_groups")
pgs_df <- pgs_df %>% left_join(median_pc[, c(1:2, 4)], by = "weighted_pc_groups")
non_pgs_df %>% write_tsv("data/pgs_pred/group_non_pgs_df.tsv")
pgs_df %>% write_tsv("data/pgs_pred/group_pgs_df.tsv")