-
Notifications
You must be signed in to change notification settings - Fork 0
/
simulation_study_summary.R
350 lines (296 loc) · 13.2 KB
/
simulation_study_summary.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
# Summarize simulation study results across scenarios/seeds: prediction
# performance, feature selection performance, and (commented out below)
# credible-interval coverage. Produces a LaTeX summary table and a combined
# scatter/density figure saved to figures/.
setwd("/users/4/neher015/abcd_multiview")  # NOTE(review): hard-coded HPC path; breaks portability outside this cluster
library(tidyverse)
library(vctrs)
library(xtable)
library(reshape2)
library(gridExtra)
library(RColorBrewer)
source("src/utility_functions.R")
# Root directory holding one subdirectory of CSV outputs per simulation task
base_path <- "simulation_study_results/2024-11-14_simulation_study"
# Task metadata; the unnamed first CSV column (X) is the task number
scenarios <- file.path(base_path, "scenarios.csv") %>% read.csv() %>% rename(task_number = X)
# Accumulator for variance-parameter credible-interval coverage, one row per
# (Scenario, Seed, param_type). Currently unused: the code that fills and
# summarises it is commented out below.
coverage_results <- data.frame(
Scenario = integer(),
Seed = integer(),
param_type = character(),
total = integer(),
within_ci = integer(),
proportion_within_ci = numeric(),
stringsAsFactors = FALSE
)
# coverage <- file.path(task_folder, paste0("variance_param_coverage_", task_number, ".csv")) %>%
# read.csv()
# coverage_results <- rbind(coverage_results, data.frame(Scenario = scenario_id, Seed = seed, coverage))
# Create a credible interval coverage table
# # Process all files and calculate the summary statistics
# coverage_summary <- coverage_results %>%
# group_by(Scenario, param_type) %>%
# summarise(
# proportion_within_ci_mean = mean(proportion_within_ci),
# proportion_within_ci_sd = sd(proportion_within_ci),
# .groups = "drop"
# ) %>% filter(!is.na(scenario_id)) %>%
# mutate(Method = "BIPmixed")
# Accumulator for per-task prediction performance, one row per
# (Scenario, Seed, Method). Filled by rbind in the ingestion loop below.
prediction_results <- data.frame(
Scenario = integer(),
Seed = integer(),
Method = character(),
MSE = numeric(),
Bias2 = numeric(),
Variance = numeric(),
Mean = numeric(),
Correlation = numeric(),
stringsAsFactors = FALSE
)
# Accumulator for observation-level predictions (used for the scatter and
# density plots), one row per test observation per method.
prediction_granular_results <- data.frame(
Scenario = integer(),
True_Y = numeric(),
Predicted_Y = numeric(),
Method = character(),
Family = integer(),
Seed = integer(),
stringsAsFactors = FALSE
)
# Accumulator for variable-selection performance, one row per
# (Scenario, Seed, Method, View).
feature_selection_results <- data.frame(
Scenario = integer(),
Seed = integer(),
FalsePosRate = numeric(),
FalseNegRate = numeric(),
F1measure = numeric(),
AUC = numeric(),
Method = character(),
View = integer(),
stringsAsFactors = FALSE
)
# Process all relevant files: walk every simulation task listed in
# `scenarios`, appending each task's saved CSVs to the accumulators above.
# Tasks with empty folders or missing files are skipped with a message.
# NOTE(review): rbind-in-loop is O(n^2) in total rows; acceptable at this
# scale, but a list + bind_rows would scale better.
for (i in seq_len(nrow(scenarios))) {
  task_folder <- scenarios$subdir_name[i]
  task_number <- scenarios$task_number[i]
  scenario_id <- scenarios$scenario_id[i]
  seed <- scenarios$train_seed[i]
  # Skip tasks whose output folder is empty (job failed or still running)
  files <- list.files(task_folder)
  if (length(files) == 0) {
    print(paste("Skipping", task_folder))
    next
  }
  print(paste("Processing files in", task_folder))
  # Prediction performance summary (one row per method); required — skip the
  # task entirely if absent
  prediction_file_path <- file.path(task_folder, paste0("prediction_performance_by_method_", task_number, ".csv"))
  if (!file.exists(prediction_file_path)) {
    print(paste("Prediction results don't exist for:", task_folder))
    next
  }
  prediction <- read.csv(prediction_file_path)
  prediction_results <- rbind(prediction_results, data.frame(Scenario = scenario_id, Seed = seed, prediction))
  # Observation-level predictions; guard against a missing file instead of
  # letting read.csv abort the whole loop
  prediction_granular_path <- file.path(task_folder, paste0("prediction_data_", task_number, ".csv"))
  if (file.exists(prediction_granular_path)) {
    prediction_granular <- read.csv(prediction_granular_path)
    prediction_granular_results <- rbind(prediction_granular_results, data.frame(Scenario = scenario_id, Seed = seed, prediction_granular))
  } else {
    print(paste("Granular prediction data doesn't exist for:", task_folder))
  }
  # Variable-selection performance (not produced by every method/task)
  feature_selection_path <- file.path(task_folder, paste0("variable_selection_performance_", task_number, ".csv"))
  if (file.exists(feature_selection_path)) {
    feature_selection_data <- read.csv(feature_selection_path)
    feature_selection_results <- rbind(feature_selection_results, data.frame(Scenario = scenario_id, Seed = seed, feature_selection_data))
  }
}
# Process all files and calculate the summary statistics: mean/sd of each
# feature-selection metric over seeds, per (Scenario, Method, View).
# BUG FIX: the original piped through filter(!is.na(scenario_id)) twice;
# `scenario_id` is not a column here — dplyr fell back to the leftover loop
# variable in the global environment (a scalar), so the filter either kept
# every row or dropped them all. Filter on the Scenario column instead.
feature_selection_summary <- feature_selection_results %>%
  group_by(Scenario, Method, View) %>%
  summarise(across(c(FalsePosRate, FalseNegRate, F1measure, AUC),
                   list(mean = ~mean(.), sd = ~sd(.))),
            .groups = "drop") %>%
  filter(!is.na(Scenario))
# 1. Extract the results for the 1st view (reported separately in the table)
first_view_selection <- feature_selection_summary %>%
  filter(View == 1) %>%
  rename_with(~paste0(., "_X1"), -c(Scenario, Method, View))
# 2. Calculate the averaged results across all views
average_view_selection <- feature_selection_summary %>%
  group_by(Scenario, Method) %>%
  summarise(across(c(FalsePosRate_mean, FalseNegRate_mean, F1measure_mean, AUC_mean,
                     FalsePosRate_sd, FalseNegRate_sd, F1measure_sd, AUC_sd),
                   list(avg = ~mean(.)), .names = "{col}"), .groups = "drop")
# 3. Combine the first view and averaged results into a single data frame
combined_feature_selection_summary <- first_view_selection %>%
  left_join(average_view_selection, by = c("Scenario", "Method"))
# Create a prediction performance table.
# Remove replicates with missing MSE, then drop per-(Method, Scenario)
# outliers by the 1.5*IQR rule so a few diverged fits don't dominate means.
filtered_prediction_results <- prediction_results %>%
  # Remove replicates with missing MSE (was `!is.na(MSE)==T`: redundant
  # comparison and the reassignable `T` shorthand)
  filter(!is.na(MSE)) %>%
  # Flag outliers within each method/scenario cell
  group_by(Method, Scenario) %>%
  mutate(
    Q1 = quantile(MSE, 0.25, na.rm = TRUE),
    Q3 = quantile(MSE, 0.75, na.rm = TRUE),
    IQR = Q3 - Q1,
    lower_bound = Q1 - 1.5 * IQR,
    upper_bound = Q3 + 1.5 * IQR,
    is_outlier = MSE < lower_bound | MSE > upper_bound
  ) %>%
  filter(!is_outlier) %>%
  ungroup()  # don't leak grouping into downstream operations
print("N tasks removed due to missing/ outlier MSE:")
print(nrow(prediction_results) - nrow(filtered_prediction_results))
print("Out of N tasks overall:")
print(nrow(prediction_results))
# Count the number of rows for each Method before filtering
prediction_results %>%
  group_by(Method, Scenario) %>%
  summarise(n = n(), .groups = "drop")
# ... and after filtering, to see how many replicates each cell retains
filtered_prediction_results %>%
  group_by(Method, Scenario) %>%
  summarise(n = n(), .groups = "drop")
# Summarise each prediction metric as its mean and standard deviation over
# replicates, per (Scenario, Method). across() with the default
# "{.col}_{.fn}" naming yields exactly MSE_mean, MSE_sd, Bias2_mean, ...,
# matching the columns the formatting step expects.
prediction_summary <- filtered_prediction_results %>%
  group_by(Scenario, Method) %>%
  summarise(
    across(c(MSE, Bias2, Variance, Mean, Correlation),
           list(mean = ~mean(.), sd = ~sd(.))),
    .groups = "drop"
  )
# # Calculate quartiles of Predicted_Y and filter to smallest (1st) and largest (4th) quartiles
# quartile_summary_mspe <- prediction_granular_results %>%
# group_by(Scenario, Seed, Method) %>%
# mutate(Quartile = ntile(Predicted_Y, 4)) %>% # Assign quartiles based on Predicted_Y
# group_by(Scenario, Seed, Method, Quartile) %>%
# reframe(MSPE = (True_Y - Predicted_Y)^2) %>% # Calculate MSPE for given replicate
# group_by(Scenario, Method, Quartile) %>%
# summarise(
# avg_mspe = mean(MSPE), # Average MSPE in the quartile
# sd_mspe = sd(MSPE), # Standard deviation of MSPE in the quartile
# .groups = 'drop'
# )
#
# # View the filtered quartile summary for MSPE
# head(quartile_summary_mspe)
# Step 1: Combine mean and sd for each metric into "mean (sd)" strings.
# Join keys are stated explicitly rather than letting left_join guess the
# shared columns.
results_aggregated <- left_join(prediction_summary,
                                combined_feature_selection_summary,
                                by = c("Scenario", "Method"))
results_formatted <- results_aggregated %>%
  mutate(
    MSE = paste0(sprintf("%.3f", MSE_mean), " (", sprintf("%.3f", MSE_sd), ")"),
    Bias2 = paste0(sprintf("%.3f", Bias2_mean), " (", sprintf("%.3f", Bias2_sd), ")"),
    Variance = paste0(sprintf("%.3f", Variance_mean), " (", sprintf("%.3f", Variance_sd), ")"),
    Mean = paste0(sprintf("%.3f", Mean_mean), " (", sprintf("%.3f", Mean_sd), ")"),
    Correlation = paste0(sprintf("%.3f", Correlation_mean), " (", sprintf("%.3f", Correlation_sd), ")"),
    FalsePosRate = paste0(sprintf("%.3f", FalsePosRate_mean), " (", sprintf("%.3f", FalsePosRate_sd), ")"),
    FalseNegRate = paste0(sprintf("%.3f", FalseNegRate_mean), " (", sprintf("%.3f", FalseNegRate_sd), ")"),
    F1measure = paste0(sprintf("%.3f", F1measure_mean), " (", sprintf("%.3f", F1measure_sd), ")"),
    AUC = paste0(sprintf("%.3f", AUC_mean), " (", sprintf("%.3f", AUC_sd), ")"),
    # proportion_within_ci = paste0(sprintf("%.3f", proportion_within_ci_mean), " (", sprintf("%.3f", proportion_within_ci_sd), ")")
  ) %>%
  # Keep only the reported metrics. (The original mixed inclusions with
  # no-op exclusions like -Bias2; under tidyselect this is equivalent to
  # listing just the retained columns, done plainly here.)
  dplyr::select(Scenario, Method, MSE, Variance, FalsePosRate, FalseNegRate, AUC)
# Step 2: Pivot the data to long format, one row per (Scenario, Method, Metric)
results_long <- results_formatted %>%
  pivot_longer(cols = -c(Scenario, Method), names_to = "Metric", values_to = "Value")
# Check for duplicates in (Metric, Scenario, Method) before widening — a
# non-empty result here means pivot_wider below will silently keep only the
# first value per cell
results_long %>%
group_by(Metric, Scenario, Method) %>%
summarise(n = n(), .groups = 'drop') %>%
filter(n > 1)
# Handle duplicates by taking the first occurrence, then transpose so rows
# become "Scenario - Method" combinations and columns become metrics
results_wide <- results_long %>%
unite("Scenario_Method", Scenario, Method, sep = " - ") %>% # Combine Scenario and Method
pivot_wider(names_from = Scenario_Method, values_from = Value, values_fn = list(Value = first)) %>%
t()
# After t(), row 1 of the matrix holds the Metric labels; the remaining rows
# are the data, with the Scenario-Method labels as rownames
results_df <- as.data.frame(results_wide[-1, ])
colnames(results_df) <- results_wide[1, ] # Assign the first row as column names
results_df_named <- cbind(data.frame("Scenario - Method" = rownames(results_df)), results_df)
rownames(results_df_named) <- NULL
# Step 4: Convert to LaTeX table using xtable
latex_table <- xtable(results_df_named,
# caption = "Summary of model performance metrics by Scenario and Method. Metrics represent the mean (standard deviation).",
digits = 3) # Ensure 3 digits are printed
# Print LaTeX table with caption placed at the top
print(latex_table, include.rownames = FALSE, caption.placement = "top")
# Make prediction scatterplots
# Keep a single replicate for plotting (train seed 6; the original comment
# incorrectly said Seed == 1)
filtered_data <- prediction_granular_results %>%
filter(Seed == 6)
# Create a named vector for the facet labels
scenario_labels <- c("1" = "Scenario 1", "2" = "Scenario 2", "3" = "Scenario 3")
# Predicted vs. true outcome per scenario, colored by Method
# (RandMVLearn is excluded; internal method names are relabeled for display)
scatter_plot <- filtered_data %>%
filter(Method != "RandMVLearn") %>%
mutate(Method = ifelse(Method=="2step", "PCA2Step", Method)) %>%
mutate(Method = ifelse(Method=="CooperativeLearning", "Cooperative", Method)) %>%
ggplot(aes(x = Predicted_Y, y = True_Y, color = Method)) +
geom_point(alpha = 0.15) + # Add points with transparency
geom_abline(slope = 1, intercept = 0, color = "red", linetype = "dashed") + # Perfect calibration line
facet_wrap(~ Scenario, nrow = 1, labeller = as_labeller(scenario_labels)) + # Custom facet labels
labs(x = expression(hat(y)), y = expression(y), title = NULL) +
theme_minimal() +
theme(legend.position = "top") # Place the legend above the panels
# Reuse ggplot's default discrete hues so the density plot's method colors
# match the scatterplot above
default_colors <- scale_color_discrete()$palette(4) # Get the first 4 colors from the default palette
# true_y_color <- default_colors[2] # Select the third color
# Named color vector: each method keeps its default hue; "True Y" is drawn
# in red to match the calibration line in the scatterplot
cb_palette <- c(
"BIP" = default_colors[1],
"BIPmixed" = default_colors[2],
"Cooperative" = default_colors[3],
"PCA2Step" = default_colors[4],
"True Y" = "red")
# Stack true and predicted outcomes into a single Y_Value column so their
# densities can be overlaid; "Type" labels each value either "True Y" or
# with the (display-renamed) method that produced the prediction.
combined_data <- filtered_data %>%
  mutate(
    Method = case_when(
      Method == "2step" ~ "PCA2Step",
      Method == "CooperativeLearning" ~ "Cooperative",
      TRUE ~ Method
    )
  ) %>%
  pivot_longer(cols = c(True_Y, Predicted_Y),
               names_to = "Type",
               values_to = "Y_Value") %>%
  mutate(Type = if_else(Type == "True_Y", "True Y", Method)) %>%
  filter(Method != "RandMVLearn")
# Density of true and predicted outcomes per scenario, using the palette
# above so methods match the scatterplot and "True Y" appears in red
combined_density_plot <- ggplot(combined_data, aes(x = Y_Value, color = Type)) +
geom_density(alpha = 0.3, size = 0.5) + # NOTE(review): `size` is deprecated for lines in ggplot2 >= 3.4; `linewidth` preferred
facet_wrap(~ Scenario, nrow = 1) +
scale_color_manual(values = cb_palette, guide = guide_legend(override.aes = list(color = cb_palette))) +
labs(x = "y (True and Predicted)", y = "Density", title = NULL) +
theme_minimal() +
theme(legend.title = element_blank(),
legend.position = "bottom",
panel.grid = element_blank(),
panel.border = element_rect(fill = NA, color = "black", size = 0.5), # NOTE(review): `size` deprecated here too
plot.title = element_text(hjust = 0.5),
strip.text = element_blank()) # Hide facet labels (they duplicate the scatterplot's)
# Print the plot
print(combined_density_plot)
# Single-column layout: the scatterplot (1) occupies the top two rows (2/3
# of the height), the density plot (2) the bottom row. (The original
# comments incorrectly described a two-column span.)
layout_matrix <- rbind(
c(1), # Rows 1-2: scatter_plot
c(1),
c(2) # Row 3: combined_density_plot
)
# Combine the plots with grid.arrange (draws and returns a gtable)
combined_plot <- grid.arrange(
scatter_plot,
combined_density_plot,
layout_matrix = layout_matrix
)
# Display the combined plot
print(combined_plot)
# Specify the file path to save the plot (relative to the setwd() above)
file_path <- "figures/combined_simulations_plot.png"
# Save the combined figure (gtable is accepted by ggsave) to the specified path
ggsave(filename = file_path,
plot = combined_plot,
width = 8, # Width of the plot in inches
height = 8, # Height of the plot in inches
dpi = 300) # Resolution of the plot (dots per inch)