Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Utilities clean up #9

Merged
merged 13 commits into from
Jun 24, 2021
161 changes: 90 additions & 71 deletions extras/compareResults.R
Original file line number Diff line number Diff line change
@@ -1,73 +1,92 @@
library("dplyr")
library("jsonlite")
library("ggplot2")
library("plotly")
#' Compare DQD results
#'
#' Matches checks from the two provided DQD results and saves two outputs:
#' \itemize{
#' \item a table with all checks that are different
#' \item an interactive plot (html) with old fail percentage versus new fail
#' percentage
#' }
#' This makes it easy to identify differences between subsequent runs of DQD.
#' Nothing will be created if there are no differences between the given DQD results.
#' See also: \link[TBD]{TBD - link to abstract}
#'
#' @param jsonPath.old the path to the old DQD json results file
#' @param jsonPath.new the path to the new DQD json results file
#' @param savingDir the path to the folder where the output should be written
#' @import dplyr
#' @import ggplot2
#'
#' @author Elena Garcia Lara, Maxim Moinat
#'
#' @return An interactive plotly figure or nothing if no differences are found.
#' @export
#'
#' @examples
#' \dontrun{
#' compareDqResults("dqd_results_1.json", "dqd_results_2.json", "output")
#' }
compareDqResults <- function(jsonPath.old, jsonPath.new, savingDir){

# Directories
setwd('/my/path/to/dqd/extra/scripts/')
file_old <- "json/results/folder/file1.json"
file_new <- "json/results/folder/file2.json"
saving_dir <- "results_compare/"
dir.create(file.path(getwd(), saving_dir), showWarnings = FALSE)


# List all differences
# ... between OLD
result_old <- jsonlite::fromJSON(file_old)
check_results_old <- tibble(result_old$CheckResults)

# ... and NEW
result_new <- jsonlite::fromJSON(file_new)
check_results_new <- tibble(result_new$CheckResults)

# ... only keep the different
combined_results <- check_results_old %>%
left_join(check_results_new,
by=c("CHECK_NAME", "CDM_TABLE_NAME", "CDM_FIELD_NAME",
"CONCEPT_ID", "UNIT_CONCEPT_ID"),
suffix=c(".old", ".new")) %>%
filter(PCT_VIOLATED_ROWS.old != PCT_VIOLATED_ROWS.new)

# Save as csv
write.csv(combined_results, file=file.path(getwd(), saving_dir, "different_checks.csv"))

# If no differences: add mock point
if(nrow(combined_results)==0){
stop("No differences found.")
# List all differences
# ... between OLD
result_old <- jsonlite::fromJSON(jsonPath.old)
check_results_old <- tibble(result_old$CheckResults)

# ... and NEW
result_new <- jsonlite::fromJSON(jsonPath.new)
check_results_new <- tibble(result_new$CheckResults)

# ... only keep the different
combined_results <- check_results_old %>%
left_join(check_results_new,
by=c("CHECK_NAME", "CDM_TABLE_NAME", "CDM_FIELD_NAME",
"CONCEPT_ID", "UNIT_CONCEPT_ID"),
suffix=c(".old", ".new")) %>%
filter(PCT_VIOLATED_ROWS.old != PCT_VIOLATED_ROWS.new)

# When No difference found, exit function
if(nrow(combined_results)==0){
stop("No differences found.")
}

# Save as csv
saving_name <- file.path(savingDir, paste("compare_dqd", Sys.Date(), sep="_"))
dir.create(file.path(savingDir), showWarnings = FALSE)
write.csv(combined_results, file=paste(saving_name, ".csv", sep=""))

# Visualization
p <- combined_results %>%
mutate(
fail_status = ifelse(FAILED.new==0, "Pass", "Fail"),
pct_old = round(PCT_VIOLATED_ROWS.old*100, digits=2),
pct_new = round(PCT_VIOLATED_ROWS.new*100, digits=2)
) %>%
ggplot(aes(x=pct_old,
y=pct_new,
colour=fail_status,
text=paste(
sprintf('<br><i>Check name: </i>%s', CHECK_NAME),
sprintf('<br><i>Table: </i>%s', CDM_TABLE_NAME),
sprintf('<br><i>Field: </i>%s', CDM_FIELD_NAME),
sprintf('<br><i>Threshold value: </i>%.1f%%', THRESHOLD_VALUE.new),
sprintf('<br><b><i>old: </i> %.2f%% </b>', pct_old),
sprintf('<br><b><i>new: </i> %.2f%% </b>', pct_new)
), alpha=0.6)) +
geom_point() +
geom_abline(colour="gray", linetype = "dashed")+
scale_colour_manual(labels = c("Fail", "Pass"),
values = c("Pass" = "darkblue", "Fail" = "chocolate1"))+
scale_alpha(guide = 'none') +
theme_minimal() +
theme(legend.title = element_blank()) +
lims(y=c(0,100), x=c(0,100)) +
labs(x="Previous % of row fails", y="Current % of row fails") +
annotate("text", label="Improved", x = 88.5, y = 12.5, colour="grey") +
annotate("text", label="Worsened", x = 12.5, y = 88.5, colour="grey")

p_interactive <- plotly::ggplotly(p, tooltip="text") %>%
plotly::style(hoveron="text")

htmlwidgets::saveWidget(p_interactive, file=paste(saving_name, ".html", sep=""))
p_interactive
}

# Visualization
p <- combined_results %>%
mutate(
fail_status = ifelse(FAILED.new==0, "Pass", "Fail"),
pct_old = round(PCT_VIOLATED_ROWS.old*100, digits=2),
pct_new = round(PCT_VIOLATED_ROWS.new*100, digits=2)
) %>%
ggplot(aes(x=pct_old,
y=pct_new,
colour=fail_status,
text=paste(
sprintf('<br><i>Check name: </i>%s', CHECK_NAME),
sprintf('<br><i>Table: </i>%s', CDM_TABLE_NAME),
sprintf('<br><i>Field: </i>%s', CDM_FIELD_NAME),
sprintf('<br><i>Threshold value: </i>%.1f%%', THRESHOLD_VALUE.new),
sprintf('<br><b><i>old: </i> %.2f%% </b>', pct_old),
sprintf('<br><b><i>new: </i> %.2f%% </b>', pct_new)
), alpha=0.6)) +
geom_point() +
geom_abline(colour="gray", linetype = "dashed")+
scale_colour_manual(labels = c("Fail", "Pass"),
values = c("Pass" = "darkblue", "Fail" = "chocolate1"))+
scale_alpha(guide = 'none') +
theme_minimal() +
theme(legend.title = element_blank()) +
lims(y=c(0,100), x=c(0,100)) +
labs(x="Previous % of row fails", y="Current % of row fails") +
annotate("text",label="Improved", x = 88.5, y = 12.5, colour="grey") +
annotate("text",label="Worsened", x = 12.5, y = 88.5, colour="grey")

p_interactive <- ggplotly(p, tooltip="text") %>%
style(hoveron="text")

htmlwidgets::saveWidget(p_interactive, file.path(getwd(), saving_dir, "fig_compare_DQD.html"))
p_interactive
189 changes: 102 additions & 87 deletions extras/conceptMappingCoveragePerDomain.R
Original file line number Diff line number Diff line change
@@ -1,87 +1,102 @@
library("dplyr")
library("jsonlite")
library("ggplot2")

setwd('/my/path/to/dqd/')
dqd_filename <- 'my_dqd_result.json'

# Load data
result <- jsonlite::fromJSON(dqd_filename)
check_results <- result$CheckResults %>%
select(CHECK_NAME, CDM_TABLE_NAME, CDM_FIELD_NAME,
NUM_VIOLATED_ROWS, NUM_DENOMINATOR_ROWS, PCT_VIOLATED_ROWS)

coverage_results <- check_results %>%
filter(CHECK_NAME %in% c("standardConceptRecordCompleteness", "sourceValueCompleteness")) %>%
# Not interested in era's as these are all derived
filter(!(CDM_TABLE_NAME %in% c("DRUG_ERA", "DOSE_ERA", "CONDITION_ERA"))) %>%
mutate(
CDM_FIELD_NAME = toupper(CDM_FIELD_NAME),
# First check is over all records, second over the unique source terms
coverageType = recode(CHECK_NAME,
standardConceptRecordCompleteness = "Records",
sourceValueCompleteness = "Terms"),
# Coverage is rows not failing
coveragePct = 1 - PCT_VIOLATED_ROWS,
# Naming of domains
domain = gsub("_(OCC\\w+|EXP\\w+|PLAN.+)$", "", CDM_TABLE_NAME),
variable = ifelse(CHECK_NAME=="standardConceptRecordCompleteness",
sub('_CONCEPT_ID', '', CDM_FIELD_NAME),
sub('_SOURCE_VALUE', '', CDM_FIELD_NAME)
),
domain_abbrev = recode(domain,
VISIT = "VST",
CONDITION = "COND",
PROCEDURE = "PROC",
OBSERVATION = "OBS",
MEASUREMENT = "MEAS",
SPECIMEN = "SPEC"
),
domainField = ifelse(domain==variable,
domain,
paste0(domain_abbrev,"-",variable)
)
)

# Table
table <- coverage_results %>%
mutate(
percentUnmapped = scales::percent(PCT_VIOLATED_ROWS, accuracy = 0.01),
nUnmapped = formatC(NUM_VIOLATED_ROWS, format="d", big.mark=","),
nTotal = formatC(NUM_DENOMINATOR_ROWS, format="d", big.mark=",")
) %>%
select(domainField, coverageType, percentUnmapped, nUnmapped, nTotal) %>%
arrange(domainField, desc(coverageType)) # by domain, terms first, then records

write.csv(table, file="concept_mapping_coverage.csv", row.names = FALSE)

# Coverage plot like fig 6 in EHDEN DoA
# note: coverage is percentage NOT failing to map
fig <- coverage_results %>%
# To keep things simple, we only look at the six main domains and units
filter(domainField %in% c("VISIT", "PROCEDURE", "DRUG", "CONDITION", "MEASUREMENT",
"OBSERVATION", "MEAS-UNIT", "OBS-UNIT")
) %>%
mutate(
coveragePct = 1 - PCT_VIOLATED_ROWS
) %>%
ggplot(aes(x=coverageType, y = coveragePct, fill = coverageType)) +
geom_col() +
geom_text(aes(label=scales::percent(coveragePct, accuracy = 0.01)),
position=position_stack(vjust=0.5),
size=3, colour="gray10", fontface="bold") +
theme_minimal() +
theme(
axis.text.y=element_text(size=10),
strip.placement="outside",
strip.text.y=element_text(angle=0, hjust=0.5, face="bold", size=6)
) +
coord_flip() +
facet_grid(domainField ~ ., scales="free_y", space="free_y", switch="y") +
guides(fill=FALSE) +
ylab("Percentage Coverage (%)") +
xlab("") +
scale_fill_manual(values=c("cornflowerblue", "skyblue"))

ggsave('concept_mapping_coverage.png', height = 8, width = 8 * 1.61803)

#' Plot concept mapping coverage
#'
#' Finds mapping coverage from a given DQD results file, and returns an accessible figure
#'
#' @param jsonPath the path to the DQD json results file
#' @param savingDir the path to the folder where the output should be written
#' @import ggplot2
#' @import dplyr
#'
#' @author Elena Garcia Lara, Maxim Moinat
#'
#' @return A figure to visualize concept mapping coverage per domain
#' @export
#'
#' @examples
#' \dontrun{
#' plotConceptCoverage("dqd_results.json", "output")
#' }
plotConceptCoverage <- function(jsonPath, savingDir){

# Load data
result <- jsonlite::fromJSON(jsonPath)
check_results <- result$CheckResults %>%
select(CHECK_NAME, CDM_TABLE_NAME, CDM_FIELD_NAME,
NUM_VIOLATED_ROWS, NUM_DENOMINATOR_ROWS, PCT_VIOLATED_ROWS)

coverage_results <- check_results %>%
filter(CHECK_NAME %in% c("standardConceptRecordCompleteness", "sourceValueCompleteness")) %>%
# Not interested in era's as these are all derived
filter(!(CDM_TABLE_NAME %in% c("DRUG_ERA", "DOSE_ERA", "CONDITION_ERA"))) %>%
mutate(
CDM_FIELD_NAME = toupper(CDM_FIELD_NAME),
# First check is over all records, second over the unique source terms
coverageType = recode(CHECK_NAME,
standardConceptRecordCompleteness = "Records",
sourceValueCompleteness = "Terms"),
# Coverage is rows not failing
coveragePct = 1 - PCT_VIOLATED_ROWS,
# Naming of domains
domain = gsub("_(OCC\\w+|EXP\\w+|PLAN.+)$", "", CDM_TABLE_NAME),
variable = ifelse(CHECK_NAME=="standardConceptRecordCompleteness",
sub('_CONCEPT_ID', '', CDM_FIELD_NAME),
sub('_SOURCE_VALUE', '', CDM_FIELD_NAME)
),
domain_abbrev = recode(domain,
VISIT = "VST",
CONDITION = "COND",
PROCEDURE = "PROC",
OBSERVATION = "OBS",
MEASUREMENT = "MEAS",
SPECIMEN = "SPEC"
),
domainField = ifelse(domain==variable,
domain,
paste0(domain_abbrev,"-",variable)
)
)

# Table
table <- coverage_results %>%
mutate(
percentUnmapped = scales::percent(PCT_VIOLATED_ROWS, accuracy = 0.01),
nUnmapped = formatC(NUM_VIOLATED_ROWS, format="d", big.mark=","),
nTotal = formatC(NUM_DENOMINATOR_ROWS, format="d", big.mark=",")
) %>%
select(domainField, coverageType, percentUnmapped, nUnmapped, nTotal) %>%
arrange(domainField, desc(coverageType)) # by domain, terms first, then records

write.csv(table, file="concept_mapping_coverage.csv", row.names = FALSE)

# Coverage plot like fig 6 in EHDEN DoA
# note: coverage is percentage NOT failing to map
fig <- coverage_results %>%
# To keep things simple, we only look at the six main domains and units
filter(domainField %in% c("VISIT", "PROCEDURE", "DRUG", "CONDITION", "MEASUREMENT",
"OBSERVATION", "MEAS-UNIT", "OBS-UNIT")
) %>%
mutate(
coveragePct = 1 - PCT_VIOLATED_ROWS
) %>%
ggplot(aes(x=coverageType, y = coveragePct, fill = coverageType)) +
geom_col() +
geom_text(aes(label=scales::percent(coveragePct, accuracy = 0.01)),
position=position_stack(vjust=0.5),
size=3, colour="gray10", fontface="bold") +
theme_minimal() +
theme(
axis.text.y=element_text(size=10),
strip.placement="outside",
strip.text.y=element_text(angle=0, hjust=0.5, face="bold", size=6)
) +
coord_flip() +
facet_grid(domainField ~ ., scales="free_y", space="free_y", switch="y") +
guides(fill=FALSE) +
ylab("Percentage Coverage (%)") +
xlab("") +
scale_fill_manual(values=c("cornflowerblue", "skyblue"))

saving_name <- file.path(savingDir, paste("concept_mapping_coverage", Sys.Date(), sep="_"))
dir.create(file.path(savingDir), showWarnings = FALSE)
ggsave(filename=paste(saving_name, ".png", sep=""), height = 8, width = 8 * 1.61803)
}
2 changes: 1 addition & 1 deletion extras/edit_thresholds/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ In the previous files, each check can be found using a combination of the check

| Level | checkName | cdmTableName | cdmFieldName | fkTableName |fkDomain|conceptId | unitConceptId| Threshold | Notes |
| ----| ---- | ----- | ----- |----- | ----- |----- | ----- |----- | ----- |
| FieLd | isRequired | MEASUREMENT | person_id | | | ||10| "one note" |
| Field | isRequired | MEASUREMENT | person_id | | | ||10| "one note" |
|Field | plausibleValueLow | PERSON | year_of_birth | | | ||100 | "another note" |
|Field | isForeignKey| MEASUREMENT| person_id | PERSON | | ||1 | |
|Field |fkDomain | PERSON | race_concept_id | | Race | ||49 | "and yet another one" |
Expand Down
Loading