Skip to content

Commit

Permalink
export small tweak and central processing update
Browse files Browse the repository at this point in the history
  • Loading branch information
vojtechhuser committed Mar 3, 2020
1 parent 26481bb commit b0489d7
Show file tree
Hide file tree
Showing 4 changed files with 194 additions and 152 deletions.
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
Package: DataQuality
Title: Data Quality study in OHDSI and data quality evaluation
Version: 4.0
Version: 4.1
Authors@R: "Vojtech Huser <[email protected]> [aut, cre]"
Description: This package has several functions. It supports Data Quality Dashboard and OHDSI Data Quality study.
It also allows Data Quality evaluation (even after closure of the formal study).
Expand Down
144 changes: 0 additions & 144 deletions extras/CentralProcessing.R
Original file line number Diff line number Diff line change
Expand Up @@ -29,147 +29,3 @@ ttd<-combHeel %>% left_join(lkup_rules)
ttb2<-ttb %>% left_join(lkup_rules)




#DQD Centralized processing
#----------------------------------------
#----------------------------------------
#this script assummes coordinating center infrustructure for loading
#athena dictionaries (.rda files istead of in a database)
#now truly doing 2 sites

#load athena dictionary
library(tidyverse);library(magrittr);options(tibble.print_max = 200)
load('o:/athena/concept.rda')

#lkup<-concept %>% filter(vocabulary_id %in% c('CPT4','ICD9Proc','CDT','HCPCS','ICD9CM','ICD10CM','ICD10PCS'))

#reading a single site data (for now)
f<-'d:/OneDrive - National Institutes of Health/temp/dqd/export'
f<-'d:/OneDrive - National Institutes of Health/ohdsi/thresholds'

sfiles<-c(file.path(f,'1ThresholdsA.csv'))
sfiles<-c(file.path(f,'test-ThresholdsA.csv'))
sfiles<-c(file.path(f,'1ThresholdsA.csv'),file.path(f,'ThresholdsA.csv'),file.path(f,'test-ThresholdsA.csv'))
ll<-map(sfiles,read_csv)
ll

#ll<-map(p$pid,doProperty())
#strip name from full path trick
ll2<-map2(ll,basename(sfiles),~mutate(.x,site=.y))
d<-bind_rows(ll2)

#add terminology concepts
sconcept<-concept %>% select(concept_id,concept_name)
names(d) <- tolower(names(d))
names(d)
#remove no units rows and expand the CIDs
d2<-d %>% filter(stratum_1 != 0) %>% filter(stratum_2 != 0) %>% left_join(sconcept,by=c('stratum_1'='concept_id')) %>%
left_join(sconcept,by=c('stratum_2'='concept_id'))
names(d2)

#remove columns that are not needed
# d3<-d2 %>% select(-stratum_3,-stratum_4,-stratum_5,-p25_value,-p75_value) %>%
# filter(count_value >=100 ) %>% arrange(stratum_1,desc(count_value) )

d3<-d2 %>% select(-stratum_3,-stratum_4,-stratum_5) %>%
arrange(stratum_1,desc(count_value) )


d3 %>% count(site)
names(d3)
ba<-d3 %>% group_by(stratum_1,stratum_2,concept_name.x,concept_name.y) %>% summarize(tcnt=sum(count_value),n=n())
ba %>% filter(n>=2)



#24 test-unit pairs have 2 results



#tests with more units
d3 %>% count(stratum_1)


#only where multiple sites
d10<-d3 %>% inner_join(ba %>% filter(n>=2))





#even more removal of data
d4<-d3 %>% select(-count_value,-median_value,-stdev_value,-avg_value,-site)

d4 %>% write_csv('extras/DqdResults/thresholds-list-A.csv')
nrow(d4)


#read DD checks
library(stats);library(tidyverse);library(magrittr)
#message("\n*** Successfully loaded .Rprofile ***\n")


url='https://raw.githubusercontent.com/OHDSI/DataQualityDashboard/master/inst/csv/OMOP_CDMv5.3.1_Concept_Level.csv'
dqd<-read_csv(url)
str(dqd)
names(dqd)
dqd %>% dplyr::filter(cdmTableName=='MEASUREMENT' & cmdFieldName=='MEASUREMENT_CONCEPT_ID' )
dqd %>% dplyr::filter(cdmFieldName=='MEASUREMENT_CONCEPT_ID' ) %>% nrow()
dqd %>% count(cdmTableName,cdmFieldName)


#compare data driven and expert drive sets
d$STRATUM_1 %<>% as.integer()
dqd$unitConceptId %<>% as.integer()
expert <-dqd %>% dplyr::filter(cdmFieldName=='MEASUREMENT_CONCEPT_ID' )
nrow(expert)
names(expert)
elabs<-expert %>% group_by(conceptId,conceptName) %>% summarise(unitcnt=n(),units=paste(unitConceptName,collapse = "|"))

#330 lab tests
elabs %>% write_csv('extras/DqdResults/DQD-expert-driven-A-lab-list.csv')

names(expert)
ddriven<-d %>% rename(conceptId=STRATUM_1,unitConceptId=STRATUM_2) %>% select(conceptId,unitConceptId) %>% unique()

names(d2)
ddriven<-d %>% rename(conceptId=STRATUM_1,unitConceptId=STRATUM_2)
ddriven<-d2 %>% rename(conceptId=stratum_1,unitConceptId=stratum_2)
#ddriven %<>% filter(conceptId!=0)
#ddriven %<>% filter(unitConceptId!=0)

over=expert %>% inner_join(ddriven) #58 overlapping
View(over)
expert %>% anti_join(ddriven) #827 are in expert but not in data

not1<-ddriven %>% anti_join(expert) #14 are in data and not in expert


#compare the trehsholds
names(over)
over %>% select(conceptName,unitConceptName,plausibleValueLow,min_value)
over %>% select(conceptName,unitConceptName,plausibleValueHigh,max_value)
#%>% knitr::kable()


#expert thresholds don't follow unit conversion logic (max and min is same even if units indicate order of magniture difference)
#MEASUREMENT MEASUREMENT_CONCEPT_ID 3013721 Aspartate aminotransferase [Enzymatic activity/volume] in Serum or Plasma 8713 gram per deciliter 5 5 2000 5 NA NA NA NA NA NA NA NA
#MEASUREMENT MEASUREMENT_CONCEPT_ID 3013721 Aspartate aminotransferase [Enzymatic activity/volume] in Serum or Plasma 8840 milligram per deciliter 5 5 2000

#5g/dL into mg/dL (is 5000 mg/dL)
#in data is in fact unit/L



#unitmorph
#Protein [Mass/volume] in Serum or Plasma 7096851 4 gram per deciliter|unit|milligram per deciliter|gram per liter
# gram per deciliter| |milligram per deciliter | gram per liter
names(d3)
bb<-d3 %>% filter(site=='ThresholdsA.csv') %>% group_by(stratum_1,concept_name.x) %>%
summarize(tcnt=sum(count_value)
,n=n(),units=paste(concept_name.y,collapse = '|')
,cnts=paste(count_value ,collapse = '|')
,unitcids=paste(stratum_2,collapse = '|')
)
bb %>% write_csv('local/morphA.csv')
183 changes: 183 additions & 0 deletions extras/CentralProcessingDQDThresholds.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,183 @@


#DQD Centralized processing
#----------------------------------------
#----------------------------------------
#this script assummes coordinating center infrustructure for loading
#athena dictionaries (.rda files istead of in a database)
#now truly doing 2 sites

#load athena dictionary
library(tidyverse);library(magrittr);options(tibble.print_max = 200)
load('o:/athena/concept.rda')

#lkup<-concept %>% filter(vocabulary_id %in% c('CPT4','ICD9Proc','CDT','HCPCS','ICD9CM','ICD10CM','ICD10PCS'))

#reading a single site data (for now)
f<-'d:/OneDrive - National Institutes of Health/temp/dqd/export'
f<-'d:/OneDrive - National Institutes of Health/ohdsi/thresholds'

sfiles<-c(file.path(f,'1ThresholdsA.csv'))
sfiles<-c(file.path(f,'test-ThresholdsA.csv'))
sfiles<-c(file.path(f,'1ThresholdsA.csv'),file.path(f,'ThresholdsA.csv'),file.path(f,'test-ThresholdsA.csv'))
#3 sites processing +1
sfiles<-c(file.path(f,'01ThresholdsB.csv')
,file.path(f,'02ThresholdsB.csv')
,file.path(f,'03ThresholdsB.csv')
,file.path(f,'04ThresholdsB.csv')
,file.path(f,'05ThresholdsB.csv')
)
ll<-map(sfiles,read_csv)
ll

#ll<-map(p$pid,doProperty())
#strip name from full path trick

#make lowercase the column names
llmoded<-map(ll,~{names(.x)<-tolower(names(.x));return(.x)})
#llmoded[[1]]
#ll[[1]]
ll2<-map2(llmoded,basename(sfiles),~mutate(.x,site=.y))
d<-bind_rows(ll2)

#add terminology concepts
sconcept<-concept %>% select(concept_id,concept_name)
names(d) <- tolower(names(d))
names(d)
#remove no units rows and expand the CIDs
#stratum hav suffix id
# d2<-d %>% filter(stratum_1 != 0) %>% filter(stratum_2 != 0) %>% left_join(sconcept,by=c('stratum_1'='concept_id')) %>%
# left_join(sconcept,by=c('stratum_2'='concept_id'))


d2<-d %>% filter(count_value >=11 ) %>% filter(stratum1_id != 0) %>% filter(stratum2_id != 0) %>% left_join(sconcept,by=c('stratum1_id'='concept_id')) %>%
left_join(sconcept,by=c('stratum2_id'='concept_id')) %>% filter(!is.na(concept_name.x))
#test in 2B range are excluded by last filter

names(d2)

#overview of sites
soverview<-d2 %>% count(site)
soverview
#soverview %>% write_csv('extras/DqdResults/S1_overview.csv')

#remove columns that are not needed
# d3<-d2 %>% select(-stratum_3,-stratum_4,-stratum_5,-p25_value,-p75_value) %>%
# filter(count_value >=100 ) %>% arrange(stratum_1,desc(count_value) )

d3<-d2 %>% select(-stratum_3,-stratum_4,-stratum_5) %>%
arrange(stratum_1,desc(count_value) )


#d3 %>% count(site)
#names(d3)
ba<-d2 %>% group_by(stratum1_id,stratum2_id,concept_name.x,concept_name.y) %>% summarize(tcnt=sum(count_value),n=n())
ba %>% filter(n>=2) %>% nrow()
nrow(ba)
#4465 distinct test-unit pairs
#872 test-unit paris have 2+ sites



#tests with more units
ba %>% ungroup() %>% count(stratum1_id,concept_name.x) %>% filter(n>=2)
#TODO improve later
#868 tests have 2+ units

#only where multiple sites
#d10<-d3 %>% inner_join(ba %>% filter(n>=2))





#even more removal of data
#d4<-d3 %>% select(-count_value,-median_value,-stdev_value,-avg_value,-site)

#d4 %>% write_csv('extras/DqdResults/thresholds-list-A.csv')
#nrow(d4)


#end of analysis of


#---------------comparison with expert driven

#read expert driven checks
library(stats);library(tidyverse);library(magrittr)
#message("\n*** Successfully loaded .Rprofile ***\n")


url='https://raw.githubusercontent.com/OHDSI/DataQualityDashboard/master/inst/csv/OMOP_CDMv5.3.1_Concept_Level.csv'
dqd<-read_csv(url)
str(dqd)
names(dqd)
dqd %>% dplyr::filter(cdmTableName=='MEASUREMENT' & cmdFieldName=='MEASUREMENT_CONCEPT_ID' )
dqd %>% dplyr::filter(cdmFieldName=='MEASUREMENT_CONCEPT_ID' ) %>% nrow()
dqd %>% count(cdmTableName,cdmFieldName)


#compare data driven and expert drive sets
#d$STRATUM_1 %<>% as.integer()
dqd$unitConceptId %<>% as.integer()
expert <-dqd %>% dplyr::filter(cdmFieldName=='MEASUREMENT_CONCEPT_ID' )
nrow(expert)
#856 threshold checks are in expert driven KB
names(expert)
elabs<-expert %>% group_by(conceptId,conceptName) %>% summarise(unitcnt=n(),units=paste(unitConceptName,collapse = "|"))

# for 330 distinct lab tests
elabs %>% write_csv('extras/DqdResults/DQD-expert-driven-A-lab-list.csv')

names(expert)
#ddriven<-d %>% rename(conceptId=STRATUM_1,unitConceptId=STRATUM_2) %>% select(conceptId,unitConceptId) %>% unique()

names(d2)
#ddriven<-d %>% rename(conceptId=STRATUM_1,unitConceptId=STRATUM_2)
ddriven<-d2 %>% rename(conceptId=stratum1_id,unitConceptId=stratum2_id)
ddriven2<-ba %>% rename(conceptId=stratum1_id,unitConceptId=stratum2_id)

#ddriven %<>% filter(conceptId!=0)
#ddriven %<>% filter(unitConceptId!=0)

over=expert %>% inner_join(ddriven2)
nrow(over)
#331 tests are overlapping between ddriven (data driven) and expert (expert driven)
#View(over)

not2<-expert %>% anti_join(ddriven2)
nrow(not2)
#525 are in expert list but not in data from any site

not1<-ddriven2 %>% anti_join(expert)
nrow(not1)
#4134 are in data but are absent in expert driven KB


#compare the trehsholds
names(over)
over %>% select(conceptName,unitConceptName,plausibleValueLow,min_value)
over %>% select(conceptName,unitConceptName,plausibleValueHigh,max_value)
#%>% knitr::kable()


#expert thresholds don't follow unit conversion logic (max and min is same even if units indicate order of magniture difference)
#MEASUREMENT MEASUREMENT_CONCEPT_ID 3013721 Aspartate aminotransferase [Enzymatic activity/volume] in Serum or Plasma 8713 gram per deciliter 5 5 2000 5 NA NA NA NA NA NA NA NA
#MEASUREMENT MEASUREMENT_CONCEPT_ID 3013721 Aspartate aminotransferase [Enzymatic activity/volume] in Serum or Plasma 8840 milligram per deciliter 5 5 2000

#5g/dL into mg/dL (is 5000 mg/dL)
#in data is in fact unit/L



#unitmorph
#Protein [Mass/volume] in Serum or Plasma 7096851 4 gram per deciliter|unit|milligram per deciliter|gram per liter
# gram per deciliter| |milligram per deciliter | gram per liter
names(d3)
bb<-d3 %>% filter(site=='ThresholdsA.csv') %>% group_by(stratum_1,concept_name.x) %>%
summarize(tcnt=sum(count_value)
,n=n(),units=paste(concept_name.y,collapse = '|')
,cnts=paste(count_value ,collapse = '|')
,unitcids=paste(stratum_2,collapse = '|')
)
bb %>% write_csv('local/morphA.csv')
17 changes: 10 additions & 7 deletions inst/dqd/readme.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,20 +2,23 @@ This file describes individual files included in the DataQuality export folder/e



ThresholdsB.csv
===============

This file is using custom SQL code (not achilles, but inspired by it) and has in addition percentiles 1,2,3,97,98,99




SuitableMeasurementsAndUnits.csv
===================================

This file has the suitable pairs of measurements and units.

(not used in latest version)


ThresholdsA.csv
===============

This file is using Achilles measure 1815 and has percentile 10 and 90


ThresholdsB.csv
===============

This file is using custom SQL code (not achilles, but inspired by it) and has in addition percentiles 1,2,3,97,98,99
(not used in latest version)

0 comments on commit b0489d7

Please sign in to comment.