##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
library(readr)
library(readxl)
library(httr)
library(purrr)
library(stringi)
library(tibble)
# library(gt)
# library(gtsummary)
source('utils.R')
github_link <- 'https://raw.githubusercontent.com/covidclinical/Phase1.1AggregateDataPerSite/master/'
demo_link <- paste0(github_link, 'Demographics_persite_fakeID.csv')
diag_link <- paste0(github_link, 'Diagnoses_persite_fakeID.csv')
temp_file_1 <- temp_file_2 <- tempfile(fileext = ".csv")
req <- GET(demo_link,
# authenticate using GITHUB_PAT
authenticate(Sys.getenv("GITHUB_PAT"), ""),
# write result to disk
write_disk(path = temp_file_1))
demo_all <- read_csv(temp_file_1)
unlink(temp_file_1)
req <- GET(diag_link,
authenticate(Sys.getenv("GITHUB_PAT"), ""),
write_disk(path = temp_file_2))
diag_all <- read_csv(temp_file_2)
unlink(temp_file_2)
Notes: - num_patients_ever_severe_icd1
: number of patients having the icd code and ever being severe - num_patients_never_severe_icd1
: number of patients having the icd code and never being severe - num_patients_ever_severe_icd0
: number of patients not having the icd code and ever being severe - num_patients_never_severe_icd0
: number of patients not having the icd code and never being severe
site_country <- read_csv('data/SiteID_Map_Non_Pediatric_3digit_toshare.csv') %>%
select(- Peds.only) %>%
mutate(Country = case_when(
Country == 'USA' ~ Country,
TRUE ~ stringr::str_to_title(Country)))
##
## ── Column specification ────────────────────────────────────────────────────────
## cols(
## Country = col_character(),
## SiteID.new = col_character(),
## Peds.only = col_character()
## )
neuro_icds_10 <- read_excel('data/2020-09-10_neuro-icd10.xlsx') %>%
rename('icd' = `ICD-10`)
neuro_icds_9 <- readxl::read_excel('data/neuroicd10to9withdescript.xlsx') %>%
left_join(neuro_icds_10 %>% select(`Neurological Disease Category`, icd),
by = c('ICD10_CODE' = 'icd')) %>%
# select(- c(ICD9_CODE, ICD9_DESCRIPTION, ICD10_CODE)) %>%
rename('icd' = ICD9_THREE_DIGIT) %>%
filter(!is.na(icd)) %>% # ignore ICD10 code G46 and G65 not mapped to any ICD_9 code
{.}
neuro_icds_9 %>%
distinct(icd, ICD10_CODE) %>%
count(icd) %>%
filter(n > 1)
## # A tibble: 4 x 2
## icd n
## <chr> <int>
## 1 437 2
## 2 780 3
## 3 781 3
## 4 V41 2
`
For the two codes with non-specific description (780 and 781), we can include (in the Methods, Figure legend or supplementary material) the descriptions of all subcodes for each. See link here http://dbmi-ncats-test01.dbmi.pitt.edu/webclient/: no login required, and also the link below each code. I added the disease groups in parentheses for these four codes. The only group that is not consistent with the grouping based on ICD10 codes is vision/smell/taste. (For ICD10 codes, we separate vision and neuropathy, the latter of which includes smell and taste). V41 Problems with special senses and other special functions (vision/smell/taste) 437 Other and ill-defined cerebrovascular disease (vascular) 780 General symptoms (Other) https://icd.codes/icd9cm/780 781 Symptoms involving nervous and musculoskeletal systems (Other) https://icd.codes/icd9cm/781
phecode_icd9 <- read_csv('data/phecode_icd9_rolled.csv') %>%
select(icd = ICD9, icd9_desc = `ICD9 String`)
##
## ── Column specification ────────────────────────────────────────────────────────
## cols(
## ICD9 = col_character(),
## `ICD9 String` = col_character(),
## PheCode = col_character(),
## Phenotype = col_character(),
## `Excl. Phecodes` = col_character(),
## `Excl. Phenotypes` = col_character(),
## Rollup = col_double(),
## Leaf = col_double()
## )
neuro_icds_9 <- neuro_icds_9 %>%
left_join(phecode_icd9, by = 'icd')
neuro_icds_9 %>% filter(icd %in% c('V41', '780', '781', '437'))
## # A tibble: 19 x 7
## icd ICD9_CODE ICD9_DESCRIPTION ICD10_CODE ICD10_DESCRIPTI… `Neurological D…
## <chr> <chr> <chr> <chr> <chr> <chr>
## 1 437 437.7 Transient globa… G45 Transient cereb… Vascular
## 2 780 780.79 Other malaise a… G93 Other disorders… Other
## 3 V41 V41.0 Problems with s… H54 Blindness and l… Vision
## 4 437 437.0 Cerebral athero… I67 Other cerebrova… Vascular
## 5 437 437.1 Other generaliz… I67 Other cerebrova… Vascular
## 6 437 437.2 Hypertensive en… I67 Other cerebrova… Vascular
## 7 437 437.3 Cerebral aneury… I67 Other cerebrova… Vascular
## 8 437 437.4 Cerebral arteri… I67 Other cerebrova… Vascular
## 9 437 437.5 Moyamoya disease I67 Other cerebrova… Vascular
## 10 437 437.6 Nonpyogenic thr… I67 Other cerebrova… Vascular
## 11 437 437.8 Other ill-defin… I67 Other cerebrova… Vascular
## 12 437 437.9 Unspecified cer… I67 Other cerebrova… Vascular
## 13 781 781.3 Lack of coordin… R27 Other lack of c… Coordination
## 14 780 780.93 Memory loss R41 Other symptoms … Consciousness
## 15 780 780.97 Altered mental … R41 Other symptoms … Consciousness
## 16 781 781.8 Neurologic negl… R41 Other symptoms … Consciousness
## 17 780 780.4 Dizziness and g… R42 Dizziness and g… Dizziness
## 18 781 781.1 Disturbances of… R43 Disturbances of… Neuropathy
## 19 V41 V41.5 Problems with s… R43 Disturbances of… Neuropathy
## # … with 1 more variable: icd9_desc <chr>
neuro_icds_9 <- neuro_icds_9 %>%
within(., icd9_desc[icd == 'V41'] <- 'Problems with special senses and other special functions') %>%
within(., `Neurological Disease Category`[icd == 'V41'] <- 'Vision/smell/taste') %>%
within(., icd9_desc[icd == '781'] <- 'Symptoms involving nervous and musculoskeletal systems') %>%
within(., `Neurological Disease Category`[icd == '781'] <- 'Neuropathy') %>% # and Consciousness
within(., icd9_desc[icd == '437'] <- 'Other and ill-defined cerebrovascular disease') %>%
within(., icd9_desc[icd == '780'] <- 'General symptoms') %>%
within(., `Neurological Disease Category`[icd == '780'] <- 'Other') %>%
distinct() %>%
select(`Neurological Disease Category`, icd, icd9_desc) %>%
distinct() %>%
arrange(icd)
neuro_icds_9 %>%
write_csv('results/icd9_tab.csv')
Notes:
“-99” indicate masked small numbers (for obfuscation) and “-999” indicate missing, mismatches, not applicable, etc.
Diagnoses_persite_fakeID.csv
has 45 unique siteid’s. Demographics_persite_fakeID.csv
has 45 unique siteid’s.
demo_ana <- demo_all %>%
filter(race == 'all', age_group == 'all', sex == 'all') %>%
# left_join(site_to_country, by = 'siteid') %>%
select(- c(sex, age_group, race)) %>%
bind_rows(# Compute patients_all for SITE734 manually
demo_all %>%
filter(siteid == 'SITE734', race == 'all', age_group == 'all') %>%
group_by(siteid) %>%
summarise(across(c(num_patients_all, num_patients_ever_severe), .fns = sum),
.groups = 'drop')
) %>%
mutate(num_patients_never_severe = num_patients_all - num_patients_ever_severe)
split_patient_type <- function(x) {
temp <- strsplit(sub('(^[^_]+_[^_]+)_(.*)$', '\\1 \\2', stri_reverse(x)), ' ')
map(temp, stri_reverse)
}
diag_ana <- diag_all %>%
mutate(
num_patients_never_severe_before_admission = num_patients_all_before_admission - num_patients_ever_severe_before_admission,
num_patients_never_severe_since_admission = num_patients_all_since_admission - num_patients_ever_severe_since_admission,
) %>%
pivot_longer(cols = starts_with('num'), names_to = 'patient_type', values_to = 'count') %>%
bind_cols(
split_patient_type(.[['patient_type']]) %>%
do.call(rbind, .) %>%
as.data.frame() %>%
`colnames<-`(c('time', 'severe'))
) %>%
mutate(severe = recode(severe,
num_patients_all = 'num_patients_icd',
num_patients_ever_severe= 'num_patients_ever_severe_icd1',
num_patients_never_severe= 'num_patients_never_severe_icd1'),
time = recode(time,
before_admission = 'Before admission',
since_admission = 'After admission')) %>%
rename('icd' = icd_code_3chars) %>%
select(- patient_type) %>%
drop_na() %>%
left_join(site_country, by = c('siteid' = 'SiteID.new')) %>%
{.}
diag_icd_10 <- diag_ana %>%
filter(icd_version == 10) %>%
select(- icd_version) %>%
pivot_wider(names_from = severe, values_from = count) %>%
right_join(neuro_icds_10, by = 'icd') %>%
distinct() %>%
left_join(demo_ana, by = 'siteid') %>%
mutate(full_icd = paste0(`ICD-10 Description`, ' (', icd, ')'),
num_patients_never_severe_icd0 = num_patients_never_severe - num_patients_never_severe_icd1,
num_patients_ever_severe_icd0 = num_patients_ever_severe - num_patients_ever_severe_icd1,
) %>%
drop_na(num_patients_icd) %>%
{.}
diag_icd_9 <- diag_ana %>%
filter(icd_version == 9) %>%
select(- icd_version) %>%
pivot_wider(names_from = severe, values_from = count) %>%
right_join(neuro_icds_9, by = 'icd') %>%
distinct() %>%
left_join(demo_ana, by = 'siteid') %>%
mutate(full_icd = paste0(icd9_desc, ' (', icd, ')'),
num_patients_never_severe_icd0 = num_patients_never_severe - num_patients_never_severe_icd1,
num_patients_ever_severe_icd0 = num_patients_ever_severe - num_patients_ever_severe_icd1,
) %>%
drop_na(num_patients_icd) %>%
{.}
40 sites with icd-10 code. 11 sites have icd-9.
country_sum_icd10 <- diag_icd_10 %>%
select(siteid, Country, num_patients_all) %>%
distinct() %>%
group_by(Country) %>%
summarise(all_pats_country = sum(num_patients_all), .groups = 'drop')
country_sum_icd9 <- diag_icd_9 %>%
select(siteid, Country, num_patients_all) %>%
distinct() %>%
group_by(Country) %>%
summarise(all_pats_country = sum(num_patients_all), .groups = 'drop')
diag_icd_10 <- diag_icd_10 %>%
merge(country_sum_icd10, by = 'Country')
diag_icd_9 <- diag_icd_9 %>%
merge(country_sum_icd9, by = 'Country')
age_group_check <- demo_all %>%
filter(sex == 'all', race == 'all', age_group != 'all') %>%
group_by(siteid) %>%
summarise(age_group_sum = sum(num_patients_all), .groups = 'drop') %>%
right_join(demo_ana) %>%
mutate(mismatches = num_patients_all - age_group_sum) %>%
# filter(abs(mismatches) != 0) %>%
select(-contains('severe')) %>%
arrange(desc(abs(mismatches))) %>%
{.}
## Joining, by = "siteid"
race_check <- demo_all %>%
filter(sex == 'all', race != 'all', age_group == 'all') %>%
group_by(siteid) %>%
summarise(race_sum = sum(num_patients_all), .groups = 'drop') %>%
right_join(demo_ana) %>%
mutate(mismatches = num_patients_all - race_sum) %>%
# filter(abs(mismatches) != 0) %>%
select(-contains('severe')) %>%
arrange(desc(abs(mismatches))) %>%
{.}
## Joining, by = "siteid"
sex_check <- demo_all %>%
filter(race == 'all', sex != 'all', age_group == 'all') %>%
group_by(siteid) %>%
summarise(sex_sum = sum(num_patients_all), .groups = 'drop') %>%
right_join(demo_ana) %>%
mutate(mismatches = num_patients_all - sex_sum) %>%
# filter(mismatches != 0) %>%
select(-contains('severe')) %>%
arrange(desc(abs(mismatches))) %>%
{.}
## Joining, by = "siteid"
sites_no_age <- demo_all %>%
filter(siteid %in% (age_group_check %>% filter(is.na(age_group_sum)) %>% pull(siteid)),
age_group != 'all', sex != 'all', race != 'all') %>%
group_by(siteid, age_group) %>%
summarise(size = sum(num_patients_all), .groups = 'drop')
age_group_desc <- demo_all %>%
filter(sex == 'all', race == 'all', age_group != 'all') %>%
group_by(age_group, siteid) %>%
summarise(size = sum(num_patients_all, na.rm = T), .groups = 'drop') %>%
bind_rows(sites_no_age) %>%
pivot_wider(names_from = age_group, values_from = size, values_fill = list(size = 0)) %>%
mutate(sum_age_cols = rowSums(.[, -1])) %>%
right_join(age_group_check) %>%
replace(is.na(.), 0) %>%
mutate(unknown_other = other + num_patients_all - sum_age_cols) %>%
{.}
## Joining, by = "siteid"
age_comp <- age_group_desc %>%
column_to_rownames('siteid') %>%
rename('80to100' = `80plus`) %>%
select(contains('to')) %>%
apply(., 1, cumsum) %>%
t()
median_age(age_comp[2,])
## [1] 67.94007
age_med_df <- data.frame(median_age = apply(age_comp, 1, median_age)) %>%
rownames_to_column('siteid')
age_group_desc <- age_group_desc %>%
left_join(age_med_df)
## Joining, by = "siteid"
sites_no_race <- demo_all %>%
filter(siteid %in% (race_check %>% filter(is.na(race_sum)) %>% pull(siteid)),
age_group != 'all', sex != 'all', race != 'all') %>%
group_by(siteid, race) %>%
summarise(size = sum(num_patients_all), .groups = 'drop')
sites_no_race %>%
left_join(site_country, by = c('siteid' = 'SiteID.new'))
## # A tibble: 10 x 4
## siteid race size Country
## <chr> <chr> <dbl> <chr>
## 1 SITE116 asian 160 Singapore
## 2 SITE116 other 105 Singapore
## 3 SITE116 white 13 Singapore
## 4 SITE185 other 78 Germany
## 5 SITE434 black 0 Italy
## 6 SITE434 hispanic_latino 0 Italy
## 7 SITE434 other 587 Italy
## 8 SITE434 white 0 Italy
## 9 SITE926 other 945 Italy
## 10 SITE974 white 62 Italy
race_desc <- demo_all %>%
filter(sex == 'all', race != 'all', age_group == 'all') %>%
group_by(race, siteid) %>%
summarise(size = sum(num_patients_all, na.rm = T), .groups = 'drop') %>%
bind_rows(sites_no_race) %>%
pivot_wider(names_from = race, values_from = size, values_fill = list(size = 0)) %>%
mutate(sum_race_cols = rowSums(.[, -1])) %>%
right_join(race_check) %>%
replace(is.na(.), 0) %>%
mutate(unknown_other_race = other + num_patients_all - sum_race_cols) %>%
{.}
## Joining, by = "siteid"
sex_desc <- demo_all %>%
filter(sex != 'all', race == 'all', age_group == 'all') %>%
group_by(sex, siteid) %>%
summarise(size = sum(num_patients_all, na.rm = T), .groups = 'drop') %>%
pivot_wider(names_from = sex, values_from = size, values_fill = list(size = 0)) %>%
mutate(sum_sex_cols = rowSums(.[, -1])) %>%
right_join(sex_check) %>%
replace(is.na(.), 0) %>%
mutate(unknown_other = other + num_patients_all - sex_sum) %>%
{.}
## Joining, by = "siteid"
# race_desc <- demo_all %>%
# filter(sex == 'all', race != 'all', age_group == 'all') %>%
# group_by(race, siteid) %>%
# summarise(size = sum(num_patients_all, na.rm = T), .groups = 'drop') %>%
# pivot_wider(names_from = race, values_from = size, values_fill = list(size = 0)) %>%
# mutate(sum_race_cols = rowSums(.[, -1])) %>%
# right_join(race_check) %>%
# replace(is.na(.), 0) %>%
# mutate(unknown_other_race = other + num_patients_all - race_sum) %>%
# {.}
demo_desc <- left_join(sex_desc, age_group_desc, by = 'siteid', suffix = c("_sex", "_age_group")) %>%
left_join(race_desc, by = 'siteid') %>%
# Reduce(function(...) left_join(..., by = 'siteid', , '_race')),
# list(, race_desc), ) %>%
select(- c(contains('um'), contains('mismatches'))) %>%
right_join(demo_ana, ., by = 'siteid') %>%
left_join(site_country, by = c('siteid' = 'SiteID.new')) %>%
select(siteid, Country, everything())
demo_perc <- demo_desc %>%
# mutate(across(- contains('num'), ~ .x/.data$num_patients_all))
mutate_at(vars(- num_patients_all, - siteid, - Country, - median_age),
list(~ round(. / .data$num_patients_all * 100, 1)))
demo_desc %>% write_csv('results/demo_prelim.csv')
demo_perc %>% write_csv('results/demo_perc_prelim.csv')
# some NAs are because there are just not a row for with that value
sex_age <- demo_all %>%
filter(age_group != 'all', sex != 'all', race != 'all') %>%
group_by(siteid, age_group, sex) %>%
summarise(size = sum(num_patients_all), .groups = 'drop') %>%
pivot_wider(names_from = c(sex, age_group), values_from = size, values_fill = list(size = 0)) %>%
mutate(sum_age_cols = rowSums(.[, -1])) %>%
right_join(age_group_check) %>%
replace(is.na(.), 0) %>%
mutate(unknown_other = female_other + male_other + other_other + num_patients_all - sum_age_cols) %>%
select(- c(contains('um'), contains('mismatches'))) %>%
right_join(demo_ana, ., by = 'siteid') %>%
left_join(site_country, by = c('siteid' = 'SiteID.new')) %>%
select(siteid, Country, everything()) %>%
{.}
## Joining, by = "siteid"
# sex_desc <- demo_all %>%
# filter(sex != 'all', race == 'all', age_group == 'all') %>%
# group_by(sex) %>%
# summarise(!!sum_sum := sum(num_patients_all), .groups = 'drop') %>%
# mutate(category = 'Sex') %>%
# rename('Characteristics' = sex)
#
# race_desc <- demo_all %>%
# filter(sex == 'all', race != 'all', age_group == 'all') %>%
# group_by(race) %>%
# summarise(!!sum_sum := sum(num_patients_all), .groups = 'drop') %>%
# mutate(category = 'Race/Ethnicity') %>%
# rename('Characteristics' = race)
#
# bind_rows(age_group_desc, sex_desc, race_desc) %>%
# group_by(category) %>%
# gt() %>%
# tab_footnote(
# footnote = "Obfuscation at each site may affect the total counts.",
# locations = cells_column_labels(
# columns = 2)
# ) %>%
# {.}
Check NAs:
# va_vec <- c("VA1", "VA10", "VA12", "VA15", "VA16", "VA17", "VA19", "VA2", "VA20", "VA21", "VA22", "VA23", "VA4", "VA5", "VA6", "VA7", "VA8", "VA9")
# sites <- c("APHP", "ASSTPAVIA", "BIDMC", "C2WF", "FRBDX", "H12O", "HPG23", "ICSM1", "KUMC", "MCWCTSI", "MGB", "MUSC", "NUH", "NWU", "POLIMI", "RIVHS", "SLHN", "UCLA", "UKER", "UKFR", "UKY", "UMICH", "UMM", "UNICZ", "UPenn", "UPITT", "UTSW", "VA")
#
# obfuscation_df <- googlesheets4::read_sheet(
# # 'https://docs.google.com/spreadsheets/d/1VhtKIbzEOeGFG1Iw27D_gq_l3eJpKn0LiJr_f_hFdwc/edit?ts=5ec17a70#gid=0',
# 'https://docs.google.com/spreadsheets/d/1Xl9juDBXt86P3xQtsoTaBl2zPl1BIiAG9DI3Rotyqp8/edit#gid=212461777',
# skip = 1,
# col_names = c('SLHN', 'site_name', 'contact_name', 'contact_email', 'city', 'country',
# 'low_count_threshold', 'small_count_rows_deleted', 'blur_range', 'notes',
# 'irb_statement')) %>%
# filter(SLHN != 'FICHOS') %>%
# mutate(low_count_threshold = low_count_threshold %>%
# replace(low_count_threshold == '<=10', '<11') %>%
# replace(low_count_threshold == '<=5', '<6'))
#
# library(ggplot2)
# obfuscation_df %>%
# ggplot(aes(x = low_count_threshold)) +
# geom_bar()