library(dplyr)
library(tidyr)
library(readr)
library(ggplot2)
library(rcartocolor)
library(forcats)
library(purrr)
library(DT)
library(cowplot)
load('data/processed-data.Rdata')
theme_set(theme_bw() + theme(legend.title = element_blank()))
source('utils.R')Calculate percentages:
code_prevalence <- diag_icd_9 %>%
mutate(time = fct_relevel(time, c('After admission', 'Before admission'))) %>%
select(- contains('ICD10_')) %>%
group_by(siteid, time, icd, `Neurological Disease Category`, full_icd) %>%
mutate(percent_pats_site = num_patients_icd/num_patients_all,
siteid = as.factor(siteid)) %>%
ungroup() %>%
group_by(Country, time, icd, `Neurological Disease Category`, full_icd) %>%
mutate(percent_pats_country = sum(num_patients_icd)/all_pats_country) %>%
ungroup()diff_code <- code_prevalence %>%
select(location = siteid, icd, `Neurological Disease Category`, time, percent_pats_site) %>%
pivot_wider(names_from = time, values_from = percent_pats_site,
id_cols = c(location, icd, `Neurological Disease Category`),
values_fill = list(percent_pats_site = 0)) %>%
mutate(percent_diff = `After admission` - `Before admission`,
loc_type = 'Site', location = tolower(location))
diff_code_country <- code_prevalence %>%
select(location = Country, icd, `Neurological Disease Category`, time, percent_pats_country) %>%
distinct() %>%
pivot_wider(names_from = time, values_from = percent_pats_country,
id_cols = c(location, icd, `Neurological Disease Category`),
values_fill = list(percent_pats_country = 0)) %>%
mutate(percent_diff = `After admission` - `Before admission`,
loc_type = 'Country')
diff_code_loc <- diff_code %>%
bind_rows(diff_code_country)
diff_heat <- diff_code_loc %>%
ggplot(aes(y = icd, x = location, fill = percent_diff)) +
geom_tile() +
labs(y = NULL, x = NULL, fill = 'After - Before') +
scale_x_discrete(drop = F) +
scale_fill_gradient2(#009392,#39b185,#9ccb86,#e9e29c,#eeb479,#e88471,#cf597e
low = '#798234',
high = '#cf597e',
labels = scales::percent_format(accuracy = 1)) +
facet_grid(rows = vars(`Neurological Disease Category`),
cols = vars(fct_rev(loc_type)), space = 'free', scale = 'free') +
heat_theme_bottom() +
theme(panel.spacing.x = unit(10, "points"),
plot.margin = unit(c(1,8,0.5,12), "lines"),
axis.text.x = element_text(hjust = 1)) +
# theme() +
NULL
diff_heat # ggsave('figs/icd9_diff_heatmap.png', diff_heat, height = 4, width = 8)my_t <- function(icdi){
diff_icd <- diff_code %>% filter(icd == icdi)
data.frame(
icd = icdi,
t.test(diff_icd$percent_diff) %>%
broom::tidy()
)
}
prevalence_stats <- unique(diff_code$icd) %>%
lapply(my_t) %>%
bind_rows()
prevalence_stats$p_value_holm <- p.adjust(prevalence_stats$p.value, 'holm')
prevalence_stats$p_value_bh <- p.adjust(prevalence_stats$p.value, 'BH')
prevalence_stats %>%
filter(p_value_bh < 0.05)## [1] icd estimate statistic p.value parameter
## [6] conf.low conf.high method alternative p_value_holm
## [11] p_value_bh
## <0 rows> (or 0-length row.names)
prevalence_stats %>%
arrange(desc(estimate)) %>%
mutate(ci = paste0('(', round(conf.low*100, 2), '%, ',
round(conf.high*100, 2), '%)')) %>%
select(- c(parameter, method, alternative, conf.low, conf.high)) %>%
datatable(rownames = FALSE, filter = 'top') %>%
formatRound(c('statistic'), 1) %>%
formatPercentage('estimate', 1) %>%
formatSignif(c('p.value', 'p_value_holm', 'p_value_bh'), 3) %>%
{.}prevalence_stats %>%
write_csv('results/icd9_prevalence_stats.csv')of the mean proportion of patients diagnosed with each ICD
alpha_threshold <- qnorm(0.975)
ci_prevalence <- code_prevalence %>%
group_by(time, full_icd) %>%
add_count() %>%
summarise(
mean_prop = mean(percent_pats_site, na.rm = T),
sd_prob = sd(percent_pats_site, na.rm = T),
n = mean(n),
me_prop = alpha_threshold * sd_prob / sqrt(n)
) %>%
ungroup()## `summarise()` regrouping output by 'time' (override with `.groups` argument)
icd9_time <- diag_icd_9 %>%
mutate(time = fct_relevel(time, c('After admission', 'Before admission'))) %>%
group_by(full_icd, time) %>%
summarise(pats_icd9_time = sum(num_patients_icd, na.rm = T), .groups = 'drop')
sorted_icds <- code_prevalence %>%
ungroup() %>%
select(time, percent_pats_site, full_icd, siteid) %>%
pivot_wider(names_from = time, values_from = percent_pats_site, values_fill = list(percent_pats_site = 0)) %>%
group_by(full_icd) %>%
summarise(after = mean(`After admission`, na.rm = T),
before = mean(`Before admission`, na.rm = T),
diff_prev = after - before,
.groups = 'drop') %>%
arrange(diff_prev) %>%
pull(full_icd)
total_icd <- icd9_time %>%
ggplot(aes(x = pats_icd9_time, y = fct_relevel(full_icd, sorted_icds), fill = time)) +
scale_fill_carto_d(palette = 4, guide = guide_legend(reverse = TRUE)) +
geom_col(position = 'dodge') +
theme_minimal() +
scale_x_reverse(expand = expansion(add = c(0,0))) +
scale_y_discrete(labels = NULL) +
theme(legend.position = c(0.3, 0.15),
legend.title = element_blank(),
legend.key.height = unit(4, 'mm'),
plot.margin = margin(t = 1, r = 0.5, l = 0.5, unit = 'lines')) +
labs(x = 'Total number of patients at all sites', y = NULL)
percent_icd <-
ci_prevalence %>%
ggplot(aes(group = time)) +
ggstance::geom_pointrangeh(
aes(x = mean_prop,
y = fct_relevel(full_icd, sorted_icds),
xmin = mean_prop - me_prop,
xmax = mean_prop + me_prop,
color = time),
position = position_dodge(width = 0.3),
stroke = 0.1, fatten = 3, size = 0.7
) +
scale_color_carto_d(palette = 4, guide = NULL) +
theme_minimal() +
theme(legend.position = c(0.75, 0.1),
legend.title = element_blank(),
axis.text.y = element_text(hjust = 0.5),
plot.margin = margin(t = 1, unit = 'lines')) +
scale_x_continuous(expand = expansion(add = c(0, 0.03)),
labels = scales::percent_format(accuracy = 1)) +
labs(x = 'Proportion of patients each site', y = NULL)
icd9_prevalence_plots <- cowplot::plot_grid(total_icd, percent_icd, ncol = 2,
rel_widths = c(1, 2.5))
icd9_prevalence_plots# ggsave('figs/icd9_prevalence.png', icd9_prevalence_plots, height = 5, width = 10)plot_grid(diff_heat, icd9_prevalence_plots, rel_heights = c(1, 0.85),
ncol = 1, labels = 'AUTO') %>%
ggsave('figs/icd9_prevalence_AB.png', ., height = 8, width = 9.7)
plot_grid(diff_heat, icd9_prevalence_plots, rel_heights = c(1, 0.85),
ncol = 1, labels = 'AUTO') %>%
ggsave('figs/tiffs/efig_4.tiff', ., height = 8, width = 9.7, dpi = 300)before_code_prev <- diff_code_loc %>%
ggplot(aes(y = icd, x = location, fill = `Before admission`)) +
geom_tile() +
labs(y = NULL, x = NULL) +
#f6d2a9,#f5b78e,#f19c7c,#ea8171,#dd686c,#ca5268,#b13f64
scale_fill_gradient(
low = '#d1eeea', high = '#2a5674',
labels = scales::percent_format(accuracy = 1),
limits = c(0, max(diff_code_loc$`After admission`)),
guide = guide_legend(override.aes = list(fill = "white"))) +
scale_x_discrete(drop = F) +
facet_grid(rows = vars(`Neurological Disease Category`),
cols = vars(fct_rev(loc_type)), space = 'free', scale = 'free') +
heat_theme_top() +
NULL
after_code_prev <- diff_code_loc %>%
ggplot(aes(y = icd, x = location, fill = `After admission`)) +
geom_tile() +
labs(y = NULL, x = NULL, fill = 'Patient % per
site/country') +
scale_x_discrete(drop = F) +
scale_fill_gradient(low = '#d1eeea', high = '#2a5674',
labels = scales::percent_format(accuracy = 1),
limits = c(0, max(diff_code_loc$`After admission`))) +
facet_grid(rows = vars(`Neurological Disease Category`),
cols = vars(fct_rev(loc_type)), space = 'free', scale = 'free') +
heat_theme_bottom() +
theme(legend.title = element_text(hjust = 0)) +
NULL
heats <- cowplot::plot_grid(before_code_prev, after_code_prev, ncol = 1,
rel_heights = c(1, 1.15),
labels = c('A. Before admission', 'B. After admission'))
heatsggsave('figs/icd9_heatmap.png', heats, height = 10, width = 7)
ggsave('figs/tiffs/efig_3.tiff', heats, height = 10, width = 7, dpi = 300)diff_code %>%
filter(icd == 'R41') %>%
ungroup() %>%
count(percent_diff > 0)## # A tibble: 0 x 2
## # … with 2 variables: `percent_diff > 0` <lgl>, n <int>
diff_code %>%
filter(icd == 'G93') %>%
ungroup() %>%
count(percent_diff > 0)## # A tibble: 0 x 2
## # … with 2 variables: `percent_diff > 0` <lgl>, n <int>
diff_code %>%
filter(icd == 'R42') %>%
ungroup() %>%
count(percent_diff > 0)## # A tibble: 0 x 2
## # … with 2 variables: `percent_diff > 0` <lgl>, n <int>
What ICD code has the most number severe patients? Sites with more severe patients?
severe_code <- diag_icd_9 %>%
mutate(percent_severe_site = num_patients_ever_severe_icd1/num_patients_icd,
siteid = as.factor(siteid))
severe_bef_heat <- severe_code %>%
filter(time == 'Before admission') %>%
ggplot(aes(y = icd, x = siteid, fill = percent_severe_site)) +
geom_tile() +
labs(y = NULL, x = NULL, fill = 'Severe % per ICD') +
#3d5941,#778868,#b5b991,#f6edbd,#edbb8a,#de8a5a,#ca562c
scale_fill_gradient(
# low = '#f6d2a9', high = '#b13f64',
low = 'white', high = '#ca562c',
guide = guide_legend(override.aes = list(fill = "white")),
) +
scale_x_discrete(drop = F) +
facet_grid(rows = vars(`Neurological Disease Category`), space = 'free', scale = 'free') +
heat_theme_top() +
NULL
severe_aft_heat <- severe_code %>%
filter(time == 'After admission') %>%
ggplot(aes(y = icd, x = siteid, fill = percent_severe_site)) +
geom_tile() +
labs(y = NULL, x = NULL, fill = 'Severe % per ICD') +
scale_x_discrete(drop = F) +
#3d5941,#778868,#b5b991,#f6edbd,#edbb8a,#de8a5a,#ca562c
scale_fill_gradient(
low = 'white', high = '#ca562c',
labels = scales::percent_format(accuracy = 1)) +
facet_grid(rows = vars(`Neurological Disease Category`), space = 'free', scale = 'free') +
heat_theme_bottom() +
NULL
severe_heats <- cowplot::plot_grid(severe_bef_heat, severe_aft_heat, ncol = 1,
rel_heights = c(1, 1.15),
labels = c('A. Before admission', 'B. After admission'))
severe_heatsggsave('figs/icd9_severe_heatmap.png', severe_heats, height = 10, width = 7)For each ICD code, the proportion of severe patients who were diagnosed with that ICD code is similar to the proportion of never-severe patients who were diagnosed with that same ICD code.
For the sake of simplicity in this notebook, we’re going to denote patients whose symptoms have been categorized as severe (based on respiratory status +/- requiring ICU) at least once as severe patients, and patients whose symptoms have NEVER been categorized as severe as non-severe patients.
For each ICD code, we computed the expected number of severe patients by multiplying the proportion of non-severe patients who were diagnosed with that code with the total number of severe patients. We performed an enrichment analysis to examine the difference of severe patients proportions across ICD codes. We calculated each ICD code’s enrichment by dividing the observed proportion of honorees by the expected proportion of honorees and reported a value of log2 enrichment (LOE) and its 95% confidence intervals. The 95% confidence interval of the LOE was estimated using the Poisson model method [@isbn:9780849394447].
Note that several ICD-9 codes have very small sample size (small number of patients associated with these codes). Therefore, we should take that into account when interpreting the result of the statistical tests for these codes.
First, remove ICD with very small number of patients across site:
small_icds <- diag_icd_9 %>%
group_by(icd) %>%
summarise(num_pats_icd = sum(num_patients_icd, na.rm = TRUE), .groups = 'drop') %>%
filter(num_pats_icd < 5) %>%
pull(icd)
diag_icd_9 <- diag_icd_9 %>%
filter(!icd %in% small_icds)contingency_df <- diag_icd_9 %>%
# filter(time == 'Before admission') %>%
group_by(full_icd, time) %>%
summarise(across(contains('severe'), .fns = sum, na.rm = T), .groups = 'drop') %>%
mutate(Observed = num_patients_ever_severe_icd1,
num_non_severe = num_patients_never_severe_icd1 + num_patients_never_severe_icd0,
num_severe = num_patients_ever_severe_icd1 + num_patients_ever_severe_icd0,
Expected = num_patients_never_severe_icd1/num_non_severe*num_severe,
over_sev = Observed - Expected)
nested_obs_exp <- contingency_df %>%
select(full_icd,
time,
num_patients_never_severe_icd0,
num_patients_never_severe_icd1,
num_patients_ever_severe_icd0,
num_patients_ever_severe_icd1) %>%
group_by(full_icd, time) %>%
nest()
fish_obs_exp <- nested_obs_exp %>%
mutate(fish = map(data, my_fish)) %>%
dplyr::select(-data) %>%
unnest(cols = c(fish)) %>%
mutate(upper = ifelse(is.na(upper), Inf, upper),
ci = paste0('(', round(log2(lower), 1), ', ',
round(log2(upper), 1), ')'),
lestimate = log2(estimate))
fish_obs_exp$`P value (Holm)` <- p.adjust(fish_obs_exp$p_value, 'holm')
fish_obs_exp$`P value (FDR)` <- p.adjust(fish_obs_exp$p_value, 'BH')While the Warning messages mentioned Chi-squared, the p-values were actually calculated using Fisher’s Exact test (see more in epitools::tab2by2.test()).
Note: small number of observations for ICD-10 code G04, G03, G65:
contingency_df %>%
filter(grepl('G04|G03|G65', full_icd)) %>%
datatable()The full table with all ICD codes and their corresponding enrichment can be browsed interactively below:
library(DT)
fish_tab <- fish_obs_exp %>%
left_join(contingency_df, by = c('full_icd', 'time')) %>%
select(full_icd, time, Observed, Expected, over_sev,
estimate, lestimate, ci, p_value,
`P value (Holm)`, `P value (FDR)`) %>%
arrange(desc(over_sev)) %>%
rename('ICD' = 'full_icd',
'Observed - Expected' = 'over_sev',
'Enrichment' = 'estimate',
'Log2(enrichment)' = 'lestimate',
'95% Confidence interval' = 'ci',
'P value (raw)' = 'p_value')
fish_tab %>%
datatable(rownames = FALSE, filter = 'top') %>%
formatRound(c('Observed', 'Expected', 'Observed - Expected',
'Enrichment', 'Log2(enrichment)'), 1) %>%
formatSignif(c('P value (raw)', 'P value (Holm)', 'P value (FDR)'), 3) %>%
{.}fish_tab %>%
write_csv('results/icd9_fish_tab.csv')
fish_obs_exp %>%
filter(`P value (FDR)` < 0.05) %>%
arrange(time) %>%
mutate(lestimate = round(lestimate, 2),
drr = round((estimate - 1)*100, 0),
lower_drr = (lower - 1)*100,
upper_drr = (upper - 1)*100,
ci_drr = paste0('(', round(lower_drr, 0), ', ',
round(upper_drr, 0), ')'),
p_fdr = format(`P value (FDR)`, digits = 2)) %>%
select(time, full_icd, lestimate, drr, ci_drr, p_fdr) %>%
write_csv('results/icd9_signi.csv')A positive value of LOE indicates a higher proportion of severe patients with that ICD code compared to non-severe patients. A LOE value of 1 represents a one-fold enrichment (i.e., observed number of severe patients is twice as much as expected). We found an excess of severe patients with the following ICD codes:
filtered_obs_exp <- contingency_df %>%
left_join(fish_obs_exp, by = c('full_icd', 'time')) %>%
mutate(
distance_to_null = case_when(
lower > 1 ~ lower - 1,
TRUE ~ upper - 2
),
presentation = case_when(
lower > 1 & `P value (FDR)` < 0.05 ~ '#d46780',
upper < 1 & `P value (FDR)` < 0.05 ~ '#798234',
TRUE ~ 'grey20'
)) %>%
{.}sorted_icds <- filtered_obs_exp %>%
filter(time == 'After admission') %>%
arrange(Expected) %>%
pull(full_icd)
plot_obs_exp <- filtered_obs_exp %>%
mutate(lestimate = log2(estimate),
llower = log2(lower),
lupper = log2(upper)) %>%
select(full_icd, time, lestimate, llower, lupper, presentation, over_sev, Observed, Expected) %>%
pivot_longer(- c(full_icd, time, presentation, over_sev), names_to = 'type') %>%
mutate(subtype = ifelse(type == 'Expected' | type == 'Observed',
'Sqrt(number of honorees)',
'Log2 enrichment, 95% CI')) %>%
pivot_wider(names_from = type) %>%
mutate(presentation = as.factor(presentation),
full_icd = fct_relevel(full_icd, sorted_icds))
plot_obs_exp_right <- plot_obs_exp %>% filter(subtype == 'Sqrt(number of honorees)')
plot_obs_exp_left <- plot_obs_exp %>% filter(subtype != 'Sqrt(number of honorees)') enrichment_plot_before <- plot_enrich(
plot_obs_exp_left %>% filter(time == 'Before admission') %>%
mutate(presentation = droplevels(presentation)),
plot_obs_exp_right %>% filter(time == 'Before admission') %>%
slice(match(sorted_icds, full_icd)) %>%
mutate(presentation = droplevels(presentation)),
nudge = 1.5)## Warning: Vectorized input to `element_text()` is not officially supported.
## Results may be unexpected or may change in future versions of ggplot2.
enrichment_plot_after <- plot_enrich(
plot_obs_exp_left %>% filter(time == 'After admission') %>%
mutate(presentation = droplevels(presentation)),
plot_obs_exp_right %>% filter(time == 'After admission') %>% slice(match(sorted_icds, full_icd)) %>%
mutate(presentation = droplevels(presentation)),
nudge = 1.5)## Warning: Vectorized input to `element_text()` is not officially supported.
## Results may be unexpected or may change in future versions of ggplot2.
enrichment_plot <- cowplot::plot_grid(
enrichment_plot_before,
enrichment_plot_after,
ncol = 1, hjust = -0.05,
labels = c('A. Before admission', 'B. After admission')
)
enrichment_plotggsave('figs/icd9_severe_after.png', enrichment_plot_after, width = 9.5, height = 3.5)
ggsave('figs/tiffs/efig_5.tiff', enrichment_plot_after, width = 9.5, height = 3.5, dpi = 300)Figure caption: Each ICD code’s log2 enrichment (LOE) and its 95% confidence interval (left), and the absolute difference between observed (triangle) and expected (circle) number of severe patients (right) before admission. Positive value of LOE indicates a higher proportion of severe patients with that ICD code compared to non-severe patients. Neurological ICD codes are ordered based on the number of severe patients. Difference has been rounded.
plot_enrich_both <- plot_obs_exp %>%
ggplot(aes(y = fct_relevel(full_icd, sorted_icds))) +
geom_vline(aes(xintercept = 0), linetype = 2) +
ggstance::geom_pointrangeh(
aes(x = lestimate,
xmin = llower,
xmax = lupper,
color = time),
position = position_dodge(width = 0.3),
stroke = 0.1, fatten = 3, size = 0.7
) +
labs(y = NULL, x = bquote(Log[2] ~ 'enrichment, 95% CI')) +
theme(
legend.position = 'bottom',
legend.background = element_blank(),
axis.title = element_text(size = 9),
plot.margin = margin(5.5, 2, 5.5, 5.5, unit = 'pt')
) +
scale_color_carto_d(palette = 4, direction = -1) +
NULL
plot_enrich_both## Warning: Removed 50 rows containing missing values (geom_pointrangeh).
ggsave('figs/icd9_severe.png', plot_enrich_both, width = 9, height = 5)## Warning: Removed 50 rows containing missing values (geom_pointrangeh).