library(dplyr)
library(tidyr)
library(readr)
library(ggplot2)
library(rcartocolor)
library(forcats)
library(purrr)
library(DT)
library(cowplot)
load('data/processed-data.Rdata')
theme_set(theme_bw() + theme(legend.title = element_blank()))
source('utils.R')
Calculate percentages:
<- diag_icd_9 %>%
code_prevalence mutate(time = fct_relevel(time, c('After admission', 'Before admission'))) %>%
select(- contains('ICD10_')) %>%
group_by(siteid, time, icd, `Neurological Disease Category`, full_icd) %>%
mutate(percent_pats_site = num_patients_icd/num_patients_all,
siteid = as.factor(siteid)) %>%
ungroup() %>%
group_by(Country, time, icd, `Neurological Disease Category`, full_icd) %>%
mutate(percent_pats_country = sum(num_patients_icd)/all_pats_country) %>%
ungroup()
<- code_prevalence %>%
diff_code select(location = siteid, icd, `Neurological Disease Category`, time, percent_pats_site) %>%
pivot_wider(names_from = time, values_from = percent_pats_site,
id_cols = c(location, icd, `Neurological Disease Category`),
values_fill = list(percent_pats_site = 0)) %>%
mutate(percent_diff = `After admission` - `Before admission`,
loc_type = 'Site', location = tolower(location))
<- code_prevalence %>%
diff_code_country select(location = Country, icd, `Neurological Disease Category`, time, percent_pats_country) %>%
distinct() %>%
pivot_wider(names_from = time, values_from = percent_pats_country,
id_cols = c(location, icd, `Neurological Disease Category`),
values_fill = list(percent_pats_country = 0)) %>%
mutate(percent_diff = `After admission` - `Before admission`,
loc_type = 'Country')
<- diff_code %>%
diff_code_loc bind_rows(diff_code_country)
<- diff_code_loc %>%
diff_heat ggplot(aes(y = icd, x = location, fill = percent_diff)) +
geom_tile() +
labs(y = NULL, x = NULL, fill = 'After - Before') +
scale_x_discrete(drop = F) +
scale_fill_gradient2(#009392,#39b185,#9ccb86,#e9e29c,#eeb479,#e88471,#cf597e
low = '#798234',
high = '#cf597e',
labels = scales::percent_format(accuracy = 1)) +
facet_grid(rows = vars(`Neurological Disease Category`),
cols = vars(fct_rev(loc_type)), space = 'free', scale = 'free') +
heat_theme_bottom() +
theme(panel.spacing.x = unit(10, "points"),
plot.margin = unit(c(1,8,0.5,12), "lines"),
axis.text.x = element_text(hjust = 1)) +
# theme() +
NULL
diff_heat
# ggsave('figs/icd9_diff_heatmap.png', diff_heat, height = 4, width = 8)
<- function(icdi){
my_t <- diff_code %>% filter(icd == icdi)
diff_icd data.frame(
icd = icdi,
t.test(diff_icd$percent_diff) %>%
::tidy()
broom
)
}<- unique(diff_code$icd) %>%
prevalence_stats lapply(my_t) %>%
bind_rows()
$p_value_holm <- p.adjust(prevalence_stats$p.value, 'holm')
prevalence_stats$p_value_bh <- p.adjust(prevalence_stats$p.value, 'BH')
prevalence_stats%>%
prevalence_stats filter(p_value_bh < 0.05)
## [1] icd estimate statistic p.value parameter
## [6] conf.low conf.high method alternative p_value_holm
## [11] p_value_bh
## <0 rows> (or 0-length row.names)
%>%
prevalence_stats arrange(desc(estimate)) %>%
mutate(ci = paste0('(', round(conf.low*100, 2), '%, ',
round(conf.high*100, 2), '%)')) %>%
select(- c(parameter, method, alternative, conf.low, conf.high)) %>%
datatable(rownames = FALSE, filter = 'top') %>%
formatRound(c('statistic'), 1) %>%
formatPercentage('estimate', 1) %>%
formatSignif(c('p.value', 'p_value_holm', 'p_value_bh'), 3) %>%
{.}
%>%
prevalence_stats write_csv('results/icd9_prevalence_stats.csv')
of the mean proportion of patients diagnosed with each ICD
<- qnorm(0.975)
alpha_threshold
<- code_prevalence %>%
ci_prevalence group_by(time, full_icd) %>%
add_count() %>%
summarise(
mean_prop = mean(percent_pats_site, na.rm = T),
sd_prob = sd(percent_pats_site, na.rm = T),
n = mean(n),
me_prop = alpha_threshold * sd_prob / sqrt(n)
%>%
) ungroup()
## `summarise()` regrouping output by 'time' (override with `.groups` argument)
<- diag_icd_9 %>%
icd9_time mutate(time = fct_relevel(time, c('After admission', 'Before admission'))) %>%
group_by(full_icd, time) %>%
summarise(pats_icd9_time = sum(num_patients_icd, na.rm = T), .groups = 'drop')
<- code_prevalence %>%
sorted_icds ungroup() %>%
select(time, percent_pats_site, full_icd, siteid) %>%
pivot_wider(names_from = time, values_from = percent_pats_site, values_fill = list(percent_pats_site = 0)) %>%
group_by(full_icd) %>%
summarise(after = mean(`After admission`, na.rm = T),
before = mean(`Before admission`, na.rm = T),
diff_prev = after - before,
.groups = 'drop') %>%
arrange(diff_prev) %>%
pull(full_icd)
<- icd9_time %>%
total_icd ggplot(aes(x = pats_icd9_time, y = fct_relevel(full_icd, sorted_icds), fill = time)) +
scale_fill_carto_d(palette = 4, guide = guide_legend(reverse = TRUE)) +
geom_col(position = 'dodge') +
theme_minimal() +
scale_x_reverse(expand = expansion(add = c(0,0))) +
scale_y_discrete(labels = NULL) +
theme(legend.position = c(0.3, 0.15),
legend.title = element_blank(),
legend.key.height = unit(4, 'mm'),
plot.margin = margin(t = 1, r = 0.5, l = 0.5, unit = 'lines')) +
labs(x = 'Total number of patients at all sites', y = NULL)
<-
percent_icd %>%
ci_prevalence ggplot(aes(group = time)) +
::geom_pointrangeh(
ggstanceaes(x = mean_prop,
y = fct_relevel(full_icd, sorted_icds),
xmin = mean_prop - me_prop,
xmax = mean_prop + me_prop,
color = time),
position = position_dodge(width = 0.3),
stroke = 0.1, fatten = 3, size = 0.7
+
) scale_color_carto_d(palette = 4, guide = NULL) +
theme_minimal() +
theme(legend.position = c(0.75, 0.1),
legend.title = element_blank(),
axis.text.y = element_text(hjust = 0.5),
plot.margin = margin(t = 1, unit = 'lines')) +
scale_x_continuous(expand = expansion(add = c(0, 0.03)),
labels = scales::percent_format(accuracy = 1)) +
labs(x = 'Proportion of patients each site', y = NULL)
<- cowplot::plot_grid(total_icd, percent_icd, ncol = 2,
icd9_prevalence_plots rel_widths = c(1, 2.5))
icd9_prevalence_plots
# ggsave('figs/icd9_prevalence.png', icd9_prevalence_plots, height = 5, width = 10)
plot_grid(diff_heat, icd9_prevalence_plots, rel_heights = c(1, 0.85),
ncol = 1, labels = 'AUTO') %>%
ggsave('figs/icd9_prevalence_AB.png', ., height = 8, width = 9.7)
plot_grid(diff_heat, icd9_prevalence_plots, rel_heights = c(1, 0.85),
ncol = 1, labels = 'AUTO') %>%
ggsave('figs/tiffs/efig_4.tiff', ., height = 8, width = 9.7, dpi = 300)
<- diff_code_loc %>%
before_code_prev ggplot(aes(y = icd, x = location, fill = `Before admission`)) +
geom_tile() +
labs(y = NULL, x = NULL) +
#f6d2a9,#f5b78e,#f19c7c,#ea8171,#dd686c,#ca5268,#b13f64
scale_fill_gradient(
low = '#d1eeea', high = '#2a5674',
labels = scales::percent_format(accuracy = 1),
limits = c(0, max(diff_code_loc$`After admission`)),
guide = guide_legend(override.aes = list(fill = "white"))) +
scale_x_discrete(drop = F) +
facet_grid(rows = vars(`Neurological Disease Category`),
cols = vars(fct_rev(loc_type)), space = 'free', scale = 'free') +
heat_theme_top() +
NULL
<- diff_code_loc %>%
after_code_prev ggplot(aes(y = icd, x = location, fill = `After admission`)) +
geom_tile() +
labs(y = NULL, x = NULL, fill = 'Patient % per
site/country') +
scale_x_discrete(drop = F) +
scale_fill_gradient(low = '#d1eeea', high = '#2a5674',
labels = scales::percent_format(accuracy = 1),
limits = c(0, max(diff_code_loc$`After admission`))) +
facet_grid(rows = vars(`Neurological Disease Category`),
cols = vars(fct_rev(loc_type)), space = 'free', scale = 'free') +
heat_theme_bottom() +
theme(legend.title = element_text(hjust = 0)) +
NULL
<- cowplot::plot_grid(before_code_prev, after_code_prev, ncol = 1,
heats rel_heights = c(1, 1.15),
labels = c('A. Before admission', 'B. After admission'))
heats
ggsave('figs/icd9_heatmap.png', heats, height = 10, width = 7)
ggsave('figs/tiffs/efig_3.tiff', heats, height = 10, width = 7, dpi = 300)
%>%
diff_code filter(icd == 'R41') %>%
ungroup() %>%
count(percent_diff > 0)
## # A tibble: 0 x 2
## # … with 2 variables: `percent_diff > 0` <lgl>, n <int>
%>%
diff_code filter(icd == 'G93') %>%
ungroup() %>%
count(percent_diff > 0)
## # A tibble: 0 x 2
## # … with 2 variables: `percent_diff > 0` <lgl>, n <int>
%>%
diff_code filter(icd == 'R42') %>%
ungroup() %>%
count(percent_diff > 0)
## # A tibble: 0 x 2
## # … with 2 variables: `percent_diff > 0` <lgl>, n <int>
What ICD code has the most number severe patients? Sites with more severe patients?
<- diag_icd_9 %>%
severe_code mutate(percent_severe_site = num_patients_ever_severe_icd1/num_patients_icd,
siteid = as.factor(siteid))
<- severe_code %>%
severe_bef_heat filter(time == 'Before admission') %>%
ggplot(aes(y = icd, x = siteid, fill = percent_severe_site)) +
geom_tile() +
labs(y = NULL, x = NULL, fill = 'Severe % per ICD') +
#3d5941,#778868,#b5b991,#f6edbd,#edbb8a,#de8a5a,#ca562c
scale_fill_gradient(
# low = '#f6d2a9', high = '#b13f64',
low = 'white', high = '#ca562c',
guide = guide_legend(override.aes = list(fill = "white")),
+
) scale_x_discrete(drop = F) +
facet_grid(rows = vars(`Neurological Disease Category`), space = 'free', scale = 'free') +
heat_theme_top() +
NULL
<- severe_code %>%
severe_aft_heat filter(time == 'After admission') %>%
ggplot(aes(y = icd, x = siteid, fill = percent_severe_site)) +
geom_tile() +
labs(y = NULL, x = NULL, fill = 'Severe % per ICD') +
scale_x_discrete(drop = F) +
#3d5941,#778868,#b5b991,#f6edbd,#edbb8a,#de8a5a,#ca562c
scale_fill_gradient(
low = 'white', high = '#ca562c',
labels = scales::percent_format(accuracy = 1)) +
facet_grid(rows = vars(`Neurological Disease Category`), space = 'free', scale = 'free') +
heat_theme_bottom() +
NULL
<- cowplot::plot_grid(severe_bef_heat, severe_aft_heat, ncol = 1,
severe_heats rel_heights = c(1, 1.15),
labels = c('A. Before admission', 'B. After admission'))
severe_heats
ggsave('figs/icd9_severe_heatmap.png', severe_heats, height = 10, width = 7)
For each ICD code, the proportion of severe patients who were diagnosed with that ICD code is similar to the proportion of never-severe patients who were diagnosed with that same ICD code.
For the sake of simplicity in this notebook, we’re going to denote patients whose symptoms have been categorized as severe (based on respiratory status +/- requiring ICU) at least once as severe patients, and patients whose symptoms have NEVER been categorized as severe as non-severe patients.
For each ICD code, we computed the expected number of severe patients by multiplying the proportion of non-severe patients who were diagnosed with that code with the total number of severe patients. We performed an enrichment analysis to examine the difference of severe patients proportions across ICD codes. We calculated each ICD code’s enrichment by dividing the observed proportion of honorees by the expected proportion of honorees and reported a value of log2 enrichment (LOE) and its 95% confidence intervals. The 95% confidence interval of the LOE was estimated using the Poisson model method [@isbn:9780849394447].
Note that several ICD-9 codes have very small sample size (small number of patients associated with these codes). Therefore, we should take that into account when interpreting the result of the statistical tests for these codes.
First, remove ICD with very small number of patients across site:
<- diag_icd_9 %>%
small_icds group_by(icd) %>%
summarise(num_pats_icd = sum(num_patients_icd, na.rm = TRUE), .groups = 'drop') %>%
filter(num_pats_icd < 5) %>%
pull(icd)
<- diag_icd_9 %>%
diag_icd_9 filter(!icd %in% small_icds)
<- diag_icd_9 %>%
contingency_df # filter(time == 'Before admission') %>%
group_by(full_icd, time) %>%
summarise(across(contains('severe'), .fns = sum, na.rm = T), .groups = 'drop') %>%
mutate(Observed = num_patients_ever_severe_icd1,
num_non_severe = num_patients_never_severe_icd1 + num_patients_never_severe_icd0,
num_severe = num_patients_ever_severe_icd1 + num_patients_ever_severe_icd0,
Expected = num_patients_never_severe_icd1/num_non_severe*num_severe,
over_sev = Observed - Expected)
<- contingency_df %>%
nested_obs_exp select(full_icd,
time,
num_patients_never_severe_icd0,
num_patients_never_severe_icd1,
num_patients_ever_severe_icd0, %>%
num_patients_ever_severe_icd1) group_by(full_icd, time) %>%
nest()
<- nested_obs_exp %>%
fish_obs_exp mutate(fish = map(data, my_fish)) %>%
::select(-data) %>%
dplyrunnest(cols = c(fish)) %>%
mutate(upper = ifelse(is.na(upper), Inf, upper),
ci = paste0('(', round(log2(lower), 1), ', ',
round(log2(upper), 1), ')'),
lestimate = log2(estimate))
$`P value (Holm)` <- p.adjust(fish_obs_exp$p_value, 'holm')
fish_obs_exp$`P value (FDR)` <- p.adjust(fish_obs_exp$p_value, 'BH') fish_obs_exp
While the Warning messages mentioned Chi-squared, the p-values were actually calculated using Fisher’s Exact test (see more in epitools::tab2by2.test()
).
Note: small number of observations for ICD-10 code G04, G03, G65:
%>%
contingency_df filter(grepl('G04|G03|G65', full_icd)) %>%
datatable()
The full table with all ICD codes and their corresponding enrichment can be browsed interactively below:
library(DT)
<- fish_obs_exp %>%
fish_tab left_join(contingency_df, by = c('full_icd', 'time')) %>%
select(full_icd, time, Observed, Expected, over_sev,
estimate, lestimate, ci, p_value,`P value (Holm)`, `P value (FDR)`) %>%
arrange(desc(over_sev)) %>%
rename('ICD' = 'full_icd',
'Observed - Expected' = 'over_sev',
'Enrichment' = 'estimate',
'Log2(enrichment)' = 'lestimate',
'95% Confidence interval' = 'ci',
'P value (raw)' = 'p_value')
%>%
fish_tab datatable(rownames = FALSE, filter = 'top') %>%
formatRound(c('Observed', 'Expected', 'Observed - Expected',
'Enrichment', 'Log2(enrichment)'), 1) %>%
formatSignif(c('P value (raw)', 'P value (Holm)', 'P value (FDR)'), 3) %>%
{.}
%>%
fish_tab write_csv('results/icd9_fish_tab.csv')
%>%
fish_obs_exp filter(`P value (FDR)` < 0.05) %>%
arrange(time) %>%
mutate(lestimate = round(lestimate, 2),
drr = round((estimate - 1)*100, 0),
lower_drr = (lower - 1)*100,
upper_drr = (upper - 1)*100,
ci_drr = paste0('(', round(lower_drr, 0), ', ',
round(upper_drr, 0), ')'),
p_fdr = format(`P value (FDR)`, digits = 2)) %>%
select(time, full_icd, lestimate, drr, ci_drr, p_fdr) %>%
write_csv('results/icd9_signi.csv')
A positive value of LOE indicates a higher proportion of severe patients with that ICD code compared to non-severe patients. A LOE value of 1 represents a one-fold enrichment (i.e., observed number of severe patients is twice as much as expected). We found an excess of severe patients with the following ICD codes:
<- contingency_df %>%
filtered_obs_exp left_join(fish_obs_exp, by = c('full_icd', 'time')) %>%
mutate(
distance_to_null = case_when(
> 1 ~ lower - 1,
lower TRUE ~ upper - 2
),presentation = case_when(
> 1 & `P value (FDR)` < 0.05 ~ '#d46780',
lower < 1 & `P value (FDR)` < 0.05 ~ '#798234',
upper TRUE ~ 'grey20'
%>%
)) {.}
<- filtered_obs_exp %>%
sorted_icds filter(time == 'After admission') %>%
arrange(Expected) %>%
pull(full_icd)
<- filtered_obs_exp %>%
plot_obs_exp mutate(lestimate = log2(estimate),
llower = log2(lower),
lupper = log2(upper)) %>%
select(full_icd, time, lestimate, llower, lupper, presentation, over_sev, Observed, Expected) %>%
pivot_longer(- c(full_icd, time, presentation, over_sev), names_to = 'type') %>%
mutate(subtype = ifelse(type == 'Expected' | type == 'Observed',
'Sqrt(number of honorees)',
'Log2 enrichment, 95% CI')) %>%
pivot_wider(names_from = type) %>%
mutate(presentation = as.factor(presentation),
full_icd = fct_relevel(full_icd, sorted_icds))
<- plot_obs_exp %>% filter(subtype == 'Sqrt(number of honorees)')
plot_obs_exp_right <- plot_obs_exp %>% filter(subtype != 'Sqrt(number of honorees)') plot_obs_exp_left
<- plot_enrich(
enrichment_plot_before %>% filter(time == 'Before admission') %>%
plot_obs_exp_left mutate(presentation = droplevels(presentation)),
%>% filter(time == 'Before admission') %>%
plot_obs_exp_right slice(match(sorted_icds, full_icd)) %>%
mutate(presentation = droplevels(presentation)),
nudge = 1.5)
## Warning: Vectorized input to `element_text()` is not officially supported.
## Results may be unexpected or may change in future versions of ggplot2.
<- plot_enrich(
enrichment_plot_after %>% filter(time == 'After admission') %>%
plot_obs_exp_left mutate(presentation = droplevels(presentation)),
%>% filter(time == 'After admission') %>% slice(match(sorted_icds, full_icd)) %>%
plot_obs_exp_right mutate(presentation = droplevels(presentation)),
nudge = 1.5)
## Warning: Vectorized input to `element_text()` is not officially supported.
## Results may be unexpected or may change in future versions of ggplot2.
<- cowplot::plot_grid(
enrichment_plot
enrichment_plot_before,
enrichment_plot_after,ncol = 1, hjust = -0.05,
labels = c('A. Before admission', 'B. After admission')
) enrichment_plot
ggsave('figs/icd9_severe_after.png', enrichment_plot_after, width = 9.5, height = 3.5)
ggsave('figs/tiffs/efig_5.tiff', enrichment_plot_after, width = 9.5, height = 3.5, dpi = 300)
Figure caption: Each ICD code’s log2 enrichment (LOE) and its 95% confidence interval (left), and the absolute difference between observed (triangle) and expected (circle) number of severe patients (right) before admission. Positive value of LOE indicates a higher proportion of severe patients with that ICD code compared to non-severe patients. Neurological ICD codes are ordered based on the number of severe patients. Difference has been rounded.
<- plot_obs_exp %>%
plot_enrich_both ggplot(aes(y = fct_relevel(full_icd, sorted_icds))) +
geom_vline(aes(xintercept = 0), linetype = 2) +
::geom_pointrangeh(
ggstanceaes(x = lestimate,
xmin = llower,
xmax = lupper,
color = time),
position = position_dodge(width = 0.3),
stroke = 0.1, fatten = 3, size = 0.7
+
) labs(y = NULL, x = bquote(Log[2] ~ 'enrichment, 95% CI')) +
theme(
legend.position = 'bottom',
legend.background = element_blank(),
axis.title = element_text(size = 9),
plot.margin = margin(5.5, 2, 5.5, 5.5, unit = 'pt')
+
) scale_color_carto_d(palette = 4, direction = -1) +
NULL
plot_enrich_both
## Warning: Removed 50 rows containing missing values (geom_pointrangeh).
ggsave('figs/icd9_severe.png', plot_enrich_both, width = 9, height = 5)
## Warning: Removed 50 rows containing missing values (geom_pointrangeh).