vignettes/vig_04_mi.Rmd
vig_04_mi.Rmd
Missing data is the norm in real-life data analysis. Multiple imputation via the mice
package is a popular option in R. Here we introduce simple missingness and demonstrate use of regmedint
along with mice
.
For demonstration purpose, missing data is introduced here.
set.seed(138087069)
library(regmedint)
library(tidyverse)
## Prepare dataset
data(vv2015)
vv2015 <- vv2015 %>%
select(-cens) %>%
## Generate exposure-dependent missing of mediator
mutate(logit_p_m_miss = -1 + 0.5 * x,
p_m_miss = exp(logit_p_m_miss) / (1 + exp(logit_p_m_miss)),
## Indicator draw
ind_m_miss = rbinom(n = length(p_m_miss), size = 1, prob = p_m_miss),
m_true = m,
m = if_else(ind_m_miss == 1L, as.numeric(NA), m))
Taking the advantage of the simulated setting, the true model is fit here.
regmedint_true <-
regmedint(data = vv2015,
## Variables
yvar = "y",
avar = "x",
mvar = "m_true",
cvar = c("c"),
eventvar = "event",
## Values at which effects are evaluated
a0 = 0,
a1 = 1,
m_cde = 1,
c_cond = 0.5,
## Model types
mreg = "logistic",
yreg = "survAFT_weibull",
## Additional specification
interaction = TRUE,
casecontrol = FALSE)
summary(regmedint_true)
## ### Mediator model
##
## Call:
## glm(formula = m_true ~ x + c, family = binomial(link = "logit"),
## data = data)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.5143 -1.1765 0.9177 1.1133 1.4602
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.3545 0.3252 -1.090 0.276
## x 0.3842 0.4165 0.922 0.356
## c 0.2694 0.2058 1.309 0.191
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 138.59 on 99 degrees of freedom
## Residual deviance: 136.08 on 97 degrees of freedom
## AIC: 142.08
##
## Number of Fisher Scoring iterations: 4
##
## ### Outcome model
##
## Call:
## survival::survreg(formula = Surv(y, event) ~ x + m_true + x:m_true +
## c, data = data, dist = "weibull")
## Value Std. Error z p
## (Intercept) -1.0424 0.1903 -5.48 4.3e-08
## x 0.4408 0.3008 1.47 0.14
## m_true 0.0905 0.2683 0.34 0.74
## c -0.0669 0.0915 -0.73 0.46
## x:m_true 0.1003 0.4207 0.24 0.81
## Log(scale) -0.0347 0.0810 -0.43 0.67
##
## Scale= 0.966
##
## Weibull distribution
## Loglik(model)= -11.4 Loglik(intercept only)= -14.5
## Chisq= 6.31 on 4 degrees of freedom, p= 0.18
## Number of Newton-Raphson Iterations: 5
## n= 100
##
## ### Mediation analysis
## est se Z p lower upper
## cde 0.541070807 0.29422958 1.8389409 0.06592388 -0.03560858 1.11775019
## pnde 0.488930417 0.21049248 2.3227928 0.02019028 0.07637274 0.90148809
## tnie 0.018240025 0.03706111 0.4921608 0.62260566 -0.05439841 0.09087846
## tnde 0.498503455 0.21209540 2.3503737 0.01875457 0.08280410 0.91420281
## pnie 0.008666987 0.02730994 0.3173565 0.75097309 -0.04485951 0.06219348
## te 0.507170442 0.21090051 2.4047853 0.01618197 0.09381303 0.92052785
## pm 0.045436278 0.09119614 0.4982259 0.61832484 -0.13330488 0.22417743
##
## Evaluated at:
## avar: x
## a1 (intervened value of avar) = 1
## a0 (reference value of avar) = 0
## mvar: m_true
## m_cde (intervend value of mvar for cde) = 1
## cvar: c
## c_cond (covariate vector value) = 0.5
##
## Note that effect estimates can vary over m_cde and c_cond values when interaction = TRUE.
regmedint_cca <- vv2015 %>%
filter(!is.na(m)) %>%
regmedint(data = .,
## Variables
yvar = "y",
avar = "x",
mvar = "m",
cvar = c("c"),
eventvar = "event",
## Values at which effects are evaluated
a0 = 0,
a1 = 1,
m_cde = 1,
c_cond = 0.5,
## Model types
mreg = "logistic",
yreg = "survAFT_weibull",
## Additional specification
interaction = TRUE,
casecontrol = FALSE)
summary(regmedint_cca)
## ### Mediator model
##
## Call:
## glm(formula = m ~ x + c, family = binomial(link = "logit"), data = data)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.306 -1.133 -1.028 1.183 1.360
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.2500 0.3880 -0.644 0.519
## x 0.1278 0.4883 0.262 0.794
## c 0.1587 0.2415 0.657 0.511
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 99.758 on 71 degrees of freedom
## Residual deviance: 99.287 on 69 degrees of freedom
## AIC: 105.29
##
## Number of Fisher Scoring iterations: 3
##
## ### Outcome model
##
## Call:
## survival::survreg(formula = Surv(y, event) ~ x + m + x:m + c,
## data = data, dist = "weibull")
## Value Std. Error z p
## (Intercept) -1.2689 0.2229 -5.69 1.2e-08
## x 0.7213 0.3315 2.18 0.03
## m 0.4517 0.2985 1.51 0.13
## c -0.0652 0.1110 -0.59 0.56
## x:m -0.2579 0.4750 -0.54 0.59
## Log(scale) -0.0581 0.0958 -0.61 0.54
##
## Scale= 0.944
##
## Weibull distribution
## Loglik(model)= -6.1 Loglik(intercept only)= -10.5
## Chisq= 8.79 on 4 degrees of freedom, p= 0.066
## Number of Newton-Raphson Iterations: 5
## n= 72
##
## ### Mediation analysis
## est se Z p lower upper
## cde 0.463323084 0.34636957 1.3376553 0.18100884 -0.21554880 1.14219497
## pnde 0.582515084 0.24557485 2.3720470 0.01768984 0.10119722 1.06383295
## tnie 0.006182822 0.02643657 0.2338738 0.81508294 -0.04563191 0.05799755
## tnde 0.574385442 0.24784035 2.3175623 0.02047312 0.08862728 1.06014360
## pnie 0.014312464 0.05541066 0.2582980 0.79617690 -0.09429043 0.12291536
## te 0.588697906 0.24809627 2.3728608 0.01765092 0.10243816 1.07495766
## pm 0.013852661 0.05856686 0.2365273 0.81302354 -0.10093628 0.12864160
##
## Evaluated at:
## avar: x
## a1 (intervened value of avar) = 1
## a0 (reference value of avar) = 0
## mvar: m
## m_cde (intervend value of mvar for cde) = 1
## cvar: c
## c_cond (covariate vector value) = 0.5
##
## Note that effect estimates can vary over m_cde and c_cond values when interaction = TRUE.
This specific data setting is a little tricky in that the outcome variable is a censored survival time variable. Here we will use a log transformed survival time.
library(mice)
vv2015_mod <- vv2015 %>%
mutate(log_y = log(y)) %>%
select(x,m,c,log_y,event)
## Run mice
vv2015_mice <- mice(data = vv2015_mod, m = 50, printFlag = FALSE)
## Object containig 50 imputed dataset
vv2015_mice
## Class: mids
## Number of multiple imputations: 50
## Imputation methods:
## x m c log_y event
## "" "pmm" "" "" ""
## PredictorMatrix:
## x m c log_y event
## x 0 1 1 1 1
## m 1 0 1 1 1
## c 1 1 0 1 1
## log_y 1 1 1 0 1
## event 1 1 1 1 0
After creating such MI datasets, mediation analysis can be performed in each dataset.
## Fit in each MI dataset
vv2015_mice_regmedint <-
vv2015_mice %>%
## Stacked up dataset
mice::complete("long") %>%
as_tibble() %>%
mutate(y = exp(log_y)) %>%
group_by(.imp) %>%
## Nested data frame
nest() %>%
mutate(fit = map(data, function(data) {
regmedint(data = data,
## Variables
yvar = "y",
avar = "x",
mvar = "m",
cvar = c("c"),
eventvar = "event",
## Values at which effects are evaluated
a0 = 0,
a1 = 1,
m_cde = 1,
c_cond = 0.5,
## Model types
mreg = "logistic",
yreg = "survAFT_weibull",
## Additional specification
interaction = TRUE,
casecontrol = FALSE)
})) %>%
## Extract point estimates and variance estimates
mutate(coef_fit = map(fit, coef),
vcov_fit = map(fit, vcov))
vv2015_mice_regmedint
## # A tibble: 50 × 5
## # Groups: .imp [50]
## .imp data fit coef_fit vcov_fit
## <int> <list> <list> <list> <list>
## 1 1 <tibble [100 × 7]> <regmednt [4]> <dbl [7]> <dbl [7 × 7]>
## 2 2 <tibble [100 × 7]> <regmednt [4]> <dbl [7]> <dbl [7 × 7]>
## 3 3 <tibble [100 × 7]> <regmednt [4]> <dbl [7]> <dbl [7 × 7]>
## 4 4 <tibble [100 × 7]> <regmednt [4]> <dbl [7]> <dbl [7 × 7]>
## 5 5 <tibble [100 × 7]> <regmednt [4]> <dbl [7]> <dbl [7 × 7]>
## 6 6 <tibble [100 × 7]> <regmednt [4]> <dbl [7]> <dbl [7 × 7]>
## 7 7 <tibble [100 × 7]> <regmednt [4]> <dbl [7]> <dbl [7 × 7]>
## 8 8 <tibble [100 × 7]> <regmednt [4]> <dbl [7]> <dbl [7 × 7]>
## 9 9 <tibble [100 × 7]> <regmednt [4]> <dbl [7]> <dbl [7 × 7]>
## 10 10 <tibble [100 × 7]> <regmednt [4]> <dbl [7]> <dbl [7 × 7]>
## # … with 40 more rows
The results can be combined using the mitools package.
regmedint_mi <- mitools::MIcombine(results = vv2015_mice_regmedint$coef_fit,
variances = vv2015_mice_regmedint$vcov_fit)
regmedint_mi_summary <- summary(regmedint_mi)
## Multiple imputation results:
## MIcombine.default(results = vv2015_mice_regmedint$coef_fit, variances = vv2015_mice_regmedint$vcov_fit)
## results se (lower upper) missInfo
## cde 0.35334826 0.31280990 -0.25988229 0.9665788 9 %
## pnde 0.48716281 0.21292928 0.06982892 0.9044967 0 %
## tnie 0.00486299 0.03096690 -0.05584982 0.0655758 11 %
## tnde 0.47324382 0.21699611 0.04793551 0.8985521 2 %
## pnie 0.01878197 0.06178469 -0.10246510 0.1400291 23 %
## te 0.49202580 0.21430966 0.07198654 0.9120651 0 %
## pm 0.01232649 0.07897926 -0.14251938 0.1671724 11 %
We can observe the MI estimtates are generally more in alignment with the true estimates than the complete-case analysis estimates.
## true cca mi
## cde 0.541070807 0.463323084 0.35334826
## pnde 0.488930417 0.582515084 0.48716281
## tnie 0.018240025 0.006182822 0.00486299
## tnde 0.498503455 0.574385442 0.47324382
## pnie 0.008666987 0.014312464 0.01878197
## te 0.507170442 0.588697906 0.49202580
## pm 0.045436278 0.013852661 0.01232649