library(mvnimpute)
This tutorial describes how to use the mvnimpute
package. We will go through an example dataset generated by the
data.generation function in the package.
#### data generation
# Sample size and number of variables
n <- 500
p <- 6
# set.seed(133)
# NOTE(review): the seed is left disabled, so every run draws a different
# covariance matrix and dataset.

# Mean vector, one entry per variable
m <- c(1, 1, 2, 4, 3, 5)
# Randomly generated positive definite covariance matrix
v <- clusterGeneration::genPositiveDefMat(p, covMethod = "eigen")[["Sigma"]]

# Indices of the variables that receive missing / censored values
miss.var <- c(1, 2)
censor.var <- c(3, 5)

# Simulate the example dataset
example.data <- data.generation(
  num_ind = n,
  mean_vec = m,
  cov_mat = v,
  miss_var = miss.var,
  miss_mech = "MAR",
  miss_prob = NULL,
  censor_var = censor.var,
  censor_type = "interval",
  censor_param = 0.5
)

# Show which components the generator returned
names(example.data)
#> [1] "full.data" "observe.data" "bounds" "indicator"
We generate a dataset with a sample size of 500, a mean
vector of six specified values, and a random positive definite
covariance matrix. The output contains four objects:
full.data, the data matrix that would have been observed
without missing and censoring information; observe.data,
the truly observed data matrix with missing and censoring information;
bounds, the bounds information of the variables that have
missing or censored values. Specifically, we take missing as a special
type of interval censoring with no censoring limits (censoring limits
are set as \(\pm \infty\));
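indicator, the matrix indicating the data type of each value: 0 for
missing data, 1 for observed data, and a larger code for censored data
(the interval-censored values generated here are coded as 4, as the
output below shows).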
Now, let us take a look at the dataset
# Pull the four components out of the generated list by exact name
full <- example.data[["full.data"]]
observed <- example.data[["observe.data"]]
censoring.bounds <- example.data[["bounds"]]
indicator <- example.data[["indicator"]]

# Print the last rows of the observed data matrix
tail(observed)
#> y1 y2 y3 y4 y5 y6
#> [495,] -1.547567 NA NaN 2.315895 NaN 3.907107
#> [496,] NA -0.2994182 NaN 4.492385 6.786569 2.777854
#> [497,] 1.952071 0.9228392 3.27131928 4.488331 NaN 6.952747
#> [498,] -2.256520 -1.4893098 -0.06679569 4.408467 NaN 2.853682
#> [499,] 2.890608 -2.3451872 -5.92138055 6.106857 4.939494 4.159683
#> [500,] 2.510733 2.0193359 NaN 4.387197 3.525996 6.315236
# Censoring bounds information: print the last rows of each of the two
# bound matrices in turn (first element, then second element)
tail(censoring.bounds[[1]])
tail(censoring.bounds[[2]])
#> y1 y2 y3 y4 y5 y6
#> [495,] -1.547567 NA 0.15034900 2.315895 1.4321201 3.907107
#> [496,] NA -0.2994182 0.85767392 4.492385 6.7865694 2.777854
#> [497,] 1.952071 0.9228392 3.27131928 4.488331 3.1594592 6.952747
#> [498,] -2.256520 -1.4893098 -0.06679569 4.408467 0.0873068 2.853682
#> [499,] 2.890608 -2.3451872 -5.92138055 6.106857 4.9394944 4.159683
#> [500,] 2.510733 2.0193359 0.26097282 4.387197 3.5259965 6.315236
#> y1 y2 y3 y4 y5 y6
#> [495,] -1.547567 NA 0.69562111 2.315895 6.794582 3.907107
#> [496,] NA -0.2994182 3.20116375 4.492385 6.786569 2.777854
#> [497,] 1.952071 0.9228392 3.27131928 4.488331 5.118294 6.952747
#> [498,] -2.256520 -1.4893098 -0.06679569 4.408467 4.010630 2.853682
#> [499,] 2.890608 -2.3451872 -5.92138055 6.106857 4.939494 4.159683
#> [500,] 2.510733 2.0193359 5.08191846 4.387197 3.525996 6.315236
# Print the last rows of the matrix that flags the data type of each value
tail(indicator) # indicator matrix
#> y1 y2 y3 y4 y5 y6
#> [495,] 1 0 4 1 4 1
#> [496,] 0 1 4 1 1 1
#> [497,] 1 1 1 1 4 1
#> [498,] 1 1 1 1 4 1
#> [499,] 1 1 1 1 1 1
#> [500,] 1 1 4 1 1 1
In the observed data matrix, missing values are denoted by NA and
censored values are denoted by NaN.
Missing values have their censoring limits set to \(\pm 10,000\) (we use \(\pm 10,000\) as proxies for \(\pm \infty\)). The indicator matrix labels
missing values as 0 and interval-censored values as 4. The
visual.plot function can then be used to generate a plot that shows the
percentages of observed, missing and censored values.
visual.plot(indicator, title = NULL)
For multiple imputation, we use a Normal-Inverse-Wishart distribution as the joint prior distribution of the mean vector and covariance matrix. We run the MCMC simulation for 1,000 iterations.
### prior specifications
# Hyperparameters passed to the sampler as the prior specification.
# NOTE(review): presumably the Normal-Inverse-Wishart parameters (prior mean,
# scale matrix, prior sample size, degrees of freedom) -- confirm against the
# multiple.imputation documentation.
prior.spec <- list(
  mu.0 = rep(0, p),
  Lambda.0 = diag(10, p),
  kappa.0 = 100,
  nu.0 = p * (p + 1) / 2
)

# Starting values for the mean vector and covariance matrix
start.vals <- list(
  mu = rep(1, p),
  sigma = diag(p)
)

### MCMC simulation
iter <- 1000
sim.res <- multiple.imputation(
  censoring.bounds,
  prior.spec,
  start.vals,
  iter,
  TRUE # NOTE(review): unnamed flag, presumably verbosity -- confirm
)

# Convergence plots of the simulated mean and variance values
conv.plot(
  sim.res$simulated.mu, 0, iter,
  title = "convergence plot of the mean values"
)
conv.plot(
  sim.res$simulated.sig, 0, iter,
  title = "convergence plot of the variance values"
)

# Autocorrelation plots, one title per variable
title <- paste("acf: variable", seq_len(p))
acf.calc(sim.res$simulated.mu, title = title)
acf.calc(sim.res$simulated.sig, title = title)
From the convergence plots and the autocorrelation plots, we see that the posterior distributions of both the mean and variance seem to converge to equilibrium within the first few iterations.
We draw and compare the marginal density plots of the full data, the complete-case (CC) data, and the imputed data from the last MCMC iteration produced by the mvnimpute package.
# For variables 1, 2, 3 and 5 (the entries of miss.var and censor.var),
# overlay three marginal densities on one plot:
#   solid default-colour line : full data
#   col = 6, lty = 2 (dashed) : observed values only
#   col = 4, lty = 3 (dotted) : imputed data from the last iteration
for (i in c(1, 2, 3, 5)) {
  plot(density(full[, i]), main = colnames(full)[i])
  # is.na() is TRUE for both NA and NaN, so this drops missing and
  # censored entries alike
  lines(density(observed[!is.na(observed[, i]), i]), col = 6, lty = 2)
  lines(density(sim.res$imputed.data[[iter]][, i]), col = 4, lty = 3)
}
The black line is the density of the full data, the blue dotted line is the density of the data imputed using the mvnimpute package, and the pink dashed line is the density of the CC data. We see that the imputed-data density aligns with the full-data density quite well.
# Compare the coefficients of the regression of y4 on the other variables
# obtained from (a) the data-generating parameters, (b) the full data,
# (c) the pooled multiply-imputed data and (d) the complete cases.
reg.formula <- y4 ~ y1 + y2 + y3 + y5 + y6

full.reg <- coef(lm(reg.formula, data = data.frame(full)))
# lm() drops rows containing NA/NaN, so this is the complete-case fit
cc.reg <- coef(lm(reg.formula, data = data.frame(observed)))

### true coefficients
# Population regression of variable 4 on the rest, derived from the mean
# vector `m` and covariance matrix `v` used to generate the data:
#   slopes    = v[-4, -4]^{-1} v[-4, 4]
#   intercept = m[4] - sum(slopes * m[-4])
resp <- 4
true.slopes <- drop(solve(v[-resp, -resp], v[-resp, resp]))
true.beta <- c(m[resp] - sum(true.slopes * m[-resp]), true.slopes)
names(true.beta) <- names(full.reg)

### imputed data
# Iterations whose imputed datasets are analysed and pooled
gibbs <- seq(800, 1000, 50)
#### mvnimpute
# One imputed dataset per selected iteration, with columns named y1..yp
mvnimpute.dat <- lapply(gibbs, function(g) {
  dat <- sim.res$imputed.data[[g]]
  colnames(dat) <- paste0("y", seq_len(p))
  dat
})
# Fit the same regression to every imputed dataset
model.param <- lapply(mvnimpute.dat, function(dat) {
  lm(reg.formula, data = data.frame(dat))
})
# Pool the fits across imputations
mvnimpute.mod <- summary(mice::pool(model.param))
# Second column of the pooled summary.
# NOTE(review): presumably the pooled estimate -- confirm for the installed
# mice version.
sim.reg <- mvnimpute.mod[, 2]

reg.compare <- data.frame(
  true = true.beta,
  full = full.reg,
  mvnimpute = sim.reg,
  cc = cc.reg
)
reg.compare
#> true full mvnimpute cc
#> (Intercept) 3.73975211 4.07379080 4.06506771 4.215508888
#> y1 0.03742948 0.02917612 0.02735768 -0.004483437
#> y2 0.02449541 0.09440116 0.10648160 0.069068147
#> y3 0.01128354 0.01674152 0.01285773 0.017741180
#> y5 0.01602758 0.05832823 0.05773384 0.055510475
#> y6 0.02553464 -0.05083920 -0.04895012 -0.035768977
# Restore the options stored in `old`.
# NOTE(review): `old` is not assigned anywhere in the visible part of this
# file; presumably it comes from an earlier `old <- options(...)` call --
# confirm, otherwise this line fails with "object 'old' not found".
options(old)