-->

multinominal regression with imputed data

2019-02-28 06:59发布

问题:

I need to impute missing data and then coduct multinomial regression with the generated datasets. I have tried using mice for the imputing and then multinom function from nnet for the multnomial regression. But this gives me unreadable output. Here is an example using the nhanes2 dataset available with the mice package:

library(mice)
library(nnet)

test <- mice(nhanes2, meth=c('sample','pmm','logreg','norm'))
#age is categorical, bmi is continuous
m <- with(test, multinom(age ~ bmi, model = T))
summary(pool(m))

m1 <- with(test, lm(bmi ~ age, model = T))
summary(pool(m1))

With the code above, the output of 'm' lists the coefficients as 1, 2, 3 and 4 instead of their variable names. The output for m1 when using the lm command instead comes out correctly. This seems to be an issue with running the multinomial command on a mira object. Does anyone have any suggestions on how to fix this or suggestions for other imputation and multinomial regression methods that would work better?

回答1:

EDIT

This has now been updated on the development branch of mice, and executes as expected: see https://github.com/stefvanbuuren/mice/issues/85


There are a couple of issues when using mulitnom. The pool method mixes up the ordering of the coefficients and standard errors - so (asaik) the object returned from using pool is not correct (actually mixes up is a bit strong - there is no specific pool method for multinom models, and so it uses the default which doesn't quite work in this case) .

Also, as you mention, then names are dropped - this is due to pool calling names(coef(modelobject)), but multinom models return a matrix of coefficients, and so don't have names (they have rownames & colnames).

So you can alter the pool function to cater for multinom models - see pooly below (actually you could write a much smaller function that just deals with this model class, but I chose to write a quick, more general approach which should hopefully not break for other model classes, but with caveat, I have not fully tested that it didn't.)

Test with your example

library(mice)
library(nnet)

test <- mice(nhanes2, meth=c('polyreg','pmm','logreg','norm'), print=0)
m <- with(test, multinom(age ~ bmi, model = T))
summary(pooly(m))
#                          est        se         t       df   Pr(>|t|)      lo 95       hi 95 nmis       fmi    lambda
# 40-59:(Intercept)  5.8960594 4.5352921  1.300040 12.82882 0.21646037 -3.9151498 15.70726867   NA 0.2951384 0.1931975
# 40-59:bmi         -0.2356516 0.1669807 -1.411250 12.81702 0.18198189 -0.5969156  0.12561248   NA 0.2955593 0.1935923
# 60-99:(Intercept)  8.2723321 4.7656701  1.735817 15.55876 0.10235284 -1.8537729 18.39843700   NA 0.1989371 0.1021831
# 60-99:bmi         -0.3364014 0.1832718 -1.835533 15.03938 0.08627846 -0.7269469  0.05414413   NA 0.2174394 0.1198595
# 

Compare to original which (I think) mixes up the coefs & se's

summary(pool(m))

Define function to accept multinom models. The added code has comments next to it.

pooly <- function (object, method = "smallsample") {
    call <- match.call()
    if (!is.mira(object)) 
        stop("The object must have class 'mira'")
    m <- length(object$analyses)
    fa <- getfit(object, 1)
    if (m == 1) {
        warning("Number of multiple imputations m=1. No pooling done.")
        return(fa)
    }
    analyses <- getfit(object)
    if (class(fa)[1] == "lme" && !requireNamespace("nlme", quietly = TRUE)) 
        stop("Package 'nlme' needed fo this function to work. Please install it.", 
            call. = FALSE)
    if ((class(fa)[1] == "mer" || class(fa)[1] == "lmerMod" || 
        inherits(fa, "merMod")) && !requireNamespace("lme4", 
        quietly = TRUE)) 
        stop("Package 'lme4' needed fo this function to work. Please install it.", 
            call. = FALSE)
    mess <- try(coef(fa), silent = TRUE)
    if (inherits(mess, "try-error")) 
        stop("Object has no coef() method.")
    mess <- try(vcov(fa), silent = TRUE)
    if (inherits(mess, "try-error")) 
        stop("Object has no vcov() method.")
    if (class(fa)[1] == "mer" || class(fa)[1] == "lmerMod" || 
        inherits(fa, "merMod")) {
        k <- length(lme4::fixef(fa))
        names <- names(lme4::fixef(fa))
    }
    else if (class(fa)[1] == "polr") {
        k <- length(coef(fa)) + length(fa$zeta)
        names <- c(names(coef(fa)), names(fa$zeta))
    }
    # added this ---------------------------------
    else if (class(fa)[1] == "multinom") {
        k <- length(coef(fa)) 
        names <- rownames(vcov(fa))
    }
    # --------------------------------------------
    else {
        k <- length(coef(fa))
        names <- names(coef(fa)) 
    }
    qhat <- matrix(NA, nrow = m, ncol = k, dimnames = list(seq_len(m), 
        names))
    u <- array(NA, dim = c(m, k, k), dimnames = list(seq_len(m), 
        names, names))
    for (i in seq_len(m)) {
        fit <- analyses[[i]]
        if (class(fit)[1] == "mer") {
            qhat[i, ] <- lme4::fixef(fit)
            ui <- as.matrix(vcov(fit))
            if (ncol(ui) != ncol(qhat)) 
                stop("Different number of parameters: class mer, fixef(fit): ", 
                  ncol(qhat), ", as.matrix(vcov(fit)): ", ncol(ui))
            u[i, , ] <- array(ui, dim = c(1, dim(ui)))
        }
        else if (class(fit)[1] == "lmerMod" || inherits(fa, "merMod")) {
            qhat[i, ] <- lme4::fixef(fit)
            ui <- vcov(fit)
            if (ncol(ui) != ncol(qhat)) 
                stop("Different number of parameters: class lmerMod, fixed(fit): ", 
                  ncol(qhat), ", vcov(fit): ", ncol(ui))
            u[i, , ] <- array(ui, dim = c(1, dim(ui)))
        }
        else if (class(fit)[1] == "lme") {
            qhat[i, ] <- fit$coefficients$fixed
            ui <- vcov(fit)
            if (ncol(ui) != ncol(qhat)) 
                stop("Different number of parameters: class lme, fit$coefficients$fixef: ", 
                  ncol(qhat), ", vcov(fit): ", ncol(ui))
            u[i, , ] <- array(ui, dim = c(1, dim(ui)))
        }
        else if (class(fit)[1] == "polr") {
            qhat[i, ] <- c(coef(fit), fit$zeta)
            ui <- vcov(fit)
            if (ncol(ui) != ncol(qhat)) 
                stop("Different number of parameters: class polr, c(coef(fit, fit$zeta): ", 
                  ncol(qhat), ", vcov(fit): ", ncol(ui))
            u[i, , ] <- array(ui, dim = c(1, dim(ui)))
        }
        else if (class(fit)[1] == "survreg") {
            qhat[i, ] <- coef(fit)
            ui <- vcov(fit)
            parnames <- dimnames(ui)[[1]]
            select <- !(parnames %in% "Log(scale)")
            ui <- ui[select, select]
            if (ncol(ui) != ncol(qhat)) 
                stop("Different number of parameters: class survreg, coef(fit): ", 
                  ncol(qhat), ", vcov(fit): ", ncol(ui))
            u[i, , ] <- array(ui, dim = c(1, dim(ui)))
        }
        # added this block -------------------------------------
        else if (class(fit)[1] == "multinom") {
            qhat[i, ] <- c(t(coef(fit))) # transpose to get same order as standard errors
            ui <- vcov(fit)
            if (ncol(ui) != ncol(qhat)) 
                stop("Different number of parameters: class multinom, c(coef(fit)): ", 
                  ncol(qhat), ", vcov(fit): ", ncol(ui))
            u[i, , ] <- array(ui, dim = c(1, dim(ui)))
        }
        # ----------------------------------------------------
        else {
            qhat[i, ] <- coef(fit)
            ui <- vcov(fit)
            ui <- expandvcov(qhat[i, ], ui)
            if (ncol(ui) != ncol(qhat)) 
                stop("Different number of parameters: coef(fit): ", 
                  ncol(qhat), ", vcov(fit): ", ncol(ui))
            u[i, , ] <- array(ui, dim = c(1, dim(ui)))
        }
    }
    qbar <- apply(qhat, 2, mean)
    ubar <- apply(u, c(2, 3), mean)
    e <- qhat - matrix(qbar, nrow = m, ncol = k, byrow = TRUE)
    b <- (t(e) %*% e)/(m - 1)
    t <- ubar + (1 + 1/m) * b
    r <- (1 + 1/m) * diag(b/ubar)
    lambda <- (1 + 1/m) * diag(b/t)
    dfcom <- df.residual(object)
    df <- mice.df(m, lambda, dfcom, method)
    fmi <- (r + 2/(df + 3))/(r + 1)
    names(r) <- names(df) <- names(fmi) <- names(lambda) <- names
    fit <- list(call = call, call1 = object$call, call2 = object$call1, 
        nmis = object$nmis, m = m, qhat = qhat, u = u, qbar = qbar, 
        ubar = ubar, b = b, t = t, r = r, dfcom = dfcom, df = df, 
        fmi = fmi, lambda = lambda)
    oldClass(fit) <- c("mipo", oldClass(object))
    return(fit)
}

environment(pooly) <- environment(mice)