Export R data.frame to SPSS

2019-05-26 17:13发布

问题:

There is a package foreign with a function write.foreign() that can write a SPS and CSV file. The SPS file than can read the CSV fiel into SPSS including labels. Fine so far, but there are some issues with that function:

  1. Newer SPSS versions may show an error that you have too few format definitions in DATA LIST
  2. If there are "labels" for numeric variables stored via attr(), these are lost.
  3. Even if the SPSS vesion supports strings up to 32767, the function write.foreign() stops if there are more than 255 in any variable.
  4. Theres a star (*) if any character variables are used, but newer SPSS versions cannot handle that.
  5. The CSV file is comma-separated and does (can) not use quotes, therefore no commas are allowed in strings (character)
  6. Non-ASCII caracters (e.g. umlauts) will crash the import
  7. Should you have a character that contains any NA value, you'll see...

... an error message like this:

Error in if (any(lengths > 255L)) stop("Cannot handle character variables longer than 255") : 
    missing value where TRUE/FALSE needed

I spent a lot of time with that and then found a good posting (http://r.789695.n4.nabble.com/SPSS-export-in-R-package-foreign-td921491.html) to start on and make it better. Here's my result, I'd like to share with you.

回答1:

To export an R data.frame to SPSS, use write_sav from the haven package:

library(haven)
write_sav(mtcars, "mtcars.sav")


回答2:

The SPSS extension command STATS GET R can read a data frame directly into an SPSS dataset from a saved R workspace. If this extension command is not already installed (it will show up on the File menu), it can be installed from the Utilities menu (Statistics 22-23) or the Extensions menu (Statistics 24+).



回答3:

This function is a replacement for foreign:write.foreign to handle the issues stated above.

Note: To avoid issues with SPSS finding the CSV file, please specify the full path (!) at least for datafile (also if using the original foreign:write.foreign()).

Note: This script will replace a tabulator (TAB) and other spacing (incl. CR+LF) in strings by a blank without warning. One may consider using GET DATA instead of the troublesome DATA LIST to solve that limitation.

Note: There may be a warning In FUN(X[[i]], ...) : probable complete loss of accuracy in modulus - this refers to counting the decimals and can be ignored.

Note: POSIXlt and POSIXct variables are not yet handled by the script properly.

writeForeignMySPSS = function (df, datafile, codefile, varnames = NULL, len = 32767) {
    adQuote <-  function (x) paste("\"", x, "\"", sep = "")

    # Last variable must not be empty for DATA LIST
    if (any(is.na(df[[length(df)]]))) {
        df$END_CASE = 0
    }

    # http://stackoverflow.com/questions/5173692/how-to-return-number-of-decimal-places-in-r
    decimalplaces <- function(x) {
        y = x[!is.na(x)]
        if (length(y) == 0) {
            return(0)
        }
        if (any((y %% 1) != 0)) {
            info = strsplit(sub('0+$', '', as.character(y)), ".", fixed=TRUE)
            info = info[sapply(info, FUN=length) == 2]
            if (length(info) >= 2) {
              dec = nchar(unlist(info))[seq(2, length(info), 2)]
            } else {
              return(0)
            }
            return(max(dec, na.rm=T))
        } else {
            return(0)
        }
    }

    dfn <- lapply(df, function(x) if (is.factor(x))
        as.numeric(x)
        else x)

    # Boolean variables (dummy coding)
    bv = sapply(dfn, is.logical)
    for (v in which(bv)) {
        dfn[[v]] = ifelse(dfn[[v]], 1, 0)
    }

    varlabels <- names(df)
    # Use comments where applicable
    for (i in 1:length(df)) {
      cm = comment(df[[i]])
      if (is.character(cm) && (length(cm) > 0)) {
        varlabels[i] = comment(df[[i]])
      }
    }

    if (is.null(varnames)) {
        varnames <- abbreviate(names(df), 8L)
        if (any(sapply(varnames, nchar) > 8L))
            stop("I cannot abbreviate the variable names to eight or fewer letters")
        if (any(varnames != varlabels))
            warning("some variable names were abbreviated")
    }
    varnames <- gsub("[^[:alnum:]_\\$@#]", "\\.", varnames)
    dl.varnames <- varnames
    chv = sapply(df, is.character)
    if (any(chv)) {
        for (v in which(chv)) {
            dfn[[v]] = gsub("\\s", " ", dfn[[v]])
        }
        lengths <- sapply(df[chv], function(v) max(nchar(v), na.rm=T))
        if (any(lengths > len)) {
            warning(paste("Clipped strings in", names(df[chv]), "to", len, "characters"))
            for (v in which(chv)) {
                df[[v]] = substr(df[[v]], start=1, stop=len)
            }
        }
        lengths[is.infinite(lengths)] = 0
        lengths[lengths < 1] = 1
        lengths <- paste("(A", lengths, ")", sep = "")
        # star <- ifelse(c(FALSE, diff(which(chv) > 1)), " *",
        dl.varnames[chv] <- paste(dl.varnames[chv], lengths)
    }

    # decimals and bools
    nmv = sapply(df, is.numeric)
    dbv = sapply(df, is.numeric)
    nv = (nmv | dbv)
    decimals = sapply(df[nv], FUN=decimalplaces)
    dl.varnames[nv] = paste(dl.varnames[nv], " (F", decimals+8, ".", decimals, ")", sep="")
    if (length(bv) > 0) {
        dl.varnames[bv] = paste(dl.varnames[bv], "(F1.0)")
    }
    rmv = !(chv | nv | bv)
    if (length(rmv) > 0) {
        dl.varnames[rmv] = paste(dl.varnames[rmv], "(F8.0)")
    }
    # Breaks in output
    brv = seq(1, length(dl.varnames), 10)
    dl.varnames[brv] = paste(dl.varnames[brv], "\n", sep=" ")

    cat("SET LOCALE = ENGLISH.\n", file = codefile)
    cat("DATA LIST FILE=", adQuote(datafile), " free (TAB)\n", file = codefile, append = TRUE)
    cat("/", dl.varnames, " .\n\n", file = codefile, append = TRUE)
    cat("VARIABLE LABELS\n", file = codefile, append = TRUE)
    cat(paste(varnames, adQuote(varlabels), "\n"), ".\n", file = codefile,
        append = TRUE)
    factors <- sapply(df, is.factor)
    if (any(factors)) {
        cat("\nVALUE LABELS\n", file = codefile, append = TRUE)
        for (v in which(factors)) {
            cat("/\n", file = codefile, append = TRUE)
            cat(varnames[v], " \n", file = codefile, append = TRUE)
            levs <- levels(df[[v]])
            cat(paste(1:length(levs), adQuote(levs), "\n", sep = " "),
                file = codefile, append = TRUE)
        }
        cat(".\n", file = codefile, append = TRUE)
    }

    # Labels stored in attr()
    attribs <- !unlist(lapply(sapply(df, FUN=attr, which="1"), FUN=is.null))
    if (any(attribs)) {
        cat("\nVALUE LABELS\n", file = codefile, append = TRUE)
        for (v in which(attribs)) {
            cat("/\n", file = codefile, append = TRUE)
            cat(varnames[v], " \n", file = codefile, append = TRUE)
            # Check labeled values
            tc = list()
            for (tcv in dimnames(table(df[[v]]))[[1]]) {
                if (!is.null(tcl <- attr(df[[v]], tcv))) {
                    tc[tcv] = tcl
                }
            }
            cat(paste(names(tc), tc, "\n", sep = " "),
                file = codefile, append = TRUE)
        }
        cat(".\n", file = codefile, append = TRUE)
    }

    ordinal <- sapply(df, is.ordered)
    if (any(ordinal)) {
        tmp = varnames[ordinal]
        brv = seq(1, length(tmp), 10)
        tmp[brv] = paste(tmp[brv], "\n")
        cat(paste("\nVARIABLE LEVEL", paste(tmp, collapse=" "), "(ORDINAL).\n"),
            file = codefile, append = TRUE)
    }
    num <- sapply(df, is.numeric)
    if (any(num)) {
        tmp = varnames[num]
        brv = seq(1, length(tmp), 10)
        tmp[brv] = paste(tmp[brv], "\n")
        cat(paste("\nVARIABLE LEVEL", paste(tmp, collapse=" "), "(SCALE).\n"),
            file = codefile, append = TRUE)
    }
    cat("\nEXECUTE.\n", file = codefile, append = TRUE)

    write.table(dfn, file = datafile, row = FALSE, col = FALSE,
                sep = "\t", quote = F, na = "", eol = "\n", fileEncoding="UTF-8")
}

On the long term, the changes might be considered to be merged into the foreignpackage. Unfortunately, the bug reporting system for the r-project is currently limited to previously registered developers.



标签: r csv export spss