I have a table with an unequal number of elements per row, stored in string format
File1 A B C
File2 A B D
File3 E F
I want to convert it into the following format
      A B C D E F
File1 1 1 1 0 0 0
File2 1 1 0 1 0 0
File3 0 0 0 0 1 1
I tried to do it using reshape2 but was not successful.
Sample data:
mydata <- structure(list(V1 = c("File1", "File2", "File3"),
V2 = c("A", "A", "E"), V3 = c("B", "B", "F"),
V4 = c("C", "D", "")),
.Names = c("V1", "V2", "V3", "V4"),
class = "data.frame", row.names = c(NA, -3L))
One possibility:
library(reshape2)
df2 <- melt(mydata, id.var = "V1")
with(df2, table(V1, value))
#        value
# V1      A B C D E F
#   File1 1 1 1 0 0 0
#   File2 1 1 0 1 0 0
#   File3 0 0 0 0 1 1
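Note that with the sample data above, the empty string that pads File3's shorter row will also show up as its own column in the table. A minimal tweak (a sketch, not part of the original answer) is to drop empty values before tabulating:

library(reshape2)
df2 <- melt(mydata, id.var = "V1")
df2 <- df2[df2$value != "", ]   # drop the "" placeholders from the padded row
with(df2, table(V1, value))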
A reasonably efficient approach is to use the (presently) non-exported charMat function from my "splitstackshape" package. Since it is not exported, you will have to use ::: to access it.
library(splitstackshape)
cbind(mydata[1], splitstackshape:::charMat(
split.default(mydata[-1], sequence(ncol(mydata)-1)), fill=0))
#      V1   A B C D E F
# 1 File1 0 1 1 1 0 0 0
# 2 File2 0 1 1 0 1 0 0
# 3 File3 1 0 0 0 0 1 1
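The unnamed column right after V1 comes from the empty string that pads File3's shorter row. If you don't want it, one possible cleanup (a sketch, not part of the original answer) is to drop that level from the matrix before binding the file names back on:

mat <- splitstackshape:::charMat(
  split.default(mydata[-1], sequence(ncol(mydata) - 1)), fill = 0)
cbind(mydata[1], mat[, colnames(mat) != "", drop = FALSE])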
Under the hood, charMat makes use of matrix indexing to process everything pretty efficiently. Step by step, this is what charMat does:
# split the value columns into a list
X <- split.default(mydata[-1], sequence(ncol(mydata) - 1))
len <- length(X)                       # number of list elements
vec <- unlist(X, use.names = FALSE)    # all values as one character vector
lvl <- sort(unique(vec))               # unique values; these become the column names
# start with a matrix of zeroes, one row per list element and one column per level
out <- matrix(0L, nrow = len, ncol = length(lvl), dimnames = list(NULL, lvl))
# row and column positions at which a 1 should be written
i.idx <- rep(seq.int(len), vapply(X, length, integer(1L)))
j.idx <- match(vec, lvl)
# matrix indexing: each row of cbind(i.idx, j.idx) addresses a single cell of out
out[cbind(i.idx, j.idx)] <- 1
out
#        A B C D E F
# [1,] 0 1 1 1 0 0 0
# [2,] 0 1 1 0 1 0 0
# [3,] 1 0 0 0 0 1 1
That looks like a mouthful, but it is actually quite a fast operation, made faster by using the charMat function :-)
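If you have not used matrix indexing before, here is a tiny standalone illustration (not from the original answer) of the out[cbind(i.idx, j.idx)] <- 1 idiom: a two-column index matrix addresses individual (row, column) cells.

m <- matrix(0L, nrow = 3, ncol = 4)
idx <- cbind(c(1, 2, 3), c(2, 4, 1))   # (row, column) pairs to set to 1
m[idx] <- 1L
m
#      [,1] [,2] [,3] [,4]
# [1,]    0    1    0    0
# [2,]    0    0    0    1
# [3,]    1    0    0    0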
Update: Benchmarks
The following benchmarks compare Henrik's answer with my charMat answer, and also adapt Henrik's answer to use "data.table" instead, for better efficiency. Two tests were run: the first on a similar dataset with 90K rows, and the second on one with 900K rows.
Here's the sample data:
biggerdata <- do.call(rbind, replicate(30000, mydata, simplify = FALSE))
biggerdata$V1 <- make.unique(biggerdata$V1)
dim(biggerdata)
# [1] 90000 4
evenBigger <- do.call(rbind, replicate(10, biggerdata, simplify = FALSE))
evenBigger$V1 <- make.unique(evenBigger$V1)
dim(evenBigger)
# [1] 900000 4
Here are the functions to benchmark:
# fun1: the charMat approach from above
fun1 <- function(indf) {
  cbind(indf[1], splitstackshape:::charMat(
    split.default(indf[-1], sequence(ncol(indf) - 1)), fill = 0))
}

# fun2: Henrik's reshape2 + table approach
library(reshape2)
fun2 <- function(indf) {
  df2 <- melt(indf, id.var = "V1")
  with(df2, table(V1, value))
}

# fun3: Henrik's approach adapted to use data.table
library(data.table)
DT <- data.table(biggerdata)
DT2 <- data.table(evenBigger)
fun3 <- function(inDT) {
  DTL <- melt(inDT, id.vars = "V1")
  dcast.data.table(DTL, V1 ~ value, fun.aggregate = length)
}
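Before timing anything, it can be worth spot-checking that the approaches return comparable counts. The results differ in type (a data.frame, a table, and a data.table), so one possible check (a sketch, not part of the original benchmarks) is to compare the per-value column totals:

colSums(fun1(biggerdata)[-1])          # drop the id column first
colSums(fun2(biggerdata))
colSums(fun3(DT)[, -1, with = FALSE])  # drop the V1 column from the data.table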
And the results of the benchmarking.
library(microbenchmark)
microbenchmark(fun1(biggerdata), fun2(biggerdata), suppressMessages(fun3(DT)), times = 20)
# Unit: milliseconds
#                        expr       min        lq    median        uq       max neval
#            fun1(biggerdata)  185.3652  199.8725  289.0206  308.5826  327.4185    20
#            fun2(biggerdata) 1453.8791 1605.6053 1639.8567 1758.3984 1797.2229    20
#  suppressMessages(fun3(DT))  469.8979  570.4664  586.4715  598.6229  675.2961    20
microbenchmark(fun1(evenBigger), fun2(evenBigger), fun3(DT2), times = 5)
# Unit: seconds
#              expr       min        lq    median        uq       max neval
#  fun1(evenBigger)  1.871611  1.896351  2.071355  2.140580  2.464569     5
#  fun2(evenBigger) 26.911523 27.212910 27.363442 27.469812 27.938178     5
#         fun3(DT2)  7.103615  7.131603  7.141908  7.205006  7.218321     5
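One small side note, not part of the original answer: current versions of data.table export their own melt() and dcast() generics, so the data.table adaptation can also be written without calling dcast.data.table directly (a sketch, assuming data.table >= 1.9.6):

library(data.table)

fun3b <- function(inDT) {
  DTL <- melt(inDT, id.vars = "V1")
  dcast(DTL, V1 ~ value, fun.aggregate = length)   # dispatches to the data.table method
}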