I have a table with an unequal number of elements per row, stored in string format
File1 A B C
File2 A B D
File3 E F
I want to convert it into the following format
      A B C D E F
File1 1 1 1 0 0 0
File2 1 1 0 1 0 0
File3 0 0 0 0 1 1
I tried to do it using reshape2 but was not successful.
Sample data:
mydata <- structure(list(V1 = c("File1", "File2", "File3"),
V2 = c("A", "A", "E"), V3 = c("B", "B", "F"),
V4 = c("C", "D", "")),
.Names = c("V1", "V2", "V3", "V4"),
class = "data.frame", row.names = c(NA, -3L))
One possibility:
library(reshape2)
df2 <- melt(mydata, id.var = "V1")
with(df2, table(V1, value))
#        value
# V1      A B C D E F
#   File1 1 1 1 0 0 0
#   File2 1 1 0 1 0 0
#   File3 0 0 0 0 1 1
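Note that with the sample data above, the empty string that pads File3's shorter row will also show up as its own column in the table. A minimal tweak (a sketch, not part of the original answer) is to drop empty values before tabulating:

library(reshape2)
df2 <- melt(mydata, id.var = "V1")
df2 <- df2[df2$value != "", ]   # drop the "" placeholders from the padded row
with(df2, table(V1, value))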
A reasonably efficient approach is to use the (presently) non-exported charMat function from my "splitstackshape" package. Since it is not exported, you will have to use ::: to access it.
library(splitstackshape)
cbind(mydata[1], splitstackshape:::charMat(
split.default(mydata[-1], sequence(ncol(mydata)-1)), fill=0))
#      V1   A B C D E F
# 1 File1 0 1 1 1 0 0 0
# 2 File2 0 1 1 0 1 0 0
# 3 File3 1 0 0 0 0 1 1
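The unnamed column right after V1 comes from the empty string that pads File3's shorter row. If you don't want it, one possible cleanup (a sketch, not part of the original answer) is to drop that level from the matrix before binding the file names back on:

mat <- splitstackshape:::charMat(
  split.default(mydata[-1], sequence(ncol(mydata) - 1)), fill = 0)
cbind(mydata[1], mat[, colnames(mat) != "", drop = FALSE])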
Under the hood, charMat makes use of matrix indexing to process everything pretty efficiently. Step by step, this is what charMat does:
# split the value columns into a list
X <- split.default(mydata[-1], sequence(ncol(mydata) - 1))
len <- length(X)                       # number of list elements
vec <- unlist(X, use.names = FALSE)    # all values as one character vector
lvl <- sort(unique(vec))               # unique values; these become the column names
# start with a matrix of zeroes, one row per list element and one column per level
out <- matrix(0L, nrow = len, ncol = length(lvl), dimnames = list(NULL, lvl))
# row and column positions at which a 1 should be written
i.idx <- rep(seq.int(len), vapply(X, length, integer(1L)))
j.idx <- match(vec, lvl)
# matrix indexing: each row of cbind(i.idx, j.idx) addresses a single cell of out
out[cbind(i.idx, j.idx)] <- 1
out
#        A B C D E F
# [1,] 0 1 1 1 0 0 0
# [2,] 0 1 1 0 1 0 0
# [3,] 1 0 0 0 0 1 1
That looks like a mouthful, but it is actually quite a fast operation, made faster by using the charMat function :-)
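If you have not used matrix indexing before, here is a tiny standalone illustration (not from the original answer) of the out[cbind(i.idx, j.idx)] <- 1 idiom: a two-column index matrix addresses individual (row, column) cells.

m <- matrix(0L, nrow = 3, ncol = 4)
idx <- cbind(c(1, 2, 3), c(2, 4, 1))   # (row, column) pairs to set to 1
m[idx] <- 1L
m
#      [,1] [,2] [,3] [,4]
# [1,]    0    1    0    0
# [2,]    0    0    0    1
# [3,]    1    0    0    0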
Update: Benchmarks
The following benchmarks compare Henrik's answer with my charMat answer, and also adapt Henrik's answer to use "data.table" instead, for better efficiency. Two tests were run: the first on a similar dataset with 90K rows, and the second on one with 900K rows.
Here's the sample data:
biggerdata <- do.call(rbind, replicate(30000, mydata, simplify = FALSE))
biggerdata$V1 <- make.unique(biggerdata$V1)
dim(biggerdata)
# [1] 90000 4
evenBigger <- do.call(rbind, replicate(10, biggerdata, simplify = FALSE))
evenBigger$V1 <- make.unique(evenBigger$V1)
dim(evenBigger)
# [1] 900000 4
Here are the functions to benchmark:
# fun1: the charMat approach from above
fun1 <- function(indf) {
  cbind(indf[1], splitstackshape:::charMat(
    split.default(indf[-1], sequence(ncol(indf) - 1)), fill = 0))
}

# fun2: Henrik's reshape2 + table approach
library(reshape2)
fun2 <- function(indf) {
  df2 <- melt(indf, id.var = "V1")
  with(df2, table(V1, value))
}

# fun3: Henrik's approach adapted to use data.table
library(data.table)
DT <- data.table(biggerdata)
DT2 <- data.table(evenBigger)
fun3 <- function(inDT) {
  DTL <- melt(inDT, id.vars = "V1")
  dcast.data.table(DTL, V1 ~ value, fun.aggregate = length)
}
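Before timing anything, it can be worth spot-checking that the approaches return comparable counts. The results differ in type (a data.frame, a table, and a data.table), so one possible check (a sketch, not part of the original benchmarks) is to compare the per-value column totals:

colSums(fun1(biggerdata)[-1])          # drop the id column first
colSums(fun2(biggerdata))
colSums(fun3(DT)[, -1, with = FALSE])  # drop the V1 column from the data.table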
And the results of the benchmarking.
library(microbenchmark)
microbenchmark(fun1(biggerdata), fun2(biggerdata), suppressMessages(fun3(DT)), times = 20)
# Unit: milliseconds
#                        expr       min        lq    median        uq       max neval
#            fun1(biggerdata)  185.3652  199.8725  289.0206  308.5826  327.4185    20
#            fun2(biggerdata) 1453.8791 1605.6053 1639.8567 1758.3984 1797.2229    20
#  suppressMessages(fun3(DT))  469.8979  570.4664  586.4715  598.6229  675.2961    20
microbenchmark(fun1(evenBigger), fun2(evenBigger), fun3(DT2), times = 5)
# Unit: seconds
#              expr       min        lq    median        uq       max neval
#  fun1(evenBigger)  1.871611  1.896351  2.071355  2.140580  2.464569     5
#  fun2(evenBigger) 26.911523 27.212910 27.363442 27.469812 27.938178     5
#         fun3(DT2)  7.103615  7.131603  7.141908  7.205006  7.218321     5
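One small side note, not part of the original answer: current versions of data.table export their own melt() and dcast() generics, so the data.table adaptation can also be written without calling dcast.data.table directly (a sketch, assuming data.table >= 1.9.6):

library(data.table)

fun3b <- function(inDT) {
  DTL <- melt(inDT, id.vars = "V1")
  dcast(DTL, V1 ~ value, fun.aggregate = length)   # dispatches to the data.table method
}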