My dataframe looks like this:
ID | value 1 | value 2 | value 3 | value 4
1 | M | D | F | A
2 | F | M | G | B
3 | M | D | F | A
4 | L | D | E | B
I want to get something like this.
value 1 | value 2 | value 3 | value 4| Number of combinations
M | D | F | A | 2
F | M | G | B | 1
L | D | E | B | 1
e.g. to count the number of unique combinations of the columns value 1 - value 4.
count
in plyr
package will do that task.
> df
ID value.1 value.2 value.3 value.4
1 1 M D F A
2 2 F M G B
3 3 M D F A
4 4 L D E B
> library(plyr)
> count(df[, -1])
value.1 value.2 value.3 value.4 freq
1 F M G B 1
2 L D E B 1
3 M D F A 2
N <- 10000
d <- data.frame(
ID=seq(1, N),
v1=sample(c("M","F", "M", "L"), N, replace = TRUE),
v2=sample(c("D","M","D","D"), N, replace = TRUE),
v3=sample(c("F","G","F","E"), N, replace = TRUE),
v4=sample(c("A","B","A","B"), N, replace = TRUE)
)
With data.table (fastest)
dt <- data.table::as.data.table(d)
dt[, .N, by = c('v1','v2','v3','v4')]
With dplyr
dplyr::count_(d, vars = c('v1','v2','v3','v4'))
With plyr
plyr::count(d, vars = c('v1','v2','v3','v4'))
plyr::ddply(d, .variables = c('v1','v2','v3','v4'), nrow)
With aggregate (slowest)
aggregate(ID ~ ., d, FUN = length)
Benchmark
microbenchmark::microbenchmark(dt[, .N, by = c('v1','v2','v3','v4')],
plyr::count(d, vars = c('v1','v2','v3','v4')),
plyr::ddply(d, .variables = c('v1','v2','v3','v4'), nrow),
dplyr::count_(d, vars = c('v1','v2','v3','v4')),
aggregate(ID ~ ., d, FUN = length),
times = 1000)
Unit: microseconds
expr min lq mean median uq max neval cld
dt[, .N, by = c("v1", "v2", "v3", "v4")] 887.807 1107.543 1263.777 1174.258 1289.724 4263.156 1000 a
plyr::count(d, vars = c("v1", "v2", "v3", "v4")) 3912.791 4270.387 5379.080 4498.053 5791.743 157146.103 1000 c
plyr::ddply(d, .variables = c("v1", "v2", "v3", "v4"), nrow) 7737.874 8553.370 10630.849 9018.266 11126.517 187301.696 1000 d
dplyr::count_(d, vars = c("v1", "v2", "v3", "v4")) 2126.913 2432.957 2763.499 2568.251 2789.386 12549.669 1000 b
aggregate(ID ~ ., d, FUN = length) 7395.440 8121.828 10546.659 8776.371 10858.263 210139.759 1000 d
It seems best to simply use data.table
instead of the data.frame
as it is fastest and doesn't need an other function or library to count. Note also that aggregate
function performs much slower on large data sets.
Final note: feel free to update with new methods.
Without plyr.
aggregate(ID ~ ., d, FUN=length)# . means all variables in d except ID
Here a solution using the plyr
package
library(plyr)
d <- data.frame(
ID=seq(1,4), v1=c("M","F", "M", "L"),
v2=c("D","M","D","D"), v3=c("F","G","F","E"), v4=c("A","B","A","B")
)
ddply(d,.(v1,v2,v3,v4), nrow)
I hope this was not homework...