Speedy/elegant way to unite many pairs of columns

Asked 2019-03-02 01:37

Is there an elegant/fast R way to combine all pairs of columns in a data.frame?

For example, using mapply() and paste() we can turn this data.frame:

mydf <- data.frame(a.1 = letters, a.2 = 26:1, b.1 = letters, b.2 = 1:26)
head(mydf)
  a.1 a.2 b.1 b.2
1   a  26   a   1
2   b  25   b   2
3   c  24   c   3
4   d  23   d   4
5   e  22   e   5
6   f  21   f   6

into this character matrix:

mydf2 <- mapply(function(x, y) {
     paste(x, y, sep = ".")},
     mydf[ ,seq(1, ncol(mydf), by = 2)],
     mydf[ ,seq(2, ncol(mydf), by = 2)])
head(mydf2)
     a.1    b.1  
[1,] "a.26" "a.1"
[2,] "b.25" "b.2"
[3,] "c.24" "c.3"
[4,] "d.23" "d.4"
[5,] "e.22" "e.5"
[6,] "f.21" "f.6"

However, this feels clumsy and is a bit slow when applied to big datasets. Any suggestions, perhaps using a Hadley package?

EDIT: The ideal solution would easily scale to large numbers of columns, such that the names of the columns would not need to be included in the function call. Thanks!

3 Answers
淡お忘
Answered 2019-03-02 02:14

It's amusing to note that the OP's solution appears to be the fastest one:

f1 <- function(mydf) {
    mapply(function(x, y) {
        paste(x, y, sep = ".")},
        mydf[ ,seq(1, ncol(mydf), by = 2)],
        mydf[ ,seq(2, ncol(mydf), by = 2)])
}

f.thelatemail <- function(mydf) {
    mapply(paste,mydf[c(TRUE,FALSE)],mydf[c(FALSE,TRUE)],sep=".")
}

require(dplyr)

f.on_the_shores_of_linux_sea <- function(mydf) {
    transmute(mydf,x1=paste0( a.1,'.', a.2),x2=paste0( b.1,'.', b.2)) 
}

f.jazurro <- function(mydf) {
    odd <- seq(1, ncol(mydf), 2);
    lapply(odd, function(x) paste(mydf[,x], mydf[,x+1], sep = ".")) %>% 
        do.call(cbind,.)
}

library(data.table) 
f.akrun <- function(mydf) {
    res <- as.data.table(matrix(, ncol=ncol(mydf)/2, nrow=nrow(mydf)))
    indx <- seq(1, ncol(mydf), 2)
    setDT(mydf)
    for(j in seq_along(indx)){
        set(res, i=NULL, j=j, value= paste(mydf[[indx[j]]], 
                                           mydf[[indx[j]+1]], sep='.'))
    }
    res
}

mydf <- data.frame(a.1 = letters, a.2 = 26:1, b.1 = letters, b.2 = 1:26)
mydf <- mydf[rep(1:nrow(mydf),5000),]


library(rbenchmark)
benchmark(f1(mydf),f.thelatemail(mydf),f.on_the_shores_of_linux_sea(mydf),f.jazurro(mydf),f.akrun(mydf))

Results:

#                                 test replications elapsed relative user.self sys.self user.child sys.child
# 5                      f.akrun(mydf)          100  14.000   75.269    13.673    0.296          0         0
# 4                    f.jazurro(mydf)          100   0.388    2.086     0.314    0.071          0         0
# 3 f.on_the_shores_of_linux_sea(mydf)          100  15.585   83.790    15.293    0.280          0         0
# 2                f.thelatemail(mydf)          100  26.416  142.022    25.736    0.639          0         0
# 1                           f1(mydf)          100   0.186    1.000     0.169    0.017          0         0

[Updated Benchmark]

I've added one solution from @thelatemail, which I missed in the original answer, and one solution from @akrun:

f.thelatemail2 <- function(mydf) {
    data.frame(Map(paste,mydf[c(TRUE,FALSE)],mydf[c(FALSE,TRUE)],sep="."))
}

f.akrun2 <- function(mydf) {    
    setDT(mydf)
    indx <- as.integer(seq(1, ncol(mydf), 2))
    mydf2 <- copy(mydf)
    for(j in indx){
        set(mydf2, i=NULL, j=j, value= paste(mydf2[[j]],
                                             mydf2[[j+1]], sep="."))
    }
    mydf2[,indx, with=FALSE]
}

Benchmark:

library(rbenchmark)

benchmark(f1(mydf),f.thelatemail(mydf), f.thelatemail2(mydf), f.on_the_shores_of_linux_sea(mydf),f.jazurro(mydf),f.akrun(mydf),f.akrun2(mydf))
#                                 test replications elapsed relative user.self sys.self user.child sys.child
# 6                      f.akrun(mydf)          100  13.247   69.356    12.897    0.340          0         0
# 7                     f.akrun2(mydf)          100  12.746   66.733    12.405    0.339          0         0
# 5                    f.jazurro(mydf)          100   0.327    1.712     0.254    0.073          0         0
# 4 f.on_the_shores_of_linux_sea(mydf)          100  16.347   85.586    15.838    0.445          0         0
# 2                f.thelatemail(mydf)          100  26.307  137.733    25.536    0.708          0         0
# 3               f.thelatemail2(mydf)          100  15.938   83.445    15.136    0.750          0         0
# 1                           f1(mydf)          100   0.191    1.000     0.156    0.036          0         0
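
As a quick consistency check (my addition, not part of the original post): f.akrun() converts mydf to a data.table in place via setDT(), so rebuild the data first, then confirm that the two fastest candidates return the same values:

# rebuild, since setDT() inside f.akrun() mutated mydf by reference
mydf <- data.frame(a.1 = letters, a.2 = 26:1, b.1 = letters, b.2 = 1:26)
mydf <- mydf[rep(1:nrow(mydf), 5000), ]
identical(unname(f1(mydf)), f.jazurro(mydf))  # should be TRUE (same values once dimnames are dropped)
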
混吃等死
Answered 2019-03-02 02:22

An option using set() from data.table. It should be fast for large datasets, since it modifies by reference and avoids the overhead of [.data.table. This assumes the columns are ordered so that the two members of each pair are adjacent.

library(data.table)
res <- as.data.table(matrix(, ncol=ncol(mydf)/2, nrow=nrow(mydf)))
indx <- seq(1, ncol(mydf), 2)
setDT(mydf)
for(j in seq_along(indx)){
   set(res, i=NULL, j=j, value= paste(mydf[[indx[j]]], 
                           mydf[[indx[j]+1]], sep='.'))
 }
head(res)
#    V1  V2
#1: a.26 a.1
#2: b.25 b.2
#3: c.24 c.3
#4: d.23 d.4
#5: e.22 e.5
#6: f.21 f.6
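
If meaningful column names are wanted (my addition, not in the original answer), they can be copied over from the odd-numbered columns:

setnames(res, names(mydf)[indx])  # e.g. a.1, b.1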

Instead of creating a new result dataset, we can also update the original dataset (or a copy of it) in place. There will be some warnings about type conversion (a way to silence them is sketched after the code), but I guess this would be a bit faster (not benchmarked):

setDT(mydf)
mydf2 <- copy(mydf)
for(j in indx){
  set(mydf2, i = NULL, j = j, value = paste(mydf2[[j]],
                                            mydf2[[j + 1]], sep = "."))
}
mydf2[, indx, with = FALSE]
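
The warnings come from overwriting integer columns with character values; if they are distracting, a minimal variant (my addition) wraps the set() call:

for(j in indx){
  # replacing an integer column with a character vector triggers a coercion warning
  suppressWarnings(
    set(mydf2, i = NULL, j = j,
        value = paste(mydf2[[j]], mydf2[[j + 1]], sep = "."))
  )
}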

Benchmarks

I tried the benchmarks on slightly bigger data with many columns.

data

set.seed(24)
d1 <- as.data.frame(matrix(sample(letters, 500 * 10000, replace = TRUE),
                           ncol = 500), stringsAsFactors = FALSE)
set.seed(4242)
d2 <- as.data.frame(matrix(sample(1:200, 500 * 10000, replace = TRUE),
                           ncol = 500))
d3 <- cbind(d1, d2)
mydf  <- d3[, order(c(1:ncol(d1), 1:ncol(d2)))]  # interleave the a/b pairs
mydf1 <- copy(mydf)

I compared f1 and f.jazurro (the fastest so far, from @Marat Talipov's post) with f.akrun2:

library(microbenchmark)
microbenchmark(f1(mydf), f.jazurro(mydf), f.akrun2(mydf1),
               unit = 'relative', times = 20L)
# Unit: relative
#            expr      min        lq     mean   median       uq      max neval cld
#        f1(mydf) 3.420448 2.3217708 2.714495 2.653178 2.819952 2.736376    20   b
# f.jazurro(mydf) 1.000000 1.0000000 1.000000 1.000000 1.000000 1.000000    20  a
# f.akrun2(mydf1) 1.204488 0.8015648 1.031248 1.042262 1.097136 1.066671    20  a

Here, f.jazurro is slightly better than f.akrun2. Increasing the number of columns, rows, etc. would make for an interesting comparison; a sketch of such a scaled-up run follows.
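
A minimal sketch of such a scaled-up run (my addition; it reuses the function definitions and packages loaded above, and bigdf/bigdf1 are my own names for a wider version of the data built the same way as before):

set.seed(24)
d1 <- as.data.frame(matrix(sample(letters, 2000 * 10000, replace = TRUE),
                           ncol = 2000), stringsAsFactors = FALSE)
set.seed(4242)
d2 <- as.data.frame(matrix(sample(1:200, 2000 * 10000, replace = TRUE),
                           ncol = 2000))
bigdf  <- cbind(d1, d2)[, order(c(1:ncol(d1), 1:ncol(d2)))]  # interleave the pairs
bigdf1 <- copy(bigdf)
microbenchmark(f1(bigdf), f.jazurro(bigdf), f.akrun2(bigdf1),
               unit = 'relative', times = 5L)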

【Aperson】
Answered 2019-03-02 02:26

I'm not sure this is the best approach; see if the code below gives any speed improvement:

require(dplyr)
transmute(mydf, x1 = paste0(a.1, ".", a.2), x2 = paste0(b.1, ".", b.2))

Answer updated based on comment :-)
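
To address the question's edit about scaling to many columns, here is a hedged sketch (my addition, not part of the original answer) that builds the transmute() expressions programmatically; it assumes a dplyr version that supports tidy-eval splicing with !!!:

library(dplyr)
odd  <- names(mydf)[c(TRUE, FALSE)]   # a.1, b.1, ...
even <- names(mydf)[c(FALSE, TRUE)]   # a.2, b.2, ...
# build one paste(<odd>, <even>, sep = ".") call per pair, named x1, x2, ...
calls <- setNames(
  Map(function(o, e) call("paste", as.name(o), as.name(e), sep = "."),
      odd, even),
  paste0("x", seq_along(odd)))
transmute(mydf, !!!calls)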
