Complicated reshaping

2019-02-01 22:03发布

站内文章 / 后端开发

16 0

三岁会撩人

女 | 书童

私信

可以将文章内容翻译成中文,广告屏蔽插件可能会导致该功能失效(如失效，请关闭广告屏蔽插件后再试):

问题:

I want to reshape my dataframe from long to wide format and I loose some data that I'd like to keep. For the following example:

df <- data.frame(Par1 = unlist(strsplit("AABBCCC","")),
                 Par2 = unlist(strsplit("DDEEFFF","")),
                 ParD = unlist(strsplit("foo,bar,baz,qux,bla,xyz,meh",",")),
                 Type = unlist(strsplit("pre,post,pre,post,pre,post,post",",")),
                 Val = c(10,20,30,40,50,60,70))

   #     Par1 Par2 ParD Type Val
   #   1    A    D  foo  pre  10
   #   2    A    D  bar post  20
   #   3    B    E  baz  pre  30
   #   4    B    E  qux post  40
   #   5    C    F  bla  pre  50
   #   6    C    F  xyz post  60
   #   7    C    F  meh post  70

dfw <- dcast(df,
             formula = Par1 + Par2 ~ Type,
             value.var = "Val",
             fun.aggregate = mean)

 #     Par1 Par2 post pre
 #   1    A    D   20  10
 #   2    B    E   40  30
 #   3    C    F   65  50

this is almost what I need but I would like to have

some field keeping data from ParD field (for example, as single merged string),
number of observations used for aggregation.

i.e. I would like the resulting data.frame to be as follows:

    #     Par1 Par2 post pre Num.pre Num.post ParD
    #   1    A    D   20  10      1      1    foo_bar 
    #   2    B    E   40  30      1      1    baz_qux
    #   3    C    F   65  50      1      2    bla_xyz_meh

I would be grateful for any ideas. For example, I tried to solve the second task by writing in dcast: fun.aggregate=function(x) c(Val=mean(x),Num=length(x)) - but this causes an error.

回答1:

Solution in 2 steps using ddply ( i am not happy with , but I get the result)

dat <- ddply(df,.(Par1,Par2),function(x){
  data.frame(ParD=paste(paste(x$ParD),collapse='_'),
             Num.pre =length(x$Type[x$Type =='pre']),
             Num.post = length(x$Type[x$Type =='post']))
})

merge(dfw,dat)
 Par1 Par2 post pre        ParD Num.pre Num.post
1    A    D  2.0   1     foo_bar       1        1
2    B    E  4.0   3     baz_qux       1        1
3    C    F  6.5   5 bla_xyz_meh       1        2

回答2:

Late to the party, but here's another alternative using data.table:

require(data.table)
dt <- data.table(df, key=c("Par1", "Par2"))
dt[, list(pre=mean(Val[Type == "pre"]), 
          post=mean(Val[Type == "post"]), 
          pre.num=length(Val[Type == "pre"]), 
          post.num=length(Val[Type == "post"]), 
          ParD = paste(ParD, collapse="_")), 
by=list(Par1, Par2)]

#    Par1 Par2 pre post pre.num post.num        ParD
# 1:    A    D  10   20       1        1     foo_bar
# 2:    B    E  30   40       1        1     baz_qux
# 3:    C    F  50   65       1        2 bla_xyz_meh

[from Matthew] +1 Some minor improvements to save repeating the same ==, and to demonstrate local variables inside j.

dt[, list(pre=mean(Val[.pre <- Type=="pre"]),     # save .pre
          post=mean(Val[.post <- Type=="post"]),  # save .post
          pre.num=sum(.pre),                      # reuse .pre
          post.num=sum(.post),                    # reuse .post
          ParD = paste(ParD, collapse="_")), 
by=list(Par1, Par2)]

#    Par1 Par2 pre post pre.num post.num        ParD
# 1:    A    D  10   20       1        1     foo_bar
# 2:    B    E  30   40       1        1     baz_qux
# 3:    C    F  50   65       1        2 bla_xyz_meh

dt[, { .pre <- Type=="pre"                  # or save .pre and .post up front 
       .post <- Type=="post"
       list(pre=mean(Val[.pre]), 
            post=mean(Val[.post]),
            pre.num=sum(.pre),
            post.num=sum(.post), 
            ParD = paste(ParD, collapse="_")) }
, by=list(Par1, Par2)]

#    Par1 Par2 pre post pre.num post.num        ParD
# 1:    A    D  10   20       1        1     foo_bar
# 2:    B    E  30   40       1        1     baz_qux
# 3:    C    F  50   65       1        2 bla_xyz_meh

And if a list column is ok rather than a paste, then this should be faster :

dt[, { .pre <- Type=="pre"
       .post <- Type=="post"
       list(pre=mean(Val[.pre]), 
            post=mean(Val[.post]),
            pre.num=sum(.pre),
            post.num=sum(.post), 
            ParD = list(ParD)) }     # list() faster than paste()
, by=list(Par1, Par2)]

#    Par1 Par2 pre post pre.num post.num        ParD
# 1:    A    D  10   20       1        1     foo,bar
# 2:    B    E  30   40       1        1     baz,qux
# 3:    C    F  50   65       1        2 bla,xyz,meh

回答3:

You could do a merge of two dcasts and an aggregate, here all wrapped into one large expression mostly to avoid having intermediate objects hanging around afterwards:

Reduce(merge, list(
    dcast(df, formula = Par1+Par2~Type, value.var="Val",
        fun.aggregate=mean),
    setNames(dcast(df, formula = Par1+Par2~Type, value.var="Val",
        fun.aggregate=length), c("Par1", "Par2", "Num.post",
        "Num.pre")),
    aggregate(df["ParD"], df[c("Par1", "Par2")], paste, collapse="_")
    ))

回答4:

I'll post but agstudy's puts me to shame:

step1 <- with(df, split(df, list(Par1, Par2)))
step2 <- step1[sapply(step1, nrow) > 0]
step3 <- lapply(step2, function(x) {
    piece1 <- tapply(x$Val, x$Type, mean)
    piece2 <- tapply(x$Type, x$Type, length)
    names(piece2) <- paste0("Num.", names(piece2))
    out <- x[1, 1:2]
    out[, 3:6] <- c(piece1, piece2)
    names(out)[3:6] <-  names(c(piece1, piece2))
    out$ParD <- paste(unique(x$ParD), collapse="_")
    out
})
data.frame(do.call(rbind, step3), row.names=NULL)

Yielding:

  Par1 Par2 post pre Num.post Num.pre        ParD
1    A    D  2.0   1        1       1     foo_bar
2    B    E  4.0   3        1       1     baz_qux
3    C    F  6.5   5        2       1 bla_xyz_meh

回答5:

What a great opprotunity to benchmark! Below are some runs of the plyr method (as suggested by @agstudy) compared with the data.table method (as suggested by @Arun) using different sample sizes (N = 900, 2700, 10800)

Summary:
The data.table method outperforms the plyr method by a factor of 7.5

#-------------------#
#   M E T H O D S   #
#-------------------#

  # additional methods below, in the updates

  # Method 1  -- suggested by @agstudy
  plyrMethod <- quote({
                  dfw<-dcast(df,
                         formula = Par1+Par2~Type,
                         value.var="Val",
                         fun.aggregate=mean)
                  dat <- ddply(df,.(Par1,Par2),function(x){
                    data.frame(ParD=paste(paste(x$ParD),collapse='_'),
                               Num.pre =length(x$Type[x$Type =='pre']),
                               Num.post = length(x$Type[x$Type =='post']))
                  })
                  merge(dfw,dat)
                })

  # Method 2 -- suggested by @Arun
  dtMethod <- quote(
                dt[, list(pre=mean(Val[Type == "pre"]), 
                          post=mean(Val[Type == "post"]), 
                          Num.pre=length(Val[Type == "pre"]), 
                          Num.post=length(Val[Type == "post"]), 
                          ParD = paste(ParD, collapse="_")), 
                by=list(Par1, Par2)]
              ) 

 # Method 3 -- suggested by @regetz
 reduceMethod <- quote(
                  Reduce(merge, list(
                      dcast(df, formula = Par1+Par2~Type, value.var="Val",
                          fun.aggregate=mean),
                      setNames(dcast(df, formula = Par1+Par2~Type, value.var="Val",
                          fun.aggregate=length), c("Par1", "Par2", "Num.post",
                          "Num.pre")),
                      aggregate(df["ParD"], df[c("Par1", "Par2")], paste, collapse="_")
                      ))
                  )

 # Method 4 -- suggested by @Ramnath
 castddplyMethod <- quote(
                      reshape::cast(Par1 + Par2 + ParD ~ Type, 
                           data = ddply(df, .(Par1, Par2), transform, 
                           ParD = paste(ParD, collapse = "_")), 
                           fun  = c(mean, length)
                          )
                      )



# SAMPLE DATA #
#-------------#

library(data.table)
library(plyr)
library(reshape2)
library(rbenchmark)


  # for Par1, ParD
  LLL <- apply(expand.grid(LETTERS, LETTERS, LETTERS, stringsAsFactors=FALSE), 1, paste0, collapse="")
  lll <- apply(expand.grid(letters, letters, letters, stringsAsFactors=FALSE), 1, paste0, collapse="")

  # max size is 17568 with current sample data setup, ie: floor(length(LLL) / 18) * 18
  size <- 17568
  size <- 10800
  size <- 900  

  set.seed(1)
  df<-data.frame(Par1=rep(LLL[1:(size/2)], times=rep(c(2,2,3), size)[1:(size/2)])[1:(size)]
                 , Par2=rep(lll[1:(size/2)], times=rep(c(2,2,3), size)[1:(size/2)])[1:(size)]
                 , ParD=sample(unlist(lapply(c("f", "b"), paste0, lll)), size, FALSE)
                 , Type=rep(c("pre","post"), size/2)
                 , Val =sample(seq(10,100,10), size, TRUE)
                 )

  dt <- data.table(df, key=c("Par1", "Par2"))


# Confirming Same Results # 
#-------------------------#
  # Evaluate
  DF1 <- eval(plyrMethod)
  DF2 <- eval(dtMethod)

  # Convert to DF and sort columns and sort ParD levels, for use in identical
  colOrder <- sort(names(DF1))
  DF1 <- DF1[, colOrder]
  DF2 <- as.data.frame(DF2)[, colOrder]
  DF2$ParD <- factor(DF2$ParD, levels=levels(DF1$ParD))
  identical((DF1), (DF2))
  # [1] TRUE
#-------------------------#

RESULTS

#--------------------#
#     BENCHMARK      #
#--------------------#
benchmark(plyr=eval(plyrMethod), dt=eval(dtMethod), reduce=eval(reduceMethod), castddply=eval(castddplyMethod),
          replications=5, columns=c("relative", "test", "elapsed", "user.self", "sys.self", "replications"), 
          order="relative")


# SAMPLE SIZE = 900
  relative      test elapsed user.self sys.self replications
     1.000    reduce   0.392     0.375    0.018            5
     1.003        dt   0.393     0.377    0.016            5
     7.064      plyr   2.769     2.721    0.047            5
     8.003 castddply   3.137     3.030    0.106            5

# SAMPLE SIZE = 2,700
  relative   test elapsed user.self sys.self replications
     1.000     dt   1.371     1.327    0.090            5
     2.205 reduce   3.023     2.927    0.102            5
     7.291   plyr   9.996     9.644    0.377            5

# SAMPLE SIZE = 10,800
  relative      test elapsed user.self sys.self replications
     1.000        dt   8.678     7.168    1.507            5
     2.769    reduce  24.029    23.231    0.786            5
     6.946      plyr  60.277    52.298    7.947            5
    13.796 castddply 119.719   113.333   10.816            5

# SAMPLE SIZE = 17,568
  relative   test elapsed user.self sys.self replications
     1.000     dt  27.421    13.042   14.470            5
     4.030 reduce 110.498    75.853   34.922            5
     5.414   plyr 148.452   105.776   43.156            5

Update : Added results for baseMethod1

# Used only sample size of 90, as it was taking long
relative  test elapsed user.self sys.self replications
   1.000    dt   0.044     0.043    0.001            5
   7.773  plyr   0.342     0.339    0.003            5
  65.614 base1   2.887     2.866    0.028            5

Where
   baseMethod1 <- quote({
                  step1 <- with(df, split(df, list(Par1, Par2)))
                  step2 <- step1[sapply(step1, nrow) > 0]
                  step3 <- lapply(step2, function(x) {
                      piece1 <- tapply(x$Val, x$Type, mean)
                      piece2 <- tapply(x$Type, x$Type, length)
                      names(piece2) <- paste0("Num.", names(piece2))
                      out <- x[1, 1:2]
                      out[, 3:6] <- c(piece1, piece2)
                      names(out)[3:6] <-  names(c(piece1, piece2))
                      out$ParD <- paste(unique(x$ParD), collapse="_")
                      out
                  })
                  data.frame(do.call(rbind, step3), row.names=NULL)
                })

Update 2: Added keying the DT as part of the metric

Adding the indexing step to the benchmark for fairness as per @MatthewDowle s comment.
However, presumably, if data.table is used, it will be in place of the data.frame and hence the indexing will occur once and not simply for this procedure

   dtMethod.withkey <- quote({
                       dt <- data.table(df, key=c("Par1", "Par2"))       
                       dt[, list(pre=mean(Val[Type == "pre"]), 
                                 post=mean(Val[Type == "post"]), 
                                 Num.pre=length(Val[Type == "pre"]), 
                                 Num.post=length(Val[Type == "post"]), 
                                 ParD = paste(ParD, collapse="_")), 
                       by=list(Par1, Par2)]
                     }) 

# SAMPLE SIZE = 10,800
  relative       test elapsed user.self sys.self replications
     1.000         dt   9.155     7.055    2.137            5
     1.043 dt.withkey   9.553     7.245    2.353            5
     3.567     reduce  32.659    31.196    1.586            5
     6.703       plyr  61.364    54.080    7.600            5

Update 3: Benchmarking @MD's edits to @Arun's original answer

dtMethod.MD1 <- quote(
                  dt[, list(pre=mean(Val[.pre <- Type=="pre"]),     # save .pre
                            post=mean(Val[.post <- Type=="post"]),  # save .post
                            pre.num=sum(.pre),                      # reuse .pre
                            post.num=sum(.post),                    # reuse .post
                            ParD = paste(ParD, collapse="_")), 
                     by=list(Par1, Par2)]
                  )

dtMethod.MD2 <- quote(
                  dt[, { .pre <- Type=="pre"                  # or save .pre and .post up front 
                         .post <- Type=="post"
                         list(pre=mean(Val[.pre]), 
                              post=mean(Val[.post]),
                              pre.num=sum(.pre),
                              post.num=sum(.post), 
                              ParD = paste(ParD, collapse="_")) }
                  , by=list(Par1, Par2)]
                  )

dtMethod.MD3 <- quote(
                dt[, { .pre <- Type=="pre"
                       .post <- Type=="post"
                       list(pre=mean(Val[.pre]), 
                            post=mean(Val[.post]),
                            pre.num=sum(.pre),
                            post.num=sum(.post), 
                            ParD = list(ParD)) }     # list() faster than paste()
                , by=list(Par1, Par2)]
                )

benchmark(dt.M1=eval(dtMethod.MD1), dt.M2=eval(dtMethod.MD2), dt.M3=eval(dtMethod.MD3), dt=eval(dtMethod),
      replications=5, columns=c("relative", "test", "elapsed", "user.self", "sys.self", "replications"), 
      order="relative")

#--------------------#

Comparing the different data.table methods amongst themselves


# SAMPLE SIZE = 900
  relative  test elapsed user.self sys.self replications
     1.000 dt.M3   0.198     0.197    0.001            5  <~~~ "list()" Method
     1.242 dt.M1   0.246     0.243    0.004            5
     1.253 dt.M2   0.248     0.242    0.007            5
     1.884    dt   0.373     0.367    0.007            5

# SAMPLE SIZE = 17,568
  relative  test elapsed user.self sys.self replications
     1.000 dt.M3  33.492    24.487    9.122            5   <~~~ "list()" Method
     1.086 dt.M1  36.388    11.442   25.086            5
     1.086 dt.M2  36.388    10.845   25.660            5
     1.126    dt  37.701    13.256   24.535            5

Comparing MD3 ("list" method) with MD1 (best of DT non-list methods)
Using a clean session  (ie, removing string cache)
_Note: Ran the following twice, fresh session each time, with practically identical results
       Then re-ran in the *same* session, with reps=5. Results very different._


benchmark(dt.M1=eval(dtMethod.MD1), dt.M3=eval(dtMethod.MD3), replications=1, columns=c("relative", "test", "elapsed", "user.self", "sys.self", "replications"), order="relative")
# SAMPLE SIZE=17,568;  CLEAN SESSION
  relative  test elapsed user.self sys.self replications
     1.000 dt.M1   8.885     4.260    4.617            1
     1.633 dt.M3  14.506    12.821    1.677            1

# SAMPLE SIZE=17,568;  *SAME* SESSION
  relative  test elapsed user.self sys.self replications
     1.000 dt.M1  33.443    10.200   23.226            5
     1.048 dt.M3  35.060    26.127    8.915            5

#--------------------#

New benchmarks against previous methods
_Note: Not using the "list method" as results are not the same as other methods_

# SAMPLE SIZE = 900
  relative   test elapsed user.self sys.self replications
     1.000  dt.M1   0.254     0.247    0.008            5
     1.705 reduce   0.433     0.425    0.010            5
    11.280   plyr   2.865     2.842    0.031            5

# SAMPLE SIZE = 17,568
  relative   test elapsed user.self sys.self replications
     1.000  dt.M1  24.826    10.427   14.458            5
     4.348 reduce 107.935    70.107   38.314            5
     5.942   plyr 147.508   106.958   41.083            5

回答6:

One Step solution combining reshape::cast with plyr::ddply

cast(Par1 + Par2 + ParD ~ Type, data = ddply(df, .(Par1, Par2), transform, 
  ParD = paste(ParD, collapse = "_")), fun  = c(mean, length))

NOTE that the dcast function in reshape2 does not allow multiple aggregate functions to be passed, while the cast function in reshape does.

回答7:

I believe this base R solution is comparable with @Arun's data table solution. (Which isn't to say I would prefer it; that code is much simpler!)

baseMethod2 <- quote({
    is <- unname(split(1:nrow(df), with(df, paste(Par1, Par2, sep="\b"))))
    i1 <- sapply(is, `[`, 1)
    out <- with(df, data.frame(Par1=Par1[i1], Par2=Par2[i1]))
    js <- lapply(is, function(i) split(i, df$Type[i]))
    out$post <- sapply(js, function(j) mean(df$Val[j$post]))
    out$pre <- sapply(js, function(j) mean(df$Val[j$pre]))
    out$Num.pre <- sapply(js, function(j) length(j$pre))
    out$Num.post <- sapply(js, function(j) length(j$post))
    out$ParD <- sapply(is, function(x) paste(df$ParD[x], collapse="_"))
    out
})

Using @RicardoSaporta's timing code with 900, 2700, and 10,800, respectively:

> relative        test elapsed user.self sys.self replications
3    1.000 baseMethod2   0.230     0.229        0            5
1    1.130          dt   0.260     0.257        0            5
2    8.752        plyr   2.013     2.006        0            5

> relative        test elapsed user.self sys.self replications
3    1.000 baseMethod2   0.877     0.872        0            5
1    1.068          dt   0.937     0.934        0            5
2    8.060        plyr   7.069     7.043        0            5

> relative        test elapsed user.self sys.self replications
1    1.000          dt   6.232     6.178    0.031            5
3    1.085 baseMethod2   6.763     6.683    0.054            5
2    7.263        plyr  45.261    44.983    0.104            5

回答8:

Trying to wrap different aggregation expressions into a self-contained function (expressions should yield atomic values)...

multi.by <- function(X, INDEX,...) {
    expressions <- substitute(...())
    duplicates <- duplicated(INDEX)
    res <- do.call(rbind,sapply(split(X,cumsum(!duplicates),drop=T), function(part) 
        sapply(expressions,eval,part,simplify=F),simplify=F))
    if (is.data.frame(INDEX)) res <- cbind(INDEX[!duplicates,],res)
    else rownames(res) <- INDEX[!duplicates]
    res
}

multi.by(df,df[,1:2],
    pre=mean(Val[Type=="pre"]), 
    post=mean(Val[Type=="post"]),
    Num.pre=sum(Type=="pre"),
    Num.post=sum(Type=="post"),
    ParD=paste(ParD, collapse="_"))