Make new feature using 2 tables

table1 <- data.frame(user_id=c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2),
                 product_id = c(14, 24, 38, 40, 66, 2, 19, 30, 71, 98, 7, 16),
                 first_order = c(1, 2, 1, 4, 5, 3, 2, 4, 2, 4, 2, 3),
                 last_order = c(4, 7, 5, 8, 8, 3, 4, 7, 5, 9, 4, 5))
table2 <- data.frame(user_id=c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2),
                 order_number=c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 1, 2, 3, 4, 5, 6),
                 days_cumsum = c(0, 7, 15, 26, 34, 43, 53, 59, 66, 74, 82, 91, 5, 11, 17, 24, 29, 35))

I want to add new feature to table1 using table2. New feature is order interval of each product by each user.

For example, let's see the table1. First row has (user_id==1), (product_id==14), (first order==1) and (last order ==4). It means product 14 was ordered at order 1, 2, 3, 4. And we can find this order number in table 2. New feature is about order interval between first order and last order. We can derive it in table 2, using "days_cumsum" which is cumulative sum of days after prior order. First row's new feature value is 26(=26-0).

I think it may be done with join, but I can't use join because each table is very big actually.

So I'm using this function below with for loop :

f <- function(i){
  a <- table2 %>% 
    filter(user_id==table1[i, 1]) %>% 
    filter(order_number==table1[i, 3] | order_number==table1[i, 4])

  ifelse(nrow(a)==2, a[2, 3] - a[1, 3], 999999) # first_order==last_order
}

It calculates each new feature value row by row, but it is very slow and needs many calculation. I often encounter this problem(make new feature using two tables), but every time I have difficulties.

Is there better code? I'm waiting for your help.

标签： r dplyr sqldf

2条回答

我欲成王，谁敢阻挡

2楼-- · 2019-05-26 07:19

Can you share the runtime/computation time comparisons for join and the version using your loop?

Below is a solution using joins.

library(tidyverse)

df1 <- as.data.frame(table1)
df2 <- as.data.frame(table2)


df1 %>%
  left_join(df2, by = c("user_id"="user_id", "first_order" = "order_number")) %>%
  rename(dayMin = days_cumsum) %>%
  left_join(df2, by = c("user_id"="user_id", "last_order" = "order_number")) %>%
  rename(dayMax = days_cumsum) %>%
  mutate(newVar = dayMax-dayMin) %>%
  select(user_id, product_id, first_order, last_order, newVar)

which gives:

   user_id product_id first_order last_order newVar
     <dbl>      <dbl>       <dbl>      <dbl>  <dbl>
 1       1         14           1          4     26
 2       1         24           2          7     46
 3       1         38           1          5     34
 4       1         40           4          8     33
 5       1         66           5          8     25
 6       1          2           3          3      0
 7       1         19           2          4     19
 8       1         30           4          7     27
 9       1         71           2          5     27
10       1         98           4          9     40
11       2          7           2          4     13
12       2         16           3          5     12

0人赞添加讨论(0) 举报

唯我独甜

3楼-- · 2019-05-26 07:29

For comparaison, some solutions using data.table.

table1 <- data.frame(user_id=c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2),
                     product_id = c(14, 24, 38, 40, 66, 2, 19, 30, 71, 98, 7, 16),
                     first_order = c(1, 2, 1, 4, 5, 3, 2, 4, 2, 4, 2, 3),
                     last_order = c(4, 7, 5, 8, 8, 3, 4, 7, 5, 9, 4, 5))
table2 <- data.frame(user_id=c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2),
                     order_number=c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 1, 2, 3, 4, 5, 6),
                     days_cumsum = c(0, 7, 15, 26, 34, 43, 53, 59, 66, 74, 82, 91, 5, 11, 17, 24, 29, 35))

library(data.table)

setDT(table1)
setDT(table2)

table1
#>     user_id product_id first_order last_order
#>  1:       1         14           1          4
#>  2:       1         24           2          7
#>  3:       1         38           1          5
#>  4:       1         40           4          8
#>  5:       1         66           5          8
#>  6:       1          2           3          3
#>  7:       1         19           2          4
#>  8:       1         30           4          7
#>  9:       1         71           2          5
#> 10:       1         98           4          9
#> 11:       2          7           2          4
#> 12:       2         16           3          5
table2
#>     user_id order_number days_cumsum
#>  1:       1            1           0
#>  2:       1            2           7
#>  3:       1            3          15
#>  4:       1            4          26
#>  5:       1            5          34
#>  6:       1            6          43
#>  7:       1            7          53
#>  8:       1            8          59
#>  9:       1            9          66
#> 10:       1           10          74
#> 11:       1           11          82
#> 12:       1           12          91
#> 13:       2            1           5
#> 14:       2            2          11
#> 15:       2            3          17
#> 16:       2            4          24
#> 17:       2            5          29
#> 18:       2            6          35

DayMin <- table1[table2, on = .(user_id, first_order = order_number), nomatch = 0]
setnames(DayMin, "days_cumsum", "dayMin")
DayMax <- table1[table2, on = .(user_id, last_order = order_number), nomatch = 0]
setnames(DayMax, "days_cumsum", "dayMax")
res <- DayMin[DayMax, on = .(user_id, product_id, first_order, last_order)]
# calculate diff and delete column
res[, c("diff", "dayMax", "dayMin") := list(dayMax - dayMin, NULL, NULL)]
res[]
#>     user_id product_id first_order last_order diff
#>  1:       1          2           3          3    0
#>  2:       1         14           1          4   26
#>  3:       1         19           2          4   19
#>  4:       1         38           1          5   34
#>  5:       1         71           2          5   27
#>  6:       1         24           2          7   46
#>  7:       1         30           4          7   27
#>  8:       1         40           4          8   33
#>  9:       1         66           5          8   25
#> 10:       1         98           4          9   40
#> 11:       2          7           2          4   13
#> 12:       2         16           3          5   12

"piped-like" version without renaming

table1[table2, on = .(user_id, first_order = order_number), nomatch = 0][
    table2, on = .(user_id , last_order = order_number), nomatch = 0][
      , `:=`(
        diff = i.days_cumsum - days_cumsum, 
        days_cumsum = NULL, 
        i.days_cumsum = NULL
      )][]
#>     user_id product_id first_order last_order diff
#>  1:       1          2           3          3    0
#>  2:       1         14           1          4   26
#>  3:       1         19           2          4   19
#>  4:       1         38           1          5   34
#>  5:       1         71           2          5   27
#>  6:       1         24           2          7   46
#>  7:       1         30           4          7   27
#>  8:       1         40           4          8   33
#>  9:       1         66           5          8   25
#> 10:       1         98           4          9   40
#> 11:       2          7           2          4   13
#> 12:       2         16           3          5   12

using reshaping for one merge only

tab <- melt(table1, id = 1:2, value.name = "order_number")[table2, on = .(user_id, order_number), nomatch = 0]
res <- dcast(tab, user_id + product_id ~ variable, value.var = c("order_number", "days_cumsum"), sep = "#")
setnames(res, c("user_id", "product_id", "first_order", "last_order", "dayMin", "dayMax"))
res[, c("diff", "dayMax", "dayMin") := list(dayMax - dayMin, NULL, NULL)]
res
#>     user_id product_id first_order last_order diff
#>  1:       1          2           3          3    0
#>  2:       1         14           1          4   26
#>  3:       1         19           2          4   19
#>  4:       1         24           2          7   46
#>  5:       1         30           4          7   27
#>  6:       1         38           1          5   34
#>  7:       1         40           4          8   33
#>  8:       1         66           5          8   25
#>  9:       1         71           2          5   27
#> 10:       1         98           4          9   40
#> 11:       2          7           2          4   13
#> 12:       2         16           3          5   12

0人赞添加讨论(0) 举报

Make new feature using 2 tables

采纳回答

编辑标签

举报内容

检举类型

检举原因

检举说明(必填)

打开微信“扫一扫”，打开网页后点击屏幕右上角分享按钮

付费偷看金额在0.1-10元之间