Creating user sessions with fast computation

I have a data frame with three columns: "uuid" (that is class factor) and "created_at" (that is class POSIXct),and "trainer_item_id" (factor) and I created a third column that is named "Sessions". The column Sessions represents time sessions for each uuid ordered by time, such that the time difference between any consecutive pair of events is at most one hour (3600seconds).

I have created the column Sessions using a "for loop" and iteration. The problem is that I have more than a million of observations and it will take 8 hours to create Sessions. Is there an easier and faster way to create it than my code below? Thanks in advance for your help!

Here is a sample of the original dataset --> https://gist.github.com/einsiol/5b4e633ce69d3a8e43252f383231e4b8

Here is my code -->

library(dplyr)
    # Builds trial$Sessions with an explicit row-by-row loop: `count` is the
    # current session number and is written to each pair of neighbouring rows.

    # Converting the data frame trial to tibble in order to use the function group_by
    # NOTE(review): tbl_df() is deprecated in recent dplyr releases in favour of
    # as_tibble() -- confirm against the installed version.
    trial <- tbl_df(trial); trial <- group_by(trial, uuid)

    # Ordering by timestamp (created_at)
    # NOTE(review): in current dplyr, arrange() ignores grouping unless
    # .by_group = TRUE, so this sorts the whole table by created_at only. The
    # uuid comparison in the loop looks like it expects the rows of one uuid to
    # be adjacent -- confirm whether arrange(trial, uuid, created_at) was meant.
    trial <- arrange(trial, created_at)

    # Creating empty vector of time difference tdiff
    time <- trial$created_at
    # tdiff starts with length 0 and is extended by one element per iteration.
    tdiff <- vector(mode = "numeric",length = 0)
    # NOTE(review): length() of a data frame is its number of columns, not rows,
    # so this vector is not sized to the table -- nrow(trial) was presumably
    # intended. The vector is character, so the numeric `count` stored below is
    # coerced to text.
    trial$Sessions <- vector(mode = "character",length = length(trial))

        # Session counter; starts at 1
        count <-1

            # Walks over every pair of neighbouring rows (i, i+1).
            # NOTE(review): 1:(n-1) evaluates to c(1, 0) when the table has a
            # single row.
            for(i in 1:(length(trial$uuid)-1)) {

                # Seconds between row i and row i+1
                tdiff[i] <- difftime(time[i+1], time[i],units = "secs")

                # If same user ID

                if (trial$uuid[i+1]==trial$uuid[i]){

                    # Gap under one hour: both rows get the current session number
                    if (tdiff[i]<3600){
                        trial$Sessions[i] <- count
                        trial$Sessions[i+1] <- count


                    # Gap of one hour or more: the same values are written, then
                    # the counter advances. Row i+1 is overwritten with the new
                    # number on the next iteration (when it is row i).
                    # NOTE(review): the last row has no next iteration, so it
                    # keeps the number written here.
                    }else{
                        trial$Sessions[i] <- count
                        trial$Sessions[i+1] <- count
                        count <- count+1
                    }

                    # If different user ID
                }else{

                    # Same assignments as in the same-uuid branch ...
                    if (tdiff[i]<3600){
                        trial$Sessions[i] <- count
                        trial$Sessions[i+1] <- count

                    }else{
                        trial$Sessions[i] <- count
                        trial$Sessions[i+1] <- count
                        count <- count+1
                    }

                    # ... but the counter is then reset to 1 in either case, so
                    # row i+1 is rewritten with 1 on the next iteration.
                    count <- 1
                }
            }

UPDATE: I have found the answer to my question and a fast alternative to this code that you can find below!

标签： r performance session for-loop large-data

3条回答

倾城　Initia

2楼-- · 2019-08-21 20:17

Since you already started with dplyr:

# Number the sessions of every uuid: a new session starts at a uuid's first
# event and whenever an event comes 3600 s or more after the previous one.
# Assumes dplyr is attached, so that lag() below is dplyr::lag().
# The result is still grouped by uuid; add ungroup() if that is not wanted.
trial <- trial %>%
  arrange(uuid, created_at) %>%
  group_by(uuid) %>%
  mutate(
    # seconds since the previous event of the same uuid (NA on its first row)
    diff = as.numeric(difftime(created_at, lag(created_at), units = "secs")),
    # 1 flags the first row of a new session, 0 continues the current one;
    # the NA on each uuid's first row also opens a session
    diff = ifelse(is.na(diff), 1, as.numeric(diff >= 3600)),
    # running count of session starts = session number within the uuid
    Sessions = cumsum(diff)
  ) %>%
  select(-diff) # drop the helper column

0人赞添加讨论(0) 举报

祖国的老花朵

3楼-- · 2019-08-21 20:23

I have found a very effective and fast way to make it work using vectorial calculus. It took me 30 seconds to run the code (instead of average 5 hours!)

   library(data.table);library(sqldf)

        # Assigns a running session id (LID$idsession) to every event. A new
        # session starts at the first event of each uuid and whenever more than
        # 3600 s (1 hour) separate an event from the previous one of the same
        # uuid. The id keeps increasing across uuids; it does not restart at 1.

        # Make sure created_at is a date-time before sorting and differencing
        if (!inherits(LID$created_at, "POSIXct")) {
          LID$created_at <- as.POSIXct(as.character(LID$created_at))
        }

        # Ordering by uuid and created_at
        LID <- LID[order(LID$uuid, LID$created_at), ]

        # Time difference in seconds between the current and the previous line.
        # Differencing the numeric timestamps pins the unit to seconds;
        # subtracting POSIXct values directly gives a difftime whose unit
        # (secs, mins, hours, ...) is picked automatically from the data.
        # The first line has no predecessor and gets the sentinel 9999 (> 3600).
        # seq_len() keeps the length at 0 for an empty table.
        LID$diff <- c(9999, diff(as.numeric(LID$created_at)))[seq_len(nrow(LID))]

        # Lines after which the uuid changes
        w <- which(LID$uuid[-1] != LID$uuid[-nrow(LID)])

        # Putting the duration to the sentinel 9999 on the first line of each
        # new uuid, so that it always opens a session
        LID$diff[w + 1] <- 9999

        # Identifying session changes: gaps greater than 3600 sec (1 hour)
        LID$chg_session <- as.numeric(LID$diff > 3600)

        # Cumulating the session changes to obtain the session id
        LID$idsession <- cumsum(LID$chg_session)

0人赞添加讨论(0) 举报

兄弟一词,经得起流年.

4楼-- · 2019-08-21 20:35

You can try to do this with data.table:

# library() stops with an error when data.table is not installed, whereas
# require() only returns FALSE and lets the script fail later.
library(data.table)

# Small reproducible example: 2 users with N events each.
N <- 4

# Event times are 600 * k^2 seconds after the origin (k = 1..N), identical for
# both users, which gives gaps of 1800, 3000 and 4200 seconds.
trial <- data.table(uuid = rep(1:2, each = N),
                    created_at = as.POSIXct(60 * 10 * rep((1:N)^2, times = 2),
                                            origin = "1990-01-01"))

# Sort by user, then by time (setkey reorders the table by reference)
setkey(trial, uuid, created_at)
trial
#    uuid          created_at
# 1:    1 1990-01-01 02:10:00
# 2:    1 1990-01-01 02:40:00
# 3:    1 1990-01-01 03:30:00
# 4:    1 1990-01-01 04:40:00
# 5:    2 1990-01-01 02:10:00
# 6:    2 1990-01-01 02:40:00
# 7:    2 1990-01-01 03:30:00
# 8:    2 1990-01-01 04:40:00

# Seconds since the previous row of the whole table; the first row has no
# predecessor and gets the placeholder 1. Values on the first row of a uuid
# compare against the previous user and are neutralised further down.
trial[, dif := c(1, as.numeric(diff(created_at), units = "secs"))]

# ii marks the first row of every uuid: group index minus the previous row's
# group index is 1 where the uuid changes and 0 elsewhere.
# shift() is data.table's lag; a bare lag() resolves to stats::lag() unless
# dplyr is attached, and stats::lag() does not shift the values.
trial[, ii := .GRP, by = uuid]
trial[, ii := ii - shift(ii)]
trial[is.na(ii), ii := 1L] # very first row of the table

# i is 1 where a gap of 3600 s or more opens a new session ...
trial[, i := as.integer(dif >= 3600)]
# ... except on the first row of a uuid, where dif is not meaningful
trial[ii == 1L, i := 0L]

# Session number = 1 + number of session breaks seen so far within the uuid
trial[, Sessions := cumsum(i) + 1L, by = uuid]
trial
#    uuid          created_at   dif ii i Sessions
# 1:    1 1990-01-01 02:10:00     1  1 0        1
# 2:    1 1990-01-01 02:40:00  1800  0 0        1
# 3:    1 1990-01-01 03:30:00  3000  0 0        1
# 4:    1 1990-01-01 04:40:00  4200  0 1        2
# 5:    2 1990-01-01 02:10:00 -9000  1 0        1
# 6:    2 1990-01-01 02:40:00  1800  0 0        1
# 7:    2 1990-01-01 03:30:00  3000  0 0        1
# 8:    2 1990-01-01 04:40:00  4200  0 1        2

0人赞添加讨论(0) 举报

Creating user sessions with fast computation

采纳回答

编辑标签

举报内容

检举类型

检举原因

检举说明(必填)

打开微信“扫一扫”，打开网页后点击屏幕右上角分享按钮

付费偷看金额在0.1-10元之间