Creating user sessions with fast computation

2019-08-21 20:15发布

问题:

I have a data frame with three columns: "uuid" (that is class factor) and "created_at" (that is class POSIXct),and "trainer_item_id" (factor) and I created a third column that is named "Sessions". The column Sessions represents time sessions for each uuid ordered by time, such that the time difference between any consecutive pair of events is at most one hour (3600seconds).

I have created the column Sessions using a "for loop" and iteration. The problem is that I have more than a million of observations and it will take 8 hours to create Sessions. Is there an easier and faster way to create it than my code below? Thanks in advance for your help!

Here is a sample of the original dataset --> https://gist.github.com/einsiol/5b4e633ce69d3a8e43252f383231e4b8

Here is my code -->

library(dplyr)
    # Converting the data frame trial to tibble in order to use the function group_by
    trial <- tbl_df(trial); trial <- group_by(trial, uuid)

    # Ordering by timestamp (created_at)
    trial <- arrange(trial, created_at)

    # Creating empty vector of time difference tdiff
    time <- trial$created_at
    tdiff <- vector(mode = "numeric",length = 0)
    trial$Sessions <- vector(mode = "character",length = length(trial))

        count <-1

            for(i in 1:(length(trial$uuid)-1)) {

                tdiff[i] <- difftime(time[i+1], time[i],units = "secs")

                # If same user ID

                if (trial$uuid[i+1]==trial$uuid[i]){

                    if (tdiff[i]<3600){
                        trial$Sessions[i] <- count
                        trial$Sessions[i+1] <- count


                    }else{
                        trial$Sessions[i] <- count
                        trial$Sessions[i+1] <- count
                        count <- count+1
                    }

                    # If different user ID
                }else{

                    if (tdiff[i]<3600){
                        trial$Sessions[i] <- count
                        trial$Sessions[i+1] <- count

                    }else{
                        trial$Sessions[i] <- count
                        trial$Sessions[i+1] <- count
                        count <- count+1
                    }

                    count <- 1
                }
            }

UPDATE: I have found the answer to my question and a fast alternative to this code that you can find below!

回答1:

Since you already started with dplyr:

trial <- 

trial %>% 
  arrange(uuid, created_at) %>% 
  group_by(uuid) %>% 
  mutate(diff = difftime(created_at, lag(created_at), units = 'secs'), # calculate timediff for each row
    diff = as.numeric(diff >= 3600), # flags each new session with the number 1
    diff = ifelse(is.na(diff), 1, diff), %>% #replaces the first row of each group with 1 
    Sessions = cumsum(diff)) %>% #sum all the sessions for each group
  select(-diff) # remove diff column


回答2:

You can try to do this with data.table:

require(data.table)
N <- 4

trial <- data.table(uuid = rep(1:2, each = N),
                    created_at = as.POSIXct(60* 10 *rep(1:N, times = 2)*
                                              rep(1:N, times = 2),
                                            origin = "1990-01-01"))

setkey(trial, uuid, created_at)
trial
#    uuid          created_at
# 1:    1 1990-01-01 02:10:00
# 2:    1 1990-01-01 02:40:00
# 3:    1 1990-01-01 03:30:00
# 4:    1 1990-01-01 04:40:00
# 5:    2 1990-01-01 02:10:00
# 6:    2 1990-01-01 02:40:00
# 7:    2 1990-01-01 03:30:00
# 8:    2 1990-01-01 04:40:00

trial[, dif := c(1, as.numeric(diff(created_at), units = "secs"))]
trial[, ii := .GRP, by = uuid]
trial[, ii := ii - lag(ii)]
trial[is.na(ii), ii := 1L]
trial[, i := ifelse(dif < 3600, 0L, 1L)]
trial[ii == 1L, i := 0L]
trial[, Sessions := cumsum(i), by = uuid]
trial[, Sessions := Sessions + 1L, by = uuid]
trial
#    uuid          created_at   dif ii i Sessions
# 1:    1 1990-01-01 02:10:00     1  1 0        1
# 2:    1 1990-01-01 02:40:00  1800  0 0        1
# 3:    1 1990-01-01 03:30:00  3000  0 0        1
# 4:    1 1990-01-01 04:40:00  4200  0 1        2
# 5:    2 1990-01-01 02:10:00 -9000  1 0        1
# 6:    2 1990-01-01 02:40:00  1800  0 0        1
# 7:    2 1990-01-01 03:30:00  3000  0 0        1
# 8:    2 1990-01-01 04:40:00  4200  0 1        2


回答3:

I have found a very effective and fast way to make it work using vectorial calculus. It took me 30 seconds to run the code (instead of average 5 hours!)

   library(data.table);library(sqldf)

        # Ordering by uuid and created_at
        LID<-LID[order(LID$uuid,LID$created_at),]

        # Computing time difference (sec) between the current and previous ligne 
        LID$created_at <- as.POSIXct(as.character(LID$created_at)) 
        LID$diff<-c(9999,LID$created_at[-1]-LID$created_at[-nrow(LID)])
        options(stringAsFactor = FALSE) 

        # Lines corresponding to a new uuid 
        w<-which(LID$uuid[-1]!=LID$uuid[-nrow(LID)])

        # Putting the duration to NA when there is a change of uuid
        LID$diff[w+1]<-9999

        # Identifying sessions changes that are greater than 3600 sec (1 hour)
        LID$chg_session<-as.numeric(LID$diff>3600)

        # Cumulating and determining the id_sessions with the inverse of Differencing
        LID$idsession<-diffinv(LID$chg_session)[-1]