Expand data.frame by creating duplicates based on

2019-02-28 07:52发布

问题:

Starting from this SO question.

Example data.frame:

df = read.table(text = 'ID  Day Count   Count_group
            18  1933    6   15
            33  1933    6   15
            37  1933    6   15
            18  1933    6   15
            16  1933    6   15
            11  1933    6   15
            111 1932    5   9
            34  1932    5   9
            60  1932    5   9
            88  1932    5   9
            18  1932    5   9
            33  1931    3   4
            13  1931    3   4
            56  1931    3   4
            23  1930    1   1
            6   1800    6   12
            37  1800    6   12
            98  1800    6   12
            52  1800    6   12
            18  1800    6   12
            76  1800    6   12
            55  1799    4   6
            6   1799    4   6
            52  1799    4   6
            133 1799    4   6
            112 1798    2   2
            677 1798    2   2
            778 888     4   8
            111 888     4   8
            88  888     4   8
            10  888     4   8
            37  887     2   4
            26  887     2   4
            8   886     1   2
            56  885     1   1
            22  120     2   6
            34  120     2   6
            88  119     1   6
            99  118     2   5
            12  118     2   5
            90  117     1   3
            22  115     2   2
            99  115     2   2', header = TRUE)

The Count col shows the total number of ID values per each Day and the Count_group col shows the sum of the ID values per each Day, Day - 1, Day -2, Day -3 and Day -4.

e.g. 1933 = Count_group 15 because Count 6 (1933) + Count 5 (1932) + Count 3 (1931) + Count 1 (1930) + Count 0 (1929).

What I need to do is to create duplicated observations per each Count_group and add them to it in order to show per each Count_group its Day, Day - 1, Day -2, Day -3 and Day -4.

e.g. Count_group = 15 is composed by the Count values of Day 1933, 1932, 1931, 1930 (and 1929 not present in the df). So the five days needs to be included in the Count_group = 15. The next one will be Count_group = 9, composed by 1932, 1931, 1930, 1929 and 1928; etc...

Desired output:

ID  Day   Count Count_group
18  1933    6   15
33  1933    6   15
37  1933    6   15
18  1933    6   15
16  1933    6   15
11  1933    6   15
111 1932    5   15
34  1932    5   15
60  1932    5   15
88  1932    5   15
18  1932    5   15
33  1931    3   15
13  1931    3   15
56  1931    3   15
23  1930    1   15

111 1932    5   9
34  1932    5   9
60  1932    5   9
88  1932    5   9
18  1932    5   9
33  1931    3   9
13  1931    3   9
56  1931    3   9
23  1930    1   9

33  1931    3   4
13  1931    3   4
56  1931    3   4
23  1930    1   4

23  1930    1   1

6   1800    6   12
37  1800    6   12
98  1800    6   12
52  1800    6   12
18  1800    6   12
76  1800    6   12
55  1799    4   12
6   1799    4   12
52  1799    4   12
133 1799    4   12
112 1798    2   12
677 1798    2   12

55  1799    4   6
6   1799    4   6
52  1799    4   6
133 1799    4   6
112 1798    2   6
677 1798    2   6

112 1798    2   2
677 1798    2   2

778 888     4   8
111 888     4   8
88  888     4   8
10  888     4   8
37  887     2   8
26  887     2   8
8   886     1   8
56  885     1   8

37  887     2   4
26  887     2   4
8   886     1   4
56  885     1   4

8   886     1   2
56  885     1   2

56  885     1   1

22  120     2   6
34  120     2   6
88  119     1   6
99  118     2   6
12  118     2   6
90  117     1   6

88  119     1   6
99  118     2   6
12  118     2   6
90  117     1   6
22  115     2   6
99  115     2   6

99  118     2   5
12  118     2   5
90  117     1   5
22  115     2   5
99  115     2   5

90  117     1   3
22  115     2   3
99  115     2   3

22  115     2   2
99  115     2   2

(note that different group of 5 days each one have been separated by a blank line in order to make them clearer)

I have got different data.frames which are grouped by n days and therefore I would like to adapt the code (by changing it a little) specifically for each of them.

Thanks

回答1:

A generalised version of my previous answer...

#first add grouping variables
days <- 5 #grouping no of days
df$smalldaygroup <- c(0,cumsum(sapply(2:nrow(df),function(i) df$Day[i]!=df$Day[i-1]))) #individual days
df$bigdaygroup <- c(0,cumsum(sapply(2:nrow(df),function(i) df$Day[i]<df$Day[i-1]-days+1))) #blocks of linked days

#duplicate days in each big group
df2 <- lapply(split(df,df$bigdaygroup),function(x) {
  n <- max(x$Day)-min(x$Day)+1 #number of consecutive days in big group
  dayvec <- (max(x$Day):min(x$Day)) #possible days in range
  daylog <- dayvec[dayvec %in% x$Day] #actual days in range
  pattern <- data.frame(base=rep(dayvec,each=days))
  pattern$rep <- sapply(1:nrow(pattern),function(i) pattern$base[i]+1-sum(pattern$base[1:i]==pattern$base[i])) #indices to repeat
  pattern$offset <- match(pattern$rep,daylog)-match(pattern$base,daylog) #offsets (used later)
  pattern <- pattern[(pattern$base %in% x$Day) & (pattern$rep %in% x$Day),] #remove invalid elements
  #store pattern in list as offsets needed in next loop
  return(list(df=split(x,x$smalldaygroup)[match(pattern$rep,daylog)],pat=pattern))
})

#change the Count_group to previous value in added entries
df2 <- lapply(df2,function(L) lapply(1:length(L$df),function(i) {
  x <- L$df[[i]]
  offset <- L$pat$offset #pointer to day to copy Count_group from
  x$Count_group <- L$df[[i-offset[i]]]$Count_group[1]
  return(x)
}))

df2 <- do.call(rbind,unlist(df2,recursive=FALSE)) #bind back together

df2[,5:6] <- NULL #remove grouping variables

head(df2,30) #ignore rownames!

       ID  Day Count Count_group
01.1   18 1933     6          15
01.2   33 1933     6          15
01.3   37 1933     6          15
01.4   18 1933     6          15
01.5   16 1933     6          15
01.6   11 1933     6          15
02.7  111 1932     5          15
02.8   34 1932     5          15
02.9   60 1932     5          15
02.10  88 1932     5          15
02.11  18 1932     5          15
03.12  33 1931     3          15
03.13  13 1931     3          15
03.14  56 1931     3          15
04     23 1930     1          15
05.7  111 1932     5           9
05.8   34 1932     5           9
05.9   60 1932     5           9
05.10  88 1932     5           9
05.11  18 1932     5           9
06.12  33 1931     3           9
06.13  13 1931     3           9
06.14  56 1931     3           9
07     23 1930     1           9
08.12  33 1931     3           4
08.13  13 1931     3           4
08.14  56 1931     3           4
09     23 1930     1           4
010    23 1930     1           1
11.16   6 1800     6          12


回答2:

I attach a rather mechanical method, but I believe it is a good starting point. I have noticed that in your original table the entry

ID Day Count Count_group 18 1933 6 14

is duplicated; I have left it untouched for sake of clarity.

Structure of the approach:

  1. Read original data
  2. Generate list of data frames, for each Day
  3. Generate final data frame, collapsing the list in 2.

1. Read original data

We start with

df = read.table(text = 'ID  Day Count   Count_group
                18  1933    6   14
                33  1933    6   14
                37  1933    6   14
                18  1933    6   14
                16  1933    6   14
                11  1933    6   14
                111 1932    5   9
                34  1932    5   9
                60  1932    5   9
                88  1932    5   9
                18  1932    5   9
                33  1931    3   4
                13  1931    3   4
                56  1931    3   4
                23  1930    1   1
                6   1800    6   12
                37  1800    6   12
                98  1800    6   12
                52  1800    6   12
                18  1800    6   12
                76  1800    6   12
                55  1799    4   6
                6   1799    4   6
                52  1799    4   6
                133 1799    4   6
                112 1798    2   2
                677 1798    2   2
                778 888     4   7
                111 888     4   7
                88  888     4   7
                10  888     4   7
                37  887     2   4
                26  887     2   4
                8   886     1   2
                56  885     1   1', header = TRUE)

# ordered vector of unique values for "Day"
ord_day <- unique(df$Day[order(df$Day)])
ord_day
 [1]  885  886  887  888 1798 1799 1800 1930 1931 1932 1933

2. Generate list of data frames, for each Day

For each element in ord_day we introduce a data.frame as element of a list called df_new_aug. Such data frames are defined through a for loop for all values in ord_day except ord_day[2] and ord_day[1] which are treated separately.

Idea behind the looping: for each unique ord_day[i] with i > 2 we check which days between ord_day[i-1] and ord_day[i-2] (or both!) contribute (through the variable "Count") to the value "Count_Group" at ord_day[i].

We therefore introduce if else statements in the loop. Here we go

# Recursive generation of the list of data.frames (for days > 886)
#-----------------------------------------------------------------
df_new <- list()
df_new_aug <- list()

# we exclude cases  i=1, 2: they are manually treated below
for ( i in 3: length(ord_day) ) {

  # is "Count_Group" for ord_day[i] equal to the sum of "Count" at ord_day[i-1] and ord_day[i-2]?
  if ( unique(df[df$Day == ord_day[i], "Count_group"]) == unique(df[df$Day == ord_day[i], "Count"])  +  
       unique(df[df$Day == ord_day[i-1], "Count"]) + unique(df[df$Day == ord_day[i-2], "Count"])
       ) {

        # we create columns ID | Day | Count
        df_new[[i]] <- data.frame(df[df$Day == ord_day[i] | df$Day == ord_day[i-1] | df$Day == ord_day[i-2], 
                                     c("ID", "Day", "Count")])

        # we append the Count_Group of the Day in ord_day[i]
        df_new_aug[[i]] <- data.frame( df_new[[i]],
                                   Count_group = rep(unique(df[df$Day == ord_day[i], "Count_group"]), nrow(df_new[[i]]) ) )


         } else if (unique(df[df$Day == ord_day[i], "Count_group"]) == unique(df[df$Day == ord_day[i], "Count"])  +  
                    unique(df[df$Day == ord_day[i-1], "Count"]) ) #only "Count" at i and i-1 contribute to "Count_group" at i
                    {

                    df_new[[i]] <- data.frame(df[df$Day == ord_day[i] | df$Day == ord_day[i-1], 
                                                 c("ID", "Day", "Count")])

                    # we append the Count_Group of the Day in ord_day[2]
                    df_new_aug[[i]] <- data.frame(df_new[[i]],
                                                  Count_group = rep(unique(df[df$Day == ord_day[i], "Count_group"]), nrow(df_new[[i]]) ) )

                     } else #only "Count" at i contributes to "Count_group" at i

                            df_new[[i]] <- data.frame(df[df$Day == ord_day[i], 
                                                         c("ID", "Day", "Count")])

                            # we append the Count_Group of the Day in ord_day[i]
                            df_new_aug[[i]] <- data.frame(df_new[[i]],
                                                          Count_group = rep(unique(df[df$Day == ord_day[i], "Count_group"]), nrow(df_new[[i]]) ) )


  #closing the for loop     
  }


# for ord_day[2] = "886" (both "Count" at i =2 and i = 1 contribute to "Count_group" at i=2)
#-------------------------------------------------------------------------------------
df_new[[2]] <- data.frame(df[df$Day == ord_day[2] | df$Day == ord_day[1], 
                             c("ID", "Day", "Count")])

# we append the Count_Group of the Day in ord_day[2]
df_new_aug[[2]] <- data.frame(df_new[[2]],
                              Count_group = rep(unique(df[df$Day == ord_day[2], "Count_group"]), nrow(df_new[[2]]) ) )

# for ord_day[1] = "885" (only "count" at i = 1 contributes to "Count_group" at i =1)
#------------------------------------------------------------------------------------
df_new[[1]] <- data.frame(df[df$Day == ord_day[1], c("ID", "Day", "Count")])

# we append the Count_Group of the Day in ord_day[i]
df_new_aug[[1]] <- data.frame(df_new[[1]], Count_group = rep(unique(df[df$Day == ord_day[1], "Count_group"]), nrow(df_new[[1]]) ) )


# produced list
df_new_aug

3. Generate final data frame, collapsing the list in 2.

We collapse df_new_aug through an ugly loop, but other solutions (for example with Reduce() and merge() are possible):

# merging the list (mechanically): final result
df_result <- df_new_aug[[1]]
for (i in 1:10){
  df_result <- rbind(df_result, df_new_aug[[i+1]])      
}

One arrives at df_result and the analysis is stopped.