For-Loop By Columns with existing For-loop by Rows

2020-04-21 02:12发布

问题:

I have a dataset as follows as a sample. My actual dataset has 5000 columns:

# Define Adstock Rate
adstock_rate = 0.50
lag_number = 3
# Create Data
advertising = c(117.913, 120.112, 125.828, 115.354, 177.090, 141.647, 137.892,   0.000,   0.000,   0.000,   0.000, 
        0.000,   0.000,   0.000,   0.000,   0.000,   0.000, 158.511, 109.385,  91.084,  79.253, 102.706, 
        78.494, 135.114, 114.549,  87.337, 107.829, 125.020,  82.956,  60.813,  83.149,   0.000,   0.000, 
        0.000,   0.000,   0.000,   0.000, 129.515, 105.486, 111.494, 107.099,   0.000,   0.000,   0.000, 
        0.000,   0.000,   0.000,   0.000,   0.000,   0.000,   0.000,   0.000,
        134.913, 123.112, 178.828, 112.354, 100.090, 167.647, 177.892,   0.000,   0.000,   0.000,   0.000, 
        0.000,   0.000,   0.000,   0.000,   0.000,   0.000, 112.511, 155.385,  123.084,  89.253, 67.706, 
        23.494, 122.114, 112.549,  65.337, 134.829, 123.020,  81.956,  23.813,  65.149,   0.000,   0.000, 
        0.000,   0.000,   0.000,   0.000, 145.515, 154.486, 121.494, 117.099,   0.000,   0.000,   0.000, 
        0.000,   0.000,   0.000,   0.000,   0.000,   0.000,   0.000,   0.000
        )

advertising2 = c(43.913, 231.112, 76.828, 22.354, 98.090, 123.647, 90.892,   0.000,   0.000,   0.000,   0.000, 
        0.000,   0.000,   0.000,   0.000,   0.000,   0.000, 234.511, 143.385,  78.084,  89.253, 12.706, 
        34.494, 56.114, 78.549,  12.337, 67.829, 42.020,  90.956,  23.813,  83.149,   0.000,   0.000, 
        0.000,   0.000,   0.000,   0.000, 52.515, 76.486, 89.494, 12.099,   0.000,   0.000,   0.000, 
        0.000,   0.000,   0.000,   0.000,   0.000,   0.000,   0.000,   0.000,
        67.913, 12.112, 45.828, 78.354, 89.090, 90.647, 23.892,   0.000,   0.000,   0.000,   0.000, 
        0.000,   0.000,   0.000,   0.000,   0.000,   0.000, 78.511, 23.385,  43.084,  67.253, 33.706, 
        56.494, 78.114, 98.549,  45.337, 31.829, 67.020,  87.956,  94.813,  65.149,   0.000,   0.000, 
        0.000,   0.000,   0.000,   0.000, 55.515, 32.486, 78.494, 33.099,   0.000,   0.000,   0.000, 
        0.000,   0.000,   0.000,   0.000,   0.000,   0.000,   0.000,   0.000
        )
Region = c(500, 500, 500, 500, 500, 500, 500, 500,500, 500, 500, 500,500, 500, 500, 500,500, 500, 500, 500,500, 500, 500, 500,
   500, 500, 500, 500,500, 500, 500, 500,500, 500, 500, 500,500, 500, 500, 500,500, 500, 500, 500,500, 500, 500, 500, 500, 500, 
   500, 500,
   501, 501, 501, 501, 501, 501, 501, 501,501, 501, 501, 501,501, 501, 501, 501,501, 501, 501, 501,501, 501, 501, 501,
   501, 501, 501, 501,501, 501, 501, 501,501, 501, 501, 501,501, 501, 501, 501,501, 501, 501, 501,501, 501, 501, 501, 501, 501, 
   501, 501)

advertising_dataset<-data.frame(cbind(Region, advertising, advertising2))

My dataset looks like this:

head(advertising_dataset, 15)

   Region advertising advertising2
1     500     117.913       43.913
2     500     120.112      231.112
3     500     125.828       76.828
4     500     115.354       22.354
5     500     177.090       98.090
6     500     141.647      123.647
7     500     137.892       90.892
8     500       0.000        0.000
9     500       0.000        0.000
10    500       0.000        0.000
11    500       0.000        0.000
12    500       0.000        0.000
13    500       0.000        0.000
14    500       0.000        0.000
15    500       0.000        0.000

A for-loop is then created to only 1 column and then a group_by function after that by Region.

foo <- function(df_, lag_val = 1) {
  df_$adstocked_advertising = df_$advertising
  for (i in (1 + lag_val):nrow(df_)) {
    df_$adstocked_advertising[i] = df_$advertising[i] + adstock_rate * 
df_$adstocked_advertising[i - lag_val]
  }
  return(df_)
}


adv_2 <- data.frame(advertising_dataset %>%
                      group_by(Region) %>%
                      do(foo(data.frame(.), lag_val = 3)))    

How do I apply the above functions including adv_2 to all columns from 2:ncol(advertising_dataset) rather than just the advertising column?

My final number of columns should double in the end because a newly revised column will be created for every existing column.

I have a feeling it is something along these lines, with the function I above:

data.frame(advertising_dataset[1], 
apply(advertising_dataset[2:ncol(advertising_dataset)],2, foo) )

Any help would be great, thanks!

回答1:

We could use accumulate with mutate_all

library(tidyverse)
out <- advertising_dataset %>% 
         group_by(Region) %>%
         mutate_all(funs(adstocked = accumulate(., ~ .y + adstock_rate * .x)))
out
# A tibble: 104 x 5
# Groups:   Region [2]
#   Region advertising advertising2 advertising_adstocked advertising2_adstocked
#    <dbl>       <dbl>        <dbl>                 <dbl>                  <dbl>
# 1    500        118.         43.9                 118.                    43.9
# 2    500        120.        231.                  179.                   253. 
# 3    500        126.         76.8                 215.                   203. 
# 4    500        115.         22.4                 223.                   124. 
# 5    500        177.         98.1                 289.                   160. 
# 6    500        142.        124.                  286.                   204. 
# 7    500        138.         90.9                 281.                   193. 
# 8    500          0           0                   140.                    96.4
# 9    500          0           0                    70.2                   48.2
#10    500          0           0                    35.1                   24.1
# ... with 94 more rows

Checking with the output from OP's solution

head(out[[4]])
#[1] 117.9130 179.0685 215.3623 223.0351 288.6076 285.9508

head(adv_2[[4]])
#[1] 117.9130 179.0685 215.3623 223.0351 288.6076 285.9508

Update

We could modify the OP's function foo for different lag_val

foo1 <- function(dot, lag_val = 1) {
     tmp <- dot
     for(i in (1 + lag_val): length(tmp)) {
           tmp[i] <- tmp[i] + adstock_rate * tmp[i - lag_val]
     }
     return(tmp)
   }


advertising_dataset %>%
       group_by(Region) %>%
       mutate_all(funs(adstocked = foo1(., lag_val = 1)))
# A tibble: 104 x 5
# Groups:   Region [2]
#   Region advertising advertising2 advertising_adstocked advertising2_adstocked
#    <dbl>       <dbl>        <dbl>                 <dbl>                  <dbl>
# 1    500        118.         43.9                 118.                    43.9
# 2    500        120.        231.                  179.                   253. 
# 3    500        126.         76.8                 215.                   203. 
# 4    500        115.         22.4                 223.                   124. 
# 5    500        177.         98.1                 289.                   160. 
# 6    500        142.        124.                  286.                   204. 
# 7    500        138.         90.9                 281.                   193. 
# 8    500          0           0                   140.                    96.4
# 9    500          0           0                    70.2                   48.2
#10    500          0           0                    35.1                   24.1
# ... with 94 more rows

-change the lag_val

advertising_dataset %>%
            group_by(Region) %>%
            mutate_all(funs(adstocked = foo1(., lag_val = 2)))
# A tibble: 104 x 5
# Groups:   Region [2]
#   Region advertising advertising2 advertising_adstocked advertising2_adstocked
#    <dbl>       <dbl>        <dbl>                 <dbl>                  <dbl>
# 1    500        118.         43.9                 118.                    43.9
# 2    500        120.        231.                  120.                   231. 
# 3    500        126.         76.8                 185.                    98.8
# 4    500        115.         22.4                 175.                   138. 
# 5    500        177.         98.1                 269.                   147. 
# 6    500        142.        124.                  229.                   193. 
# 7    500        138.         90.9                 273.                   165. 
# 8    500          0           0                   115.                    96.3
# 9    500          0           0                   136.                    82.3
#10    500          0           0                    57.3                   48.2
# ... with 94 more rows


回答2:

Using data.table, you can apply any function to a subset of columns using an lapply within the j-statement. These functions will act on the columns as vectors, however, so you might want to edit your foo() to be general to any vector. But, basically, all you have to do is:

library(data.table)
setDT(advertising_dataset)
cnames <- colnames(advertising_dataset)
advertising_dataset[, lapply(.SD, foo), .SDcols = cnames[cnames != "Region]]