Below is the code I am currently using to summarize my data, and it is working. My question: I want to take the average of the "CE100" column instead of the sum. How can I modify the code below to do this?
library(data.table, warn.conflicts = FALSE)
library(magrittr) ### MODIFIED
# library(lubridate, warn.conflicts = FALSE) ### MODIFIED
################
## PARAMETERS ##
################
# Set path of major source folder for raw transaction data
in_directory <- "C:/Users/NAME/Documents/Raw Data/"
# List names of sub-folders (currently grouped by first two characters of CUST_ID)
in_subfolders <- list("AA-CA", "CB-HZ", "IA-IL", "IM-KZ", "LA-MI", "MJ-MS",
                      "MT-NV", "NW-OH", "OI-PZ", "QA-TN", "TO-UZ",
                      "VA-WA", "WB-ZZ")
# Set location for output
out_directory <- "C:/Users/NAME/Documents/YTD Master/"
out_filename <- "OUTPUT.csv"
# Set beginning and end of date range to be collected - year-month-day format
date_range <- c("2018-01-01", "2018-06-30") ### MODIFIED
# Enable or disable filtering of raw files to only grab items bought within certain months to save space.
# If false, all files will be scanned for unique items, which will take longer and be a larger file.
# date_filter <- TRUE ### MODIFIED
##########
## CODE ##
##########
starttime <- Sys.time()
# create vector of filenames to be processed
in_filenames <- list.files(
  file.path(in_directory, in_subfolders),
  pattern = "\\.txt$",
  full.names = TRUE,
  recursive = TRUE)
# filter filenames, keeping only files that match a month within date_range
selected_in_filenames <-
  seq(as.Date(date_range[1]),
      as.Date(date_range[2]), by = "1 month") %>%
  format("%Y-%m") %>%
  lapply(function(x) stringr::str_subset(in_filenames, x)) %>%
  unlist()
# read and aggregate each file separately
mastertable <- rbindlist(
  lapply(selected_in_filenames, function(fn) {
    message("Processing file: ", fn)
    temptable <- fread(fn,
                       colClasses = c(CUSTOMER_TIER = "character"),
                       na.strings = "")
    # Add columns
    print(paste0("Adding columns - ", fn))
    print(Sys.time() - starttime)
    temptable[, ':='(CustPart = paste0(CUST_ID, INV_ITEM_ID))]
    # aggregate file but filtered for date_range
    temptable[INVOICE_DT %between% date_range,
              lapply(.SD, sum),
              by = .(CustPart, QTR = quarter(INVOICE_DT), YEAR = year(INVOICE_DT)),
              .SDcols = c("Ext Sale", "CE100")]
  })
)[
  # second aggregation overall
  , lapply(.SD, sum), by = .(CustPart, QTR, YEAR), .SDcols = c("Ext Sale", "CE100")]
# Save Final table
print("Saving master table")
fwrite(mastertable, file.path(out_directory, out_filename))
# rm(mastertable) ### MODIFIED
print(Sys.time()-starttime)
mastertable
I have included all my code to show how I read my data in. If any other details are necessary, such as some sample data to work with, let me know.
The crucial point in the OP's approach is the staggered aggregation (see the related question "row not consolidating duplicates in R when using multiple months in Date Filter").
The OP wants to aggregate data across a number of files which apparently are too large to be loaded altogether and combined into a large data.table.
Instead, each file is read in and aggregated separately. The sub-totals are combined into a data.table from which the overall totals are computed in a second aggregation step.
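This two-step combination is exact for sums (and counts), as a quick base R check illustrates:

# sub-totals of the chunks add up to the overall total
sum(c(sum(1:2), sum(3:5))) == sum(1:5)   # TRUE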
Now, the OP wants to include sums as well as averages in the aggregation steps. The staggered aggregation works for sums and counts but not for means: e.g., mean(1:5), which is 3, is not the same as the mean of the sub-totals mean(1:2) and mean(3:5), i.e., mean(c(mean(1:2), mean(3:5))), which is 2.75.
So, the approach below computes only sums and counts in the first and second aggregation steps and computes the averages for the selected columns separately afterwards. Data are taken from the OP's other question. Furthermore, the by = parameter is simplified for demonstration and date_range has been adapted to the sample data.
Missing values are included in the aggregates. It needs to be decided on the business side how to handle missing values. In case missing values are to be excluded from aggregation, the staggered computation of averages might become much more complicated.
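The answer's full code is not reproduced here, but a minimal sketch of the sum-and-count idea, kept close to the OP's column names and grouping above (the actual answer simplifies the by = columns and uses the sample data from the other question), could look like this:

# first aggregation per file: group sums plus a row count N
mastertable <- rbindlist(
  lapply(selected_in_filenames, function(fn) {
    temptable <- fread(fn,
                       colClasses = c(CUSTOMER_TIER = "character"),
                       na.strings = "")
    temptable[, CustPart := paste0(CUST_ID, INV_ITEM_ID)]
    temptable[INVOICE_DT %between% date_range,
              c(lapply(.SD, sum), list(N = .N)),
              by = .(CustPart, QTR = quarter(INVOICE_DT), YEAR = year(INVOICE_DT)),
              .SDcols = c("Ext Sale", "CE100")]
  })
)[
  # second aggregation: add up the sub-totals and the counts
  , lapply(.SD, sum), by = .(CustPart, QTR, YEAR),
  .SDcols = c("Ext Sale", "CE100", "N")]
# finally, derive the average of CE100 from its overall sum and overall count
mastertable[, CE100 := CE100 / N][, N := NULL]

Because the counts travel with the sums through both aggregation steps, the final division yields the exact overall average per group rather than a mean of sub-averages.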