Frequency table including zeros for unused values,

2019-02-09 15:14发布

问题:

I have a data set that is as follows:

library(data.table)

test <- data.table(structure(list(Issue.Date = structure(c(16041, 16056, 16042,15990, 15996, 16001, 15995, 15981, 15986, 15996, 15996, 16002,16015, 16020, 16025, 16032, 16023, 16084, 16077, 16102, 16104,16107, 16112, 16113, 16115, 16121, 16125, 16128, 16104, 16132,16133, 16135, 16139, 16146, 16151), class = "Date"), 
    Complaint = structure(c(1L,4L, 4L, 4L, 4L, 4L, 4L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 1L,5L, 3L, 1L, 3L, 1L, 4L, 4L, 3L, 3L, 3L, 3L, 3L, 2L, 2L, 1L, 3L,3L, 3L), .Label = c("A", "B", "C", "D", "E"), class = "factor"),
    yr = c("2013", "2013", "2013", "2013", "2013", "2013", "2013","2013", "2013", "2013", "2013", "2013", "2013", "2013", "2013","2013", "2013", "2014", "2014", "2014", "2014", "2014", "2014","2014", "2014", "2014", "2014", "2014", "2014", "2014", "2014","2014", "2014", "2014", "2014"), 
    Month = c("2013-12", "2013-12","2013-12", "2013-10", "2013-10", "2013-10", "2013-10", "2013-10","2013-10", "2013-10", "2013-10", "2013-10", "2013-11", "2013-11","2013-11", "2013-11", "2013-11", "2014-01", "2014-01", "2014-02","2014-02", "2014-02", "2014-02", "2014-02", "2014-02", "2014-02","2014-02", "2014-02", "2014-02", "2014-03", "2014-03", "2014-03","2014-03", "2014-03", "2014-03"), 
    da = c("02", "17", "03","12", "18", "23", "17", "03", "08", "18", "18", "24", "06","11", "16", "23", "14", "14", "07", "01", "03", "06", "11","12", "14", "20", "24", "27", "03", "03", "04", "06", "10","17", "22")), 
   .Names = c("Issue.Date", "Complaint", "yr","Month", "da"), class = c("data.table", "data.frame"), row.names = c(NA,-35L)))

Basically what I would like to do is use data.table to create a frequency table that has Complaint and Count by Month. The trick is that I need it to show a Count of zero if there are no Complaints of that type for that Month. I know how to do it without showing the zeros, but I want to know how to include them.

test[ , count := .N, by = "Month,Complaint"]

回答1:

To directly get the counts for each group:

setkey(test, Month, Complaint)

# may need to also add allow.cartesian, depending on actual data
test[CJ(Month, Complaint, unique = TRUE), .N, by = .EACHI]
#      Month Complaint N
# 1: 2013-10         A 0
# 2: 2013-10         B 0
# 3: 2013-10         C 5
# 4: 2013-10         D 4
# 5: 2013-10         E 0
# 6: 2013-11         A 1
# 7: 2013-11         B 0
# 8: 2013-11         C 4
# 9: 2013-11         D 0
#10: 2013-11         E 0
#11: 2013-12         A 1
#12: 2013-12         B 0
#13: 2013-12         C 0
#14: 2013-12         D 2
#15: 2013-12         E 0
#16: 2014-01         A 0
#17: 2014-01         B 0
#18: 2014-01         C 1
#19: 2014-01         D 0
#20: 2014-01         E 1
#21: 2014-02         A 2
#22: 2014-02         B 0
#23: 2014-02         C 6
#24: 2014-02         D 2
#25: 2014-02         E 0
#26: 2014-03         A 1
#27: 2014-03         B 2
#28: 2014-03         C 3
#29: 2014-03         D 0
#30: 2014-03         E 0
#      Month Complaint N

See first revision of the answer if you want to have the counts in the full data.table instead of summarizing.



回答2:

It sounds like you perhaps need to use expand.grid to "fill in" your data.table:

EG <- data.table(expand.grid(Complaint = unique(test$Complaint), 
                             Month = unique(test$Month)), 
                 key = "Complaint,Month")

Then, you can merge:

setkey(test, Complaint, Month)
Full <- merge(test, EG, all.y = TRUE)

And count like this:

Full[ , list(sum(!is.na(Issue.Date))), by = "Month,Complaint"]
#       Month Complaint V1
#  1: 2013-11         A  1
#  2: 2013-12         A  1
#  3: 2014-02         A  2
#  4: 2014-03         A  1
#  5: 2013-10         A  0
#  6: 2014-01         A  0
#  7: 2013-11         B  0
#  8: 2013-12         B  0
# ::: S N I P :::
# 24: 2014-01         D  0
# 25: 2013-11         E  0
# 26: 2013-12         E  0
# 27: 2014-02         E  0
# 28: 2014-03         E  0
# 29: 2013-10         E  0
# 30: 2014-01         E  1
#       Month Complaint V1

Alternatively, just use table (???)

data.table(table(test[, c("Month", "Complaint"), with = FALSE]))
#       Month Complaint N
#  1: 2013-10         A 0
#  2: 2013-11         A 1
#  3: 2013-12         A 1
#  4: 2014-01         A 0
#  5: 2014-02         A 2
#  6: 2014-03         A 1
#  7: 2013-10         B 0
# ::: S N I P :::
# 28: 2014-01         E 1
# 29: 2014-02         E 0
# 30: 2014-03         E 0
#       Month Complaint N