using dplyr in for loop in r [closed]

2019-09-02 18:37发布

问题:

I have a sample data set, df, that looks like this:

issue_1  issue_2  issue_3  check  cat_1  cat_2  cat_3
  a          -        -       0     1       0     0 
  -          b        -       1     0       1     0
  -          -        c       1     0       0     1
  p          -        -       0     1       0     0
  -          -        q       1     0       0     1
  -          r        -       0     0       1     0
  a          -        -       1     1       0     0
  a          b        -       1     1       1     0

To explain: it has multiple occurrences of values in issue_1, issue_2 and issue_3, and for each row the value of check is either 0 or 1.

I need to calculate the total number of occurrences of each value for each issue, and the number of rows with check = 1 for each of those values. For the given sample, issue_1 has 3 occurrences of a, 2 of which have check = 1, and one occurrence of p, with no rows where check = 1. Similarly for the other two issues.

I used a nested for loop, but instead of counting at the grouped level it gives the total count of rows. Can someone suggest a better way?

Sample code:

# Asker's attempt, kept exactly as posted. For each issue column / category
# flag pair it tries to count occurrences of every issue value and how many of
# those rows have check == 1, then writes one CSV per pair.
abc <- c('issue_1', 'issue_2', 'issue_3')
qwe <- c('cat_1', 'cat_2', 'cat_3')
for(i in abc){
  for(j in qwe){
    # Keep only the current issue column, the current category flag and check.
    temp <- df[, c(i, j, 'check')]
    # Drop rows whose category flag is 0.
    temp <- subset(temp, temp[[j]] != 0)
    temp <- temp %>%
        group_by(temp[[i]]) %>%
        # NOTE(review): temp[[i]] reads the full column from the outer `temp`,
        # not the current group's rows, so length() is the total row count --
        # the symptom described in the question. The opening parenthesis of
        # this mutate( call is also never closed before the pipe.
        mutate(total_issue = length(temp[[i]]) %>%
        # Number of rows with check == 1.
        mutate(check_again = length(check[check == 1])) %>%
        mutate(percentage = (check_again/total_issue)*100)
    # Keep only the first row for each distinct issue value.
    temp <- subset(temp, !(duplicated(temp[[i]])))
    temp <- temp[, c(i, 'total_issue', 'check_again', 'percentage')] 
    # NOTE(review): the assigned name depends only on i, so every j iteration
    # overwrites the object created by the previous one.
    assign(paste(i, 'stats', sep = '_'), temp)
    # NOTE(review): paste() separates its arguments with a space by default,
    # so the file name contains spaces (e.g. 'path issue_1 cat_1 _stats .csv').
    write.csv(temp, paste('path', i, j, '_stats', '.csv'))
  }
}

So for this one, for issue_1 and cat_1 it should give:

issue_1   total_issue   check_again   percentage
  a            3             2           2/3*100
  p            1             0           0

回答1:

This is probably what you are after. Using the first four columns in the data, I used melt() to have the data in a long format. Then, I removed rows with -. Grouping the data by variable and value, I counted how many times each value (each letter) occurred for each issue, summed up check, and calculated percentage.

library(reshape2)
library(dplyr)

# Reshape the first four columns to long format: one row per
# (issue column, value) pair, with `check` carried along as the id variable.
mydf[, 1:4] %>%
  melt(id.vars = "check") %>%
  # "-" marks an empty issue slot, so those rows are dropped.
  filter(value != "-") %>%
  group_by(variable, value) %>%
  # `check` is redefined as the per-group sum and then reused for the percent.
  summarise(
    total = n(),
    check = sum(check),
    percent = check / total * 100
  )

#  variable value total check   percent
#    (fctr) (chr) (int) (int)     (dbl)
#1  issue_1     a     3     2  66.66667
#2  issue_1     p     1     0   0.00000
#3  issue_2     b     2     2 100.00000
#4  issue_2     r     1     0   0.00000
#5  issue_3     c     1     1 100.00000
#6  issue_3     q     1     1 100.00000

DATA

# Sample data from the question. The issue columns are factors in which "-"
# marks an empty slot; check and the cat_* flags are integer 0/1 columns.
mydf <- data.frame(
  issue_1 = factor(c("a", "-", "-", "p", "-", "-", "a", "a"),
                   levels = c("-", "a", "p")),
  issue_2 = factor(c("-", "b", "-", "-", "-", "r", "-", "b"),
                   levels = c("-", "b", "r")),
  issue_3 = factor(c("-", "-", "c", "-", "q", "-", "-", "-"),
                   levels = c("-", "c", "q")),
  check = c(0L, 1L, 1L, 0L, 1L, 0L, 1L, 1L),
  cat_1 = c(1L, 0L, 0L, 1L, 0L, 0L, 1L, 1L),
  cat_2 = c(0L, 1L, 0L, 0L, 0L, 1L, 0L, 1L),
  cat_3 = c(0L, 0L, 1L, 0L, 1L, 0L, 0L, 0L)
)


回答2:

This is pretty straightforward with dplyr:

library(dplyr)

# Parse the sample table from an inline string. "-" is read as NA, so the
# issue columns become character vectors with missing values.
# The `text` argument is used instead of an anonymous textConnection(), which
# would be left open after the read.
dfX <- read.table(
  text = "issue_1  issue_2  issue_3  check  cat_1  cat_2  cat_3
a          -        -       0     1       0     0
-          b        -       1     0       1     0
-          -        c       1     0       0     1
p          -        -       0     1       0     0
-          -        q       1     0       0     1
-          r        -       0     0       1     0
a          -        -       1     1       0     0
a          b        -       1     1       1     0
",
  header = TRUE, na.strings = "-", stringsAsFactors = FALSE)


# Per-value summary for issue_1: number of rows, number of rows with
# check == 1, and the resulting percentage.
dfX %>%
  # "-" was read as NA; drop those rows so that rows without an issue_1 value
  # do not show up as a group of their own in the result.
  filter(!is.na(issue_1)) %>%
  group_by(issue_1) %>%
  summarize(total_issues = n(),
            check_again = sum(check),
            percentage = 100 * (check_again / total_issues))


标签: r for-loop dplyr