R - A loop comparing elements in common between tw

2019-07-05 02:35发布

问题:

I have been trying, for some time, to build a matrix populated by the counts of elements in common between two herarchical lists.

Here is some dummy data:

site<-c('A','A','A','A','A','A','A','A','A','B','B','B','B','B','B')
group<-c('A1','A1','A2','A2','A2','A3','A3','A3','A3', 
'B1','B1','B2','B2','B2','B2')
element<-c("red","orange","blue","black","white", "black","cream","yellow","purple","red","orange","blue","white","gray","salmon")
d<-cbind(site,group,element)

I created a list structure, assuming it would be procedural due to the different number os elements in each list. Also, since I don´t want every possible comparison between groups, but only between sites.

#first level list - by site
sitelist<-split(nodmod, list(nodmod$site),drop = TRUE)
#list by group 
nestedlist <- lapply(sitelist, function(x) split(x, x[['mod']], drop = TRUE))

My intention is to create a table, or matrix with the number of element in common between groups from the two sites (my original data has additional sites). Like such:

    A1  A2  A3
B1  2   0   0
B2  0   2   0

The nested nature of this problem is challenging to me. I am not as familiar with lists, as I´ve solved problems mostly using dataframes. My attempt boiled down to this. I felt it got close, but have many shortcomings with the correct syntax for loops.

t <- outer(1:length(d$A),
         1:length(d$B),
         FUN=function(i,j){
           sapply(1:length(i),
                  FUN=function(x) 
                    length(intersect(d$A[[i]]$element, d$B[[j]]$element)) )
         })

Any help would be much appreciated. Apologies if a similar problem has been solved. I have scoured the internet, but have not found it, or did not comprehend the solution to make it transferable to mine.

回答1:

A similar approach to @Parfait's using matrix multiplication. You may need to play around with the data generation to extend it to your application:

site<-c('A','A','A','A','A','A','A','A','A','B','B','B','B','B','B')
group<-c('A1','A1','A2','A2','A2','A3','A3','A3','A3', 
         'B1','B1','B2','B2','B2','B2')
element<-c("red","orange","blue","black","white", "black","cream","yellow","purple","red","orange","blue","white","gray","salmon")

d<-data.frame(group, el = as.factor(element), stringsAsFactors = FALSE)


As <- d[group %in% paste0("A", 1:3), ]
Bs <- d[group %in% paste0("B", 1:2), ]

A_mat <- as.matrix(table(As))
B_mat <- as.matrix(table(Bs))

Results:

> A_mat
         el
group black blue cream gray orange purple red salmon white yellow
   A1     0    0     0    0      1      0   1      0     0      0
   A2     1    1     0    0      0      0   0      0     1      0
   A3     1    0     1    0      0      1   0      0     0      1


> B_mat
         el
group black blue cream gray orange purple red salmon white yellow
   B1     0    0     0    0      1      0   1      0     0      0
   B2     0    1     0    1      0      0   0      1     1      0


> B_mat %*% t(A_mat)
     group
group A1 A2 A3
   B1  2  0  0
   B2  0  2  0


回答2:

Consider matrix multiplication x %*% y (see ?matmult) by creating a helper matrix of unique element values by unique group values assigning ones in each corresponding cell. Then run the matrix multiplication as the transpose with itself, followed by subset of rows and columns:

# EMPTY MATRIX
helper_mat <- matrix(0, nrow=length(unique(element)), ncol=length(unique(group)),
                     dimnames=list(unique(element), unique(group)))

# ASSIGN 1's AT SELECT LOCATIONS
for(i in seq_along(site)) {
  helper_mat[element[i], group[i]] <- 1
}

helper_mat
#        A1 A2 A3 B1 B2
# red     1  0  0  1  0
# orange  1  0  0  1  0
# blue    0  1  0  0  1
# black   0  1  1  0  0
# white   0  1  0  0  1
# cream   0  0  1  0  0
# yellow  0  0  1  0  0
# purple  0  0  1  0  0
# gray    0  0  0  0  1
# salmon  0  0  0  0  1

# MATRIX MULTIPLICATION WITH SUBSET
final_mat <- t(helper_mat) %*% helper_mat
final_mat <- final_mat[grep("B", rownames(final_mat)), grep("A", colnames(final_mat))]

final_mat
#    A1 A2 A3
# B1  2  0  0
# B2  0  2  0

Even shorter version thanks to @Lamia:

helper_mat <- table(element, group)

final_mat <- t(helper_mat) %*% helper_mat # ALTERNATIVELY: crossprod(helper_mat)

final_mat <- final_mat[grep("B", rownames(final_mat)), grep("A", colnames(final_mat))]

final_mat
#      group
# group A1 A2 A3
#    B1  2  0  0
#    B2  0  2  0


回答3:

# example dataset
site<-c('A','A','A','A','A','A','A','A','A','B','B','B','B','B','B')
group<-c('A1','A1','A2','A2','A2','A3','A3','A3','A3', 
         'B1','B1','B2','B2','B2','B2')
element<-c("red","orange","blue","black","white", "black","cream","yellow","purple","red","orange","blue","white","gray","salmon")
d<-cbind(site,group,element)

library(tidyverse)

# save as dataframe
d = data.frame(d)

expand.grid(groupA = unique(d$group[d$site=="A"]),
            groupB = unique(d$group[d$site=="B"])) %>%               # get all combinations of A and B columns
  rowwise() %>%                                                      # for each row
  mutate(counts = length(intersect(d$element[d$group==groupA], 
                                   d$element[d$group==groupB]))) %>% # count common elements
  spread(groupA, counts) %>%                                         # reshape data
  data.frame() %>%                                                   
  column_to_rownames("groupB")

#    A1 A2 A3
# B1  2  0  0
# B2  0  2  0

Instead of rowwise you can use a vectorised function that will be (automatically) applied to each row, like this:

# create a function and vectorise it
CountCommonElements = function(x, y) length(intersect(d$element[d$group==x], d$element[d$group==y]))
CountCommonElements = Vectorize(CountCommonElements)

expand.grid(groupA = unique(d$group[d$site=="A"]),
            groupB = unique(d$group[d$site=="B"])) %>%                                                              
  mutate(counts = CountCommonElements(groupA, groupB)) %>% 
  spread(groupA, counts) %>%                                       
  data.frame() %>%                                                   
  column_to_rownames("groupB")

#    A1 A2 A3
# B1  2  0  0
# B2  0  2  0