How to match, replace and sum header rows from ano

2019-08-02 07:03发布

问题:

I have two data-sets:

a. A data frame that looks like this:

        SpeciesA  SpeciesB  SpeciesC  SpeciesD  SpeciesE  SpeciesF
Site1     1          0        4        6          2        5
Site2     1          0        4        6          2        5
Site3     1          0        4        6          2        5
Site4     1          0        4        6          2        5

(Note: The row values are NOT identical. This is just for the purpose of representation here)

b. Another data-set that looks like this:

Family          Species
Family1         SpeciesA
Family1         SpeciesB
Family1         SpeciesC
Family2         SpeciesD
Family3         SpeciesE
Family4         SpeciesF

I want to match the Family column in data-set (2) to the corresponding Species in data-frame(1) and add up the values (if there are multiple species) under the same Family. I know I can use the merge function, but I don't know how to use it, or how to call it in the header row and then sum it all.

Pre-Final output

         Family1    Family1   Family1  Family2  Family3  Family4
Site1     1          0        4        6          2        5 
Site2     1          0        4        6          2        5 
Site3     1          0        4        6          2        5 
Site5     1          0        4        6          2        5 

Final output

         Family1      Family2    Family3   Family4
Site1     5             6          2        5           
Site2     5             6          2        5             
Site3     5             6          2        5             
Site4     5             6          2        5     

回答1:

If I understand correctly, you can reshape your first data.frame from "wide" to "long" format, merge it with the second data.frame, and recast the result to wide format, using appropriate aggregation:

dfa$id <- row.names(dfa)
mdfa <- reshape2::melt(dfa, id.vars = "id", variable.name = "Species")

reshape2::dcast(
    merge(dfb, mdfa, by = "Species"), 
    id ~ Family, 
    fun.aggregate = sum
)
#      id Family1 Family2 Family3 Family4
# 1 Site1       5       6       2       5
# 2 Site2       5       6       2       5
# 3 Site3       5       6       2       5
# 4 Site4       5       6       2       5

Data:

dfa <- read.table(text = "SpeciesA  SpeciesB  SpeciesC  SpeciesD  SpeciesE  SpeciesF
Site1     1          0        4        6          2        5
Site2     1          0        4        6          2        5
Site3     1          0        4        6          2        5
Site4     1          0        4        6          2        5",
header = TRUE, stringsAsFactors = FALSE)

dfb <- read.table(text = "Family          Species
Family1         SpeciesA
Family1         SpeciesB
Family1         SpeciesC
Family2         SpeciesD
Family3         SpeciesE
Family4         SpeciesF",
header = TRUE, stringsAsFactors = FALSE)


回答2:

You can do:

transform(dfa, Family1=SpeciesA+SpeciesB+SpeciesC, Family2=SpeciesD, Family3=SpeciesE, Family4=SpeciesF)[-(1:6)]

Here is the result:

dfa <- read.table(text = "SpeciesA  SpeciesB  SpeciesC  SpeciesD  SpeciesE  SpeciesF
Site1     1          0        4        6          2        5
Site2     1          0        4        6          2        5
Site3     1          0        4        6          2        5
Site4     1          0        4        6          2        5",
                  header = TRUE, stringsAsFactors = FALSE)
# > transform(dfa, Family1=SpeciesA+SpeciesB+SpeciesC, Family2=SpeciesD, Family3=SpeciesE, Family4=SpeciesF)[-(1:6)]
#       Family1 Family2 Family3 Family4
# Site1       5       6       2       5
# Site2       5       6       2       5
# Site3       5       6       2       5
# Site4       5       6       2       5

or you can do a matrix multiplication:

as.matrix(dfa) %*% matrix(c(1,1,1,0,0,0, 0,0,0,1,0,0, 0,0,0,0,1,0, 0,0,0,0,0,1), 6)
# > as.matrix(dfa) %*% matrix(c(1,1,1,0,0,0, 0,0,0,1,0,0, 0,0,0,0,1,0, 0,0,0,0,0,1), 6)
#       [,1] [,2] [,3] [,4]
# Site1    5    6    2    5
# Site2    5    6    2    5
# Site3    5    6    2    5
# Site4    5    6    2    5

With

dat2 <- read.table(header=TRUE, text=
"Family          Species
Family1         SpeciesA
Family1         SpeciesB
Family1         SpeciesC
Family2         SpeciesD
Family3         SpeciesE
Family4         SpeciesF")

you can shorten the code for the matrix multiplication to

as.matrix(dfa) %*% t(table(dat2)) # or 
tcrossprod(as.matrix(dfa), as.matrix(table(dat2)))

(tnx to @alexis_laz for the comment)
.
Another solution with base R:

d <- cbind(rowSums(dfa[1:3]), dfa[-(1:3)])
names(d) <- paste0("Family", 1:4)
d


回答3:

Here's another base R solution with a lookup table (named vector) and rowSums.

# get lookup table
lookup <- setNames(dfb$Species, dfb$Family)
# get corresponding column positions with match
colPos <- names(lookup)[match(names(dfa), lookup)]

# return data.frame with named columns
setNames(data.frame(lapply(unique(names(lookup)),
                           function(i) rowSums(dfa[i == colPos]))),
         unique(names(lookup)))

This returns

      Family1 Family2 Family3 Family4
Site1       5       6       2       5
Site2       5       6       2       5
Site3       5       6       2       5
Site4       5       6       2       5

In the second line, match is used to find the corresponding column positions. In the third line, lapply runs through the unique family names and applies rowSums to the columns that correspond to these names. This returns a list which is converted into a data.frame and given names with setNames.



回答4:

My answer 2 with data.table or dplyr:

data <- read.table(text="
sites      SpeciesA  SpeciesB  SpeciesC  SpeciesD  SpeciesE  SpeciesF
Site1     1          0        4        6          2        5
Site2     1          0        4        6          2        5
Site3     1          0        4        6          2        5
Site4     1          0        4        6          2        5" ,  header=TRUE, stringsAsFactors=FALSE)
famdf <- read.table(text="
Family          Species
Family1         SpeciesA
Family1         SpeciesB
Family1         SpeciesC
Family2         SpeciesD
Family3         SpeciesE
Family4         SpeciesF" ,  header=TRUE, stringsAsFactors=FALSE)

#My answer 1 with data.table:
melted<-data.table::melt(data,id.vars="sites", variable.name= "Species")

data.table::dcast(
  setDT(merge(famdf, melted, by = "Species"))[,c("sites","Family","value")], 
  ... ~ Family,
    fun = sum,
  value.var = "value", 
)
#end
#My answer 2 with dplyr or data.table:
transpose<-function(df){
  n<-df[,1]
  df <- as.data.frame(t(df[,-1]))
  colnames(df) <- n
  df$id<-factor(row.names(df))
  return(df)
  }
data<-transpose(data)
data$fam<-fam$Family[match(data$id, fam$Species)]
data <- subset(data, select = -id )

#Sum option 1 data.table
library(data.table)
transpose(setDF(setDT(data)[, lapply(.SD,sum), by = .(fam)]))
#Sum option 2 dplyr
library(dplyr)
result<-as.data.frame(data %>%
  group_by(fam) %>%
  summarise_each(funs(sum))
)
transpose(result)


标签: r merge matching