我有两个数据集:
一种。 数据帧,看起来像这样:
SpeciesA SpeciesB SpeciesC SpeciesD SpeciesE SpeciesF
Site1 1 0 4 6 2 5
Site2 1 0 4 6 2 5
Site3 1 0 4 6 2 5
Site4 1 0 4 6 2 5
(注:该行是不相等的这只是表示这里的目的。)
湾 另一个数据集,看起来像这样:
Family Species
Family1 SpeciesA
Family1 SpeciesB
Family1 SpeciesC
Family2 SpeciesD
Family3 SpeciesE
Family4 SpeciesF
我想将数据集(2)在数据帧中的对应种家庭列匹配(1)和加起来的值(如果有多个种)相同的家庭下。 我知道我可以使用merge
功能,但我不知道如何使用它,或者如何调用它的标题行中再总结这一切。
预决赛输出
Family1 Family1 Family1 Family2 Family3 Family4
Site1 1 0 4 6 2 5
Site2 1 0 4 6 2 5
Site3 1 0 4 6 2 5
Site5 1 0 4 6 2 5
最终输出
Family1 Family2 Family3 Family4
Site1 5 6 2 5
Site2 5 6 2 5
Site3 5 6 2 5
Site4 5 6 2 5
如果我理解正确的话,你可以重塑你的第一个data.frame
从“宽”到“长”格式, merge
与第二它data.frame
,并重铸结果宽幅,使用适当的聚合:
dfa$id <- row.names(dfa)
mdfa <- reshape2::melt(dfa, id.vars = "id", variable.name = "Species")
reshape2::dcast(
merge(dfb, mdfa, by = "Species"),
id ~ Family,
fun.aggregate = sum
)
# id Family1 Family2 Family3 Family4
# 1 Site1 5 6 2 5
# 2 Site2 5 6 2 5
# 3 Site3 5 6 2 5
# 4 Site4 5 6 2 5
数据:
dfa <- read.table(text = "SpeciesA SpeciesB SpeciesC SpeciesD SpeciesE SpeciesF
Site1 1 0 4 6 2 5
Site2 1 0 4 6 2 5
Site3 1 0 4 6 2 5
Site4 1 0 4 6 2 5",
header = TRUE, stringsAsFactors = FALSE)
dfb <- read.table(text = "Family Species
Family1 SpeciesA
Family1 SpeciesB
Family1 SpeciesC
Family2 SpeciesD
Family3 SpeciesE
Family4 SpeciesF",
header = TRUE, stringsAsFactors = FALSE)
你可以做:
transform(dfa, Family1=SpeciesA+SpeciesB+SpeciesC, Family2=SpeciesD, Family3=SpeciesE, Family4=SpeciesF)[-(1:6)]
下面是结果:
dfa <- read.table(text = "SpeciesA SpeciesB SpeciesC SpeciesD SpeciesE SpeciesF
Site1 1 0 4 6 2 5
Site2 1 0 4 6 2 5
Site3 1 0 4 6 2 5
Site4 1 0 4 6 2 5",
header = TRUE, stringsAsFactors = FALSE)
# > transform(dfa, Family1=SpeciesA+SpeciesB+SpeciesC, Family2=SpeciesD, Family3=SpeciesE, Family4=SpeciesF)[-(1:6)]
# Family1 Family2 Family3 Family4
# Site1 5 6 2 5
# Site2 5 6 2 5
# Site3 5 6 2 5
# Site4 5 6 2 5
或者你可以做一个矩阵乘法:
as.matrix(dfa) %*% matrix(c(1,1,1,0,0,0, 0,0,0,1,0,0, 0,0,0,0,1,0, 0,0,0,0,0,1), 6)
# > as.matrix(dfa) %*% matrix(c(1,1,1,0,0,0, 0,0,0,1,0,0, 0,0,0,0,1,0, 0,0,0,0,0,1), 6)
# [,1] [,2] [,3] [,4]
# Site1 5 6 2 5
# Site2 5 6 2 5
# Site3 5 6 2 5
# Site4 5 6 2 5
同
dat2 <- read.table(header=TRUE, text=
"Family Species
Family1 SpeciesA
Family1 SpeciesB
Family1 SpeciesC
Family2 SpeciesD
Family3 SpeciesE
Family4 SpeciesF")
你可以缩短代码为矩阵乘法
as.matrix(dfa) %*% t(table(dat2)) # or
tcrossprod(as.matrix(dfa), as.matrix(table(dat2)))
(TNX到@alexis_laz的评论)
。
与基础R另一种解决方案:
d <- cbind(rowSums(dfa[1:3]), dfa[-(1:3)])
names(d) <- paste0("Family", 1:4)
d
下面是利用查找表(名为矢量)和另一基础R溶液rowSums
。
# get lookup table
lookup <- setNames(dfb$Species, dfb$Family)
# get corresponding column positions with match
colPos <- names(lookup)[match(names(dfa), lookup)]
# return data.frame with named columns
setNames(data.frame(lapply(unique(names(lookup)),
function(i) rowSums(dfa[i == colPos]))),
unique(names(lookup)))
这将返回
Family1 Family2 Family3 Family4
Site1 5 6 2 5
Site2 5 6 2 5
Site3 5 6 2 5
Site4 5 6 2 5
在第二行中, match
是用来寻找相应的列位置。 在第三行, lapply
通过独特的姓氏中运行并应用rowSums
到对应于这些名称的列。 这将返回其转换成一个列表data.frame
和名与setNames
。
我的回答2 data.table或dplyr:
data <- read.table(text="
sites SpeciesA SpeciesB SpeciesC SpeciesD SpeciesE SpeciesF
Site1 1 0 4 6 2 5
Site2 1 0 4 6 2 5
Site3 1 0 4 6 2 5
Site4 1 0 4 6 2 5" , header=TRUE, stringsAsFactors=FALSE)
famdf <- read.table(text="
Family Species
Family1 SpeciesA
Family1 SpeciesB
Family1 SpeciesC
Family2 SpeciesD
Family3 SpeciesE
Family4 SpeciesF" , header=TRUE, stringsAsFactors=FALSE)
#My answer 1 with data.table:
melted<-data.table::melt(data,id.vars="sites", variable.name= "Species")
data.table::dcast(
setDT(merge(famdf, melted, by = "Species"))[,c("sites","Family","value")],
... ~ Family,
fun = sum,
value.var = "value",
)
#end
#My answer 2 with dplyr or data.table:
transpose<-function(df){
n<-df[,1]
df <- as.data.frame(t(df[,-1]))
colnames(df) <- n
df$id<-factor(row.names(df))
return(df)
}
data<-transpose(data)
data$fam<-fam$Family[match(data$id, fam$Species)]
data <- subset(data, select = -id )
#Sum option 1 data.table
library(data.table)
transpose(setDF(setDT(data)[, lapply(.SD,sum), by = .(fam)]))
#Sum option 2 dplyr
library(dplyr)
result<-as.data.frame(data %>%
group_by(fam) %>%
summarise_each(funs(sum))
)
transpose(result)