Calculate the frequency of strings separated by de

2019-09-10 17:47发布

问题:

I have a dataframe mydf, where the Left and Right genes are separated by ':'. I need to calculate the number of occurrence of these genes in LeftGene and RightGene columns per file and get something like in the result. What would be the best way to do it in R?

sample     LeftGene    RightGene
file1
           ATT:TAA
           ATT:ATT      ATT
file2      
           TTP:TTG      TTP:TTP

Results

file1
LeftGene           RightGene
ATT=3              ATT=1
TAA=1

file2
LeftGene           RightGene
TTP=1              TTP=2
TTG=1

Attn: akrun

Here is the dput of actual data where we have file_name, and need to get the frequency of Left.Gene.Symbols and Right.Gene.Symbols in each file. I would also love to see the frequency of these genes from all files (cumulative). Thank you for your help.

mydf<-structure(c("AMLM12001KP", NA, "1114002", NA, NA, NA, NA, NA, 
"1121501", NA, NA, NA, "NA", "NA", "NA", "NA", "CR1L", "GIGYF2:GIGYF2:GIGYF2:ENPP3", 
"NA", "NA", "NA", "NA", "NTNG1:NTNG1:ENPP3", "NA", "NA", "NA", 
"NA", "NA", "CDC27:CDC27", "NA", "ENPP3", "NA", "NA", "NA", "NA", 
"NA"), .Dim = c(12L, 3L), .Dimnames = list(NULL, c("files_name", 
"Left.Gene.Symbols", "Right.Gene.Symbols")))

expected output:

AMLM12001KP
Left.Gene.Symbols       Right.Gene.Symbols

1114002
Left.Gene.Symbols       Right.Gene.Symbols
CR1L=1                  CDC27=2
GIGYF2=3                ENPP3=1
ENPP3=1

1121501
Left.Gene.Symbols       Right.Gene.Symbols
NTNG1=2 
ENPP3=1

All files
Left.Gene.Symbol        Right.Gene.Symbols
CR1L=1                  CDC27=2
GIGYF2=3                ENPP3=1
NTNG1=2 
ENPP3=2

回答1:

EDIT

dd2<-structure(c("AMLM12001KP", NA, "1114002", NA, NA, NA, NA, NA,"1121501", NA, NA, NA, "NA", "NA", "NA", "NA", "CR1L", "GIGYF2:GIGYF2:GIGYF2:ENPP3","NA", "NA", "NA", "NA", "NTNG1:NTNG1:ENPP3", "NA", "NA", "NA","NA", "NA", "CDC27:CDC27", "NA", "ENPP3", "NA", "NA", "NA", "NA", "NA"), .Dim = c(12L, 3L), .Dimnames = list(NULL, c("files_name", "Left.Gene.Symbols", "Right.Gene.Symbols")))


## change character NAs to <NA> and carry-forward the file column
dd2[dd2 == 'NA'] <- NA
dd2[, 1] <- na.omit(unique(dd2[, 1]))[cumsum(!is.na(dd2[, 1]))]

## split based on file name
sp <- split(data.frame(dd2, stringsAsFactors = FALSE), dd2[, 1])

## split each string by `:` and make a table
(l <- lapply(sp, function(x) {
  x <- droplevels(x[, -1])
  f <- function(x) na.omit(unlist(strsplit(x, ':')))
  left <- f(x[, 1])
  right <- f(x[, 2])
  table(c(left, right), rep(names(x), c(length(left), length(right))))
}))

# $`1114002`
# 
#          Left.Gene.Symbols Right.Gene.Symbols
#   CDC27                  0                  2
#   CR1L                   1                  0
#   ENPP3                  1                  1
#   GIGYF2                 3                  0
# 
# $`1121501`
# 
#         Left.Gene.Symbols
#   ENPP3                 1
#   NTNG1                 2
# 
# $AMLM12001KP
# < table of extent 0 x 0 >

And since each list element is a table, work with them as tables

data.frame(l$`1114002`)

#     Var1               Var2 Freq
# 1  CDC27  Left.Gene.Symbols    0
# 2   CR1L  Left.Gene.Symbols    1
# 3  ENPP3  Left.Gene.Symbols    1
# 4 GIGYF2  Left.Gene.Symbols    3
# 5  CDC27 Right.Gene.Symbols    2
# 6   CR1L Right.Gene.Symbols    0
# 7  ENPP3 Right.Gene.Symbols    1
# 8 GIGYF2 Right.Gene.Symbols    0

Here's another way in a listy format

rl <- readLines(textConnection("
sample     LeftGene    RightGene
file1
           ATT:ATT      ATT
file2      
           TTP:TTG      TTP:TTP
"))

dd <- setNames(read.table(text = rl[grep('file', rl) + 1], stringsAsFactors = FALSE),
               c('LeftGene','RightGene'))
rownames(dd) <- paste0('File', 1:nrow(dd))

setNames(lapply(1:nrow(dd), function(x) {
  sp <- strsplit(unlist(dd[x, ]), ':')
  table(unlist(sp), rep(names(sp), lengths(sp)))
}), rownames(dd))

# $File1
#     
#       LeftGene RightGene
#   ATT        2         1
#
# $File2
#      
#       LeftGene RightGene
#   TTG        1         0
#   TTP        1         2

or

setNames(lapply(1:nrow(dd), function(x) {
  sp <- strsplit(unlist(dd[x, ]), ':')
  lapply(sp, function(y) data.frame(table(y)))
}), rownames(dd))


# $File1
# $File1$LeftGene
#     y Freq
# 1 ATT    2
# 
# $File1$RightGene
#     y Freq
# 1 ATT    1
# 
# 
# $File2
# $File2$LeftGene
#     y Freq
# 1 TTG    1
# 2 TTP    1
# 
# $File2$RightGene
#     y Freq
# 1 TTP    2


回答2:

We split the 2nd and 3rd column of 'df' with the delimiter :, convert to 'long' format with cSplit from splitstackshape. The output will be a data.table. We use melt to reshape it again by selecting the 'id.var' as 'sample' while remove the NA values. Grouped by 'sample', 'variable', and 'value', we get the number of rows (.N), create new variables by pasteing the 'value' and 'N' and a sequence variable ('ind'). Then , we dcast from 'long' to 'wide' format.

library(splitstackshape)
library(data.table)
dM <- melt(cSplit(df, 2:3, ':', 'long'),
         id.var='sample', na.rm=TRUE)[, .N,.(sample, variable, value)]
dM[, valueN:= paste(value, N, sep="=")]
dM[, ind:= 1:.N, .(sample, variable)]
dcast(dM, ind+sample~variable, value.var='valueN')
#   ind sample LeftGene RightGene
#1:   1  file1    ATT=2     ATT=1
#2:   1  file2    TTP=1     TTP=2
#3:   2  file2    TTG=1        NA

data

df <- structure(list(sample = c("file1", "file2"),
 LeftGene = c("ATT:ATT", 
"TTP:TTG"), RightGene = c("ATT", "TTP:TTP")), 
.Names = c("sample", 
"LeftGene", "RightGene"), class = "data.frame", 
row.names = c(NA, -2L))