Subset a data.table, keeping only elements greater than a threshold

2019-02-26 20:25发布

问题:

I would like to subset news (below) to create news2 (further below), which should include only the rows and columns of news that contain at least one element with abs(value) > 0.01; every other element in those rows and columns should be NA.

Below is the code that I have tried:

# Attempt: find every cell whose absolute value exceeds 0.01 and keep the
# rows and columns those cells belong to.
# NOTE(review): 1:ncol(news) spans every column, so ID is compared against
# the threshold together with the diff.* columns.
gr <- data.frame(which(abs(news[, 1:ncol(news), with = FALSE]) > 0.01,
arr.ind = TRUE))
# gr holds one (row, col) index pair per matching cell, so an index repeats
# whenever a row or column has more than one match.
# NOTE(review): gr$col was computed with ID counted as column 1, yet 1 is
# added again here; the selected columns therefore sit one position to the
# right of the matches and can run past the last column. This is likely one
# reason the snippet "does not always work".
news2a <- news[gr$row, c(1, gr$col + 1L), with = FALSE]
# Drop, by reference, every column whose name already appeared earlier.
news2a[, which(duplicated(names(news2a))) := NULL]

The code above does not always work. Note: the real data set has more rows and more columns.

# news
ID         diff.jan         diff.feb         diff.mar         diff.apr
1:   7 -2.998852570e-13  2.764079712e-13 -3.291735832e-13  0.000000000e+00
2:   8  1.010000000e-01 -3.717073578e-13 -6.575639966e-13 -2.100269646e-13
3:  10  0.000000000e+00 -3.973537519e-13  0.000000000e+00  0.000000000e+00
4:  47  0.000000000e+00  0.000000000e+00  0.000000000e+00 -2.371100404e-13
5:  50  0.000000000e+00 -2.281689276e-13  2.192820401e-13 -1.857449127e-13
6:  79  0.000000000e+00  4.031985405e-13 -3.981825179e-13  0.000000000e+00
7: 202  6.409906781e-13  0.000000000e+00               NA  1.000000000e+01
8: 203  6.359592723e-13  0.000000000e+00  0.000000000e+00  1.100000000e+01
9: 468  2.545310002e-13 -2.426929277e-13 -2.612280890e-13  0.000000000e+00
       diff.may         diff.jun         diff.jul         diff.aug
1:  0.000000000e+00  0.000000000e+00  1.583933835e-13  1.182802403e-13
2:  0.000000000e+00  1.298306616e-13 -8.222315538e-13  9.721908246e-13
3:  0.000000000e+00  0.000000000e+00  0.000000000e+00  4.697083567e-13
4: -1.315189580e-13  6.926635309e-13  1.243841313e-13  0.000000000e+00
5:  0.000000000e+00  0.000000000e+00  0.000000000e+00  2.210000000e-01
6:  0.000000000e+00  0.000000000e+00  5.015727533e-13  0.000000000e+00
7:  0.000000000e+00 -1.073174486e-13  0.000000000e+00  0.000000000e+00
8:  0.000000000e+00  5.697594583e-13  0.000000000e+00  8.891748412e-13
9: -6.365151884e-13  1.595531286e-13  0.000000000e+00 -1.574081330e-13

# Reproducible definition of `news` (dput-style output): 9 rows, an integer
# ID column plus eight numeric diff.* columns (diff.jan .. diff.aug).
# diff.mar contains a single NA.
news <- structure(list(ID = c(7L, 8L, 10L, 47L, 50L, 79L, 202L, 203L, 
468L), diff.jan = c(-2.99885257e-13, 0.101, 0, 0, 0, 0, 6.409906781e-13, 
6.359592723e-13, 2.545310002e-13), diff.feb = c(2.764079712e-13, 
-3.717073578e-13, -3.973537519e-13, 0, -2.281689276e-13, 4.031985405e-13, 
0, 0, -2.426929277e-13), diff.mar = c(-3.291735832e-13, -6.575639966e-13, 
0, 0, 2.192820401e-13, -3.981825179e-13, NA, 0, -2.61228089e-13
), diff.apr = c(0, -2.100269646e-13, 0, -2.371100404e-13, -1.857449127e-13, 
0, 10, 11, 0), diff.may = c(0, 0, 0, -1.31518958e-13, 0, 0, 0, 
0, -6.365151884e-13), diff.jun = c(0, 1.298306616e-13, 0, 6.926635309e-13, 
0, 0, -1.073174486e-13, 5.697594583e-13, 1.595531286e-13),
diff.jul = c(1.583933835e-13, 
-8.222315538e-13, 0, 1.243841313e-13, 0, 5.015727533e-13, 0, 
0, 0), diff.aug = c(1.182802403e-13, 9.721908246e-13, 4.697083567e-13, 
0, 0.221, 0, 0, 8.891748412e-13, -1.57408133e-13)), .Names = c("ID", 
"diff.jan", "diff.feb", "diff.mar", "diff.apr", "diff.may", "diff.jun", 
"diff.jul", "diff.aug"), class = c("data.table", "data.frame"
), row.names = c(NA, -9L))

news2 is what I would like to achieve based on news above.

#news2
ID diff.jan diff.apr diff.aug
1:   8    0.101       NA       NA
2:  50       NA       NA    0.221
3: 202       NA       10       NA
4: 203       NA       11       NA

dput(news2)
# Desired result in dput form: 4 rows, ID plus the three diff.* columns that
# hold a value above the threshold; every other cell is NA.
# Note: diff.apr is written with integer literals (10L, 11L) here, whereas
# the corresponding values in `news` are plain numerics (10, 11).
news2 <- structure(list(ID = c(8L, 50L, 202L, 203L), diff.jan = c(0.101, 
NA, NA, NA), diff.apr = c(NA, NA, 10L, 11L), diff.aug = c(NA, 
0.221, NA, NA)), .Names = c("ID", "diff.jan", "diff.apr", "diff.aug"
), class = c("data.table", "data.frame"), row.names = c(NA, -4L
))

Can you offer suggestions for code that will achieve the desired result?

回答1:

If you melt your data.table to long format, this is easy:

# Reshape to long format so that every (ID, month) pair becomes one row; the
# threshold test is then an ordinary row filter.
# data.table ships its own melt() and dcast() methods, so reshape2 is not
# needed. Attaching reshape2 would mask melt() with a version that does not
# return a data.table.
library(data.table)
news1 <- melt(news, id.vars = "ID")

# Keep only the cells whose magnitude exceeds the threshold. In a data.table
# `i` expression NA is treated as FALSE, so the NA in diff.mar is dropped.
news2 <- news1[abs(value) > 0.01]
#    ID variable  value
#1:   8 diff.jan  0.101
#2: 202 diff.apr 10.000
#3: 203 diff.apr 11.000
#4:  50 diff.aug  0.221

# Optional: cast back to wide format. (ID, month) pairs that were filtered
# out come back as NA. value.var is named explicitly so dcast() does not
# have to guess the value column.
dcast(news2, ID ~ variable, value.var = "value")
#    ID diff.jan diff.apr diff.aug
#1:   8    0.101       NA       NA
#2:  50       NA       NA    0.221
#3: 202       NA       10       NA
#4: 203       NA       11       NA

Personally, I wouldn't do the last step (casting back to wide format) and would keep working with the long-format result.