I have a data.table and would like to compute stats by groups.
R) set.seed(1)
R) DT=data.table(a=rnorm(100),b=rnorm(100))
Those groups should be defined by
R) quantile(DT$a,probs=seq(.1,.9,.1))
10% 20% 30% 40% 50% 60% 70% 80% 90%
-1.05265747329 -0.61386923071 -0.37534201964 -0.07670312896 0.11390916079 0.37707993057 0.58121734252 0.77125359976 1.18106507751
How can I compute say the average of b
per bin, say if b=-.5
I am within [-0.61386923071,-0.37534201964]
so in bin 3
How about :
> DT[, mean(b), keyby=cut(a,quantile(a,probs=seq(.1,.9,.1)))]
cut V1
1: NA -0.31359818
2: (-1.05,-0.614] -0.14103182
3: (-0.614,-0.375] -0.33474492
4: (-0.375,-0.0767] 0.20827735
5: (-0.0767,0.114] 0.14890251
6: (0.114,0.377] 0.16685304
7: (0.377,0.581] 0.07086979
8: (0.581,0.771] 0.17950572
9: (0.771,1.18] -0.04951607
To have a look at that NA (and to check the results anyway), I next did :
> DT[, list(mean(b),.N,list(a)), keyby=cut(a,quantile(a,probs=seq(.1,.9,.1)))]
cut V1 N V3
1: NA -0.31359818 20 1.59528080213779,1.51178116845085,-2.2146998871775,-1.98935169586337,-1.47075238389927,1.35867955152904,
2: (-1.05,-0.614] -0.14103182 10 -0.626453810742332,-0.835628612410047,-0.820468384118015,-0.621240580541804,-0.68875569454952,-0.70749515696212,
3: (-0.614,-0.375] -0.33474492 10 -0.47815005510862,-0.41499456329968,-0.394289953710349,-0.612026393250771,-0.443291873218433,-0.589520946188072,
4: (-0.375,-0.0767] 0.20827735 10 -0.305388387156356,-0.155795506705329,-0.102787727342996,-0.164523596253587,-0.253361680136508,-0.112346212150228,
5: (-0.0767,0.114] 0.14890251 10 -0.0449336090152309,-0.0161902630989461,0.0745649833651906,-0.0561287395290008,-0.0538050405829051,-0.0593133967111857,
6: (0.114,0.377] 0.16685304 10 0.183643324222082,0.329507771815361,0.36458196213683,0.341119691424425,0.188792299514343,0.153253338211898,
7: (0.377,0.581] 0.07086979 10 0.487429052428485,0.575781351653492,0.389843236411431,0.417941560199702,0.387671611559369,0.556663198673657,
8: (0.581,0.771] 0.17950572 10 0.738324705129217,0.593901321217509,0.61982574789471,0.763175748457544,0.696963375404737,0.768532924515416,
9: (0.771,1.18] -0.04951607 10 1.12493091814311,0.943836210685299,0.821221195098089,0.918977371608218,0.782136300731067,1.10002537198388,
Aside: I've returned a list
column (each cell is itself a vector) there to have a quick look at the values going into the bins, just to check. data.table
displays commas when printing (and shows just the first 6 items per cell), but each cell of V3
there is actually a numeric vector.
So the values outside the first and last break
are being coded together as NA. It's not obvious to me how to tell cut
not to do that. So I just added -Inf and +Inf :
> DT[,list(mean(b),.N),keyby=cut(a,c(-Inf,quantile(a,probs=seq(.1,.9,.1)),+Inf))]
cut V1 N
1: (-Inf,-1.05] -0.16938368 10
2: (-1.05,-0.614] -0.14103182 10
3: (-0.614,-0.375] -0.33474492 10
4: (-0.375,-0.0767] 0.20827735 10
5: (-0.0767,0.114] 0.14890251 10
6: (0.114,0.377] 0.16685304 10
7: (0.377,0.581] 0.07086979 10
8: (0.581,0.771] 0.17950572 10
9: (0.771,1.18] -0.04951607 10
10: (1.18, Inf] -0.45781268 10
That's better. Or alternatively :
> DT[, list(mean(b),.N), keyby=cut(a,quantile(a,probs=seq(0,1,.1)),include=TRUE)]
cut V1 N
1: [-2.21,-1.05] -0.16938368 10
2: (-1.05,-0.614] -0.14103182 10
3: (-0.614,-0.375] -0.33474492 10
4: (-0.375,-0.0767] 0.20827735 10
5: (-0.0767,0.114] 0.14890251 10
6: (0.114,0.377] 0.16685304 10
7: (0.377,0.581] 0.07086979 10
8: (0.581,0.771] 0.17950572 10
9: (0.771,1.18] -0.04951607 10
10: (1.18,2.4] -0.45781268 10
That way you see what the min and max is, rather than it displaying -Inf and +Inf. Notice you need to pass include=TRUE
to cut
otherwise 11 bins will be returned with only 1 in the first.
I do this type of thing a lot, so I wrote a pretty flexible bin_data() method for it in my R package - mltools. It's entirely data.table
based and makes use of the new non-equi joins.
To answer your specific question, set Bin1 as a column in DT
, then group by Bin1
library(data.table)
library(mltools)
DT[, Bin1 := bin_data(vals=a, bins=seq(.1, .9, .1), binType="quantile")]
DT[, list(mean(b)), keyby=Bin1]
Bin1 V1
1: NA -0.31359818
2: [-1.05265747329296, -0.613869230708978) -0.14103182
3: [-0.613869230708978, -0.375342019639661) -0.33474492
4: [-0.375342019639661, -0.0767031289639095) 0.20827735
5: [-0.0767031289639095, 0.113909160788544) 0.14890251
6: [0.113909160788544, 0.377079930573521) 0.16685304
7: [0.377079930573521, 0.581217342522697) 0.07086979
8: [0.581217342522697, 0.771253599758546) 0.17950572
9: [0.771253599758546, 1.1810650775142] -0.04951607
You can do other cool stuff too
Make 10 equally-spaced bins by quantile
DT[, Bin2 := bin_data(vals=a, bins=10, binType="quantile")]
DT[, list(mean(b)), keyby=Bin2]
Bin2 V1
1: [-2.2146998871775, -1.05265747329296) -0.16938368
2: [-1.05265747329296, -0.613869230708978) -0.14103182
3: [-0.613869230708978, -0.375342019639661) -0.33474492
4: [-0.375342019639661, -0.0767031289639095) 0.20827735
5: [-0.0767031289639095, 0.113909160788544) 0.14890251
6: [0.113909160788544, 0.377079930573521) 0.16685304
7: [0.377079930573521, 0.581217342522697) 0.07086979
8: [0.581217342522697, 0.771253599758546) 0.17950572
9: [0.771253599758546, 1.1810650775142) -0.04951607
10: [1.1810650775142, 2.40161776050478] -0.45781268
Make the last boundary left-closed right-open
DT[, Bin3 := bin_data(vals=a, bins=10, binType="quantile", boundaryType="lcro)")]
DT[, list(mean(b)), keyby=Bin2]
1: NA 0.42510038
2: [-2.2146998871775, -1.05265747329296) -0.16938368
3: [-1.05265747329296, -0.613869230708978) -0.14103182
4: [-0.613869230708978, -0.375342019639661) -0.33474492
5: [-0.375342019639661, -0.0767031289639095) 0.20827735
6: [-0.0767031289639095, 0.113909160788544) 0.14890251
7: [0.113909160788544, 0.377079930573521) 0.16685304
8: [0.377079930573521, 0.581217342522697) 0.07086979
9: [0.581217342522697, 0.771253599758546) 0.17950572
10: [0.771253599758546, 1.1810650775142) -0.04951607
11: [1.1810650775142, 2.40161776050478) -0.55591413
Specify your own explicit bins (notice empty bins are returned)
bin_data(dt=DT, binCol="a", bins=seq(-5, 5, 1), returnDT=TRUE)
Bin a b
1: [-5, -4) NA NA
2: [-4, -3) NA NA
3: [-3, -2) -2.214700 -0.65069635
4: [-2, -1) -1.989352 -0.17955653
5: [-2, -1) -1.470752 -0.03763417
---
100: [1, 2) 1.586833 -1.20808279
101: [2, 3) 2.401618 0.42510038
102: [2, 3) 2.172612 0.20753834
103: [3, 4) NA NA
104: [4, 5] NA NA
Use variable size bins
bin_data(dt=DT, binCol="a", bins=data.table(LB=c(-5, 0, 1), RB=c(0, 1, Inf)), returnDT=TRUE)
Bin a b
1: [-5, 0) -0.626453811 -0.62036668
2: [-5, 0) -0.835628612 -0.91092165
3: [-5, 0) -0.820468384 1.76728727
4: [-5, 0) -0.305388387 1.68217608
5: [-5, 0) -0.621240581 1.43228224
---
95: [1, Inf] 2.172611670 0.20753834
96: [1, Inf] 1.178086997 0.21992480
97: [1, Inf] 1.063099837 1.46458731
98: [1, Inf] 1.207867806 0.40201178
99: [1, Inf] 1.160402616 -0.73174817
100: [1, Inf] 1.586833455 -1.20808279
Bin a b