Long time lurker, first time asker.
I'm trying to calculate the 'number of items in common between 2 sets of items' for a dataset of 20M+ items. Sample data looks like this:
#serially numbered items
parents <- rep(1:10000)
#generate rnorm # of children items
numchild <- round(rnorm(10000, mean=30, sd=10))
#fill the parent-child list
parent_child <- list()
for (x in 1:length(parents)) {
  if (numchild[x] > 0) {
    f1 <- sample(1:length(parents), size=numchild[x])
    f2 <- list(parents[f1])
    parent_child <- c(parent_child, f2)
  } else {
    parent_child <- c(parent_child, list(x+1)) #if numchild=0, make up something
  }
}
Here is what I want to do: say parent item #1 has 5 children items (1, 2, 3, 4, 5) and parent item #2 has 3 children items (4, 10, 22).
I want to compute the length of the intersection for every (parent_i, parent_j) combination. In the above case, that would be 1 common item: 4.
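As a minimal sketch of that per-pair computation, written out by hand for the two example parents above:
p1 <- c(1, 2, 3, 4, 5)     # children of parent #1
p2 <- c(4, 10, 22)         # children of parent #2
length(intersect(p1, p2))  # 1, the single shared child (item 4)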
I am doing this for 10M+ parent items that on average have 15-20 children items, with counts in the (0, 100) range. So that's a 10M x 10M item-item matrix.
I have a foreach loop that I am testing out on a smaller subset that works, but doesn't quite scale for the full dataset (64-core machine with 256GB RAM). With the loop below I am already computing only half of the user-user matrix, since (parent_i, parent_j) is the same as (parent_j, parent_i) for this purpose.
#small subset
a <- parent_child[1:1000]
outerresults <- foreach (i = 1:(length(a)), .combine=rbind, .packages=c('foreach','doParallel')) %dopar% {
  b <- a[[i]]
  rest <- a[i+1:length(a)]
  foreach (j = 1:(length(rest)), .combine=rbind) %dopar% {
    common <- length(intersect(b, rest[[j]]))
    if (common > 0) {g <- data.frame(u1=i, u2=j+1, common)}
  }
}
I've been experimenting with variations on this (using Reduce, storing parent-children in a data frame, etc.) but haven't had much luck.
Is there a way to make this scale?
I reversed the split, so that we have a child-parent relationship
len <- sapply(parent_child, length)
child_parent <- split(rep(seq_along(parent_child), len),
unlist(parent_child, use.names=FALSE))
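To see what this reversal produces, here is a toy example (a hypothetical three-parent input, not the generated data):
toy <- list(c(1, 2), c(2, 3), c(2))   # children of parents 1, 2 and 3
len <- sapply(toy, length)
split(rep(seq_along(toy), len), unlist(toy, use.names=FALSE))
$`1`
[1] 1
$`2`
[1] 1 2 3
$`3`
[1] 2
Child 2 maps to parents 1, 2 and 3, so in the next step it contributes the parent pairs (1,2), (1,3) and (2,3).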
Something like the following constructs strings identifying the pairs of parents sharing a child, across all children
keep <- sapply(child_parent, length) > 1
int <- lapply(child_parent[keep], function(x) {
x <- combn(sort(x), 2)
paste(x[1,], x[2,], sep=".")
})
and tallying
table(unlist(int, use.names=FALSE))
or a little more quickly
xx <- unlist(int, use.names=FALSE)
nms <- unique(xx)
cnt <- match(xx, nms)
setNames(tabulate(cnt, length(nms)), nms)
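The match()/tabulate() pair gives the same counts as table(), just without the factor overhead (the names follow first appearance rather than sort order); a tiny hand-made example with hypothetical pair strings:
xx <- c("1.2", "1.3", "1.2")
nms <- unique(xx)
setNames(tabulate(match(xx, nms), length(nms)), nms)
1.2 1.3
  2   1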
Wrapping this up as a function:
f1 <- function(parent_child) {
len <- sapply(parent_child, length)
child_parent <- split(rep(seq_along(parent_child), len),
unlist(parent_child, use.names=FALSE))
keep <- sapply(child_parent, length) > 1
int <- lapply(child_parent[keep], function(x) {
x <- combn(sort(x), 2)
paste(x[1,], x[2,], sep=".")
})
xx <- unlist(int, use.names=FALSE)
nms <- unique(xx)
cnt <- match(xx, nms)
setNames(tabulate(cnt, length(nms)), nms)
}
With all 10,000 parent-child elements, this gives
> system.time(ans1 <- f1(parent_child))
user system elapsed
14.625 0.012 14.668
> head(ans1)
542.1611 542.1832 542.2135 542.2435 542.2527 542.2806
1 1 1 1 1 1
I'm not sure that this would really scale to the size of the problem you're talking about, though; it's quadratic in the number of parents per child.
One possibility for speed-up is to 'memoize' the combinatorial calculation, using the length of the argument as a 'key' and storing the combination as 'value'. This reduces the number of times combn is called to the number of unique lengths of elements of child_parent.
combn1 <- local({
memo <- new.env(parent=emptyenv())
function(x) {
key <- as.character(length(x))
if (!exists(key, memo))
memo[[key]] <- t(combn(length(x), 2))
paste(x[memo[[key]][,1]], x[memo[[key]][,2]], sep=".")
}
})
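A quick check of combn1 on hypothetical input vectors; the second call has the same length as the first, so the cached index matrix is reused and combn is not called again:
combn1(c(3, 7, 9))
[1] "3.7" "3.9" "7.9"
combn1(c(2, 5, 11))
[1] "2.5"  "2.11" "5.11"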
f2 <- function(parent_child) {
len <- sapply(parent_child, length)
child_parent <- split(rep(seq_along(parent_child), len),
unlist(parent_child, use.names=FALSE))
keep <- sapply(child_parent, length) > 1
int <- lapply(child_parent[keep], combn1)
xx <- unlist(int, use.names=FALSE)
nms <- unique(xx)
cnt <- match(xx, nms)
setNames(tabulate(cnt, length(nms)), nms)
}
which helps somewhat
> system.time(ans2 <- f2(parent_child))
user system elapsed
5.337 0.000 5.347
> identical(ans1, ans2)
[1] TRUE
The slow part is now paste
> Rprof(); ans2 <- f2(parent_child); Rprof(NULL); summaryRprof()
$by.self
self.time self.pct total.time total.pct
"paste" 3.92 73.41 3.92 73.41
"match" 0.74 13.86 0.74 13.86
"unique.default" 0.40 7.49 0.40 7.49
"as.character" 0.08 1.50 0.08 1.50
"unlist" 0.08 1.50 0.08 1.50
"combn" 0.06 1.12 0.06 1.12
"lapply" 0.02 0.37 4.00 74.91
"any" 0.02 0.37 0.02 0.37
"setNames" 0.02 0.37 0.02 0.37
$by.total
...
We can avoid this by encoding each pair of parents sharing a child into a single number; because of the way floating point numbers are represented in R, this is exact as long as the encoded value stays below 2^53, i.e., for up to roughly 2^26 parents
encode <- function(x, y, n)
    (as.numeric(x) - 1) * (n + 1) + y  # double arithmetic, so no integer overflow for large n
decode <- function(z, n)
    list(x = ceiling(z / (n + 1)), y = z %% (n + 1))
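A quick round-trip check of the encoding (n = 10 parents and an arbitrary pair, numbers chosen only for illustration):
encode(3, 7, 10)
[1] 29
decode(29, 10)
$x
[1] 3
$y
[1] 7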
and adjusting our combn1 and f2 functions as
combn2 <- local({
memo <- new.env(parent=emptyenv())
function(x, encode_n) {
key <- as.character(length(x))
if (!exists(key, memo))
memo[[key]] <- t(combn(length(x), 2))
encode(x[memo[[key]][,1]], x[memo[[key]][,2]], encode_n)
}
})
f3 <- function(parent_child) {
    encode_n <- length(parent_child)
    len <- sapply(parent_child, length)
    child_parent <-
        unname(split(rep(seq_along(parent_child), len),
                     unlist(parent_child, use.names=FALSE)))
    keep <- sapply(child_parent, length) > 1
    int <- lapply(child_parent[keep], combn2, encode_n)
    id <- unlist(int, use.names=FALSE)
    uid <- unique(id)
    n <- tabulate(match(id, uid), length(uid))
    do.call(data.frame, c(decode(uid, encode_n), list(n=n)))
}
leading to
> system.time(f3(parent_child))
user system elapsed
2.140 0.000 2.146
This compares very favorably (note that the timing above is for all 10,000 parent-child relations) with jlhoward's revised answer
> system.time(result.3 <- do.call("rbind",lapply(1:99,gg)))
user system elapsed
2.465 0.000 2.468
> system.time(f3(parent_child[1:99]))
user system elapsed
0.016 0.000 0.014
and scales in a much more reasonable way.
For what it's worth, the data generation routine is in the second circle of Patrick Burns's The R Inferno, using the 'copy-and-append' algorithm rather than pre-allocating the space and filling it in. Avoid this by writing the body of the for loop as a function and using lapply. Avoid the need for the complicated conditional in the for loop by fixing the issue beforehand
numchild <- round(rnorm(10000, mean=30, sd=10))
numchild[numchild < 0] <- sample(numchild[numchild > 0], sum(numchild < 0))
or by sampling from a distribution (rpois, rbinom) that generates non-negative integer counts. Data generation is then
n_parents <- 10000
numchild <- round(rnorm(n_parents, mean=30, sd=10))
numchild[numchild < 0] <- sample(numchild[numchild > 0], sum(numchild < 0))
parent_child <- lapply(numchild, sample, x=n_parents)
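Or, as a sketch of the rpois route (the +1 keeps every count strictly positive, which shifts the mean slightly to about 31):
n_parents <- 10000
numchild <- rpois(n_parents, lambda=30) + 1   # non-negative Poisson counts, shifted to be positive
parent_child <- lapply(numchild, sample, x=n_parents)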
Here is another approach that is about 10X faster than my previous answer, and 17X faster than the original code (also simpler):
ff <- function(u2, u1, a) {
common <- length(intersect(a,parent_child[[u2]]))
if (common>0) {return(data.frame(u1,u2,common))}
}
gg <- function(u1) {
a <- parent_child[[u1]]
do.call("rbind",lapply((u1+1):100,ff,u1,a))
}
system.time(result.3 <- do.call("rbind",lapply(1:99,gg)))
user system elapsed
1.04 0.00 1.03
result.3 is identical to result.2 from the previous answer:
max(abs(result.3-result.2))
[1] 0
Well, a small improvement (I think):
Original code (wrapped in function call):
f = function(n) {
#small subset
a <- parent_child[1:n]
outerresults <- foreach (i = 1:(length(a)),
.combine=rbind,
.packages=c('foreach','doParallel')) %dopar% {
b <- a[[i]]
rest <- a[i+1:length(a)]
foreach (j = 1:(length(rest)), .combine=rbind) %dopar% {
common <- length(intersect(b, rest[[j]]))
if (common > 0) {g <- data.frame(u1=i, u2=j+1, common)}
}
}
return(outerresults)
}
Modified code:
g <- function(n) {
a <- parent_child[1:n]
outerresults <- foreach (i = 1:n,
.combine=rbind,
.packages=c('foreach','doParallel')) %dopar% {
b <- a[[i]]
foreach (j = (i):n, .combine=rbind) %dopar% {
if (i!=j) {
c <- a[[j]]
common <- length(intersect(b, c))
if (common > 0) {g <- data.frame(u1=i, u2=j, common)}
}
}
}
return(outerresults)
}
Benchmarks:
system.time(result.old<-f(100))
user system elapsed
17.21 0.00 17.33
system.time(result.new<-g(100))
user system elapsed
10.42 0.00 10.47
The numbering for u2 is a little different because of the different approaches, but both produce the same vector of matches:
max(abs(result.old$common-result.new$common))
[1] 0
I tried this with data.table joins replacing intersect(...), and it was actually much slower (!!).
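For reference, one way such a data.table self-join count could be written (a sketch only, using a long-format table with one row per (parent, child) pair; not necessarily the exact version that was benchmarked):
library(data.table)
dt <- data.table(parent = rep(seq_along(parent_child), lengths(parent_child)),
                 child  = unlist(parent_child, use.names=FALSE))
# self-join on child, then count shared children per ordered parent pair
res <- dt[dt, on="child", allow.cartesian=TRUE][parent < i.parent,
          .(common = .N), by=.(u1 = parent, u2 = i.parent)]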