I have a list of documents that have already been tokenized:
dat <- list(c("texaco", "canada", "lowered", "contract", "price", "pay",
"crude", "oil", "canadian", "cts", "barrel", "effective", "decrease",
"brings", "companys", "posted", "price", "benchmark", "grade",
"edmonton", "swann", "hills", "light", "sweet", "canadian", "dlrs",
"bbl", "texaco", "canada", "changed", "crude", "oil", "postings",
"feb", "reuter"), c("argentine", "crude", "oil", "production",
"pct", "january", "mln", "barrels", "mln", "barrels", "january",
"yacimientos", "petroliferos", "fiscales", "january", "natural",
"gas", "output", "totalled", "billion", "cubic", "metrers", "pct",
"billion", "cubic", "metres", "produced", "january", "yacimientos",
"petroliferos", "fiscales", "added", "reuter"))
I'm trying to efficiently convert this list of tokens to a list of n-grams. Here's the function I've written so far:
find_ngrams <- function(x, n){
  if (n == 1){ return(x) }
  out <- as.list(rep(NA, length(x)))
  for (i in 1:length(x)){
    words <- x[[i]]
    out[[i]] <- words
    for (j in 2:n){
      # NB: the window below uses n where it should use j, so for j < n
      # the last (n - j) j-grams get dropped (the bug discussed in the edits below)
      phrases <- sapply(1:j, function(k){
        words[k:(length(words) - n + k)]
      })
      phrases <- apply(phrases, 1, paste, collapse=" ")
      out[[i]] <- c(out[[i]], phrases)
    }
  }
  return(out)
}
This works for finding n-grams, but it seems inefficient. Replacing the for-loops with *apply functions would still leave me with loops nested three deep:
result <- find_ngrams(dat, 2)
> result[[2]]
[1] "argentine" "crude" "oil"
[4] "production" "pct" "january"
[7] "mln" "barrels" "mln"
[10] "barrels" "january" "yacimientos"
[13] "petroliferos" "fiscales" "january"
[16] "natural" "gas" "output"
[19] "totalled" "billion" "cubic"
[22] "metrers" "pct" "billion"
[25] "cubic" "metres" "produced"
[28] "january" "yacimientos" "petroliferos"
[31] "fiscales" "added" "reuter"
[34] "argentine crude" "crude oil" "oil production"
[37] "production pct" "pct january" "january mln"
[40] "mln barrels" "barrels mln" "mln barrels"
[43] "barrels january" "january yacimientos" "yacimientos petroliferos"
[46] "petroliferos fiscales" "fiscales january" "january natural"
[49] "natural gas" "gas output" "output totalled"
[52] "totalled billion" "billion cubic" "cubic metrers"
[55] "metrers pct" "pct billion" "billion cubic"
[58] "cubic metres" "metres produced" "produced january"
[61] "january yacimientos" "yacimientos petroliferos" "petroliferos fiscales"
[64] "fiscales added" "added reuter"
Are there any significant parts of this code that could be vectorized?
/edit: here's an updated version of Matthew Plourde's function that does "up-to-n-grams" and works across the entire list:
find_ngrams_base <- function(x, n) {
  if (n == 1) return(x)
  out <- lapply(1:n, function(n_i) embed(x, n_i))
  out <- sapply(out, function(y) apply(y, 1, function(row) paste(rev(row), collapse=' ')))
  unlist(out)
}

find_ngrams_plourde <- function(x, ...) {
  lapply(x, find_ngrams_base, ...)
}
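For reference, embed() is what does the windowing here: given a vector and a dimension n_i, it returns a matrix whose rows are the consecutive length-n_i windows of the vector, with the elements of each window in reverse order; that reversal is why each row is passed through rev() before pasting. A quick illustration:

embed(c("a", "b", "c", "d"), 2)
#      [,1] [,2]
# [1,] "b"  "a"
# [2,] "c"  "b"
# [3,] "d"  "c"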
We can benchmark it against the function I wrote and see that it's a bit slower:
library(rbenchmark)
benchmark(
  replications=100,
  a <- find_ngrams(dat, 2),
  b <- find_ngrams(dat, 3),
  c <- find_ngrams(dat, 4),
  d <- find_ngrams(dat, 10),
  w <- find_ngrams_plourde(dat, 2),
  x <- find_ngrams_plourde(dat, 3),
  y <- find_ngrams_plourde(dat, 4),
  z <- find_ngrams_plourde(dat, 10),
  columns=c('test', 'elapsed', 'relative'),
  order='relative'
)
test elapsed relative
1 a <- find_ngrams(dat, 2) 0.040 1.000
2 b <- find_ngrams(dat, 3) 0.081 2.025
3 c <- find_ngrams(dat, 4) 0.117 2.925
5 w <- find_ngrams_plourde(dat, 2) 0.144 3.600
6 x <- find_ngrams_plourde(dat, 3) 0.212 5.300
7 y <- find_ngrams_plourde(dat, 4) 0.277 6.925
4 d <- find_ngrams(dat, 10) 0.361 9.025
8 z <- find_ngrams_plourde(dat, 10) 0.669 16.725
However, it also finds a lot of n-grams my function misses (whoops):
for (i in 1:length(dat)){
  print(setdiff(w[[i]], a[[i]]))
  print(setdiff(x[[i]], b[[i]]))
  print(setdiff(y[[i]], c[[i]]))
  print(setdiff(z[[i]], d[[i]]))
}
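The misses come from the window arithmetic in find_ngrams: words[k:(length(words) - n + k)] uses n where it should use j, so for every j < n the last (n - j) j-grams are dropped. A toy example with the buggy function makes this visible:

find_ngrams(list(c("a", "b", "c", "d")), 3)[[1]]
# [1] "a"     "b"     "c"     "d"     "a b"   "b c"   "a b c" "b c d"
# The final bigram "c d" is missing.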
I feel like both functions can be improved, but I can't think of any way to avoid the triple loop: loop over the vectors, loop over the n-gram lengths from 1 to n, and loop over the words to construct each n-gram.
/edit 2: Here's a revised function, based on Matt's answer:
find_ngrams_2 <- function(x, n){
  if (n == 1) return(x)
  lapply(x, function(y) {
    c(y, unlist(lapply(2:n, function(n_i) {
      do.call(paste, unname(rev(data.frame(embed(y, n_i), stringsAsFactors=FALSE))))
    })))
  })
}
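The speedup comes from replacing the per-row apply() with a single vectorized paste(): data.frame() splits the embed() matrix into a list of columns, rev() restores left-to-right word order, and do.call(paste, ...) pastes the columns element-wise in one call. Stepping through on a toy vector:

y <- c("a", "b", "c", "d")
m <- embed(y, 2)                # rows are windows, words in reverse order
cols <- unname(rev(data.frame(m, stringsAsFactors=FALSE)))
do.call(paste, cols)
# [1] "a b" "b c" "c d"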
It seems to return the correct list of ngrams, and it is faster (in most cases) than my original function:
library(rbenchmark)
benchmark(
  replications=100,
  a <- find_ngrams(dat, 2),
  b <- find_ngrams(dat, 3),
  c <- find_ngrams(dat, 4),
  d <- find_ngrams(dat, 10),
  w <- find_ngrams_2(dat, 2),
  x <- find_ngrams_2(dat, 3),
  y <- find_ngrams_2(dat, 4),
  z <- find_ngrams_2(dat, 10),
  columns=c('test', 'elapsed', 'relative'),
  order='relative'
)
test elapsed relative
5 w <- find_ngrams_2(dat, 2) 0.039 1.000
1 a <- find_ngrams(dat, 2) 0.041 1.051
6 x <- find_ngrams_2(dat, 3) 0.078 2.000
2 b <- find_ngrams(dat, 3) 0.081 2.077
7 y <- find_ngrams_2(dat, 4) 0.119 3.051
3 c <- find_ngrams(dat, 4) 0.123 3.154
4 d <- find_ngrams(dat, 10) 0.399 10.231
8 z <- find_ngrams_2(dat, 10) 0.436 11.179
Matthew Plourde's answer:
Here's one way with embed. There seems to be a bug in your function. If you fix that, we can do a benchmark.
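The answer's original code block isn't reproduced here, but a minimal single-vector sketch of that embed() approach, consistent with the find_ngrams_base adaptation shown earlier (and producing exact n-grams rather than up-to-n-grams), would be something like:

# find_ngrams_embed is a hypothetical name, not taken from the answer
find_ngrams_embed <- function(x, n) {
  if (n == 1) return(x)
  apply(embed(x, n), 1, function(row) paste(rev(row), collapse=' '))
}

find_ngrams_embed(c("crude", "oil", "production"), 2)
# [1] "crude oil"      "oil production"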