我有一个已经被符号化的文件清单:
dat <- list(c("texaco", "canada", "lowered", "contract", "price", "pay",
"crude", "oil", "canadian", "cts", "barrel", "effective", "decrease",
"brings", "companys", "posted", "price", "benchmark", "grade",
"edmonton", "swann", "hills", "light", "sweet", "canadian", "dlrs",
"bbl", "texaco", "canada", "changed", "crude", "oil", "postings",
"feb", "reuter"), c("argentine", "crude", "oil", "production",
"pct", "january", "mln", "barrels", "mln", "barrels", "january",
"yacimientos", "petroliferos", "fiscales", "january", "natural",
"gas", "output", "totalled", "billion", "cubic", "metrers", "pct",
"billion", "cubic", "metres", "produced", "january", "yacimientos",
"petroliferos", "fiscales", "added", "reuter"))
我想令牌这个名单高效地转换成n元列表。 下面是我到目前为止已经编写的功能:
find_ngrams <- function(x, n){
if (n==1){ return(x)}
out <- as.list(rep(NA, length(x)))
for (i in 1:length(x)){
words <- x[[i]]
out[[i]] <- words
for (j in 2:n){
phrases <- sapply(1:j, function(k){
words[k:(length(words)-n+k)]
})
phrases <- apply(phrases, 1, paste, collapse=" ")
out[[i]] <- c(out[[i]], phrases)
}
}
return(out)
}
这工作得很好寻找正克,但似乎效率不高。 与更换for循环*apply
功能将仍然给我留下了3套深循环:
result <- find_ngrams(dat, 2)
> result[[2]]
[1] "argentine" "crude" "oil"
[4] "production" "pct" "january"
[7] "mln" "barrels" "mln"
[10] "barrels" "january" "yacimientos"
[13] "petroliferos" "fiscales" "january"
[16] "natural" "gas" "output"
[19] "totalled" "billion" "cubic"
[22] "metrers" "pct" "billion"
[25] "cubic" "metres" "produced"
[28] "january" "yacimientos" "petroliferos"
[31] "fiscales" "added" "reuter"
[34] "argentine crude" "crude oil" "oil production"
[37] "production pct" "pct january" "january mln"
[40] "mln barrels" "barrels mln" "mln barrels"
[43] "barrels january" "january yacimientos" "yacimientos petroliferos"
[46] "petroliferos fiscales" "fiscales january" "january natural"
[49] "natural gas" "gas output" "output totalled"
[52] "totalled billion" "billion cubic" "cubic metrers"
[55] "metrers pct" "pct billion" "billion cubic"
[58] "cubic metres" "metres produced" "produced january"
[61] "january yacimientos" "yacimientos petroliferos" "petroliferos fiscales"
[64] "fiscales added" "added reuter"
是否有可能被矢量这段代码的任何显著的部分?
/编辑:这里是马修Plourde的功能,那确实“先进的N-gram”,并在整个列表适用的更新版本:
find_ngrams_base <- function(x, n) {
if (n == 1) return(x)
out <- lapply(1:n, function(n_i) embed(x, n_i))
out <- sapply(out, function(y) apply(y, 1, function(row) paste(rev(row), collapse=' ')))
unlist(out)
}
find_ngrams_plourde <- function(x, ...){
lapply(x, find_ngrams_base, ...)
}
我们可以反对我写的功能标杆,并认为它是一个有点慢:
library(rbenchmark)
benchmark(
replications=100,
a <- find_ngrams(dat, 2),
b <- find_ngrams(dat, 3),
c <- find_ngrams(dat, 4),
d <- find_ngrams(dat, 10),
w <- find_ngrams_plourde(dat, 2),
x <- find_ngrams_plourde(dat, 3),
y <- find_ngrams_plourde(dat, 4),
z <- find_ngrams_plourde(dat, 10),
columns=c('test', 'elapsed', 'relative'),
order='relative'
)
test elapsed relative
1 a <- find_ngrams(dat, 2) 0.040 1.000
2 b <- find_ngrams(dat, 3) 0.081 2.025
3 c <- find_ngrams(dat, 4) 0.117 2.925
5 w <- find_ngrams_plourde(dat, 2) 0.144 3.600
6 x <- find_ngrams_plourde(dat, 3) 0.212 5.300
7 y <- find_ngrams_plourde(dat, 4) 0.277 6.925
4 d <- find_ngrams(dat, 10) 0.361 9.025
8 z <- find_ngrams_plourde(dat, 10) 0.669 16.725
然而,它也发现了很多的n-gram的我的功能缺失(哎呦):
for (i in 1:length(dat)){
print(setdiff(w[[i]], a[[i]]))
print(setdiff(x[[i]], b[[i]]))
print(setdiff(y[[i]], c[[i]]))
print(setdiff(z[[i]], d[[i]]))
}
我觉得同时功能可以得到改善,但我不能想到的任何方式,以避免三重环路(环路在载体中,遍历需要,1-n中,环上的字来构建n元语法n元语法的数量)
/编辑2:这是基于关闭修订功能,马特的答案:
find_ngrams_2 <- function(x, n){
if (n == 1) return(x)
lapply(x, function(y) c(y, unlist(lapply(2:n, function(n_i) do.call(paste, unname(rev(data.frame(embed(y, n_i), stringsAsFactors=FALSE))))))))
}
它似乎返回n元语法的正确列表,它比我原来的功能,更快(在大多数情况下):
library(rbenchmark)
benchmark(
replications=100,
a <- find_ngrams(dat, 2),
b <- find_ngrams(dat, 3),
c <- find_ngrams(dat, 4),
d <- find_ngrams(dat, 10),
w <- find_ngrams_2(dat, 2),
x <- find_ngrams_2(dat, 3),
y <- find_ngrams_2(dat, 4),
z <- find_ngrams_2(dat, 10),
columns=c('test', 'elapsed', 'relative'),
order='relative'
)
test elapsed relative
5 w <- find_ngrams_2(dat, 2) 0.039 1.000
1 a <- find_ngrams(dat, 2) 0.041 1.051
6 x <- find_ngrams_2(dat, 3) 0.078 2.000
2 b <- find_ngrams(dat, 3) 0.081 2.077
7 y <- find_ngrams_2(dat, 4) 0.119 3.051
3 c <- find_ngrams(dat, 4) 0.123 3.154
4 d <- find_ngrams(dat, 10) 0.399 10.231
8 z <- find_ngrams_2(dat, 10) 0.436 11.179