Is there a function to add to the existing corpus? I've already generated my matrix, and I'm looking to periodically add to the table without re-crunching the whole shebang.
E.g.:
# Initial corpus the vectorizer is fitted on.
articleList = [
    'here is some text blah blah',
    'another text object',
    'more foo for your bar right now',
]

# prep_text and tokenize_text are user-supplied callables (not shown here).
tfidf_vectorizer = TfidfVectorizer(
    preprocessor=prep_text,
    tokenizer=tokenize_text,
    use_idf=True,
    min_df=.05,
    max_df=.8,
    max_features=2000,
)
tfidf_matrix = tfidf_vectorizer.fit_transform(articleList)

#### ADDING A NEW ARTICLE TO EXISTING SET?
# NOTE: fit_transform learns the vocabulary and idf again from the documents
# passed to it, so this call does not extend the matrix built above.
bigger_tfidf_matrix = tfidf_vectorizer.fit_transform(
    ['the last article I wanted to add']
)
You can access the `vocabulary_` attribute of your vectoriser directly, and you can access the `idf_` vector via `_tfidf._idf_diag`, so it would be possible to monkey-patch something like this:
import re
import numpy as np
from scipy.sparse.dia import dia_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
def partial_fit(self, X):
    """Fold new documents into an already-fitted ``TfidfVectorizer``.

    Meant to be attached to the vectorizer class as a method. For each
    document in ``X``:

    1. terms not yet in ``self.vocabulary_`` are appended with the next
       free column index;
    2. per-term document frequencies are recovered from the current
       ``self.idf_`` by inverting ``idf = ln((n + s) / (df + s)) + 1``
       (``n`` = ``self.n_docs``, ``s`` = ``self.smooth_idf``);
    3. ``self.n_docs`` is incremented and the frequency of every distinct
       term of the document is incremented by one;
    4. the idf vector is recomputed and stored as a diagonal matrix on
       ``self._tfidf._idf_diag``.

    Parameters
    ----------
    X : iterable of str
        Raw documents to add to the fitted model.

    Returns
    -------
    self
        The updated vectorizer, so calls can be chained.

    Notes
    -----
    * ``self.n_docs`` must be set by the caller to the number of documents
      the vectorizer has seen before the first call.
    * Terms are extracted with ``self.build_analyzer()`` so that the
      configured preprocessor, tokenizer, stop words and n-gram range are
      honoured, matching how the vocabulary was built during ``fit``.
    * New terms are always added; ``min_df`` / ``max_df`` /
      ``max_features`` pruning is not re-applied.
    * NOTE(review): the result is written to the private attribute
      ``_tfidf._idf_diag``; this relies on scikit-learn internals --
      confirm it is still what ``transform`` reads in the installed version.
    """
    analyze = self.build_analyzer()
    # default=-1 lets an empty vocabulary start numbering at 0.
    next_idx = max(self.vocabulary_.values(), default=-1) + 1

    for doc in X:
        tokens = analyze(doc)

        # update vocabulary_
        for term in tokens:
            if term not in self.vocabulary_:
                self.vocabulary_[term] = next_idx
                next_idx += 1

        # update idf_
        # Recover document frequencies from the idf currently stored.
        df = (self.n_docs + self.smooth_idf) / np.exp(self.idf_ - 1) - self.smooth_idf
        self.n_docs += 1

        # Newly added terms start with a document frequency of zero.
        n_missing = len(self.vocabulary_) - df.shape[0]
        if n_missing > 0:
            df = np.concatenate((df, np.zeros(n_missing)))

        # Document frequency counts documents, not occurrences: a term that
        # repeats inside this document must only be counted once.
        for term in set(tokens):
            df[self.vocabulary_[term]] += 1

        idf = np.log((self.n_docs + self.smooth_idf) / (df + self.smooth_idf)) + 1
        self._tfidf._idf_diag = dia_matrix((idf, 0), shape=(len(idf), len(idf)))

    return self
# Attach the incremental-update routine to the vectorizer class.
setattr(TfidfVectorizer, "partial_fit", partial_fit)

articleList = [
    'here is some text blah blah',
    'another text object',
    'more foo for your bar right now',
]

# Fit on the initial corpus (fit returns the vectorizer itself).
vec = TfidfVectorizer().fit(articleList)
# partial_fit reads and increments n_docs, so seed it with the corpus size.
vec.n_docs = len(articleList)

new_docs = ['the last text I wanted to add']
vec.partial_fit(new_docs)
vec.transform(new_docs).toarray()
# array([[ 0. , 0. , 0. , 0. , 0. ,
# 0. , 0. , 0. , 0. , 0. ,
# 0. , 0. , 0.27448674, 0. , 0.43003652,
# 0.43003652, 0.43003652, 0.43003652, 0.43003652]])