How to solve a NotImplementedError from nltk.class

2020-08-01 05:10发布

问题:

I am new to programming, but have looked at my code over and over and can't see any mistakes. I don't know how proceed any more because this error pops up no matter what I try. I'll post the full code here.

Any help would be much appreciated, thank you!

import nltk
import random
from nltk.corpus import movie_reviews
import pickle
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.naive_bayes import MultinomialNB,BernoulliNB
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC
from nltk.classify import ClassifierI
from statistics import mode 

class VoteClassifier(ClassifierI):
    def __init__(self, *classifiers):
        self._classifiers = classifiers

        def classify(self, features):
            votes = []
            for c in self._classifiers:
                v = c.classify(features)
                votes.append(v)
            return mode(votes)


        def confidence(self, features):
            votes = []
            for c in self._classifiers:
                v = c.classify(features)
                votes.append(v)


            choice_votes = votes.count(mode(votes))
            conf = choice_votes / len(votes)
            return conf


documents = [(list(movie_reviews.words(fileid)), category)
             for category in movie_reviews.categories()
             for fileid in movie_reviews.fileids(category)]

random.shuffle(documents)

all_words = []

for w in movie_reviews.words():
        all_words.append(w.lower())

all_words = nltk.FreqDist(all_words)

word_features = list(all_words.keys())[:3000]

def find_features(document):
    words = set(document)
    features = {}
    for w in word_features:
        features[w] = (w in words)

    return features

featuresets = [(find_features(rev), category) for (rev, category) in documents]

training_set = featuresets[:1900]
testing_set = featuresets[1900:]

# classifier = nltk.NaiveBayesClassifier.train(training_set)
classifier_f = open("naivebayes.pickle", "rb")
classifier = pickle.load(classifier_f)
classifier_f.close()

print("Original NaiveBayes accuracy percent:",(nltk.classify.accuracy(classifier, testing_set))*100)
classifier.show_most_informative_features(10)

MNB_classifier = SklearnClassifier(MultinomialNB())
MNB_classifier.train(training_set)
print("MNB_classifier accuracy percent:", (nltk.classify.accuracy(MNB_classifier, testing_set))*100)

BernoulliNB_classifier = SklearnClassifier(BernoulliNB())
BernoulliNB_classifier.train(training_set)
print("BernoulliNB_classifier accuracy percent:", (nltk.classify.accuracy(BernoulliNB_classifier, testing_set))*100)

LogisticRegression_classifier = SklearnClassifier(LogisticRegression())
LogisticRegression_classifier.train(training_set)
print("LogisticRegression_classifier accuracy percent:", (nltk.classify.accuracy(LogisticRegression_classifier, testing_set))*100)

SGDClassifier_classifier = SklearnClassifier(SGDClassifier())
SGDClassifier_classifier.train(training_set)
print("SGDClassifier_classifier accuracy percent:", (nltk.classify.accuracy(SGDClassifier_classifier, testing_set))*100)

##SVC_classifier = SklearnClassifier(SVC())
##SVC_classifier.train(training_set)
##print("SVC_classifier accuracy percent:", (nltk.classify.accuracy(SVC_classifier, testing_set))*100)

LinearSVC_classifier = SklearnClassifier(LinearSVC())
LinearSVC_classifier.train(training_set)
print("LinearSVC_classifier accuracy percent:", (nltk.classify.accuracy(LinearSVC_classifier, testing_set))*100)

NuSVC_classifier = SklearnClassifier(NuSVC())
NuSVC_classifier.train(training_set)
print("NuSVC_classifier accuracy percent:", (nltk.classify.accuracy(NuSVC_classifier, testing_set))*100)


voted_classifier = VoteClassifier(classifier,
                                  NuSVC_classifier,
                                  LinearSVC_classifier,
                                  SGDClassifier_classifier,
                                  MNB_classifier,
                                  BernoulliNB_classifier,
                                  LogisticRegression_classifier)

print("voted_classifier accuracy percent:", (nltk.classify.accuracy(voted_classifier, testing_set))*100)

I also tried raising a NotImplementedError exception on the class at the top but it did not change the output in Python.

This is the error:

Traceback (most recent call last):
  File "code/test.py", line 109, in <module>
    print("voted_classifier accuracy percent:", (nltk.classify.accuracy(voted_classifier, testing_set))*100)
  File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/nltk/classify/util.py", line 87, in accuracy
    results = classifier.classify_many([fs for (fs, l) in gold])
  File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/nltk/classify/api.py", line 77, in classify_many
    return [self.classify(fs) for fs in featuresets]
  File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/nltk/classify/api.py", line 56, in classify
    raise NotImplementedError()
NotImplementedError

回答1:

As noted in the comments, there's some bad spaghetti like code in the ClassiferI api that has classify calling classify_many when overriden. It might not be a bad thing when considering that the ClassifierI is strongly tied with the NaiveBayesClassifier object.

But for the particular use in the OP, the spaghetti code there isn't welcomed.

TL;DR

Take a look at https://www.kaggle.com/alvations/sklearn-nltk-voteclassifier

In Long

From the traceback, the error is starts from nltk.classify.util.accuracy() calling the ClassifierI.classify().

The ClassifierI.classify() is generally used to classify ONE document and the input is a dictionary of featureset with its binary values.

The ClassifierI.classify_many() is supposed to classify a MULTIPLE documents and the input is a list of dictionary of featureset with its binary values.

So the quick hack is to overwrite how the accuracy() function so that the VotedClassifier won't be dependent on the ClassifierI definition of classify() vs classify_many(). That would also mean that we don't inherit from ClassifierI. IMHO, if you don't need other functions other than classify(), there's no need to inherit the baggage that ClassifierI might come with:

def my_accuracy(classifier, gold):
    documents, labels = zip(*gold)
    predictions = classifier.classify_documents(documents)
    correct = [y == y_hat for y, y_hat in zip(labels, predictions)]
    if correct:
        return sum(correct) / len(correct)
    else:
        return 0

class VotraClassifier:
    def __init__(self, *classifiers):
        self._classifiers = classifiers

    def classify_documents(self, documents):
        return [self.classify_many(doc) for doc in documents]

    def classify_many(self, features):
        votes = []
        for c in self._classifiers:
            v = c.classify(features)
            votes.append(v)
        return mode(votes)

    def confidence(self, features):
        votes = []
        for c in self._classifiers:
            v = c.classify(features)
            votes.append(v)

        choice_votes = votes.count(mode(votes))
        conf = choice_votes / len(votes)
        return conf

Now if we call the new my_accuracy() with the new VotedClassifier object:

voted_classifier = VotraClassifier(nltk_nb, 
                                  NuSVC_classifier,
                                  LinearSVC_classifier,
                                  SGDClassifier_classifier,
                                  MNB_classifier,
                                  BernoulliNB_classifier,
                                  LogisticRegression_classifier)

my_accuracy(voted_classifier, testing_set)

[out]:

0.86

Note: There's certain randomness when it comes to shuffling the document and then holding out a set to test for the classifier accuracy.

My suggestion is to do the following instead of simple random.shuffle(documents)

  • Repeat the experiments with various random seed.
  • For each random seed, do a 10-fold cross validation.


标签: python nltk