I am trying to calculate the silhouette score while searching for the optimal number of clusters to create, but I get an error that says:
ValueError: Number of labels is 1. Valid values are 2 to n_samples - 1 (inclusive)
I am unable to understand the reason for this. Here is the code that I am using to cluster and calculate the silhouette score.
I read the CSV file that contains the text to be clustered and run K-Means once for each of the n candidate cluster counts. What could be the reason I am getting this error?
#Create cluster using K-Means
#Only creates graph
import matplotlib
#matplotlib.use('Agg')
import re
import os
import nltk, math, codecs
import csv
from nltk.corpus import stopwords
from gensim.models import Doc2Vec
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.metrics import silhouette_score
# Load the trained Doc2Vec model from the checkpoint path configured elsewhere.
model_name = checkpoint_save_path
loaded_model = Doc2Vec.load(model_name)
#Load the test csv file
data = pd.read_csv(test_filename)
# Drop missing overviews *before* casting to str: casting first can turn a
# missing value into the literal text "nan", which is truthy, slips past the
# emptiness filter and would be embedded as if it were a real document.
# A list (rather than a one-shot filter iterator) keeps the texts re-iterable.
overview = [
    text
    for text in data['overview'].dropna().astype('str').tolist()
    if text
]
# Filled later with one inferred document vector per overview.
vectors = []
def split_words(text):
    """Split *text* into tokens made only of alphanumeric characters.

    Every character that is neither alphanumeric nor whitespace is replaced
    by a single space, then the result is split on runs of whitespace.

    Args:
        text: The string to tokenize.

    Returns:
        A list of tokens; empty when *text* holds no alphanumeric characters.
    """
    # Punctuation becomes a separator, so "don't" yields ["don", "t"].
    cleaned = ''.join(
        ch if ch.isalnum() or ch.isspace() else " " for ch in text
    )
    return cleaned.split()
def preprocess_document(text):
    """Turn a raw document string into the token list fed to the model.

    The only preprocessing step is tokenization via ``split_words``.
    """
    return split_words(text)
# Infer one vector per overview with the loaded model and collect them,
# in input order, into the module-level ``vectors`` list.
vectors.extend(
    loaded_model.infer_vector(preprocess_document(document))
    for document in overview
)
# Per-k metrics: sum of squared errors (inertia) and silhouette score.
sse = {}
silhouette = {}
for k in range(1, 15):
    km = KMeans(n_clusters=k, max_iter=1000, verbose=0).fit(vectors)
    sse[k] = km.inertia_
    # silhouette_score is only defined when the number of distinct labels is
    # between 2 and n_samples - 1 (inclusive). With k=1 every sample gets the
    # same label, which is what raised
    # "ValueError: Number of labels is 1". Keep the SSE for every k, but only
    # compute the silhouette when the labelling is valid.
    n_labels = len(set(km.labels_))
    if 2 <= n_labels <= len(vectors) - 1:
        silhouette[k] = silhouette_score(vectors, km.labels_, metric='euclidean')
# Scan the SSE table for the cluster count with the strictly smallest error;
# the defaults (1, +inf) survive only if no entry beats +inf.
# NOTE(review): K-Means inertia generally shrinks as k grows, so this rule
# tends to pick the largest k tried - confirm that is the intended criterion.
best_cluster_size = 1
min_error = float("inf")
for cluster_size, error in sse.items():
    if not error < min_error:
        continue
    min_error = error
    best_cluster_size = cluster_size
# Print both metric tables, separated by a divider line.
print(sse)
print("====")
print(silhouette)