I am training a small network and the training seems to go fine, the val loss decreases, I reach validation accuracy around 80, and it actually stops training once there is no more improvement (patience=10). It trained for 40 epochs. However, it keeps predicting only one class for every test image! I tried to initialize the conv layers randomly, I added regularizers, I switched from Adam to SGD, I added clipvalue, I added dropouts. I also switched to softmax (I have only two labels but I saw some recommendation on using softmax and Dense layer with 2 neurons). Some or one of these helped with the overfitting, but nothing worked for the prediction problem. The data is balanced, though it is a small dataset, so it doesn't make sense that it reaches 80% if it predicts the same labels for evaluation set as well.
What is wrong with my model and how can I fix it? Any comments are welcome.
#Import some packages to use
import cv2
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from keras.preprocessing.image import ImageDataGenerator
import os
from keras.regularizers import l2
from keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
from keras.layers.core import Dense, Dropout, Flatten
from keras.layers.convolutional import Conv2D, MaxPooling2D
from keras.initializers import RandomNormal
os.environ["CUDA_VISIBLE_DEVICES"]="0"
epochs = 200
callbacks = []
#schedule = None
decay = 0.0
earlyStopping = EarlyStopping(monitor='val_loss', patience=10, verbose=0, mode='min')
mcp_save = ModelCheckpoint('.mdl_wts.hdf5', save_best_only=True, monitor='val_loss', mode='min')
reduce_lr_loss = ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=3, verbose=1, epsilon=1e-5, mode='min')
train_dir = '/home/d/Desktop/s/data/train'
eval_dir = '/home/d/Desktop/s/data/eval'
test_dir = '/home/d/Desktop/s/data/test'
# create a data generator
train_datagen = ImageDataGenerator(rescale=1./255, #Scale the image between 0 and 1
rotation_range=40,
width_shift_range=0.2,
height_shift_range=0.2,
shear_range=0.2,
zoom_range=0.2,
horizontal_flip=True,)
val_datagen = ImageDataGenerator(rescale=1./255) #We do not augment validation data. we only perform rescale
test_datagen = ImageDataGenerator(rescale=1./255) #We do not augment validation data. we only perform rescale
# load and iterate training dataset
train_generator = train_datagen.flow_from_directory(train_dir, target_size=(224,224),class_mode='categorical', batch_size=16, shuffle='True', seed=42)
# load and iterate validation dataset
val_generator = val_datagen.flow_from_directory(eval_dir, target_size=(224,224),class_mode='categorical', batch_size=16, shuffle='True', seed=42)
# load and iterate test dataset
test_generator = test_datagen.flow_from_directory(test_dir, target_size=(224,224), class_mode=None, batch_size=1, shuffle='False', seed=42)
#We will use a batch size of 32. Note: batch size should be a factor of 2.***4,8,16,32,64...***
#batch_size = 4
#from keras import layers
from keras import models
from keras import optimizers
#from keras.layers import Dropout
#from keras.preprocessing.image import ImageDataGenerator
from keras.preprocessing.image import img_to_array, load_img
model = models.Sequential()
model.add(Conv2D(64, (3, 3), activation='relu', name='block1_conv1', kernel_initializer=RandomNormal(
mean=0.0, stddev=0.05), bias_initializer=RandomNormal(mean=0.0, stddev=0.05), input_shape=(224, 224, 3)))
model.add(Conv2D(64, (3, 3), activation='relu', name='block1_conv2', kernel_initializer=RandomNormal(
mean=0.0, stddev=0.05), bias_initializer=RandomNormal(mean=0.0, stddev=0.05)))
model.add(MaxPooling2D((2, 2)))
model.add(Dropout(0.2))
model.add(Conv2D(128, (3, 3), activation='relu', name='block2_conv1', kernel_initializer=RandomNormal(
mean=0.0, stddev=0.05), bias_initializer=RandomNormal(mean=0.0, stddev=0.05)))
model.add(Conv2D(128, (3, 3), activation='relu', name='block2_conv2',kernel_initializer=RandomNormal(
mean=0.0, stddev=0.05), bias_initializer=RandomNormal(mean=0.0, stddev=0.05)))
model.add(MaxPooling2D((2, 2), name='block2_pool'))
model.add(Dropout(0.2))
model.add(Conv2D(256, (3, 3), activation='relu', name='block3_conv1', kernel_initializer=RandomNormal(
mean=0.0, stddev=0.05), bias_initializer=RandomNormal(mean=0.0, stddev=0.05)))
model.add(Conv2D(256, (3, 3), activation='relu', name='block3_conv2', kernel_initializer=RandomNormal(
mean=0.0, stddev=0.05), bias_initializer=RandomNormal(mean=0.0, stddev=0.05)))
model.add(Conv2D(256, (3, 3), activation='relu', name='block3_conv3', kernel_initializer=RandomNormal(
mean=0.0, stddev=0.05), bias_initializer=RandomNormal(mean=0.0, stddev=0.05)))
model.add(MaxPooling2D((2, 2), name='block3_pool'))
model.add(Dropout(0.2))
#model.add(layers.Conv2D(512, (3, 3), activation='relu', name='block4_conv1'))
#model.add(layers.Conv2D(512, (3, 3), activation='relu', name='block4_conv2'))
#model.add(layers.Conv2D(512, (3, 3), activation='relu', name='block4_conv3'))
#model.add(layers.MaxPooling2D((2, 2), name='block4_pool'))
model.add(Flatten())
model.add(Dense(256, kernel_regularizer=l2(0.01), bias_regularizer=l2(0.01), activation='relu', kernel_initializer='he_uniform'))
model.add(Dropout(0.5))
model.add(Dense(2, kernel_regularizer=l2(0.01), bias_regularizer=l2(0.01), activation='softmax'))
#Lets see our model
model.summary()
#We'll use the RMSprop optimizer with a learning rate of 0.0001
#We'll use binary_crossentropy loss because its a binary classification
#model.compile(loss='binary_crossentropy', optimizer=optimizers.SGD(lr=1e-5, momentum=0.9), metrics=['acc'])
model.compile(loss='categorical_crossentropy',
#optimizer=optimizers.Adadelta(lr=1.0, rho=0.95, epsilon=1e-08, decay=decay),
optimizer=optimizers.SGD(lr= 0.0001, clipvalue = 0.5, decay=1e-6, momentum=0.9, nesterov=True),
metrics=['accuracy'])
#The training part
#We train for 64 epochs with about 100 steps per epoch
history = model.fit_generator(train_generator,
steps_per_epoch=train_generator.n // train_generator.batch_size,
epochs=epochs,
validation_data=val_generator,
validation_steps=val_generator.n // val_generator.batch_size,
callbacks=[earlyStopping, mcp_save]) #, reduce_lr_loss])
#Save the model
model.save_weights('/home/d/Desktop/s/categorical_weights.h5')
model.save('/home/d/Desktop/s/categorical_model_keras.h5')
#lets plot the train and val curve
#get the details form the history object
acc = history.history['acc']
val_acc = history.history['val_acc']
loss = history.history['loss']
val_loss = history.history['val_loss']
epochs = range(1, len(acc) + 1)
#Train and validation accuracy
plt.plot(epochs, acc, 'b', label='Training accuracy')
plt.plot(epochs, val_acc, 'r', label='Validation accuracy')
plt.title('Training and Validation accuracy')
plt.legend()
plt.figure()
#Train and validation loss
plt.plot(epochs, loss, 'b', label='Training loss')
plt.plot(epochs, val_loss, 'r', label='Validation loss')
plt.title('Training and Validation loss')
plt.legend()
plt.show()
model.evaluate_generator(generator=val_generator, steps=val_generator.n // val_generator.batch_size)
STEP_SIZE_TEST=test_generator.n//test_generator.batch_size
test_generator.reset()
pred=model.predict_generator(test_generator,
steps=STEP_SIZE_TEST,
verbose=1)
predicted_class_indices=np.argmax(pred,axis=1)
labels = (train_generator.class_indices)
np.save('/home/d/Desktop/s/classes', labels)
labels = dict((v,k) for k,v in labels.items())
predictions = [labels[k] for k in predicted_class_indices]
filenames=test_generator.filenames
results=pd.DataFrame({"Filename":filenames,
"Predictions":predictions})
results.to_csv("categorical_results.csv",index=False)