I have written a simple MLP with a single hidden layer. When it learns the XOR function with sigmoid activations, the loss decreases consistently. However, if I change the XOR labels from [0, 1] to [-1, 1] and use the tanh activation instead, the loss does not decrease at all. It should still work simply by changing the labels and switching to tanh, shouldn't it? And if so, where is my implementation going wrong with tanh?
import sys

import numpy as np


class DataSet:
    def __init__(self, data):
        self.size = len(data)
        self.inputs, self.labels = [], []
        for i in range(len(data)):
            self.inputs.append(data[i][0])
            self.labels.append(data[i][1])
        try:
            self.numInputs = len(self.inputs[0])
        except TypeError:
            self.numInputs = 1
        try:
            self.numOutputs = len(self.labels[0])
        except TypeError:
            self.numOutputs = 1


class MLP:
    def __init__(self, numInputs, numHidden, numOutputs, activationFunction):
        # MLP architecture sizes
        self.numInputs = numInputs
        self.numHidden = numHidden
        self.numOutputs = numOutputs
        self.activationFunction = activationFunction.lower()

        # MLP weights
        self.IH_weights = np.random.rand(numInputs, numHidden)   # Input -> Hidden
        self.HO_weights = np.random.rand(numHidden, numOutputs)  # Hidden -> Output

        # Gradients corresponding to weight matrices computed during backprop
        self.IH_gradients = np.zeros_like(self.IH_weights)
        self.HO_gradients = np.zeros_like(self.HO_weights)

        # Input, hidden and output neuron values
        self.I = np.zeros(numInputs)   # Inputs
        self.L = np.zeros(numOutputs)  # Labels
        self.H = np.zeros(numHidden)   # Hidden
        self.O = np.zeros(numOutputs)  # Output

        # ADD BIAS FOR RELU ########################################################
        # ADD SOFTMAX ##############################################################

    def activation(self, x, derivative=False):
        if self.activationFunction == 'sigmoid':
            if derivative:
                return x * (1 - x)
            return 1 / (1 + np.exp(-x))
        if self.activationFunction == 'tanh':
            if derivative:
                return 1. - np.tanh(x) ** 2
            return np.tanh(x)
        # if self.activationFunction == 'softmax':
        #     if derivative:
        #         return ########
        #     return np.exp(x) / np.sum(np.exp(x))
        if self.activationFunction == 'relu':
            if derivative:
                return (x > 0).astype(float)
            return np.maximum(0, x)
        print("ERROR: Activation function not found.")
        sys.exit()

    def forward(self, inputs):
        # Ensure that inputs is a list
        try:
            len(inputs)
        except TypeError:
            inputs = [inputs]
        self.I = np.array(inputs).reshape(1, self.numInputs)
        self.H = self.I.dot(self.IH_weights)
        self.H = self.activation(self.H)
        self.O = self.H.dot(self.HO_weights)
        self.O = self.activation(self.O)
        return self.O, self.L

    def backwards(self, labels):
        # Ensure that labels is a list
        try:
            len(labels)
        except TypeError:
            labels = [labels]
        self.L = np.array(labels)
        self.O_error = self.L - self.O
        self.O_delta = self.O_error * self.activation(self.O, derivative=True)
        self.H_error = self.O_delta.dot(self.HO_weights.T)
        self.H_delta = self.H_error * self.activation(self.H, derivative=True)
        self.IH_gradients += self.I.T.dot(self.H_delta)
        self.HO_gradients += self.H.T.dot(self.O_delta)
        return np.sum(self.O_error ** 2)

    def updateWeights(self, learningRate):
        self.IH_weights += self.IH_gradients
        self.HO_weights += self.HO_gradients
        self.IH_gradients = np.zeros_like(self.IH_weights)
        self.HO_gradients = np.zeros_like(self.HO_weights)


# data = DataSet([
#     [[0, 0], 0],
#     [[0, 1], 1],
#     [[1, 0], 1],
#     [[1, 1], 0]
# ])
#
# mlp = MLP(data.numInputs, 20, data.numOutputs, 'sigmoid')

data = DataSet([
    [[0, 0], -1],
    [[0, 1], 1],
    [[1, 0], 1],
    [[1, 1], -1]
])

mlp = MLP(data.numInputs, 20, data.numOutputs, 'tanh')

numEpochs = 10000
learningRate = 0.01
losses = []

for epoch in range(numEpochs):
    errors = []
    correct = []
    for i in range(data.size):
        mlp.forward(data.inputs[i])
        errors.append(mlp.backwards(data.labels[i]))
    mlp.updateWeights(learningRate)
    epochLoss = np.mean(errors)
    losses.append(epochLoss)
    if epoch % 100 == 0 or epoch == numEpochs - 1:
        print("EPOCH:", epoch)
        print("LOSS: ", epochLoss, "\n")
The sample your network fails to predict is ([0, 0], -1), and the failure comes from the lack of a bias term. For I = [0, 0] you get H = dot(I, IH_weights) = [0, 0, ..., 0], and since tanh(0) = 0 the output is forced to 0 as well, so no choice of weights can ever map the all-zero input to a non-zero value at any layer. (With sigmoid this sample is not a problem, because sigmoid(0) = 0.5, so the hidden layer is non-zero even for the [0, 0] input; that is why the sigmoid version appears to train.) Adding a bias will solve the issue; a hacky alternative is to append a third column filled with any non-zero constant to your dataset, which emulates a bias on the hidden layer.
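A minimal sketch of that hack, reusing the DataSet and MLP classes from your question unchanged (the constant 1 is an arbitrary choice, any non-zero value works):

# Append a constant third input so the hidden layer receives a
# non-zero signal even for the original [0, 0] sample.
data = DataSet([
    [[0, 0, 1], -1],
    [[0, 1, 1], 1],
    [[1, 0, 1], 1],
    [[1, 1, 1], -1]
])

mlp = MLP(data.numInputs, 20, data.numOutputs, 'tanh')  # numInputs is now 3

Note that this only gives the hidden layer an effective bias; the cleaner fix is to add explicit bias vectors to both layers and update them alongside the weights during backprop.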