Multilayer Perceptron works fine with sigmoid activation but not with tanh

2019-08-24 03:50发布

I have written a simple MLP with a single hidden layer. When learning the XOR function using sigmoid activations, the loss decreases consistently. However, if I change the labels of the XOR data from {0, 1} to {-1, 1} and use the tanh activation instead, the loss doesn't decrease. It should still work by simply changing the labels and using tanh, shouldn't it? And if so, where is my implementation going wrong with tanh?

import sys

import numpy as np


class DataSet:
    """Split a sequence of ``(input, label)`` pairs into parallel lists.

    Attributes set on construction:
        size:       number of samples in ``data``.
        inputs:     list holding element 0 of every sample.
        labels:     list holding element 1 of every sample.
        numInputs:  ``len()`` of the first input, or 1 if it has no length.
        numOutputs: ``len()`` of the first label, or 1 if it has no length.
    """

    def __init__(self, data):
        self.size = len(data)
        self.inputs = [sample[0] for sample in data]
        self.labels = [sample[1] for sample in data]
        # Only the first sample is inspected to determine the dimensions.
        self.numInputs = self._width(self.inputs[0])
        self.numOutputs = self._width(self.labels[0])

    @staticmethod
    def _width(value):
        """Return ``len(value)``, treating objects without a length as 1."""
        try:
            return len(value)
        except TypeError:
            return 1


class MLP:
    """Fully connected network with one hidden layer, trained by backprop.

    The same activation function (``'sigmoid'``, ``'tanh'`` or ``'relu'``) is
    applied to both the hidden and the output layer. ``backwards`` accumulates
    gradients and ``updateWeights`` applies them, so running
    ``forward``/``backwards`` over several samples before a single
    ``updateWeights`` call performs one batch update.

    Without bias terms an all-zero input always produces an all-zero output
    for ``'tanh'`` and ``'relu'`` (both map 0 to 0), so such a sample can
    never be fitted to a non-zero label. Pass ``useBias=True`` to add
    trainable biases to the hidden and output layers.
    """

    def __init__(self, numInputs, numHidden, numOutputs, activationFunction,
                 useBias=False):
        """Create the network.

        Args:
            numInputs: size of the input vector.
            numHidden: number of hidden units.
            numOutputs: size of the output vector.
            activationFunction: ``'sigmoid'``, ``'tanh'`` or ``'relu'``
                (case-insensitive).
            useBias: when True, trainable bias vectors (initialised to zero)
                are added to the hidden and output layers. Defaults to False,
                which keeps the original bias-free architecture.
        """
        # MLP architecture sizes
        self.numInputs = numInputs
        self.numHidden = numHidden
        self.numOutputs = numOutputs
        self.activationFunction = activationFunction.lower()
        self.useBias = useBias

        # MLP weights
        self.IH_weights = np.random.rand(numInputs, numHidden)      # Input -> Hidden
        self.HO_weights = np.random.rand(numHidden, numOutputs)     # Hidden -> Output

        # Biases (only used and trained when useBias is True)
        self.H_bias = np.zeros((1, numHidden))
        self.O_bias = np.zeros((1, numOutputs))

        # Gradients accumulated during backprop, one per trainable array
        self.IH_gradients = np.zeros_like(self.IH_weights)
        self.HO_gradients = np.zeros_like(self.HO_weights)
        self.H_bias_gradients = np.zeros_like(self.H_bias)
        self.O_bias_gradients = np.zeros_like(self.O_bias)

        # Input, hidden and output neuron values
        self.I = np.zeros(numInputs)    # Inputs
        self.L = np.zeros(numOutputs)   # Labels
        self.H = np.zeros(numHidden)    # Hidden
        self.O = np.zeros(numOutputs)   # Output

    @staticmethod
    def _asSequence(value):
        """Wrap a value that has no ``len()`` in a one-element list."""
        try:
            len(value)
        except TypeError:
            return [value]
        return value

    # TODO: softmax is not implemented.
    def activation(self, x, derivative=False):
        """Apply the configured activation, or its derivative.

        Args:
            x: pre-activation values when ``derivative`` is False. When
                ``derivative`` is True, ``x`` must be the activation *output*
                (this is what ``backwards`` passes in via ``self.O`` and
                ``self.H``); each derivative is expressed in terms of that
                output.
            derivative: select the derivative instead of the function.

        Returns:
            Array of the same shape as ``x``.

        Exits the process (``sys.exit``) after printing an error if the
        configured activation name is unknown.
        """
        if self.activationFunction == 'sigmoid':
            if derivative:
                # s'(z) = s(z) * (1 - s(z)), with x = s(z)
                return x * (1 - x)
            return 1 / (1 + np.exp(-x))

        if self.activationFunction == 'tanh':
            if derivative:
                # tanh'(z) = 1 - tanh(z)^2, with x = tanh(z). Applying
                # np.tanh again here would evaluate tanh twice.
                return 1. - x ** 2
            return np.tanh(x)

        if self.activationFunction == 'relu':
            if derivative:
                # relu(z) > 0 exactly when z > 0, so the output can be tested.
                return (x > 0).astype(float)
            return np.maximum(0, x)

        print("ERROR: Activation function not found.")
        sys.exit()

    def forward(self, inputs):
        """Run one sample through the network.

        Args:
            inputs: a scalar or a sequence of ``numInputs`` values.

        Returns:
            Tuple ``(self.O, self.L)``: the output as a ``(1, numOutputs)``
            array and the labels stored by the most recent ``backwards``
            call (zeros before the first call).
        """
        inputs = self._asSequence(inputs)

        self.I = np.array(inputs).reshape(1, self.numInputs)

        hiddenPre = self.I.dot(self.IH_weights)
        if self.useBias:
            hiddenPre = hiddenPre + self.H_bias
        self.H = self.activation(hiddenPre)

        outputPre = self.H.dot(self.HO_weights)
        if self.useBias:
            outputPre = outputPre + self.O_bias
        self.O = self.activation(outputPre)
        return self.O, self.L

    def backwards(self, labels):
        """Accumulate gradients for the sample last passed to ``forward``.

        Args:
            labels: a scalar or a sequence of ``numOutputs`` target values.

        Returns:
            Sum of squared errors between ``labels`` and the current output.
        """
        labels = self._asSequence(labels)

        self.L = np.array(labels)
        # error = target - output, so the accumulated "gradients" already
        # point in the descent direction and are *added* in updateWeights.
        self.O_error = self.L - self.O
        self.O_delta = self.O_error * self.activation(self.O, derivative=True)

        self.H_error = self.O_delta.dot(self.HO_weights.T)
        self.H_delta = self.H_error * self.activation(self.H, derivative=True)

        self.IH_gradients += self.I.T.dot(self.H_delta)
        self.HO_gradients += self.H.T.dot(self.O_delta)
        if self.useBias:
            # A bias behaves like a weight on a constant input of 1.
            self.H_bias_gradients += self.H_delta
            self.O_bias_gradients += self.O_delta

        return np.sum(self.O_error ** 2)

    def updateWeights(self, learningRate):
        """Apply the accumulated gradients scaled by ``learningRate``, then reset them."""
        self.IH_weights += learningRate * self.IH_gradients
        self.HO_weights += learningRate * self.HO_gradients
        if self.useBias:
            self.H_bias += learningRate * self.H_bias_gradients
            self.O_bias += learningRate * self.O_bias_gradients

        self.IH_gradients = np.zeros_like(self.IH_weights)
        self.HO_gradients = np.zeros_like(self.HO_weights)
        self.H_bias_gradients = np.zeros_like(self.H_bias)
        self.O_bias_gradients = np.zeros_like(self.O_bias)

# data = DataSet([
#     [[0, 0], 0],
#     [[0, 1], 1],
#     [[1, 0], 1],
#     [[1, 1], 0]
# ])
#
# mlp = MLP(data.numInputs, 20, data.numOutputs, 'sigmoid')

# XOR with labels in {-1, 1}, matching the output range of tanh.
data = DataSet([
    [[0, 0], -1],
    [[0, 1], 1],
    [[1, 0], 1],
    [[1, 1], -1]
])

mlp = MLP(data.numInputs, 20, data.numOutputs, 'tanh')

numEpochs = 10000
learningRate = 0.01

# Mean squared error per epoch, kept for later inspection/plotting.
losses = []
for epoch in range(numEpochs):
    errors = []
    # Accumulate gradients over the whole data set, then apply them once
    # per epoch (full-batch update).
    for sampleInputs, sampleLabel in zip(data.inputs, data.labels):
        mlp.forward(sampleInputs)
        errors.append(mlp.backwards(sampleLabel))
    mlp.updateWeights(learningRate)
    epochLoss = np.mean(errors)
    losses.append(epochLoss)
    # Report every 100th epoch and the final one.
    if epoch % 100 == 0 or epoch == numEpochs - 1:
        print("EPOCH:", epoch)
        print("LOSS: ", epochLoss, "\n")

1条回答
可以哭但决不认输i
2楼-- · 2019-08-24 04:34

The sample your network is failing to predict is ([0,0], -1), and the cause is the lack of a bias term. For I = [0,0], the hidden pre-activation is dot(I, Wh) = [0,0,..0]. Since tanh(0) = 0, the hidden activations, and therefore the output, are also 0 regardless of the weights, so the network is unable to learn any transformation of zero to non-zero at any layer. Adding a bias will solve the issue; a hacky alternative is to add a 3rd column to your dataset filled with any non-zero constant, which emulates the effect of a bias on the hidden layer.

查看更多
登录 后发表回答