I have written a simple MLP with a single hidden layer. When it learns the XOR function with sigmoid activations, the loss decreases consistently. However, if I change the XOR labels from [0, 1] to [-1, 1] and use the tanh activation instead, the loss does not decrease at all. It should still work simply by changing the labels and switching to tanh, shouldn't it? And if so, where is my implementation going wrong with tanh?
import sys

import numpy as np


class DataSet:
    def __init__(self, data):
        self.size = len(data)
        self.inputs, self.labels = [], []
        for i in range(len(data)):
            self.inputs.append(data[i][0])
            self.labels.append(data[i][1])
        try:
            self.numInputs = len(self.inputs[0])
        except TypeError:
            self.numInputs = 1
        try:
            self.numOutputs = len(self.labels[0])
        except TypeError:
            self.numOutputs = 1


class MLP:
    def __init__(self, numInputs, numHidden, numOutputs, activationFunction):
        # MLP architecture sizes
        self.numInputs = numInputs
        self.numHidden = numHidden
        self.numOutputs = numOutputs
        self.activationFunction = activationFunction.lower()

        # MLP weights
        self.IH_weights = np.random.rand(numInputs, numHidden)   # Input -> Hidden
        self.HO_weights = np.random.rand(numHidden, numOutputs)  # Hidden -> Output

        # Gradients corresponding to weight matrices computed during backprop
        self.IH_gradients = np.zeros_like(self.IH_weights)
        self.HO_gradients = np.zeros_like(self.HO_weights)

        # Input, hidden and output neuron values
        self.I = np.zeros(numInputs)   # Inputs
        self.L = np.zeros(numOutputs)  # Labels
        self.H = np.zeros(numHidden)   # Hidden
        self.O = np.zeros(numOutputs)  # Output

        # ADD BIAS FOR RELU ########################################################
        # ADD SOFTMAX ##############################################################

    def activation(self, x, derivative=False):
        if self.activationFunction == 'sigmoid':
            if derivative:
                return x * (1 - x)
            return 1 / (1 + np.exp(-x))
        if self.activationFunction == 'tanh':
            if derivative:
                return 1. - np.tanh(x) ** 2
            return np.tanh(x)
        # if self.activationFunction == 'softmax':
        #     if derivative:
        #         return ########
        #     return np.exp(x) / np.sum(np.exp(x))
        if self.activationFunction == 'relu':
            if derivative:
                return (x > 0).astype(float)
            return np.maximum(0, x)
        print("ERROR: Activation function not found.")
        sys.exit()

    def forward(self, inputs):
        # Ensure that inputs is a list
        try:
            len(inputs)
        except TypeError:
            inputs = [inputs]
        self.I = np.array(inputs).reshape(1, self.numInputs)
        self.H = self.I.dot(self.IH_weights)
        self.H = self.activation(self.H)
        self.O = self.H.dot(self.HO_weights)
        self.O = self.activation(self.O)
        return self.O, self.L

    def backwards(self, labels):
        # Ensure that labels is a list
        try:
            len(labels)
        except TypeError:
            labels = [labels]
        self.L = np.array(labels)
        self.O_error = self.L - self.O
        self.O_delta = self.O_error * self.activation(self.O, derivative=True)
        self.H_error = self.O_delta.dot(self.HO_weights.T)
        self.H_delta = self.H_error * self.activation(self.H, derivative=True)
        self.IH_gradients += self.I.T.dot(self.H_delta)
        self.HO_gradients += self.H.T.dot(self.O_delta)
        return np.sum(self.O_error ** 2)

    def updateWeights(self, learningRate):
        self.IH_weights += self.IH_gradients
        self.HO_weights += self.HO_gradients
        self.IH_gradients = np.zeros_like(self.IH_weights)
        self.HO_gradients = np.zeros_like(self.HO_weights)


# data = DataSet([
#     [[0, 0], 0],
#     [[0, 1], 1],
#     [[1, 0], 1],
#     [[1, 1], 0]
# ])
#
# mlp = MLP(data.numInputs, 20, data.numOutputs, 'sigmoid')

data = DataSet([
    [[0, 0], -1],
    [[0, 1], 1],
    [[1, 0], 1],
    [[1, 1], -1]
])

mlp = MLP(data.numInputs, 20, data.numOutputs, 'tanh')

numEpochs = 10000
learningRate = 0.01
losses = []

for epoch in range(numEpochs):
    errors = []
    correct = []
    for i in range(data.size):
        mlp.forward(data.inputs[i])
        errors.append(mlp.backwards(data.labels[i]))
    mlp.updateWeights(learningRate)
    epochLoss = np.mean(errors)
    losses.append(epochLoss)
    if epoch % 100 == 0 or epoch == numEpochs - 1:
        print("EPOCH:", epoch)
        print("LOSS: ", epochLoss, "\n")
The sample your network fails to predict is ([0, 0], -1), and the failure comes from the lack of a bias term. For I = [0, 0] you get H = dot(I, IH_weights) = [0, 0, ..., 0], and since tanh(0) = 0 the output is forced to 0 as well, so no choice of weights can ever map the all-zero input to a non-zero value at any layer. (With sigmoid this sample is not a problem, because sigmoid(0) = 0.5, so the hidden layer is non-zero even for the [0, 0] input; that is why the sigmoid version appears to train.) Adding a bias will solve the issue; a hacky alternative is to append a third column filled with any non-zero constant to your dataset, which emulates a bias on the hidden layer.
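A minimal sketch of that hack, reusing the DataSet and MLP classes from your question unchanged (the constant 1 is an arbitrary choice, any non-zero value works):

# Append a constant third input so the hidden layer receives a
# non-zero signal even for the original [0, 0] sample.
data = DataSet([
    [[0, 0, 1], -1],
    [[0, 1, 1], 1],
    [[1, 0, 1], 1],
    [[1, 1, 1], -1]
])

mlp = MLP(data.numInputs, 20, data.numOutputs, 'tanh')  # numInputs is now 3

Note that this only gives the hidden layer an effective bias; the cleaner fix is to add explicit bias vectors to both layers and update them alongside the weights during backprop.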