Weights and Biases not updating in tensorflow

2019-06-01 00:38发布

问题:

I've made this neural net to figure out whether a house is a good buy or a bad buy. For some reason the code is not updating the weights and biases, and my loss stays the same. This is my code:

I've made this neural net to figure out whether a house is a good buy or a bad buy. For some reason the code is not updating the weights and biases, and my loss stays the same. This is my code:

import pandas as pd
import tensorflow as tf

# Load the housing data set from a local CSV file.
data = pd.read_csv("E:/workspace_py/datasets/good_bad_buy.csv")

# Inputs: every column except 'index' and 'good buy'.
features = data.drop(['index', 'good buy'], axis = 1)
# Targets: whatever remains after dropping 'index' and the four input columns.
lbls = data.drop(['index', 'area', 'bathrooms', 'price', 'sq_price'], axis = 1)

# Keep only the first 20 rows of each frame.
features = features[0:20]
lbls = lbls[0:20]

print(features)
print(lbls)
n_examples = len(lbls)

# Model

# Hyper parameters

epochs = 100
learning_rate = 0.1
batch_size = 1

# Placeholders: 4 values per input row and 1 value per label row; the leading
# None leaves the number of rows open.
input_data = tf.placeholder('float', [None, 4])
labels = tf.placeholder('float', [None, 1])

# Weight matrices for the layer sizes 4 -> 10 -> 10 -> 4 -> 1, each drawn from
# tf.random_normal.
weights = {
            'hl1': tf.Variable(tf.random_normal([4, 10])),
            'hl2': tf.Variable(tf.random_normal([10, 10])),
            'hl3': tf.Variable(tf.random_normal([10, 4])),
            'ol': tf.Variable(tf.random_normal([4, 1]))
            }

# One bias vector per layer, also drawn from tf.random_normal.
biases = {
            'hl1': tf.Variable(tf.random_normal([10])),
            'hl2': tf.Variable(tf.random_normal([10])),
            'hl3': tf.Variable(tf.random_normal([4])),
            'ol': tf.Variable(tf.random_normal([1]))
            }

# Three ReLU hidden layers followed by a sigmoid output layer.
hl1 = tf.nn.relu(tf.add(tf.matmul(input_data, weights['hl1']), biases['hl1']))
hl2 = tf.nn.relu(tf.add(tf.matmul(hl1, weights['hl2']), biases['hl2']))
hl3 = tf.nn.relu(tf.add(tf.matmul(hl2, weights['hl3']), biases['hl3']))
ol = tf.nn.sigmoid(tf.add(tf.matmul(hl3, weights['ol']), biases['ol']))

# Mean squared difference between the labels and the sigmoid output,
# minimised with Adam.
loss = tf.reduce_mean((labels - ol)**2)
train = tf.train.AdamOptimizer(learning_rate).minimize(loss)

sess = tf.Session()
sess.run(tf.global_variables_initializer())

# Number of training steps per epoch.
iterations = int(n_examples/batch_size)


for epoch_no in range(epochs):
    ptr = 0
    for iteration_no in range(iterations):
        # Slice out the current minibatch and advance the pointer.
        epoch_input = features[ptr:ptr+batch_size]
        epoch_label = lbls[ptr: ptr+batch_size]
        ptr = ptr + batch_size
        # NOTE(review): feed_dict passes the full `features` / `lbls` frames;
        # the `epoch_input` / `epoch_label` slices above are never used.
        _, err = sess.run([train, loss], feed_dict={input_data: features, labels: lbls})
    # `err` is the loss returned by the last step of this epoch.
    print("Error at epoch ", epoch_no, ": ", err)

# Evaluate the output layer on a single hand-written input row.
print(sess.run(ol, feed_dict={input_data: [[2104, 3, 399900, 190.0665]]}))

This is the dataset:

Features:

    area  bathrooms   price    sq_price
0   2104          3  399900  190.066540
1   1600          3  329900  206.187500
2   2400          3  369000  153.750000
3   1416          2  232000  163.841808
4   3000          4  539900  179.966667
5   1985          4  299900  151.083123
6   1534          3  314900  205.280313
7   1427          3  198999  139.452698
8   1380          3  212000  153.623188
9   1494          3  242500  162.315930
10  1940          4  239999  123.710825
11  2000          3  347000  173.500000
12  1890          3  329999  174.602645
13  4478          5  699900  156.297454
14  1268          3  259900  204.968454
15  2300          4  449900  195.608696
16  1320          2  299900  227.196970
17  1236          3  199900  161.731392
18  2609          4  499998  191.643542
19  3031          4  599000  197.624546

labels:

    good buy
0        1.0
1        0.0
2        1.0
3        0.0
4        1.0
5        0.0
6        0.0
7        1.0
8        0.0
9        0.0
10       1.0
11       1.0
12       1.0
13       1.0
14       0.0
15       1.0
16       0.0
17       1.0
18       1.0
19       1.0

Any suggestions on how to fix this? I've tried tf.reduce_sum instead of tf.reduce_mean. I've also tried a larger batch_size.

回答1:

A few things to consider

  • Minibatch not being evaluated correctly since you feed in features and lbls instead of epoch_input and epoch_label.
  • You do not precondition your data in any way, so it is completely out of range. For example, my code below normalizes each feature by subtracting its mean and dividing by its standard deviation. You might also consider using batch_normalization.
  • You are not evaluating error at any point. You need separate training and held-out testing sets. My code below doesn't hold out data, but it does report accuracy (the fraction of examples classified correctly) rather than just loss (which is a weak proxy for error, so you shouldn't call it error).
  • You initialize biases to random normals. You probably want to just start those at zero.
  • You probably should use tf.layers or another high level api.

The code below achieves a training accuracy of 95%. You'd want to test with a held-out data set that was not used for training in order to evaluate the testing error.

#!/usr/bin/env python
import sys
import pandas as pd
import numpy as np
import tensorflow as tf


# Load the data set from a CSV file in the working directory.
data = pd.read_csv("data.csv")

# Inputs: every column except 'good buy'.
features = data.drop(['good buy'], axis = 1)
# Targets: whatever remains after dropping the four input columns.
lbls = data.drop([ 'area', 'bathrooms', 'price', 'sq_price'], axis = 1)

# Keep only the first 20 rows of each frame.
features = features[0:20]
lbls = lbls[0:20]

# Standardise the inputs: subtract the per-column mean and divide by the
# per-column standard deviation (both taken along axis 0).
mu = np.mean(features, axis=0)
sigma = (np.std(features, axis=0))
features = (features - mu) / sigma

n_examples = len(lbls)

# Model

# Hyper parameters

epochs = 100
learning_rate = 0.01
batch_size = 5

# Placeholders: 4 values per input row and 1 value per label row; the leading
# None leaves the number of rows open.
input_data = tf.placeholder('float', [None, 4])
labels = tf.placeholder('float', [None, 1])

# Weight matrices for the layer sizes 4 -> 10 -> 10 -> 4 -> 1, each drawn from
# tf.random_normal.
weights = {
      'hl1': tf.Variable(tf.random_normal([4, 10])),
      'hl2': tf.Variable(tf.random_normal([10, 10])),
      'hl3': tf.Variable(tf.random_normal([10, 4])),
      'ol': tf.Variable(tf.random_normal([4, 1]))
      }

# Biases start at zero here rather than at random values.
biases = {
      'hl1': tf.Variable(tf.zeros([10])),
      'hl2': tf.Variable(tf.zeros([10])),
      'hl3': tf.Variable(tf.zeros([4])),
      'ol': tf.Variable(tf.zeros([1]))
      }



# Three ReLU hidden layers followed by a sigmoid output layer.
hl1 = tf.nn.relu(tf.add(tf.matmul(input_data, weights['hl1']), biases['hl1']))
hl2 = tf.nn.relu(tf.add(tf.matmul(hl1, weights['hl2']), biases['hl2']))
hl3 = tf.nn.relu(tf.add(tf.matmul(hl2, weights['hl3']), biases['hl3']))
ol = tf.nn.sigmoid(tf.add(tf.matmul(hl3, weights['ol']), biases['ol']))

# Mean squared difference between the labels and the sigmoid output,
# minimised with Adam.
loss = tf.reduce_mean((labels - ol)**2)
train = tf.train.AdamOptimizer(learning_rate).minimize(loss)

sess = tf.Session()
sess.run(tf.global_variables_initializer())

# Number of minibatch steps per epoch.
iterations = int(n_examples/batch_size)


def training_accuracy():
  """Return the fraction of rows whose rounded network output equals the label.

  Evaluates ``ol`` on the module-level ``features`` / ``lbls`` and divides the
  number of matching entries by the number of label rows.
  """
  predictions = sess.run(ol, feed_dict={input_data: features, labels: lbls})
  n_correct = np.count_nonzero(np.equal(np.round(predictions), lbls))
  return float(n_correct) / float(lbls.shape[0])


print("Initial training accuracy %f" % training_accuracy())


# Each epoch walks over consecutive minibatches of `batch_size` rows; the loss
# printed for an epoch is the one returned by that epoch's last minibatch.
n_rows = len(features)
for epoch_no in range(epochs):
  start = 0
  for _ in range(iterations):
    stop = start + batch_size
    batch_feed = {
      input_data: features[start:stop],
      labels: lbls[start:stop],
    }
    # Wrap around so the next slice never starts past the end of the data.
    start = stop % n_rows
    err = sess.run([train, loss], feed_dict=batch_feed)[1]
  print("Error at epoch ", epoch_no, ": ", err)
  print("  Training accuracy %f" % training_accuracy())

Also, please do not post usage questions like this on github, they belong here on StackOverflow.



回答2:

There are several things wrong with your code. First, you presumably meant to feed the minibatch rather than the whole data set:

    epoch_input = features[ptr:ptr+batch_size]
    epoch_label = lbls[ptr: ptr+batch_size]
    ptr = ptr + batch_size
    # Before (fed the whole data set on every step):
    # _, err = sess.run([train, loss], feed_dict={input_data: features, labels: lbls})
    # After (feed only the current minibatch):
    _, err = sess.run([train, loss], feed_dict={input_data: epoch_input, labels: epoch_label})

Now it uses minibatch.

Debugging the gradient:

You can always check some stuff by adding

# Wrap `loss` in tf.Print so the summed 'hl1' weights get printed.
loss = tf.Print(loss, [tf.reduce_sum(weights['hl1'])])

This will print the elements of that list, [tf.reduce_sum(weights['hl1'])]. To investigate your problem further, you can inspect the gradients directly instead of using minimize:

# Sum of the gradient of `loss` with respect to the output layer `ol`,
# evaluated on the full data set.
grads = tf.reduce_sum(tf.gradients(loss, ol)[0])
sess.run(grads, {input_data: features, labels: lbls})

And finally, the loss function is inappropriate and numerically unstable for classification. With your version, I get:

variables
   Variable:0
   Variable_1:0
   Variable_2:0
   Variable_3:0
   Variable_4:0
   Variable_5:0
   Variable_6:0
   Variable_7:0
I tensorflow/core/kernels/logging_ops.cc:79] [-6.2784553]
-----------------------------------------
name MatMul_grad
gradient [[ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]]
value [[-0.59977376 -0.30060738  0.55068201  0.15304407  1.39992142  0.07495346
  -0.87189424 -0.22595075 -0.30094525 -1.2688272 ]
 [-0.44018757  1.08651936 -0.26267499 -0.54463315  0.47019768  0.69873857
   0.56195319  0.20222363  0.38143152 -0.92212462]
 [-0.39977714 -1.07244122  0.41926911  1.4951371  -2.28751612  0.45676312
   0.88010246 -0.88077509 -1.25860023  0.56874037]
 [-0.98260719 -1.30747247 -1.4460088   1.0717535   0.08794415 -0.53184992
  -1.17537284 -0.51598179 -0.15323587  0.91142744]]
-----------------------------------------
name MatMul_1_grad
gradient [[ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]]
value [[-0.1170694   0.12174897  0.91696155  0.59427398  0.90844423  0.29010534
  -0.34039831 -0.62824941  0.37833953  0.27777222]
 [-0.34947088  1.09264851  0.27353975  1.31722498 -0.42032316 -2.74952078
  -0.66349608 -0.61844724 -0.82141227  1.21691799]
 [ 0.10453336 -1.68631995  0.45700032 -1.58120835 -1.23378754 -0.05648948
  -1.64761281 -0.57684237 -0.06499017 -0.49623618]
 [ 1.47821534 -0.5329541   0.09209292  1.78089786  1.71149898  0.30547267
   0.39544162  1.00369155  1.0097307  -0.92320329]
 [ 1.27038908 -2.17246103 -0.31276336  0.8945803   0.30964327  1.15329361
   0.9711507  -0.36301252 -0.05652813  0.63399518]
 [-0.30909851 -0.41660413 -0.50603527  0.11735299 -0.26837045  0.16547598
  -0.33875859 -0.46821991  0.25723135 -0.80380815]
 [-0.86255074 -1.11751068  0.01365725  0.66119182  0.48947951  1.6353699
  -0.794447    0.43182942 -0.97692633 -1.62605619]
 [ 1.38552308  0.83679706 -0.87287223  2.59401655 -0.61855     0.38301265
   1.09983373  0.49209142  1.03003716 -1.33537853]
 [ 0.74452382  1.57940936 -0.90974236 -1.2211293  -1.1076287   0.92846316
  -0.46856263 -0.3179535   0.75120807 -0.86442506]
 [ 0.31622764 -0.35965034 -0.02351121 -0.0650174   0.4714573   0.35687482
   1.43354905  0.39608309  0.42744714 -0.37226421]]
-----------------------------------------
name MatMul_2_grad
gradient [[ 0.  0.  0.  0.]
 [ 0.  0.  0.  0.]
 [ 0.  0.  0.  0.]
 [ 0.  0.  0.  0.]
 [ 0.  0.  0.  0.]
 [ 0.  0.  0.  0.]
 [ 0.  0.  0.  0.]
 [ 0.  0.  0.  0.]
 [ 0.  0.  0.  0.]
 [ 0.  0.  0.  0.]]
value [[-1.50904143  0.00228321  1.45787132  0.68312413]
 [-0.16627057  1.31303644  1.16326404  0.72901946]
 [ 0.8004092   0.37329885  0.89361066 -0.19850619]
 [ 1.58354807 -1.05612624  0.69891322 -0.32565734]
 [-1.57602286 -0.41256282  0.69086516 -0.54095054]
 [ 1.72376788 -0.53928965 -0.71574098 -0.94974124]
 [-0.62061429  1.51380932 -0.72585452 -0.07695383]
 [ 0.35537818  1.49691582  0.03931179  0.93435526]
 [ 0.20697887  1.39266443  0.73217523 -0.64737892]
 [ 1.00519872  0.90984046  1.68565321 -0.28157935]]
-----------------------------------------
name MatMul_3_grad
gradient [[ 0.]
 [ 0.]
 [ 0.]
 [ 0.]]
value [[ 0.94082022]
 [ 0.14753926]
 [-0.08765228]
 [ 1.32516992]]
-----------------------------------------
name Add_grad
gradient [ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]
value [ 1.71239722  1.12632215  0.75409448  0.01951236  0.32135537 -1.46281374
  0.40413955  0.54653352 -0.57894999  0.2746354 ]
-----------------------------------------
name Add_1_grad
gradient [ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]
value [ 0.74800217 -0.43517059 -0.77706921  1.46858656  1.09103405 -0.46681881
  0.6126743  -2.27877688  1.48809242 -1.19616997]
-----------------------------------------
name Add_2_grad
gradient [ 0.  0.  0.  0.]
value [-0.12137324 -0.23238407  0.17909229 -0.75496733]
-----------------------------------------
name Add_3_grad
gradient [ 0.]
value [-0.91176724]

As you see, almost all gradients are zero. Why?

  • by definition (labels - ol) is in [-1, 1], because each label is 0 or 1 and the sigmoid output ol lies between 0 and 1
  • the squared value is much smaller than one
  • the derivative of the sigmoid s(x) is s'(x) = s(x)*(1-s(x)); the gradients are multiplied by this value, which is at most 1/4 and approaches zero when the sigmoid saturates.

But after using sparse_softmax_cross_entropy_with_logits which is numerically stable and operates in the log-domain I get

variables
   Variable:0
   Variable_1:0
   Variable_2:0
   Variable_3:0
   Variable_4:0
   Variable_5:0
   Variable_6:0
   Variable_7:0
-----------------------------------------
name MatMul_grad
gradient [[ -1.42780918e-05  -1.96137808e-05  -2.44040220e-05  -2.25691911e-05
    0.00000000e+00   2.95208647e-05   0.00000000e+00   0.00000000e+00
    0.00000000e+00   0.00000000e+00]
 [ -2.54181440e-08  -3.49168410e-08  -4.34445262e-08  -4.01781257e-08
    0.00000000e+00   5.25536308e-08   0.00000000e+00   0.00000000e+00
    0.00000000e+00   0.00000000e+00]
 [ -2.45539122e-03  -3.37296468e-03  -4.19673882e-03  -3.88120394e-03
    0.00000000e+00   5.07667707e-03   0.00000000e+00   0.00000000e+00
    0.00000000e+00   0.00000000e+00]
 [ -1.42123906e-06  -1.95235293e-06  -2.42917258e-06  -2.24653377e-06
    0.00000000e+00   2.93850212e-06   0.00000000e+00   0.00000000e+00
    0.00000000e+00   0.00000000e+00]]
value [[ 0.43133125 -0.40009859 -0.08456381  0.59587955  0.57171088 -0.9824872
   1.18876612  0.9704771   0.74798232  0.15660612]
 [-1.18380785  0.22617982 -1.15734088 -0.50478351  1.43819618  1.55950046
  -1.1510663  -0.88835335  0.58378232  0.56860197]
 [ 0.29826403  0.02192715  0.62225986  2.47716165 -0.9223454   1.70159853
  -1.03968358 -0.26019615 -0.33808291 -0.30873826]
 [ 0.59774327 -1.28855145 -0.43420359 -0.4413566  -0.19220066  0.96984953
  -0.04922202  0.32994318 -1.05539823 -0.80112725]]
-----------------------------------------
name MatMul_1_grad
gradient [[  0.00000000e+00   1.15650124e-03   0.00000000e+00   0.00000000e+00
    6.59449317e-04  -1.09400018e-03   0.00000000e+00  -4.02117817e-04
    5.44495881e-04  -8.90314346e-04]
 [  0.00000000e+00   7.24206184e-05   0.00000000e+00   0.00000000e+00
    4.12950030e-05  -6.85067716e-05   0.00000000e+00  -2.51807924e-05
    3.40965707e-05  -5.57518724e-05]
 [  0.00000000e+00   2.38713808e-03   0.00000000e+00   0.00000000e+00
    1.36117137e-03  -2.25812919e-03   0.00000000e+00  -8.30012548e-04
    1.12389564e-03  -1.83770037e-03]
 [  0.00000000e+00   9.52679198e-03   0.00000000e+00   0.00000000e+00
    5.43227792e-03  -9.01193265e-03   0.00000000e+00  -3.31248436e-03
    4.48533799e-03  -7.33405072e-03]
 [  0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00
    0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00
    0.00000000e+00   0.00000000e+00]
 [  0.00000000e+00   6.51591457e-03   0.00000000e+00   0.00000000e+00
    3.71544389e-03  -6.16377220e-03   0.00000000e+00  -2.26559630e-03
    3.06777749e-03  -5.01617463e-03]
 [  0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00
    0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00
    0.00000000e+00   0.00000000e+00]
 [  0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00
    0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00
    0.00000000e+00   0.00000000e+00]
 [  0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00
    0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00
    0.00000000e+00   0.00000000e+00]
 [  0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00
    0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00
    0.00000000e+00   0.00000000e+00]]
value [[ 0.38902158 -2.14370036 -1.02228141 -0.6492967   1.87193418 -0.06453216
   1.0013988  -1.26857054  0.59826601  0.45045251]
 [ 0.51465249 -1.09108925 -0.21368918 -0.49310678 -0.87893176 -0.07944249
  -0.15810326  1.65703297  1.01812947 -0.95572269]
 [-1.76351583 -1.46950841  1.43533802  2.15617752  1.30682683  0.77409673
  -1.50309181  0.81978178  0.6672287  -0.434971  ]
 [-0.7291944   2.16516733 -1.39850736 -1.06059277  0.40035763  1.23335707
  -0.03707252  1.88107574  0.09459961  2.11439633]
 [-1.39152992 -1.39924514 -0.35704514 -0.71152836 -2.68857026  0.78129828
  -1.0077033  -1.26149333  0.4403404  -0.10159389]
 [ 0.37354535  0.12654085  0.7632165  -0.76493222  0.68177891 -0.34254205
  -1.11582613  2.60665917  1.53196526 -0.867055  ]
 [ 0.62746197 -0.01072595  3.26629376  1.28371656 -0.88725293  3.55530715
   0.67065352 -0.61927503  1.20604384 -0.87207574]
 [-0.68954837  1.89912283  0.90083456  0.02054735 -0.23425011  0.39949065
  -0.08969283 -0.75943565  1.0924015   0.28920195]
 [-0.64865923 -1.29299021 -0.39945969  0.02289505  1.46024895  0.94282049
  -0.99704605 -1.36124468  0.76788425  0.86770487]
 [ 0.63794595  1.68530416 -0.15548207 -0.22658408 -0.45446202 -0.77308726
  -0.12694608  1.17369819  2.25879693  0.20346723]]
-----------------------------------------
name MatMul_2_grad
gradient [[ 0.          0.          0.          0.        ]
 [-0.02205572  0.          0.00960038  0.        ]
 [ 0.          0.          0.          0.        ]
 [ 0.          0.          0.          0.        ]
 [-0.01932034  0.          0.00840973  0.        ]
 [-0.01617817  0.          0.00704201  0.        ]
 [ 0.          0.          0.          0.        ]
 [-0.05091252  0.          0.02216113  0.        ]
 [-0.0189826   0.          0.00826272  0.        ]
 [-0.01993647  0.          0.00867792  0.        ]]
value [[-0.18724969 -0.0544498  -0.69153035  0.47535184]
 [-0.75444973 -1.33321464 -0.13066645  1.56889391]
 [-0.6458627   1.17859495 -0.75926393  0.30138403]
 [ 1.0069555  -0.69344127  0.49295315  0.54917085]
 [-0.55954564 -1.13277721 -0.37167427 -0.64837182]
 [ 0.93753678  1.12197697  0.63789612  0.52438796]
 [ 0.77543265 -1.241382    1.78230286 -0.6928125 ]
 [ 0.95383584 -2.00331807  1.63409865 -0.36474878]
 [-0.73891008  2.066082   -0.94303596 -0.42322466]
 [ 0.38519588  0.03278512 -0.3487882  -1.50447905]]
-----------------------------------------
name MatMul_3_grad
gradient [[ 0.08460998]
 [ 0.        ]
 [ 0.16564058]
 [ 0.        ]]
value [[-0.35376808]
 [-0.07330427]
 [ 0.15398768]
 [-0.06484076]]
-----------------------------------------
name Add_grad
gradient [ -8.22783885e-09  -1.13025616e-08  -1.40629695e-08  -1.30056375e-08
   0.00000000e+00   1.70115797e-08   0.00000000e+00   0.00000000e+00
   0.00000000e+00   0.00000000e+00]
value [-1.00038147 -0.56519473  0.59372097 -1.1646167  -0.16213787 -0.69313556
  0.62788707  1.03768504  0.57876503 -0.5201084 ]
-----------------------------------------
name Add_1_grad
gradient [  0.00000000e+00   1.28705375e-08   0.00000000e+00   0.00000000e+00
   7.33891703e-09  -1.21749730e-08   0.00000000e+00  -4.47511184e-09
   6.05961770e-09  -9.90818183e-09]
value [ 0.02854451 -1.46039021 -0.03916361  0.40116394  0.16030532  0.88267213
 -0.46328214  0.18927227 -1.7536788  -0.46590349]
-----------------------------------------
name Add_2_grad
gradient [ -1.84504412e-08   0.00000000e+00   8.03108247e-09   0.00000000e+00]
value [ 0.94534302 -0.9080081  -1.86719894 -1.31547296]
-----------------------------------------
name Add_3_grad
gradient [ 0.29727879 -0.29727876]
value [ 0.07999782 -0.75647992]

This time the gradients are non-zero (although still very small). The code for reproducing this is

import numpy as np
import tensorflow as tf

# The 20 input rows from the question: area, bathrooms, price, sq_price.
features = [
[2104, 3, 399900, 190.066540],
[1600, 3, 329900, 206.187500],
[2400, 3, 369000, 153.750000],
[1416, 2, 232000, 163.841808],
[3000, 4, 539900, 179.966667],
[1985, 4, 299900, 151.083123],
[1534, 3, 314900, 205.280313],
[1427, 3, 198999, 139.452698],
[1380, 3, 212000, 153.623188],
[1494, 3, 242500, 162.315930],
[1940, 4, 239999, 123.710825],
[2000, 3, 347000, 173.500000],
[1890, 3, 329999, 174.602645],
[4478, 5, 699900, 156.297454],
[1268, 3, 259900, 204.968454],
[2300, 4, 449900, 195.608696],
[1320, 2, 299900, 227.196970],
[1236, 3, 199900, 161.731392],
[2609, 4, 499998, 191.643542],
[3031, 4, 599000, 197.624546]]

# One integer class id (0 or 1) per row.
lbls = [1,0,1,0,1,0,0,1,0,0,1,1,1,1,0,1,0,1,1,1]
features = np.array(features, dtype=np.float32)
lbls = np.array(lbls, dtype=np.int32)

n_examples = len(lbls)
epochs = 100
learning_rate = 0.1
batch_size = 1

# Labels are now a flat int32 vector of class ids instead of a float column.
input_data = tf.placeholder('float', [None, 4])
labels = tf.placeholder('int32', [None])

# NOTE(review): weights['ol'] is [4, 1] while biases['ol'] below is [2], so
# the two logits presumably come from broadcasting and differ only by their
# bias -- confirm whether [4, 2] was intended.
weights = {
            'hl1': tf.Variable(tf.random_normal([4, 10])),
            'hl2': tf.Variable(tf.random_normal([10, 10])),
            'hl3': tf.Variable(tf.random_normal([10, 4])),
            'ol': tf.Variable(tf.random_normal([4, 1]))
            }

biases = {
            'hl1': tf.Variable(tf.random_normal([10])),
            'hl2': tf.Variable(tf.random_normal([10])),
            'hl3': tf.Variable(tf.random_normal([4])),
            # 'ol': tf.Variable(tf.random_normal([1])),
            'ol': tf.Variable(tf.random_normal([2]))
            }

hl1 = tf.nn.relu(tf.add(tf.matmul(input_data, weights['hl1']), biases['hl1']))
hl2 = tf.nn.relu(tf.add(tf.matmul(hl1, weights['hl2']), biases['hl2']))
hl3 = tf.nn.relu(tf.add(tf.matmul(hl2, weights['hl3']), biases['hl3']))
# The output layer no longer applies a sigmoid; the raw logits go straight
# into the cross-entropy op below.
# ol = tf.nn.sigmoid(tf.add(tf.matmul(hl3, weights['ol']), biases['ol']))
logits = tf.add(tf.matmul(hl3, weights['ol']), biases['ol'])

# ol = tf.Print(ol, [tf.reduce_sum(weights['hl1'])])
# loss = tf.reduce_mean((labels - ol)**2)
# Per-example softmax cross-entropy, averaged into a scalar loss.
cost = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=labels)
# loss = tf.reduce_mean((labels - ol)**2)
loss = tf.reduce_mean(cost)
# The optimizer is only constructed here; debug_minimize() calls
# compute_gradients / apply_gradients on it.
optimizer = tf.train.AdamOptimizer(learning_rate)

iterations = int(n_examples/batch_size)

def debug_minimize(optimizer, loss, sess):
    """Print each trainable variable with its gradient and build the train op.

    Replaces ``optimizer.minimize(loss)`` for debugging: the gradients are
    computed explicitly, evaluated once on the module-level ``features`` /
    ``lbls`` and printed next to the current value of each variable.

    Args:
        optimizer: object providing ``compute_gradients`` and
            ``apply_gradients``.
        loss: tensor to differentiate.
        sess: session used to evaluate the (gradient, variable) pairs.

    Returns:
        The op returned by ``optimizer.apply_gradients``; it is built here but
        not run.
    """
    from tensorflow.python.ops import variables
    from tensorflow.python.framework import ops
    # get all variables
    var_list = (variables.trainable_variables() + ops.get_collection(ops.GraphKeys.TRAINABLE_RESOURCE_VARIABLES))
    # print() calls instead of Python 2 print statements, which are a
    # SyntaxError on Python 3; the printed text is the same.
    print('variables')
    for v in var_list:
        print('  ', v.name)
    # get all gradients
    grads_and_vars = optimizer.compute_gradients(loss)
    train_op = optimizer.apply_gradients(grads_and_vars)

    # Evaluate every (gradient, variable) pair in a single run.
    zipped_val = sess.run(grads_and_vars, {input_data: features, labels: lbls})

    for (grad_value, var_value), (grad_tensor, _) in zip(zipped_val, grads_and_vars):
        print('-----------------------------------------')
        # Strip the boilerplate from the gradient tensor's name for readability.
        print('name', grad_tensor.name.replace('/tuple/control_dependency_1:0', '').replace('gradients/', ''))
        print('gradient', grad_value)
        print('value', var_value)
    return train_op

# Initialise the variables, then print every gradient once; the train op
# returned by debug_minimize() is not run.
sess = tf.Session()
sess.run(tf.global_variables_initializer())
debug_minimize(optimizer, loss, sess)


回答3:

I'm not sure if this is the problem for you, but a sigmoid function's gradient can get very small if its input is too large in magnitude, and this can make updates very slow.

To check whether this is the case for you, try initializing all your weights to very small values. You can do this by setting a standard deviation for your random normals:

# Example: draw a [4, 10] weight matrix with an explicit standard deviation of 0.1.
tf.Variable(tf.random_normal([4, 10],  stddev=0.1))