I just started learning TensorFlow and was training on some data to play around with regularization. One strange thing I noticed is that once I apply a regularization term to the loss function, the per-step training time grows rapidly over time. I found two posts that looked similar, but neither turned out to be helpful:
Processing time gets longer and longer after each iteration (TensorFlow)
TensorFlow: slow performance when getting gradients at inputs
Am I missing something? See my code below.
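To make the comparison explicit up front: the only line that differs between the slow and the fast runs shown below is the loss. Both variants are copied from the full listing, so logits, w1, w2, beta and tf_train_labels refer to the names defined there.

# Slow: cross-entropy plus an L2 penalty on both weight matrices, scaled by 1/beta
loss = tf.reduce_mean(
    tf.nn.softmax_cross_entropy_with_logits(logits, tf_train_labels)
    + 1.0 / beta * (tf.nn.l2_loss(w1) + tf.nn.l2_loss(w2)))

# Fast: plain cross-entropy, no L2 term
loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits, tf_train_labels))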
data = TrainingData("test")
dataset, labels = data.get_training_set()
dataset = np.array(dataset).astype(np.float32)
labels = np.array(labels).astype(np.float32)
train_dataset, valid_dataset, train_labels, valid_labels = train_test_split(dataset, labels,
test_size=0.33)
print("Beta: {0}".format(beta))
batch_size = 100
beta = np.float32(20.0)
graph = tf.Graph()
with graph.as_default():
input_size = len(train_dataset[0])
output_size = len(labels[0])
hidden_nodes = int((input_size + output_size)/2)
w1 = tf.Variable(tf.truncated_normal([input_size, hidden_nodes]))
b1 = tf.Variable(tf.zeros([hidden_nodes]))
w2 = tf.Variable(tf.truncated_normal([hidden_nodes, output_size]))
b2 = tf.Variable(tf.zeros([output_size]))
def forward_prop(dataset, w1, b1, w2, b2):
# global w1, b1, w2, b2
h1 = tf.nn.relu(tf.matmul(dataset, w1) + b1)
return tf.matmul(h1, w2) + b2
# Input data. For the training data, we use a placeholder that will be fed
# at run time with a training minibatch.
tf_train_dataset = tf.placeholder(tf.float32, shape=(batch_size, input_size))
tf_train_labels = tf.placeholder(tf.float32, shape=(batch_size, output_size))
tf_final_train_data_set = tf.constant(train_dataset)
tf_valid_dataset = tf.constant(valid_dataset)
tf_valid_dataset = tf.cast(tf_valid_dataset, tf.float32)
# Training computation.
logits = forward_prop(tf_train_dataset, w1, b1, w2, b2)
loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits, tf_train_labels) + 1.0 / beta * (tf.nn.l2_loss(w1) + tf.nn.l2_loss(w2)) )
# Optimizer.
# optimizer = tf.train.GradientDescentOptimizer(0.5).minimize(loss)
optimizer = tf.train.AdamOptimizer().minimize(loss)
# Predictions for the training, validation, and test data.
train_prediction = tf.nn.softmax(logits)
final_train_prediction = tf.nn.softmax(forward_prop(tf_final_train_data_set, w1, b1, w2, b2))
valid_prediction = tf.nn.softmax(forward_prop(tf_valid_dataset, w1, b1, w2, b2))
def accuracy(predictions, labels):
return (100.0 * np.sum(np.argmax(predictions, 1) == np.argmax(labels, 1)) / predictions.shape[0])
num_steps = 3000
# todo training time increases over time, need to find out why
with tf.Session(graph=graph) as session:
tf.initialize_all_variables().run()
graph.finalize()
print("Initialized")
start = time.time()
for step in range(num_steps):
# Pick an offset within the training data, which has been randomized.
# Note: we could use better randomization across epochs.
offset = (step * batch_size) % (train_labels.shape[0] - batch_size)
# print(offset, len(train_dataset))
# Generate a minibatch.
batch_data = train_dataset[offset:(offset + batch_size), :]
batch_labels = train_labels[offset:(offset + batch_size), :]
# Prepare a dictionary telling the session where to feed the minibatch.
# The key of the dictionary is the placeholder node of the graph to be fed,
# and the value is the numpy array to feed to it.
feed_dict = {tf_train_dataset : batch_data, tf_train_labels : batch_labels}
_, l, predictions = session.run(
[optimizer, loss, train_prediction], feed_dict=feed_dict)
if (step % 200 == 0):
stop = time.time()
duration = stop - start
print("Time elapsed: {0}".format(duration))
start = time.time()
# print("Minibatch loss at step %d: %f" % (step, l))
# print("Minibatch accuracy: %.1f%%" % accuracy(predictions, batch_labels))
# print("Validation accuracy: %.1f%%" % accuracy(valid_prediction.eval(), valid_labels))
print("Training accuracy: %.1f%%" % accuracy(final_train_prediction.eval(), train_labels))
print("Validation accuracy: %.1f%%" % accuracy(valid_prediction.eval(), valid_labels))
The printout looks like this:
Beta: 20
Initialized
Time elapsed: 0.0639870166779
Time elapsed: 7.24627017975
Time elapsed: 7.11730599403
Time elapsed: 7.07944703102
Time elapsed: 7.22844195366
Time elapsed: 7.93321800232
Time elapsed: 8.07225179672
Time elapsed: 8.5860388279
Time elapsed: 20.4446599483
Time elapsed: 58.4914281368
Time elapsed: 90.9370441437
Time elapsed: 116.852982998
Time elapsed: 141.120803118
If I set beta to 1, I get something similar:
Beta: 1
Initialized
Time elapsed: 0.0859398841858
Time elapsed: 9.63061714172
Time elapsed: 8.11608719826
Time elapsed: 9.0208530426
Time elapsed: 9.21532988548
Time elapsed: 8.46498990059
Time elapsed: 9.67137289047
Time elapsed: 9.01787590981
Time elapsed: 20.2034230232
Time elapsed: 64.7520048618
Time elapsed: 90.5053529739
Time elapsed: 118.591227055
But if I remove the l2_loss term completely, like below:
loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits, tf_train_labels))
My training time becomes normal:
Initialized
Time elapsed: 0.0581669807434
Time elapsed: 8.1517791748
Time elapsed: 7.20353102684
Time elapsed: 7.19430088997
Time elapsed: 9.12145781517
Time elapsed: 7.9555811882
Time elapsed: 7.5981760025
Time elapsed: 7.70339202881
Time elapsed: 7.48909091949
Time elapsed: 8.91624116898
Time elapsed: 7.94902992249
Time elapsed: 7.71093988419
Time elapsed: 10.0522530079
Time elapsed: 7.7255730629
Time elapsed: 7.6657538414
Training accuracy: 97.1%
Validation accuracy: 55.5%
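One way I plan to narrow this down further (just a sketch, reusing the names from the listing above): time only the session.run call inside the training loop, so the Python-side minibatch slicing and feed_dict construction are excluded from the measurement.

# Drop-in replacement for the run call inside the training loop above.
run_start = time.time()
_, l, predictions = session.run(
    [optimizer, loss, train_prediction], feed_dict=feed_dict)
run_duration = time.time() - run_start
if step % 200 == 0:
    print("session.run time at step {0}: {1:.4f}s".format(step, run_duration))

If the slowdown still shows up there, the extra time is being spent inside TensorFlow itself rather than in my Python code around it.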