Following the tutorial on TensorFlow, I am trying to understand run-time statistics using tensorboard.
I find that the compute time of a high-level node representing a name scope is not equal to the sum of the compute times of its sub-nodes. Why are they not the same?
For example, in the attached snapshot:
- Compute time of ConvLayer2 = 75.5 ms, while
- Sum of the sub-nodes' compute times = 55.2 (conv) + 1.73 (add) + 1 (other nodes) = 57.9 ms
Snapshot of ConvLayer2
import numpy as np
import tensorflow as tf
from tensorflow.examples.tutorials.mnist import input_data
# Build the whole model inside an explicit graph. The session further down is
# created while this graph is still the default, so it executes exactly these ops.
# NOTE(review): the indentation / name-scope nesting below was reconstructed from
# the comments of a flattened listing -- confirm it matches the intended scopes.
g = tf.Graph()
with g.as_default():

    def conv2d(x, W):
        """Return the 2-D convolution of `x` with filter `W` (stride 1, SAME padding)."""
        return tf.nn.conv2d(x, W, strides=[1, 1, 1, 1], padding="SAME")

    def max_pool_2x2(x):
        """Return `x` max-pooled with a 2x2 window and stride 2 (SAME padding)."""
        return tf.nn.max_pool(x, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding="SAME")

    mnist = input_data.read_data_sets('MNIST_data', one_hot=True)

    x = tf.placeholder(tf.float32, shape=[100, 784], name="X_input")  # Input layer
    y_ = tf.placeholder(tf.float32, shape=[100, 10], name="Y_labels")

    # Reshape input vector into a 4d tensor
    x_image = tf.reshape(x, [-1, 28, 28, 1])

    # Layer 1
    with tf.name_scope('ConvLayer1'):
        W_conv1 = tf.Variable(tf.truncated_normal([5, 5, 1, 32], stddev=0.1), name="Weights_L1")
        b_conv1 = tf.Variable(tf.constant(0.1, shape=[32]), name="Bias_L1")
        h_conv1 = tf.nn.relu(conv2d(x_image, W_conv1) + b_conv1)
        h_pool1 = max_pool_2x2(h_conv1)

    # Layer 2
    with tf.name_scope('ConvLayer2'):
        W_conv2 = tf.Variable(tf.truncated_normal([5, 5, 32, 64], stddev=0.1), name="Weights_L2")
        b_conv2 = tf.Variable(tf.constant(0.1, shape=[64]), name="Bias_L2")
        h_conv2 = tf.nn.relu(conv2d(h_pool1, W_conv2) + b_conv2)
        h_pool2 = max_pool_2x2(h_conv2)

    # Layer 3 : Fully Connected Layer
    with tf.name_scope('FullyConnectLayer1'):
        w_fc1 = tf.Variable(tf.truncated_normal([7*7*64, 1024], stddev=0.1), name="Weights_fc1")
        b_fc1 = tf.Variable(tf.constant(0.1, shape=[1024]), name="Bias_fc1")
        # Flatten
        h_pool2_flat = tf.reshape(h_pool2, [-1, 7*7*64])
        h_fc1 = tf.nn.relu(tf.matmul(h_pool2_flat, w_fc1) + b_fc1)

    # Dropout to reduce overfitting
    with tf.name_scope('performDropout'):
        keep_probability = tf.placeholder(tf.float32)
        h_fc1_drop = tf.nn.dropout(h_fc1, keep_probability)

    # Layer 4: Readout layer
    with tf.name_scope('FullyConnectLayer2'):
        w_fc2 = tf.Variable(tf.truncated_normal([1024, 10], stddev=0.1), name="Weights_fc2")
        b_fc2 = tf.Variable(tf.constant(0.1, shape=[10]), name="Bias_fc2")
        y_out = tf.matmul(h_fc1_drop, w_fc2) + b_fc2

    # loss function
    with tf.name_scope('xEntropy'):
        loss_crossEntropy = tf.reduce_mean(
            tf.nn.softmax_cross_entropy_with_logits(labels=y_, logits=y_out), name="xent")
    tf.summary.scalar("xEntropy", loss_crossEntropy)

    with tf.name_scope('Train_AdamOptim'):
        optimizer = tf.train.AdamOptimizer(learning_rate=1e-4)
        train_step = optimizer.minimize(loss_crossEntropy)

    with tf.name_scope('accuracy'):
        correct_predict = tf.equal(tf.argmax(y_out, 1), tf.argmax(y_, 1))
        accuracy = tf.reduce_mean(tf.cast(correct_predict, tf.float32))
    tf.summary.scalar("accuracy", accuracy)

    # Merge all summary ops into a single op
    summary = tf.summary.merge_all()

    # Operation: Initialize variables
    var_init = tf.global_variables_initializer()

    #### Add trace and metadata calls.
    run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
    run_metadata = tf.RunMetadata()

    # Run the computational graph
    with tf.Session() as sess:
        # Initialize the variables that were created while building the computational graph
        sess.run(var_init)

        # Write the current session graph events onto a file using summary FileWriter.
        # To be visualized in TensorBoard.
        writer = tf.summary.FileWriter("/tmp/mnist_demo/1")
        writer.add_graph(sess.graph)
        try:
            # Run the training step "required" number of times -- here, 20000 timesteps
            for i in range(101):  # 20000
                batch = mnist.train.next_batch(100)

                # Training
                sess.run(train_step,
                         feed_dict={x: batch[0], y_: batch[1], keep_probability: 0.5})

                # validation
                if i % 50 == 0:
                    # Only this run is traced; its timing statistics are stored in
                    # run_metadata and attached to the event file under 'step <i>'.
                    [val_accuracy, s] = sess.run(
                        [accuracy, summary],
                        feed_dict={x: mnist.validation.images[0:100, :],
                                   y_: mnist.validation.labels[0:100, :],
                                   keep_probability: 1.0},
                        options=run_options,
                        run_metadata=run_metadata)
                    writer.add_run_metadata(run_metadata, 'step %d' % i)
                    writer.add_summary(s, i)
                    print("step %d, validation accuracy %g" % (i, val_accuracy))
        finally:
            # Flush pending events to disk even if training is interrupted.
            writer.close()
The TensorFlow tutorial doesn't provide any information on how the compute time of a high-level node is aggregated. Any help is greatly appreciated.