This is a follow-up to my question posted here: Memory error with larger images when running convolutional neural network using TensorFlow on AWS instance g2.2xlarge
I built a CNN model in Python using TensorFlow and ran it on an NVIDIA GRID K520 GPU. It runs fine with 64x64 images, but produces a memory error with 128x128 images (even when the input consists of only 1 image).
The error says Ran out of memory trying to allocate 2.00GiB.
2GiB is the size of the weight matrix of my first fully-connected layer: (input: 128*128*2 (channels)) * (output: 128*128) * 4 bytes = 2,147,483,648 bytes = 2.14748 GB = 2.0 GiB.
From here, I can see that the GRID K520 has 8GB = 7.45GiB of memory. When I start running my code, I also see the output: Total memory: 3.94GiB, Free memory: 3.91GiB.
My question is: what is the relationship between all these numbers? If there are 7.45GiB of memory on the GPU, why is there only 3.94GiB of total memory and, most importantly, why can't the GPU allocate 2GiB of memory, which is just above half of the total memory? (I am not a computer scientist, so a detailed answer would be valuable.)
Some more specific information in case it is useful:
I tried using allow_growth and per_process_gpu_memory_fraction. I still get the memory error, but also some memory stats (I would really appreciate it if someone could explain these numbers to me):
allow_growth = True
Stats:
Limit: 3878682624
InUse: 2148557312
MaxInUse: 2148557312
NumAllocs: 13
MaxAllocSize: 2147483648
allow_growth = False
Stats:
Limit: 3878682624
InUse: 3878682624
MaxInUse: 3878682624
NumAllocs: 13
MaxAllocSize: 3877822976
per_process_gpu_memory_fraction = 0.5
allow_growth = False
Stats:
Limit: 2116026368
InUse: 859648
MaxInUse: 859648
NumAllocs: 12
MaxAllocSize: 409600
per_process_gpu_memory_fraction = 0.5
allow_growth = True
Stats:
Limit: 2116026368
InUse: 1073664
MaxInUse: 1073664
NumAllocs: 12
MaxAllocSize: 623616
Minimal working example, with a dummy training set of the same size as the images I input and only one fully-connected layer (full model code is here). This example works with input of size:
X_train = np.random.rand(1, 64, 64, 2)
Y_train = np.random.rand(1, 64, 64)
but doesn't work with input of size
X_train = np.random.rand(1, 128, 128, 2)
Y_train = np.random.rand(1, 128, 128)
Code:
import numpy as np
import tensorflow as tf
# Dummy training set: one random two-channel 128x128 input and one 128x128
# target, both drawn uniformly from [0, 1).
X_train = np.random.rand(1, 128, 128, 2)
Y_train = np.random.rand(1, 128, 128)

# Report each array's shape and total element count (ndarray.size is the
# product of all shape entries).
print('X_train.shape at input = ', X_train.shape, ", Size = ", X_train.size)
print('Y_train.shape at input = ', Y_train.shape, ", Size = ", Y_train.size)
def create_placeholders(n_H0, n_W0):
    """Create the float32 graph inputs for images of height n_H0 and width n_W0.

    Args:
        n_H0: Image height in pixels.
        n_W0: Image width in pixels.

    Returns:
        A tuple (x, y): `x` is a placeholder of shape (None, n_H0, n_W0, 2)
        named 'x', and `y` is a placeholder of shape (None, n_H0, n_W0)
        named 'y'. The leading None leaves the batch size unspecified.
    """
    target_shape = [None, n_H0, n_W0]
    # The input carries two channels per pixel; the target has none.
    input_shape = target_shape + [2]
    x = tf.placeholder(tf.float32, shape=input_shape, name='x')
    y = tf.placeholder(tf.float32, shape=target_shape, name='y')
    return x, y
def forward_propagation(x):
    """Build the forward pass: flatten `x` and apply one tanh dense layer.

    Args:
        x: Input tensor indexed as (n_im, n_H0, n_W0, channels). The height
            and width (axes 1 and 2) must be statically known because they
            determine the number of output units.

    Returns:
        Tensor of shape (n_im, n_H0, n_W0, 1) holding the layer output.
    """
    # Flatten every image into one vector: (n_im, n_H0 * n_W0 * 2).
    x_temp = tf.contrib.layers.flatten(x)
    # One output unit per pixel: n_H0 * n_W0. The builtin int is used because
    # the np.int alias was deprecated in NumPy 1.20 and removed in 1.24.
    n_out = int(x.shape[1] * x.shape[2])
    # FC: input size (n_im, n_H0 * n_W0 * 2), output size (n_im, n_H0 * n_W0).
    # The weight matrix therefore has (n_H0 * n_W0 * 2) * (n_H0 * n_W0)
    # entries, which grows with the fourth power of the image side length.
    FC1 = tf.contrib.layers.fully_connected(
        x_temp,
        n_out,
        activation_fn=tf.tanh,
        normalizer_fn=None,
        normalizer_params=None,
        weights_initializer=tf.contrib.layers.xavier_initializer(),
        weights_regularizer=None,
        biases_initializer=None,  # no bias term
        biases_regularizer=None,
        # NOTE(review): reuse=True normally expects the 'fc1' variables to
        # exist already -- confirm this is intended for a first-time build.
        reuse=True,
        variables_collections=None,
        outputs_collections=None,
        trainable=True,
        scope='fc1')
    # Reshape output from FC layer into array of size (n_im, n_H0, n_W0, 1).
    # The dynamic shape is used so the batch size may stay unknown at
    # graph-construction time.
    dynamic_shape = tf.shape(x)
    FC_M = tf.reshape(
        FC1, [dynamic_shape[0], dynamic_shape[1], dynamic_shape[2], 1])
    return FC_M
def compute_cost(FC_M, Y):
    """Return the element-wise squared error between prediction and target.

    Args:
        FC_M: Predicted tensor, e.g. of shape (n_im, n_H0, n_W0, 1).
        Y: Target tensor, either of the same rank as `FC_M` or missing the
            trailing axis, e.g. of shape (n_im, n_H0, n_W0).

    Returns:
        Tensor of squared differences with the shape of `FC_M`; it is not
        reduced to a scalar.
    """
    pred_rank = FC_M.shape.ndims
    target_rank = Y.shape.ndims
    # Broadcasting right-aligns shapes, so (n, H, W, 1) - (n, H, W) would
    # silently expand to (n, H, W, W) when n == 1 and H == W, pairing each
    # predicted pixel with a whole row of targets. Give the target a matching
    # trailing axis so the difference is taken pixel by pixel.
    if (pred_rank is not None and target_rank is not None
            and pred_rank == target_rank + 1):
        Y = tf.expand_dims(Y, -1)
    cost = tf.square(FC_M - Y)
    return cost
def model(X_train, Y_train, learning_rate=0.0001, num_epochs=100):
    """Build the graph and train it on the given arrays, printing the cost.

    Args:
        X_train: 4-D input array; axes 1 and 2 give the image height and
            width used to size the placeholders.
        Y_train: Target array fed to the `y` placeholder.
        learning_rate: Step size passed to the RMSProp optimizer.
        num_epochs: Number of training steps; each step feeds the full
            arrays once.

    Returns:
        None. The mean cost of every epoch is printed.
    """
    # Only the spatial dimensions are needed to size the placeholders.
    _, height, width, _ = X_train.shape

    # Graph construction: placeholders -> forward pass -> cost -> train op.
    inputs, targets = create_placeholders(height, width)
    prediction = forward_propagation(inputs)
    cost = compute_cost(prediction, targets)
    train_step = tf.train.RMSPropOptimizer(learning_rate).minimize(cost)

    # Op that initializes every variable created above, optimizer slots
    # included.
    init_op = tf.global_variables_initializer()

    # Let TensorFlow grow its GPU allocation on demand.
    session_config = tf.ConfigProto()
    session_config.gpu_options.allow_growth = True

    # The same full batch is fed on every epoch.
    feed = {inputs: X_train, targets: Y_train}

    with tf.Session(config=session_config) as sess:
        sess.run(init_op)
        for epoch in range(num_epochs):
            fetched = sess.run([train_step, cost], feed_dict=feed)
            temp_cost = fetched[1]
            print('EPOCH = ', epoch, 'COST = ', np.mean(temp_cost))
# Entry point: train on the dummy data for five epochs with a 2e-5 step size.
model(X_train=X_train, Y_train=Y_train, num_epochs=5, learning_rate=2e-5)
Traceback:
W tensorflow/core/common_runtime/bfc_allocator.cc:274] ****************************************************************************************************
W tensorflow/core/common_runtime/bfc_allocator.cc:275] Ran out of memory trying to allocate 2.00GiB. See logs for memory state.
W tensorflow/core/framework/op_kernel.cc:983] Internal: Dst tensor is not initialized.
E tensorflow/core/common_runtime/executor.cc:594] Executor failed to create kernel. Internal: Dst tensor is not initialized.
[[Node: zeros = Const[dtype=DT_FLOAT, value=Tensor<type: float shape: [32768,16384] values: [0 0 0]...>, _device="/job:localhost/replica:0/task:0/gpu:0"]()]]
Traceback (most recent call last):
File "myAutomap_MinExample.py", line 99, in <module>
num_epochs=5)
File "myAutomap_MinExample.py", line 85, in model
sess.run(init)
File "/home/ubuntu/.local/lib/python2.7/site-packages/tensorflow/python/client/session.py", line 767, in run
run_metadata_ptr)
File "/home/ubuntu/.local/lib/python2.7/site-packages/tensorflow/python/client/session.py", line 965, in _run
feed_dict_string, options, run_metadata)
File "/home/ubuntu/.local/lib/python2.7/site-packages/tensorflow/python/client/session.py", line 1015, in _do_run
target_list, options, run_metadata)
File "/home/ubuntu/.local/lib/python2.7/site-packages/tensorflow/python/client/session.py", line 1035, in _do_call
raise type(e)(node_def, op, message)
tensorflow.python.framework.errors_impl.InternalError: Dst tensor is not initialized.
[[Node: zeros = Const[dtype=DT_FLOAT, value=Tensor<type: float shape: [32768,16384] values: [0 0 0]...>, _device="/job:localhost/replica:0/task:0/gpu:0"]()]]
Caused by op u'zeros', defined at:
File "myAutomap_MinExample.py", line 99, in <module>
num_epochs=5)
File "myAutomap_MinExample.py", line 72, in model
optimizer = tf.train.RMSPropOptimizer(learning_rate).minimize(cost)
File "/home/ubuntu/.local/lib/python2.7/site-packages/tensorflow/python/training/optimizer.py", line 289, in minimize
name=name)
File "/home/ubuntu/.local/lib/python2.7/site-packages/tensorflow/python/training/optimizer.py", line 403, in apply_gradients
self._create_slots(var_list)
File "/home/ubuntu/.local/lib/python2.7/site-packages/tensorflow/python/training/rmsprop.py", line 103, in _create_slots
self._zeros_slot(v, "momentum", self._name)
File "/home/ubuntu/.local/lib/python2.7/site-packages/tensorflow/python/training/optimizer.py", line 647, in _zeros_slot
named_slots[var] = slot_creator.create_zeros_slot(var, op_name)
File "/home/ubuntu/.local/lib/python2.7/site-packages/tensorflow/python/training/slot_creator.py", line 121, in create_zeros_slot
val = array_ops.zeros(primary.get_shape().as_list(), dtype=dtype)
File "/home/ubuntu/.local/lib/python2.7/site-packages/tensorflow/python/ops/array_ops.py", line 1352, in zeros
output = constant(zero, shape=shape, dtype=dtype, name=name)
File "/home/ubuntu/.local/lib/python2.7/site-packages/tensorflow/python/framework/constant_op.py", line 103, in constant
attrs={"value": tensor_value, "dtype": dtype_value}, name=name).outputs[0]
File "/home/ubuntu/.local/lib/python2.7/site-packages/tensorflow/python/framework/ops.py", line 2327, in create_op
original_op=self._default_original_op, op_def=op_def)
File "/home/ubuntu/.local/lib/python2.7/site-packages/tensorflow/python/framework/ops.py", line 1226, in __init__
self._traceback = _extract_stack()
InternalError (see above for traceback): Dst tensor is not initialized.
[[Node: zeros = Const[dtype=DT_FLOAT, value=Tensor<type: float shape: [32768,16384] values: [0 0 0]...>, _device="/job:localhost/replica:0/task:0/gpu:0"]()]]