I'm trying to find out why the code below occurs a huge memory leak during iteration by iteration. Here's the whole code.
def train_network(file_folder, file_list, hm_epochs, batch_size):
num_files = len(file_list)
with g.as_default():
input_image = tf.placeholder(tf.float32, shape=[1, 40, 200, 300, 3])
y1 = tf.placeholder(tf.int32)
y2 = tf.placeholder(tf.float32)
class_logit, highlight_logit = convolutional_neural_network(input_image)
class_loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=class_logit, labels=y1))
highlight_loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=highlight_logit, labels=y2))
optimizer1 = tf.train.RMSPropOptimizer(learning_rate=1e-6).minimize(class_loss, centered=True)
optimizer2 = tf.train.RMSPropOptimizer(learning_rate=1e-7).minimize(highlight_loss, centered=True)
#### Saving Network ####
with tf.Session(graph=g) as sess:
saver = tf.train.Saver(max_to_keep = 3)
sess.run(tf.global_variables_initializer())
for epoch in xrange(hm_epochs):
epoch_loss = 0
for idx in xrange(num_files):
_file = file_folder + '/' + file_list[idx]
X_total, Y1_class, Y2_score = read_as_batch(_file)
n_batch = int(X_total.shape[0]/batch_size)
for i in xrange(n_batch):
batch_X = get_batch_piece(X_total, batch_size, i)
batch_Y1 = get_batch_piece(Y1_class, batch_size, i)
batch_Y2 = get_batch_piece(Y2_score, batch_size, i)
_, _, a, b, c, d = sess.run([optimizer1, optimizer2, class_loss, highlight_loss, tf.gather(class_logit, 0), tf.gather(highlight_logit, 0)], feed_dict={input_image: batch_X, y1: batch_Y1, y2: batch_Y2})
result = float(a) + float(b)
del a, b, batch_X, batch_Y1, batch_Y2
epoch_loss += result
del c, d
gc.collect()
ckpt_path = saver.save(sess, "saved/train", epoch)
And the below is memory profiler result. I figured out the functions read_as_batch and get_batch_piece are not the reason of the memory leaking by several experiments.
Line # Mem usage Increment Line Contents
35 215.758 MiB 0.000 MiB @profile
36 def train_network(file_folder, file_list, hm_epochs, batch_size):
37
38 215.758 MiB 0.000 MiB num_files = len(file_list)
44 215.758 MiB 0.000 MiB with g.as_default():
45
46 216.477 MiB 0.719 MiB input_image = tf.placeholder(tf.float32, shape=[1, 40, 200, 300, 3])
47 216.477 MiB 0.000 MiB y1 = tf.placeholder(tf.int32)
48 216.477 MiB 0.000 MiB y2 = tf.placeholder(tf.float32)
49
50 220.199 MiB 3.723 MiB class_logit, highlight_logit = convolutional_neural_network(input_image)
51
52 220.711 MiB 0.512 MiB class_loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=class_logit, labels=y1))
54 220.953 MiB 0.242 MiB highlight_loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=highlight_logit, labels=y2))
55
56 227.562 MiB 6.609 MiB optimizer1 = tf.train.RMSPropOptimizer(learning_rate=1e-6).minimize(class_loss)
57 234.062 MiB 6.500 MiB optimizer2 = tf.train.RMSPropOptimizer(learning_rate=1e-7).minimize(highlight_loss)
58
59 #### Saving Network ####
60 660.691 MiB 426.629 MiB with tf.Session(graph=g) as sess:
62 666.848 MiB 6.156 MiB saver = tf.train.Saver(max_to_keep = 3)
63 1183.676 MiB 516.828 MiB sess.run(tf.global_variables_initializer())
67 1642.145 MiB 458.469 MiB for epoch in xrange(hm_epochs):
68 1642.145 MiB 0.000 MiB epoch_loss = 0
69 1642.145 MiB 0.000 MiB file_list_ = iter(file_list)
71 #for idx in xrange(num_files):
74 1642.145 MiB 0.000 MiB _file = file_folder + '/' + file_list_.next()
77 1779.477 MiB 137.332 MiB data = np.load(_file)
78 # Batch Data Generation
79 1916.629 MiB 137.152 MiB X_total = np.array([data[0][0][0], data[0][0][1], ...])
81 # Class, Score Data Fetching
82 1916.629 MiB 0.000 MiB Y1_class = data[0][1][0]
83 1916.629 MiB 0.000 MiB Y2_score = data[0][2][0]
85 1916.629 MiB 0.000 MiB batch_X = get_batch_piece(X_total, 1, 1)
86 1916.629 MiB 0.000 MiB batch_Y1 = get_batch_piece(Y1_class, 1, 1)
87 1916.629 MiB 0.000 MiB batch_Y2 = get_batch_piece(Y2_score, 1, 1)
88 1916.805 MiB 0.176 MiB _ = sess.run([optimizer1], feed_dict={input_image: batch_X, y1: batch_Y1, y2: batch_Y2})
89
90 1642.145 MiB -274.660 MiB del data, X_total, Y1_class, Y2_score, batch_X, batch_Y1, batch_Y2, optimizer1
To improve readability, I shorten the code. Even the memory profiling result is little bit different to the original code, it is the same and occurs the same problem (memory leaking). The fact is when I remove the sess.run([optimizer1], ...), the code does not leak the memory even the epoch is over 100. However, the case I run the session, the memory usage gets bigger and bigger so I cannot do train anymore even with the epoch 5.
I need your help. Thank you.
The reason is you create new tensorflow operations on every session call.
Move these two out of for loop
tf.gather(class_logit, 0), tf.gather(highlight_logit, 0)
, and problem should gone.