I have built a convolutional LSTM model using TensorFlow's ConvLSTMCell(), tf.nn.dynamic_rnn(), and tf.contrib.legacy_seq2seq.rnn_decoder(). The model has three encoder layers and three decoder layers; the initial state of each decoder layer comes from the final state of the corresponding encoder layer. Layers 1, 2, and 3 have 128, 64, and 64 filters, respectively. Finally, I concatenate the decoder outputs and pass them through a convolution layer to reduce the number of channels to one, and then I apply the loss function. My dataset is Moving MNIST, in which each sequence has 20 frames; with this model I am trying to predict frames 11 to 20 from the first 10 frames. However, the output, which is a 10-frame sequence, is far from the ground truth and essentially tries to reproduce the last input frame (the 10th frame). The code is below. Thanks for your help.
import matplotlib
matplotlib.use('Agg')
from matplotlib import pyplot as plt
import tensorflow as tf
import numpy as np
from sklearn.metrics import confusion_matrix
import time
from datetime import timedelta
import math
import random
from random import getrandbits
from tensorflow.contrib.rnn.python.ops.rnn_cell import ConvLSTMCell
from tensorflow.python.ops.rnn_cell import LSTMStateTuple
# Reset the global default graph before any model node is created.
tf.reset_default_graph()

# --- Frame geometry ----------------------------------------------------
img_size = 64
num_channels = 1
img_size_flat = img_size * img_size

# --- ConvLSTM hyper-parameters, one value per stacked layer -------------
# Side length of the square convolution kernel of each layer.
filter_size1 = 5
filter_size2 = 5
filter_size3 = 5
# Number of filters (output channels) of each layer.
num_filters1 = 128
num_filters2 = 64
num_filters3 = 64

# --- Graph inputs -------------------------------------------------------
# x holds the input frame sequence; its two leading dimensions are left
# unspecified.
x = tf.placeholder(
    tf.float32,
    shape=[None, None, img_size, img_size, num_channels],
    name='x',
)

# y and z hold flattened frames (one row per frame); each is also exposed
# reshaped back into image form.
y = tf.placeholder(tf.float32, shape=[None, img_size_flat], name='y')
y_image = tf.reshape(
    y, [-1, img_size, img_size, num_channels], name='y_image')

z = tf.placeholder(tf.float32, shape=[None, img_size_flat], name='z')
z_image = tf.reshape(
    z, [-1, img_size, img_size, num_channels], name='z_image')
# Encoder: three stacked ConvLSTM layers. Each layer is unrolled over the
# sequence with tf.nn.dynamic_rnn and consumes the per-step outputs of the
# layer below it. Every layer starts from a zero state built for a batch
# size of 1.
with tf.variable_scope("Encoder"):
    with tf.variable_scope("Encoder_Layer1"):
        InputShape = [img_size, img_size, num_channels]
        encoder_1_KernelShape = [filter_size1, filter_size1]
        rnn_cell = ConvLSTMCell(
            conv_ndims=2,
            input_shape=InputShape,
            output_channels=num_filters1,
            kernel_shape=encoder_1_KernelShape,
            use_bias=True,
            forget_bias=1.0,
            name='Encoder_1',
        )
        # Zero initial state; the batch size is fixed to 1 here.
        initial_state = rnn_cell.zero_state(1, dtype=tf.float32)
        encoder_1_outputs, encoder_1_state = tf.nn.dynamic_rnn(
            cell=rnn_cell,
            inputs=x,
            initial_state=initial_state,
            dtype=tf.float32,
        )

    with tf.variable_scope("Encoder_Layer2"):
        # The input channel count equals the filter count of layer 1.
        Encoder_2_InputShape = [img_size, img_size, num_filters1]
        encoder_2_KernelShape = [filter_size2, filter_size2]
        encoder_2_cell = ConvLSTMCell(
            conv_ndims=2,
            input_shape=Encoder_2_InputShape,
            output_channels=num_filters2,
            kernel_shape=encoder_2_KernelShape,
            use_bias=True,
            forget_bias=1.0,
            name='Encoder_2',
        )
        initial_state_2 = encoder_2_cell.zero_state(1, dtype=tf.float32)
        encoder_2_outputs, encoder_2_state = tf.nn.dynamic_rnn(
            cell=encoder_2_cell,
            inputs=encoder_1_outputs,
            initial_state=initial_state_2,
            dtype=tf.float32,
        )

    with tf.variable_scope("Encoder_Layer3"):
        # The input channel count equals the filter count of layer 2.
        Encoder_3_InputShape = [img_size, img_size, num_filters2]
        encoder_3_KernelShape = [filter_size3, filter_size3]
        encoder_3_cell = ConvLSTMCell(
            conv_ndims=2,
            input_shape=Encoder_3_InputShape,
            output_channels=num_filters3,
            kernel_shape=encoder_3_KernelShape,
            use_bias=True,
            forget_bias=1.0,
            name='Encoder_3',
        )
        initial_state_3 = encoder_3_cell.zero_state(1, dtype=tf.float32)
        encoder_3_outputs, encoder_3_state = tf.nn.dynamic_rnn(
            cell=encoder_3_cell,
            inputs=encoder_2_outputs,
            initial_state=initial_state_3,
            dtype=tf.float32,
        )
# Weights function
def new_weights(shape, name):
    """Return the variable `name` with the given `shape`.

    The variable is obtained through tf.get_variable, so it is looked up
    in the variable scope that is active at the call site. Its initializer
    is a truncated normal with mean 0.0 and standard deviation 0.05.
    """
    initializer = tf.truncated_normal_initializer(mean=0.0, stddev=0.05)
    return tf.get_variable(name=name, shape=shape, initializer=initializer)
# Convolution function
def conv_layer(input, num_input_channels, filter_size, num_filters):
    """Apply a stride-1, SAME-padded 2-D convolution without activation.

    Args:
        input: the previous layer.
        num_input_channels: number of channels in the previous layer.
        filter_size: width and height of each (square) filter.
        num_filters: number of filters, i.e. output channels.

    Returns:
        The raw convolution output; no bias and no non-linearity are
        applied.
    """
    # The kernel is always named 'ConvLayer_Weights'; callers separate or
    # share it through the variable scope they call this function in.
    kernel = new_weights(
        shape=[filter_size, filter_size, num_input_channels, num_filters],
        name='ConvLayer_Weights',
    )
    return tf.nn.conv2d(
        input=input,
        filter=kernel,
        strides=[1, 1, 1, 1],
        padding='SAME',
    )
# Loss function
def loss(prediction, label):
    """Return the element-wise binary cross-entropy of `prediction`.

    `prediction` and `1 - prediction` are clipped to [1e-10, 1.0] before
    the logarithm is taken, so the logarithm never sees a non-positive
    value. The result is not reduced; the caller sums it.
    """
    clipped_pred = tf.clip_by_value(prediction, 1e-10, 1.0)
    clipped_complement = tf.clip_by_value(1 - prediction, 1e-10, 1.0)
    log_pred = tf.log(clipped_pred, name='Prediction_Log')
    log_pred_2 = tf.log(clipped_complement, name='1-Prediction_Log')

    positive_term = tf.multiply(label, log_pred)
    negative_term = tf.multiply(1 - label, log_pred_2)
    return -positive_term - negative_term
# In[ ]:
# Decoder inputs for training: the 10 target frames are arranged as one
# sequence, lifted to num_filters1 channels with a learned 3-D convolution
# (kernel 10 x 5 x 5), and split into a list of 10 per-step tensors.
labels = tf.reshape(y_image, [1, 10, 64, 64, 1])
w = tf.get_variable(
    name="decoder_1_weights",
    shape=[10, 5, 5, 1, num_filters1],
    initializer=tf.truncated_normal_initializer(mean=0.0, stddev=0.05),
)
true_label = tf.unstack(
    tf.reshape(
        tf.nn.conv3d(
            input=labels,
            filter=w,
            strides=[1] * 5,
            padding='SAME',
        ),
        [1, 10, 64, 64, num_filters1],
    ),
    num=10,
    axis=1,
)

# In[ ]:
# Decoder inputs for inference: an all-zero float32 sequence of the same
# shape, split into 10 per-step tensors.
START = np.zeros((1, 10, 64, 64, num_filters1), dtype=np.float32)
GO = tf.unstack(START, num=10, axis=1)
# In[ ]:
def loop_fn(previous_output, time):
    """Loop function for inference: feed the previous output back in.

    Args:
        previous_output: the decoder output of the preceding step, or None.
        time: index of the current step (unused here).

    Returns:
        `previous_output` when it is not None; otherwise a newly created
        placeholder named 'START'.
    """
    if previous_output is not None:
        return previous_output

    # time == 0: there is no previous output yet.
    # NOTE(review): nothing visible in this script feeds this placeholder;
    # confirm whether rnn_decoder ever calls the loop function with None.
    start_frame = tf.placeholder(
        tf.float32,
        shape=[None, 1, img_size, img_size, 1],
        name='START',
    )
    return start_frame
# In[ ]:
# Loop function for the first decoder in the training phase: the next input
# is chosen at random between the previous output and the ground truth.
def loop_fn_train_1(previous_output, time):
    """Pick the next input of decoder layer 1 during training.

    Args:
        previous_output: the decoder output of the preceding step, or None.
        time: index of the current step; selects the ground-truth input.

    Returns:
        A placeholder named 'START' when `previous_output` is None.
        Otherwise a tensor that evaluates, with equal probability, to
        either `previous_output` or `true_label[time]`.
    """
    if previous_output is None:  # time == 0
        start_frame = tf.placeholder(
            tf.float32,
            shape=[None, 1, img_size, img_size, 1],
            name='START',
        )
        return start_frame

    # The coin flip has to be a graph op. A Python-level random call runs
    # only once, while the graph is being built, so the choice for each
    # step would stay the same for the whole training run. tf.cond on a
    # random tensor re-draws it on every session.run.
    feed_previous = tf.less(
        tf.random_uniform([], minval=0.0, maxval=1.0), 0.5)
    return tf.cond(
        feed_previous,
        lambda: previous_output,
        lambda: true_label[time],
    )
# Training decoder: three ConvLSTM layers unrolled with rnn_decoder. Each
# layer starts from the final state of the encoder layer with the same
# index. Layers 2 and 3 take as inputs a learned 3-D convolution of the
# outputs of the layer below.
with tf.variable_scope("Decoder"):
    with tf.variable_scope("Decoder_Layer1"):
        decoder_1_InputShape = [img_size, img_size, num_filters1]
        decoder_1_KernelShape = [filter_size1, filter_size1]
        decoder_1_rnn_cell = ConvLSTMCell(
            2, decoder_1_InputShape, num_filters1, decoder_1_KernelShape,
            use_bias=True, forget_bias=1.0, name='Decoder_1')
        decoder_1_outputs, decoder_1_states = (
            tf.contrib.legacy_seq2seq.rnn_decoder(
                true_label, encoder_1_state,
                decoder_1_rnn_cell, loop_fn_train_1))

    with tf.variable_scope("Decoder_Layer2"):
        decoder_2_InputShape = [img_size, img_size, num_filters2]
        decoder_2_KernelShape = [filter_size2, filter_size2]
        decoder_2_rnn_cell = ConvLSTMCell(
            2, decoder_2_InputShape, num_filters2, decoder_2_KernelShape,
            use_bias=True, forget_bias=1.0, name='Decoder_2')
        w = tf.get_variable(
            name="decoder_2_weights",
            shape=[10, 5, 5, num_filters1, num_filters2],
            initializer=tf.truncated_normal_initializer(
                mean=0.0, stddev=0.05))
        # decoder_1_outputs is passed to conv3d as returned by rnn_decoder.
        # NOTE(review): if it is a Python list of per-step tensors, the
        # implicit conversion stacks the steps on the leading axis, not on
        # the depth axis the 10-deep kernel slides over - confirm intent.
        decoder_2_inputs = tf.nn.conv3d(input=decoder_1_outputs,
                                        filter=w,
                                        strides=[1, 1, 1, 1, 1],
                                        padding='SAME')
        decoder_2_inputs = tf.reshape(
            decoder_2_inputs, [1, 10, 64, 64, num_filters2])
        decoder_2_inputs = tf.unstack(decoder_2_inputs, num=10, axis=1)

        # Loop function for the second decoder in the training phase: the
        # next input is chosen at random between the previous output and
        # the precomputed layer input.
        def loop_fn_train_2(previous_output, time):
            """Pick the next input of decoder layer 2 during training."""
            if previous_output is None:  # time == 0
                start_frame = tf.placeholder(
                    tf.float32,
                    shape=[None, 1, img_size, img_size, 1],
                    name='START')
                return start_frame
            # In-graph coin flip: a Python-level random call would be
            # evaluated once at graph construction and then never change.
            feed_previous = tf.less(
                tf.random_uniform([], minval=0.0, maxval=1.0), 0.5)
            return tf.cond(
                feed_previous,
                lambda: previous_output,
                lambda: decoder_2_inputs[time])

        decoder_2_outputs, decoder_2_states = (
            tf.contrib.legacy_seq2seq.rnn_decoder(
                decoder_2_inputs, encoder_2_state,
                decoder_2_rnn_cell, loop_fn_train_2))

    with tf.variable_scope("Decoder_Layer3"):
        decoder_3_InputShape = [img_size, img_size, num_filters3]
        decoder_3_KernelShape = [filter_size3, filter_size3]
        decoder_3_rnn_cell = ConvLSTMCell(
            2, decoder_3_InputShape, num_filters3, decoder_3_KernelShape,
            use_bias=True, forget_bias=1.0, name='Decoder_3')
        w = tf.get_variable(
            name="decoder_3_weights",
            shape=[10, 5, 5, num_filters2, num_filters3],
            initializer=tf.truncated_normal_initializer(
                mean=0.0, stddev=0.05))
        decoder_3_inputs = tf.nn.conv3d(input=decoder_2_outputs,
                                        filter=w,
                                        strides=[1, 1, 1, 1, 1],
                                        padding='SAME')
        decoder_3_inputs = tf.reshape(
            decoder_3_inputs, [1, 10, 64, 64, num_filters3])
        decoder_3_inputs = tf.unstack(decoder_3_inputs, num=10, axis=1)

        # Loop function for the third decoder in the training phase: the
        # next input is chosen at random between the previous output and
        # the precomputed layer input.
        def loop_fn_train_3(previous_output, time):
            """Pick the next input of decoder layer 3 during training."""
            if previous_output is None:  # time == 0
                start_frame = tf.placeholder(
                    tf.float32,
                    shape=[None, 1, img_size, img_size, 1],
                    name='START')
                return start_frame
            # In-graph coin flip, re-drawn on every session.run.
            feed_previous = tf.less(
                tf.random_uniform([], minval=0.0, maxval=1.0), 0.5)
            return tf.cond(
                feed_previous,
                lambda: previous_output,
                lambda: decoder_3_inputs[time])

        decoder_3_outputs, decoder_3_states = (
            tf.contrib.legacy_seq2seq.rnn_decoder(
                decoder_3_inputs, encoder_3_state,
                decoder_3_rnn_cell, loop_fn_train_3))
# In[ ]:
# Inference decoder: the same three layers as the training decoder, built
# under reuse=True so that the cells and the 3-D convolution kernels share
# the variables created for training. Layer 1 is driven by the all-zero GO
# sequence and every layer uses loop_fn as its loop function.
with tf.variable_scope("Decoder", reuse=True):
    with tf.variable_scope("Decoder_Layer1"):
        decoder_1_InputShape = [img_size, img_size, num_filters1]
        decoder_1_KernelShape = [filter_size1, filter_size1]
        decoder_1_rnn_cell = ConvLSTMCell(
            conv_ndims=2,
            input_shape=decoder_1_InputShape,
            output_channels=num_filters1,
            kernel_shape=decoder_1_KernelShape,
            use_bias=True,
            forget_bias=1.0,
            name='Decoder_1',
        )
        Test_decoder_1_outputs, Test_decoder_1_states = (
            tf.contrib.legacy_seq2seq.rnn_decoder(
                decoder_inputs=GO,
                initial_state=encoder_1_state,
                cell=decoder_1_rnn_cell,
                loop_function=loop_fn,
            )
        )

    with tf.variable_scope("Decoder_Layer2"):
        decoder_2_InputShape = [img_size, img_size, num_filters2]
        decoder_2_KernelShape = [filter_size2, filter_size2]
        decoder_2_rnn_cell = ConvLSTMCell(
            conv_ndims=2,
            input_shape=decoder_2_InputShape,
            output_channels=num_filters2,
            kernel_shape=decoder_2_KernelShape,
            use_bias=True,
            forget_bias=1.0,
            name='Decoder_2',
        )
        w = tf.get_variable(
            name="decoder_2_weights",
            shape=[10, 5, 5, num_filters1, num_filters2],
            initializer=tf.truncated_normal_initializer(
                mean=0.0, stddev=0.05),
        )
        # Project the layer-1 outputs to num_filters2 channels, then split
        # them back into 10 per-step inputs.
        decoder_2_inputs = tf.unstack(
            tf.reshape(
                tf.nn.conv3d(
                    input=Test_decoder_1_outputs,
                    filter=w,
                    strides=[1] * 5,
                    padding='SAME',
                ),
                [1, 10, 64, 64, num_filters2],
            ),
            num=10,
            axis=1,
        )
        Test_decoder_2_outputs, Test_decoder_2_states = (
            tf.contrib.legacy_seq2seq.rnn_decoder(
                decoder_inputs=decoder_2_inputs,
                initial_state=encoder_2_state,
                cell=decoder_2_rnn_cell,
                loop_function=loop_fn,
            )
        )

    with tf.variable_scope("Decoder_Layer3"):
        decoder_3_InputShape = [img_size, img_size, num_filters3]
        decoder_3_KernelShape = [filter_size3, filter_size3]
        decoder_3_rnn_cell = ConvLSTMCell(
            conv_ndims=2,
            input_shape=decoder_3_InputShape,
            output_channels=num_filters3,
            kernel_shape=decoder_3_KernelShape,
            use_bias=True,
            forget_bias=1.0,
            name='Decoder_3',
        )
        w = tf.get_variable(
            name="decoder_3_weights",
            shape=[10, 5, 5, num_filters2, num_filters3],
            initializer=tf.truncated_normal_initializer(
                mean=0.0, stddev=0.05),
        )
        # Project the layer-2 outputs to num_filters3 channels, then split
        # them back into 10 per-step inputs.
        decoder_3_inputs = tf.unstack(
            tf.reshape(
                tf.nn.conv3d(
                    input=Test_decoder_2_outputs,
                    filter=w,
                    strides=[1] * 5,
                    padding='SAME',
                ),
                [1, 10, 64, 64, num_filters3],
            ),
            num=10,
            axis=1,
        )
        Test_decoder_3_outputs, Test_decoder_3_states = (
            tf.contrib.legacy_seq2seq.rnn_decoder(
                decoder_inputs=decoder_3_inputs,
                initial_state=encoder_3_state,
                cell=decoder_3_rnn_cell,
                loop_function=loop_fn,
            )
        )
# Prediction head: the outputs of the three decoder layers are concatenated
# along the channel axis and reduced to a single channel by a 1x1
# convolution. The training and the inference path share that kernel.
Conv_inputs = tf.concat(
    [decoder_1_outputs, decoder_2_outputs, decoder_3_outputs], 4)
Conv_inputs = tf.reshape(
    Conv_inputs, [10, 64, 64, num_filters1 + num_filters2 + num_filters3])
# In[ ]:
Test_Conv_inputs = tf.concat(
    [Test_decoder_1_outputs, Test_decoder_2_outputs, Test_decoder_3_outputs],
    4)
Test_Conv_inputs = tf.reshape(
    Test_Conv_inputs,
    [10, 64, 64, num_filters1 + num_filters2 + num_filters3])
# In[ ]:
# conv_layer returns an unbounded linear output, whereas loss() computes a
# binary cross-entropy that expects probabilities. Without the sigmoid, the
# clipping inside loss() would silently flatten every value outside
# [1e-10, 1] and remove its gradient, so the output is squashed into (0, 1)
# first.
with tf.variable_scope("ConvLayer"):
    with tf.variable_scope("ConvLayer_Pred"):
        pred_1 = tf.sigmoid(conv_layer(
            input=Conv_inputs,  # The previous layer.
            # Num. channels in prev. layer.
            num_input_channels=num_filters1 + num_filters2 + num_filters3,
            filter_size=1,  # Width and height of each filter.
            num_filters=1))
# In[ ]:
# Same 1x1 convolution (variables reused) applied to the inference path.
with tf.variable_scope("ConvLayer", reuse=True):
    with tf.variable_scope("ConvLayer_Pred"):
        Test_pred_1 = tf.sigmoid(conv_layer(
            input=Test_Conv_inputs,  # The previous layer.
            # Num. channels in prev. layer.
            num_input_channels=num_filters1 + num_filters2 + num_filters3,
            filter_size=1,  # Width and height of each filter.
            num_filters=1))

# Element-wise cross-entropy against the target frames, for both paths.
with tf.variable_scope("Training_Loss"):
    with tf.variable_scope("Loss_Pred"):
        Pdistance = loss(prediction=pred_1, label=y_image)
with tf.variable_scope("Training_Loss", reuse=True):
    with tf.variable_scope("Loss_Pred"):
        Test_Pdistance = loss(prediction=Test_pred_1, label=y_image)

# Scalar costs: the cross-entropy summed over all pixels and frames.
cost = tf.reduce_sum(Pdistance)
Test_cost = tf.reduce_sum(Test_Pdistance)

# Only the training cost is minimised.
with tf.variable_scope("Optimizer"):
    optimizer = tf.train.AdamOptimizer(1e-3).minimize(cost)
# Create the session and initialise every variable of the graph.
session = tf.Session()
session.run(tf.global_variables_initializer())

# In[ ]:
cwd = '/Users/maryamr/Tensorflow/'

# In[ ]:
# Load the data set and flatten it to one row per frame.
data = np.load(cwd + 'mnist_test_seq.npy')
data_2 = data.reshape([20 * 10000, 64 * 64])

from sklearn.preprocessing import MinMaxScaler
from sklearn import preprocessing

# Min-max scale the flattened frames; the scaler is fitted on the same data
# it transforms, and the per-column maxima it found are printed.
scaler = MinMaxScaler().fit(data_2)
print(scaler.data_max_)

# In[ ]:
# Scale, then restore the [20, 10000, 64, 64] layout used by the loop below.
data_3 = scaler.transform(data_2).reshape([20, 10000, 64, 64])
# In[ ]:
# Single pass over the 10000 sequences, one sequence per step. The first
# 9990 sequences are used for training; the last 10 are only evaluated
# through the inference path. cost_record keeps one cost value per
# sequence.
cost_record = np.zeros(10000)
for i in range(10000):
    # Frames 0-9 of sequence i are the input, frames 10-19 the target.
    x_train = np.float32(data_3[0:10, i, :, :]).reshape(
        [1, 10, img_size, img_size, 1])
    y_train = np.float32(data_3[10:20, i, :, :]).reshape(
        [10, img_size * img_size])
    # z is fed with the input frames in reversed temporal order.
    z_train = np.flip(x_train.reshape([10, 64, 64, 1]), 0).reshape(
        [10, 64 * 64])

    feed_dict_train = {x: x_train,
                       y: y_train,
                       z: z_train}

    if i < 9990:
        # Fetch the cost in the same run as the update step. A second
        # session.run would repeat the whole forward pass just to read it.
        _, cost_out = session.run([optimizer, cost],
                                  feed_dict=feed_dict_train)
    else:
        # One run for every evaluation fetch instead of four separate
        # forward passes. Note that `true_label` is rebound here from the
        # graph-level list to the fetched target frames; the plotting code
        # reads it under this name.
        (final_pred_1, true_label,
         Cell_1, Cell_2, Cell_3,
         cost_out) = session.run(
            [Test_pred_1, y_image,
             encoder_1_state.c, encoder_2_state.c, encoder_3_state.c,
             Test_cost],
            feed_dict=feed_dict_train)
        print("cost: {}".format(cost_out))

    cost_record[i] = cost_out
# In[ ]:
# Loss curve over all iterations.
plt.plot(cost_record)
plt.xlabel('number of iterations')
plt.ylabel('loss')
plt.savefig('/Users/maryamr/Loss_plot.png', bbox_inches='tight')

# In[ ]:
# Channel 15 of the final cell state of each encoder layer, as fetched for
# the last evaluated sequence.
for image_path, cell_state in (
        ('/Users/maryamr/Cell_1.png', Cell_1),
        ('/Users/maryamr/Cell_2.png', Cell_2),
        ('/Users/maryamr/Cell_3.png', Cell_3)):
    plt.imsave(image_path, cell_state[0, :, :, 15], cmap='gray')

# In[ ]:
# The 10 predicted frames on a 2 x 5 grid, filled row by row.
f, axarr = plt.subplots(2, 5)
for m, axis in enumerate(axarr.flat):
    axis.imshow(final_pred_1[m, :, :, 0], cmap='gray')
    # The axis objects are fetched and the return values are discarded.
    axis.get_xaxis()
    axis.get_yaxis()
plt.savefig('/Users/maryamr/final_pred_1_10.png', bbox_inches='tight')

# In[ ]:
# The 10 ground-truth frames on a grid of the same layout.
f, axarr = plt.subplots(2, 5)
for m, axis in enumerate(axarr.flat):
    axis.imshow(true_label[m, :, :, 0], cmap='gray')
    axis.get_xaxis()
    axis.get_yaxis()
plt.savefig('/Users/maryamr/true_label_10.png', bbox_inches='tight')
These are the input, the output, and the loss plot (the first 10 images are the input and the second 10 images are the ground truth for the prediction). I trained the model on the first 9990 sequences and ran the test on sequences 9990 to 10000, which is why you see a jump in the loss plot. These results are for the 10000th sequence:
This is because you haven't saved your model. If you save the model, you can restore it and then run the prediction.