The code below shows my attempt to run an algorithm on single GPUs and feed data to it using a FIFO queue. The data exists in a CSV file. I use a separate python thread to read from the file one line at a time and enqueue the line into a FIFO.
N = 16
num_ckfs =80000
q = [0.01 for i in range(N)]
q_ckfs = np.array([q for i in range(num_ckfs)])
r = [5]
r_ckfs = np.array([r for i in range(num_ckfs)])
init_var = [10.0 for i in range(N)]
init_var_ckfs = np.array([init_var for i in range(num_ckfs)])
init_state = [0.0 for i in range(N)]
init_state_ckfs = np.array([init_state for i in range(num_ckfs)])
class CKF(object):
def __init__(self, num_ckfs, N):
self.init_variances = tf.Variable(init_var_ckfs, name='init_variances', dtype=tf.float64)
self.init_states = tf.Variable(init_state_ckfs, name='init_states', dtype=tf.float64)
init_states_expanded = tf.expand_dims(self.init_states, 2) # num_ckfs X N X 1
self.q_values = tf.constant(q_ckfs, name='q_values', dtype=tf.float64)
self.r_values = tf.constant(r_ckfs, name='r_values', dtype=tf.float64)
self.input_vectors = tf.placeholder(tf.float64, shape=[num_ckfs, N], name='input_vectors')
self.z_k = tf.placeholder(tf.float64, shape=[num_ckfs, 1], name='z_k');
q = tf.FIFOQueue(200, [tf.float64, tf.float64], shapes=[[num_ckfs,1], [num_ckfs,N]])
self.enqueue_op = q.enqueue([self.z_k, self.input_vectors])
observations, inputs = q.dequeue()
#further processing using the input data
with tf.device('/gpu:0'):
ckf_gpu0 = CKF(num_ckfs, N)
def load_and_enqueue():
#read one line at a time
#obvs_list corresponds to the first column
#data_list corresponds to the rest of the columns
session.run(ckf_gpu0.enqueue_op, feed_dict={
ckf_gpu0.input_vectors: data_list[0], ckf_gpu0.z_k: obvs_list[0]})
count += 1
t = threading.Thread(target=load_and_enqueue)
t.start()
for i in range( num_rows):
out = session.run([ckf_gpu0.projected_output ])
The first problem that I have run into is:
InvalidArgumentError (see above for traceback): Cannot assign a device to node 'fifo_queue': Could not satisfy explicit device specification '/device:GPU:0' because no supported kernel for GPU devices is available.
Is there an alternate way to do such a thing, i.e. hide the I/O latency while the computation is being done on a GPU?