I made a project that basically uses googles object detection api with tensorflow.
All i am doing is inference with a pre-trained model: Which means realtime object detection where the Input is the Videostream of a webcam or something similar using OpenCV.
Right now i got pretty decent performance results, but i want to further increase the FPS.
Because what i experience is that Tensorflow uses my whole Memory while Inference but the GPU Usage is not maxed out at all (around 40% with a NVIDIA GTX 1050 Laptop, and 6% on a NVIDIA Jetson Tx2).
So my idea was to increase the GPU Usage by increasing the image batch size which is fed in each session run.
So my question is: How can i Batch multiple Frames of the Input-Videostream together before i feed them to sess.run()
?
Have a look at my code object_detetection.py
on my github repo: (https://github.com/GustavZ/realtime_object_detection).
I would be very thankful if you come up with some Hints or Code Implementations!
import numpy as np
import os
import six.moves.urllib as urllib
import tarfile
import tensorflow as tf
import cv2
# Protobuf Compilation (once necessary)
os.system('protoc object_detection/protos/*.proto --python_out=.')
from object_detection.utils import label_map_util
from object_detection.utils import visualization_utils as vis_util
from stuff.helper import FPS2, WebcamVideoStream
# INPUT PARAMS
# Must be OpenCV readable
# 0 = Default Camera
video_input = 0
visualize = True
max_frames = 300 #only used if visualize==False
width = 640
height = 480
fps_interval = 3
bbox_thickness = 8
# Model preparation
# What model to download.
MODEL_NAME = 'ssd_mobilenet_v1_coco_2017_11_17'
MODEL_FILE = MODEL_NAME + '.tar.gz'
DOWNLOAD_BASE = 'http://download.tensorflow.org/models/object_detection/'
# Path to frozen detection graph. This is the actual model that is used for the object detection.
PATH_TO_CKPT = 'models/' + MODEL_NAME + '/frozen_inference_graph.pb'
# List of the strings that is used to add correct label for each box.
LABEL_MAP = 'mscoco_label_map.pbtxt'
PATH_TO_LABELS = 'object_detection/data/' + LABEL_MAP
NUM_CLASSES = 90
# Download Model
if not os.path.isfile(PATH_TO_CKPT):
print('Model not found. Downloading it now.')
opener = urllib.request.URLopener()
opener.retrieve(DOWNLOAD_BASE + MODEL_FILE, MODEL_FILE)
tar_file = tarfile.open(MODEL_FILE)
for file in tar_file.getmembers():
file_name = os.path.basename(file.name)
if 'frozen_inference_graph.pb' in file_name:
tar_file.extract(file, os.getcwd())
os.remove('../' + MODEL_FILE)
else:
print('Model found. Proceed.')
# Load a (frozen) Tensorflow model into memory.
detection_graph = tf.Graph()
with detection_graph.as_default():
od_graph_def = tf.GraphDef()
with tf.gfile.GFile(PATH_TO_CKPT, 'rb') as fid:
serialized_graph = fid.read()
od_graph_def.ParseFromString(serialized_graph)
tf.import_graph_def(od_graph_def, name='')
# Loading label map
label_map = label_map_util.load_labelmap(PATH_TO_LABELS)
categories = label_map_util.convert_label_map_to_categories(label_map, max_num_classes=NUM_CLASSES, use_display_name=True)
category_index = label_map_util.create_category_index(categories)
# Start Video Stream
video_stream = WebcamVideoStream(video_input,width,height).start()
cur_frames = 0
# Detection
with detection_graph.as_default():
with tf.Session(graph=detection_graph) as sess:
# Definite input and output Tensors for detection_graph
image_tensor = detection_graph.get_tensor_by_name('image_tensor:0')
# Each box represents a part of the image where a particular object was detected.
detection_boxes = detection_graph.get_tensor_by_name('detection_boxes:0')
# Each score represent how level of confidence for each of the objects.
# Score is shown on the result image, together with the class label.
detection_scores = detection_graph.get_tensor_by_name('detection_scores:0')
detection_classes = detection_graph.get_tensor_by_name('detection_classes:0')
num_detections = detection_graph.get_tensor_by_name('num_detections:0')
# fps calculation
fps = FPS2(fps_interval).start()
print ("Press 'q' to Exit")
while video_stream.isActive():
image_np = video_stream.read()
# Expand dimensions since the model expects images to have shape: [1, None, None, 3]
image_np_expanded = np.expand_dims(image_np, axis=0)
# Actual detection.
(boxes, scores, classes, num) = sess.run(
[detection_boxes, detection_scores, detection_classes, num_detections],
feed_dict={image_tensor: image_np_expanded})
# Visualization of the results of a detection.
vis_util.visualize_boxes_and_labels_on_image_array(
image_np,
np.squeeze(boxes),
np.squeeze(classes).astype(np.int32),
np.squeeze(scores),
category_index,
use_normalized_coordinates=True,
line_thickness=bbox_thickness)
if visualize:
cv2.imshow('object_detection', image_np)
# Exit Option
if cv2.waitKey(1) & 0xFF == ord('q'):
break
else:
cur_frames += 1
if cur_frames >= max_frames:
break
# fps calculation
fps.update()
# End everything
fps.stop()
video_stream.stop()
cv2.destroyAllWindows()
print('[INFO] elapsed time (total): {:.2f}'.format(fps.elapsed()))
print('[INFO] approx. FPS: {:.2f}'.format(fps.fps()))