I have tried the dHash algorithm: it is applied to each image, then a Hamming distance is calculated on the two hashes; the lower the distance, the higher the similarity.
from PIL import Image
import os
import shutil
import glob
from plotData import *


def hamming_distance(s1, s2):
    # Return the Hamming distance between equal-length sequences.
    if len(s1) != len(s2):
        raise ValueError("Undefined for sequences of unequal length")
    return sum(ch1 != ch2 for ch1, ch2 in zip(s1, s2))


def dhash(image, hash_size=8):
    # Grayscale and shrink the image in one step.
    image = image.convert('L').resize(
        (hash_size + 1, hash_size),
        Image.ANTIALIAS,
    )
    pixels = list(image.getdata())
    # Compare adjacent pixels row by row.
    difference = []
    for row in xrange(hash_size):
        for col in xrange(hash_size):
            pixel_left = pixels[row * (hash_size + 1) + col]
            pixel_right = pixels[row * (hash_size + 1) + col + 1]
            difference.append(pixel_left > pixel_right)
    # Convert the binary array to a hexadecimal string.
    decimal_value = 0
    hex_string = []
    for index, value in enumerate(difference):
        if value:
            decimal_value += 2 ** (index % 8)
        if (index % 8) == 7:
            hex_string.append(hex(decimal_value)[2:].rjust(2, '0'))
            decimal_value = 0
    return ''.join(hex_string)


orig = Image.open('imageA.png')
modif = Image.open('imageB.png')  # a second image to compare against
hammingDistanceValue = hamming_distance(dhash(orig), dhash(modif))
print hammingDistanceValue
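To turn the hash into a yes/no decision across a whole folder, a usage sketch along these lines could work. It reuses dhash and hamming_distance from above; the rendered/ folder and PBYI.png are taken from the script further below, and the threshold of 4 differing hex characters out of 16 is an arbitrary assumption that would need tuning.

# Hypothetical usage sketch: hash one reference chart and compare every image
# in the rendered/ folder against it. With hash_size=8, dhash returns a
# 16-character hex string, so the threshold counts differing hex characters.
reference_hash = dhash(Image.open('rendered/PBYI.png'))
for path in glob.iglob('rendered/*.png'):
    try:
        candidate_hash = dhash(Image.open(path))
    except IOError:
        continue  # skip files PIL cannot open
    distance = hamming_distance(reference_hash, candidate_hash)
    if distance <= 4:  # assumed threshold, needs tuning on real charts
        print("%d: %s" % (distance, path))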
Unfortunately, this approach produces false positives because it does not really look at the line chart shape as the primary similarity feature. I guess I'd need some kind of machine learning approach, maybe from OpenCV or similar. Can anyone point me in the right direction to something that compares with high precision?
this is the initial image to compare against a collection of similar images.
this is a positive match
this is a false match
update: I added some OpenCV magic to jme's suggestion below. I try to detect significant features first. However, it still produces false positives, since the overall indicator for similarity is the cumulative value over all features and does not take into account differences that can give a line chart a totally different meaning.
False Positive example
Example of preprocessed image with significant features marked as red dots
from PIL import Image
import os
import numpy as np
from scipy.interpolate import interp1d
import os.path
import shutil
import glob
from plotData import *
import cv2
from matplotlib import pyplot as plt


def load_image(path):
    img = cv2.imread(path)
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    # Mark significant features (corners) with red dots.
    corners = cv2.goodFeaturesToTrack(gray, 25, 0.01, 10)
    corners = np.int0(corners)
    for i in corners:
        x, y = i.ravel()
        cv2.circle(img, (x, y), 3, 255, -1)
    # Per-pixel squared distance from white, averaged over the colour channels.
    # Cast to float first so that squaring does not overflow uint8.
    return np.mean((255 - img.astype(np.float64)) ** 2, axis=2)


symbol = "PBYI"
x = np.arange(1000)

if not os.path.exists('clusters1DSignal/' + symbol + '/'):
    os.mkdir('clusters1DSignal/' + symbol + '/')
else:
    shutil.rmtree('clusters1DSignal/' + symbol + '/')
    os.mkdir('clusters1DSignal/' + symbol + '/')
shutil.copyfile('rendered/' + symbol + '.png', "clusters1DSignal/" + symbol + "/" + symbol + '.png')

img1 = load_image('rendered/' + symbol + '.png')
y1 = np.argmax(img1, axis=0)
f1 = interp1d(np.linspace(0, 1000, len(y1)), y1)
z1 = f1(x)

for filename in glob.iglob('rendered/*.png'):
    try:
        img2 = load_image(filename)
    except Exception:
        continue
    y2 = np.argmax(img2, axis=0)
    f2 = interp1d(np.linspace(0, 1000, len(y2)), y2)
    z2 = f2(x)
    result = np.linalg.norm(z1 - z2)
    if result < 2100:
        print str(result) + ": " + filename
        symbolCompare = filename.split("/")[1].replace(".png", "")
        shutil.copyfile('rendered/' + symbolCompare + '.png', "clusters1DSignal/" + symbol + "/" + str(result) + "_" + symbolCompare + ".png")
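One hedged idea for the false-positive problem described in the update (a sketch only, not part of the original approach): in addition to the cumulative l2 value, also look at the worst single-point deviation between the two interpolated signals, so that a chart which diverges sharply in one spot is rejected even if its overall sum stays below the threshold.

def is_similar(z_ref, z_other, l2_threshold=2100.0, pointwise_threshold=150.0):
    # Cumulative difference over the whole chart (same measure as above).
    total = np.linalg.norm(z_ref - z_other)
    # Largest difference at any single interpolated x position; a big local
    # deviation can change a chart's meaning even when the overall sum is small.
    worst = np.max(np.abs(z_ref - z_other))
    # Both thresholds are assumptions, not values from the original script.
    return total < l2_threshold and worst < pointwise_threshold

This check would stand in for the plain result < 2100 test in the loop above; both thresholds would need tuning on real charts.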
The approach I'd take is this: first, convert each image to a 1d signal by finding, for each x pixel, a representative y pixel where the image is red. You can take the mean of the y pixels, but for simplicity, I'll just take the first one that isn't white. y1, y2, and y3 are then 1d arrays which represent the functions in the first, second, and third images. Now we simply treat each array as a vector and find the l2 distance between them. We prefer the l2 distance because the Hamming distance would be somewhat sensitive for this task.

We have a slight problem: the images have different widths, so the y arrays aren't of compatible size. A quick-and-dirty fix is to interpolate them to a longer length (we'll use 1000). Now we can find the distance between the images.
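For reference, a minimal self-contained sketch of that pipeline might look like the following. It assumes plain RGB screenshots on a white background where the plotted line is the only non-white content; the file names are placeholders, and the "first pixel that isn't white" rule with a near-white cut-off of 250 is a simplification of the description above.

import numpy as np
from PIL import Image
from scipy.interpolate import interp1d

def to_signal(path, length=1000):
    # For each x column, take the row index of the first pixel that isn't white.
    rgb = np.array(Image.open(path).convert('RGB'))
    non_white = np.any(rgb < 250, axis=2)   # True wherever something is drawn
    y = np.argmax(non_white, axis=0)        # first non-white row per column
    # Interpolate to a common length so charts of different widths compare.
    f = interp1d(np.linspace(0, 1, len(y)), y)
    return f(np.linspace(0, 1, length))

z_a = to_signal('imageA.png')   # placeholder file names
z_b = to_signal('imageB.png')
distance = np.linalg.norm(z_a - z_b)   # l2 distance; lower means more similar
print("%.1f" % distance)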