I'm trying to compute hashes for a whole directory, in order to monitor changes later. It's relatively easy. However, if there are big files, the computing takes too much time, so I wound up using some multithreading.
Thanks to I/O bottlenecks, I should read a file with one thread, but I can calculate hash for that file in multiple threads with calling TransformBlock methods all at once. The problem is, the result of each calculation is different - 'cause all the threads update one instance of a hashAlgorithm, they do it erratically.
public delegate void CalculateHashDelegate(byte[] buffer);
private MD5 md5;
private long completed_threads_hash;
private object lock_for_hash = new object();
`private string getMd5Hash(string file_path)
{
string file_to_be_hashed = file_path;
byte[] hash;
try
{
CalculateHashDelegate CalculateHash = AsyncCalculateHash;
md5 = MD5.Create();
using (Stream input = File.OpenRead(file_to_be_hashed))
{
int buffer_size = 0x4096;
byte[] buffer = new byte[buffer_size];
long part_count = 0;
completed_threads_hash = 0;
int bytes_read;
while ((bytes_read = input.Read(buffer, 0, buffer.Length)) == buffer_size)
{
part_count++;
IAsyncResult ar_hash = CalculateHash.BeginInvoke(buffer, CalculateHashCallback, CalculateHash);
}
// Wait for completing all the threads
while (true)
{
lock (completed_threads_lock)
{
if (completed_threads_hash == part_count)
{
md5.TransformFinalBlock(buffer, 0, bytes_read);
break;
}
}
}
hash = md5.Hash;
}
StringBuilder sb = new StringBuilder();
for (int i = 0; i < hash.Length; i++)
{
sb.Append(hash[i].ToString("x2"));
}
md5.Clear();
return sb.ToString();
}
catch (Exception ex)
{
Console.WriteLine("An exception was encountered during hashing file {0}. {1}.", file_to_be_hashed, ex.Message);
return ex.Message;
}
}
public void AsyncCalculateHash(byte[] buffer)
{
lock (lock_for_hash)
{
md5.TransformBlock(buffer, 0, buffer.Length, null, 0);
}
}
private void CalculateHashCallback(IAsyncResult ar_hash)
{
try
{
CalculateHashDelegate CalculateHash = ar_hash.AsyncState as CalculateHashDelegate;
CalculateHash.EndInvoke(ar_hash);
}
catch (Exception ex)
{
Console.WriteLine("Callback exception: ", ex.Message);
}
finally
{
lock (completed_threads_lock)
{
completed_threads_hash++;
}
}
}
Is there a way to organize the hashing process? I can't use .Net newer than 3.5 and such classes as BackroundWorker and ThreadPool. Or maybe there is another method for parallel hash calculating?