It is known that if you read data from disc you are IO bound and you can process/parse the read data much faster than you can read it from disc.
But this common wisdom (myth?) is not reflected by my tests. When I read a text file with a double and an int in each line, separated by a space, I am much slower than my physical disc speed (by a factor of 6). The text file looks like this
1,1 0
2,1 1
3,1 2
Update I have included the PInvoke performance when I do a ReadFile with the complete buffer in one read to get the "real" performance.
- ReadFile performance - ReadFileIntoByteBuffer
- StreamReader.ReadLine performance - CountLines
- StreamReader.ReadLine unsafe perf - ParseLinesUnsafe
- StreamReader.Read unsafe char buf - ParseLinesUnsafeCharBuf
- StreamReader.ReadLine + Parsing performance - ParseLines
The results are
Did native read 179,0MB in 0,4s, 484,2MB/s
Did read 10.000.000 lines in 1,6s, 112,7MB/s
Did parse and read unsafe 179,0MB in 2,3s, 76,5MB/s
Did parse and read unsafe char buf 179,0MB in 2,8s, 63,5MB/s
Did read and parse 179,0MB in 9,3s, 19,3MB/s
Although I did try to skip the string construction overhead in ParseLinesUnsafeCharBuf, it is still quite a lot slower than the version which allocates a new string every time. It is still much better than the original 20 MB/s of the easiest solution, but I do think .NET should be able to do better. If I remove the logic to parse the strings I get 258,8 MB/s, which is very good and near native speed. But I do not see a way to make my parsing much simpler using unsafe code. I have to deal with incomplete lines, which makes it quite complex.
Update It is clear from the numbers that a simple string.Split already costs way too much. But the StreamReader also costs quite a lot. What would a highly optimized solution look like that gets closer to the real disc speed? I have tried many ways with unsafe code and char buffers, but the performance gain was perhaps 30%, nowhere near the orders of magnitude I would need. I would be ok with 100MB/s parsing speed. That should be achievable with managed code, or am I wrong?
Is it not possible with C# to parse faster than I can read from my hard disc? It is an Intel Postville X25M. The CPU is an older Intel Dual Core. I have 3 GB RAM, Windows 7, .NET 3.5 SP1 and .NET 4.
But I did see the same results on normal hard discs as well. Linear reading speed can be up to 400MB/s with today's hard discs. Does this imply that I should restructure my applications to read the data on demand when it is actually needed, instead of reading it eagerly into memory at the cost of higher GC times due to the increased object graph, which makes the GC cycles much longer?
I have noticed that if my managed application uses more than 500MB of memory it becomes much less responsive. A major contributing factor seems to be the complexity of the object graph. It might therefore be better to read the data when needed. At least this is my conclusion from the current data.
Here is the code
using System;
using System.Collections.Generic;
using System.Text;
using System.IO;
using System.Diagnostics;
using System.Runtime.InteropServices;
using Microsoft.Win32.SafeHandles;
using System.ComponentModel;
namespace IOBound
{
class Program
{
/// <summary>
/// Benchmark entry point. Creates the test file if it does not exist yet and then
/// times, in this order: one native ReadFile call, line counting, the two
/// hand-written parsers and the Split/Parse based parser. One result line is
/// written to the console per measurement.
/// </summary>
/// <param name="args">Command-line arguments; not used.</param>
static void Main(string[] args)
{
    string data = @"C:\Source\IOBound\NumericData.txt";
    if (!File.Exists(data))
    {
        CreateTestData(data);
    }

    // Keep the size as a double. Truncating it to whole MiB in an int understates
    // every throughput figure and reports 0 MB/s for any file smaller than 1 MiB.
    const double BytesPerMB = 1024.0 * 1024.0;
    double MB = new FileInfo(data).Length / BytesPerMB;

    var sw = Stopwatch.StartNew();
    uint bytes = ReadFileIntoByteBuffer(data);
    sw.Stop();
    // The native read is limited by its buffer size, so its rate is based on the
    // number of bytes it actually returned rather than on the file size.
    double nativeMB = bytes / BytesPerMB;
    Console.WriteLine("Did native read {0:F1}MB in {1:F1}s, {2:F1}MB/s",
        nativeMB, sw.Elapsed.TotalSeconds, nativeMB / sw.Elapsed.TotalSeconds);

    sw = Stopwatch.StartNew();
    int n = CountLines(data);
    sw.Stop();
    Console.WriteLine("Did read {0:N0} lines in {1:F1}s, {2:F1}MB/s",
        n, sw.Elapsed.TotalSeconds, MB / sw.Elapsed.TotalSeconds);

    sw = Stopwatch.StartNew();
    ParseLinesUnsafe(data);
    sw.Stop();
    Console.WriteLine("Did parse and read unsafe {0:F1}MB in {1:F1}s, {2:F1}MB/s",
        MB, sw.Elapsed.TotalSeconds, MB / sw.Elapsed.TotalSeconds);

    sw = Stopwatch.StartNew();
    ParseLinesUnsafeCharBuf(data);
    sw.Stop();
    Console.WriteLine("Did parse and read unsafe char buf {0:F1}MB in {1:F1}s, {2:F1}MB/s",
        MB, sw.Elapsed.TotalSeconds, MB / sw.Elapsed.TotalSeconds);

    sw = Stopwatch.StartNew();
    ParseLines(data);
    sw.Stop();
    Console.WriteLine("Did read and parse {0:F1}MB in {1:F1}s, {2:F1}MB/s",
        MB, sw.Elapsed.TotalSeconds, MB / sw.Elapsed.TotalSeconds);
}
/// <summary>
/// Reads the start of a file into a 200 MiB byte buffer with a single native
/// ReadFile call and reports how many bytes that call delivered.
/// </summary>
/// <param name="data">Path of the file to read.</param>
/// <returns>
/// Number of bytes returned by the single ReadFile call; never more than the
/// buffer size, so larger files are only read partially.
/// </returns>
/// <exception cref="Win32Exception">ReadFile reported a failure.</exception>
private unsafe static uint ReadFileIntoByteBuffer(string data)
{
    // Open read-only: the two-argument FileStream constructor asks for read/write
    // access and therefore fails on files that cannot be opened for writing.
    using (var stream = new FileStream(data, FileMode.Open, FileAccess.Read, FileShare.Read))
    {
        byte[] buf = new byte[200 * 1024 * 1024];
        fixed (byte* pBuf = buf)
        {
            uint dwRead;
            // Request exactly the buffer size so the allocation and the request cannot drift apart.
            if (ReadFile(stream.SafeFileHandle, pBuf, (uint)buf.Length, out dwRead, IntPtr.Zero) == 0)
            {
                // The parameterless constructor picks up the last Win32 error code.
                throw new Win32Exception();
            }
            return dwRead;
        }
    }
}
/// <summary>
/// Counts the lines of a text file by reading it line by line with a StreamReader.
/// </summary>
/// <param name="data">Path of the text file.</param>
/// <returns>Number of lines returned by StreamReader.ReadLine.</returns>
private static int CountLines(string data)
{
    int total = 0;
    using (StreamReader input = new StreamReader(data))
    {
        // ReadLine yields null once the end of the file has been reached.
        while (input.ReadLine() != null)
        {
            total++;
        }
    }
    return total;
}
/// <summary>
/// Parses a text file of lines shaped like "digits,digits blanks digits" (a decimal
/// number with ',' as decimal separator followed by an integer) without creating a
/// string per line. The file is read in large character chunks; every chunk is cut
/// after its last line break, the complete lines are parsed in place and the
/// unfinished tail is moved to the front of the buffer for the next read.
/// The parsed values are collected in local lists only.
/// </summary>
/// <param name="data">Path of the text file.</param>
/// <exception cref="InvalidDataException">
/// A single line does not fit into the 10,000,000 character buffer.
/// </exception>
unsafe private static void ParseLinesUnsafeCharBuf(string data)
{
    var dobules = new List<double>();
    var ints = new List<int>();
    using (var reader = new StreamReader(data))
    {
        char[] buffer = new char[10 * 1000 * 1000];
        int carried = 0;   // chars of an unfinished line kept at the start of the buffer
        int readChars;
        fixed (char* ln = buffer)
        {
            while ((readChars = reader.Read(buffer, carried, buffer.Length - carried)) != 0)
            {
                int filled = carried + readChars;

                // Find the end of the last complete line. The carried-over part is known
                // to contain no line break, so only the newly read chars are scanned.
                int completeLen = filled;
                while (completeLen > carried && ln[completeLen - 1] != '\n' && ln[completeLen - 1] != '\r')
                {
                    completeLen--;
                }
                if (completeLen == carried)
                {
                    completeLen = 0;
                }

                if (completeLen == 0 && filled == buffer.Length)
                {
                    // Without this check the next Read would be asked for 0 chars,
                    // return 0 and silently end the parse in the middle of the file.
                    throw new InvalidDataException("A line is longer than the read buffer.");
                }

                ParseCompleteLines(ln, ln + completeLen, dobules, ints);

                // Move the unfinished tail to the front; Array.Copy handles the overlap.
                carried = filled - completeLen;
                if (carried > 0 && completeLen > 0)
                {
                    Array.Copy(buffer, completeLen, buffer, 0, carried);
                }
            }

            // End of file: whatever is left is a last line without a line break.
            ParseCompleteLines(ln, ln + carried, dobules, ints);
        }
    }
}

/// <summary>
/// Parses all lines in the character range [pCur, pEnd). Blank lines and lines that
/// do not match "digits,digits blanks digits" are skipped. Signs, exponents and
/// overflow are not handled.
/// </summary>
/// <param name="pCur">First character to parse.</param>
/// <param name="pEnd">One past the last character to parse.</param>
/// <param name="doubles">Receives the decimal value of every accepted line.</param>
/// <param name="ints">Receives the integer value of every accepted line.</param>
unsafe private static void ParseCompleteLines(char* pCur, char* pEnd, List<double> doubles, List<int> ints)
{
    while (pCur != pEnd)
    {
        // Line breaks (and thereby blank lines) are simply stepped over.
        if (*pCur == '\r' || *pCur == '\n')
        {
            pCur++;
            continue;
        }

        // Integral part of the decimal number. Only ASCII digits are accepted because
        // the "- '0'" arithmetic is meaningless for any other numeric character.
        long a = 0;
        while (pCur != pEnd && *pCur >= '0' && *pCur <= '9')
        {
            a = a * 10 + (*pCur++ - '0');
        }

        long b = 0;
        long div = 1;
        bool valid = pCur != pEnd && *pCur == ',';
        if (valid)
        {
            pCur++;

            // Fractional part; div ends up as 10^(number of fraction digits).
            while (pCur != pEnd && *pCur >= '0' && *pCur <= '9')
            {
                b = b * 10 + (*pCur++ - '0');
                div *= 10;
            }

            // At least one blank must separate the two numbers. Line breaks are
            // excluded so that a truncated line cannot swallow the following one.
            char* pBlanks = pCur;
            while (pCur != pEnd && *pCur != '\r' && *pCur != '\n' && char.IsWhiteSpace(*pCur))
            {
                pCur++;
            }
            valid = pCur != pBlanks;
        }

        int i = 0;
        if (valid)
        {
            char* pIntStart = pCur;
            while (pCur != pEnd && *pCur >= '0' && *pCur <= '9')
            {
                i = i * 10 + (*pCur++ - '0');
            }
            valid = pCur != pIntStart;
        }

        if (valid)
        {
            doubles.Add(a + ((double)b) / div);
            ints.Add(i);
        }

        // Ignore whatever is left of the line, well-formed or not.
        while (pCur != pEnd && *pCur != '\r' && *pCur != '\n')
        {
            pCur++;
        }
    }
}
/// <summary>
/// Reads a text file line by line and parses each line by hand as
/// "digits[,digits] whitespace digits": a decimal number with ',' as decimal
/// separator followed by an integer. Every line contributes one decimal value and
/// one integer; parts that are missing parse as 0. Signs, exponents and overflow
/// are not handled. The parsed values are collected in local lists only.
/// </summary>
/// <param name="data">Path of the text file.</param>
unsafe private static void ParseLinesUnsafe(string data)
{
    var dobules = new List<double>();
    var ints = new List<int>();
    using (var reader = new StreamReader(data))
    {
        string line;
        while ((line = reader.ReadLine()) != null)
        {
            int len = line.Length;
            fixed (char* ln = line)
            {
                // All parser state is per line; carrying the index or the partial
                // numbers over from the previous line corrupts every following result.
                int ix = 0;

                // Integral part. Only ASCII digits are accepted because the "- '0'"
                // arithmetic is meaningless for any other numeric character.
                long a = 0;
                while (ix < len && ln[ix] >= '0' && ln[ix] <= '9')
                {
                    a = a * 10 + (ln[ix++] - '0');
                }

                double d = a;
                if (ix < len && ln[ix] == ',')
                {
                    ix++;

                    // Fractional part; div ends up as 10^(number of fraction digits).
                    long b = 0;
                    long div = 1;
                    while (ix < len && ln[ix] >= '0' && ln[ix] <= '9')
                    {
                        b = b * 10 + (ln[ix++] - '0');
                        div *= 10;
                    }
                    d = a + ((double)b) / div;
                }

                while (ix < len && char.IsWhiteSpace(ln[ix]))
                {
                    ix++;
                }

                int i = 0;
                while (ix < len && ln[ix] >= '0' && ln[ix] <= '9')
                {
                    i = i * 10 + (ln[ix++] - '0');
                }

                dobules.Add(d);
                ints.Add(i);   // the parsed integer, not the read position
            }
        }
    }
}
/// <summary>
/// Straightforward reference parser: reads the file line by line, splits each line
/// at single spaces and converts the two fields with double.Parse and int.Parse.
/// Lines that do not split into exactly two fields are ignored. The parsed values
/// are collected in local lists only.
/// </summary>
/// <param name="data">Path of the text file.</param>
private static void ParseLines(string data)
{
    List<double> doubleValues = new List<double>();
    List<int> intValues = new List<int>();
    using (StreamReader input = new StreamReader(data))
    {
        char[] separators = { ' ' };
        for (string text = input.ReadLine(); text != null; text = input.ReadLine())
        {
            string[] fields = text.Split(separators);
            if (fields.Length != 2)
            {
                continue;
            }

            // Both conversions use the current culture, as does the test data writer.
            doubleValues.Add(double.Parse(fields[0]));
            intValues.Add(int.Parse(fields[1]));
        }
    }
}
/// <summary>
/// Writes the benchmark input: ten million lines, each holding the value
/// <c>1.1 + i</c> and the line index <c>i</c> separated by one space. The file is
/// created (or overwritten) as UTF-8 and the numbers are formatted by
/// StreamWriter.WriteLine with its default formatting.
/// </summary>
/// <param name="fileName">Path of the file to create.</param>
static void CreateTestData(string fileName)
{
    const int LineCount = 10 * 1000 * 1000;

    Stream target = new FileStream(fileName, FileMode.Create);
    // Disposing the writer also closes the underlying stream.
    using (var output = new StreamWriter(target, Encoding.UTF8))
    {
        int index = 0;
        while (index < LineCount)
        {
            output.WriteLine("{0} {1}", 1.1d + index, index);
            index++;
        }
    }
}
/// <summary>
/// P/Invoke declaration of the <c>ReadFile</c> function exported by kernel32.dll.
/// </summary>
/// <param name="hFile">Handle of the file to read from.</param>
/// <param name="lpBuffer">Pointer to the buffer that receives the bytes; the caller in this file pins a managed byte array for it.</param>
/// <param name="nNumberOfBytesToRead">Maximum number of bytes to read.</param>
/// <param name="lpNumberOfBytesRead">Receives the number of bytes that were read.</param>
/// <param name="lpOverlapped">Overlapped structure pointer; the caller in this file passes IntPtr.Zero.</param>
/// <returns>
/// The caller in this file treats 0 as failure and throws a Win32Exception;
/// SetLastError = true keeps the Win32 error code available for that.
/// </returns>
[DllImport("kernel32.dll", SetLastError = true)]
unsafe static extern uint ReadFile(SafeFileHandle hFile, [Out] byte* lpBuffer, uint nNumberOfBytesToRead, out uint lpNumberOfBytesRead, IntPtr lpOverlapped);
}
}