In this question, the following code:
/// <summary>
/// Exchanges the two bytes of every consecutive byte pair in
/// <paramref name="data"/> in place (bytes 0/1, then 2/3, and so on).
/// </summary>
/// <param name="data">
/// Buffer to modify. An odd length makes the access to the missing partner of
/// the last byte fall outside the array, after all complete pairs were swapped.
/// </param>
public static void Swap(byte[] data)
{
    int index = 0;
    while (index < data.Length)
    {
        int partner = index + 1;

        // Read the first byte, then overwrite it with its partner.
        byte first = data[index];
        data[index] = data[partner];
        data[partner] = first;

        index += 2;
    }
}
was rewritten in unsafe code to improve its performance:
/// <summary>
/// Swaps the two bytes of every consecutive byte pair in
/// <paramref name="Source"/> in place, using one 16-bit store per pair.
/// </summary>
/// <param name="Source">
/// Buffer to modify. An empty array is a no-op. If the length is odd, the
/// trailing unpaired byte is left untouched.
/// </param>
/// <remarks>
/// The pair is composed as a big-endian value and stored in the host's native
/// byte order, so bytes are only exchanged on a little-endian host.
/// </remarks>
public static unsafe void SwapX2(Byte[] Source)
{
    // Round down to whole pairs so the 16-bit access never touches memory
    // past the end of the array.
    int pairedLength = Source.Length & ~1;

    // Pinning the array itself (rather than &Source[0]) does not throw for an
    // empty array; the loop below then simply does not execute.
    fixed (Byte* pSource = Source)
    {
        Byte* bp = pSource;
        Byte* bp_stop = bp + pairedLength;
        while (bp < bp_stop)
        {
            // First byte goes to the high half, second byte to the low half.
            *(UInt16*)bp = (UInt16)(*bp << 8 | *(bp + 1));
            bp += 2;
        }
    }
}
Assuming that one wanted to do the same thing with 32 bit words:
/// <summary>
/// Reverses the byte order of every consecutive 4-byte group in
/// <paramref name="data"/> in place.
/// </summary>
/// <param name="data">
/// Buffer to modify. If the length is not a multiple of 4, the access to the
/// missing bytes of the last group falls outside the array, after all complete
/// groups were reversed.
/// </param>
public static void SwapX4(byte[] data)
{
    for (int start = 0; start < data.Length; start += 4)
    {
        // Walk inwards from both ends of the group: (0,3) first, then (1,2).
        int low = start;
        int high = start + 3;
        while (low < high)
        {
            byte held = data[low];
            data[low] = data[high];
            data[high] = held;
            low++;
            high--;
        }
    }
}
how would this be rewritten in a similar fashion?
/// <summary>
/// Reverses the byte order of every consecutive 4-byte group in
/// <paramref name="Source"/> in place, using one 32-bit store per group.
/// </summary>
/// <param name="Source">
/// Buffer to modify. An empty array is a no-op. Trailing bytes that do not
/// form a complete 4-byte group are left untouched.
/// </param>
/// <remarks>
/// The group is composed as a big-endian value and stored in the host's native
/// byte order, so bytes are only exchanged on a little-endian host.
/// </remarks>
public static unsafe void SwapX4(Byte[] Source)
{
    // Round down to whole 4-byte groups so the 32-bit access never touches
    // memory past the end of the array.
    int wholeGroupLength = Source.Length & ~3;

    // Pinning the array itself (rather than &Source[0]) does not throw for an
    // empty array; the loop below then simply does not execute.
    fixed (Byte* pSource = Source)
    {
        Byte* bp = pSource;
        Byte* bp_stop = bp + wholeGroupLength;
        while (bp < bp_stop)
        {
            // Widen each byte to UInt32 before shifting: an int shift of a
            // byte >= 0x80 by 24 is negative, and converting that to UInt32
            // overflows in a checked context.
            *(UInt32*)bp =
                ((UInt32)bp[0] << 24) |
                ((UInt32)bp[1] << 16) |
                ((UInt32)bp[2] << 8) |
                ((UInt32)bp[3]);
            bp += 4;
        }
    }
}
Note that both of these functions (my SwapX4 and your SwapX2) will only swap anything on a little-endian host; when run on a big-endian host, they are an expensive no-op.
This version does not exceed the bounds of the buffer, works on both little- and big-endian architectures, and is faster on larger data. (Update: add build configurations for x86 and x64 that predefine X86 for 32-bit (x86) and X64 for 64-bit (x64) builds, and it will be slightly faster.)
/// <summary>
/// Reverses the byte order of every consecutive 4-byte group in
/// <paramref name="source"/> in place. Groups are processed two at a time
/// through 64-bit shift-and-mask operations, working from the end of the
/// buffer towards the start; a remaining single group at the start is handled
/// with a 32-bit operation.
/// </summary>
/// <param name="source">
/// Buffer to modify. An empty array is a no-op. Trailing bytes that do not
/// form a complete 4-byte group are left untouched.
/// </param>
/// <remarks>
/// The shifts and masks operate on integer values rather than on individual
/// byte addresses, so the result does not depend on the host's byte order.
/// </remarks>
public static unsafe void Swap4(byte[] source)
{
    // Pinning an empty array yields a null pointer, which the X86/X64 header
    // reads below must never dereference.
    if (source.Length == 0)
    {
        return;
    }

    fixed (byte* psource = source)
    {
        // The length is rounded DOWN TO A MULTIPLE OF 4 (mask ...FC). Masking
        // with ...FE only rounds to a multiple of 2, which lets the 32-bit
        // tail write past the end of 2- and 3-byte arrays and misaligns the
        // groups for lengths such as 10.
#if X86
        // NOTE(review): reads the array length from the 4 bytes directly in
        // front of the first element; this presumably matches the 32-bit
        // runtime's array layout, which is not a documented contract - confirm.
        var length = *((uint*)(psource - 4)) & 0xFFFFFFFCU;
#elif X64
        // NOTE(review): same trick with the length assumed to sit 8 bytes in
        // front of the first element on the 64-bit runtime - confirm.
        var length = *((uint*)(psource - 8)) & 0xFFFFFFFCU;
#else
        var length = (source.Length & 0xFFFFFFFC);
#endif
        // Two groups (8 bytes) per iteration, from the end towards the start.
        while (length > 7)
        {
            length -= 8;
            ulong* pulong = (ulong*)(psource + length);
            *pulong = ( ((*pulong >> 24) & 0x000000FF000000FFUL)
                      | ((*pulong >>  8) & 0x0000FF000000FF00UL)
                      | ((*pulong <<  8) & 0x00FF000000FF0000UL)
                      | ((*pulong << 24) & 0xFF000000FF000000UL));
        }

        // length is now 0 or 4; a leftover 4 means one unprocessed group
        // remains at the very start of the buffer.
        if (length != 0)
        {
            uint* puint = (uint*)psource;
            *puint = ( ((*puint >> 24))
                     | ((*puint >>  8) & 0x0000FF00U)
                     | ((*puint <<  8) & 0x00FF0000U)
                     | ((*puint << 24)));
        }
    }
}