This is a bit of an odd one and apologies if I don't explain it very well.
I am using the following simple code to send messages that I have popped off a queue over a plain TCP socket. The messages are sent over localhost to another port on the same machine -
// Wait (up to 120 seconds) for the socket to become writable, then send one
// queued message. Socket.Select rewrites writeList in place so that, on return,
// it holds only the sockets that are ready for writing.
bool selectFailed = false;
try
{
    Socket.Select(null, writeList, null, 120000000/*120 seconds*/);
}
catch (SocketException e)
{
    log.Error("Select returned an error waiting to send... " + e.Message + " Errorcode: " + e.ErrorCode);
    connected = false;
    socket.Close();
    // Remember the failure: the socket is now closed, so it must neither be
    // written to nor closed/logged a second time below.
    selectFailed = true;
}

// Ready only if Select succeeded and left at least one socket in the list.
bool readyToWrite = !selectFailed && writeList.Count > 0;
if (readyToWrite)
{
    try
    {
        //log.Debug("Sending message type: " + message.header.msgType);
        socket.Send(message, message.header.dataLength, SocketFlags.None);
        //log.Debug("Message sent");
    }
    catch (SocketException e)
    {
        log.Error(e.Message + " Error code: " + e.ErrorCode);
        connected = false;
        socket.Close();
    }
}
else if (!selectFailed)
{
    // Select timed out without the socket becoming writable.
    log.Error("Not able to write - stopping sender thread and closing socket");
    connected = false;
    socket.Close();
}
This normally works fine and in fact my application sends several messages to the other end without a problem initially.
However, I then add 10 or so messages in quick succession to the queue, which get popped off and sent ok, seemingly - the log statements show Send() returned ok and when I look at a network trace it seems the other end has acknowledged them.
But it hasn't. The other end is in a loop calling select() with a one second timeout and this keeps coming back with no data to read, until about 30 seconds later (the same every time), all the messages arrive at the other end all at once.
C++ code from the other side of the connection -
// Poll the socket in one-second slices until it is readable, the overall
// deadline (dwTimeout) passes, an error occurs, or the thread is stopped.
while (m_bRunning && bOK && !bReadyToRead)
{
    m_bIsAlive = true;

    // Select() result codes used here: 1 = ready to read, 0 = timed out,
    // anything else = error.
    const int iSelectResult = pSocket->Select(1, true);
    if (iSelectResult == 1)
    {
        //TRACE("Data ready to be read from RAM\n");
        bReadyToRead = true;
    }
    else if (iSelectResult == 0)
    {
        // This slice timed out; give up only once the overall deadline has passed.
        if (GetTickCount() > dwTimeout)
        {
            bOK = false;
        }
    }
    else
    {
        TRACE("Select returned error...\n");
        bOK = false;
    }
}
// Try and read a message header
iBytesExpected = sizeof(RAM_HEADER);
while ((m_bRunning && bOK) && (iBytesSoFar < iBytesExpected))
{
m_bIsAlive = true;
iBytesRead = pSocket->Read(pWritePos, iBytesExpected-iBytesSoFar);
The C++ select wrapper looks like this -
// Waits up to ulTimeoutSeconds for m_hSocket to become readable
// (bCheckRead == true) or writable (bCheckRead == false).
//
// Returns:
//    1  socket is ready to read
//    2  socket is ready to write
//    0  select() timed out
//   -1  select() reported an error (the error code is logged)
int CRawSocket::Select(ULONG ulTimeoutSeconds, bool bCheckRead)
{
    struct timeval timeout;
    timeout.tv_sec = static_cast<long>(ulTimeoutSeconds);
    timeout.tv_usec = 0;

    // Only one direction is ever checked per call, so a single set suffices.
    fd_set socketSet;
    FD_ZERO(&socketSet);
    FD_SET(m_hSocket, &socketSet);

    // NOTE(review): the first argument (nfds) is documented as ignored by
    // Winsock; it would need to be "highest descriptor + 1" on BSD sockets.
    const int iSelectReturn = bCheckRead
        ? select(1, &socketSet, NULL, NULL, &timeout)
        : select(1, NULL, &socketSet, NULL, &timeout);

    if (iSelectReturn == SOCKET_ERROR)
    {
        const int e = WSAGetLastError();
        // %d matches the int error code (the previous %lu did not).
        ERRORLOG("Select socket error %d\n", e);
        return -1; // Some error occurred
    }

    if (!FD_ISSET(m_hSocket, &socketSet))
    {
        return 0; // Select TIMED OUT
    }

    return bCheckRead ? 1 /* Ready to READ */ : 2 /* Ready to WRITE */;
}
And the read method -
// Reads up to nLen bytes from m_hSocket into pData, calling recv() repeatedly
// until the whole request has been satisfied.
//
// Returns:
//   nLen         all requested bytes were read (0 for a zero-length request)
//   0..nLen-1    recv() reported WSAEWOULDBLOCK; this many bytes were read first
//   -1           the connection was closed by the peer, or another socket
//                error occurred
int CReadWriteSocket::Read(void *pData, int nLen)
{
    // A zero-length request is trivially complete. Without this guard recv()
    // would return 0 and the call would be misreported as a closed connection.
    if (nLen == 0)
    {
        return 0;
    }

    char* pcWritePos = static_cast<char*>(pData);
    int nRemaining = nLen;

    // The request may be larger than what a single recv() delivers, so keep
    // reading until everything has arrived.
    do
    {
        const int nReceived = ::recv(m_hSocket, pcWritePos, nRemaining, 0);
        if (nReceived == SOCKET_ERROR)
        {
            const int e = WSAGetLastError();
            if (e == WSAEWOULDBLOCK)
            {
                // No more data available right now: report the partial count.
                return nLen - nRemaining;
            }
            TRACE("Socket Read error %d\n", e);
            return -1; // error other than would block detected
        }
        if (nReceived == 0) // Connection has closed
        {
            TRACE("Socket appears to have closed (zero bytes read)\n");
            return -1; // Show this as an "error"
        }
        if (nReceived < 0)
        {
            // Negative value other than SOCKET_ERROR: not expected from recv().
            ASSERT(0);
            return nLen - nRemaining;
        }

        pcWritePos += nReceived;
        nRemaining -= nReceived;
    } while (nRemaining > 0);

    ASSERT(nRemaining == 0);
    return nLen;
}
I am completely confused, as this seems to be standard code that I use all over the place and I've never seen a problem like this occur.
It has been suggested that I try the NoDelay socket option, but that had no effect - and in fact, as far as I am aware, Nagle's algorithm would never cause delays of this length anyway.
Any suggestions would be greatly appreciated! Thanks.