I have been profiling TCP latency (in particular, the write
from user space to kernel space of a small message) in order to get some intuition for the latency of a write
(acknowledging that this can be context-specific). I have noticed substantial inconsistency between tests that to me seem similar, and I'm very curious to figure out where the difference comes from. I understand that microbenchmarks can be problematic, but I still feel like I am missing some fundamental understanding (since the latency differences are ~10x).
The set up is that I have a C++ TCP server that accepts one client connection (from another process on the same CPU), and upon connecting with the client makes 20 system calls to write
to the socket, sending one byte at a time. The full code of the server is copied at the end of this post. Here's the output which times each write
using boost/timer
(which adds noise of ~1 mic):
$ clang++ -std=c++11 -stdlib=libc++ tcpServerStove.cpp -O3; ./a.out
18 mics
3 mics
3 mics
4 mics
3 mics
3 mics
4 mics
3 mics
5 mics
3 mics
...
I reliably find that the first write
is significantly slower than the others. If I wrap 10,000 write
calls in a timer, the average is 2 microseconds per write
, yet the first call is always 15+ mics. Why is there this "warming up" phenomenon?
Relatedly, I ran an experiment where in between each write
call I do some blocking CPU work (calculating a large prime number). This causes all the write
calls to be slow:
$ clang++ -std=c++11 -stdlib=libc++ tcpServerStove.cpp -O3; ./a.out
20 mics
23 mics
23 mics
30 mics
23 mics
21 mics
21 mics
22 mics
22 mics
...
Given these results, I'm wondering if there is some kind of batching that happens during the process of copying bytes from the user buffer to the kernel buffer. If multiple write
calls happen in quick succession, do they get coalesced into one kernel interrupt?
In particular I am looking for some notion of how long write
takes to copy buffers from user space to kernel space. If there is some coalescing effect that allows the average write
to only take 2 mics when I do 10,000 in succession, then it would be unfairly optimistic to conclude that the write
latency is 2 mics; it seems that my intuition should be that each write
takes 20 microseconds. This seems surprisingly slow for the lowest latency you can get (a raw write
call on one byte) without kernel bypass.
A final piece of data is that when I set up a ping-pong test between two processes on my computer (a TCP server and a TCP client), I average 6 mics per round trip (which includes a read
, a write
, as well as moving through the localhost network). This seems at odds with the 20 mic latencies for a single write seen above.
Full code for the TCP server:
// Server side C/C++ program to demonstrate Socket programming
// #include <iostream>
#include <unistd.h>
#include <stdio.h>
#include <sys/socket.h>
#include <stdlib.h>
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <string.h>
#include <boost/timer.hpp>
#include <unistd.h>
// Set up some blocking work.
// Return true iff n is prime. Trial division only needs to test divisors
// up to sqrt(n) (if n = a*b then min(a,b) <= sqrt(n)); the original
// scanned every i < n, which is O(n) per call for no benefit.
bool isPrime(int n) {
    if (n < 2) {
        return false;
    }
    for (int i = 2; i * i <= n; i++) {
        if (n % i == 0) {
            return false;
        }
    }
    return true;
}

// Compute the nth smallest prime, 1-based: getPrime(1) == 2.
// (The original comment said "nth largest", which is not what the loop
// does.) Takes ~1 sec for n = 10,000 with the unoptimized isPrime.
int getPrime(int n) {
    int numPrimes = 0;
    int i = 0;
    while (true) {
        if (isPrime(i)) {
            numPrimes++;
            if (numPrimes >= n) {
                return i;
            }
        }
        i++;
    }
}
// Minimal TCP server: accept one client on port 8080, then time 20
// one-byte write() calls into the kernel socket buffer.
int main(int argc, char const *argv[])
{
    int server_fd, new_socket;
    struct sockaddr_in address;
    int opt = 1;
    int addrlen = sizeof(address);

    // Create socket for TCP server
    server_fd = socket(AF_INET, SOCK_STREAM, 0);
    if (server_fd < 0) {
        perror("socket");
        return 1;
    }

    // Try to prevent writes from being batched. TCP-level options must be
    // set with level IPPROTO_TCP — the original passed SOL_SOCKET, which
    // interprets TCP_NODELAY/TCP_NOPUSH as unrelated socket-level option
    // numbers, so they never took effect.
    setsockopt(server_fd, IPPROTO_TCP, TCP_NODELAY, &opt, sizeof(opt));
#ifdef TCP_NOPUSH
    // TCP_NOPUSH exists on BSD/macOS only (Linux has TCP_CORK instead).
    setsockopt(server_fd, IPPROTO_TCP, TCP_NOPUSH, &opt, sizeof(opt));
#endif
    // These two really are socket-level options. opt == 1 makes them
    // degenerate (a 1-byte send buffer / low-water mark), which is the
    // point of the experiment.
    setsockopt(server_fd, SOL_SOCKET, SO_SNDBUF, &opt, sizeof(opt));
    setsockopt(server_fd, SOL_SOCKET, SO_SNDLOWAT, &opt, sizeof(opt));

    address.sin_family = AF_INET;
    address.sin_addr.s_addr = INADDR_ANY;
    address.sin_port = htons(8080);
    if (bind(server_fd, (struct sockaddr *)&address, sizeof(address)) != 0) {
        perror("bind");
        return 1;
    }
    if (listen(server_fd, 3) != 0) {
        perror("listen");
        return 1;
    }
    // Accept one client connection
    new_socket = accept(server_fd, (struct sockaddr *)&address, (socklen_t*)&addrlen);
    if (new_socket < 0) {
        perror("accept");
        return 1;
    }
    char sendBuffer[1] = {0};
    int primes[20] = {0};
    // Make 20 sequential writes to the kernel buffer, timing each one.
    for (int i = 0; i < 20; i++) {
        sendBuffer[0] = i;
        boost::timer t;
        if (write(new_socket, sendBuffer, 1) != 1) {
            perror("write");
        }
        printf("%d mics\n", int(1e6 * t.elapsed()));
        // Uncomment to do blocking CPU work between the writes; doing so
        // slows every write down by roughly a factor of 10.
        // primes[i] = getPrime(10000 + i);
    }
    // Print a prime to make sure the compiler doesn't optimize
    // away the computations.
    printf("prime: %d\n", primes[8]);
}
TCP client code:
// Client side C/C++ program to demonstrate Socket programming
// #include <iostream>
#include <unistd.h>
#include <stdio.h>
#include <sys/socket.h>
#include <stdlib.h>
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <string.h>
#include <unistd.h>
int main(int argc, char const *argv[])
{
int sock, valread;
struct sockaddr_in address;
int opt = 1;
int addrlen = sizeof(address);
// We'll be passing uint32's back and forth
unsigned char recv_buffer[1024] = {0};
// Create socket for TCP server
sock = socket(AF_INET, SOCK_STREAM, 0);
setsockopt(sock, SOL_SOCKET, TCP_NODELAY, &opt, sizeof(opt));
address.sin_family = AF_INET;
address.sin_addr.s_addr = INADDR_ANY;
address.sin_port = htons(8080);
// Accept one client connection
if (connect(sock, (struct sockaddr *)&address, (socklen_t)addrlen) != 0) {
throw("connect failed");
}
read(sock, buffer_pointer, num_left);
for (int i = 0; i < 10; i++) {
printf("%d\n", recv_buffer[i]);
}
}
I tried with and without the flags TCP_NODELAY
, TCP_NOPUSH
, SO_SNDBUF
and SO_SNDLOWAT
, with the idea that this might prevent batching (but my understanding is that this batching occurs between the kernel buffer and the network, not between the user buffer and the kernel buffer).
Here is server code for the ping pong test:
// Server side C/C++ program to demonstrate Socket programming
// #include <iostream>
#include <unistd.h>
#include <stdio.h>
#include <sys/socket.h>
#include <stdlib.h>
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <string.h>
#include <boost/timer.hpp>
#include <unistd.h>
// Serializing read of the x86 time-stamp counter: returns the 64-bit
// cycle count assembled from the EDX:EAX pair that RDTSC produces.
__inline__ uint64_t rdtsc(void)
{
uint32_t lo, hi;
// CPUID (with EAX zeroed first) is a serializing instruction: it keeps
// the CPU from reordering earlier work past the RDTSC below. It clobbers
// RAX/RBX/RCX/RDX, hence the clobber list.
__asm__ __volatile__ (
"xorl %%eax,%%eax \n cpuid"
::: "%rax", "%rbx", "%rcx", "%rdx");
// RDTSC: low 32 bits of the counter land in EAX, high 32 bits in EDX.
__asm__ __volatile__ ("rdtsc" : "=a" (lo), "=d" (hi));
return (uint64_t)hi << 32 | lo;
}
// Big Endian (network order)
// Decode a 4-byte big-endian (network order) buffer into an unsigned int.
// Each byte is cast to unsigned before shifting: the original shifted the
// integer-promoted (signed) value, so b[0] >= 0x80 shifted a set bit into
// the sign position — undefined behavior before C++20.
unsigned int fromBytes(unsigned char b[4]) {
    return (unsigned int)b[3] | (unsigned int)b[2] << 8 |
           (unsigned int)b[1] << 16 | (unsigned int)b[0] << 24;
}
// Encode x into b as 4 bytes in big-endian (network) order:
// b[0] holds the most significant byte, b[3] the least.
void toBytes(unsigned int x, unsigned char (&b)[4]) {
    for (int shift = 0; shift < 4; ++shift) {
        b[3 - shift] = (unsigned char)(x >> (8 * shift));
    }
}
// Ping-pong server: accept one client on port 8080, then for each 4-byte
// big-endian integer x received, reply with x+1. Times n round trips with
// both rdtsc and boost::timer.
int main(int argc, char const *argv[])
{
    int server_fd, new_socket, valread;
    struct sockaddr_in address;
    int addrlen = sizeof(address);
    unsigned char recv_buffer[4] = {0};
    unsigned char send_buffer[4] = {0};

    // Create socket for TCP server
    server_fd = socket(AF_INET, SOCK_STREAM, 0);
    if (server_fd < 0) {
        perror("socket");
        return 1;
    }
    address.sin_family = AF_INET;
    address.sin_addr.s_addr = INADDR_ANY;
    address.sin_port = htons(8080);
    if (bind(server_fd, (struct sockaddr *)&address, sizeof(address)) != 0) {
        perror("bind");
        return 1;
    }
    if (listen(server_fd, 3) != 0) {
        perror("listen");
        return 1;
    }
    // Accept one client connection
    new_socket = accept(server_fd, (struct sockaddr *)&address, (socklen_t*)&addrlen);
    if (new_socket < 0) {
        perror("accept");
        return 1;
    }
    printf("Connected with client!\n");
    int counter = 0;
    unsigned int x = 0;
    auto start = rdtsc();
    boost::timer t;
    int n = 10000;
    while (counter < n) {
        // NOTE(review): a 4-byte TCP read may legally return short; with
        // tiny messages over loopback this is unlikely, but bail out on
        // EOF/error instead of reusing a stale buffer as before.
        valread = read(new_socket, recv_buffer, 4);
        if (valread <= 0) {
            perror("read");
            break;
        }
        x = fromBytes(recv_buffer);
        toBytes(x + 1, send_buffer);
        if (write(new_socket, send_buffer, 4) != 4) {
            perror("write");
            break;
        }
        ++counter;
    }
    printf("%f clock cycles per round trip (rdtsc)\n", (rdtsc() - start) / double(n));
    printf("%f mics per round trip (boost timer)\n", 1e6 * t.elapsed() / n);
}
Here is client code for the ping pong test:
// #include <iostream>
#include <unistd.h>
#include <stdio.h>
#include <sys/socket.h>
#include <stdlib.h>
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <string.h>
#include <boost/timer.hpp>
#include <unistd.h>
// Big Endian (network order)
// Decode a 4-byte big-endian (network order) buffer into an unsigned int.
// Each byte is cast to unsigned before shifting: the original shifted the
// integer-promoted (signed) value, so b[0] >= 0x80 shifted a set bit into
// the sign position — undefined behavior before C++20.
unsigned int fromBytes(unsigned char b[4]) {
    return (unsigned int)b[3] | (unsigned int)b[2] << 8 |
           (unsigned int)b[1] << 16 | (unsigned int)b[0] << 24;
}
// Encode x into b as 4 bytes in big-endian (network) order:
// b[0] holds the most significant byte, b[3] the least.
void toBytes(unsigned int x, unsigned char (&b)[4]) {
    for (int shift = 0; shift < 4; ++shift) {
        b[3 - shift] = (unsigned char)(x >> (8 * shift));
    }
}
// Ping-pong client: send a 4-byte big-endian counter to localhost:8080,
// read the server's incremented reply, and repeat forever.
int main(int argc, char const *argv[])
{
    int sock, valread;
    struct sockaddr_in address;
    int opt = 1;
    int addrlen = sizeof(address);
    // We'll be passing uint32's back and forth
    unsigned char recv_buffer[4] = {0};
    unsigned char send_buffer[4] = {0};

    // Create socket for the TCP client
    sock = socket(AF_INET, SOCK_STREAM, 0);
    if (sock < 0) {
        perror("socket");
        return 1;
    }
    // Set TCP_NODELAY so that writes won't be batched. It is a TCP-level
    // option: level must be IPPROTO_TCP, not SOL_SOCKET (the original
    // call silently set an unrelated socket-level option instead).
    setsockopt(sock, IPPROTO_TCP, TCP_NODELAY, &opt, sizeof(opt));
    address.sin_family = AF_INET;
    // Connect to the loopback address explicitly; INADDR_ANY (0.0.0.0)
    // is a bind address, not a portable connect destination.
    address.sin_addr.s_addr = htonl(INADDR_LOOPBACK);
    address.sin_port = htons(8080);
    // Connect to the server.
    if (connect(sock, (struct sockaddr *)&address, (socklen_t)addrlen) != 0) {
        perror("connect");
        return 1;
    }
    unsigned int lastReceived = 0;
    while (true) {
        toBytes(++lastReceived, send_buffer);
        if (write(sock, send_buffer, 4) != 4) {
            perror("write");
            break;
        }
        // Bail out on EOF/error rather than decoding a stale buffer.
        valread = read(sock, recv_buffer, 4);
        if (valread <= 0) {
            perror("read");
            break;
        }
        lastReceived = fromBytes(recv_buffer);
    }
}
}