I have two machines, one "server" and one "client". Both are CentOS6 with NodeJS v5.8.0.
The server runs the following program:
const AppPort = 8080;
var app = require('express')();
var logger = require('log4js').getLogger();
var onFinished = require('on-finished');
var uid = require('uid');
var reqCnt = 0;
var reqFin = 0;
app.get('/', function(req, res) {
onFinished(req, function() {
reqFin++;
var ts2 = (new Date()).getTime();
logger.info(`uid=${req.uid}, dt=${ts2-req.ts1}`);
});
req.ts1 = (new Date()).getTime();
req.uid = uid();
reqCnt++;
logger.info(`ReqCnt=${reqCnt}, fins=${reqFin}`);
res.send("This is XML");
});
app.listen(AppPort);
It's only purpose to return "This is XML"
string and calculate time of fulfilling the request.
On the "client" machine I run the following program:
const AppPort = 10000;
var onFinished = require('on-finished');
var async = require('async');
var request = require('request');
var logger = require('log4js').getLogger();
var app = require('express')();
var fs = require('fs');
var util = require('util');
url = "http://my-server";
var errCnt = 0;
var okCnt = 0;
var active2 = 0;
setInterval(function() {
var errFrac = Math.floor(errCnt/(okCnt+errCnt)*100);
logger.info(`${okCnt},${errCnt},${active2},${errFrac}`);
}, 1000);
app.get('/test', function(req,res) {
onFinished(res, function() {
active2--;
});
active2++;
var ts1 = (new Date()).getTime();
request(url, {timeout: 1000}, function(err, response, body ) {
var ts2 = (new Date()).getTime();
var dt = ts2-ts1;
if ( err ) {
errCnt += 1;
logger.error(`Error: ${err}, dt=${dt}, errCnt=${errCnt}`);
res.send(`Error: ${err}`);
}
else {
okCnt += 1;
logger.info(`OK: ${url}`);
res.send(`OK: ${body}`);
}
});
});
var http = app.listen(AppPort);
logger.info(`Listening on ${AppPort}, pid=${process.pid}`);
This "client" code listens by itself on port 10000 and makes request to "server" machine to get "This is XML" string. This data is transferred back to "client"'s client.
I load-test my client code with siege:
siege -v -r 100 -c 100 http://my-client:10000/test
Almost immediately I start to get ETIMEOUT errors:
[2016-03-15 18:17:05.155] [ERROR] [default] - Error: Error: ETIMEDOUT, dt=1028, errCnt=3
[2016-03-15 18:17:05.156] [ERROR] [default] - Error: Error: ETIMEDOUT, dt=1028, errCnt=4
[2016-03-15 18:17:05.156] [ERROR] [default] - Error: Error: ETIMEDOUT, dt=1027, errCnt=5
[2016-03-15 18:17:05.157] [ERROR] [default] - Error: Error: ETIMEDOUT, dt=1027, errCnt=6
[2016-03-15 18:17:05.157] [ERROR] [default] - Error: Error: ETIMEDOUT, dt=1027, errCnt=7
[2016-03-15 18:17:05.157] [ERROR] [default] - Error: Error: ETIMEDOUT, dt=1027, errCnt=8
[2016-03-15 18:17:05.158] [ERROR] [default] - Error: Error: ETIMEDOUT, dt=1027, errCnt=9
[2016-03-15 18:17:05.160] [ERROR] [default] - Error: Error: ETIMEDOUT, dt=1029, errCnt=10
[2016-03-15 18:17:05.160] [ERROR] [default] - Error: Error: ETIMEDOUT, dt=1028, errCnt=11
[2016-03-15 18:17:05.161] [ERROR] [default] - Error: Error: ETIMEDOUT, dt=1028, errCnt=12
Also, though much less frequently, getaddrinfo
errors appear:
Error: Error: getaddrinfo ENOTFOUND {my-server-domain-here}:8080, dt=2, errCnt=4478
However, all requests to the "server" are processed within less then 3 milliseconds (dt values) on the server itself:
[2016-03-15 18:19:13.847] [INFO] [default] - uid=66ohx90, dt=1
[2016-03-15 18:19:13.862] [INFO] [default] - ReqCnt=5632, fins=5631
[2016-03-15 18:19:13.862] [INFO] [default] - uid=j8mpxdm, dt=0
[2016-03-15 18:19:13.865] [INFO] [default] - ReqCnt=5633, fins=5632
[2016-03-15 18:19:13.866] [INFO] [default] - uid=xcetqyj, dt=1
[2016-03-15 18:19:13.877] [INFO] [default] - ReqCnt=5634, fins=5633
[2016-03-15 18:19:13.877] [INFO] [default] - uid=i5qnbit, dt=0
[2016-03-15 18:19:13.895] [INFO] [default] - ReqCnt=5635, fins=5634
[2016-03-15 18:19:13.895] [INFO] [default] - uid=hpdmxpg, dt=1
[2016-03-15 18:19:13.930] [INFO] [default] - ReqCnt=5636, fins=5635
[2016-03-15 18:19:13.930] [INFO] [default] - uid=8g3t8md, dt=0
[2016-03-15 18:19:13.934] [INFO] [default] - ReqCnt=5637, fins=5636
[2016-03-15 18:19:13.934] [INFO] [default] - uid=8rwkad6, dt=0
[2016-03-15 18:19:14.163] [INFO] [default] - ReqCnt=5638, fins=5637
[2016-03-15 18:19:14.165] [INFO] [default] - uid=1sh2frd, dt=2
[2016-03-15 18:19:14.169] [INFO] [default] - ReqCnt=5639, fins=5638
[2016-03-15 18:19:14.170] [INFO] [default] - uid=comn76k, dt=1
[2016-03-15 18:19:14.174] [INFO] [default] - ReqCnt=5640, fins=5639
[2016-03-15 18:19:14.174] [INFO] [default] - uid=gj9e0fm, dt=0
[2016-03-15 18:19:14.693] [INFO] [default] - ReqCnt=5641, fins=5640
[2016-03-15 18:19:14.693] [INFO] [default] - uid=x0yw66n, dt=0
[2016-03-15 18:19:14.713] [INFO] [default] - ReqCnt=5642, fins=5641
[2016-03-15 18:19:14.714] [INFO] [default] - uid=e2cumjv, dt=1
[2016-03-15 18:19:14.734] [INFO] [default] - ReqCnt=5643, fins=5642
[2016-03-15 18:19:14.735] [INFO] [default] - uid=34e0ohl, dt=1
[2016-03-15 18:19:14.747] [INFO] [default] - ReqCnt=5644, fins=5643
[2016-03-15 18:19:14.749] [INFO] [default] - uid=34aau79, dt=2
So, the problem is not that the "server" processes the requests too long, but there is a problem with the client.
In NodeJS 5.8 globalAgent
looks like the following:
console.log(require('http.globalAgent'))
Agent {
domain: null,
_events: { free: [Function] },
_eventsCount: 1,
_maxListeners: undefined,
defaultPort: 80,
protocol: 'http:',
options: { path: null },
requests: {},
sockets: {},
freeSockets: {},
keepAliveMsecs: 1000,
keepAlive: false,
maxSockets: Infinity,
maxFreeSockets: 256 }
ulimits
on my system look like:
root@njs testreq]# ulimit -all
core file size (blocks, -c) 0
data seg size (kbytes, -d) unlimited
scheduling priority (-e) 0
file size (blocks, -f) unlimited
pending signals (-i) 128211
max locked memory (kbytes, -l) 64
max memory size (kbytes, -m) unlimited
open files (-n) 200000
pipe size (512 bytes, -p) 8
POSIX message queues (bytes, -q) 819200
real-time priority (-r) 0
stack size (kbytes, -s) 10240
cpu time (seconds, -t) unlimited
max user processes (-u) 128211
virtual memory (kbytes, -v) unlimited
file locks (-x) unlimited
What can be a reason for the timeouts?
When running a few load tests recently I ran into a similar error, however instead of ETIMEDOUT errors I saw multiple EADDRINUSE errors. At the time I was running the tests with the following HTTP agent configuration changes.
It turns out this configuration wastes a lot of cycles intentionally closing each connection after a single request, and the EADDRINUSE errors were due to running out of ephemeral ports.
For my tests I was still using version 0.12.9 so I'm not sure if this still holds in versions >= 4.x, but the core HTTP library will automatically maintain connections to servers based on the host/port/protocol when possible. This can greatly reduce the load on the client and server, but can also cause requests to build up if the client pool is too small to handle the rate of outbound requests. The best configuration then is one that will keep alive connections whenever possible, but still has a large enough connection pool to quickly handle each outbound request.
Additionally, Node.js is built on top of libuv which implements the event loop interface. One way or another, almost any asynchronous operation implemented by a core Node.js library will interact with libuv. In order to implement this type of interface libuv will use one of several different policies, one of which is a thread pool. The default size of this thread pool is 4, with a max of 128.
The important point here is that any calls to
getaddrinfo
andgetnameinfo
will use the thread pool, which means regardless of the size of your HTTP connection pool, DNS queries and some operations lower in the network stack will be serialized based on the thread pool size. It's possible to change the thread pool size by setting the environment variableUV_THREADPOOL_SIZE
to a value in the range 4 - 128.For my tests the ideal settings were
UV_THREADPOOL_SIZE=50
with the following HTTP agent configuration.This answer has more info on when and how libuv is used.