Linux, UDP datagrams, and kernel timestamps: Lots

2020-07-13 11:15发布

问题:

I have been trying and failing to get Linux (kernel 4.1.4) to give me timestamps for when UDP datagrams are sent and received. I have read the original kernel docs (https://www.kernel.org/doc/Documentation/networking/timestamping.txt), along with lots of examples and a number of stackoverflow entries. I can send datagrams between sender and receiver with no problems. But I cannot get timestamps for sending or receiving datagrams, and I can't figure out what I'm doing wrong.

One bizarre thing is that when I use the MSG_ERRQUEUE channel for getting timestamp info on a sent datagram, I do get the original outgoing packet, and I do get the first ancillary message (SOL_IP, IP_RECVERR), but I do not get a second message (which should be level SOL_SOCKET, type SCM_TIMESTAMPING).

In another stackoverflow entry on getting timestamps for sent packets (Timestamp outgoing packets), someone mentioned that some drivers might not implement the call to skb_tx_timestamp, but I checked mine (Realtek), and that call is definitely in there.

Here's how I set up the UDP receiver (error handling code not shown):

inf->fd = socket(AF_INET, SOCK_DGRAM, IPPROTO_UDP);

timestampOn = SOF_TIMESTAMPING_RX_SOFTWARE | SOF_TIMESTAMPING_RX_HARDWARE;
r = setsockopt(inf->fd, SOL_SOCKET, SO_TIMESTAMPING, &timestampOn, sizeof(timestampOn));

r = setsockopt(inf->fd, SOL_SOCKET, SO_REUSEPORT, &on, sizeof(on));

memset(&(inf->local), 0, sizeof(struct sockaddr_in));
inf->local.sin_family = AF_INET;
inf->local.sin_port = htons(port);
inf->local.sin_addr.s_addr = htonl(INADDR_ANY);

r = bind(inf->fd, (struct sockaddr *)&(inf->local), sizeof(struct sockaddr_in));

Using SO_REUSEPORT or not doesn't seem to matter.

For receiving, my understanding is that we don't use MSG_ERRQUEUE. That's only if we want timestamps for sent messages. Besides, when I use MSG_ERRQUEUE with recvmsg, I get "resource temporarily unavailable." Here's how I receive datagrams:

int recv_len;
struct msghdr   msg;
struct iovec    iov;

memset(&msg, 0, sizeof(msg));
memset(&iov, 0, sizeof(iov));

// Space for control message info plus timestamp
char ctrl[2048];
memset(ctrl, 0, sizeof(ctrl));
//struct cmsghdr *cmsg = (struct cmsghdr *) &ctrl;

// Ancillary data buffer and length
msg.msg_control      = (char *) ctrl;
msg.msg_controllen   = sizeof(ctrl);

// Dest address info
msg.msg_name         = (struct sockaddr *) &(inf->remote);
msg.msg_namelen      = sizeof(struct sockaddr_in);

// Array of data buffers (scatter/gather)
msg.msg_iov          = &iov;
msg.msg_iovlen       = 1;

// Data buffer pointer and length
iov.iov_base         = buf;
iov.iov_len          = len;

recv_len = recvmsg(inf->fd, &msg, 0);

And then I pass a pointer to msg to another function (handle_time) that does this:

struct timespec* ts = NULL;
struct cmsghdr* cmsg;
struct sock_extended_err *ext;

for( cmsg = CMSG_FIRSTHDR(msg); cmsg; cmsg = CMSG_NXTHDR(msg,cmsg) ) {
    printf("level=%d, type=%d, len=%zu\n", cmsg->cmsg_level, cmsg->cmsg_type, cmsg->cmsg_len);
}

Zero messages are received. So that's the first problem. My setup code above matches like half a dozen other examples I've found on the web, but I'm getting no ancillary data from this.

Next, let's turn to sending datagrams. Here's the setup:

inf->port = port;
inf->fd = socket(AF_INET, SOCK_DGRAM, IPPROTO_UDP);

memset(&(inf->remote), 0, sizeof(struct sockaddr_in));
inf->remote.sin_family = AF_INET;
inf->remote.sin_port = htons(port);

timestampOn = SOF_TIMESTAMPING_TX_SOFTWARE | SOF_TIMESTAMPING_TX_HARDWARE;
r = setsockopt(inf->fd, SOL_SOCKET, SO_TIMESTAMPING, &timestampOn, sizeof(timestampOn));

on = 1;
r = setsockopt(inf->fd, SOL_SOCKET, SO_BROADCAST, &on, sizeof(on));

r = inet_aton(address, &(inf->remote.sin_addr));

And this is how I send a datagram:

int send_len, r, i;
struct msghdr   msg;
struct iovec    iov;

memset(&msg, 0, sizeof(msg));
memset(&iov, 0, sizeof(iov));

// Space for control message info plus timestamp
char ctrl[2048];
memset(ctrl, 0, sizeof(ctrl));
//struct cmsghdr *cmsg = (struct cmsghdr *) &ctrl;

// Ancillary data buffer and length
//msg.msg_control      = (char *) ctrl;
//msg.msg_controllen   = sizeof(ctrl);    

// Dest address info
msg.msg_name         = (struct sockaddr *) &(inf->remote);
msg.msg_namelen      = sizeof(struct sockaddr_in);

// Array of data buffers (scatter/gather)
msg.msg_iov          = &iov;
msg.msg_iovlen       = 1;

// Data buffer pointer and length
iov.iov_base         = buf;
iov.iov_len          = len;

send_len = sendmsg(inf->fd, &msg, 0);

Examples I've seen reuse the msg and iov data structures, but in my experimentation, I added code to make sure things were cleared, just in case the send left anything behind, although it didn't make any difference. Here's the code for getting the timestamp:

memset(&msg, 0, sizeof(msg));
memset(&iov, 0, sizeof(iov));
memset(ctrl, 0, sizeof(ctrl));
msg.msg_control      = (char *) ctrl;
msg.msg_controllen   = sizeof(ctrl);
msg.msg_name         = (struct sockaddr *) &(inf->remote);
msg.msg_namelen      = sizeof(struct sockaddr_in);
msg.msg_iov          = &iov;
msg.msg_iovlen       = 1;
iov.iov_base         = junk_buf;
iov.iov_len          = sizeof(junk_buf);

for (;;) {
    r = recvmsg(inf->fd, &msg, MSG_ERRQUEUE);
    if (r<0) {
        fprintf(stderr, "Didn't get kernel time\n");
        return send_len;
    }

    printf("recvmsg returned %d\n", r);
    handle_time(&msg);
}

The data buffer contains the original datagram as expected. The ancillary data I get back includes a single message, which handle_time prints as:

level=0, type=11, len=48

This is level SOL_IP and type IP_RECVERR, which is expected according to the docs. Looking into the payload (a struct sock_extended_err), the errno is 42 (ENOMSG, No message of desired type) and origin is 4 (SO_EE_ORIGIN_TXSTATUS). From the docs, this is supposed to happen and demonstrates that in fact I did manage to inform the kernel that I want TX status messages. But there is no second ancillary message!

I have tried to see if there is any kernel compile option that might disable this, but I haven't found any. So I'm just completely baffled here. Can anyone help me figure out what I'm doing wrong?

Thanks!

UPDATE: I tried running this same code on another Linux machine, this time CentOS 7 (kernel 3.10.0-693.2.2.el7.x86_64). I can't figure out what what kind of NIC that machine has, but when I try to send datagrams, I get some other weird behavior. For the very first datagram, when I start this program, I get back the message and a single ancillary message, just as above. For every subsequent sendmsg call, errno tells me that I get an "Invalid argument" error. This error goes away if I don't enable timestamps on the socket.

UPDATE 2: I discovered that I had not been making an ioctl necessary to enable timestamps in the driver. Unfortunately, when I do this call, I get ENODEV from errno (no such device). Here's how I'm trying to do it (which I'm imitating from https://github.com/majek/openonload/blob/master/src/tests/onload/hwtimestamping/tx_timestamping.c):

struct ifreq ifr;
struct hwtstamp_config hwc;

inf->fd = socket(AF_INET, SOCK_DGRAM, IPPROTO_UDP);

memset(&ifr, 0, sizeof(ifr));
hwc.flags = 0;
hwc.tx_type = HWTSTAMP_TX_ON;
hwc.rx_filter = HWTSTAMP_FILTER_ALL;
ifr.ifr_data = (char*)&hwc;
r = ioctl(inf->fd, SIOCSHWTSTAMP, &ifr);

That being said, I'd be relatively happy with software timestamps, which should not need this call. So I'm not sure this is helpful anyhow.

UPDATE 3: A compilable example was requested. The whole program is pretty minimal, so I put it into pastebin here: https://pastebin.com/qd0gspRc

Also, here's the output from ethtool:

Time stamping parameters for eth0:
Capabilities:
        software-transmit     (SOF_TIMESTAMPING_TX_SOFTWARE)
        software-receive      (SOF_TIMESTAMPING_RX_SOFTWARE)
        software-system-clock (SOF_TIMESTAMPING_SOFTWARE)
PTP Hardware Clock: none
Hardware Transmit Timestamp Modes: none
Hardware Receive Filter Modes: none

Since this obviously doesn't support hardware timestamps, the ioctl is moot. I tried changing the SO_TIMESTAMPING setting to SOF_TIMESTAMPING_TX_SOFTWARE and SOF_TIMESTAMPING_RX_SOFTWARE for sender and receiver. That didn't help.

Then I tried adding SOF_TIMESTAMPING_SOFTWARE to both. I finally started getting something:

level=1, type=37, len=64

Level 1 is SOL_SOCKET, and type 37 is SCM_TIMESTAMPING. I'll go back to the docs and figure out how to interpret this. It says something about passing an array of three time structures. The driver's call to skb_tx_timestamp should have been sufficient so that it wouldn't require that I enable "fake" software timestamps to get something out.

回答1:

Like I say in comment the use of SOF_TIMESTAMPING_SOFTWARE and SOF_TIMESTAMPING_RAW_HARDWARE is necessary because if I understand correctly the documentation, some bits are to generate the timestamp and some bits are here to report them in control message:

1.3.1 Timestamp Generation

Some bits are requests to the stack to try to generate timestamps. Any combination of them is valid. Changes to these bits apply to newly created packets, not to packets already in the stack. As a result, it is possible to selectively request timestamps for a subset of packets (e.g., for sampling) by embedding an send() call within two setsockopt calls, one to enable timestamp generation and one to disable it. Timestamps may also be generated for reasons other than being requested by a particular socket, such as when receive timestamping is enabled system wide, as explained earlier.

1.3.2 Timestamp Reporting

The other three bits control which timestamps will be reported in a generated control message. Changes to the bits take immediate effect at the timestamp reporting locations in the stack. Timestamps are only reported for packets that also have the relevant timestamp generation request set.

After, to use the data documentation say:

2.1 SCM_TIMESTAMPING records

These timestamps are returned in a control message with cmsg_level SOL_SOCKET, cmsg_type SCM_TIMESTAMPING, and payload of type

struct scm_timestamping { struct timespec ts[3]; };

...

The structure can return up to three timestamps. This is a legacy feature. At least one field is non-zero at any time. Most timestamps are passed in ts[0]. Hardware timestamps are passed in ts[2].

To get transmit timestamp this require some configuration, first you need to know that software timestamp are not always available, I only achieve to get hardware transmit timestamp. But I'm not an expert in these domain, I just try to implemented timestamp with information that I found.

Secondly, I needed to activate hardware feature with linuxptp tool, I use hwstamp_cli:

hwstamp_ctl -i eth0 -r 1 -t 1

With this and some modification on your code I achieve to get hardware transmit timestamp but only with ethX interface because lo interface don't have these feature AFAIK so the final code is:

#include <arpa/inet.h>
#include <errno.h>
#include <inttypes.h>
#include <linux/errqueue.h>
#include <linux/net_tstamp.h>
#include <linux/sockios.h>
#include <net/if.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <sys/time.h>
#include <sys/types.h>
#include <unistd.h>

#define UDP_MAX_LENGTH 1500

typedef struct {
  int fd;
  int port;
  int err_no;
  struct sockaddr_in local;
  struct sockaddr_in remote;
  struct timeval time_kernel;
  struct timeval time_user;
  int64_t prev_serialnum;
} socket_info;

static int setup_udp_receiver(socket_info *inf, int port) {
  inf->port = port;
  inf->fd = socket(AF_INET, SOCK_DGRAM, IPPROTO_UDP);
  if (inf->fd < 0) {
    inf->err_no = errno;
    fprintf(stderr, "setup_udp_server: socket failed: %s\n",
            strerror(inf->err_no));
    return inf->fd;
  }

  int timestampOn =
      SOF_TIMESTAMPING_RX_SOFTWARE | SOF_TIMESTAMPING_TX_SOFTWARE |
      SOF_TIMESTAMPING_SOFTWARE | SOF_TIMESTAMPING_RX_HARDWARE |
      SOF_TIMESTAMPING_TX_HARDWARE | SOF_TIMESTAMPING_RAW_HARDWARE |
      // SOF_TIMESTAMPING_OPT_TSONLY |
      0;
  int r = setsockopt(inf->fd, SOL_SOCKET, SO_TIMESTAMPING, &timestampOn,
                     sizeof timestampOn);
  if (r < 0) {
    inf->err_no = errno;
    fprintf(stderr, "setup_udp_server: setsockopt failed: %s\n",
            strerror(inf->err_no));
    return r;
  }

  int on = 1;
  r = setsockopt(inf->fd, SOL_SOCKET, SO_REUSEPORT, &on, sizeof on);
  if (r < 0) {
    inf->err_no = errno;
    fprintf(stderr, "setup_udp_server: setsockopt2 failed: %s\n",
            strerror(inf->err_no));
    return r;
  }

  inf->local = (struct sockaddr_in){.sin_family = AF_INET,
                                    .sin_port = htons((uint16_t)port),
                                    .sin_addr.s_addr = htonl(INADDR_ANY)};
  r = bind(inf->fd, (struct sockaddr *)&inf->local, sizeof inf->local);
  if (r < 0) {
    inf->err_no = errno;
    fprintf(stderr, "setup_udp_server: bind failed: %s\n",
            strerror(inf->err_no));
    return r;
  }

  inf->prev_serialnum = -1;

  return 0;
}

static int setup_udp_sender(socket_info *inf, int port, char *address) {
  inf->port = port;
  inf->fd = socket(AF_INET, SOCK_DGRAM, IPPROTO_UDP);
  if (inf->fd < 0) {
    inf->err_no = errno;
    fprintf(stderr, "setup_udp_client: socket failed: %s\n",
            strerror(inf->err_no));
    return inf->fd;
  }

  int timestampOn =
      SOF_TIMESTAMPING_RX_SOFTWARE | SOF_TIMESTAMPING_TX_SOFTWARE |
      SOF_TIMESTAMPING_SOFTWARE | SOF_TIMESTAMPING_RX_HARDWARE |
      SOF_TIMESTAMPING_TX_HARDWARE | SOF_TIMESTAMPING_RAW_HARDWARE |
      // SOF_TIMESTAMPING_OPT_TSONLY |
      0;
  int r = setsockopt(inf->fd, SOL_SOCKET, SO_TIMESTAMPING, &timestampOn,
                     sizeof timestampOn);
  if (r < 0) {
    inf->err_no = errno;
    fprintf(stderr, "setup_udp_server: setsockopt failed: %s\n",
            strerror(inf->err_no));
    return r;
  }

  inf->remote = (struct sockaddr_in){.sin_family = AF_INET,
                                     .sin_port = htons((uint16_t)port)};
  r = inet_aton(address, &inf->remote.sin_addr);
  if (r == 0) {
    fprintf(stderr, "setup_udp_client: inet_aton failed\n");
    inf->err_no = 0;
    return -1;
  }

  inf->local = (struct sockaddr_in){.sin_family = AF_INET,
                                    .sin_port = htons(0),
                                    .sin_addr.s_addr = htonl(INADDR_ANY)};
  inf->prev_serialnum = -1;

  return 0;
}

static void handle_scm_timestamping(struct scm_timestamping *ts) {
  for (size_t i = 0; i < sizeof ts->ts / sizeof *ts->ts; i++) {
    printf("timestamp: %lld.%.9lds\n", (long long)ts->ts[i].tv_sec,
           ts->ts[i].tv_nsec);
  }
}

static void handle_time(struct msghdr *msg) {

  for (struct cmsghdr *cmsg = CMSG_FIRSTHDR(msg); cmsg;
       cmsg = CMSG_NXTHDR(msg, cmsg)) {
    printf("level=%d, type=%d, len=%zu\n", cmsg->cmsg_level, cmsg->cmsg_type,
           cmsg->cmsg_len);

    if (cmsg->cmsg_level == SOL_IP && cmsg->cmsg_type == IP_RECVERR) {
      struct sock_extended_err *ext =
          (struct sock_extended_err *)CMSG_DATA(cmsg);
      printf("errno=%d, origin=%d\n", ext->ee_errno, ext->ee_origin);
      continue;
    }

    if (cmsg->cmsg_level != SOL_SOCKET)
      continue;

    switch (cmsg->cmsg_type) {
    case SO_TIMESTAMPNS: {
      struct scm_timestamping *ts = (struct scm_timestamping *)CMSG_DATA(cmsg);
      handle_scm_timestamping(ts);
    } break;
    case SO_TIMESTAMPING: {
      struct scm_timestamping *ts = (struct scm_timestamping *)CMSG_DATA(cmsg);
      handle_scm_timestamping(ts);
    } break;
    default:
      /* Ignore other cmsg options */
      break;
    }
  }
  printf("End messages\n");
}

static ssize_t udp_receive(socket_info *inf, char *buf, size_t len) {
  char ctrl[2048];
  struct iovec iov = (struct iovec){.iov_base = buf, .iov_len = len};
  struct msghdr msg = (struct msghdr){.msg_control = ctrl,
                                      .msg_controllen = sizeof ctrl,
                                      .msg_name = &inf->remote,
                                      .msg_namelen = sizeof inf->remote,
                                      .msg_iov = &iov,
                                      .msg_iovlen = 1};
  ssize_t recv_len = recvmsg(inf->fd, &msg, 0);
  gettimeofday(&inf->time_user, NULL);

  if (recv_len < 0) {
    inf->err_no = errno;
    fprintf(stderr, "udp_receive: recvfrom failed: %s\n",
            strerror(inf->err_no));
  }

  handle_time(&msg);

  return recv_len;
}

static ssize_t udp_send(socket_info *inf, char *buf, size_t len) {
  struct iovec iov = (struct iovec){.iov_base = buf, .iov_len = len};
  struct msghdr msg = (struct msghdr){.msg_name = &inf->remote,
                                      .msg_namelen = sizeof inf->remote,
                                      .msg_iov = &iov,
                                      .msg_iovlen = 1};
  gettimeofday(&inf->time_user, NULL);
  ssize_t send_len = sendmsg(inf->fd, &msg, 0);
  if (send_len < 0) {
    inf->err_no = errno;
    fprintf(stderr, "udp_send: sendmsg failed: %s\n", strerror(inf->err_no));
  }

  return send_len;
}

static ssize_t meq_receive(socket_info *inf, char *buf, size_t len) {
  struct iovec iov = (struct iovec){.iov_base = buf, .iov_len = len};
  char ctrl[2048];
  struct msghdr msg = (struct msghdr){.msg_control = ctrl,
                                      .msg_controllen = sizeof ctrl,
                                      .msg_name = &inf->remote,
                                      .msg_namelen = sizeof inf->remote,
                                      .msg_iov = &iov,
                                      .msg_iovlen = 1};
  ssize_t recv_len = recvmsg(inf->fd, &msg, MSG_ERRQUEUE);
  if (recv_len < 0) {
    inf->err_no = errno;
    if (errno != EAGAIN) {
      fprintf(stderr, "meq_receive: recvmsg failed: %s\n",
              strerror(inf->err_no));
    }
    return recv_len;
  }
  handle_time(&msg);

  return recv_len;
}

typedef struct {
  int64_t serialnum;

  int64_t user_time_serialnum;
  int64_t user_time;

  int64_t kernel_time_serialnum;
  int64_t kernel_time;

  size_t message_bytes;
} message_header;

static const size_t payload_max = UDP_MAX_LENGTH - sizeof(message_header);

static ssize_t generate_random_message(socket_info *inf, char *buf,
                                       size_t len) {
  if (len < sizeof(message_header)) {
    return -1;
  }
  message_header *header = (message_header *)buf;
  char *payload = (char *)(header + 1);
  size_t payload_len = (size_t)random() % (payload_max + 1);
  if (payload_len > len - sizeof(message_header)) {
    payload_len = len - sizeof(message_header);
  }
  for (size_t i = 0; i < payload_len; i++) {
    payload[i] = (char)random();
  }

  static int64_t serial_num = 0;
  *header = (message_header){
      .user_time_serialnum = inf->prev_serialnum,
      .user_time = inf->time_user.tv_sec * 1000000000L + inf->time_user.tv_usec,
      .kernel_time_serialnum = inf->prev_serialnum,
      .kernel_time =
          inf->time_kernel.tv_sec * 1000000000L + inf->time_kernel.tv_usec,
      .serialnum = serial_num,
      .message_bytes = payload_len};
  size_t total = payload_len + sizeof *header;

  printf("uts%5" PRId64 ": kt=%" PRId64 ", ut=%" PRId64 ", sn=%" PRId64
         ": s=%zu\n",
         header->user_time_serialnum, header->kernel_time, header->user_time,
         header->serialnum, total);

  inf->prev_serialnum = serial_num++;

  return (ssize_t)total;
}

static void sender_loop(char *host) {
  socket_info inf;
  int ret = setup_udp_sender(&inf, 8000, host);
  if (ret < 0) {
    return;
  }

  for (int i = 0; i < 2000; i++) {
    useconds_t t = random() % 2000000;
    usleep(t);
    char packet_buffer[4096];
    ssize_t len =
        generate_random_message(&inf, packet_buffer, sizeof packet_buffer);
    if (len < 0) {
      return;
    }
    udp_send(&inf, packet_buffer, (size_t)len);
    while (meq_receive(&inf, packet_buffer, sizeof packet_buffer) != -1) {
    }
  }
}

static void receiver_loop(void) {
  socket_info inf;
  int ret = setup_udp_receiver(&inf, 8000);
  if (ret < 0) {
    return;
  }

  for (int i = 0; i < 1000; i++) {
    char packet_buffer[4096];
    udp_receive(&inf, packet_buffer, sizeof packet_buffer);
  }
}

#define USAGE "Usage: %s [-r | -s host]\n"

int main(int argc, char *argv[]) {
  if (argc < 2) {
    fprintf(stderr, USAGE, argv[0]);
    return 0;
  }

  if (0 == strcmp(argv[1], "-s")) {
    if (argc < 3) {
      fprintf(stderr, USAGE, argv[0]);
      return 0;
    }
    sender_loop(argv[2]);
  } else if (0 == strcmp(argv[1], "-r")) {
    receiver_loop();
  } else {
    fprintf(stderr, USAGE, argv[0]);
  }
}

Exemple output:

$ ./a.out -r
level=1, type=37, len=64
timestamp: 1511196758.087209387s
timestamp: 0.000000000s
timestamp: 0.000000000s
End messages
level=1, type=37, len=64
timestamp: 1511196759.333507671s
timestamp: 0.000000000s
timestamp: 0.000000000s
End messages
$ ./a.out -s "8.8.8.8"
uts   -1: kt=238059712, ut=140918979990070, sn=0: s=482
uts    0: kt=238059712, ut=1511197522000237457, sn=1: s=132
level=1, type=37, len=64
timestamp: 0.000000000s
timestamp: 0.000000000s
timestamp: 1511197359.637050597s
level=0, type=11, len=48
errno=42, origin=4
End messages
uts    1: kt=238059712, ut=1511197523000483805, sn=2: s=1454
level=1, type=37, len=64
timestamp: 0.000000000s
timestamp: 0.000000000s
timestamp: 1511197360.883295397s
level=0, type=11, len=48
errno=42, origin=4
End messages

Live test: sender, receiver