Question (asked by jwbensley). Tags: c, sockets, performance, network-programming, circular-buffer
I am writing a traffic generator in C using the PACKET_MMAP socket option to create a ring buffer and send data over a raw socket. The ring buffer is filled with Ethernet frames to be transmitted and then sendto is called. The entire contents of the ring buffer are sent over the socket, which should give higher performance than holding a buffer in memory and calling sendto repeatedly, once for each frame in the buffer that needs to be sent.

When not using PACKET_MMAP, each sendto call copies one frame from a buffer in userland memory into an SK buf in kernel memory, then the kernel has to copy the packet into memory accessible by the NIC, and the NIC DMAs the frame into its own hardware buffer and queues it for transmission. When using the PACKET_MMAP socket option, the mmapped memory is allocated by the application and linked to the raw socket. The application places packets into the mmapped buffer and calls sendto, and instead of the kernel having to copy the packets into an SK buf it can read them directly from the mmapped buffer. "Blocks" of packets can also be read from the ring buffer instead of individual packets/frames. So the performance increase should come from one syscall to copy multiple frames, and only one copy operation per frame to get it into the NIC's hardware buffer.

When I compare the performance of a socket using PACKET_MMAP against a "normal" socket (a char buffer holding a single packet), there is no performance benefit at all. Why is this? In Tx mode with PACKET_MMAP only one frame can be placed in each ring block (rather than multiple frames per block as in Rx mode), however I am creating 256 blocks, so we should be sending 256 frames in a single sendto call, right?

Performance with PACKET_MMAP, main() calls packet_tx_mmap():
bensley@ubuntu-laptop:~/C/etherate10+$ sudo taskset -c 1 ./etherate_mt -I 1
Using inteface lo (1)
Running in Tx mode
1. Rx Gbps 0.00 (0) pps 0 Tx Gbps 17.65 (2206128128) pps 1457152
2. Rx Gbps 0.00 (0) pps 0 Tx Gbps 19.08 (2385579520) pps 1575680
3. Rx Gbps 0.00 (0) pps 0 Tx Gbps 19.28 (2409609728) pps 1591552
4. Rx Gbps 0.00 (0) pps 0 Tx Gbps 19.31 (2414260736) pps 1594624
5. Rx Gbps 0.00 (0) pps 0 Tx Gbps 19.30 (2411935232) pps 1593088
Performance without PACKET_MMAP, main() calls packet_tx():
bensley@ubuntu-laptop:~/C/etherate10+$ sudo taskset -c 1 ./etherate_mt -I 1
Using inteface lo (1)
Running in Tx mode
1. Rx Gbps 0.00 (0) pps 0 Tx Gbps 18.44 (2305001412) pps 1522458
2. Rx Gbps 0.00 (0) pps 0 Tx Gbps 20.30 (2537520018) pps 1676037
3. Rx Gbps 0.00 (0) pps 0 Tx Gbps 20.29 (2535744096) pps 1674864
4. Rx Gbps 0.00 (0) pps 0 Tx Gbps 20.26 (2533014354) pps 1673061
5. Rx Gbps 0.00 (0) pps 0 Tx Gbps 20.32 (2539476106) pps 1677329
The packet_tx() function seems to be marginally faster than the packet_tx_mmap() function, but it is also slightly shorter, so I think that minimal performance increase is simply down to packet_tx having slightly fewer lines of code. To me it looks like the two functions have practically identical performance: why is that? Why isn't PACKET_MMAP much faster, when, as I understand it, it should involve far fewer syscalls and far fewer copies?
void *packet_tx_mmap(void* thd_opt_p) {

    struct thd_opt *thd_opt = thd_opt_p;
    int32_t sock_fd = setup_socket_mmap(thd_opt_p);
    if (sock_fd == EXIT_FAILURE) exit(EXIT_FAILURE);

    struct tpacket2_hdr *hdr;
    uint8_t *data;
    int32_t send_ret = 0;
    uint16_t i;

    while(1) {

        for (i = 0; i < thd_opt->tpacket_req.tp_frame_nr; i += 1) {

            hdr = (void*)(thd_opt->mmap_buf + (thd_opt->tpacket_req.tp_frame_size * i));
            data = (uint8_t*)(hdr + TPACKET_ALIGN(TPACKET2_HDRLEN));

            memcpy(data, thd_opt->tx_buffer, thd_opt->frame_size);
            hdr->tp_len = thd_opt->frame_size;
            hdr->tp_status = TP_STATUS_SEND_REQUEST;

        }

        send_ret = sendto(sock_fd, NULL, 0, 0, NULL, 0);
        if (send_ret == -1) {
            perror("sendto error");
            exit(EXIT_FAILURE);
        }

        thd_opt->tx_pkts  += thd_opt->tpacket_req.tp_frame_nr;
        thd_opt->tx_bytes += send_ret;

    }

    return NULL;
}
Note that the function below calls setup_socket() and not setup_socket_mmap():
void *packet_tx(void* thd_opt_p) {

    struct thd_opt *thd_opt = thd_opt_p;
    int32_t sock_fd = setup_socket(thd_opt_p);

    if (sock_fd == EXIT_FAILURE) {
        printf("Can't create socket!\n");
        exit(EXIT_FAILURE);
    }

    while(1) {
        thd_opt->tx_bytes += sendto(sock_fd, thd_opt->tx_buffer,
                                    thd_opt->frame_size, 0,
                                    (struct sockaddr*)&thd_opt->bind_addr,
                                    sizeof(thd_opt->bind_addr));
        thd_opt->tx_pkts += 1;
    }
}
The only difference in the socket setup functions is pasted below; essentially it is the setup required for a PACKET_RX_RING or PACKET_TX_RING:
// Set the TPACKET version, v2 for Tx and v3 for Rx
// (v2 supports packet level send(), v3 supports block level read())
int32_t sock_pkt_ver = -1;

if (thd_opt->sk_mode == SKT_TX) {
    static const int32_t sock_ver = TPACKET_V2;
    sock_pkt_ver = setsockopt(sock_fd, SOL_PACKET, PACKET_VERSION, &sock_ver, sizeof(sock_ver));
} else {
    static const int32_t sock_ver = TPACKET_V3;
    sock_pkt_ver = setsockopt(sock_fd, SOL_PACKET, PACKET_VERSION, &sock_ver, sizeof(sock_ver));
}

if (sock_pkt_ver < 0) {
    perror("Can't set socket packet version");
    return EXIT_FAILURE;
}

memset(&thd_opt->tpacket_req,  0, sizeof(struct tpacket_req));
memset(&thd_opt->tpacket_req3, 0, sizeof(struct tpacket_req3));

//thd_opt->block_sz = 4096;       // These are set elsewhere
//thd_opt->block_nr = 256;
//thd_opt->block_frame_sz = 4096;

int32_t sock_mmap_ring = -1;

if (thd_opt->sk_mode == SKT_TX) {

    thd_opt->tpacket_req.tp_block_size = thd_opt->block_sz;
    thd_opt->tpacket_req.tp_frame_size = thd_opt->block_sz;
    thd_opt->tpacket_req.tp_block_nr   = thd_opt->block_nr;
    // Allocate per-frame blocks in Tx mode (TPACKET_V2)
    thd_opt->tpacket_req.tp_frame_nr   = thd_opt->block_nr;

    sock_mmap_ring = setsockopt(sock_fd, SOL_PACKET, PACKET_TX_RING, (void*)&thd_opt->tpacket_req, sizeof(struct tpacket_req));

} else {

    thd_opt->tpacket_req3.tp_block_size = thd_opt->block_sz;
    thd_opt->tpacket_req3.tp_frame_size = thd_opt->block_frame_sz;
    thd_opt->tpacket_req3.tp_block_nr   = thd_opt->block_nr;
    thd_opt->tpacket_req3.tp_frame_nr   = (thd_opt->block_sz * thd_opt->block_nr) / thd_opt->block_frame_sz;
    thd_opt->tpacket_req3.tp_retire_blk_tov   = 1;
    thd_opt->tpacket_req3.tp_feature_req_word = 0;

    sock_mmap_ring = setsockopt(sock_fd, SOL_PACKET, PACKET_RX_RING, (void*)&thd_opt->tpacket_req3, sizeof(thd_opt->tpacket_req3));

}

if (sock_mmap_ring == -1) {
    perror("Can't enable Tx/Rx ring for socket");
    return EXIT_FAILURE;
}

thd_opt->mmap_buf = NULL;
thd_opt->rd       = NULL;

if (thd_opt->sk_mode == SKT_TX) {

    thd_opt->mmap_buf = mmap(NULL, (thd_opt->block_sz * thd_opt->block_nr), PROT_READ | PROT_WRITE, MAP_SHARED | MAP_LOCKED | MAP_POPULATE, sock_fd, 0);

    if (thd_opt->mmap_buf == MAP_FAILED) {
        perror("mmap failed");
        return EXIT_FAILURE;
    }

} else {

    thd_opt->mmap_buf = mmap(NULL, (thd_opt->block_sz * thd_opt->block_nr), PROT_READ | PROT_WRITE, MAP_SHARED | MAP_LOCKED | MAP_POPULATE, sock_fd, 0);

    if (thd_opt->mmap_buf == MAP_FAILED) {
        perror("mmap failed");
        return EXIT_FAILURE;
    }

    // Per block rings in Rx mode (TPACKET_V3)
    thd_opt->rd = (struct iovec*)calloc(thd_opt->tpacket_req3.tp_block_nr * sizeof(struct iovec), 1);

    for (uint16_t i = 0; i < thd_opt->tpacket_req3.tp_block_nr; ++i) {
        thd_opt->rd[i].iov_base = thd_opt->mmap_buf + (i * thd_opt->tpacket_req3.tp_block_size);
        thd_opt->rd[i].iov_len  = thd_opt->tpacket_req3.tp_block_size;
    }

}
Update 1: Results against a physical interface

It was mentioned that one reason I might not be seeing a performance difference when using PACKET_MMAP is that I was sending traffic to the loopback interface (which, for one thing, doesn't have a QDISC). Since either of the packet_tx_mmap() or packet_tx() routines can generate more than 10Gbps, and I only have 10Gbps interfaces at my disposal, I have bonded two together. These are the results, which show much the same as above: minimal speed difference between the two functions.

packet_tx() to the 20G bond0:

packet_tx_mmap() to the 20G bond0:

These were with a frame size of 1514 bytes (to keep it the same as the original loopback tests above).

In all of the above tests the number of soft IRQs was roughly the same (measured using this script). With one thread running packet_tx() there were circa 40k interrupts per second on a CPU core; with 2 and 3 threads running there were 40k on 2 and 3 cores respectively. The results when using packet_tx_mmap() were the same: circa 40k soft IRQs for a single thread on one CPU core, and 40k per core when running 2 and 3 threads.
Update 2: Full source code

I have now uploaded the full source code. I am still writing this application, so it probably has plenty of flaws, but they are outside the scope of this question: https://github.com/jwbensley/EtherateMT
Answer (by Jim D.)
Many interfaces to the linux kernel are not well documented. Or even if they seem well documented, they can be pretty complex, and that can make it hard to understand what the functional, or, often even harder, the non-functional properties of the interface are.

For this reason, my advice to anyone wanting a solid understanding of a kernel API, or needing to create a high performance application using a kernel API, is that they need to be able to engage with the kernel code to be successful.
In this case the questioner wants to understand the performance characteristics of sending raw frames through a shared memory interface (packet mmap) to the kernel.

The linux documentation is here. It has a stale link to a "how to," which can now be found here and includes a copy of packet_mmap.c (I have a slightly different version available here).
The documentation is largely geared towards reading, which is the typical use case for packet mmap: efficiently reading raw frames from an interface, e.g. to obtain a packet capture from a high speed interface with little or no loss.
The OP however is interested in high performance writing, which is a much less common use case, but potentially useful for a traffic generator/simulator which appears to be what the OP wants to do with it. Thankfully, the "how to" is all about writing frames.
Even so, there is very little information provided about how this actually works, and nothing of obvious help in answering the OP's question about why using packet mmap doesn't seem to be any faster than not using it and sending one frame at a time instead.
Thankfully the kernel source is open source and well indexed, so we can turn to the source to help us get the answer to the question.
In order to find the relevant kernel code there are several keywords you could search for, but PACKET_TX_RING stands out as a socket option unique to this feature. Searching on the interwebs for "PACKET_TX_RING linux cross reference" turns up a small number of references, including af_packet.c, which with a little inspection appears to be the implementation of all the AF_PACKET functionality, including packet mmap.
Looking through af_packet.c, it appears that the core of the work for transmitting with packet mmap takes place in tpacket_snd(). But is this correct? How can we tell if this has anything to do with what we think it does?
A very powerful tool for getting information like this out of the kernel is SystemTap. (Using this requires installing debugging symbols for your kernel. I happen to be using Ubuntu, and this is a recipe for getting SystemTap working on Ubuntu.)
Once you have SystemTap working, you can use it in conjunction with packet_mmap.c to see whether tpacket_snd() is even invoked, by installing a probe on the kernel function tpacket_snd and then running packet_mmap to send a frame via a shared TX ring:
$ sudo stap -e 'probe kernel.function("tpacket_snd") { printf("W00T!\n"); }' &
[1] 19961
$ sudo ./packet_mmap -c 1 eth0
[...]
STARTING TEST:
data offset = 32 bytes
start fill() thread
send 1 packets (+150 bytes)
end of task fill()
Loop until queue empty (0)
END (number of error:0)
W00T!
W00T!
W00T! We are on to something; tpacket_snd is actually being called. But our victory will be short lived. If we continue to try to get more information out of a stock kernel build, SystemTap will complain that it can't find the variables we want to inspect and function arguments will print out with values as ? or ERROR. This is because the kernel is compiled with optimization and all of the functionality for AF_PACKET is defined in the single translation unit af_packet.c; many of the functions are inlined by the compiler, effectively losing local variables and arguments.
In order to pry more information out of af_packet.c, we are going to have to build a version of the kernel where af_packet.c is built without optimization. Look here for some guidance. I'll wait.
OK, hopefully that wasn't too hard and you have successfully booted a kernel that SystemTap can get lots of good information from. Keep in mind that this kernel version is just to help us figure out how packet mmap is working. We can't get any direct performance information from this kernel because af_packet.c was built without optimization. If it turns out that we need information on how the optimized version would behave, we can build another kernel with af_packet.c compiled with optimization, but with some instrumentation code added that exposes information via variables that won't get optimized out, so that SystemTap can see them.
So let's use it to get some information. Take a look at status.stp:
# This is specific to net/packet/af_packet.c 3.13.0-116
function print_ts() {
    ts = gettimeofday_us();
    printf("[%10d.%06d] ", ts/1000000, ts%1000000);
}
# 325 static void __packet_set_status(struct packet_sock *po, void *frame, int status)
# 326 {
# 327 union tpacket_uhdr h;
# 328
# 329 h.raw = frame;
# 330 switch (po->tp_version) {
# 331 case TPACKET_V1:
# 332 h.h1->tp_status = status;
# 333 flush_dcache_page(pgv_to_page(&h.h1->tp_status));
# 334 break;
# 335 case TPACKET_V2:
# 336 h.h2->tp_status = status;
# 337 flush_dcache_page(pgv_to_page(&h.h2->tp_status));
# 338 break;
# 339 case TPACKET_V3:
# 340 default:
# 341 WARN(1, "TPACKET version not supported.\n");
# 342 BUG();
# 343 }
# 344
# 345 smp_wmb();
# 346 }
probe kernel.statement("__packet_set_status@net/packet/af_packet.c:334") {
print_ts();
printf("SET(V1): %d (0x%.16x)\n", $status, $frame);
}
probe kernel.statement("__packet_set_status@net/packet/af_packet.c:338") {
print_ts();
printf("SET(V2): %d\n", $status);
}
# 348 static int __packet_get_status(struct packet_sock *po, void *frame)
# 349 {
# 350 union tpacket_uhdr h;
# 351
# 352 smp_rmb();
# 353
# 354 h.raw = frame;
# 355 switch (po->tp_version) {
# 356 case TPACKET_V1:
# 357 flush_dcache_page(pgv_to_page(&h.h1->tp_status));
# 358 return h.h1->tp_status;
# 359 case TPACKET_V2:
# 360 flush_dcache_page(pgv_to_page(&h.h2->tp_status));
# 361 return h.h2->tp_status;
# 362 case TPACKET_V3:
# 363 default:
# 364 WARN(1, "TPACKET version not supported.\n");
# 365 BUG();
# 366 return 0;
# 367 }
# 368 }
probe kernel.statement("__packet_get_status@net/packet/af_packet.c:358") {
print_ts();
printf("GET(V1): %d (0x%.16x)\n", $h->h1->tp_status, $frame);
}
probe kernel.statement("__packet_get_status@net/packet/af_packet.c:361") {
print_ts();
printf("GET(V2): %d\n", $h->h2->tp_status);
}
# 2088 static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
# 2089 {
# [...]
# 2136 do {
# 2137 ph = packet_current_frame(po, &po->tx_ring,
# 2138 TP_STATUS_SEND_REQUEST);
# 2139
# 2140 if (unlikely(ph == NULL)) {
# 2141 schedule();
# 2142 continue;
# 2143 }
# 2144
# 2145 status = TP_STATUS_SEND_REQUEST;
# 2146 hlen = LL_RESERVED_SPACE(dev);
# 2147 tlen = dev->needed_tailroom;
# 2148 skb = sock_alloc_send_skb(&po->sk,
# 2149 hlen + tlen + sizeof(struct sockaddr_ll),
# 2150 0, &err);
# 2151
# 2152 if (unlikely(skb == NULL))
# 2153 goto out_status;
# 2154
# 2155 tp_len = tpacket_fill_skb(po, skb, ph, dev, size_max, proto,
# 2156 addr, hlen);
# [...]
# 2176 skb->destructor = tpacket_destruct_skb;
# 2177 __packet_set_status(po, ph, TP_STATUS_SENDING);
# 2178 atomic_inc(&po->tx_ring.pending);
# 2179
# 2180 status = TP_STATUS_SEND_REQUEST;
# 2181 err = dev_queue_xmit(skb);
# 2182 if (unlikely(err > 0)) {
# [...]
# 2195 }
# 2196 packet_increment_head(&po->tx_ring);
# 2197 len_sum += tp_len;
# 2198 } while (likely((ph != NULL) ||
# 2199 ((!(msg->msg_flags & MSG_DONTWAIT)) &&
# 2200 (atomic_read(&po->tx_ring.pending))))
# 2201 );
# 2202
# [...]
# 2213 return err;
# 2214 }
probe kernel.function("tpacket_snd") {
    print_ts();
    printf("tpacket_snd: args(%s)\n", $$parms);
}

probe kernel.statement("tpacket_snd@net/packet/af_packet.c:2140") {
    print_ts();
    printf("tpacket_snd:2140: current frame ph = 0x%.16x\n", $ph);
}

probe kernel.statement("tpacket_snd@net/packet/af_packet.c:2141") {
    print_ts();
    printf("tpacket_snd:2141: (ph==NULL) --> schedule()\n");
}

probe kernel.statement("tpacket_snd@net/packet/af_packet.c:2142") {
    print_ts();
    printf("tpacket_snd:2142: flags 0x%x, pending %d\n",
           $msg->msg_flags, $po->tx_ring->pending->counter);
}

probe kernel.statement("tpacket_snd@net/packet/af_packet.c:2197") {
    print_ts();
    printf("tpacket_snd:2197: flags 0x%x, pending %d\n",
           $msg->msg_flags, $po->tx_ring->pending->counter);
}

probe kernel.statement("tpacket_snd@net/packet/af_packet.c:2213") {
    print_ts();
    printf("tpacket_snd: return(%d)\n", $err);
}
# 1946 static void tpacket_destruct_skb(struct sk_buff *skb)
# 1947 {
# 1948 struct packet_sock *po = pkt_sk(skb->sk);
# 1949 void *ph;
# 1950
# 1951 if (likely(po->tx_ring.pg_vec)) {
# 1952 __u32 ts;
# 1953
# 1954 ph = skb_shinfo(skb)->destructor_arg;
# 1955 BUG_ON(atomic_read(&po->tx_ring.pending) == 0);
# 1956 atomic_dec(&po->tx_ring.pending);
# 1957
# 1958 ts = __packet_set_timestamp(po, ph, skb);
# 1959 __packet_set_status(po, ph, TP_STATUS_AVAILABLE | ts);
# 1960 }
# 1961
# 1962 sock_wfree(skb);
# 1963 }
probe kernel.statement("tpacket_destruct_skb@net/packet/af_packet.c:1959") {
print_ts();
printf("tpacket_destruct_skb:1959: ph = 0x%.16x, ts = 0x%x, pending %d\n",
$ph, $ts, $po->tx_ring->pending->counter);
}
This defines a function (print_ts, which prints out unix epoch time with microsecond resolution) and a number of probes.
First we define probes to print out information when packets in the tx_ring have their status set or read. Next we define probes for the call and return of tpacket_snd and at points within the do {...} while (...) loop processing the packets in the tx_ring. Finally we add a probe to the skb destructor.
We can start the SystemTap script with sudo stap status.stp. Then run sudo packet_mmap -c 2 <interface> to send 2 frames through the interface. Here is the output I got from the SystemTap script:
[1492581245.839850] tpacket_snd: args(po=0xffff88016720ee38 msg=0x14)
[1492581245.839865] GET(V1): 1 (0xffff880241202000)
[1492581245.839873] tpacket_snd:2140: current frame ph = 0xffff880241202000
[1492581245.839887] SET(V1): 2 (0xffff880241202000)
[1492581245.839918] tpacket_snd:2197: flags 0x40, pending 1
[1492581245.839923] GET(V1): 1 (0xffff88013499c000)
[1492581245.839929] tpacket_snd:2140: current frame ph = 0xffff88013499c000
[1492581245.839935] SET(V1): 2 (0xffff88013499c000)
[1492581245.839946] tpacket_snd:2197: flags 0x40, pending 2
[1492581245.839951] GET(V1): 0 (0xffff88013499e000)
[1492581245.839957] tpacket_snd:2140: current frame ph = 0x0000000000000000
[1492581245.839961] tpacket_snd:2141: (ph==NULL) --> schedule()
[1492581245.839977] tpacket_snd:2142: flags 0x40, pending 2
[1492581245.839984] tpacket_snd: return(300)
[1492581245.840077] tpacket_snd: args(po=0x0 msg=0x14)
[1492581245.840089] GET(V1): 0 (0xffff88013499e000)
[1492581245.840098] tpacket_snd:2140: current frame ph = 0x0000000000000000
[1492581245.840093] tpacket_destruct_skb:1959: ph = 0xffff880241202000, ts = 0x0, pending 1
[1492581245.840102] tpacket_snd:2141: (ph==NULL) --> schedule()
[1492581245.840104] SET(V1): 0 (0xffff880241202000)
[1492581245.840112] tpacket_snd:2142: flags 0x40, pending 1
[1492581245.840116] tpacket_destruct_skb:1959: ph = 0xffff88013499c000, ts = 0x0, pending 0
[1492581245.840119] tpacket_snd: return(0)
[1492581245.840123] SET(V1): 0 (0xffff88013499c000)
And here is the network capture:
There is a lot of useful information in the SystemTap output. We can see tpacket_snd get the status of the first frame in the ring (TP_STATUS_SEND_REQUEST is 1) and then set it to TP_STATUS_SENDING (2). It does the same with the second. The next frame has status TP_STATUS_AVAILABLE (0), which is not a send request, so it calls schedule() to yield and continues the loop. Since there are no more frames to send (ph==NULL) and non-blocking was requested (MSG_DONTWAIT is set in msg->msg_flags), the do {...} while (...) loop terminates, and tpacket_snd returns 300, the number of bytes queued for transmission.
Next, packet_mmap calls sendto again (via the "loop until queue empty" code), but there is no more data to send in the tx ring and non-blocking is requested, so it immediately returns 0, as no data has been queued. Note that the frame it checked the status of is the same frame it checked last in the previous call: it did not start with the first frame in the tx ring, it checked the head (which is not available in userland).
Asynchronously, the destructor is called, first on the first frame, setting the status of the frame to TP_STATUS_AVAILABLE and decrementing the pending count, and then on the second frame. Note that if non-blocking was not requested, the test at the end of the do {...} while (...) loop will wait until all of the pending packets have been transferred to the NIC (assuming it supports scattered data) before returning. You can watch this by running packet_mmap with the -t option for "threaded" which uses blocking I/O (until it gets to "loop until queue empty").
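To make the userland side of this tp_status handshake concrete, here is a minimal sketch (my own, not the OP's code or packet_mmap.c): it assumes a TPACKET_V2 PACKET_TX_RING mapped at ring with the geometry given in req, and fill_ring_and_kick and its payload arguments are hypothetical names:

/* Sketch only: fill every available slot of a TPACKET_V2 TX ring with the
 * same payload, then kick the kernel with a single sendto(). */
#include <linux/if_packet.h>
#include <stdint.h>
#include <string.h>
#include <sys/socket.h>

static int fill_ring_and_kick(int sock_fd, uint8_t *ring,
                              const struct tpacket_req *req,
                              const uint8_t *payload, uint16_t payload_len)
{
    for (uint32_t i = 0; i < req->tp_frame_nr; i++) {
        struct tpacket2_hdr *hdr =
            (struct tpacket2_hdr *)(ring + (size_t)i * req->tp_frame_size);

        /* Only touch slots the kernel has handed back to userland. */
        if (hdr->tp_status != TP_STATUS_AVAILABLE)
            continue;

        /* Payload for a V2 TX ring starts right after the aligned header
         * (tpacket_fill_skb uses tp_hdrlen - sizeof(struct sockaddr_ll)). */
        uint8_t *data = (uint8_t *)hdr + TPACKET2_HDRLEN - sizeof(struct sockaddr_ll);
        memcpy(data, payload, payload_len);

        hdr->tp_len    = payload_len;
        hdr->tp_status = TP_STATUS_SEND_REQUEST;   /* hand the slot to the kernel */
    }

    /* One syscall flushes every slot marked TP_STATUS_SEND_REQUEST; with
     * MSG_DONTWAIT tpacket_snd returns instead of waiting for transmission. */
    return (int)sendto(sock_fd, NULL, 0, MSG_DONTWAIT, NULL, 0);
}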
A couple of things to note. First, the timestamps on the SystemTap output are not monotonically increasing: it is not safe to infer temporal ordering from SystemTap output. Second, note that the timestamps on the network capture (done locally) are different. FWIW, the interface is a cheap 1G NIC in a cheap tower computer.
So at this point, I think we more or less know how af_packet is processing the shared tx ring. What comes next is how the frames in the tx ring find their way to the network interface. It might be helpful to review this section (on how layer 2 transmission is handled) of an overview of the control flow in the linux networking kernel.
OK, so if you have a basic understanding of how layer 2 transmission is handled, it would seem like this packet mmap interface should be an enormous fire hose: load up a shared tx ring with packets, call sendto() with MSG_DONTWAIT, and then tpacket_snd will iterate through the tx queue creating skb's and enqueueing them onto the qdisc. Asynchronously, skb's will be dequeued from the qdisc and sent to the hardware tx ring. The skb's should be non-linear, so they will reference the data in the tx ring rather than copy it, and a nice modern NIC should be able to handle scattered data and reference the data in the tx ring as well. Of course, any of these assumptions could be wrong, so let's try to dump a whole lot of hurt on a qdisc with this fire hose.
But first, a not commonly understood fact about how qdiscs work. They hold a bounded amount of data (generally counted in number of frames, but in some cases it could be measured in bytes) and if you try to enqueue a frame to a full qdisc, the frame will generally be dropped (depending on what the enqueuer decides to do). So I will give out the hint that my original hypothesis was that the OP was using packet mmap to blast frames into a qdisc so fast that many were being dropped. But don't hold too fast to that idea; it takes you in a direction, but always keep an open mind. Let's give it a try to find out what happens.
First problem in trying this out is that the default qdisc pfifo_fast doesn't keep statistics. So let's replace that with the qdisc pfifo, which does. By default pfifo limits the queue to TXQUEUELEN frames (which generally defaults to 1000). But since we want to demonstrate overwhelming a qdisc, let's explicitly set it to 50:
$ sudo tc qdisc add dev eth0 root pfifo limit 50
$ tc -s -d qdisc show dev eth0
qdisc pfifo 8004: root refcnt 2 limit 50p
Sent 42 bytes 1 pkt (dropped 0, overlimits 0 requeues 0)
backlog 0b 0p requeues 0
Let's also measure how long it takes to process the frames in tpacket_snd with the SystemTap script call-return.stp:
# This is specific to net/packet/af_packet.c 3.13.0-116
function print_ts() {
    ts = gettimeofday_us();
    printf("[%10d.%06d] ", ts/1000000, ts%1000000);
}
# 2088 static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
# 2089 {
# [...]
# 2213 return err;
# 2214 }
probe kernel.function("tpacket_snd") {
    print_ts();
    printf("tpacket_snd: args(%s)\n", $$parms);
}

probe kernel.statement("tpacket_snd@net/packet/af_packet.c:2213") {
    print_ts();
    printf("tpacket_snd: return(%d)\n", $err);
}
Start the SystemTap script with sudo stap call-return.stp and then let's blast 8096 1500 byte frames into that qdisc with a meager 50 frame capacity:
$ sudo ./packet_mmap -c 8096 -s 1500 eth0
[...]
STARTING TEST:
data offset = 32 bytes
start fill() thread
send 8096 packets (+12144000 bytes)
end of task fill()
Loop until queue empty (0)
END (number of error:0)
So let's check how many packets were dropped by the qdisc:
$ tc -s -d qdisc show dev eth0
qdisc pfifo 8004: root refcnt 2 limit 50p
Sent 25755333 bytes 8606 pkt (dropped 1, overlimits 0 requeues 265)
backlog 0b 0p requeues 265
WAT? Dropped one of 8096 frames dumped onto a 50 frame qdisc? Let's check the SystemTap output:
[1492603552.938414] tpacket_snd: args(po=0xffff8801673ba338 msg=0x14)
[1492603553.036601] tpacket_snd: return(12144000)
[1492603553.036706] tpacket_snd: args(po=0x0 msg=0x14)
[1492603553.036716] tpacket_snd: return(0)
WAT? It took nearly 100ms to process 8096 frames in tpacket_snd? Let's check how long that would actually take to transmit: 8096 frames × 1500 bytes/frame × 8 bits/byte at 1 gigabit/s ≈ 97ms. WAT? It smells like something is blocking.
Let's take a closer look at tpacket_snd. Groan:
skb = sock_alloc_send_skb(&po->sk,
                          hlen + tlen + sizeof(struct sockaddr_ll),
                          0, &err);
That 0 looks pretty innocuous, but that is actually the noblock argument. It should be msg->msg_flags & MSG_DONTWAIT (it turns out this is fixed in 4.1). What is happening here is that the size of the qdisc is not the only limiting resource. If allocating space for the skb would exceed the size of the socket's sndbuf limit, then this call will either block to wait for skb's to be freed up or return -EAGAIN to a non-blocking caller. In the fix in V4.1, if non-blocking is requested it will return the number of bytes written if non-zero, otherwise -EAGAIN to the caller, which almost seems like someone doesn't want you to figure out how to use this (e.g. you fill up a tx ring with 80MB of data, call sendto with MSG_DONTWAIT, and you get back a result that you sent 150KB rather than EWOULDBLOCK).
So if you are running a kernel prior to 4.1 (I believe the OP is running >4.1 and is not affected by this bug), you will need to patch af_packet.c and build a new kernel or upgrade to a kernel 4.1 or better.
I have now booted a patched version of my kernel, since the machine I am using is running 3.13. While we won't block if the sndbuf is full, we still will return with -EAGAIN. I made some changes to packet_mmap.c to increase the default size of the sndbuf and to use SO_SNDBUFFORCE to override the system max per socket if necessary (it appears to need about 750 bytes + the frame size for each frame). I also made some additions to call-return.stp to log the sndbuf max size (sk_sndbuf), the amount used (sk_wmem_alloc), any error returned by sock_alloc_send_skb and any error returned from dev_queue_xmit on enqueuing the skb to the qdisc. Here is the new version:
# This is specific to net/packet/af_packet.c 3.13.0-116
function print_ts() {
    ts = gettimeofday_us();
    printf("[%10d.%06d] ", ts/1000000, ts%1000000);
}
# 2088 static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
# 2089 {
# [...]
# 2133 if (size_max > dev->mtu + reserve + VLAN_HLEN)
# 2134 size_max = dev->mtu + reserve + VLAN_HLEN;
# 2135
# 2136 do {
# [...]
# 2148 skb = sock_alloc_send_skb(&po->sk,
# 2149 hlen + tlen + sizeof(struct sockaddr_ll),
# 2150 msg->msg_flags & MSG_DONTWAIT, &err);
# 2151
# 2152 if (unlikely(skb == NULL))
# 2153 goto out_status;
# [...]
# 2181 err = dev_queue_xmit(skb);
# 2182 if (unlikely(err > 0)) {
# 2183 err = net_xmit_errno(err);
# 2184 if (err && __packet_get_status(po, ph) ==
# 2185 TP_STATUS_AVAILABLE) {
# 2186 /* skb was destructed already */
# 2187 skb = NULL;
# 2188 goto out_status;
# 2189 }
# 2190 /*
# 2191 * skb was dropped but not destructed yet;
# 2192 * let's treat it like congestion or err < 0
# 2193 */
# 2194 err = 0;
# 2195 }
# 2196 packet_increment_head(&po->tx_ring);
# 2197 len_sum += tp_len;
# 2198 } while (likely((ph != NULL) ||
# 2199 ((!(msg->msg_flags & MSG_DONTWAIT)) &&
# 2200 (atomic_read(&po->tx_ring.pending))))
# 2201 );
# [...]
# 2213 return err;
# 2214 }
probe kernel.function("tpacket_snd") {
    print_ts();
    printf("tpacket_snd: args(%s)\n", $$parms);
}

probe kernel.statement("tpacket_snd@net/packet/af_packet.c:2133") {
    print_ts();
    printf("tpacket_snd:2133: sk_sndbuf = %d sk_wmem_alloc = %d\n",
           $po->sk->sk_sndbuf, $po->sk->sk_wmem_alloc->counter);
}

probe kernel.statement("tpacket_snd@net/packet/af_packet.c:2153") {
    print_ts();
    printf("tpacket_snd:2153: sock_alloc_send_skb err = %d, sk_sndbuf = %d sk_wmem_alloc = %d\n",
           $err, $po->sk->sk_sndbuf, $po->sk->sk_wmem_alloc->counter);
}

probe kernel.statement("tpacket_snd@net/packet/af_packet.c:2182") {
    if ($err != 0) {
        print_ts();
        printf("tpacket_snd:2182: dev_queue_xmit err = %d\n", $err);
    }
}

probe kernel.statement("tpacket_snd@net/packet/af_packet.c:2187") {
    print_ts();
    printf("tpacket_snd:2187: destructed: net_xmit_errno = %d\n", $err);
}

probe kernel.statement("tpacket_snd@net/packet/af_packet.c:2194") {
    print_ts();
    printf("tpacket_snd:2194: *NOT* destructed: net_xmit_errno = %d\n", $err);
}

probe kernel.statement("tpacket_snd@net/packet/af_packet.c:2213") {
    print_ts();
    printf("tpacket_snd: return(%d) sk_sndbuf = %d sk_wmem_alloc = %d\n",
           $err, $po->sk->sk_sndbuf, $po->sk->sk_wmem_alloc->counter);
}
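As for the packet_mmap.c changes mentioned above (growing the sndbuf and falling back to SO_SNDBUFFORCE), they amount to roughly the following sketch, not the actual diff: grow_sndbuf is a hypothetical name, the sizing follows the rough estimate of ~750 bytes of overhead plus the frame size per frame, and SO_SNDBUFFORCE requires CAP_NET_ADMIN:

#include <stdio.h>
#include <sys/socket.h>

/* Try to grow the socket send buffer enough to cover the whole TX ring.
 * Sketch only: the sizing is the rough per-frame estimate from the text. */
static int grow_sndbuf(int sock_fd, int frame_nr, int frame_sz)
{
    int want = frame_nr * (frame_sz + 750);
    int got  = 0;
    socklen_t len = sizeof(got);

    setsockopt(sock_fd, SOL_SOCKET, SO_SNDBUF, &want, sizeof(want));
    getsockopt(sock_fd, SOL_SOCKET, SO_SNDBUF, &got, &len);  /* kernel reports ~2x the request */

    if (got < want) {
        /* SO_SNDBUF is capped by net.core.wmem_max; SO_SNDBUFFORCE
         * (CAP_NET_ADMIN) can exceed that cap, as the test output below shows. */
        if (setsockopt(sock_fd, SOL_SOCKET, SO_SNDBUFFORCE, &want, sizeof(want)) < 0) {
            perror("SO_SNDBUFFORCE");
            return -1;
        }
    }
    return 0;
}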
Let's try again:
$ sudo tc qdisc add dev eth0 root pfifo limit 50
$ tc -s -d qdisc show dev eth0
qdisc pfifo 8001: root refcnt 2 limit 50p
Sent 2154 bytes 21 pkt (dropped 0, overlimits 0 requeues 0)
backlog 0b 0p requeues 0
$ sudo ./packet_mmap -c 200 -s 1500 eth0
[...]
c_sndbuf_sz: 1228800
[...]
STARTING TEST:
data offset = 32 bytes
send buff size = 1228800
got buff size = 425984
buff size smaller than desired, trying to force...
got buff size = 2457600
start fill() thread
send: No buffer space available
end of task fill()
send: No buffer space available
Loop until queue empty (-1)
[repeated another 17 times]
send 3 packets (+4500 bytes)
Loop until queue empty (4500)
Loop until queue empty (0)
END (number of error:0)
$ tc -s -d qdisc show dev eth0
qdisc pfifo 8001: root refcnt 2 limit 50p
Sent 452850 bytes 335 pkt (dropped 19, overlimits 0 requeues 3)
backlog 0b 0p requeues 3
And here is the SystemTap output:
[1492759330.907151] tpacket_snd: args(po=0xffff880393246c38 msg=0x14)
[1492759330.907162] tpacket_snd:2133: sk_sndbuf = 2457600 sk_wmem_alloc = 1
[1492759330.907491] tpacket_snd:2182: dev_queue_xmit err = 1
[1492759330.907494] tpacket_snd:2187: destructed: net_xmit_errno = -105
[1492759330.907500] tpacket_snd: return(-105) sk_sndbuf = 2457600 sk_wmem_alloc = 218639
[1492759330.907646] tpacket_snd: args(po=0x0 msg=0x14)
[1492759330.907653] tpacket_snd:2133: sk_sndbuf = 2457600 sk_wmem_alloc = 189337
[1492759330.907688] tpacket_snd:2182: dev_queue_xmit err = 1
[1492759330.907691] tpacket_snd:2187: destructed: net_xmit_errno = -105
[1492759330.907694] tpacket_snd: return(-105) sk_sndbuf = 2457600 sk_wmem_alloc = 189337
[repeated 17 times]
[1492759330.908541] tpacket_snd: args(po=0x0 msg=0x14)
[1492759330.908543] tpacket_snd:2133: sk_sndbuf = 2457600 sk_wmem_alloc = 189337
[1492759330.908554] tpacket_snd: return(4500) sk_sndbuf = 2457600 sk_wmem_alloc = 196099
[1492759330.908570] tpacket_snd: args(po=0x0 msg=0x14)
[1492759330.908572] tpacket_snd:2133: sk_sndbuf = 2457600 sk_wmem_alloc = 196099
[1492759330.908576] tpacket_snd: return(0) sk_sndbuf = 2457600 sk_wmem_alloc = 196099
Now things are working as expected; we have fixed a bug that caused us to block if the sndbuf limit is exceeded, and we have adjusted the sndbuf limit so that it should not be a constraint, and now we see the frames from the tx ring being enqueued onto the qdisc until it is full, at which point we get returned ENOBUFS.
The next problem is how to efficiently keep publishing to the qdisc so as to keep the interface busy. Note that the implementation of packet_poll is useless in the case where we fill up the qdisc and get back ENOBUFS, because it just queries whether the head is TP_STATUS_AVAILABLE, which in this case will remain TP_STATUS_SEND_REQUEST until a subsequent call to sendto succeeds in queueing the frame to the qdisc. A simple expediency (updated in packet_mmap.c) is to loop on the sendto until success or an error other than ENOBUFS or EAGAIN.
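That expediency amounts to roughly this (a sketch of the idea only, not the actual packet_mmap.c change; flush_tx_ring is a hypothetical name):

#include <errno.h>
#include <sys/socket.h>

/* Keep kicking the kernel until everything marked TP_STATUS_SEND_REQUEST has
 * been queued: retry on ENOBUFS (qdisc full) and EAGAIN (sndbuf full), give
 * up on any other error. */
static int flush_tx_ring(int sock_fd)
{
    for (;;) {
        int n = (int)sendto(sock_fd, NULL, 0, MSG_DONTWAIT, NULL, 0);
        if (n >= 0)
            return n;                  /* bytes queued by this call */
        if (errno != ENOBUFS && errno != EAGAIN)
            return -1;                 /* a real error */
        /* The qdisc or sndbuf is still full; a poll() or short sleep here
         * would avoid spinning. */
    }
}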
Anyway, we know way more than enough to answer the OP's question now, even if we don't have a complete solution to efficiently keep the NIC from being starved.
From what we have learned, we know that when the OP calls sendto with a tx ring in blocking mode, tpacket_snd will start enqueuing skbs onto the qdisc until the sndbuf limit is exceeded (and the default is generally quite small, about 213K, and further, I discovered that frame data referenced in the shared tx ring is counted towards this), at which point it will block (while still holding pg_vec_lock). As skb's free up, more frames will be enqueued, and maybe the sndbuf will be exceeded again and we will block again. Eventually, all the data will have been queued to the qdisc, but tpacket_snd will continue to block until all of the frames have been transmitted (you can't mark a frame in the tx ring as available until the NIC has received it, as an skb in the driver ring references a frame in the tx ring), all while still holding pg_vec_lock. At this point the NIC is starved, and any other socket writers have been blocked by the lock.
On the other hand, when the OP publishes a packet at a time, it is handled by packet_snd, which will block if there is no room in the sndbuf, then enqueue the frame onto the qdisc and return immediately. It does not wait for the frame to be transmitted. As the qdisc is drained, additional frames can be enqueued. If the publisher can keep up, the NIC will never be starved.
Further, the OP copies into the tx ring for every sendto call, and compares that against passing a fixed frame buffer when not using a tx ring. Done that way you won't see a speedup from the avoided copy (although that is not the only benefit of using the tx ring).