为什么内核 5.2 上的系统调用性能远低于内核 2.6?

0 linux performance

我正在测试不同内核上系统调用的性能(硬件相同):

测试代码

#define _GNU_SOURCE
#include <unistd.h>
#include <sys/syscall.h>
#include <sys/types.h>

/*
 * System-call micro-benchmark: invokes gettid through syscall() `cnt` times
 * in a tight loop and exits.  Timing is done externally (time / strace -c).
 *
 * Returns 0 unconditionally; the command-line arguments are not used.
 */
int main(int argc, char *argv[])
{
    /* Same type as the bound, so the comparison does not mix signed and
     * unsigned operands and the index can reach any value cnt can hold. */
    unsigned long i;
    unsigned long cnt = 1000000;
    pid_t tid = 0;

    (void)argc;
    (void)argv;

    for (i = 0; i < cnt; i++)
    {
        /* syscall() returns long; the thread id is narrowed to pid_t. */
        tid = (pid_t)syscall(SYS_gettid);
    }

    /* The value itself is irrelevant; only the cost of the call matters. */
    (void)tid;
    return 0;
}

Run Code Online (Sandbox Code Playgroud)

内核2.6的结果:

processor       : 3
vendor_id       : GenuineIntel
cpu family      : 6
model           : 55
model name      : Intel(R) Celeron(R) CPU  J1900  @ 1.99GHz
stepping        : 9
cpu MHz         : 2000.029
cache size      : 1024 KB
physical id     : 0
siblings        : 4
core id         : 3
cpu cores       : 4
apicid          : 6
initial apicid  : 6
fdiv_bug        : no
hlt_bug         : no
f00f_bug        : no
coma_bug        : no
fpu             : yes
fpu_exception   : yes
cpuid level     : 11
wp              : yes
flags           : fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht tm pbe nx rdtscp lm constant_tsc arch_perfmon pebs bts xtopology nonstop_tsc aperfmperf pni pclmulqdq dtes64 monitor ds_cpl vmx est tm2 ssse3 cx16 xtpr pdcm sse4_1 sse4_2 movbe popcnt lahf_lm 3dnowprefetch arat tpr_shadow vnmi flexpriority ept vpid
bogomips        : 3999.80
clflush size    : 64
cache_alignment : 64
address sizes   : 36 bits physical, 48 bits virtual
power management:

[HFOS] $ uname -a
Linux HFOS 2.6.32.10 #1 SMP Fri Sep 9 16:11:47 CST 2016 i686 pentium3 i386 GNU/Linux

[HFOS] $ strace -c ./sc
% time     seconds  usecs/call     calls    errors syscall
------ ----------- ----------- --------- --------- ----------------
99.90    0.023803           0   1000000           gettid
 0.10    0.000023           1        40        39 open
 0.00    0.000000           0         1           read
 0.00    0.000000           0         1           close
 0.00    0.000000           0         1           execve
 0.00    0.000000           0         1         1 access
 0.00    0.000000           0         1           brk
 0.00    0.000000           0         5           mmap2
 0.00    0.000000           0        39        35 stat64
 0.00    0.000000           0         1           fstat64
 0.00    0.000000           0         1           set_thread_area
------ ----------- ----------- --------- --------- ----------------
100.00    0.023826               1000091        75 total
Run Code Online (Sandbox Code Playgroud)

内核5.2的结果:

processor       : 3
vendor_id       : GenuineIntel
cpu family      : 6
model           : 55
model name      : Intel(R) Celeron(R) CPU  J1900  @ 1.99GHz
stepping        : 9
microcode       : 0x90a
cpu MHz         : 1332.848
cache size      : 1024 KB
physical id     : 0
siblings        : 4
core id         : 3
cpu cores       : 4
apicid          : 6
initial apicid  : 6
fpu             : yes
fpu_exception   : yes
cpuid level     : 11
wp              : yes
flags           : fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht tm pbe syscall nx rdtscp lm constant_tsc arch_perfmon pebs bts rep_good nopl xtopology tsc_reliable nonstop_tsc cpuid aperfmperf tsc_known_freq pni pclmulqdq dtes64 monitor ds_cpl vmx est tm2 ssse3 cx16 xtpr pdcm sse4_1 sse4_2 movbe popcnt tsc_deadline_timer rdrand lahf_lm 3dnowprefetch epb pti ibrs ibpb stibp tpr_shadow vnmi flexpriority ept vpid tsc_adjust smep erms dtherm arat
bugs            : cpu_meltdown spectre_v1 spectre_v2 mds msbds_only
bogomips        : 3998.40
clflush size    : 64
cache_alignment : 64
address sizes   : 36 bits physical, 48 bits virtual
power management:

localhost:ipc # uname -a
Linux localhost 5.2.8 #2 SMP Wed May 6 12:51:13 CST 2020 x86_64 GNU/Linux

strace -c ./sc
% time     seconds  usecs/call     calls    errors syscall
------ ----------- ----------- --------- --------- ----------------
99.99    7.964185           7   1000000           gettid
 0.00    0.000384         384         1           execve
 0.00    0.000152          21         7           mmap
 0.00    0.000084          21         4           mprotect
 0.00    0.000045          22         2           openat
 0.00    0.000031          31         1           munmap
 0.00    0.000027          13         2           fstat
 0.00    0.000021          10         2           close
 0.00    0.000020          20         1         1 access
 0.00    0.000016          16         1           read
 0.00    0.000011          11         1           brk
 0.00    0.000010          10         1           arch_prctl
------ ----------- ----------- --------- --------- ----------------
100.00    7.964986           7   1000023         1 total
Run Code Online (Sandbox Code Playgroud)

我不明白为什么它在新内核上会慢这么多。请帮帮我,非常感谢。

设置[mitigations=off]后,系统调用性能几乎相同(执行时间几乎相同,但strace时间不同)。

localhost:~ # dmesg | grep iso
[    0.006693] Kernel/User page tables isolation: disabled on command line.

Run Code Online (Sandbox Code Playgroud)

结果如下所示。

内核 2.6:

[HFOS] $ uname -a
Linux HFOS 2.6.32.10 #1 SMP Fri Sep 9 16:11:47 CST 2016 i686 pentium3 i386 GNU/Linux
[HFOS] $ cat sc.c
#define _GNU_SOURCE
#include <unistd.h>
#include <sys/syscall.h>
#include <sys/types.h>

/*
 * System-call micro-benchmark: invokes gettid through syscall() `cnt` times
 * in a tight loop and exits.  Timing is done externally (time / strace -c).
 *
 * Returns 0 unconditionally; the command-line arguments are not used.
 */
int main(int argc, char *argv[])
{
    /* Same type as the bound, so the comparison does not mix signed and
     * unsigned operands and the index can reach any value cnt can hold. */
    unsigned long i;
    unsigned long cnt = 100000000;
    pid_t tid = 0;

    (void)argc;
    (void)argv;

    for (i = 0; i < cnt; i++)
    {
        /* syscall() returns long; the thread id is narrowed to pid_t. */
        tid = (pid_t)syscall(SYS_gettid);
    }

    /* The value itself is irrelevant; only the cost of the call matters. */
    (void)tid;
    return 0;
}

[HFOS] $ time ./sc

real    0m16.736s
user    0m5.529s
sys     0m11.204s

[HFOS] $ time strace -c ./sc
% time     seconds  usecs/call     calls    errors syscall
------ ----------- ----------- --------- --------- ----------------
100.00    0.275026           0  10000000           gettid
  0.00    0.000000           0         1           read
  0.00    0.000000           0        40        39 open
  0.00    0.000000           0         1           close
  0.00    0.000000           0         1           execve
  0.00    0.000000           0         1         1 access
  0.00    0.000000           0         1           brk
  0.00    0.000000           0         5           mmap2
  0.00    0.000000           0        39        35 stat64
  0.00    0.000000           0         1           fstat64
  0.00    0.000000           0         1           set_thread_area
------ ----------- ----------- --------- --------- ----------------
100.00    0.275026              10000091        75 total

real    2m57.054s
user    0m28.704s
sys     2m27.259s



Run Code Online (Sandbox Code Playgroud)

内核 5.2:

localhost:test # uname -a
Linux localhost 5.2.8 #2 SMP Thu May 14 02:46:43 CST 2020 x86_64 GNU/Linux
localhost:test # cat sc.c
#define _GNU_SOURCE
#include <unistd.h>
#include <sys/syscall.h>
#include <sys/types.h>

/*
 * System-call micro-benchmark: invokes gettid through syscall() `cnt` times
 * in a tight loop and exits.  Timing is done externally (time / strace -c).
 *
 * Returns 0 unconditionally; the command-line arguments are not used.
 */
int main(int argc, char *argv[])
{
    /* Same type as the bound, so the comparison does not mix signed and
     * unsigned operands and the index can reach any value cnt can hold. */
    unsigned long i;
    unsigned long cnt = 100000000;
    pid_t tid = 0;

    (void)argc;
    (void)argv;

    for (i = 0; i < cnt; i++)
    {
        /* syscall() returns long; the thread id is narrowed to pid_t. */
        tid = (pid_t)syscall(SYS_gettid);
    }

    /* The value itself is irrelevant; only the cost of the call matters. */
    (void)tid;
    return 0;
}

localhost:test # time ./sc

real    0m19.043s
user    0m8.501s
sys     0m10.532s

localhost:test # time strace -c ./sc
% time     seconds  usecs/call     calls    errors syscall
------ ----------- ----------- --------- --------- ----------------
100.00   77.250398           7  10000000           gettid
  0.00    0.000405         405         1           execve
  0.00    0.000159          22         7           mmap
  0.00    0.000088          22         4           mprotect
  0.00    0.000048          24         2           openat
  0.00    0.000031          31         1           munmap
  0.00    0.000028          14         2           fstat
  0.00    0.000024          12         2           close
  0.00    0.000021          21         1         1 access
  0.00    0.000016          16         1           read
  0.00    0.000013          13         1           brk
  0.00    0.000012          12         1           arch_prctl
------ ----------- ----------- --------- --------- ----------------
100.00   77.251243           7  10000023         1 total

real    6m7.443s
user    0m55.590s
sys     6m23.482s


Run Code Online (Sandbox Code Playgroud)

但是,关闭缓解措施之后,UNIX 域套接字的性能并没有随之改善(5.2 上仍然明显慢于 2.6)。测试代码如下所示。

svr.c:

#include <unistd.h>
#include <errno.h>
#include <stdio.h>
#include <string.h>   /* strerror, memset: previously used without a prototype */
#include <ctype.h>
#include <sys/un.h>
#include <sys/types.h>
#include <sys/socket.h>
#include <arpa/inet.h>

#define TEST_SOCK_FILE  "/tmp/test.sock"

/*
 * Echo-style benchmark server on a UNIX domain stream socket.
 *
 * Binds to TEST_SOCK_FILE (removing any stale file first), then serves one
 * client at a time: for every chunk read from the client it writes back a
 * 512-byte reply and counts it.  A chunk whose first byte is 0x22 ends the
 * session, after which the count is printed and the next client is accepted.
 *
 * Returns -1 if the socket cannot be created, bound, listened on, or if
 * accept() fails; otherwise the accept loop never terminates.
 */
int main(int argc, char **argv)
{
    int fd;
    int cfd;
    int cnt;
    ssize_t r;
    char rbuf[1024];
    /* Zero-filled so the replies do not carry uninitialized stack bytes. */
    char sbuf[512] = {0};
    struct sockaddr_un svraddr;

    (void)argc;
    (void)argv;

    unlink(TEST_SOCK_FILE);

    memset(&svraddr, 0, sizeof(svraddr));
    svraddr.sun_family = AF_UNIX;
    /* Pass the path as an argument rather than as the format string. */
    snprintf(svraddr.sun_path, sizeof(svraddr.sun_path), "%s", TEST_SOCK_FILE);

    fd = socket(AF_UNIX, SOCK_STREAM, 0);
    if(fd < 0)
    {
        printf("Create socket failed : %s\n", strerror(errno));
        return -1;
    }

    if(bind(fd, (struct sockaddr *)&svraddr, sizeof(svraddr)) < 0)
    {
        printf("Bind socket failed : %s\n", strerror(errno));
        close(fd);
        return -1;
    }

    if(listen(fd, 10) < 0)
    {
        printf("Listen socket failed : %s\n", strerror(errno));
        close(fd);
        return -1;
    }

    for(;;)
    {
        /* The peer address is never inspected, so it is not requested. */
        cfd = accept(fd, NULL, NULL);
        if(cfd < 0)
        {
            printf("Accept failed : %s\n", strerror(errno));
            close(fd);
            return -1;
        }

        cnt = 0;
        for(;;)
        {
            r = read(cfd, rbuf, sizeof(rbuf));
            if(r < 0)
            {
                printf("recv failed : %s\n", strerror(errno));
                break;
            }
            if(r == 0)
            {
                /* End of stream: errno is not set by read() in this case. */
                printf("recv failed : connection closed by peer\n");
                break;
            }
            if(rbuf[0] == 0x22)
            {
                /* Stop marker sent by the client after its last request. */
                break;
            }
            r = write(cfd, sbuf, sizeof(sbuf));
            if(r <= 0)
            {
                printf("send failed : %s\n", strerror(errno));
                break;
            }
            cnt++;
        }

        printf("Recv packet : %d\n", cnt);
        close(cfd);
    }
}

Run Code Online (Sandbox Code Playgroud)

cli.c:

#include <unistd.h>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>   /* strerror, memset: previously used without a prototype */
#include <ctype.h>
#include <time.h>
#include <errno.h>
#include <sys/un.h>
#include <sys/types.h>
#include <sys/socket.h>
#include <arpa/inet.h>


/*
 * Benchmark client for the UNIX domain socket server.
 *
 * Connects to /tmp/test.sock and performs up to `cnt` request/reply round
 * trips: each round writes a 1024-byte buffer whose first byte is 0x11 and
 * reads the reply.  Afterwards a buffer starting with 0x22 is sent as the
 * stop marker, and the number of completed round trips, the elapsed wall
 * time in whole seconds and the resulting packets-per-second are printed.
 *
 * Returns -1 if the socket cannot be created or connected, 0 otherwise.
 */
int main(int argc, char **argv)
{
    int fd;
    int i;
    int cnt = 1000000;
    ssize_t r;
    struct sockaddr_un unaddr;
    time_t ts, te, tu;
    /* Zero-filled so the requests do not carry uninitialized stack bytes. */
    char sbuf[1024] = {0};
    char rbuf[512];

    (void)argc;
    (void)argv;

    memset(&unaddr, 0, sizeof(unaddr));
    unaddr.sun_family = AF_UNIX;
    /* Bounded copy; snprintf always NUL-terminates sun_path. */
    snprintf(unaddr.sun_path, sizeof(unaddr.sun_path), "%s", "/tmp/test.sock");

    /* The first argument is the address family, not the socket type. */
    fd = socket(AF_UNIX, SOCK_STREAM, 0);
    if(fd < 0)
    {
        printf("Create socket failed : %s\n", strerror(errno));
        return -1;
    }

    if(connect(fd, (struct sockaddr *)&unaddr, sizeof(unaddr)) < 0)
    {
        printf("Connect failed : %s\n", strerror(errno));
        close(fd);
        return -1;
    }

    ts = time(NULL);
    for(i = 0; i < cnt; i++)
    {
        sbuf[0] = 0x11;
        r = write(fd, sbuf, sizeof(sbuf));
        if(r <= 0)
        {
            printf("Send failed : %s\n", strerror(errno));
            break;
        }
        r = read(fd, rbuf, sizeof(rbuf));
        if(r < 0)
        {
            printf("Recv failed : %s\n", strerror(errno));
            break;
        }
        if(r == 0)
        {
            /* End of stream: errno is not set by read() in this case. */
            printf("Recv failed : connection closed by peer\n");
            break;
        }
    }

    /* Tell the server this session is over. */
    sbuf[0] = 0x22;
    if(write(fd, sbuf, sizeof(sbuf)) < 0)
    {
        printf("Send failed : %s\n", strerror(errno));
    }

    te = time(NULL);
    /* time() has one-second resolution; never divide by zero. */
    tu = te > ts ? (te - ts) : 1;
    /* i is the number of round trips actually completed (== cnt on success).
     * time_t values are cast so the arguments match the format specifiers. */
    printf("PPS(%ld) : %d packet used %ld seconds\n", (long)(i / tu), i, (long)tu);

    close(fd);
    return 0;
}

Run Code Online (Sandbox Code Playgroud)

结果如下所示:

localhost:test # uname -a
Linux localhost 5.2.8 #2 SMP Thu May 14 02:46:43 CST 2020 x86_64 GNU/Linux
localhost:test # ./svr &
[1] 955
localhost:test # ./cli
PPS(34482) : 1000000 packet used 29 seconds
Recv packet : 1000000
Run Code Online (Sandbox Code Playgroud)
[HFOS] $ uname -a
Linux HFOS 2.6.32.10 #1 SMP Fri Sep 9 16:11:47 CST 2016 i686 pentium3 i386 GNU/Linux
[HFOS] $ ./svr &
[1] 32624
[HFOS] $ ./cli
Recv packet : 1000000
PPS(71428) : 1000000 packet used 14 seconds

Run Code Online (Sandbox Code Playgroud)

Ste*_*itt 5

与 2.6.32 内核相比,5.2 内核上的系统调用性能较低的大部分原因可能是内核页表隔离和其他与安全相关的更改。KPTI 涉及在用户空间和内核中运行时使用不同的页表。因此,每个系统调用都会两次更改页表,并产生级联结果,例如 TLB 刷新(在较旧的硬件上)。

去年发表了一篇有趣的论文,跟踪 Linux 内核的性能变化;这篇博文中对其进行了详细描述,并且该论文本身可以在 ACM DL 上获得(并且在 6 月底之前可以公开访问)。