使用mmap重叠页面(MAP_FIXED)

fon*_*ons 13 c linux mmap libc

由于与此问题无关的一些模糊原因,我需要使用MAP_FIXED来获取靠近libc的文本部分存储在内存中的页面.

在阅读mmap(2)之前(我本来应该做的),如果我用MAP_FIXED调用mmap并且基地址与已经映射的区域重叠,那么我期望得到一个错误.

然而事实并非如此.例如,这是某些进程的/ proc/maps的一部分

7ffff7299000-7ffff744c000 r-xp 00000000 08:05 654098                     /lib/x86_64-linux-gnu/libc-2.15.so
Run Code Online (Sandbox Code Playgroud)

其中,在进行以下mmap调用之后......

  /* Request one anonymous, private, read/write/execute page at a fixed
   * address.  The address lies inside the range that the maps listing above
   * shows for libc, and MAP_FIXED makes the kernel discard the overlapped
   * part of that existing mapping instead of failing. */
  mmap(0x7ffff731b000,
       getpagesize(),
       PROT_READ | PROT_WRITE | PROT_EXEC,
       MAP_ANONYMOUS | MAP_PRIVATE | MAP_FIXED,
       0,
       0);
Run Code Online (Sandbox Code Playgroud)

... 变成:

7ffff7299000-7ffff731b000 r-xp 00000000 08:05 654098                     /lib/x86_64-linux-gnu/libc-2.15.so
7ffff731b000-7ffff731c000 rwxp 00000000 00:00 0 
7ffff731c000-7ffff744c000 r-xp 00083000 08:05 654098                     /lib/x86_64-linux-gnu/libc-2.15.so
Run Code Online (Sandbox Code Playgroud)

这意味着我用自己的页面覆盖了专用于libc的部分虚拟地址空间.显然不是我想要的......

在mmap(2)手册的MAP_FIXED部分,它清楚地说明:

如果addr和len指定的内存区域与任何现有映射的页面重叠,则将丢弃现有映射的重叠部分.

这解释了我所看到的,但我有几个问题:

  1. 有没有办法检测某些东西是否已映射到某个地址?没有访问/ proc/maps?
  2. 有没有办法在找到重叠页面的情况下强制mmap失败?

Nom*_*mal 8

  1. 使用 page = sysconf(_SC_PAGESIZE) 获取页面大小,然后把要检查的范围按页面大小逐块扫描,对每一块调用 msync(addr, page, 0)(要求 (unsigned long)addr % page == 0,即 addr 按页对齐).如果返回 -1 且 errno == ENOMEM,则该页面未被映射.

    编辑:正如下面的评论所说,mincore(addr, page, &dummy) 优于 msync().(该系统调用的实现位于 Linux 内核源代码的 mm/mincore.c 中,C 库通常只提供一个负责更新 errno 的包装函数.由于该系统调用在确认 addr 按页对齐后立即检查映射,因此在未映射(ENOMEM)的情况下是最优的.如果页面已经映射,它还会做一些额外的工作,所以如果性能至关重要,请尽量避免检查您已知已映射的页面.)

    您必须对每个页面逐一单独执行此检查,因为对于大于单个页面的区域,ENOMEM 只表示该区域没有被完全映射;它仍可能是部分映射的.映射的粒度始终是页面大小.

  2. 据我所知,没有办法让 mmap() 在区域已被映射或包含已映射页面时失败.(mremap() 也是如此,因此您不能先创建映射,再把它移动到所需的区域.)

    这意味着您面临竞争风险.最好自己执行实际的系统调用,而不是C库包装器,以防它们在内部进行内存分配或更改内存映射:

    #define _GNU_SOURCE
    #include <unistd.h>
    #include <sys/syscall.h>
    
    /* Cached system page size in bytes; 0 means "not queried yet". */
    static size_t page = 0;
    /* Return the system page size, asking sysconf() only on the first call
     * and serving every later call from the cached value in `page`. */
    static inline size_t page_size(void)
    {
        if (page == 0) {
            const long queried = sysconf(_SC_PAGESIZE);
            page = (size_t)queried;
        }
        return page;
    }
    
    
    /* msync() issued directly through syscall() rather than through the C
     * library's msync() wrapper.  Returns whatever syscall() returns,
     * narrowed to int. */
    static inline int raw_msync(void *addr, size_t length, int flags)
    {
        const long outcome = syscall(SYS_msync, addr, length, flags);
        return (int)outcome;
    }
    
    /* mmap() issued directly through syscall() rather than through the C
     * library's mmap() wrapper.  The descriptor is always -1 and the offset
     * always 0, so this is only meaningful for anonymous mappings.
     *
     * addr, length, prot and flags are passed to the kernel unchanged.
     * Returns the syscall() result converted to a pointer; syscall() reports
     * failure as -1 with errno set, which converts to (void *)-1.
     *
     * SYS_mmap2 is preferred when the architecture defines it: on i386
     * SYS_mmap is the legacy call that takes a single pointer to an argument
     * block, and some 32-bit ABIs do not provide SYS_mmap at all.  mmap2
     * differs from mmap only in the unit of the offset argument, which is
     * irrelevant for an offset of zero.
     *
     * The int arguments are widened to long explicitly because syscall() is
     * variadic and hands whole machine words to the kernel. */
    static inline void *raw_mmap(void *addr, size_t length, int prot, int flags)
    {
        long result;
    
    #if defined(SYS_mmap2)
        result = syscall(SYS_mmap2, addr, length, (long)prot, (long)flags, -1L, 0L);
    #else
        result = syscall(SYS_mmap, addr, length, (long)prot, (long)flags, -1L, 0L);
    #endif
    
        return (void *)result;
    }
    
    Run Code Online (Sandbox Code Playgroud)

但是,我怀疑无论你想做什么,你最终都需要解析/proc/self/maps.

  • 我建议完全避免使用标准 I/O(stdio.h)(因为其中的各种操作会动态分配内存,从而改变映射),而改用较低级别的 unistd.h 接口,它们影响映射的可能性要小得多.下面是一组简单粗略的函数,可以用来找出每个已映射区域以及该区域启用的保护权限(并丢弃其他信息).在实践中,它只占用大约一千字节的代码,使用的栈空间更少,因此即使在资源有限的体系结构(例如嵌入式设备)上也非常有用.

    #include <unistd.h>
    #include <fcntl.h>
    #include <errno.h>
    #include <string.h>
    
    /* Maximum number of bytes a single refill keeps in the read buffer.
     * May be overridden by defining INPUT_BUFFER before this point. */
    #ifndef   INPUT_BUFFER
    #define   INPUT_BUFFER   512
    #endif /* INPUT_BUFFER */
    
    /* Value returned by the input_*() character functions when no character
     * is available; negative, so it can never collide with a byte (0..255). */
    #ifndef   INPUT_EOF
    #define   INPUT_EOF     -256
    #endif /* INPUT_EOF */
    
    /* Bit flags stored in the mode[] array filled by read_maps(); one bit per
     * character of the permissions column ('p', 's', 'r', 'w', 'x'). */
    #define   PERM_PRIVATE  16
    #define   PERM_SHARED    8
    #define   PERM_READ      4
    #define   PERM_WRITE     2
    #define   PERM_EXEC      1
    
    /* State of one buffered, read-only input stream built on a raw file
     * descriptor (no stdio, no dynamic allocation). */
    typedef struct {
        int            descriptor;  /* open file descriptor, or -1 when closed */
        int            status;      /* 0 while usable; otherwise an errno-style code (ENODATA at EOF) */
        unsigned char *next;        /* next unread byte in buffer */
        unsigned char *ends;        /* one past the last valid byte in buffer */
        unsigned char  buffer[INPUT_BUFFER + 16]; /* read() fills at most INPUT_BUFFER bytes; the extra 16 leave room for input_back() */
    } input_buffer;
    
    /* Refill the input buffer from the descriptor.
     *
     * Unread bytes (if any) are first moved to the start of the buffer, then
     * a single read(), retried on EINTR, appends new data after them.  At most
     * INPUT_BUFFER bytes are ever held as a result of reading.
     *
     * Returns the number of new bytes, or 0 when nothing was added:
     *  - status already nonzero: nothing is attempted;
     *  - no free room in the buffer: status is left untouched, because this
     *    is neither end of input nor an error;
     *  - end of input: status is set to ENODATA;
     *  - read() failure: status is set to errno (EIO for an unexpected
     *    negative return other than -1).
    */
    static size_t input_refill(input_buffer *const input)
    {
        size_t  held;
        ssize_t n;
    
        if (input->status)
            return (size_t)0;
    
        /* Compact: slide the unread tail down to the start of the buffer. */
        if (input->next > input->buffer) {
            held = (input->ends > input->next)
                 ? (size_t)(input->ends - input->next) : (size_t)0;
            if (held > 0)
                memmove(input->buffer, input->next, held);
            input->next = input->buffer;
            input->ends = input->buffer + held;
        }
    
        /* input_back() may have grown the data up to or past INPUT_BUFFER.
         * Without this guard the read size below would be zero (read() then
         * returns 0, which would be mistaken for end of input) or would wrap
         * around to a huge value and overrun the buffer. */
        held = (size_t)(input->ends - input->buffer);
        if (held >= (size_t)INPUT_BUFFER)
            return (size_t)0;
    
        do {
            n = read(input->descriptor, input->ends, (size_t)INPUT_BUFFER - held);
        } while (n == (ssize_t)-1 && errno == EINTR);
    
        if (n > (ssize_t)0) {
            input->ends += n;
            return (size_t)n;
        }
    
        if (n == (ssize_t)0)
            input->status = ENODATA;    /* end of input */
        else
        if (n == (ssize_t)-1)
            input->status = errno;      /* genuine read error */
        else
            input->status = EIO;        /* read() returned something it never should */
    
        return (size_t)0;
    }
    
    /* Low-level getchar() equivalent.
     * Returns the next byte (0..255), refilling the buffer when it is empty,
     * or INPUT_EOF when the refill produced no data.
    */
    static inline int input_next(input_buffer *const input)
    {
        /* Only hit the descriptor when nothing buffered is left. */
        if (input->next >= input->ends && input_refill(input) == 0)
            return INPUT_EOF;
    
        return *(input->next++);
    }
    
    /* Low-level ungetc() equivalent.
     * Pushes byte c back so that it is the next byte returned by input_next().
     * Returns c on success, or INPUT_EOF if c is not a byte value (0..255) or
     * there is no room left in the buffer.
    */
    static inline int input_back(input_buffer *const input, const int c)
    {
        unsigned char *const limit = input->buffer + sizeof input->buffer;
    
        if (c < 0 || c > 255)
            return INPUT_EOF;
    
        /* Common case: there is consumed space in front of the read position. */
        if (input->next > input->buffer) {
            input->next--;
            *input->next = (unsigned char)c;
            return *input->next;
        }
    
        /* Read position is at the very start: buffered data must shift up. */
        if (input->ends >= limit)
            return INPUT_EOF;
    
        memmove(input->next + 1, input->next, (size_t)(input->ends - input->next));
        input->ends++;
        *input->next = (unsigned char)c;
        return *input->next;
    }
    
    /* Low-level fopen() equivalent (read-only).
     * Resets *input to an empty, closed state and then opens filename,
     * retrying when open() is interrupted by a signal.
     * Returns 0 on success.  On failure returns an errno code: EINVAL for a
     * null input or a null/empty filename, otherwise the errno left by open().
     * Whenever input is non-null the same code is stored in input->status.
    */
    static int input_open(input_buffer *const input, const char *const filename)
    {
        int fd;
    
        if (!input) {
            errno = EINVAL;
            return EINVAL;
        }
    
        input->descriptor = -1;
        input->status = 0;
        input->ends = input->buffer;
        input->next = input->buffer;
    
        if (!filename || filename[0] == '\0') {
            input->status = EINVAL;
            errno = EINVAL;
            return EINVAL;
        }
    
        for (;;) {
            fd = open(filename, O_RDONLY | O_NOCTTY);
            if (fd != -1 || errno != EINTR)
                break;
        }
        input->descriptor = fd;
    
        if (fd == -1) {
            input->status = errno;
            return input->status;
        }
    
        return 0;
    }
    
    /* Low-level fclose() equivalent.
     * Closes the descriptor (if open) and resets the buffer to empty.
     * Returns the final status and also stores it in errno: 0 on success
     * (reaching end of input counts as success), EINVAL if input is null,
     * otherwise the first error recorded while reading or closing.
    */
    static int input_close(input_buffer *const input)
    {
        if (!input)
            return errno = EINVAL;
    
        /* EOF is not an error; we use ENODATA for that. */
        if (input->status == ENODATA)
            input->status = 0;
    
        if (input->descriptor != -1) {
            /* close() is called exactly once and never retried.  On Linux the
             * descriptor is released even when close() fails with EINTR, so a
             * retry would either fail with EBADF or close a descriptor that
             * another thread has just been handed under the same number.  The
             * stream is read-only, so nothing can be lost by an interrupted
             * close and EINTR is not reported as an error. */
            if (close(input->descriptor) == -1 && errno != EINTR && !input->status)
                input->status = errno;
        }
    
        input->descriptor = -1;
        input->next = input->buffer;
        input->ends = input->buffer;
    
        return errno = input->status;
    }
    
    /* Parse a run of hexadecimal digits (either case) from input.
     * On entry *c is the current character; on return it is the first
     * character that is not a hex digit.  An empty run yields 0, and
     * overflow of unsigned long is not detected.
    */
    static unsigned long read_maps_hex(input_buffer *const input, int *const c)
    {
        unsigned long value = 0UL;
    
        for (;;) {
            unsigned long digit;
    
            if (*c >= '0' && *c <= '9')
                digit = (unsigned long)(*c - '0');
            else
            if (*c >= 'A' && *c <= 'F')
                digit = (unsigned long)(*c - 'A') + 10UL;
            else
            if (*c >= 'a' && *c <= 'f')
                digit = (unsigned long)(*c - 'a') + 10UL;
            else
                return value;
    
            value = (16UL * value) + digit;
            *c = input_next(input);
        }
    }
    
    /* Read /proc/self/maps, and fill in the arrays corresponding to the fields.
     *
     * n     number of elements available in each non-null array
     * ptr   receives the start address of each mapping (may be null)
     * len   receives the length in bytes of each mapping (may be null)
     * mode  receives the PERM_* bits of each mapping (may be null)
     *
     * Only the first n mappings are stored, but the function returns the
     * total number of mappings found, which may be larger than n.
     * Returns 0 with errno set on failure: the error from opening, reading or
     * closing the file, or EIO if a line does not have the expected
     * "start-end perms ..." shape.  On success errno is set to 0.
     * The file is closed on every path, including parse errors.
    */
    size_t read_maps(size_t const n,
                     void **const ptr, size_t *const len,
                     unsigned char *const mode)
    {
        input_buffer    input;
        size_t          i = 0;
        int             malformed = 0;
        int             c;
    
        errno = 0;
    
        if (input_open(&input, "/proc/self/maps"))
            return (size_t)0; /* errno already set. */
    
        c = input_next(&input);
        while (c >= 0) {
            unsigned long   curr_start, curr_end;
            unsigned char   curr_mode = 0U;
            int             in_perms = 1;
    
            /* Skip leading controls and whitespace (this also consumes the
             * newline that ended the previous line). */
            while (c >= 0 && c <= 32)
                c = input_next(&input);
    
            /* EOF? */
            if (c < 0)
                break;
    
            /* Start of address range, terminated by '-'. */
            curr_start = read_maps_hex(&input, &c);
            if (c != '-') {
                malformed = 1;
                break;
            }
            c = input_next(&input);
    
            /* End of address range, terminated by a space. */
            curr_end = read_maps_hex(&input, &c);
            if (c != ' ') {
                malformed = 1;
                break;
            }
            c = input_next(&input);
    
            /* Permissions: any run of r, w, x, s, p and '-' placeholders. */
            while (in_perms) {
                switch (c) {
                case 'r': curr_mode |= PERM_READ;    break;
                case 'w': curr_mode |= PERM_WRITE;   break;
                case 'x': curr_mode |= PERM_EXEC;    break;
                case 's': curr_mode |= PERM_SHARED;  break;
                case 'p': curr_mode |= PERM_PRIVATE; break;
                case '-':                            break;
                default:  in_perms = 0;              break;
                }
                if (in_perms)
                    c = input_next(&input);
            }
            if (c != ' ') {
                malformed = 1;
                break;
            }
            c = input_next(&input);
    
            /* Skip the rest of the line. */
            while (c >= 0 && c != '\n')
                c = input_next(&input);
    
            /* Add to arrays, if possible. */
            if (i < n) {
                if (ptr) ptr[i] = (void *)curr_start;
                if (len) len[i] = (size_t)(curr_end - curr_start);
                if (mode) mode[i] = curr_mode;
            }
            i++;
        }
    
        if (malformed) {
            /* Release the descriptor before reporting the parse error;
             * input_close() overwrites errno, so EIO is set afterwards. */
            input_close(&input);
            errno = EIO;
            return (size_t)0;
        }
    
        if (input_close(&input))
            return (size_t)0; /* errno already set. */
    
        errno = 0;
        return i;
    }
    
    Run Code Online (Sandbox Code Playgroud)

    read_maps() 函数最多读取 n 个区域:起始地址以 void * 形式存入 ptr 数组,长度存入 len 数组,权限存入 mode 数组;返回映射的总数(可能大于 n),如果发生错误则返回零并设置 errno.

    上面的低级 I/O 完全可以改为直接使用系统调用(syscall),这样就不依赖任何 C 库功能,但我认为完全没有必要.(据我所知,C 库对这些操作只是在实际系统调用外包了一层非常简单的包装.)

希望这个对你有帮助.

  • 如果已经采用地址,有一种简单的方法可以告诉`mmap`失败:省略`MAP_FIXED`.在这种情况下,它将"虚假地"成功选择与您请求的地址不同的地址.如果可以接受不同选择的地址,只需使用它即可.如果没有,请调用`munmap`并再次尝试不同的地址. (2认同)

Eli*_*nda 6

"这解释了我所看到的,但我有几个问题:"

"有没有办法检测某些东西是否已经映射到某个地址?没有访问/ proc/maps?"

是的,使用没有MAP_FIXED的mmap.

"在找到重叠页面的情况下,有没有办法强制mmap失败?"

显然不是,但只是在mmap之后使用munmap,如果mmap返回的映射不是请求的地址.

在不带 MAP_FIXED 的情况下使用时,Linux 和 Mac OS X 上的 mmap(我猜其他系统也一样)当且仅当 [address, address + length) 范围内不存在现有映射时才会遵从地址参数.因此,如果 mmap 返回的映射地址与您提供的地址不同,就可以推断该范围内已存在映射,需要换用其他范围.由于 mmap 在忽略地址参数时通常会返回位于很高地址的映射,只需用 munmap 取消该映射,然后换一个地址重试即可.

使用mincore来检查地址范围的使用不仅浪费时间(一次必须探测页面),它可能无法工作.较旧的Linux内核只会在文件映射中适当地使mincore失败.对于MAP_ANON映射,它们根本不会回答任何问题.但正如我所指出的,你需要的只是mmap和munmap.

我刚刚完成了为Smalltalk VM实现内存管理器的练习.我使用sbrk(0)找出我可以映射第一个段的第一个地址,然后使用mmap和1Mb的增量来搜索后续段的空间:

/* System page size in bytes.  Stays 0 until sqAllocateMemory() has run, which
 * is also how that function detects a second call. */
static long          pageSize = 0;
/* ~(pageSize - 1): clears the offset-within-page bits of an address or size. */
static unsigned long pageMask = 0;

/* Round v down / up to a multiple of pageSize by masking with pageMask.
 * Only meaningful once pageSize and pageMask have been initialised. */
#define roundDownToPage(v) ((v)&pageMask)
#define roundUpToPage(v) (((v)+pageSize-1)&pageMask)

/* Allocate the initial heap segment.
 *
 * Initialises pageSize/pageMask, takes the current program break (sbrk(0))
 * as the lowest candidate address, rounds it up to the larger of the page
 * size and 1 MiB, and asks
 * sqAllocateMemorySegmentOfSizeAboveAllocatedSizeInto() for a segment of
 * desiredHeapSize bytes (rounded up to whole pages) at or above that address.
 *
 * minHeapSize is accepted for interface compatibility but not consulted.
 * Returns the start of the segment.  Exits the process if called a second
 * time or if the allocation fails.
 */
void *
sqAllocateMemory(usqInt minHeapSize, usqInt desiredHeapSize)
{
    char *hint, *address, *alloc;
    unsigned long alignment;
    sqInt allocBytes;   /* must be sqInt: the callee writes through an sqInt * */

    (void)minHeapSize;

    if (pageSize) {
        fprintf(stderr, "sqAllocateMemory: already called\n");
        exit(1);
    }
    pageSize = getpagesize();
    pageMask = ~(pageSize - 1);

    hint = sbrk(0); /* the first unmapped address above existing data */

    alignment = max(pageSize,1024*1024);
    address = (char *)(((usqInt)hint + alignment - 1) & ~(alignment - 1));

    alloc = sqAllocateMemorySegmentOfSizeAboveAllocatedSizeInto
                (roundUpToPage(desiredHeapSize), address, &allocBytes);
    if (!alloc) {
        fprintf(stderr, "sqAllocateMemory: initial alloc failed!\n");
        exit(errno);
    }
    /* Return the pointer itself; the function's return type is void *. */
    return alloc;
}

/* Map an anonymous, private, read/write region of at least size bytes at or
 * above minAddress, without MAP_FIXED.
 *
 * Each candidate address is passed to mmap() as a hint.  A result is accepted
 * only if it lies at or above the hint and no more than one step (the larger
 * of the page size and 1 MiB) beyond it; otherwise the mapping is released
 * and the hint is advanced by one step.  The search ends when hint + size
 * would wrap around the address space.
 *
 * On success, answers the start of the region and stores its page-rounded
 * size through allocatedSizePointer.  Answers null if mmap() fails or no
 * acceptable address is found.
 */
void *
sqAllocateMemorySegmentOfSizeAboveAllocatedSizeInto(sqInt size, void *minAddress, sqInt *allocatedSizePointer)
{
    char *hint, *mapped;
    long length, step;

    hint = (char *)roundUpToPage((unsigned long)minAddress);
    length = roundUpToPage(size);
    step = max(pageSize,1024*1024);

    for (; (unsigned long)(hint + length) > (unsigned long)hint; hint += step) {
        mapped = mmap(hint, length, PROT_READ | PROT_WRITE,
                      MAP_ANON | MAP_PRIVATE, -1, 0);
        if (mapped == MAP_FAILED) {
            perror("sqAllocateMemorySegmentOfSizeAboveAllocatedSizeInto mmap");
            return 0;
        }

        /* Accept only a mapping that starts within [hint, hint + step]. */
        if (!(mapped < hint || mapped > hint + step)) {
            *allocatedSizePointer = length;
            return mapped;
        }

        /* The kernel placed the mapping somewhere else, so the hinted range
         * was not usable.  Give the mapping back and move on to the next
         * candidate address.
         */
        if (munmap(mapped, length) != 0)
            perror("sqAllocateMemorySegment... munmap");
    }
    return 0;
}
Run Code Online (Sandbox Code Playgroud)

这看起来效果很好,在跳过任何现有映射的同时在升序地址处分配内存.

HTH