std :: unordered_map不释放内存

kre*_*ieg 4 c++ memory-management stl unordered-map visual-c++

I'm observing odd behavior of std::unordered_map in MSVC14 (VS2015). Consider following scenario. I create an unordered map and fill it with dummy struct which consumes considerable amount of memory, lets say 1Gb, overall 100k elements inserted. Then you start to delete elements from the map. Lets say you have deleted half of elements, then, you expect half of memory being freed. Right? Wrong! I see that memory is released when number of elements in map pass some threshold, in my case it was 1443 elements.
One may say that it is malloc optimization to allocate large chunk from OS using VirtualAllocEx or HeapAlloc and actually it is not freeing memory back to system since the optimization dictate the policy and may not call HeapFree for future reuse of already allocated memory.
To eliminate this case I've employed custom allocator for allocate_shared, it didnt do the trick. So the main question is why it happens and what can be done to "compact" memory used by unordered_map?
The code

#include <windows.h>
#include <memory>
#include <vector>
#include <map>
#include <unordered_map>
#include <random>
#include <thread>
#include <iostream>
#include <allocators>

HANDLE heap = HeapCreate(0, 0, 0);
template <class Tp>
struct SimpleAllocator
{
    typedef Tp value_type;
    SimpleAllocator() noexcept
    {}
    template <typename U>
    SimpleAllocator(const SimpleAllocator<U>& other) throw()
    {};
    Tp* allocate(std::size_t n)
    {
        return static_cast<Tp*>(HeapAlloc(heap, 0, n * sizeof(Tp)));
    }
    void deallocate(Tp* p, std::size_t n)
    {
        HeapFree(heap, 0, p);
    }
};
template <class T, class U>
bool operator==(const SimpleAllocator<T>&, const SimpleAllocator<U>&)
{
    return true;
}
template <class T, class U>
bool operator!=(const SimpleAllocator<T>& a, const SimpleAllocator<U>& b)
{
    return !(a == b);
}

struct Entity
{
    Entity()
    {
        _6 = std::string("a", dis(gen));
        _7 = std::string("b", dis(gen));
        for(size_t i = 0; i < dis(gen); ++i)
        {
            _9.emplace(i, std::string("c", dis(gen)));
        }
    }
    int _1 = 1;
    int _2 = 2;
    double _3 = 3;
    double _4 = 5;
    float _5 = 3.14f;
    std::string _6 = "hello world!";
    std::string _7 = "A quick brown fox jumps over the lazy dog.";
    std::vector<unsigned long long> _8 = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0};
    std::map<long long, std::string> _9 = {{0, "a"},{1, "b"},{2, "c"},{3, "d"},{4, "e"},
    {5, "f"},{6, "g"},{7, "h"},{8, "e"},{9, "j"}};
    std::vector<double> _10{1000, 3.14};
    std::random_device rd;
    std::mt19937 gen = std::mt19937(rd());
    std::uniform_int_distribution<size_t> dis = std::uniform_int_distribution<size_t>(16, 256);
};

using Container = std::unordered_map<long long, std::shared_ptr<Entity>>;

void printContainerInfo(std::shared_ptr<Container> container)
{
    std::cout << std::chrono::system_clock::to_time_t(std::chrono::system_clock::now())
        << ", Size: " << container->size() << ", Bucket count: " << container->bucket_count()
        << ", Load factor: " << container->load_factor() << ", Max load factor: " << container->max_load_factor()
        << std::endl;
}

int main()
{
    constexpr size_t maxEntites = 100'000;
    constexpr size_t ps = 10'000;
    stdext::allocators::allocator_chunklist<Entity> _allocator;
    std::shared_ptr<Container> test = std::make_shared<Container>();
    test->reserve(maxEntites);

    for(size_t i = 0; i < maxEntites; ++i)
    {
        test->emplace(i, std::make_shared<Entity>());
    }

    std::random_device rd;
    std::mt19937 gen(rd());
    std::uniform_int_distribution<size_t> dis(0, maxEntites);
    size_t cycles = 0;
    while(test->size() > 0)
    {
        size_t counter = 0;
        std::cout << "Press any key..." << std::endl;
        std::cin.get();
        while(test->size() > 1443)
        {
            test->erase(dis(gen));
        }
        printContainerInfo(test);
        std::cout << "Press any key..." << std::endl;
        std::cin.get();
        std::cout << std::endl;
    }
    return 0;
}
Run Code Online (Sandbox Code Playgroud)

Things I tried so far: Try to rehash/resize when the load factor reaches some threshold - in the erasing while add something like this

if(test->load_factor() < 0.2)
{
    test->max_load_factor(1 / test->load_factor());
    test->rehash(test->size());
    test->reserve(test->size());
    printContainerInfo(test);
    test->max_load_factor(1);
    test->rehash(test->size());
    test->reserve(test->size());
}
Run Code Online (Sandbox Code Playgroud)

Then when it doesnt help try something silly like creating temporary container, copying/moving remaining entries, clear the original one, and copy/move back from temp to the original. Something like this

if(test->load_factor() < 0.2)
{
    Container tmp;
    std::copy(test->begin(), test->end(), std::inserter(tmp, tmp.begin()));
    test->clear();
    test.reset();
    test = std::make_shared<Container>();
    std::copy(tmp.begin(), tmp.end(), std::inserter(*test, test->begin()));
}
Run Code Online (Sandbox Code Playgroud)

最后,更换shared_ptrallocate_shared和通过SimpleAllocator实例来它。
另外,我在这里有修改STL代码,比如调用std::vector::shrink_to_fitstd::unordered_map's vector(MSVC的STL实现的unordered_map是基于listvector),它没有工作要么。

EDIT001:对于所有非信徒。以下代码与以前的代码大致相同,但使用std::vector<Entity>代替unordered_map。内存由OS回收。

#include <memory>
#include <vector>
#include <map>
#include <random>
#include <thread>
#include <iostream>

struct Entity
{
    Entity()
    {
        _6 = std::string("a", dis(gen));
        _7 = std::string("b", dis(gen));
        for(size_t i = 0; i < dis(gen); ++i)
        {
            _9.emplace(i, std::string("c", dis(gen)));
        }
    }
    int _1 = 1;
    int _2 = 2;
    double _3 = 3;
    double _4 = 5;
    float _5 = 3.14f;
    std::string _6 = "hello world!";
    std::string _7 = "A quick brown fox jumps over the lazy dog.";
    std::vector<unsigned long long> _8 = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0};
    std::map<long long, std::string> _9 = {{0, "a"}, {1, "b"}, {2, "c"}, {3, "d"}, {4, "e"},
                                           {5, "f"}, {6, "g"}, {7, "h"}, {8, "e"}, {9, "j"}};
    std::vector<double> _10{1000, 3.14};
    std::random_device rd;
    std::mt19937 gen = std::mt19937(rd());
    std::uniform_int_distribution<size_t> dis = std::uniform_int_distribution<size_t>(16, 256);
};

using Container = std::vector<std::shared_ptr<Entity>>;

void printContainerInfo(std::shared_ptr<Container> container)
{
    std::cout << std::chrono::system_clock::to_time_t(std::chrono::system_clock::now())
              << ", Size: " << container->size() << ", Capacity: " << container->capacity() << std::endl;
}

int main()
{
    constexpr size_t maxEntites = 100'000;
    constexpr size_t ps = 10'000;
    std::shared_ptr<Container> test = std::make_shared<Container>();
    test->reserve(maxEntites);

    for(size_t i = 0; i < maxEntites; ++i)
    {
        test->emplace_back(std::make_shared<Entity>());
    }

    std::random_device rd;
    std::mt19937 gen(rd());
    size_t cycles = 0;
    while(test->size() > 0)
    {
        std::uniform_int_distribution<size_t> dis(0, test->size());
        size_t counter = 0;
        while(test->size() > 0 && counter < ps)
        {
            test->pop_back();
            ++counter;
        }
        ++cycles;
        if(cycles % 7 == 0)
        {
            std::cout << "Inflating..." << std::endl;
            while(test->size() < maxEntites)
            {
                test->emplace_back(std::make_shared<Entity>());
            }
        }
        std::this_thread::sleep_for(std::chrono::seconds(1));
        printContainerInfo(test);
        std::cout << std::endl;
    }
    return 0;
}
Run Code Online (Sandbox Code Playgroud)

Dav*_*aim 5

您是正确的,但部分正确。

unordered_map在VC ++中实现C ++的方式是使用内部存储区std::vector,该存储区是存储区列表,而内部存储区std::list包含映射的节点

在图中,它看起来像这样:

buckets : [][][*][][][][][][*][][][][][][*]
               |            |            |
               |            |            | 
             ---             ------      |
             |                    |      |
             V                    V      V
elements: [1,3]->[5,7]->[7,1]->[8,11]->[10,3]->[-1,2]
Run Code Online (Sandbox Code Playgroud)

现在,当您删除节点时,它们实际上已从列表中删除,但是它并没有说明buckets数组。在达到某个阈值后,可以重新调整存储桶数组的大小(通过每个存储桶中的元素过多,或通过存储元素数量的存储桶来实现)。

太证明了我的观点,这是使用最新的VC ++编译的示例:

std::unordered_map<int, std::vector<char>> map;
for (auto i = 0; i < 1000; i++) {
    map.emplace(i, std::vector<char>(10000));
}

for (auto i = 0; i < 900; i++) {
    map.erase(i);
}
Run Code Online (Sandbox Code Playgroud)

查看调试器中的原始视图,我们看到:

+       _List   { size=100 }    std::list<std::pair<int const ,std::vector<char,std::allocator<char> > >,std::allocator<std::pair<int const ,std::vector<char,std::allocator<char> > > > >
+       _Vec    { size=2048 }   std::vector<std::_List_unchecked_iterator<std::_List_val<std::_List_simple_types<std::pair<int const ,std::vector<char,std::allocator<char> > > > > >,std::_Wrap_alloc<std::allocator<std::_List_unchecked_iterator<std::_List_val<std::_List_simple_types<std::pair<int const ,std::vector<char,std::allocator<char> > > > > > > > >
Run Code Online (Sandbox Code Playgroud)

意味着尽管我们只有100个元素,但地图保留了2048个存储桶。

因此,删除元素时不会释放所有内存。映射会保留另一部分内存来预订存储桶本身,并且该内存比元素内存更顽固。

编辑:
让我们更加疯狂

std::unordered_map<int, std::vector<char>> map;
for (auto i = 0; i < 100'000; i++) {
    map.emplace(i, std::vector<char>(10000));
}

for (auto i = 0; i < 90'000; i++) {
    map.erase(i);
}
Run Code Online (Sandbox Code Playgroud)

擦除循环结束时的结果:

+       _List   { size=10000 }  std::list<std::pair<int const ,std::vector<char,std::allocator<char> > >,std::allocator<std::pair<int const ,std::vector<char,std::allocator<char> > > > >
+       _Vec    { size=262144 } std::vector<std::_List_unchecked_iterator<std::_List_val<std::_List_simple_types<std::pair<int const ,std::vector<char,std::allocator<char> > > > > >,std::_Wrap_alloc<std::allocator<std::_List_unchecked_iterator<std::_List_val<std::_List_simple_types<std::pair<int const ,std::vector<char,std::allocator<char> > > > > > > > >
Run Code Online (Sandbox Code Playgroud)

现在,在64位上,的大小std::_List_unchecked_iterator<...>为8个字节。我们有262144个数据,因此我们保存262144 * 8 /(1024 * 1024)= 2MB的大量未使用数据。这是您看到的高内存使用率

map.rehash(1024*10)删除所有多余的节点后调用似乎有助于减少内存消耗:

+       _List   { size=10000 }  std::list<std::pair<int const ,std::vector<char,std::allocator<char> > >,std::allocator<std::pair<int const ,std::vector<char,std::allocator<char> > > > >
+       _Vec    { size=32768 }  std::vector<std::_List_unchecked_iterator<std::_List_val<std::_List_simple_types<std::pair<int const ,std::vector<char,std::allocator<char> > > > > >,std::_Wrap_alloc<std::allocator<std::_List_unchecked_iterator<std::_List_val<std::_List_simple_types<std::pair<int const ,std::vector<char,std::allocator<char> > > > > > > > >
Run Code Online (Sandbox Code Playgroud)

这是您正在寻找的解决方案。

(PS:我最近违背了我的意愿做了很多。即使在可能的情况下,在.NET中执行此类操作也将是一个活生生的麻烦。


kre*_*ieg 2

好的,在向 Microsoft 开出高级支持票后,我得到了以下答案。其中大部分我们已经知道,但有一部分我们没有考虑到。

  1. 在windows中内存以Pages的形式在堆中分配
  2. 在STL中没有任何缓存,在调用erase之后我们直接调用RtlHeapFree
  3. 你看到的是Windows如何管理Heap
  4. 一旦您将某些内容标记为删除,它可能不会返回到没有内存压力的操作系统,它可能会决定将来重新分配内存的成本比将其保留在进程中要多
  5. 这就是任何堆算法的工作原理
  6. 另一件需要考虑的事情是;如果您要删除的值恰好分布在多个页面上;除非页面内的所有值都为空,否则它将驻留在内存中
  7. 如果您非常注重立即减少私有字节,您可能必须编写自己的内存管理器,而不是依赖 Windows 堆句柄。

重点是我的。我想它回答了这个问题,或者问题就像“这就是 Windows 堆管理的工作方式”一样简单。无论如何,这个问题没有(简单)的解决方案,也许最好使用像 boost::intrusive 容器这样的东西,理论上它应该提供更好的局部性,以便 Windows 内存管理器有更好的机会将内存返回给操作系统。

UPDATE001:Boost 侵入式容器也没有达到目的。

struct Entity : public boost::intrusive::unordered_set_base_hook<>
{
    explicit Entity(size_t id)
    {
        first = id;
        _6 = std::string("a", dis(gen));
        _7 = std::string("b", dis(gen));
        for(size_t i = 0; i < dis(gen); ++i)
        {
            _9.emplace(i, std::string("c", dis(gen)));
        }
    }

    size_t first = 1;
    int _1 = 1;
    int _2 = 2;
    float _5 = 3.14f;
    double _3 = 3;
    double _4 = 5;
    std::string _6 = "hello world!";
    std::string _7 = "A quick brown fox jumps over the lazy dog.";
    std::vector<unsigned long long> _8 = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0};
    std::map<long long, std::string> _9 = {{0, "a"}, {1, "b"}, {2, "c"}, {3, "d"}, {4, "e"},
                                           {5, "f"}, {6, "g"}, {7, "h"}, {8, "e"}, {9, "j"}};
    std::vector<double> _10{1000, 3.14};
    std::random_device rd;
    std::mt19937 gen = std::mt19937(rd());
    std::uniform_int_distribution<size_t> dis = std::uniform_int_distribution<size_t>(16, 256);
};

struct first_is_key
{
    typedef size_t type;

    const type& operator()(const Entity& v) const { return v.first; }
};

using Container = boost::intrusive::unordered_set<Entity, boost::intrusive::key_of_value<first_is_key>>;

void printContainerInfo(const Container& container)
{
    std::cout << std::chrono::system_clock::to_time_t(std::chrono::system_clock::now())
              << ", Size: " << container.size() << ", Bucket count: " << container.bucket_count() << std::endl;
}

int main()
{
    constexpr size_t maxEntites = 100'000;
    Container::bucket_type* base_buckets = new Container::bucket_type[maxEntites];
    Container test(Container::bucket_traits(base_buckets, maxEntites));

    std::random_device rd;
    std::mt19937 gen(rd());
    std::uniform_int_distribution<size_t> dis;

    while(test.size() < maxEntites)
    {
        auto data = new Entity(dis(gen));
        auto res = test.insert(*data);
        if(!res.second)
        {
            delete data;
        }
    }

    printContainerInfo(test);
    while(test.size() > 0)
    {
        while(test.size() > maxEntites * 2 / 3)
        {
            test.erase_and_dispose(test.begin(), [](Entity* entity)
                                   {
                                       delete entity;
                                   });
        }

        printContainerInfo(test);
        while(test.size() < maxEntites)
        {
            auto data = new Entity(dis(gen));
            auto res = test.insert(*data);
            if(!res.second)
            {
                delete data;
            }
        }
    }
    return 0;
}
Run Code Online (Sandbox Code Playgroud)