rha*_*qtl 5 c++ benchmarking erase-remove-idiom
When it comes to removing multiple elements from a container, C++ has the "erase-remove" idiom, and there is some discussion of the alternative "resize-remove" approach, for example here. People say erase-remove is better than resize-remove, but according to my tests the latter is (slightly) faster on a vector. So should I use resize-remove when working with vectors?
Here is my benchmark code:
#include <benchmark/benchmark.h>
#include <algorithm>
#include <functional>
#include <iostream>
#include <random>
#include <vector>
using namespace std;
constexpr size_t N_ELEMS = 1000000;
constexpr int MAX_VAL = N_ELEMS / 10;
constexpr int THRESH = MAX_VAL / 5 * 3;
static vector<int> generate_input() {
    vector<int> nums(N_ELEMS);
    std::random_device rd;
    std::mt19937 gen(rd());
    std::uniform_int_distribution<> dist(0, N_ELEMS);
    std::generate(nums.begin(), nums.end(), std::bind(dist, std::ref(gen)));
    return std::move(nums);
}

static void bm_erase_remove(benchmark::State &state) {
    for (auto _ : state) {
        state.PauseTiming();
        auto nums = generate_input();
        state.ResumeTiming();
        nums.erase(std::remove_if(nums.begin(), nums.end(),
                                  [](int x) { return x < THRESH; }),
                   nums.end());
        benchmark::DoNotOptimize(nums);
    }
}
BENCHMARK(bm_erase_remove);

static void bm_resize_remove(benchmark::State &state) {
    for (auto _ : state) {
        state.PauseTiming();
        auto nums = generate_input();
        state.ResumeTiming();
        nums.resize(std::distance(
            nums.begin(), std::remove_if(nums.begin(), nums.end(),
                                         [](int x) { return x < THRESH; })));
        benchmark::DoNotOptimize(nums);
    }
}
BENCHMARK(bm_resize_remove);

BENCHMARK_MAIN();
Output:
$ g++ main.cpp -lbenchmark -O3 -pthread
$ ./a.out
2023-05-24T20:07:22+08:00
Running ./a.out
Run on (16 X 3193.91 MHz CPU s)
CPU Caches:
L1 Data 32 KiB (x8)
L1 Instruction 32 KiB (x8)
L2 Unified 512 KiB (x8)
L3 Unified 16384 KiB (x1)
Load Average: 0.16, 0.14, 0.16
-----------------------------------------------------------
Benchmark Time CPU Iterations
-----------------------------------------------------------
bm_erase_remove 822789 ns 759162 ns 838
bm_resize_remove 818217 ns 754749 ns 935
The difference is even larger with clang++:
$ clang++ main.cpp -lbenchmark -O3 -pthread
$ ./a.out
Load Average: 0.25, 0.18, 0.17
-----------------------------------------------------------
Benchmark Time CPU Iterations
-----------------------------------------------------------
bm_erase_remove 1165085 ns 1074667 ns 611
bm_resize_remove 958856 ns 884584 ns 782
Additional information:
- g++ version: 13.1.1; clang++ version: 15.0.7
- Kernel: 5.15.90.1-microsoft-standard-WSL2
- CPU: AMD Ryzen 7 6800H with Radeon Graphics

Update: interestingly, when I run the benchmarks separately (using the benchmark_filter option), the results are the same. Is this because of caching? If so, how does that caching mechanism work?
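For reference, Google Benchmark's --benchmark_filter flag takes a regex, so each benchmark can be run in isolation like this (example invocations, not part of the original output):

$ ./a.out --benchmark_filter=bm_erase_remove
$ ./a.out --benchmark_filter=bm_resize_remove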
Update (2023/5/25): if the two BENCHMARK statements are swapped, the results come out exactly the other way around.
They perform roughly the same, because std::remove_if is the only call that actually modifies the array. The remaining difference comes from the other calls: std::vector::resize adjusting the vector to the new size (std::distance only returns the new length, so it is negligible), while std::vector::erase only trims the tail of the container, which makes it slightly faster.
As @Peter Cordes pointed out, shrinking "is actually guaranteed not to reallocate": std::vector::resize does not reallocate the vector when the new size is smaller, so the difference should come from the extra moves that erase performs (vs, clang) and that resize (vs, clang) does not.
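A standalone sketch (not part of the benchmark) to convince yourself that shrinking never reallocates: both the capacity and the data pointer stay the same after a shrinking resize or an erase-to-end.

#include <cassert>
#include <vector>

int main() {
    std::vector<int> v(1000, 42);
    const auto cap = v.capacity();
    const int* p = v.data();
    v.resize(500);                      // shrink: only destroys the trailing elements
    assert(v.capacity() == cap && v.data() == p);
    v.erase(v.begin() + 100, v.end());  // erase up to end(): nothing has to be moved
    assert(v.capacity() == cap && v.data() == p);
}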
To be able to measure the difference, generate_input() needs to return the same vector for all tests. In your implementation every call returns a fresh random vector, which makes it impossible to separate run-to-run variance from changes in the input data.
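A minimal sketch of how generate_input() could be made reproducible, for example by replacing std::random_device with a fixed seed (8888 here is arbitrary) and caching the result so every benchmark sees the same data (N_ELEMS is the constant from the question's code):

#include <algorithm>
#include <random>
#include <vector>

static std::vector<int> generate_input() {
    static const std::vector<int> cached = [] {
        std::vector<int> nums(N_ELEMS);
        std::mt19937 gen(8888);  // fixed seed instead of std::random_device
        std::uniform_int_distribution<> dist(0, N_ELEMS);
        std::generate(nums.begin(), nums.end(), [&] { return dist(gen); });
        return nums;
    }();
    return cached;  // every call hands out a copy of the same values
}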
With that, @463035818_is_not_a_number raised an interesting point, but for the same reason as before there is no difference between these functions. That does not mean the same holds for a struct: there the penalty of a mispredicted branch is much larger, which makes std::remove_if itself the main target for optimization; see the Windows plot below.
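For cheap, trivially copyable elements the branch can be avoided entirely by always storing and only advancing the write index when the element is kept; this is just a sketch of the idea (not one of the benchmarked variants), and for a struct like the Filler used below the unconditional copy would itself be too expensive:

#include <cstddef>
#include <cstdint>
#include <vector>

static void branchless_remove(std::vector<uint32_t>& v, uint32_t thresh) {
    std::size_t out = 0;
    for (std::size_t i = 0; i < v.size(); ++i) {
        const uint32_t x = v[i];
        v[out] = x;            // unconditional store, no data-dependent branch
        out += (x >= thresh);  // keep elements >= thresh
    }
    v.resize(out);             // shrink to the kept prefix
}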
All the data comes from 50 runs; green is the range between the best run and the average, red is the range from the average to the worst result, to show the spread between runs.
Plots (one per machine/compiler configuration):
- i5-1155G7 + 3200 MHz RAM on manjaro, with clang-13 and -O3
- 1600X + 1067 MHz RAM on Windows 10 (22H2), with vs2022 and /O2
- 1600X + 1067 MHz RAM on Windows 10 (22H2), with clang-16 and -O3 -pthread
- 1600X + 1067 MHz RAM on ubuntu 22, with clang-13 and -O3
For vs there seems to be an optimization bug(?) that makes simple_remove perform poorly for u32 and u64.
#include <cstdint>
#include <random>
#include <cstring>
#include <vector>
#include <tuple>
#include <string> // std::string is used in benchmark_batch below
// #include <typeinfo> // For clang
#include <benchmark/benchmark.h>
#pragma warning(disable:4267) // Conversion errors
#pragma warning(disable:4244) // Conversion errors
constexpr int N_ELEMS = 500;
constexpr int RAND_SEED = 8888;
template <typename T>
struct Filler {
    T * ptr = nullptr;
    int64_t padd [sizeof(T)];
    Filler() {
        ptr = new T(0);
        memset(padd, 0, sizeof(T) * 8);
    }
    Filler(const T num){
        ptr = new T(num);
        for (int64_t& e : padd){
            e = num;
        }
    }
    Filler(const Filler& other){
        ptr = new T(*other.ptr);
        memcpy(padd, other.padd, sizeof(T) * 8);
    }
    ~Filler() {
        delete ptr;
    }
    Filler& operator=(Filler const& other){
        memcpy(padd, other.padd, sizeof(T) * 8);
        *ptr = *other.ptr;
        return *this;
    }
    inline bool operator < (const Filler& other) { return *ptr < *other.ptr; }
    inline bool operator <= (const Filler& other) { return *ptr <= *other.ptr; }
    inline bool operator > (const Filler& other) { return *ptr > *other.ptr; }
    inline bool operator >= (const Filler& other) { return *ptr >= *other.ptr; }
    inline bool operator == (const Filler& other) { return *ptr == *other.ptr; }
    inline bool operator != (const Filler& other) { return *ptr != *other.ptr; }
    inline bool operator < (const T other) { return *ptr < other; }
    inline bool operator <= (const T other) { return *ptr <= other; }
    inline bool operator > (const T other) { return *ptr > other; }
    inline bool operator >= (const T other) { return *ptr >= other; }
    inline bool operator == (const T other) { return *ptr == other; }
    inline bool operator != (const T other) { return *ptr != other; }
};
static size_t THRESH;
template <typename T>
struct Foo {
    static std::vector<T> generate_input(size_t max = 0) {
        static size_t dist_max = 0;
        static std::vector<T> nums;
        if (nums.empty() || max){
            if (max) {
                THRESH = max / 2;
                dist_max = max;
            }
            std::mt19937 gen(RAND_SEED);
            std::uniform_int_distribution<uint64_t> dist(0, dist_max);
            for (auto& n : nums = std::vector<T>(N_ELEMS)){
                n = T(dist(gen));
            }
        }
        return nums;
    }
    static void just_remove(benchmark::State &state) {
        for (auto _ : state) {
            state.PauseTiming();
            std::vector<T> nums = generate_input();
            state.ResumeTiming();
            std::ignore = std::remove_if(
                nums.begin(), nums.end(),
                [](T x) { return x < THRESH; }
            );
            benchmark::DoNotOptimize(nums);
        }
    }
    static void erase_remove(benchmark::State &state) {
        for (auto _ : state) {
            state.PauseTiming();
            std::vector<T> nums = generate_input();
            state.ResumeTiming();
            nums.erase(
                std::remove_if(
                    nums.begin(), nums.end(),
                    [](T x) { return x < THRESH; }
                ),
                nums.end()
            );
            benchmark::DoNotOptimize(nums);
        }
    }
    static void resize_remove(benchmark::State &state) {
        for (auto _ : state) {
            state.PauseTiming();
            std::vector<T> nums = generate_input();
            state.ResumeTiming();
            nums.resize(
                std::distance(
                    nums.begin(),
                    std::remove_if(
                        nums.begin(), nums.end(),
                        [](T x) { return x < THRESH; }
                    )
                )
            );
            benchmark::DoNotOptimize(nums);
        }
    }
    static void simple_remove(benchmark::State &state) {
        for (auto _ : state) {
            state.PauseTiming();
            std::vector<T> nums = generate_input();
            state.ResumeTiming();
            T * n = &nums.front();
            T * m = &nums.front();
            const T thresh = T(THRESH);
            const T * back = &nums.back();
            do {
                if (*m >= thresh){
                    *(n++) = std::move(*m);
                }
            } while (m++ < back);
            nums.resize(n - &nums.front());
            benchmark::DoNotOptimize(nums);
        }
    }
    static void simple_remove_unroll(benchmark::State &state) {
        for (auto _ : state) {
            state.PauseTiming();
            std::vector<T> nums = generate_input();
            state.ResumeTiming();
            T * n = &nums.front();
            T * m = &nums.front();
            const T thresh = T(THRESH);
            const T * back = &nums.back();
            switch (nums.size() % 4) {
                case 3:
                    if (*m >= thresh){
                        *(n++) = std::move(*(m++));
                    } else {
                        m++;
                    }
                case 2:
                    if (*m >= thresh){
                        *(n++) = std::move(*(m++));
                    } else {
                        m++;
                    }
                case 1:
                    if (*m >= thresh){
                        *(n++) = std::move(*(m++));
                    } else {
                        m++;
                    }
            }
            do {
                if (*(m + 0) >= thresh){ *(n++) = std::move(*(m + 0)); }
                if (*(m + 1) >= thresh){ *(n++) = std::move(*(m + 1)); }
                if (*(m + 2) >= thresh){ *(n++) = std::move(*(m + 2)); }
                if (*(m + 3) >= thresh){ *(n++) = std::move(*(m + 3)); }
                m += 4;
            } while (m < back);
            nums.resize(n - &nums.front());
            benchmark::DoNotOptimize(nums);
        }
    }
};
template<typename T>
void benchmark_batch(size_t max_num) {
    std::string type = typeid(T).name();
    Foo<T>::generate_input(max_num);
    benchmark::RegisterBenchmark(
        std::string("just_remove/") + type,
        Foo<T>::just_remove
    );
    benchmark::RegisterBenchmark(
        std::string("erase_remove/") + type,
        Foo<T>::erase_remove
    );
    benchmark::RegisterBenchmark(
        std::string("resize_remove/") + type,
        Foo<T>::resize_remove
    );
    benchmark::RegisterBenchmark(
        std::string("simple_remove/") + type,
        Foo<T>::simple_remove
    );
    benchmark::RegisterBenchmark(
        std::string("simple_remove_unroll/") + type,
        Foo<T>::simple_remove_unroll
    );
}
int main(int argc, char** argv) {
    benchmark_batch<uint8_t>(INT8_MAX);
    benchmark_batch<uint32_t>(INT32_MAX);
    benchmark_batch<uint64_t>(INT64_MAX);
    benchmark_batch<Filler<uint8_t>>(INT8_MAX);
    benchmark_batch<Filler<uint32_t>>(INT32_MAX);
    benchmark_batch<Filler<uint64_t>>(INT64_MAX);
    benchmark::Initialize(&argc, argv);
    benchmark::RunSpecifiedBenchmarks();
    benchmark::Shutdown();
    return 0;
}
And the code to recreate the plots:
0..49 | % { .\Release\test_cxx.exe --benchmark_min_warmup_time=0.1 --benchmark_format=csv > "./data/run_$_.csv" }
Get-ChildItem ./data | Select-Object -ExpandProperty FullName | Import-Csv | Export-Csv .\benchmark.csv -NoTypeInformation -Append
import matplotlib.ticker as tck
import matplotlib.pyplot as plt
import csv
def read_data():
    data = {}
    test_len = 0
    with open('./build/benchmark.csv') as csvfile:
        reader = csv.DictReader(csvfile)
        for row in reader:
            test_len += 1
            name = row['name']
            if not name in data:
                data[name] = {
                    'min': {
                        'iterations': float(row['iterations']),
                        'real_time': float(row['real_time']),
                        'cpu_time': float(row['cpu_time']),
                    },
                    'max': {
                        'iterations': float(row['iterations']),
                        'real_time': float(row['real_time']),
                        'cpu_time': float(row['cpu_time']),
                    },
                    'avg': {
                        'iterations': float(row['iterations']),
                        'real_time': float(row['real_time']),
                        'cpu_time': float(row['cpu_time']),
                    },
                }
            else:
                for k in ['iterations', 'real_time', 'cpu_time']:
                    data[name]['avg'][k] += float(row[k])
                    if float(row[k]) < float(data[name]['min'][k]):
                        data[name]['min'][k] = float(row[k])
                    if float(row[k]) > float(data[name]['max'][k]):
                        data[name]['max'][k] = float(row[k])
    test_len /= len(data.keys())
    for k in data:
        for kk in data[k]['avg']:
            data[k]['avg'][kk] /= test_len
    return data

def plot_data(data, key):
    labels = []
    values = {
        'max': [], 'avg': [], 'min': [],
    }
    labels_struct = []
    values_struct = {
        'max': [], 'avg': [], 'min': [],
    }
    for k in list(data.keys()):
        if 'struct' in k or '6' in k:
            labels_struct.append(k.replace('/', '\n').replace('struct ', ''))
            values_struct['min'].append(data[k]['min'][key])
            values_struct['max'].append(data[k]['max'][key])
            values_struct['avg'].append(data[k]['avg'][key])
        else:
            labels.append(k.replace('/', '\n'))
            values['min'].append(data[k]['min'][key])
            values['max'].append(data[k]['max'][key])
            values['avg'].append(data[k]['avg'][key])
    return labels, values, labels_struct, values_struct

thickness = 0.8
benckmark_value = 'iterations'
colors = ['#1dad2b', '#af1c23', '#e0e0e0']

if __name__ == '__main__':
    data = read_data()
    labels, values, labels_struct, values_struct = plot_data(data, benckmark_value)
    fig = plt.figure(layout="constrained")
    spec = fig.add_gridspec(ncols=2, nrows=1)
    int_formater = lambda x, p: format(int(x), ',')
    ax0 = fig.add_subplot(spec[0, 0])
    ax0.set_ylabel(benckmark_value)
    ax0.set_title('std::vector<T>')
    ax0.set_xticklabels(labels, rotation=90)
    ax0.get_yaxis().set_major_formatter(tck.FuncFormatter(int_formater))
    ax1 = fig.add_subplot(spec[0, 1])
    ax1.set_ylabel(benckmark_value)
    ax1.set_title('std::vector<Filler<T>>')
    ax1.set_xticklabels(labels_struct, rotation=90)
    ax1.get_yaxis().set_major_formatter(tck.FuncFormatter(int_formater))
    for i, (k, v) in enumerate(values.items()):
        ax0.bar(labels, v, thickness, color=colors[i])
    for i, (k, v) in enumerate(values_struct.items()):
        ax1.bar(labels_struct, v, thickness, color=colors[i])
    plt.show()