Dmi*_*ruk 6 c# sse simd avx system.numerics
我已经实现了一种使用 .NET 中可用的 SIMD 内在函数解析长度 <= 8 的无符号整数字符串的方法,如下所示:
public unsafe static uint ParseUint(string text)
{
fixed (char* c = text)
{
var parsed = Sse3.LoadDquVector128((byte*) c);
var shift = (8 - text.Length) * 2;
var shifted = Sse2.ShiftLeftLogical128BitLane(parsed,
(byte) (shift));
Vector128<byte> digit0 = Vector128.Create((byte) '0');
var reduced = Sse2.SubtractSaturate(shifted, digit0);
var shortMult = Vector128.Create(10, 1, 10, 1, 10, 1, 10, 1);
var collapsed2 = Sse2.MultiplyAddAdjacent(reduced.As<byte, short>(), shortMult);
var repack = Sse41.PackUnsignedSaturate(collapsed2, collapsed2);
var intMult = Vector128.Create((short)0, 0, 0, 0, 100, 1, 100, 1);
var collapsed3 = Sse2.MultiplyAddAdjacent(repack.As<ushort,short>(), intMult);
var e1 = collapsed3.GetElement(2);
var e2 = collapsed3.GetElement(3);
return (uint) (e1 * 10000 + e2);
}
}
Run Code Online (Sandbox Code Playgroud)
可悲的是,与基线的比较uint.Parse()
给出了以下相当不起眼的结果:
方法 | 意思 | 错误 | 标准差 |
---|---|---|---|
基线 | 15.157 纳秒 | 0.0325 纳秒 | 0.0304 纳秒 |
解析模拟 | 3.269 纳秒 | 0.0115 纳秒 | 0.0102 纳秒 |
上面的代码有哪些可以改进的方法?我特别关注的领域是:
text.Length
MultiplyAddAdjacent
包含0
s 和1
~~的向量对UTF-16 数据进行解包GetElement()
- 也许在ToScalar()
某处可能发生某些调用?C#(感谢@aepot)
public unsafe uint ParseUint(string text)
{
fixed (char* c = text)
{
Vector128<byte> mul1 = Vector128.Create(0x14C814C8, 0x010A0A64, 0, 0).AsByte();
Vector128<short> mul2 = Vector128.Create(0x00FA61A8, 0x0001000A, 0, 0).AsInt16();
Vector128<long> shift_amount = Sse2.ConvertScalarToVector128Int32(8 - text.Length << 3).AsInt64();
Vector128<short> vs = Sse2.LoadVector128((short*)c);
Vector128<byte> vb = Sse2.PackUnsignedSaturate(vs, vs);
vb = Sse2.SubtractSaturate(vb, Vector128.Create((byte)'0'));
vb = Sse2.ShiftLeftLogical(vb.AsInt64(), shift_amount).AsByte();
Vector128<int> v = Sse2.MultiplyAddAdjacent(Ssse3.MultiplyAddAdjacent(mul1, vb.AsSByte()), mul2);
v = Sse2.Add(Sse2.Add(v, v), Sse2.Shuffle(v, 1));
return (uint)v.GetElement(0);
}
}
Run Code Online (Sandbox Code Playgroud)
使用 SSSE3 的 C 解决方案:
#include <uchar.h> // char16_t
#include <tmmintrin.h> // pmaddubsw
unsigned ParseUint(char16_t* ptr, size_t len) {
const __m128i mul1 = _mm_set_epi32(0, 0, 0x010A0A64, 0x14C814C8);
const __m128i mul2 = _mm_set_epi32(0, 0, 0x0001000A, 0x00FA61A8);
const __m128i shift_amount = _mm_cvtsi32_si128((8 - len) * 8);
__m128i v = _mm_loadu_si128((__m128i*)ptr); // unsafe chunking
v = _mm_packus_epi16(v,v); // convert digits from UTF16-LE to ASCII
v = _mm_subs_epu8(v, _mm_set1_epi8('0'));
v = _mm_sll_epi64(v, shift_amount); // shift off non-digit trash
// convert
v = _mm_madd_epi16(_mm_maddubs_epi16(mul1, v), mul2);
v = _mm_add_epi32(_mm_add_epi32(v,v), _mm_shuffle_epi32(v, 1));
return (unsigned)_mm_cvtsi128_si32(v);
}
Run Code Online (Sandbox Code Playgroud)
无论如何移动/对齐字符串(请参阅aepot 的 anwser),我们都希望远离pmulld
. SSE基本上具有 16 位整数乘法,而 32 位乘法具有双倍的延迟和微指令。但是,必须注意pmaddubsw
和的符号扩展行为pmaddwd
。
使用标量x64
:
// untested && I don't know C#
public unsafe static uint ParseUint(string text)
{
fixed (char* c = text)
{
var xmm = Sse2.LoadVector128((ushort*)c); // unsafe chunking
var packed = Sse2.PackSignedSaturate(xmm,xmm); // convert digits from UTF16-LE to ASCII
ulong val = Sse2.X64.ConvertToUInt64(packed); // extract to scalar
val -= 0x3030303030303030; // subtract '0' from each digit
val <<= ((8 - text.Length) * 8); // shift off non-digit trash
// convert
const ulong mask = 0x000000FF000000FF;
const ulong mul1 = 0x000F424000000064; // 100 + (1000000ULL << 32)
const ulong mul2 = 0x0000271000000001; // 1 + (10000ULL << 32)
val = (val * 10) + (val >> 8);
val = (((val & mask) * mul1) + (((val >> 16) & mask) * mul2)) >> 32;
return (uint)val;
}
}
Run Code Online (Sandbox Code Playgroud)
如果我们事先不知道数字的长度怎么办?
// C pseudocode & assumes ascii text
uint64_t v, m, len;
v = unaligned_load_little_endian_u64(p);
m = v + 0x4646464646464646; // roll '9' to 0x7F
v -= 0x3030303030303030; // unpacked binary coded decimal
m = (m | v) & 0x8080808080808080; // detect first non-digit
m = _tzcnt_u64(m >> 7); // count run of digits
if (((uint8_t)v) > 9) return error_not_a_number;
v <<= 64 - m; // shift off any "trailing" chars that are not digits
p += m >> 3; // consume bytes
v = parse_8_chars(v);
Run Code Online (Sandbox Code Playgroud)
或者如果我们有一个要处理的字符串列表:
// assumes ascii text
__m256i parse_uint_x4(void* base_addr, __m256i offsets_64x4)
{
const __m256i x00 = _mm256_setzero_si256();
const __m256i x0A = _mm256_set1_epi8(0x0A);
const __m256i x30 = _mm256_set1_epi8(0x30);
const __m256i x08 = _mm256_and_si256(_mm256_srli_epi32(x30, 1), x0A);
const __m256i mul1 = _mm256_set1_epi64x(0x010A0A6414C814C8);
const __m256i mul2 = _mm256_set1_epi64x(0x0001000A00FA61A8);
__m256i v, m;
// process 4 strings at once, up to 8 digits in each string...
// (the 64-bit chunks could be manually loaded using 3 shuffles)
v = _mm256_i64gather_epi64((long long*)base_addr, offsets_64x4, 1);
// rebase digits from 0x30..0x39 to 0x00..0x09
v = _mm256_xor_si256(v, x30);
// range check
// (unsigned gte compare)
v = _mm256_min_epu8(v, x0A);
m = _mm256_cmpeq_epi8(x0A, v);
// mask of lowest non-digit and above
m = _mm256_or_si256(m, _mm256_sub_epi64(x00, m));
// align the end of the digit-string to the top of the u64 lane
// (shift off masked bytes and insert leading zeros)
m = _mm256_sad_epu8(_mm256_and_si256(m, x08), x00);
v = _mm256_sllv_epi64(v, m);
// convert to binary
// (the `add(v,v)` allow us to keep `mul2` unsigned)
v = _mm256_madd_epi16(_mm256_maddubs_epi16(mul1, v), mul2);
v = _mm256_add_epi32(_mm256_shuffle_epi32(v, 0x31), _mm256_add_epi32(v,v));
// zero the hi-dwords of each qword
v = _mm256_blend_epi32(v, x00, 0xAA);
return v;
}
Run Code Online (Sandbox Code Playgroud)
我做了一些优化。
public unsafe uint ParseUint2(string text)
{
fixed (char* c = text)
{
Vector128<ushort> raw = Sse3.LoadDquVector128((ushort*)c);
raw = Sse2.ShiftLeftLogical128BitLane(raw, (byte)(8 - text.Length << 1));
Vector128<ushort> digit0 = Vector128.Create('0');
raw = Sse2.SubtractSaturate(raw, digit0);
Vector128<short> mul0 = Vector128.Create(10, 1, 10, 1, 10, 1, 10, 1);
Vector128<int> res = Sse2.MultiplyAddAdjacent(raw.AsInt16(), mul0);
Vector128<int> mul1 = Vector128.Create(1000000, 10000, 100, 1);
res = Sse41.MultiplyLow(res, mul1);
res = Ssse3.HorizontalAdd(res, res);
res = Ssse3.HorizontalAdd(res, res);
return (uint)res.GetElement(0);
}
}
Run Code Online (Sandbox Code Playgroud)
减少了类型转换和最终计算的数量vphaddd
。结果它快了大约 10%。
但是...imm8
必须是编译时常量。这意味着您不能使用变量 whereimm8
是参数。否则 JIT 编译器不会产生操作的内在指令。它会call
在这个地方创建一个外部方法(也许有一些解决方法)。感谢@PeterCordes 的帮助。
这个怪物并不显着,但比上面一个更快,无论text.Length
.
public unsafe uint ParseUint3(string text)
{
fixed (char* c = text)
{
Vector128<ushort> raw = Sse3.LoadDquVector128((ushort*)c);
switch (text.Length)
{
case 0: raw = Vector128<ushort>.Zero; break;
case 1: raw = Sse2.ShiftLeftLogical128BitLane(raw, 14); break;
case 2: raw = Sse2.ShiftLeftLogical128BitLane(raw, 12); break;
case 3: raw = Sse2.ShiftLeftLogical128BitLane(raw, 10); break;
case 4: raw = Sse2.ShiftLeftLogical128BitLane(raw, 8); break;
case 5: raw = Sse2.ShiftLeftLogical128BitLane(raw, 6); break;
case 6: raw = Sse2.ShiftLeftLogical128BitLane(raw, 4); break;
case 7: raw = Sse2.ShiftLeftLogical128BitLane(raw, 2); break;
};
Vector128<ushort> digit0 = Vector128.Create('0');
raw = Sse2.SubtractSaturate(raw, digit0);
Vector128<short> mul0 = Vector128.Create(10, 1, 10, 1, 10, 1, 10, 1);
Vector128<int> res = Sse2.MultiplyAddAdjacent(raw.AsInt16(), mul0);
Vector128<int> mul1 = Vector128.Create(1000000, 10000, 100, 1);
res = Sse41.MultiplyLow(res, mul1);
res = Ssse3.HorizontalAdd(res, res);
res = Ssse3.HorizontalAdd(res, res);
return (uint)res.GetElement(0);
}
}
Run Code Online (Sandbox Code Playgroud)
同样,@PeterCordes 不允许我编写缓慢的代码。以下版本有 2 个改进。现在加载的字符串已经移位,然后通过相同的偏移量减去移位的掩码。这避免ShiftLeftLogical128BitLane
了可变计数的缓慢回退。
第二个改进是替换vphaddd
用pshufd
+ paddd
。
// Note that this loads up to 14 bytes before the data part of the string. (Or 16 for an empty string)
// This might or might not make it possible to read from an unmapped page and fault, beware.
public unsafe uint ParseUint4(string text)
{
const string mask = "\xffff\xffff\xffff\xffff\xffff\xffff\xffff\xffff00000000";
fixed (char* c = text, m = mask)
{
Vector128<ushort> raw = Sse3.LoadDquVector128((ushort*)c - 8 + text.Length);
Vector128<ushort> mask0 = Sse3.LoadDquVector128((ushort*)m + text.Length);
raw = Sse2.SubtractSaturate(raw, mask0);
Vector128<short> mul0 = Vector128.Create(10, 1, 10, 1, 10, 1, 10, 1);
Vector128<int> res = Sse2.MultiplyAddAdjacent(raw.AsInt16(), mul0);
Vector128<int> mul1 = Vector128.Create(1000000, 10000, 100, 1);
res = Sse41.MultiplyLow(res, mul1);
Vector128<int> shuf = Sse2.Shuffle(res, 0x1b); // 0 1 2 3 => 3 2 1 0
res = Sse2.Add(shuf, res);
shuf = Sse2.Shuffle(res, 0x41); // 0 1 2 3 => 1 0 3 2
res = Sse2.Add(shuf, res);
return (uint)res.GetElement(0);
}
}
Run Code Online (Sandbox Code Playgroud)
~比初始解决方案快两倍。(o_O) 至少在我的 Haswell i7 上。
首先,5 倍的提升并不是\xe2\x80\x9crather 不起眼的\xe2\x80\x9d。
\n我不会使用标量代码执行最后一步,这里\xe2\x80\x99s 是一个替代方案:
\n// _mm_shuffle_epi32( x, _MM_SHUFFLE( 3, 3, 2, 2 ) )\ncollapsed3 = Sse2.Shuffle( collapsed3, 0xFA );\n// _mm_mul_epu32\nvar collapsed4 = Sse2.Multiply( collapsed3.As<int, uint>(), Vector128.Create( 10000u, 0, 1, 0 ) ).As<ulong, uint>();\n// _mm_add_epi32( x, _mm_srli_si128( x, 8 ) )\ncollapsed4 = Sse2.Add( collapsed4, Sse2.ShiftRightLogical128BitLane( collapsed4, 8 ) );\nreturn collapsed4.GetElement( 0 );\n
Run Code Online (Sandbox Code Playgroud)\nC++ 版本将比我的 PC (.NET Core 3.1) 上的速度快得多。生成的代码不好。他们像这样初始化常量:
\n00007FFAD10B11B6 xor ecx,ecx \n00007FFAD10B11B8 mov dword ptr [rsp+20h],ecx \n00007FFAD10B11BC mov dword ptr [rsp+28h],64h \n00007FFAD10B11C4 mov dword ptr [rsp+30h],1 \n00007FFAD10B11CC mov dword ptr [rsp+38h],64h \n00007FFAD10B11D4 mov dword ptr [rsp+40h],1 \n
Run Code Online (Sandbox Code Playgroud)\n他们使用堆栈内存而不是另一个向量寄存器。看起来 JIT 开发人员忘记了 \xe2\x80\x99re 16 个向量寄存器,完整的函数仅使用xmm0
.
00007FFAD10B1230 vmovapd xmmword ptr [rbp-0C0h],xmm0 \n00007FFAD10B1238 vmovapd xmm0,xmmword ptr [rbp-0C0h] \n00007FFAD10B1240 vpsrldq xmm0,xmm0,8 \n00007FFAD10B1245 vpaddd xmm0,xmm0,xmmword ptr [rbp-0C0h] \n
Run Code Online (Sandbox Code Playgroud)\n
归档时间: |
|
查看次数: |
417 次 |
最近记录: |