cat*_*hei 7 c# generics performance
我在实现无分配 Linq 时遇到了有趣的案例。当我稍微更改代码时,基准测试时间突然增加,我认为更改是微不足道的。我将范围缩小到这个测试代码:
/// <summary>
/// Benchmarks <c>CompareSum</c> when its comparer is the single-parameter
/// <c>Wrap&lt;int&gt;</c> struct built around the default int comparer.
/// </summary>
/// <returns>The value produced by <c>Calc</c> for a count of 10000.</returns>
[Benchmark]
public int ComparingGenericArgument1() =>
    CompareSum<Wrap<int>>.Calc(10000, new Wrap<int>(Comparer<int>.Default));
/// <summary>
/// Benchmarks <c>CompareSum</c> when its comparer is the two-parameter
/// <c>Wrap&lt;int, Comparer&lt;int&gt;&gt;</c> struct built around the default int comparer.
/// </summary>
/// <returns>The value produced by <c>Calc</c> for a count of 10000.</returns>
[Benchmark]
public int ComparingGenericArgument2() =>
    CompareSum<Wrap<int, Comparer<int>>>.Calc(
        10000,
        new Wrap<int, Comparer<int>>(Comparer<int>.Default));
/// <summary>
/// Accumulates comparer results over a square grid of integer pairs.
/// </summary>
/// <typeparam name="TComparer">A value-type comparer of ints.</typeparam>
static class CompareSum<TComparer> where TComparer : struct, IComparer<int>
{
    /// <summary>
    /// Adds up <c>comparer.Compare(i, j)</c> for every i and j in the range
    /// from 0 up to, but not including, <paramref name="count"/>.
    /// </summary>
    /// <param name="count">Exclusive upper bound for both loop indices.</param>
    /// <param name="comparer">The comparer invoked for each pair.</param>
    /// <returns>The accumulated sum of all comparison results.</returns>
    public static int Calc(int count, TComparer comparer)
    {
        int total = 0;
        int outer = 0;
        while (outer < count)
        {
            int inner = 0;
            while (inner < count)
            {
                total += comparer.Compare(outer, inner);
                ++inner;
            }
            ++outer;
        }
        return total;
    }
}
/// <summary>
/// Struct comparer that forwards every comparison to an
/// <see cref="IComparer{T}"/> held through the interface type.
/// </summary>
/// <typeparam name="T">The type of the values being compared.</typeparam>
struct Wrap<T> : IComparer<T>
{
    // The wrapped comparer; all Compare calls are delegated to it.
    IComparer<T> _inner;

    /// <summary>Wraps the given comparer.</summary>
    /// <param name="comparer">The comparer to delegate to.</param>
    public Wrap(IComparer<T> comparer) => _inner = comparer;

    /// <summary>Returns the wrapped comparer's result for the two values.</summary>
    public int Compare(T x, T y) => _inner.Compare(x, y);
}
/// <summary>
/// Struct comparer that forwards every comparison to a wrapped comparer whose
/// concrete type is supplied as a generic argument.
/// </summary>
/// <typeparam name="T">The type of the values being compared.</typeparam>
/// <typeparam name="TComparer">The concrete type of the wrapped comparer.</typeparam>
struct Wrap<T, TComparer> : IComparer<T>
    where TComparer : IComparer<T>
{
    // The wrapped comparer; all Compare calls are delegated to it.
    TComparer _inner;

    /// <summary>Wraps the given comparer.</summary>
    /// <param name="comparer">The comparer to delegate to.</param>
    public Wrap(TComparer comparer) => _inner = comparer;

    /// <summary>Returns the wrapped comparer's result for the two values.</summary>
    public int Compare(T x, T y) => _inner.Compare(x, y);
}
Run Code Online (Sandbox Code Playgroud)
当我运行上面的基准测试时,结果如下:
// * Summary *
BenchmarkDotNet=v0.13.2, OS=Windows 10 (10.0.19043.2130/21H1/May2021Update)
AMD Ryzen 5 3600, 1 CPU, 12 logical and 6 physical cores
.NET SDK=6.0.101
[Host] : .NET 6.0.1 (6.0.121.56705), X64 RyuJIT AVX2
DefaultJob : .NET 6.0.1 (6.0.121.56705), X64 RyuJIT AVX2
| Method | Mean | Error | StdDev | Allocated |
|-------------------------- |---------:|--------:|--------:|----------:|
| ComparingGenericArgument1 | 254.2 ms | 3.72 ms | 3.29 ms | 1804 B |
| ComparingGenericArgument2 | 458.0 ms | 5.04 ms | 4.71 ms | 480 B |
Run Code Online (Sandbox Code Playgroud)
几乎翻了一番。什么?
当我直接调用 Wrap<T> 或 Wrap<T1, T2> 时,性能是相同的。但是,当我通过包装类使其成为嵌套泛型时,性能突然下降。无论使用 Comparer<int> 还是 IComparer<int>,这种性能下降都会稳定出现。
我将代码放入 SharpLab 来检查生成的汇编代码,嵌套的泛型版本显然使其变得更大。但我不是汇编专家,所以无法获得足够的洞察力。这是简化版本 return comparer.Compare(0, count) 的结果:
C+CompareSum`1[[C+Wrap`1[[System.Int32, System.Private.CoreLib]], _]].Calc(Int32, Wrap`1<Int32>)
L0000: push ebp
L0001: mov ebp, esp
L0003: push ecx
L0004: mov ecx, [ebp+8]
L0007: xor edx, edx
L0009: call dword ptr [0x10d87008]
L000f: pop ebp
L0010: ret 4
C+CompareSum`1[[C+Wrap`2[[System.Int32, System.Private.CoreLib],[System.__Canon, System.Private.CoreLib]], _]].Calc(Int32, Wrap`2<Int32,System.__Canon>)
L0000: push ebp
L0001: mov ebp, esp
L0003: push esi
L0004: push eax
L0005: mov [ebp-8], edx
L0008: mov esi, ecx
L000a: mov ecx, [edx+0x20]
L000d: mov ecx, [ecx]
L000f: mov eax, [ecx+8]
L0012: test eax, eax
L0014: je short L0018
L0016: jmp short L0024
L0018: mov ecx, edx
L001a: mov edx, 0x10d8d530
L001f: call 0x0f429aa0
L0024: push esi
L0025: lea ecx, [ebp+8]
L0028: xor edx, edx
L002a: call eax
L002c: pop ecx
L002d: pop esi
L002e: pop ebp
L002f: ret 4
Run Code Online (Sandbox Code Playgroud)
那么,当我使用嵌套泛型类型时,是什么造成了如此大的差异呢?我希望分享一些知识。谢谢你!
该问题是由以下泛型实现引起的:
static class CompareSum<TComparer> where TComparer : struct, IComparer<int>
{
public static int Calc(int count, TComparer comparer)
Run Code Online (Sandbox Code Playgroud)
此定义会导致生成以下 IL:
.method public hidebysig static
int32 Calc (
int32 count,
!TComparer comparer
) cil managed
{
// Method begins at RVA 0x20d0
// Code size 46 (0x2e)
.maxstack 4
.locals init (
[0] int32 sum,
[1] int32 i,
[2] int32 j
)
IL_0000: ldc.i4.0
IL_0001: stloc.0
IL_0002: ldc.i4.0
IL_0003: stloc.1
// sequence point: hidden
IL_0004: br.s IL_0028
// loop start (head: IL_0028)
IL_0006: ldc.i4.0
IL_0007: stloc.2
// sequence point: hidden
IL_0008: br.s IL_0020
// loop start (head: IL_0020)
IL_000a: ldloc.0
IL_000b: ldarga.s comparer
IL_000d: ldloc.1
IL_000e: ldloc.2
IL_000f: constrained. !TComparer
IL_0015: callvirt instance int32 class [System.Runtime]System.Collections.Generic.IComparer`1<int32>::Compare(!0, !0)
IL_001a: add
IL_001b: stloc.0
IL_001c: ldloc.2
IL_001d: ldc.i4.1
IL_001e: add
IL_001f: stloc.2
IL_0020: ldloc.2
IL_0021: ldarg.0
IL_0022: blt.s IL_000a
// end loop
IL_0024: ldloc.1
IL_0025: ldc.i4.1
IL_0026: add
IL_0027: stloc.1
IL_0028: ldloc.1
IL_0029: ldarg.0
IL_002a: blt.s IL_0006
// end loop
IL_002c: ldloc.0
IL_002d: ret
} // end of method CompareSum`1::Calc
Run Code Online (Sandbox Code Playgroud)
关键说明是:
IL_000f: constrained. !TComparer
IL_0015: callvirt instance int32 class [System.Runtime]System.Collections.Generic.IComparer`1<int32>::Compare(!0, !0)
Run Code Online (Sandbox Code Playgroud)
该constrained指令有一些非常有趣的行为:
当 callvirt 方法指令带有 constrained thisType 前缀时,该指令将按如下方式执行:
如果 thisType 是引用类型(而不是值类型),则 ptr 被取消引用并作为“this”指针传递给方法的 callvirt。
如果 thisType 是值类型并且 thisType 实现方法,则 ptr 未经修改地作为“this”指针传递到调用方法指令,以便通过 thisType 实现方法。
如果 thisType 是值类型并且 thisType 未实现方法,则 ptr 将被取消引用、装箱,并作为“this”指针传递给 callvirt 方法指令。
在第一种情况 ( Wrap<T>) 中,似乎缺少附加引用类型参数导致出现第二个要点,并且指针未经修改地传递。
在第二种情况(Wrap<T, TComparer>)中,它似乎调用了要点三,导致装箱和性能回归。
有趣的是,如果将实现更改为以下内容:
/// <summary>
/// Variant of the comparison-sum helper whose <c>Calc</c> accepts the comparer
/// through the <see cref="IComparer{T}"/> interface instead of as <c>TComparer</c>.
/// </summary>
/// <typeparam name="TComparer">A value-type comparer of ints (unused by <c>Calc</c>'s signature).</typeparam>
static class CompareSum<TComparer> where TComparer : struct, IComparer<int>
{
    /// <summary>
    /// Adds up <c>comparer.Compare(i, j)</c> for every i and j in the range
    /// from 0 up to, but not including, <paramref name="count"/>.
    /// </summary>
    /// <param name="count">Exclusive upper bound for both loop indices.</param>
    /// <param name="comparer">The comparer invoked for each pair.</param>
    /// <returns>The accumulated sum of all comparison results.</returns>
    public static int Calc(int count, IComparer<int> comparer)
    {
        int total = 0;
        int outer = 0;
        while (outer < count)
        {
            int inner = 0;
            while (inner < count)
            {
                total += comparer.Compare(outer, inner);
                ++inner;
            }
            ++outer;
        }
        return total;
    }
}
Run Code Online (Sandbox Code Playgroud)
受约束的调用消失了,尽管由于调用 Calc 时的装箱,这两种情况的性能都很差。
相反,如果将 TComparer 强制转换为 Comparer<int> 而不是接口抽象,则 TComparer 是引用类型还是值类型就不再存在歧义,constrained 指令也随之消失:
// Variant of CompareSum whose Calc reads the struct argument as a Comparer<int>
// reference and then calls Compare on that class-typed local inside the loops.
class CompareSum<TComparer> where TComparer : struct, IComparer<int>
{
// Sums castComparer.Compare(i, j) for all i, j from 0 up to (excluding) count.
public static int Calc(int count, TComparer comparer)
{
int sum = 0;
// Unsafe.As performs an unchecked reinterpretation: the storage of `comparer` is
// read as if it held a Comparer<int> reference; no type check is performed.
// NOTE(review): presumably this is only valid when the first field of TComparer is
// a reference to a Comparer<int> instance (as with the Wrap structs) - confirm;
// for any other TComparer layout the result is likely undefined behavior.
Comparer<int> castComparer = Unsafe.As<TComparer, Comparer<int>>(ref comparer);
for (int i = 0; i < count; ++i)
for (int j = 0; j < count; ++j)
sum += castComparer.Compare(i, j);
return sum;
}
}
Run Code Online (Sandbox Code Playgroud)
这会产生以下 IL:
.method public hidebysig static
int32 Calc (
int32 count,
!TComparer comparer
) cil managed
{
// Method begins at RVA 0x20d0
// Code size 48 (0x30)
.maxstack 4
.locals init (
[0] int32 sum,
[1] class [System.Collections]System.Collections.Generic.Comparer`1<int32> castComparer,
[2] int32 i,
[3] int32 j
)
IL_0000: ldc.i4.0
IL_0001: stloc.0
IL_0002: ldarga.s comparer
IL_0004: call !!1& [System.Runtime.CompilerServices.Unsafe]System.Runtime.CompilerServices.Unsafe::As<!TComparer, class [System.Collections]System.Collections.Generic.Comparer`1<int32>>(!!0&)
IL_0009: ldind.ref
IL_000a: stloc.1
IL_000b: ldc.i4.0
IL_000c: stloc.2
// sequence point: hidden
IL_000d: br.s IL_002a
// loop start (head: IL_002a)
IL_000f: ldc.i4.0
IL_0010: stloc.3
// sequence point: hidden
IL_0011: br.s IL_0022
// loop start (head: IL_0022)
IL_0013: ldloc.0
IL_0014: ldloc.1
IL_0015: ldloc.2
IL_0016: ldloc.3
IL_0017: callvirt instance int32 class [System.Collections]System.Collections.Generic.Comparer`1<int32>::Compare(!0, !0)
IL_001c: add
IL_001d: stloc.0
IL_001e: ldloc.3
IL_001f: ldc.i4.1
IL_0020: add
IL_0021: stloc.3
IL_0022: ldloc.3
IL_0023: ldarg.0
IL_0024: blt.s IL_0013
// end loop
IL_0026: ldloc.2
IL_0027: ldc.i4.1
IL_0028: add
IL_0029: stloc.2
IL_002a: ldloc.2
IL_002b: ldarg.0
IL_002c: blt.s IL_000f
// end loop
IL_002e: ldloc.0
IL_002f: ret
} // end of method CompareSum`1::Calc
} // end of class CompareSum`1
Run Code Online (Sandbox Code Playgroud)
请注意,现在 IL 不再包含该constrained指令,并且在基准测试中,性能几乎持平。
解决此问题的另一种方法是不再对类使用泛型约束,而是使用直接接受 Wrap 结构具体类型的显式方法重载。这与绕过接口调度具有相同的效果:
/// <summary>
/// Non-generic comparison-sum helper with one overload per concrete
/// <c>Wrap</c> struct type.
/// </summary>
static class CompareSum
{
    /// <summary>
    /// Adds up <c>comparer.Compare(i, j)</c> for every i and j in the range
    /// from 0 up to, but not including, <paramref name="count"/>.
    /// </summary>
    /// <param name="count">Exclusive upper bound for both loop indices.</param>
    /// <param name="comparer">The single-parameter wrapper invoked for each pair.</param>
    /// <returns>The accumulated sum of all comparison results.</returns>
    public static int Calc(int count, Wrap<int> comparer)
    {
        int total = 0;
        int outer = 0;
        while (outer < count)
        {
            int inner = 0;
            while (inner < count)
            {
                total += comparer.Compare(outer, inner);
                ++inner;
            }
            ++outer;
        }
        return total;
    }

    /// <summary>
    /// Adds up <c>comparer.Compare(i, j)</c> for every i and j in the range
    /// from 0 up to, but not including, <paramref name="count"/>.
    /// </summary>
    /// <param name="count">Exclusive upper bound for both loop indices.</param>
    /// <param name="comparer">The two-parameter wrapper invoked for each pair.</param>
    /// <returns>The accumulated sum of all comparison results.</returns>
    public static int Calc(int count, Wrap<int, Comparer<int>> comparer)
    {
        int total = 0;
        int outer = 0;
        while (outer < count)
        {
            int inner = 0;
            while (inner < count)
            {
                total += comparer.Compare(outer, inner);
                ++inner;
            }
            ++outer;
        }
        return total;
    }
}
Run Code Online (Sandbox Code Playgroud)
这完全消除了 IL 中的 constrained 指令并避免了装箱,使两种实现的性能重新保持一致。作为一个额外的好处,它简化了基准测试的调用:
/// <summary>
/// Benchmarks the non-generic <c>CompareSum.Calc</c> overload that takes the
/// single-parameter <c>Wrap&lt;int&gt;</c> struct.
/// </summary>
/// <returns>The value produced by <c>Calc</c> for a count of 10000.</returns>
[Benchmark]
public int ComparingGenericArgument1() =>
    CompareSum.Calc(10000, new Wrap<int>(Comparer<int>.Default));
/// <summary>
/// Benchmarks the non-generic <c>CompareSum.Calc</c> overload that takes the
/// two-parameter <c>Wrap&lt;int, Comparer&lt;int&gt;&gt;</c> struct.
/// </summary>
/// <returns>The value produced by <c>Calc</c> for a count of 10000.</returns>
[Benchmark]
public int ComparingGenericArgument2() =>
    CompareSum.Calc(10000, new Wrap<int, Comparer<int>>(Comparer<int>.Default));
Run Code Online (Sandbox Code Playgroud)