maf*_*afu 10 c# performance unsafe
我发现我的应用程序花了25%的时间在循环中执行此操作:
private static int Diff (int c0, int c1)
{
unsafe {
byte* pc0 = (byte*) &c0;
byte* pc1 = (byte*) &c1;
int d0 = pc0[0] - pc1[0];
int d1 = pc0[1] - pc1[1];
int d2 = pc0[2] - pc1[2];
int d3 = pc0[3] - pc1[3];
d0 *= d0;
d1 *= d1;
d2 *= d2;
d3 *= d3;
return d0 + d1 + d2 + d3;
}
}
Run Code Online (Sandbox Code Playgroud)
如何提高此方法的性能?我的想法到目前为止:
编辑:为方便起见,一些反映真实环境和用例的测试代码.(实际上,涉及的数据更多,并且数据不会在单个大块中进行比较,而是在每个kb的许多块中进行比较.)
public static class ByteCompare
{
private static void Main ()
{
const int n = 1024 * 1024 * 20;
const int repeat = 20;
var rnd = new Random (0);
Console.Write ("Generating test data... ");
var t0 = Enumerable.Range (1, n)
.Select (x => rnd.Next (int.MinValue, int.MaxValue))
.ToArray ();
var t1 = Enumerable.Range (1, n)
.Select (x => rnd.Next (int.MinValue, int.MaxValue))
.ToArray ();
Console.WriteLine ("complete.");
GC.Collect (2, GCCollectionMode.Forced);
Console.WriteLine ("GCs: " + GC.CollectionCount (0));
{
var sw = Stopwatch.StartNew ();
long res = 0;
for (int reps = 0; reps < repeat; reps++) {
for (int i = 0; i < n; i++) {
int c0 = t0[i];
int c1 = t1[i];
res += ByteDiff_REGULAR (c0, c1);
}
}
sw.Stop ();
Console.WriteLine ("res=" + res + ", t=" + sw.Elapsed.TotalSeconds.ToString ("0.00") + "s - ByteDiff_REGULAR");
}
{
var sw = Stopwatch.StartNew ();
long res = 0;
for (int reps = 0; reps < repeat; reps++) {
for (int i = 0; i < n; i++) {
int c0 = t0[i];
int c1 = t1[i];
res += ByteDiff_UNSAFE (c0, c1);
}
}
sw.Stop ();
Console.WriteLine ("res=" + res + ", t=" + sw.Elapsed.TotalSeconds.ToString ("0.00") + "s - ByteDiff_UNSAFE_PTR");
}
Console.WriteLine ("GCs: " + GC.CollectionCount (0));
Console.WriteLine ("Test complete.");
Console.ReadKey (true);
}
public static int ByteDiff_REGULAR (int c0, int c1)
{
var c00 = (byte) (c0 >> (8 * 0));
var c01 = (byte) (c0 >> (8 * 1));
var c02 = (byte) (c0 >> (8 * 2));
var c03 = (byte) (c0 >> (8 * 3));
var c10 = (byte) (c1 >> (8 * 0));
var c11 = (byte) (c1 >> (8 * 1));
var c12 = (byte) (c1 >> (8 * 2));
var c13 = (byte) (c1 >> (8 * 3));
var d0 = (c00 - c10);
var d1 = (c01 - c11);
var d2 = (c02 - c12);
var d3 = (c03 - c13);
d0 *= d0;
d1 *= d1;
d2 *= d2;
d3 *= d3;
return d0 + d1 + d2 + d3;
}
private static int ByteDiff_UNSAFE (int c0, int c1)
{
unsafe {
byte* pc0 = (byte*) &c0;
byte* pc1 = (byte*) &c1;
int d0 = pc0[0] - pc1[0];
int d1 = pc0[1] - pc1[1];
int d2 = pc0[2] - pc1[2];
int d3 = pc0[3] - pc1[3];
d0 *= d0;
d1 *= d1;
d2 *= d2;
d3 *= d3;
return d0 + d1 + d2 + d3;
}
}
}
Run Code Online (Sandbox Code Playgroud)
这对我来说(在i5上作为x64版本运行):
Generating test data... complete.
GCs: 8
res=18324555528140, t=1.46s - ByteDiff_REGULAR
res=18324555528140, t=1.15s - ByteDiff_UNSAFE
res=18324555528140, t=1.73s - Diff_Alex1
res=18324555528140, t=1.63s - Diff_Alex2
res=18324555528140, t=3.59s - Diff_Alex3
res=18325828513740, t=3.90s - Diff_Alex4
GCs: 8
Test complete.
Run Code Online (Sandbox Code Playgroud)
最明显的是,这将受益于 SIMD,但我们假设我不想去那里,因为它有点麻烦。
如果您愿意,我们可以避免使用它,但实际上 C# 直接对它提供了很好的支持。如果没有卸载到 GPU,如果更大的算法适合 SIMD 处理,我预计这将是迄今为止最大的性能赢家。
http://www.drdobbs.com/architecture-and-design/simd-enabled-vector-types-with-c/240168888
多线程
当然,每个 CPU 核心使用一个线程。您还可以使用 Parallel.For 之类的构造,让 .NET 决定要使用多少个线程。它在这方面做得相当好,但既然你知道这肯定是 CPU 限制的,你可能(也可能不会)通过自己管理线程获得更优化的结果。
至于加速实际代码块,使用位掩码和位移位来处理各个值可能比使用指针更快。这有一个额外的好处,你不需要不安全的代码块,例如
byte b0_leftmost = (c0 & 0xff000000) >> 24;
Run Code Online (Sandbox Code Playgroud)
归档时间: |
|
查看次数: |
228 次 |
最近记录: |