NPS*_*000 7 c# optimization multithreading task-parallel-library
当我遇到一个有趣的结果时,我正在做一些有趣的实验计算:
Completed 1024x1024 pixels with 700 points in...
For Loop (Inline): 19636ms
For Loop: 12612ms
Parallel.For Loop: 3835ms
Run Code Online (Sandbox Code Playgroud)
这不是我的预期.
系统:Windows 7 64,i3 2120 [双核,4个线程],Visual Studio 2010.
构建:优化开启,释放模式[无调试器],32位.
次要的兴趣是令人失望的64位性能.虽然它更符合我对比率的预期,但它通过全面放慢来实现这一目标.
Completed 1024x1024 pixels with 700 points in...
For Loop (Inline): 23409ms
For Loop: 24373ms
Parallel.For Loop: 6839ms
Run Code Online (Sandbox Code Playgroud)
计算很简单:对于索引x和y找到最接近的Vector3并将其存储在2D数组中.
如果你敢,问题是试图解释为什么内联循环是如此缓慢.用于解释64位版本缺乏性能的奖励积分.
using System;
using System.Diagnostics;
using System.Threading.Tasks;
namespace TextureFromPoints
{
class Program
{
const int numPoints = 700;
const int textureSize = 1024;
static Random rnd = new Random();
static void Main(string[] args)
{
while (true)
{
Console.WriteLine("Starting");
Console.WriteLine();
var pointCloud = new Vector3[numPoints];
for (int i = 0; i < numPoints; i++)
pointCloud[i] = new Vector3(textureSize);
var result1 = new Vector3[textureSize, textureSize];
var result2 = new Vector3[textureSize, textureSize];
var result3 = new Vector3[textureSize, textureSize];
var sw1 = Stopwatch.StartNew();
for (int x = 0; x < textureSize; x++)
for (int y = 0; y < textureSize; y++)
{
var targetPos = new Vector3(x, y, 0);
var nearestV3 = pointCloud[0];
var nearestV3Distance = nearestV3.DistanceToPoint(targetPos);
for (int i = 1; i < numPoints; i++)
{
var currentV3 = pointCloud[i];
var currentV3Distance = currentV3.DistanceToPoint(targetPos);
if (currentV3Distance < nearestV3Distance)
{
nearestV3 = currentV3;
nearestV3Distance = currentV3Distance;
}
}
result1[x, y] = nearestV3;
}
sw1.Stop();
var sw2 = Stopwatch.StartNew();
for (int x = 0; x < textureSize; x++)
for (int y = 0; y < textureSize; y++)
Computation(pointCloud, result2, x, y);
sw2.Stop();
var sw3 = Stopwatch.StartNew();
Parallel.For(0, textureSize, x =>
{
for (int y = 0; y < textureSize; y++)
Computation(pointCloud, result3, x, y);
});
sw3.Stop();
Console.WriteLine("Completed {0}x{0} pixels with {1} points in...", textureSize, numPoints);
Console.WriteLine("{0}: {1}ms", "For Loop (Inline)", sw1.ElapsedMilliseconds);
Console.WriteLine("{0}: {1}ms", "For Loop", sw2.ElapsedMilliseconds);
Console.WriteLine("{0}: {1}ms", "Parallel.For Loop", sw3.ElapsedMilliseconds);
Console.WriteLine();
Console.Write("Verifying Data: ");
Console.WriteLine(CheckResults(result1, result2) && CheckResults(result1, result3) ? "Valid" : "Error");
Console.WriteLine(); Console.WriteLine();
Console.ReadLine();
}
}
private static bool CheckResults(Vector3[,] lhs, Vector3[,] rhs)
{
for (int x = 0; x < textureSize; x++)
for (int y = 0; y < textureSize; y++)
if (!lhs[x, y].Equals(rhs[x, y]))
return false;
return true;
}
private static void Computation(Vector3[] pointCloud, Vector3[,] result, int x, int y)
{
var targetPos = new Vector3(x, y, 0);
var nearestV3 = pointCloud[0];
var nearestV3Distance = nearestV3.DistanceToPoint(targetPos);
for (int i = 1; i < numPoints; i++)
{
var currentV3 = pointCloud[i];
var currentV3Distance = currentV3.DistanceToPoint(targetPos);
if (currentV3Distance < nearestV3Distance)
{
nearestV3 = currentV3;
nearestV3Distance = currentV3Distance;
}
}
result[x, y] = nearestV3;
}
struct Vector3
{
public float x;
public float y;
public float z;
public Vector3(float x, float y, float z)
{
this.x = x;
this.y = y;
this.z = z;
}
public Vector3(float randomDistance)
{
this.x = (float)rnd.NextDouble() * randomDistance;
this.y = (float)rnd.NextDouble() * randomDistance;
this.z = (float)rnd.NextDouble() * randomDistance;
}
public static Vector3 operator -(Vector3 a, Vector3 b)
{
return new Vector3(a.x - b.x, a.y - b.y, a.z - b.z);
}
public float sqrMagnitude()
{
return x * x + y * y + z * z;
}
public float DistanceToPoint(Vector3 point)
{
return (this - point).sqrMagnitude();
}
}
}
}
Run Code Online (Sandbox Code Playgroud)
更新:感谢Drew Marsh的努力,我们现在拥有这个超级优化的版本,可以内联所有的V3操作.
using System;
using System.Diagnostics;
using System.Threading.Tasks;
namespace TextureFromPoints
{
class RevisedProgram
{
const int numPoints = 700;
const int textureSize = 1024;
static Random rnd = new Random();
static void Main(string[] args)
{
while (true)
{
Console.WriteLine("Starting REVISED");
Console.WriteLine();
var pointCloud = new Vector3[numPoints];
for (int i = 0; i < numPoints; i++)
pointCloud[i] = new Vector3(textureSize);
var result1 = new Vector3[textureSize, textureSize];
var result2 = new Vector3[textureSize, textureSize];
var result3 = new Vector3[textureSize, textureSize];
var sw1 = Inline(pointCloud, result1);
var sw2 = NotInline(pointCloud, result2);
var sw3 = Parallelized(pointCloud, result3);
Console.WriteLine("Completed {0}x{0} pixels with {1} points in...", textureSize, numPoints);
Console.WriteLine("{0}: {1}ms", "For Loop (Inline)", sw1.ElapsedMilliseconds);
Console.WriteLine("{0}: {1}ms", "For Loop", sw2.ElapsedMilliseconds);
Console.WriteLine("{0}: {1}ms", "Parallel.For Loop", sw3.ElapsedMilliseconds);
Console.WriteLine();
Console.Write("Verifying Data: ");
Console.WriteLine(CheckResults(result1, result2) && CheckResults(result1, result3) ? "Valid" : "Error");
Console.WriteLine();
Console.WriteLine();
Console.ReadLine();
}
}
private static Stopwatch Parallelized(Vector3[] pointCloud, Vector3[,] result3)
{
var sw3 = Stopwatch.StartNew();
Parallel.For(0, textureSize, x =>
{
for (int y = 0; y < textureSize; y++)
Computation(pointCloud, result3, x, y);
});
sw3.Stop();
return sw3;
}
private static Stopwatch NotInline(Vector3[] pointCloud, Vector3[,] result2)
{
var sw2 = Stopwatch.StartNew();
for (int x = 0; x < textureSize; x++)
for (int y = 0; y < textureSize; y++)
Computation(pointCloud, result2, x, y);
sw2.Stop();
return sw2;
}
private static Stopwatch Inline(Vector3[] pointCloud, Vector3[,] result1)
{
var sw1 = Stopwatch.StartNew();
for (int x = 0; x < textureSize; x++)
for (int y = 0; y < textureSize; y++)
{
var targetPos = new Vector3(x, y, 0);
var nearestV3 = pointCloud[0];
Vector3 temp1 = new Vector3(nearestV3.x - targetPos.x, nearestV3.y - targetPos.y, nearestV3.z - targetPos.z);
var nearestV3Distance = temp1.x * temp1.x + temp1.y * temp1.y + temp1.z * temp1.z;
for (int i = 1; i < numPoints; i++)
{
var currentV3 = pointCloud[i];
Vector3 temp2 = new Vector3(currentV3.x - targetPos.x, currentV3.y - targetPos.y, currentV3.z - targetPos.z);
var currentV3Distance = temp2.x * temp2.x + temp2.y * temp2.y + temp2.z * temp2.z;
if (currentV3Distance < nearestV3Distance)
{
nearestV3 = currentV3;
nearestV3Distance = currentV3Distance;
}
}
result1[x, y] = nearestV3;
}
sw1.Stop();
return sw1;
}
private static bool CheckResults(Vector3[,] lhs, Vector3[,] rhs)
{
for (int x = 0; x < textureSize; x++)
for (int y = 0; y < textureSize; y++)
if (!lhs[x, y].Equals(rhs[x, y]))
return false;
return true;
}
private static void Computation(Vector3[] pointCloud, Vector3[,] result, int x, int y)
{
var targetPos = new Vector3(x, y, 0);
var nearestV3 = pointCloud[0];
Vector3 temp1 = new Vector3(nearestV3.x - targetPos.x, nearestV3.y - targetPos.y, nearestV3.z - targetPos.z);
var nearestV3Distance = temp1.x * temp1.x + temp1.y * temp1.y + temp1.z * temp1.z;
for (int i = 1; i < numPoints; i++)
{
var currentV3 = pointCloud[i];
Vector3 temp2 = new Vector3(currentV3.x - targetPos.x, currentV3.y - targetPos.y, currentV3.z - targetPos.z);
var currentV3Distance = temp2.x * temp2.x + temp2.y * temp2.y + temp2.z * temp2.z;
if (currentV3Distance < nearestV3Distance)
{
nearestV3 = currentV3;
nearestV3Distance = currentV3Distance;
}
}
result[x, y] = nearestV3;
}
struct Vector3
{
public float x;
public float y;
public float z;
public Vector3(float x, float y, float z)
{
this.x = x;
this.y = y;
this.z = z;
}
public Vector3(float randomDistance)
{
this.x = (float)rnd.NextDouble() * randomDistance;
this.y = (float)rnd.NextDouble() * randomDistance;
this.z = (float)rnd.NextDouble() * randomDistance;
}
}
}
}
Run Code Online (Sandbox Code Playgroud)
它给出了以下结果:
86
Completed 1024x1024 pixels with 700 points in...
For Loop (Inline): 3820ms
For Loop: 3962ms
Parallel.For Loop: 1681ms
Run Code Online (Sandbox Code Playgroud)
64位
Completed 1024x1024 pixels with 700 points in...
For Loop (Inline): 10978ms
For Loop: 10924ms
Parallel.For Loop: 3073ms
Run Code Online (Sandbox Code Playgroud)
所以好消息是我们可以大大提高这段代码的性能 - 让单线程版本以一定的速度运行,与其并行表达保持一致.
坏消息是,这意味着完全放弃x64并手动输入所有数学.
在这个阶段,我对编译器的性能非常失望 - 我希望它们会好得多.
结论
这是令人沮丧和悲伤的...虽然我们真的不知道为什么我们可以做出有根据的猜测它是由一个愚蠢的编译器引起的.简单地通过将编译器从x64更改为x86并进行一些手动内嵌而不是我期望的24s到3.8s.然而,我已经完成了我正在编写的概念验证,并且由于简单的空间哈希,我可以计算出1024 x 1024的图像,在0.7秒内有70,000'点' - 比我原来的x64场景快〜340000%没有穿线或内衬.因此我接受了一个答案 - 即时需求已经消失,尽管我仍然在研究这个问题.
所有数据来自8核i7,Win7,x64
令人惊讶的是你肯定得到了5倍.与这个测试,你都写过的一个问题是,你把你的主要方法,这三种方法这迫使gobblygook编译器有能力创造并保持同步的实现在使用封闭的需求Parallel.For在越来越内联方法的方式.如果您按如下方式分解工作,您将在所有三种实现中看到明显更快的性能...至少对于x86:
在x86之前:
For Loop (Inline): 24313ms
For Loop: 25236ms
Parallel.For Loop: 3840ms
Run Code Online (Sandbox Code Playgroud)
x86之后:
For Loop (Inline): 13007ms
For Loop: 13013ms
Parallel.For Loop: 2208ms
Run Code Online (Sandbox Code Playgroud)
所以,看看我的x86 Parallel.For结果,你会发现它的大小约为5.9倍,每个版本在隔离时速度要快得多.
接下来,有趣的是,在同样的变化之后,x64绝对没有增益.事实上,在3次测试中的2次测试中,它每次运行的结果都略高一些.
在x64之前
For Loop (Inline): 24222ms
For Loop: 25197ms
Parallel.For Loop: 3810ms
Run Code Online (Sandbox Code Playgroud)
x64之后
For Loop (Inline): 25302ms
For Loop: 25209ms
Parallel.For Loop: 3821ms
Run Code Online (Sandbox Code Playgroud)
我没有直接回答为什么x64会如此糟糕,除了人们不断提出像这样的代码使得x64 JIT看起来很糟糕的事实,所以也许其他人可以参与其中.
这就是说我还有另外一件事你可能要考虑在这样的实现中考虑:缓存行无效.这里有一篇很棒的MSDN文章,由@StephenToub撰写,解释了这是什么.TL; DR; 这是因为所有数据都存储在一个数组和diff中.具有不同本地(L2)高速缓存的核心将修改该数组的部分,以便将数据与其重叠的其他核心同步.如果部分差异.核心工作太紧密了,你最终会得到很多这些同步,这可能会影响你的并行收益.本文展示了一种技术,您可以在工作数组中实际分配额外的空间,足以分离包含您将要处理的数据的实际部分,以便当这些核处理数据时,它们不必使另一个无效.核心.for循环而不是接近8倍.我敢打赌,如果你投入工作来解决任何缓存行无效,你可以挤出另外10%+.请记住,设置和协调并行工作总会有一些开销,所以你永远不会完全达到100%.
这是您的程序的修订版本,每种方法都考虑在不同的方法中:
using System;
using System.Diagnostics;
using System.Threading.Tasks;
namespace TextureFromPoints
{
class RevisedProgram
{
const int numPoints = 700;
const int textureSize = 1024;
static Random rnd = new Random();
static void Main(string[] args)
{
while(true)
{
Console.WriteLine("Starting REVISED");
Console.WriteLine();
var pointCloud = new Vector3[numPoints];
for(int i = 0; i < numPoints; i++)
pointCloud[i] = new Vector3(textureSize);
var result1 = new Vector3[textureSize, textureSize];
var result2 = new Vector3[textureSize, textureSize];
var result3 = new Vector3[textureSize, textureSize];
var sw1 = Inline(pointCloud, result1);
var sw2 = NotInline(pointCloud, result2);
var sw3 = Parallelized(pointCloud, result3);
Console.WriteLine("Completed {0}x{0} pixels with {1} points in...", textureSize, numPoints);
Console.WriteLine("{0}: {1}ms", "For Loop (Inline)", sw1.ElapsedMilliseconds);
Console.WriteLine("{0}: {1}ms", "For Loop", sw2.ElapsedMilliseconds);
Console.WriteLine("{0}: {1}ms", "Parallel.For Loop", sw3.ElapsedMilliseconds);
Console.WriteLine();
Console.Write("Verifying Data: ");
Console.WriteLine(CheckResults(result1, result2) && CheckResults(result1, result3) ? "Valid" : "Error");
Console.WriteLine();
Console.WriteLine();
Console.ReadLine();
}
}
private static Stopwatch Parallelized(Vector3[] pointCloud, Vector3[,] result3)
{
var sw3 = Stopwatch.StartNew();
Parallel.For(0, textureSize, x =>
{
for(int y = 0; y < textureSize; y++)
Computation(pointCloud, result3, x, y);
});
sw3.Stop();
return sw3;
}
private static Stopwatch NotInline(Vector3[] pointCloud, Vector3[,] result2)
{
var sw2 = Stopwatch.StartNew();
for(int x = 0; x < textureSize; x++)
for(int y = 0; y < textureSize; y++)
Computation(pointCloud, result2, x, y);
sw2.Stop();
return sw2;
}
private static Stopwatch Inline(Vector3[] pointCloud, Vector3[,] result1)
{
var sw1 = Stopwatch.StartNew();
for(int x = 0; x < textureSize; x++)
for(int y = 0; y < textureSize; y++)
{
var targetPos = new Vector3(x, y, 0);
var nearestV3 = pointCloud[0];
var nearestV3Distance = nearestV3.DistanceToPoint(targetPos);
for(int i = 1; i < numPoints; i++)
{
var currentV3 = pointCloud[i];
var currentV3Distance = currentV3.DistanceToPoint(targetPos);
if(currentV3Distance < nearestV3Distance)
{
nearestV3 = currentV3;
nearestV3Distance = currentV3Distance;
}
}
result1[x, y] = nearestV3;
}
sw1.Stop();
return sw1;
}
private static bool CheckResults(Vector3[,] lhs, Vector3[,] rhs)
{
for(int x = 0; x < textureSize; x++)
for(int y = 0; y < textureSize; y++)
if(!lhs[x, y].Equals(rhs[x, y]))
return false;
return true;
}
private static void Computation(Vector3[] pointCloud, Vector3[,] result, int x, int y)
{
var targetPos = new Vector3(x, y, 0);
var nearestV3 = pointCloud[0];
var nearestV3Distance = nearestV3.DistanceToPoint(targetPos);
for(int i = 1; i < numPoints; i++)
{
var currentV3 = pointCloud[i];
var currentV3Distance = currentV3.DistanceToPoint(targetPos);
if(currentV3Distance < nearestV3Distance)
{
nearestV3 = currentV3;
nearestV3Distance = currentV3Distance;
}
}
result[x, y] = nearestV3;
}
struct Vector3
{
public float x;
public float y;
public float z;
public Vector3(float x, float y, float z)
{
this.x = x;
this.y = y;
this.z = z;
}
public Vector3(float randomDistance)
{
this.x = (float)rnd.NextDouble() * randomDistance;
this.y = (float)rnd.NextDouble() * randomDistance;
this.z = (float)rnd.NextDouble() * randomDistance;
}
public static Vector3 operator -(Vector3 a, Vector3 b)
{
return new Vector3(a.x - b.x, a.y - b.y, a.z - b.z);
}
public float sqrMagnitude()
{
return x * x + y * y + z * z;
}
public float DistanceToPoint(Vector3 point)
{
return (this - point).sqrMagnitude();
}
}
}
}
Run Code Online (Sandbox Code Playgroud)
根据Feng Yuan指出的方法没有被x64 JIT内联的方法,你可以改变程序来进行内联计算,并从x64版本获得比x86版本更好的性能.这显然很糟糕,但这是我见过x64 JIT之前毁坏的那种东西.这是新的数字:
内联x64之后:
For Loop (Inline): 19032ms
For Loop: 19209ms
Parallel.For Loop: 3015ms
Run Code Online (Sandbox Code Playgroud)
内联版本的代码:
using System;
using System.Diagnostics;
using System.Threading.Tasks;
namespace TextureFromPoints
{
class RevisedProgram
{
const int numPoints = 700;
const int textureSize = 1024;
static Random rnd = new Random();
static void Main(string[] args)
{
while(true)
{
Console.WriteLine("Starting REVISED");
Console.WriteLine();
var pointCloud = new Vector3[numPoints];
for(int i = 0; i < numPoints; i++)
pointCloud[i] = new Vector3(textureSize);
var result1 = new Vector3[textureSize, textureSize];
var result2 = new Vector3[textureSize, textureSize];
var result3 = new Vector3[textureSize, textureSize];
var sw1 = Inline(pointCloud, result1);
var sw2 = NotInline(pointCloud, result2);
var sw3 = Parallelized(pointCloud, result3);
Console.WriteLine("Completed {0}x{0} pixels with {1} points in...", textureSize, numPoints);
Console.WriteLine("{0}: {1}ms", "For Loop (Inline)", sw1.ElapsedMilliseconds);
Console.WriteLine("{0}: {1}ms", "For Loop", sw2.ElapsedMilliseconds);
Console.WriteLine("{0}: {1}ms", "Parallel.For Loop", sw3.ElapsedMilliseconds);
Console.WriteLine();
Console.Write("Verifying Data: ");
Console.WriteLine(CheckResults(result1, result2) && CheckResults(result1, result3) ? "Valid" : "Error");
Console.WriteLine();
Console.WriteLine();
Console.ReadLine();
}
}
private static Stopwatch Parallelized(Vector3[] pointCloud, Vector3[,] result3)
{
var sw3 = Stopwatch.StartNew();
Parallel.For(0, textureSize, x =>
{
for(int y = 0; y < textureSize; y++)
Computation(pointCloud, result3, x, y);
});
sw3.Stop();
return sw3;
}
private static Stopwatch NotInline(Vector3[] pointCloud, Vector3[,] result2)
{
var sw2 = Stopwatch.StartNew();
for(int x = 0; x < textureSize; x++)
for(int y = 0; y < textureSize; y++)
Computation(pointCloud, result2, x, y);
sw2.Stop();
return sw2;
}
private static Stopwatch Inline(Vector3[] pointCloud, Vector3[,] result1)
{
var sw1 = Stopwatch.StartNew();
for(int x = 0; x < textureSize; x++)
for(int y = 0; y < textureSize; y++)
{
var targetPos = new Vector3(x, y, 0);
var nearestV3 = pointCloud[0];
Vector3 temp1 = nearestV3 - targetPos;
var nearestV3Distance = temp1.x * temp1.x + temp1.y * temp1.y + temp1.z * temp1.z;
for(int i = 1; i < numPoints; i++)
{
var currentV3 = pointCloud[i];
Vector3 temp2 = currentV3 - targetPos;
var currentV3Distance = temp2.x * temp2.x + temp2.y * temp2.y + temp2.z * temp2.z;
if(currentV3Distance < nearestV3Distance)
{
nearestV3 = currentV3;
nearestV3Distance = currentV3Distance;
}
}
result1[x, y] = nearestV3;
}
sw1.Stop();
return sw1;
}
private static bool CheckResults(Vector3[,] lhs, Vector3[,] rhs)
{
for(int x = 0; x < textureSize; x++)
for(int y = 0; y < textureSize; y++)
if(!lhs[x, y].Equals(rhs[x, y]))
return false;
return true;
}
private static void Computation(Vector3[] pointCloud, Vector3[,] result, int x, int y)
{
var targetPos = new Vector3(x, y, 0);
var nearestV3 = pointCloud[0];
Vector3 temp1 = nearestV3 - targetPos;
var nearestV3Distance = temp1.x * temp1.x + temp1.y * temp1.y + temp1.z * temp1.z;
for(int i = 1; i < numPoints; i++)
{
var currentV3 = pointCloud[i];
Vector3 temp2 = currentV3 - targetPos;
var currentV3Distance = temp2.x * temp2.x + temp2.y * temp2.y + temp2.z * temp2.z;
if(currentV3Distance < nearestV3Distance)
{
nearestV3 = currentV3;
nearestV3Distance = currentV3Distance;
}
}
result[x, y] = nearestV3;
}
private static float DistanceToPoint(Vector3 vector, Vector3 point)
{
Vector3 final = vector - point;
return final.x * final.x + final.y * final.y + final.z * final.z;
}
struct Vector3
{
public float x;
public float y;
public float z;
public Vector3(float x, float y, float z)
{
this.x = x;
this.y = y;
this.z = z;
}
public Vector3(float randomDistance)
{
this.x = (float)rnd.NextDouble() * randomDistance;
this.y = (float)rnd.NextDouble() * randomDistance;
this.z = (float)rnd.NextDouble() * randomDistance;
}
public static Vector3 operator -(Vector3 a, Vector3 b)
{
return new Vector3(a.x - b.x, a.y - b.y, a.z - b.z);
}
}
}
}
Run Code Online (Sandbox Code Playgroud)