Way*_*yne 7 .net c# parallel-processing vmware multithreading
以下nunit测试比较了运行单个线程与在双核机器上运行2个线程之间的性能.具体来说,这是一台运行在四核Linux SLED主机上的VMWare双核虚拟Windows 7计算机,戴尔Inspiron 503.
每个线程只是简单地循环并递增2个计数器:addCounter和readCounter.这个测试最初是针对一个Queue实现编写的,该实现被发现在多核机器上性能更差.在把问题缩小为更小的可重现代码之后,这里已经没有队列,只剩下对变量的递增;令人震惊和沮丧的是,使用2个线程时比使用1个线程慢得多.
运行第一个测试时,任务管理器显示1个核心100%忙碌,而另一个核心几乎空闲.这是单线程测试的测试输出:
readCounter 360687000
readCounter2 0
total readCounter 360687000
addCounter 360687000
addCounter2 0
Run Code Online (Sandbox Code Playgroud)
你会看到超过3.6亿的增量!
接下来,双线程测试显示在整个5秒的测试持续时间内两个内核100%忙碌.但是它的输出只显示:
readCounter 88687000
readCounter2 134606500
totoal readCounter 223293500
addCounter 88687000
addCounter2 67303250
addFailure0
Run Code Online (Sandbox Code Playgroud)
这只有2.23亿次读取增量.为什么这2个CPU在5秒钟内完成的工作反而更少?
任何可能的线索?你可以在你的机器上运行测试,看看你是否得到不同的结果?一个想法是,VMWare双核性能可能不是您所希望的.
using System;
using System.Threading;
using NUnit.Framework;
namespace TickZoom.Utilities.TickZoom.Utilities
{
/// <summary>
/// Micro-benchmark comparing how many counter increments one worker thread
/// performs against two worker threads running concurrently on the same
/// fixture instance for a fixed wall-clock duration.
/// </summary>
/// <remarks>
/// NOTE(review): the four counters are adjacent fields of one object, so the
/// two workers very likely write to the same cache line (false sharing). That
/// is the probable reason the dual-thread run reports less total work than the
/// single-thread run - confirm by moving each worker's counters apart.
/// </remarks>
[TestFixture]
public class ActiveMultiQueueTest
{
    // Number of increments applied to each counter per pass of a worker loop.
    private const int IterationsPerPass = 500;

    // Volatile so the worker loops observe the stop request from the test thread.
    private volatile bool stopThread = false;
    private Exception threadException;
    private long addCounter;
    private long readCounter;
    private long addCounter2;
    private long readCounter2;
    private long addFailureCounter;
    private int testDurationMs = 5000;

    /// <summary>
    /// How long, in milliseconds, the workers are left running before they are
    /// asked to stop. Defaults to 5000.
    /// </summary>
    /// <exception cref="ArgumentOutOfRangeException">The value is negative.</exception>
    public int TestDurationMs
    {
        get { return testDurationMs; }
        set
        {
            if (value < 0)
            {
                throw new ArgumentOutOfRangeException("value", "Duration must not be negative.");
            }
            testDurationMs = value;
        }
    }

    /// <summary>
    /// Resets every piece of per-test state, including the failure counter and
    /// any exception captured by a previous test on this fixture instance.
    /// </summary>
    [SetUp]
    public void Setup()
    {
        stopThread = false;
        threadException = null;
        addCounter = 0;
        readCounter = 0;
        addCounter2 = 0;
        readCounter2 = 0;
        addFailureCounter = 0;
    }

    /// <summary>Runs a single worker and prints its counters.</summary>
    [Test]
    public void TestSingleCoreSpeed()
    {
        RunWorkers(new Thread(SpeedTestLoop) { Name = "1st Core Speed Test" });

        Console.Out.WriteLine("readCounter " + readCounter);
        Console.Out.WriteLine("readCounter2 " + readCounter2);
        Console.Out.WriteLine("total readCounter " + (readCounter + readCounter2));
        Console.Out.WriteLine("addCounter " + addCounter);
        Console.Out.WriteLine("addCounter2 " + addCounter2);
    }

    /// <summary>Runs both workers concurrently and prints their counters.</summary>
    [Test]
    public void TestDualCoreSpeed()
    {
        RunWorkers(
            new Thread(SpeedTestLoop) { Name = "Speed Test 1" },
            new Thread(SpeedTestLoop2) { Name = "Speed Test 2" });

        Console.Out.WriteLine("readCounter " + readCounter);
        Console.Out.WriteLine("readCounter2 " + readCounter2);
        // Label corrected from "totoal" so it matches the single-thread output.
        Console.Out.WriteLine("total readCounter " + (readCounter + readCounter2));
        Console.Out.WriteLine("addCounter " + addCounter);
        Console.Out.WriteLine("addCounter2 " + addCounter2);
        // Separator added; the label used to run straight into the number.
        Console.Out.WriteLine("addFailure " + addFailureCounter);
    }

    // Starts every worker, lets them run for TestDurationMs, asks them to stop,
    // waits for them to finish and rethrows any exception a worker captured.
    private void RunWorkers(params Thread[] workers)
    {
        foreach (var worker in workers)
        {
            worker.Start();
        }

        Thread.Sleep(testDurationMs);
        stopThread = true;

        foreach (var worker in workers)
        {
            worker.Join();
        }

        if (threadException != null)
        {
            throw new Exception("Thread failed: ", threadException);
        }
    }

    // Worker 1: bumps addCounter and readCounter IterationsPerPass times each
    // per pass until a stop is requested.
    private void SpeedTestLoop()
    {
        try
        {
            while (!stopThread)
            {
                for (var i = 0; i < IterationsPerPass; i++)
                {
                    ++addCounter;
                }
                for (var i = 0; i < IterationsPerPass; i++)
                {
                    readCounter++;
                }
            }
        }
        catch (Exception ex)
        {
            threadException = ex;
        }
    }

    // Worker 2: same work as worker 1 on the second pair of counters. The
    // original body also incremented the loop index inside the first loop, so
    // addCounter2 received only half the increments of readCounter2.
    private void SpeedTestLoop2()
    {
        try
        {
            while (!stopThread)
            {
                for (var i = 0; i < IterationsPerPass; i++)
                {
                    ++addCounter2;
                }
                for (var i = 0; i < IterationsPerPass; i++)
                {
                    readCounter2++;
                }
            }
        }
        catch (Exception ex)
        {
            threadException = ex;
        }
    }
}
}
Run Code Online (Sandbox Code Playgroud)
编辑:我在没有vmware的四核笔记本电脑上测试了上述内容,并获得了类似的降级性能.所以我写了另一个类似于上面的测试,但是每个线程方法都在一个单独的类中.我这样做的目的是测试4个核心.
嗯,该测试显示出优异的结果,其与1,2,3或4个核心几乎线性地改善.
现在在两台机器上进行了一些实验,似乎只有在主线程方法位于不同实例而不是同一实例时才会发生正确的性能.
换句话说,如果在特定类的同一个实例上有多个线程主入口方法,那么对于你添加的每个线程,多核上的性能会更差,而不是像你想象的那样更好.
几乎看起来CLR正在"同步",因此一次只能有一个线程在该方法上运行.但是,我的测试表明情况并非如此.所以目前还不清楚发生了什么.
但是我自己的问题似乎只是通过将运行线程的方法的单独实例作为起点来解决.
真诚的,韦恩
编辑:
这是一个更新的单元测试,它测试1,2,3和4个线程,这些线程都运行在同一个类的同一个实例上.线程循环中使用的变量都放在数组里,并且各线程使用的元素彼此至少相隔10个位置.即便如此,每增加一个线程,性能仍然显著降低.
using System;
using System.Threading;
using NUnit.Framework;
namespace TickZoom.Utilities.TickZoom.Utilities
{
/// <summary>
/// Runs the same increment benchmark with 1, 2, 3 and 4 worker threads that
/// all use one <see cref="ThreadTester"/> instance, and prints the combined
/// counter totals for each run.
/// </summary>
[TestFixture]
public class MultiCoreSameClassTest
{
    private ThreadTester threadTester;

    /// <summary>
    /// Owns a set of worker threads that each increment their own pair of
    /// counters until <see cref="Stop"/> is called.
    /// </summary>
    public class ThreadTester
    {
        // Distance, in array elements, between the counters of consecutive
        // workers (10 longs = 80 bytes). Presumably chosen to keep each worker
        // on its own cache line - TODO confirm for the target hardware.
        private const int SlotStride = 10;

        // Number of increments applied to each counter per pass of the loop.
        private const int IterationsPerPass = 500;

        private readonly Thread[] speedThread;
        private readonly long[] addCounter;
        private readonly long[] readCounter;

        // One shared, volatile flag: workers only read it, so it is not written
        // while they run, and volatile keeps the read inside the spin loop.
        private volatile bool stopThread;
        internal Exception threadException;
        private readonly int count;

        /// <summary>Creates a tester with <paramref name="count"/> worker threads.</summary>
        /// <param name="count">Number of worker threads to create; must not be negative.</param>
        /// <exception cref="ArgumentOutOfRangeException"><paramref name="count"/> is negative.</exception>
        public ThreadTester(int count)
        {
            if (count < 0)
            {
                throw new ArgumentOutOfRangeException("count", "Thread count must not be negative.");
            }

            this.count = count;
            speedThread = new Thread[count];

            // One unused stride before the first slot and one after the last, so
            // no worker writes next to the start or end of the array.
            // NOTE(review): the CLR is understood to keep the array length just
            // before element 0 and to read it for bounds checks, which would make
            // slot 0 a shared line for every worker - confirm.
            var length = (count + 2) * SlotStride;
            addCounter = new long[length];
            readCounter = new long[length];

            for (var worker = 0; worker < count; worker++)
            {
                speedThread[worker] = new Thread(SpeedTestLoop);
            }
        }

        // Maps a worker number (0-based) to the array index of its counters.
        private static int SlotOf(int worker)
        {
            return (worker + 1) * SlotStride;
        }

        /// <summary>Starts every worker, handing each one the index of its counter slot.</summary>
        public void Run()
        {
            for (var worker = 0; worker < count; worker++)
            {
                speedThread[worker].Start(SlotOf(worker));
            }
        }

        /// <summary>
        /// Requests a stop, waits for every worker to finish and rethrows any
        /// exception a worker captured.
        /// </summary>
        /// <exception cref="Exception">A worker thread failed; the cause is the inner exception.</exception>
        public void Stop()
        {
            stopThread = true;

            for (var worker = 0; worker < count; worker++)
            {
                speedThread[worker].Join();
            }

            if (threadException != null)
            {
                throw new Exception("Thread failed: ", threadException);
            }
        }

        /// <summary>Prints the read and add counters summed over all workers.</summary>
        public void Output()
        {
            var readSum = 0L;
            var addSum = 0L;

            // Sum the same slots the workers wrote to; the original summed
            // indices 0..count-1 and therefore only ever saw the first worker.
            for (var worker = 0; worker < count; worker++)
            {
                var slot = SlotOf(worker);
                readSum += readCounter[slot];
                addSum += addCounter[slot];
            }

            Console.Out.WriteLine("Thread readCounter " + readSum + ", addCounter " + addSum);
        }

        // Worker body. indexarg is the boxed array index of this worker's
        // counters, used as-is (the original scaled it by 10 a second time).
        private void SpeedTestLoop(object indexarg)
        {
            var slot = (int) indexarg;
            try
            {
                while (!stopThread)
                {
                    for (var i = 0; i < IterationsPerPass; i++)
                    {
                        ++addCounter[slot];
                    }
                    for (var i = 0; i < IterationsPerPass; i++)
                    {
                        ++readCounter[slot];
                    }
                }
            }
            catch (Exception ex)
            {
                threadException = ex;
            }
        }
    }

    /// <summary>No per-test state to prepare; each test builds its own tester.</summary>
    [SetUp]
    public void Setup()
    {
    }

    [Test]
    public void SingleCoreTest()
    {
        TestCores(1);
    }

    [Test]
    public void DualCoreTest()
    {
        TestCores(2);
    }

    [Test]
    public void TriCoreTest()
    {
        TestCores(3);
    }

    [Test]
    public void QuadCoreTest()
    {
        TestCores(4);
    }

    /// <summary>
    /// Runs <paramref name="numCores"/> workers for five seconds and prints
    /// their combined counters.
    /// </summary>
    /// <param name="numCores">Number of worker threads to run.</param>
    public void TestCores(int numCores)
    {
        threadTester = new ThreadTester(numCores);
        threadTester.Run();
        Thread.Sleep(5000);
        threadTester.Stop();
        threadTester.Output();
    }
}
}
Run Code Online (Sandbox Code Playgroud)
这只有2.23亿次读取增量.为什么这2个CPU在5秒钟内完成的工作反而更少?
您可能遇到了缓存争用——当只有一个CPU递增某个整数时,它可以完全在自己的L1缓存中完成;但一旦两个CPU开始"争夺"同一个值(或位于同一缓存行上的相邻值),每次访问时,该值所在的缓存行都必须在两个CPU的缓存之间来回复制.在缓存之间复制数据所花费的额外时间会迅速累积,尤其是当所执行的操作(递增整数)本身非常简单时.
| 归档时间: |
|
| 查看次数: |
1134 次 |
| 最近记录: |