jli*_*lim 12 .net c# parallel-processing multithreading
我正在玩C#,想加快一个程序.我做了改变,并且能够这样做.但是,我需要帮助理解为什么变化使它变得更快.
我试图将代码简化为更容易理解的问题.Score1和Report1是较慢的方式.Score2和Report2是更快的方式.第一种方法首先并行地在结构中存储字符串和int.接下来,在串行循环中,它循环遍历这些结构的数组并将其数据写入缓冲区.第二种方法首先将数据并行写入字符串缓冲区.接下来,在串行循环中,它将字符串数据写入缓冲区.以下是一些示例运行时间:
运行 1 总平均时间 = 0.492087 秒;运行 2 总平均时间 = 0.273619 秒
当我使用早期的非并行版本时,两种方式的时间几乎相同.为什么并行版本会出现这样的差异?
即使我减少Report1中的循环以将单行输出写入缓冲区,它仍然较慢(总时间约为.42秒).
这是简化的代码:
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Diagnostics;
using System.Threading.Tasks;
using System.IO;
namespace OptimizationQuestion
{
    /// <summary>
    /// Micro-benchmark comparing two ways of scoring a word list and reporting the result.
    /// Approach 1 (<see cref="Score1"/> / <see cref="Report1"/>) stores word/score pairs in an
    /// array and formats them later; approach 2 (<see cref="Score2"/> / <see cref="Report2"/>)
    /// formats the text while scoring and only copies it when reporting.
    /// Both approaches produce the same report text.
    /// </summary>
    class Program
    {
        /// <summary>A filtered word together with its score.</summary>
        struct ValidWord
        {
            public string word;
            public int score;
        }

        // Number of timed repetitions that are averaged for each approach.
        const int TimingRepetitions = 20;

        // Only words of exactly this length are scored; the score equals the length.
        const int ScoredWordLength = 3;

        ValidWord[] valid;      // written by Score1, read by Report1
        StringBuilder output;   // written by Score2, read by Report2
        int total;              // sum of scores accumulated by Score2

        /// <summary>Returns <paramref name="word"/> with every 'U' character removed.</summary>
        static string StripU(string word)
        {
            StringBuilder builder = new StringBuilder(word.Length);
            foreach (char c in word)
            {
                if (c != 'U')
                {
                    builder.Append(c);
                }
            }
            return builder.ToString();
        }

        /// <summary>
        /// Scores <paramref name="words"/> into an internal array, one slot per input word.
        /// Slots of words that are not scored keep the default value (score 0).
        /// </summary>
        /// <param name="words">The words to score.</param>
        /// <exception cref="ArgumentNullException"><paramref name="words"/> is null.</exception>
        public void Score1(string[] words)
        {
            if (words == null)
            {
                throw new ArgumentNullException("words");
            }

            ValidWord[] scored = new ValidWord[words.Length];
            for (int i = 0; i < words.Length; i++)
            {
                string word = words[i];

                // Check the length first so no builder is allocated for words that are discarded.
                if (word.Length != ScoredWordLength)
                {
                    continue;
                }

                scored[i] = new ValidWord { word = StripU(word), score = word.Length };
            }
            valid = scored;
        }

        /// <summary>
        /// Appends one "word score" line per scored word, followed by a "Total = n" line,
        /// to <paramref name="outputBuffer"/>.
        /// </summary>
        /// <param name="outputBuffer">The buffer that receives the report.</param>
        /// <exception cref="ArgumentNullException"><paramref name="outputBuffer"/> is null.</exception>
        /// <exception cref="InvalidOperationException"><see cref="Score1"/> has not been called yet.</exception>
        public void Report1(StringBuilder outputBuffer)
        {
            if (outputBuffer == null)
            {
                throw new ArgumentNullException("outputBuffer");
            }
            if (valid == null)
            {
                throw new InvalidOperationException("Score1 must be called before Report1.");
            }

            int reportTotal = 0;
            foreach (ValidWord wordInfo in valid)
            {
                // Unscored slots were left at their default value and have score 0.
                if (wordInfo.score > 0)
                {
                    outputBuffer.AppendFormat("{0} {1}", wordInfo.word, wordInfo.score).AppendLine();
                    reportTotal += wordInfo.score;
                }
            }
            outputBuffer.AppendFormat("Total = {0}", reportTotal).AppendLine();
        }

        /// <summary>
        /// Scores <paramref name="words"/> and formats the "word score" lines immediately into
        /// an internal buffer, accumulating the total as it goes.
        /// </summary>
        /// <param name="words">The words to score.</param>
        /// <exception cref="ArgumentNullException"><paramref name="words"/> is null.</exception>
        public void Score2(string[] words)
        {
            if (words == null)
            {
                throw new ArgumentNullException("words");
            }

            StringBuilder lines = new StringBuilder();
            int sum = 0;
            for (int i = 0; i < words.Length; i++)
            {
                string word = words[i];

                // Check the length first so no builder is allocated for words that are discarded.
                if (word.Length != ScoredWordLength)
                {
                    continue;
                }

                lines.AppendFormat("{0} {1}", StripU(word), word.Length).AppendLine();
                sum += word.Length;
            }
            output = lines;
            total = sum;
        }

        /// <summary>
        /// Appends the text prepared by <see cref="Score2"/>, followed by a "Total = n" line,
        /// to <paramref name="outputBuffer"/>.
        /// </summary>
        /// <param name="outputBuffer">The buffer that receives the report.</param>
        /// <exception cref="ArgumentNullException"><paramref name="outputBuffer"/> is null.</exception>
        /// <exception cref="InvalidOperationException"><see cref="Score2"/> has not been called yet.</exception>
        public void Report2(StringBuilder outputBuffer)
        {
            if (outputBuffer == null)
            {
                throw new ArgumentNullException("outputBuffer");
            }
            if (output == null)
            {
                throw new InvalidOperationException("Score2 must be called before Report2.");
            }

            outputBuffer.Append(output.ToString());
            outputBuffer.AppendFormat("Total = {0}", total).AppendLine();
        }

        /// <summary>
        /// Runs <paramref name="score"/> on every program in parallel, then
        /// <paramref name="report"/> on every program serially, and returns the average
        /// wall-clock time in seconds over <see cref="TimingRepetitions"/> repetitions.
        /// </summary>
        static double MeasureAverageSeconds(
            Program[] programs,
            StringBuilder output,
            Action<Program> score,
            Action<Program, StringBuilder> report)
        {
            Stopwatch stopwatch = new Stopwatch();
            double totalSeconds = 0.0;
            for (int i = 0; i < TimingRepetitions; ++i)
            {
                stopwatch.Restart();
                output.Clear();
                Parallel.ForEach(programs, score);
                foreach (Program p in programs)
                {
                    report(p, output);
                }
                stopwatch.Stop();
                totalSeconds += stopwatch.Elapsed.TotalSeconds;

                // Collect outside the timed region so garbage from this repetition
                // is not charged to the next one.
                GC.Collect();
            }
            return totalSeconds / TimingRepetitions;
        }

        /// <summary>
        /// Times both approaches over 100 <see cref="Program"/> instances and prints the averages.
        /// The word list is read from the path given as the first argument, or from
        /// "words.txt" when no argument is supplied.
        /// </summary>
        static void Main(string[] args)
        {
            string path = (args != null && args.Length > 0) ? args[0] : "words.txt";

            Program[] programs = new Program[100];
            for (int i = 0; i < programs.Length; i++)
            {
                programs[i] = new Program();
            }

            string[] words = File.ReadAllLines(path);
            StringBuilder output = new StringBuilder();

            double averageTime1 = MeasureAverageSeconds(
                programs, output, p => p.Score1(words), (p, buffer) => p.Report1(buffer));
            Console.WriteLine("Run 1 Total Average Time = {0:0.000000} sec", averageTime1);

            double averageTime2 = MeasureAverageSeconds(
                programs, output, p => p.Score2(words), (p, buffer) => p.Report2(buffer));
            Console.WriteLine("Run 2 Total Average Time = {0:0.000000} sec", averageTime2);

            Console.ReadLine();
        }
    }
}
Run Code Online (Sandbox Code Playgroud)
首先,您正在并行重复运行。这将改善您的基准测试时间,但可能对您的实际生产运行时间没有太大帮助。为了准确测量实际运行完一个单词列表需要多长时间,您需要一次只运行一个单词列表。否则,处理所有列表的各个线程在某种程度上会相互竞争系统资源,并且每个列表的时间都会受到影响,即使处理所有列表的总时间更快。
为了加快处理一个单词列表的时间,您需要并行处理列表中的各个单词,一次只处理一个列表。为了获得足够的测量精度/数据量以进行良好的测量,要么使列表很长,要么连续处理该列表多次。
就您而言,这有点棘手,因为最终产品所需的字符串构建器没有记录为线程安全的。不过,情况并没有那么糟糕。下面是为单个单词列表调用并行 foreach 的示例:
// Guards outputBuffer and score, which every worker thread updates.
// (The author would make this static, but as a captured local it still works.)
var locker = new object();
// Things improve further if you can estimate the final size and pre-allocate it up front.
var outputBuffer = new StringBuilder();
int score = 0;
Parallel.ForEach(words, w =>
{
    // Only three-letter words are scored, matching Score1/Score2 in the question.
    if (w.Length != 3)
    {
        return;
    }

    // We want to push as much of the work to the individual threads as possible.
    // If run in 1 thread, a StringBuilder per word would be bad.
    // Run in parallel, it allows us to do a little more of the work outside of locked code.
    var buf = new StringBuilder(w.Length + 5);
    string line = buf.Append(w.Where(c => c != 'U').ToArray())
                     .Append(' ')
                     .Append(w.Length)
                     .ToString();
    lock (locker)
    {
        // Lines are appended in whatever order the workers reach the lock,
        // which is not necessarily the input order.
        outputBuffer.AppendLine(line);
        score += w.Length;
    }
});
outputBuffer.Append("Total = ").Append(score);
Run Code Online (Sandbox Code Playgroud)
只需在普通的顺序 for 循环中调用它 20 次即可。同样,它完成基准测试的速度可能会慢一些,但我认为由于基准测试中存在缺陷,它在现实世界中的执行速度会更快一些。另请注意,我是将其直接输入到回复窗口中的 - 我从未尝试过编译它,因此它很可能一开始并不完美。
修复基准以更准确地反映并行代码将如何影响您的实际处理时间后,下一步是进行一些分析,以了解您的程序实际将时间花在哪里。这样您就知道需要在哪些方面进行改进。
出于好奇,我也想知道这个版本的表现如何:
// Seed accumulator: running score plus the buffer that collects the report lines.
var agg = new { score = 0, OutputBuffer = new StringBuilder() };
agg = words.Where(w => w.Length == 3)
    // Carry the original length along; after stripping 'U' the word may be shorter.
    .Select(w => new { Word = new string(w.Where(c => c != 'U').ToArray()), Score = w.Length })
    // Anonymous-type members are read-only, so each step returns a new accumulator
    // that reuses the same StringBuilder (AppendFormat returns the builder itself).
    .Aggregate(agg, (a, w) => new
    {
        score = a.score + w.Score,
        OutputBuffer = a.OutputBuffer.AppendFormat("{0} {1}\n", w.Word, w.Score)
    });
agg.OutputBuffer.Append("Total = ").Append(agg.score);
Run Code Online (Sandbox Code Playgroud)