我有一个 100 MB 的文本文件,需要检查每一行是否包含特定的单词。我正在寻找一种快速的方法来做到这一点。
所以我把文件的所有行分成 10 块,每块交给一个线程处理:
/// <summary>
/// Reads every line of <c>FilePath</c>, splits the lines into ten contiguous
/// chunks, hands each chunk to its own <c>ParserThread</c> running on a
/// background thread, and blocks until all ten threads have finished.
/// </summary>
/// <param name="bg">Worker passed to the shared <c>reports</c> instance that every parser receives.</param>
public void ParseTheFile(BackgroundWorker bg)
{
    const int workerCount = 10;

    Lines = File.ReadAllLines(FilePath);
    this.size = Lines.Length;
    chankSise = size / workerCount;
    reports reportInst = new reports(bg, size);

    ParserThread[] ParserthreadArray = new ParserThread[workerCount];
    Thread[] workers = new Thread[workerCount];
    for (int i = 0; i < workerCount; i++)
    {
        int start = i * chankSise;
        // Integer division truncates, so the last chunk also takes the
        // remaining (Lines.Length % workerCount) lines; otherwise they would
        // never be handed to any parser.
        int count = (i == workerCount - 1) ? Lines.Length - start : chankSise;

        ParserthreadArray[i] = new ParserThread(reportInst);
        ParserthreadArray[i].Init(SubArray(Lines, start, count), OutputPath);

        workers[i] = new Thread(ParserthreadArray[i].run);
        workers[i].IsBackground = true;
    }

    // Start every worker before joining any of them so they run concurrently.
    foreach (Thread worker in workers)
    {
        worker.Start();
    }

    foreach (Thread worker in workers)
    {
        worker.Join();
    }
}
Run Code Online (Sandbox Code Playgroud)
这是Init方法:
/// <summary>
/// Stores the lines this parser will process and derives its private output
/// folder from <paramref name="outputPath"/> and <c>threadID</c>.
/// </summary>
/// <param name="olines">Lines assigned to this parser.</param>
/// <param name="outputPath">Base output folder; a "ThreadTemp" + threadID sub-folder name is appended to it.</param>
public void Init(string [] olines,string outputPath)
{
    string threadFolder = "ThreadTemp" + threadID;
    OutputPath = outputPath + "/" + threadFolder;
    Lines = olines;
}
Run Code Online (Sandbox Code Playgroud)
这是SubArray方法:
/// <summary>
/// Returns a newly allocated array holding <paramref name="length"/> consecutive
/// elements of <paramref name="data"/>, beginning at <paramref name="index"/>.
/// </summary>
/// <param name="data">Array to copy from.</param>
/// <param name="index">Position in <paramref name="data"/> of the first element to copy.</param>
/// <param name="length">Number of elements to copy.</param>
/// <returns>The copied elements, in their original order.</returns>
public string [] SubArray(string [] data, int index, int length)
{
    string[] slice = new string[length];
    Array.Copy(
        sourceArray: data,
        sourceIndex: index,
        destinationArray: slice,
        destinationIndex: 0,
        length: length);
    return slice;
}
Run Code Online (Sandbox Code Playgroud)
并且每个线程执行此操作:
/// <summary>
/// Processes every line in <c>Lines</c>: a line that contains the token
/// "Handler" or "Engine" (searched from the sixth space-separated token
/// onwards) followed by a token starting with a digit is written to
/// "&lt;OutputPath&gt;/&lt;digit&gt;.txt"; any other line is appended to the file of the
/// most recently seen digit, provided that file has already been started by
/// this instance.
/// </summary>
/// <remarks>
/// A marker that is the last token on the line, or whose following token does
/// not start with a digit 0-9, is skipped instead of throwing; such a line is
/// then treated like a line without a marker.
/// </remarks>
public void run()
{
    if (!System.IO.Directory.Exists(OutputPath))
    {
        System.IO.Directory.CreateDirectory(OutputPath);
        DirectoryInfo dir = new DirectoryInfo(OutputPath);
        dir.Attributes |= FileAttributes.Hidden;
    }
    this.size = Lines.Length;
    foreach (string line in Lines)
    {
        bgReports.sendreport(allreadychecked);
        allreadychecked++;
        hadHandlerOrEngine = false;
        words = line.Split(' ');

        // "i + 1 < words.Length" guarantees the token after the marker exists.
        for (int i = 5; i + 1 < words.Length; i++)
        {
            if (words[i] != "Handler" && words[i] != "Engine")
            {
                continue;
            }

            string num = words[i + 1];
            // Consecutive spaces yield empty tokens; only a leading digit is usable.
            if (num.Length == 0 || num[0] < '0' || num[0] > '9')
            {
                continue;
            }

            int realnum = num[0] - '0';
            hadHandlerOrEngine = true;
            cuurentEngine = realnum;

            // The first hit for a digit starts the file from scratch (append: false
            // creates or truncates it); later hits append to it.
            bool append = !(engineArry[realnum] == false);
            using (TextWriter tw = new StreamWriter(OutputPath + "/" + realnum + ".txt", append))
            {
                tw.WriteLine(line);
            }
            engineArry[realnum] = true;
            break;
        }

        if (hadHandlerOrEngine == false && engineArry[cuurentEngine] == true)
        {
            using (TextWriter tw = new StreamWriter(OutputPath + "/" + cuurentEngine + ".txt", true))
            {
                tw.WriteLine(line);
            }
        }
    }
}
Run Code Online (Sandbox Code Playgroud)
我的问题是:有什么方法可以让它运行得更快?
你没有展示你的 Init 方法,但目前看起来你的每个线程都会检查所有的行。此外,看起来所有线程可能都在尝试写入相同的文件,而且没有以异常安全的方式(使用 using 语句)进行写入。
编辑:好的,现在我们可以看到 Init 了,但还看不到 SubArray。据推测,它只是复制了数组的一部分。
如果一开始不使用线程,它有多慢?真的慢到无法接受吗?你的性能目标是什么?使用 10 个线程似乎不太可能有所帮助,因为此时它完全受内存/CPU 限制。(你还应该尽量避免重复这么多代码来启动所有线程,为什么不使用集合呢?)
你的程序很可能受 IO 限制(IO-bound),所以我猜多线程不会有多大帮助。(你的程序很可能把大部分时间花在这一行:Lines = File.ReadAllLines(FilePath);,而实际解析所花的时间并不多。你应该实际测量一下。)实际上,用 SubArray 拆分数组可能比直接把整个数组交给一个解析线程还要慢。
我会考虑使用 MemoryMappedFile(如果这是 .NET 4),它不必复制所有源数据,因此可能对 IO 有一些帮助。