arc*_*pus 3 c++ multithreading file
在我的程序中,我想读取几个文本文件(超过约800个文件),每个文件有256行,文件名从1.txt到n.txt,并在几个处理步骤后存储到数据库中.我的问题是数据的读取速度.通过使用OpenMP多线程读取循环,我可以将程序速度提高到以前的两倍.有没有办法加快速度?我的实际代码是
std::string CCD_Folder = CCDFolder; //CCDFolder is a pointer to a char array
int b = 0;
int PosCounter = 0;
int WAVENUMBER, WAVELUT;
std::vector<std::string> tempstr;
std::string inputline;
//Input
omp_set_num_threads(YValue);
#pragma omp parallel for private(WAVENUMBER) private(WAVELUT) private(PosCounter) private(tempstr) private(inputline)
for(int i = 1; i < (CCD_Filenumbers+1); i++)
{
//std::cout << omp_get_thread_num() << ' ' << i << '\n';
//Umwandlung und Erstellung des Dateinamens, Öffnen des Lesekanals
std::string CCD_Filenumber = boost::lexical_cast<string>(i);
std::string CCD_Filename = CCD_Folder + '\\' + CCD_Filenumber + ".txt";
std::ifstream datain(CCD_Filename, std::ifstream::in);
while(!datain.eof())
{
std::getline(datain, inputline);
//Processing
};
};
Run Code Online (Sandbox Code Playgroud)
此处未定义的所有变量都在我的代码中的其他位置定义,并且它正在工作.那么有可能加快这个代码的速度吗?
非常感谢你!
一些实验:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <Windows.h>
void generateFiles(int n) {
char fileName[32];
char fileStr[1032];
for (int i=0;i<n;i++) {
sprintf( fileName, "c:\\t\\%i.txt", i );
FILE * f = fopen( fileName, "w" );
for (int j=0;j<256;j++) {
int lineLen = rand() % 1024;
memset(fileStr, 'X', lineLen );
fileStr[lineLen] = 0x0D;
fileStr[lineLen+1] = 0x0A;
fileStr[lineLen+2] = 0x00;
fwrite( fileStr, 1, lineLen+2, f );
}
fclose(f);
}
}
void readFiles(int n) {
char fileName[32];
for (int i=0;i<n;i++) {
sprintf( fileName, "c:\\t\\%i.txt", i );
FILE * f = fopen( fileName, "r" );
fseek(f, 0L, SEEK_END);
int size = ftell(f);
fseek(f, 0L, SEEK_SET);
char * data = (char*)malloc(size);
fread(data, size, 1, f);
free(data);
fclose(f);
}
}
DWORD WINAPI readInThread( LPVOID lpParam )
{
int * number = (int *)lpParam;
char fileName[32];
sprintf( fileName, "c:\\t\\%i.txt", *number );
FILE * f = fopen( fileName, "r" );
fseek(f, 0L, SEEK_END);
int size = ftell(f);
fseek(f, 0L, SEEK_SET);
char * data = (char*)malloc(size);
fread(data, size, 1, f);
free(data);
fclose(f);
return 0;
}
int main(int argc, char ** argv) {
long t1 = GetTickCount();
generateFiles(256);
printf("Write: %li ms\n", GetTickCount() - t1 );
t1 = GetTickCount();
readFiles(256);
printf("Read: %li ms\n", GetTickCount() - t1 );
t1 = GetTickCount();
const int MAX_THREADS = 256;
int pDataArray[MAX_THREADS];
DWORD dwThreadIdArray[MAX_THREADS];
HANDLE hThreadArray[MAX_THREADS];
for( int i=0; i<MAX_THREADS; i++ )
{
pDataArray[i] = (int) HeapAlloc(GetProcessHeap(), HEAP_ZERO_MEMORY,
sizeof(int));
pDataArray[i] = i;
hThreadArray[i] = CreateThread(
NULL,
0,
readInThread,
&pDataArray[i],
0,
&dwThreadIdArray[i]);
}
WaitForMultipleObjects(MAX_THREADS, hThreadArray, TRUE, INFINITE);
printf("Read (threaded): %li ms\n", GetTickCount() - t1 );
}
Run Code Online (Sandbox Code Playgroud)
第一个函数只是制作一个测试数据集的丑陋的东西(我知道它可以做得更好,但我老实说没有时间)
第一个实验 - 顺序读取第二个实验 - 并行读取
结果:
256个文件:
Write: 250 ms
Read: 140 ms
Read (threaded): 78 ms
Run Code Online (Sandbox Code Playgroud)
1024个文件:
Write: 1250 ms
Read: 547 ms
Read (threaded): 843 ms
Run Code Online (Sandbox Code Playgroud)
我认为第二次尝试清楚地表明,从长远来看,"愚蠢"的线程创建只会让事情变得更糟.当然,它需要在预分配工作者,某些线程池等方面进行改进,但我认为通过从磁盘读取100-200k这样快速的操作,将此功能转移到线程中并没有什么好处.我没有时间编写更"聪明"的解决方案,但我怀疑它会更快,因为你必须为互斥锁等添加系统调用......
走极端,你可以想到预分配内存池等..但正如在代码之前提到你发布的错误..这是一个毫秒的问题,但肯定不是秒
800个文件(每行20个字符,256行)
Write: 250 ms
Read: 63 ms
Read (threaded): 500 ms
Run Code Online (Sandbox Code Playgroud)
结论:
答案是:
您的阅读代码是错误的,您阅读文件的速度非常慢,以至于速度显着提高,然后您就可以并行运行任务.在上面的代码中,读取实际上比生成线程的开销更快