Paq*_*ito 6 c memory-management matrix
我做了一个执行矩阵乘法的程序(没有优化).
for(i=0; i<a_r; i++)
for(j=0;j<b_c; j++)
for(k=0;k<a_c; k++)
c[i][j]=c[i][j]+a[i][k]*b[k][j];
Run Code Online (Sandbox Code Playgroud)
根据我分配内存的方式,计算时间不同.
重要的是要注意三点:
我用三种不同的分配来测试这个程序:
测试1:全局分配
Int a[2500][2500], b[2500][2500], c[2500][2500];
Main {matrix multiplication a*b=c}
Run Code Online (Sandbox Code Playgroud)
计算时间非常稳定(大约41s).
测试2:动态分配(数组)
Main{
int **a,**b,**c;
a=(int **) malloc(sizeof(int)*2500);
for( i=0;i<a_c; i++)
a[i]=(int *) malloc(sizeof(int)*2500);
…
matrix multiplication a*b=c
}
Run Code Online (Sandbox Code Playgroud)
当我以原始方式多次执行程序时,我获得这些运行时间:260秒,180,110,110 ... 如果我等待大约5秒钟并再次启动程序,我会得到相同的结果.
测试3:动态分配(线)
Main{
int *a, *b, *c;
a=(int *) malloc(sizeof(int)*2500*2500);
… (same for b and c)
matrix multiplication a*b=c
}
Run Code Online (Sandbox Code Playgroud)
计算时间非常稳定(大约44秒).
我认为测试2效率较低,因为数据存储在内存中的方式.就像本文中的解释(网站今天出来)或在这个问题中.通过内存的一些方法更有效,因此一些分配内存的方法可以为您提供更高效的程序.
但是(在测试2中),我不知道为什么程序随着时间的推移会更快.有没有人有想法解释这种现象?提前致谢.
PS我在带有CentOS 6.3和内核Linux 3.11.1的Intel Xeon E5-2625上进行了这些测试.编辑:我的计算机上禁用了频率缩放功能.频率是恒定的.
这是代码:
#include <stdlib.h>
#include <stdio.h>
#include <time.h>
#include <float.h>
#include <sys/time.h>
#include <sys/mman.h>
#include <sys/resource.h>
/****************************************************
* Random generate a random int between _Min and _Max
*/
int Random(int _iMin,int _iMax)
{
return (_iMin + (rand()%(_iMax-_iMin+1)));
}
/****************************************************
* Return the struct of getrusage.
*/
struct rusage give_ru()
{
struct rusage ru;
getrusage(RUSAGE_SELF, &ru);
//getrusage(RUSAGE_CHILDREN, &ru);
return ru;
}
/****************************************************
* Do a matrix multiplication a*b=c
* This program isn't optimized.
*/
int main()
{
/*
* Initialize variables and array sizes.
*/
int **a,**b,**c;
printf("\nenter Taille de la matrice : 2500");
int a_r=2500,a_c=2500,b_r=2500,b_c=2500;
clock_t start,end;
struct rusage start1,end1;
/* variables to store time difference between
start of matrix multiplication and end of multiplication. */
double dif; /*variable to calculate the time difference between the parallelization */
int i,j,k;
if(a_c!=b_r )
{
printf("\ncan not multiply");
goto again;
}
/*
* Memory allocation.
*/
start =clock();
a=(int **) malloc(sizeof(int *)*a_r);
if (a == NULL)
{
printf("Allocation of matrix a failed \n");
exit(0);
}
for( i=0;i<a_c; i++)
{
a[i]=(int *) malloc(sizeof(int)*a_c);
if (a[i] == NULL)
{
printf("Allocation of matrix a[%d] failed \n",i);
exit(0);
}
}
b=(int **) malloc(sizeof(int *)*b_r);
if (b == NULL)
{
printf("Allocation of matrix b failed \n");
exit(0);
}
for( i=0;i<b_c; i++)
{
b[i]=(int *) malloc(sizeof(int)*b_c);
if (b[i] == NULL)
{
printf("Allocation of matrix b[%d] failed \n",i);
exit(0);
}
}
c=(int **) malloc(sizeof(int *)*a_r);
if (c == NULL)
{
printf("Allocation of matrix c failed \n");
exit(0);
}
for( i=0;i< b_c; i++)
{
c[i]=(int *) malloc(sizeof(int)*b_c);
if (c[i] == NULL)
{
printf("Allocation of matrix c[%d] failed \n",i);
exit(0);
}
}
/* Memory is lock*/
printf("Starting mlockall.\n");
if(mlockall(MCL_CURRENT | MCL_FUTURE)<0) {
perror("mlockall");
return 1;
}
/*
* Matrix initialization (with random integer).
*/
printf("Initializing matrices...\n");
//initializing first matrix
srand(time(NULL));
for(i=0;i<a_r; i++)
{
for(j=0;j<a_c; j++)
{
a[i][j] = Random(0,100);//i+j;
//printf("%d \n", a[i][j]);
}
}
// initializing second matrix
srand(time(NULL));
for(i=0;i<b_r; i++)
{
for(j=0;j<b_c; j++)
{
b[i][j] = Random(0,100);//i*j;
}
}
/*initialize product matrix */
srand(time(NULL));
for(i=0;i<a_r; i++)
{
for(j=0;j< b_c; j++)
{
c[i][j]= Random(0,100);//0;
}
}
end= clock(); //end the timer
dif = ((double) (end - start)) / CLOCKS_PER_SEC; //store the difference
printf("Malloc and initialization took %f sec. time.\n", dif);
/*
* Matrix Multiplication.
*/
start =clock(); //start the timer
start1 = give_ru(); //start the timer
/* multiply matrix one and matrix two */
for(i=0;i<a_r; i++)
for(j=0;j<a_c; j++)
for(k=0;k<b_c; k++)
c[i][j]=c[i][j]+a[i][k]*b[k][j];
end1 = give_ru(); //end the timer
end = clock(); //end the timer
dif = ((double) (end - start)) / CLOCKS_PER_SEC; //store the difference
struct timeval timeUserStart = start1.ru_utime;
double utimeStart = (double)timeUserStart.tv_sec + (double)timeUserStart.tv_usec / 1000000.0;
struct timeval timeSystemStart = start1.ru_stime;
double stimeStart = (double)timeSystemStart.tv_sec + (double)timeSystemStart.tv_usec / 1000000.0;
struct timeval timeUserEnd = end1.ru_utime;
double utimeEnd = (double)timeUserEnd.tv_sec + (double)timeUserEnd.tv_usec / 1000000.0;
struct timeval timeSystemEnd = end1.ru_stime;
double stimeEnd = (double)timeSystemEnd.tv_sec + (double)timeSystemEnd.tv_usec / 1000000.0;
double difuTime = utimeEnd - utimeStart;
double difsTime = stimeEnd - stimeStart;
long pageReclaims = end1.ru_minflt;//start1.ru_minflt-
long pageFaults = end1.ru_majflt;//start1.ru_majflt-
printf("Parallelization took %f sec. time.\n", dif);
printf("Time User : %f .\n", difuTime );
printf("Time System : %f .\n", difsTime );
printf("Page reclaims : %ld .\n", end1.ru_minflt);
printf("Page faults : %ld .\n", end1.ru_majflt);
/*
* free memory.
*/
printf("Finished, cleaning up \n");
munlockall();
/*free memory*/
for(i=0;i<a_r; i++)
{
free(a[i]);
a[i] = NULL;
}
free(a);
a = NULL;
for(i=0;i<a_c; i++)
{
free(b[i]);
b[i] = NULL;
}
free(b);
b = NULL;
for(i=0;i<b_c; i++)
{
free(c[i]);
c[i] = NULL;
}
free(c);
c = NULL;
return 0;
}
Run Code Online (Sandbox Code Playgroud)
我找出了答案的一部分。“最近的 Linux 内核支持透明大页,程序会在不知情的情况下使用它。” 如果我禁用大页,所有三个测试都需要 260 秒才能运行。
对于测试 1 和 3,他们总是使用大页。在这里您可以看到测试 1 和 3 中 AnonHugePages 的数量:

对于测试 2,它使用大页,但并非在所有情况下都使用大页,并且从程序开始就不再使用:

我仍然不明白为什么测试 2 仍然如此不规则。然而测试 3 使用系统调用 mmap() 来分配内存,而测试 2 使用 brk()。也许答案就在这里,我会继续探究。
(我忘记说了,程序运行了三遍。)