您的位置:首页 > 其它

矩阵乘法-分块计算

2012-05-28 20:43 225 查看
将整个矩阵分解为这样的小块,每次完成一对小块的计算,以提高Cache的命中率。
提示:



图中n=N/m
计算次序为A11*B11, A11*B12,…, A11*B1n,,由于反复使用A11,因此可以提高Cache的命中率。

/*
矩阵分开计算
C=A*B  --- C(i,j)等于A的第i行乘以第j列
*/
#include <stdio.h>
#include <time.h>
#include <stdlib.h>
#include <math.h>
#include <windows.h>

/*
生成n*n矩阵
*/
void GenerateMatrix(float *m, int n);
void PrintMatrix(float *p, int n);
void GeneralMul(float *A, float *B, float *C, int n);
void ClearMatrix(float *m, int n);
/*
矩阵分块计算
*/
void BlockCacul(float *A, float *B, float *C, int n, int thread_num, int m);
/*
两个矩阵的误差
*/
float diff(float *C1, float *C0, int n);

struct ARG {
float *A;
int ax, ay;
float *B;
int bx, by;
float *C;
int cx, cy;
int m;
int n;
};
int main(int argc, char **argv)
{
if (argc != 4)
{
printf("Usage: %s N thread_num M\n", argv[0]);
return 0;
}
int n=atoi(argv[1]);
int thread_num = atoi(argv[2]);
int m = atoi(argv[3]);
float *A = new float[n*n];
float *B = new float[n*n];
float *C = new float[n*n];
float *C0 = new float[n*n];

GenerateMatrix(A, n);
GenerateMatrix(B, n);

clock_t start;
float time_used;

ClearMatrix(C0, n);
start=clock();
GeneralMul(A, B, C0, n);
time_used = static_cast<float>(clock() - start)/CLOCKS_PER_SEC*1000;
printf("General:   time = %f\n", time_used);

ClearMatrix(C, n);
start=clock();
BlockCacul(A, B, C, n, thread_num, m);
time_used = static_cast<float>(clock() - start)/CLOCKS_PER_SEC*1000;
printf("Block:  time = %f\n", time_used);
printf("Difference of two result: %f\n", diff(C0, C, n));

delete [] A;
delete [] B;
delete [] C;
delete [] C0;
return 0;
}

void ClearMatrix(float *m, int n)
{
for (int i=0; i<n; i++)
{
for (int j=0; j<n; j++)
m[i*n+j]=0.0;
}
}
/*
普通矩阵相乘
*/
void GeneralMul(float *A, float *B, float *C, int n)
{
for (int i=0; i<n; i++)
{
for (int j=0; j<n; j++)
{
float *p = C+i*n+j;
for (int k=0; k<n; k++)
{
*p += A[i*n+k]*B[k*n+j];
}
}
}
}

DWORD WINAPI Mul_Fun(LPVOID arg)
{
struct ARG *p = (struct ARG *)arg;

float *A = p->A;
float *B = p->B;
float *C = p->C;
int m = p->m;
int n = p->n;
for (int i=0; i<m; i++)
{
for (int j=0; j<m; j++)
{
float *t = C+(i+p->cx)*n+p->cy+j;

for (int k=0; k<m; k++)
{
*t += A[(p->ax+i)*n+p->ay+k]*B[(p->bx+k)*n+p->by+j];
}
}
}

return 0;
}

void BlockCacul(float *A, float *B, float *C, int n, int thread_num, int m)
{
//m = static_cast<int>(sqrt(m));
struct ARG *args = new struct ARG[thread_num];
HANDLE *handle = new HANDLE[thread_num];
int t=0;
int i;
for (i=0; i<thread_num; i++)
{
args[i].A = A;
args[i].B = B;
args[i].C = C;
args[i].m = m;
args[i].n = n;
}
//分成n/m x n/m块
//A i行j列
for (i=0; i<n; i+=m)
{
for (int j=0; j<n; j+=m)
{
//B j行k列
for (int k=0; k<n; k+=m)
{
args[t].ax = i;
args[t].ay = j;
args[t].bx = j;
args[t].by = k;
args[t].cx = i;
args[t].cy = k;
if (t<thread_num)
{
handle[t] = CreateThread(NULL, 0, Mul_Fun, (LPVOID)(&args[t]), 0, 0 );
t++;
}
if (t==thread_num)
{
for (int ii=0; ii<t; ii++)
WaitForMultipleObjects(thread_num, &handle[ii],TRUE,INFINITE);
t = 0;
}
}
}
}

}
void GenerateMatrix(float *p, int n)
{
srand(time(NULL)+rand());
for (int i=0; i<n*n; i++)
{
*p = static_cast<float>(rand())/ (static_cast<float>(rand())+ static_cast<float>(0.55));
p++;
}
}

float diff(float *C1, float *C0, int n)
{
float rst=0.0;
float t;

for (int i=0; i<n; i++)
{
for (int j=0; j<n; j++)
{
t = C1[i*n+j]-C0[i*n+j];
if (t<0)
t = -t;
rst += t;
}
}
return rst;
}

void PrintMatrix(float *p, int n)
{
for (int i=0; i<n; i++)
{
for (int j=0; j<n; j++)
{
printf("%.2f\t", p[i*n+j]);
}
printf("\n");
}
printf("\n");
}
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: