矩阵乘法-分块计算
2012-05-28 20:43
225 查看
将整个矩阵分解为这样的小块,每次完成一对小块的计算,以提高Cache的命中率。
提示:
图中n=N/m
计算次序为A11*B11, A11*B12,…, A11*B1n,,由于反复使用A11,因此可以提高Cache的命中率。
提示:
图中n=N/m
计算次序为A11*B11, A11*B12,…, A11*B1n,,由于反复使用A11,因此可以提高Cache的命中率。
/* 矩阵分开计算 C=A*B --- C(i,j)等于A的第i行乘以第j列 */ #include <stdio.h> #include <time.h> #include <stdlib.h> #include <math.h> #include <windows.h> /* 生成n*n矩阵 */ void GenerateMatrix(float *m, int n); void PrintMatrix(float *p, int n); void GeneralMul(float *A, float *B, float *C, int n); void ClearMatrix(float *m, int n); /* 矩阵分块计算 */ void BlockCacul(float *A, float *B, float *C, int n, int thread_num, int m); /* 两个矩阵的误差 */ float diff(float *C1, float *C0, int n); struct ARG { float *A; int ax, ay; float *B; int bx, by; float *C; int cx, cy; int m; int n; }; int main(int argc, char **argv) { if (argc != 4) { printf("Usage: %s N thread_num M\n", argv[0]); return 0; } int n=atoi(argv[1]); int thread_num = atoi(argv[2]); int m = atoi(argv[3]); float *A = new float[n*n]; float *B = new float[n*n]; float *C = new float[n*n]; float *C0 = new float[n*n]; GenerateMatrix(A, n); GenerateMatrix(B, n); clock_t start; float time_used; ClearMatrix(C0, n); start=clock(); GeneralMul(A, B, C0, n); time_used = static_cast<float>(clock() - start)/CLOCKS_PER_SEC*1000; printf("General: time = %f\n", time_used); ClearMatrix(C, n); start=clock(); BlockCacul(A, B, C, n, thread_num, m); time_used = static_cast<float>(clock() - start)/CLOCKS_PER_SEC*1000; printf("Block: time = %f\n", time_used); printf("Difference of two result: %f\n", diff(C0, C, n)); delete [] A; delete [] B; delete [] C; delete [] C0; return 0; } void ClearMatrix(float *m, int n) { for (int i=0; i<n; i++) { for (int j=0; j<n; j++) m[i*n+j]=0.0; } } /* 普通矩阵相乘 */ void GeneralMul(float *A, float *B, float *C, int n) { for (int i=0; i<n; i++) { for (int j=0; j<n; j++) { float *p = C+i*n+j; for (int k=0; k<n; k++) { *p += A[i*n+k]*B[k*n+j]; } } } } DWORD WINAPI Mul_Fun(LPVOID arg) { struct ARG *p = (struct ARG *)arg; float *A = p->A; float *B = p->B; float *C = p->C; int m = p->m; int n = p->n; for (int i=0; i<m; i++) { for (int j=0; j<m; j++) { float *t = C+(i+p->cx)*n+p->cy+j; for (int k=0; k<m; k++) { *t += A[(p->ax+i)*n+p->ay+k]*B[(p->bx+k)*n+p->by+j]; } } } return 0; } void BlockCacul(float *A, float *B, float *C, int n, int thread_num, int m) { //m = static_cast<int>(sqrt(m)); struct ARG *args = new struct ARG[thread_num]; HANDLE *handle = new HANDLE[thread_num]; int t=0; int i; for (i=0; i<thread_num; i++) { args[i].A = A; args[i].B = B; args[i].C = C; args[i].m = m; args[i].n = n; } //分成n/m x n/m块 //A i行j列 for (i=0; i<n; i+=m) { for (int j=0; j<n; j+=m) { //B j行k列 for (int k=0; k<n; k+=m) { args[t].ax = i; args[t].ay = j; args[t].bx = j; args[t].by = k; args[t].cx = i; args[t].cy = k; if (t<thread_num) { handle[t] = CreateThread(NULL, 0, Mul_Fun, (LPVOID)(&args[t]), 0, 0 ); t++; } if (t==thread_num) { for (int ii=0; ii<t; ii++) WaitForMultipleObjects(thread_num, &handle[ii],TRUE,INFINITE); t = 0; } } } } } void GenerateMatrix(float *p, int n) { srand(time(NULL)+rand()); for (int i=0; i<n*n; i++) { *p = static_cast<float>(rand())/ (static_cast<float>(rand())+ static_cast<float>(0.55)); p++; } } float diff(float *C1, float *C0, int n) { float rst=0.0; float t; for (int i=0; i<n; i++) { for (int j=0; j<n; j++) { t = C1[i*n+j]-C0[i*n+j]; if (t<0) t = -t; rst += t; } } return rst; } void PrintMatrix(float *p, int n) { for (int i=0; i<n; i++) { for (int j=0; j<n; j++) { printf("%.2f\t", p[i*n+j]); } printf("\n"); } printf("\n"); }
相关文章推荐
- 多线程--Python下载(支持断点续传) & Java多线程计算矩阵乘法
- 手把手教你用Execel计算两个矩阵的乘法
- 华为OJ(矩阵乘法计算量估计)
- HDU 1082 矩阵乘法次数计算 写了半天搞定,很有成就感啊
- 矩阵乘法计算量估算
- 矩阵乘法的计算和来源
- CUDA之矩阵乘法——非方阵计算
- 动态规划 的方法求矩阵乘法的最少计算加括号方式
- 计算矩阵运算的乘法次数
- NumPy中的乘法运算符 * 指示按元素计算,矩阵乘法可以使用 dot 函数或创建矩阵对象实现
- C语言科学计算入门之矩阵乘法的相关计算
- MPI多进程并行计算矩阵乘法实现
- 向MapReduce转换:通过部分成绩计算矩阵乘法
- java 多线程并行计算之矩阵乘法(星星笔记)
- 从矩阵乘法的不同计算方式来看局部性原理
- 计算矩阵乘法所需运算的次数
- 计算矩阵运算的乘法次数
- 汇编—双重循环—计算4*4矩阵与4*1矩阵乘法
- Strassen矩阵乘法 + 快速计算乘方的算法 + 矩阵的次幂
- 线性代数教程之一——矩阵乘法计算、理解及代码实现