pthread多线程加速示例(大型矩阵乘法):Blocking,1024线程^_^
2014-04-20 13:30
162 查看
使用分块方式,加速效果更为显著,以32*32=1024线程测试,比非Blocking方式(参考这篇文章)提升十几倍。
需要特别注意:线程数量过多也会消耗更多系统资源,并带来额外的时间开销。下述 param 参数结构体与前例不同,可按需要修改。
日志节选
需要特别注意:线程数量过多也会消耗更多系统资源,并带来额外的时间开销。下述 param 参数结构体与前例不同,可按需要修改。
// Multi-Thread Speedup:Blocking Method #include <cmnheader.h> #include <time.h> #include "MatrixLib.h" #pragma comment(lib,"MatrixLib.lib") #pragma warning(disable:4996) void checkResult(char* str, int value, FILE* pflog) { if (value != 0) { fprintf(pflog, "Failed with %d at %s", value, str); exit(1); } } typedef struct { FILE* pflog; double** R; double** A; double** B; int start_row; int end_row; int start_col; int end_col; } threadParm_t; // 在此之前请先执行ZeroInitSquareMatrix(R,N) void *oneThread(void *param) { threadParm_t *p = (threadParm_t *)param; //fprintf(p->pflog, "# Thread \'%.8X %.8X\' is now running.\n", getpid()); double** R = p->R; double** A = p->A; double** B = p->B; int start_row = p->start_row; int end_row = p->end_row; int start_col = p->start_col; int end_col = p->end_col; double tmp; for (int i = start_row; i < end_row; ++i) { for (int j = start_col; j < end_col; ++j) { tmp = 0; for (int k = start_col; k < end_col; ++k) { tmp += A[i][k] * B[j][k]; } R[i][j] += tmp; } } return NULL; } void OneTry(const int N, const int C, FILE* pflog) { int CC = C*C; fprintf(pflog, "== %4d * %4d Matrix Multiply, %d Threads. 
==\n", N, N, CC); clock_t start = clock(); double** X = NewSquareMatrix(N); double** Y = NewSquareMatrix(N); double** Z = NewSquareMatrix(N); TransformSquareMat(Z, N); // 转置一次 int start_row = 0, end_row = 0; int start_col = 0, end_col = 0; int inc_row = N / C,inc_col=N/C; end_row = start_row + inc_row; end_col = start_col + inc_col; int i, j,k,rc; pthread_t* threads = new pthread_t[CC]; threadParm_t* tparams = new threadParm_t[CC]; for (i = 0; i < C; ++i) { for (j = 0; j < C; ++j) { k = i*C + j; tparams[k].pflog = pflog; tparams[k].R = X; tparams[k].A = Y; tparams[k].B = Z; tparams[k].start_row = start_row; tparams[k].end_row = end_row; tparams[k].start_col = start_col; tparams[k].end_col = end_col; start_row = end_row + 1; end_row += inc_row; start_col = end_col + 1; end_col += inc_col; start_row %= N; end_row %= N; start_col %= N; end_col %= N; rc = pthread_create(&threads[k], NULL, oneThread, &tparams[k]); checkResult("!! pthread_create()\n", rc, pflog); //fprintf(pflog, "********** %4d of %4d threads created **********\n", k + 1, CC); } } fprintf(pflog, "@ Waiting for worker threads' end...\n"); int* status = new int[CC]; for (i = 0; i < CC; ++i) { rc = pthread_join(threads[i], (void**)(&status[i])); checkResult("!! pthread_join()\n", rc, pflog); } fprintf(pflog, "@ Check all thread's results\n"); for (i = 0; i < CC; ++i) { if (status[i] != NULL) { fprintf(pflog, "!! Unexpected thread status\n"); } } //TransformSquareMat(Z, N); // 恢复 SafeDeleteSquareMat(X, N); SafeDeleteSquareMat(Y, N); SafeDeleteSquareMat(Z, N); clock_t finish = clock(); fprintf(pflog, "@ All finished. 
Total time:%.8f(sec).\n\n", (finish - start) / (1.0*CLOCKS_PER_SEC)); } int main(int argc, char **argv) { FILE* pflog = fopen("trace_log.txt", "a"); const int N = 4096, C = 32; printf("Matrix N=%d,Thread C=%d, now running...", N, C*C); time_t rawtime; time(&rawtime); tm* tminfo = localtime(&rawtime); fprintf(pflog, "\nNEW LOG @%s", asctime(tminfo)); OneTry(N, C, pflog); fflush(pflog); fclose(pflog); printf("finshed!\n"); system("pause"); return 0; }
日志节选
NEW LOG @Sun Apr 20 13:18:12 2014 == 4096 * 4096 Matrix Multiply, 1024 Threads. == ********** 1 of 1024 threads created ********** ********** 2 of 1024 threads created ********** ********** 3 of 1024 threads created ********** # Thread '000033A8 00F91A80' is now running. # Thread '000033A8 00F91B60' is now running. (以下省略...) @ Check all thread's results @ All finished. Total time:2.57800000(sec). NEW LOG @Sun Apr 20 13:18:42 2014 == 4096 * 4096 Matrix Multiply, 256 Threads. == ********** 1 of 256 threads created ********** ********** 2 of 256 threads created ********** ********** 3 of 256 threads created ********** ********** 4 of 256 threads created ********** # Thread '00003470 01001A80' is now running. # Thread '00003470 01001B60' is now running. # Thread '00003470 01003578' is now running. # Thread '00003470 01003888' is now running. ********** 5 of 256 threads created ********** (以下省略...) @ Check all thread's results @ All finished. Total time:3.60900000(sec). NEW LOG @Sun Apr 20 13:18:52 2014 == 4096 * 4096 Matrix Multiply, 64 Threads. == ********** 1 of 64 threads created ********** ********** 2 of 64 threads created ********** ********** 3 of 64 threads created ********** # Thread '0000368C 009B1A80' is now running. ********** 4 of 64 threads created ********** # Thread '0000368C 009B1B60' is now running. (以下省略...) # Thread '0000368C 009B3888' is now running. @ Check all thread's results @ All finished. Total time:6.90600000(sec). NEW LOG @Sun Apr 20 13:29:52 2014 == 4096 * 4096 Matrix Multiply, 1024 Threads. == @ Waiting for worker threads' end... @ Check all thread's results @ All finished. Total time:2.44600000(sec).
相关文章推荐
- pthread多线程加速示例(大型矩阵乘法)
- C - pthread多线程最简单示例
- java 多线程并行计算之矩阵乘法(星星笔记)
- 矩阵乘法的多线程实现
- 使用各种方法加速大型矩阵运算的效率对比
- java 多线程并行计算之矩阵乘法继承Thread类实现(星星笔记)
- 如何使用矩阵乘法加速动态规划——以[SDOI2009]HH去散步为例
- c++的矩阵乘法加速trick
- python 多线程稀疏矩阵乘法
- 多线程本地图片加载示例【OpenCV】【Pthread】
- 分享矩阵乘法单线程与多线程的Java实现与效率对比,请教Strassen算法
- 多线程编程-矩阵乘法
- CUDA矩阵乘法——VS2010中使用CUDA示例
- 算法导论,动态规划 —— 矩阵链乘法(python代码实现示例)
- 多线程--Python下载(支持断点续传) & Java多线程计算矩阵乘法
- [pthread]Linux C 多线程简单示例
- 用多线程并发的方式来计算两个矩阵的乘法
- CUDA矩阵乘法——VS2010中使用CUDA示例
- 51NOD 1836 战忽局的手段(矩阵乘法加速+__float128精度问题+概率期望)——算法马拉松20(告别美国大选及卡斯特罗)
- c++的矩阵乘法加速trick