C++ AMP同CUDA之间的性能比较
2016-02-03 16:18
357 查看
Date:2016-02-03
Author:kagula
Environment:
[1]Win10
[2]VS2013 Update5
[3]Cuda 7.5
从NVIDIA官网下载的cuda_7.5.18_windows.exe,cudatoolkit_3.1_win_64.exe
[4]Core i7-4790k + GTX960
测试方式
两个500阶矩阵相乘。
Corei7-4790K+GTX960 测试结果(GPU Boost模式打开情况下)
CPU: 217.866ms
C++ AMP: 35.1236ms
CUDA: 1.969ms
Corei5-2500K+Quadro K600测试结果
CPU: 252.744ms
C++ AMP: 73.1175ms
CUDA: 19.706ms
CUDA由于对threads做了分块优化,我这里写的cpu和amp代码没有优化,所以性能差距很大。
程序提示计算结果有错误,需要关闭boost改进GPU计算的正确性。
这里没有进一步测试。
测试用到的代码段
C++源代码
C++AMP源代码
CUDA源代码
源于Nvidia自带的sample,只列出修改部分
参考资料
[1]https://developer.nvidia.com/cuda-zone
Author:kagula
Environment:
[1]Win10
[2]VS2013 Update5
[3]Cuda 7.5
从NVIDIA官网下载的cuda_7.5.18_windows.exe,cudatoolkit_3.1_win_64.exe
[4]Core i7-4790k + GTX960
测试方式
两个500阶矩阵相乘。
Corei7-4790K+GTX960 测试结果(GPU Boost模式打开情况下)
CPU: 217.866ms
C++ AMP: 35.1236ms
CUDA: 1.969ms
Corei5-2500K+Quadro K600测试结果
CPU: 252.744ms
C++ AMP: 73.1175ms
CUDA: 19.706ms
CUDA由于对threads做了分块优化,我这里写的cpu和amp代码没有优化,所以性能差距很大。
程序提示计算结果有错误,需要关闭boost改进GPU计算的正确性。
这里没有进一步测试。
测试用到的代码段
C++源代码
/**
 * Multiplies two square integer matrices on the CPU: R = A * B.
 *
 * @param count  Order of the matrices (number of rows and of columns).
 * @param ppA    Left operand: array of `count` row pointers, each row holding
 *               `count` ints.
 * @param ppB    Right operand, same layout as ppA.
 * @param pppR   Out-parameter. Receives a newly allocated result matrix: an
 *               array of `count` row pointers (new[]), each pointing to a row
 *               of `count` ints (new[]). The caller owns the result and must
 *               release every row and then the row-pointer array with delete[].
 */
void matrixMultiplication(int count, int **ppA, int **ppB, int ***pppR)
{
    // Allocate the result as an array of independently allocated rows.
    int **ppMatrix = new int*[count];
    *pppR = ppMatrix;
    for (int row = 0; row < count; row++)
    {
        ppMatrix[row] = new int[count];
    }

    for (int row = 0; row < count; row++)
    {
        for (int col = 0; col < count; col++)
        {
            // Dot product of row `row` of A with column `col` of B.
            // The summation index k must walk A's columns and B's rows;
            // indexing both operands with (row, col) would ignore k entirely.
            int sum = 0;
            for (int k = 0; k < count; k++)
            {
                sum += ppA[row][k] * ppB[k][col];
            }
            ppMatrix[row][col] = sum;
        }
    }
}
C++AMP源代码
float Matrix_AMP() { float score = .0f; int *a = new int[MATRIX_ORDER*MATRIX_ORDER]; int *b = new int[MATRIX_ORDER*MATRIX_ORDER]; int *r = new int[MATRIX_ORDER*MATRIX_ORDER]; for (int row = 0; row < MATRIX_ORDER; row++) { for (int col = 0; col < MATRIX_ORDER; col++) { a[row*MATRIX_ORDER + col] = row*MATRIX_ORDER + col; b[row*MATRIX_ORDER + col] = row*MATRIX_ORDER + col; r[row*MATRIX_ORDER + col] = 0; } } startTiming(); //amp.begin array_view<const int, 2> src(MATRIX_ORDER, MATRIX_ORDER, a); array_view<const int, 2> dst(MATRIX_ORDER, MATRIX_ORDER, b); array_view<int, 2> result(MATRIX_ORDER, MATRIX_ORDER, r); result.discard_data(); parallel_for_each( result.extent, [=](index<2> idx) restrict(amp) { const unsigned int row = idx[0]; const unsigned int col = idx[1]; int r = 0; for (unsigned int i = 0; i < MATRIX_ORDER;i++) { r += src[row][i]*dst[i][col]; } result[idx] = r; } ); result.synchronize(); //amp.end score = (float)stopTiming(); delete a; delete b; delete r; //以毫秒为单位,返回耗时。 return score; }
CUDA源代码
源于Nvidia自带的sample,只列出修改部分
/**
 * Run a simple test of matrix multiplication using CUDA
 *
 * Allocates host matrices A (filled with 1.0f) and B (filled with 0.01f),
 * then times, with CUDA events, the whole sequence: device allocation,
 * host-to-device copies, one warm-up kernel launch, nIter timed launches and
 * the device-to-host copy of C. The elapsed time and derived GFlop/s figure
 * are printed.
 *
 * @param argc        Unused in this function.
 * @param argv        Unused in this function.
 * @param block_size  Thread-block edge length; 16 selects matrixMulCUDA<16>,
 *                    any other value selects matrixMulCUDA<32>.
 * @param dimsA       Dimensions of A (x = width, y = height).
 * @param dimsB       Dimensions of B (x = width, y = height).
 * @return EXIT_SUCCESS. The result verification is commented out, so
 *         `correct` is never cleared; failures of CUDA calls terminate the
 *         process through exit(EXIT_FAILURE) instead.
 */
int matrixMultiply(int argc, char **argv, int block_size, dim3 &dimsA, dim3 &dimsB)
{
    // Allocate host memory for matrices A and B.
    // size_X is the element count, mem_size_X the byte count; new[] takes an
    // element count, so it must be given size_X (not mem_size_X).
    unsigned int size_A = dimsA.x * dimsA.y;
    unsigned int mem_size_A = sizeof(float) * size_A;
    float *h_A = new float[size_A];
    unsigned int size_B = dimsB.x * dimsB.y;
    unsigned int mem_size_B = sizeof(float) * size_B;
    float *h_B = new float[size_B];

    // Initialize host memory
    const float valB = 0.01f;
    constantInit(h_A, size_A, 1.0f);
    constantInit(h_B, size_B, valB);

    // Device pointers
    float *d_A, *d_B, *d_C;

    // Allocate host matrix C.
    // A plain new[] reports failure by throwing, never by returning NULL, so
    // no null check is needed here.
    dim3 dimsC(dimsB.x, dimsA.y, 1);
    unsigned int size_C = dimsC.x * dimsC.y;
    unsigned int mem_size_C = size_C * sizeof(float);
    float *h_C = new float[size_C];

    cudaError_t error;

    // Allocate CUDA events that we'll use for timing
    cudaEvent_t start;
    error = cudaEventCreate(&start);

    if (error != cudaSuccess)
    {
        fprintf(stderr, "Failed to create start event (error code %s)!\n", cudaGetErrorString(error));
        exit(EXIT_FAILURE);
    }

    // Record the start event. Everything from here to the stop event
    // (allocation, copies, warm-up and timed launches) is measured.
    error = cudaEventRecord(start, NULL);

    if (error != cudaSuccess)
    {
        fprintf(stderr, "Failed to record start event (error code %s)!\n", cudaGetErrorString(error));
        exit(EXIT_FAILURE);
    }

    // Allocate device memory
    error = cudaMalloc((void **) &d_A, mem_size_A);

    if (error != cudaSuccess)
    {
        printf("cudaMalloc d_A returned error %s (code %d), line(%d)\n", cudaGetErrorString(error), error, __LINE__);
        exit(EXIT_FAILURE);
    }

    error = cudaMalloc((void **) &d_B, mem_size_B);

    if (error != cudaSuccess)
    {
        printf("cudaMalloc d_B returned error %s (code %d), line(%d)\n", cudaGetErrorString(error), error, __LINE__);
        exit(EXIT_FAILURE);
    }

    error = cudaMalloc((void **) &d_C, mem_size_C);

    if (error != cudaSuccess)
    {
        printf("cudaMalloc d_C returned error %s (code %d), line(%d)\n", cudaGetErrorString(error), error, __LINE__);
        exit(EXIT_FAILURE);
    }

    // copy host memory to device
    error = cudaMemcpy(d_A, h_A, mem_size_A, cudaMemcpyHostToDevice);

    if (error != cudaSuccess)
    {
        printf("cudaMemcpy (d_A,h_A) returned error %s (code %d), line(%d)\n", cudaGetErrorString(error), error, __LINE__);
        exit(EXIT_FAILURE);
    }

    error = cudaMemcpy(d_B, h_B, mem_size_B, cudaMemcpyHostToDevice);

    if (error != cudaSuccess)
    {
        printf("cudaMemcpy (d_B,h_B) returned error %s (code %d), line(%d)\n", cudaGetErrorString(error), error, __LINE__);
        exit(EXIT_FAILURE);
    }

    // Setup execution parameters.
    // NOTE(review): these are truncating integer divisions, so when the matrix
    // dimensions are not multiples of block_size the trailing rows/columns of
    // C are not covered by the grid. Whether a rounded-up grid would be safe
    // depends on bounds checks inside matrixMulCUDA, which is not shown here.
    dim3 threads(block_size, block_size);
    dim3 grid(dimsB.x / threads.x, dimsA.y / threads.y);

    // Create and start timer
    printf("Computing result using CUDA Kernel...\n");

    // Performs warmup operation using matrixMul CUDA kernel
    if (block_size == 16)
    {
        matrixMulCUDA<16><<< grid, threads >>>(d_C, d_A, d_B, dimsA.x, dimsB.x);
    }
    else
    {
        matrixMulCUDA<32><<< grid, threads >>>(d_C, d_A, d_B, dimsA.x, dimsB.x);
    }

    // A kernel launch returns no status itself; launch-configuration errors
    // are only visible through cudaGetLastError().
    error = cudaGetLastError();

    if (error != cudaSuccess)
    {
        fprintf(stderr, "Failed to launch warmup kernel (error code %s)!\n", cudaGetErrorString(error));
        exit(EXIT_FAILURE);
    }

    printf("done\n");

    // Errors raised while the kernel executes surface at this synchronization.
    error = cudaDeviceSynchronize();

    if (error != cudaSuccess)
    {
        fprintf(stderr, "Failed to synchronize after warmup kernel (error code %s)!\n", cudaGetErrorString(error));
        exit(EXIT_FAILURE);
    }

    cudaEvent_t stop;
    error = cudaEventCreate(&stop);

    if (error != cudaSuccess)
    {
        fprintf(stderr, "Failed to create stop event (error code %s)!\n", cudaGetErrorString(error));
        exit(EXIT_FAILURE);
    }

    // Execute the kernel
    int nIter = 1;

    for (int j = 0; j < nIter; j++)
    {
        if (block_size == 16)
        {
            matrixMulCUDA<16><<< grid, threads >>>(d_C, d_A, d_B, dimsA.x, dimsB.x);
        }
        else
        {
            matrixMulCUDA<32><<< grid, threads >>>(d_C, d_A, d_B, dimsA.x, dimsB.x);
        }

        error = cudaGetLastError();

        if (error != cudaSuccess)
        {
            fprintf(stderr, "Failed to launch kernel (error code %s)!\n", cudaGetErrorString(error));
            exit(EXIT_FAILURE);
        }
    }

    // Copy result from device to host
    error = cudaMemcpy(h_C, d_C, mem_size_C, cudaMemcpyDeviceToHost);

    if (error != cudaSuccess)
    {
        printf("cudaMemcpy (h_C,d_C) returned error %s (code %d), line(%d)\n", cudaGetErrorString(error), error, __LINE__);
        exit(EXIT_FAILURE);
    }

    // Record the stop event
    error = cudaEventRecord(stop, NULL);

    if (error != cudaSuccess)
    {
        fprintf(stderr, "Failed to record stop event (error code %s)!\n", cudaGetErrorString(error));
        exit(EXIT_FAILURE);
    }

    // Wait for the stop event to complete
    error = cudaEventSynchronize(stop);

    if (error != cudaSuccess)
    {
        fprintf(stderr, "Failed to synchronize on the stop event (error code %s)!\n", cudaGetErrorString(error));
        exit(EXIT_FAILURE);
    }

    float msecTotal = 0.0f;
    error = cudaEventElapsedTime(&msecTotal, start, stop);

    if (error != cudaSuccess)
    {
        fprintf(stderr, "Failed to get time elapsed between events (error code %s)!\n", cudaGetErrorString(error));
        exit(EXIT_FAILURE);
    }

    // Compute and print the performance
    float msecPerMatrixMul = msecTotal / nIter;
    double flopsPerMatrixMul = 2.0 * (double)dimsA.x * (double)dimsA.y * (double)dimsB.x;
    double gigaFlops = (flopsPerMatrixMul * 1.0e-9f) / (msecPerMatrixMul / 1000.0f);
    printf(
        "Performance= %.2f GFlop/s, Time= %.3f msec, Size= %.0f Ops, WorkgroupSize= %u threads/block\n",
        gigaFlops,
        msecPerMatrixMul,
        flopsPerMatrixMul,
        threads.x * threads.y);

    printf("Checking computed result for correctness: ");
    bool correct = true;

    // The verification below is disabled, so `correct` stays true.
    // test relative error by the formula
    //     |<x, y>_cpu - <x,y>_gpu|/<|x|, |y|>  < eps
    //double eps = 1.e-6 ; // machine zero
    //for (int i = 0; i < (int)(dimsC.x * dimsC.y); i++)
    //{
    //    double abs_err = fabs(h_C[i] - (dimsA.x * valB));
    //    double dot_length = dimsA.x;
    //    double abs_val = fabs(h_C[i]);
    //    double rel_err = abs_err/abs_val/dot_length ;
    //    if (rel_err > eps)
    //    {
    //        printf("Error! Matrix[%05d]=%.8f, ref=%.8f error term is > %E\n", i, h_C[i], dimsA.x*valB, eps);
    //        correct = false;
    //    }
    //}
    //printf("%s\n", correct ? "Result = PASS" : "Result = FAIL");

    // Clean up memory. The host buffers come from new[], so they must be
    // released with delete[].
    delete[] h_A;
    delete[] h_B;
    delete[] h_C;
    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);

    // Release the timing events (best-effort cleanup).
    cudaEventDestroy(start);
    cudaEventDestroy(stop);

    printf("\nNOTE: The CUDA Samples are not meant for performance measurements. Results may vary when GPU Boost is enabled.\n");

    // cudaDeviceReset causes the driver to clean up all state. While
    // not mandatory in normal operation, it is good practice.  It is also
    // needed to ensure correct operation when the application is being
    // profiled. Calling cudaDeviceReset causes all profile data to be
    // flushed before the application exits
    cudaDeviceReset();

    if (correct)
    {
        return EXIT_SUCCESS;
    }
    else
    {
        return EXIT_FAILURE;
    }
}
参考资料
[1]https://developer.nvidia.com/cuda-zone
相关文章推荐
- C/C++内存泄漏及检测
- C语言文件IO操作的一些其它函数
- <重拾C++>const
- C++ 左值、右值、右值引用
- CC++代码优化的27个建议
- AVL平衡二叉树的c++实现
- C++学习之多态篇(异常处理)
- c++ vector
- 单链表的C++实现(采用模板类)
- C++开发工程师面试题
- c++多线程在异常环境下的等待
- C++语法 面试题 带答案
- C++ Primer 学习笔记——顺序容器(1)
- 逐梦C++补遗篇之三:若干重要C++特性
- c++ 容器(list学习总结)
- C++ 使用栈判断回文字符串
- C++学习之多态篇(运行时类型识别--RTTI(typeid和dynamic_cast))
- 两道出现频率超高的C++笔试题
- NSScanner: nil string argument libc++abi.dylib: terminate_handler unexpectedly threw an exception问题
- C语言之基本算法35—数组上三角之积 主对角之积 副对角之积