CUDA:对齐内存访问、展开循环提高运算性能
2017-11-29 19:46
295 查看
#include "../common/common.h" #include <cuda_runtime.h> #include <stdio.h> /* * This example demonstrates the impact of misaligned reads on performance by * forcing misaligned reads to occur on a float*. Kernels that reduce the * performance impact of misaligned reads via unrolling are also included below. */ void checkResult(float *hostRef, float *gpuRef, const int N) { double epsilon = 1.0E-8; bool match = 1; for (int i = 0; i < N; i++) { if (abs(hostRef[i] - gpuRef[i]) > epsilon) { match = 0; printf("different on %dth element: host %f gpu %f\n", i, hostRef[i], gpuRef[i]); break; } } if (!match) printf("Arrays do not match.\n\n"); } void initialData(float *ip, int size) { for (int i = 0; i < size; i++) { ip[i] = (float)( rand() & 0xFF ) / 100.0f; } return; } void sumArraysOnHost(float *A, float *B, float *C, const int n, int offset) { for (int idx = offset, k = 0; idx < n; idx++, k++) { C[k] = A[idx] + B[idx]; } } __global__ void warmup(float *A, float *B, float *C, const int n, int offset) { unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; unsigned int k = i + offset; if (k < n) C[i] = A[k] + B[k]; } __global__ void readOffset(float *A, float *B, float *C, const int n, int offset) { unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; u 4000 nsigned int k = i + offset; if (k < n) C[i] = A[k] + B[k]; } __global__ void readOffsetUnroll2(float *A, float *B, float *C, const int n, int offset) { unsigned int i = blockIdx.x * blockDim.x * 2 + threadIdx.x; unsigned int k = i + offset; if (k < n) C[i] = A[k] + B[k]; if (k + blockDim.x < n) { C[i + blockDim.x] = A[k + blockDim.x] + B[k + blockDim.x]; } } __global__ void readOffsetUnroll4(float *A, float *B, float *C, const int n, int offset) { unsigned int i = blockIdx.x * blockDim.x * 4 + threadIdx.x; unsigned int k = i + offset; if (k < n) C[i] = A[k] + B[k]; if (k + blockDim.x < n) { C[i + blockDim.x] = A[k + blockDim.x] + B[k + blockDim.x]; } if (k + 2 * blockDim.x < n) { C[i + 2 * blockDim.x] = A[k + 2 * blockDim.x] + B[k + 2 * blockDim.x]; } if (k + 3 * blockDim.x < n) { C[i + 3 * blockDim.x] = A[k + 3 * blockDim.x] + B[k + 3 * blockDim.x]; } } int main(int argc, char **argv) { // set up device int dev = 0; cudaDeviceProp deviceProp; CHECK(cudaGetDeviceProperties(&deviceProp, dev)); printf("%s starting reduction at ", argv[0]); printf("device %d: %s ", dev, deviceProp.name); CHECK(cudaSetDevice(dev)); // set up array size int power = 20; int blocksize = 512; int offset = 0; if (argc > 1) offset = atoi(argv[1]); if (argc > 2) blocksize = atoi(argv[2]); if (argc > 3) power = atoi(argv[3]); int nElem = 1 << power; // total number of elements to reduce printf(" with array size %d\n", nElem); size_t nBytes = nElem * sizeof(float); // execution configuration dim3 block (blocksize, 1); dim3 grid ((nElem + block.x - 1) / block.x, 1); // allocate host memory float *h_A = (float *)malloc(nBytes); float *h_B = (float *)malloc(nBytes); float *hostRef = (float *)malloc(nBytes); float *gpuRef = (float *)malloc(nBytes); // initialize host array initialData(h_A, nElem); memcpy(h_B, h_A, nBytes); // summary at host side sumArraysOnHost(h_A, h_B, hostRef, nElem, offset); // allocate device memory float *d_A, *d_B, *d_C; CHECK(cudaMalloc((float**)&d_A, nBytes)); CHECK(cudaMalloc((float**)&d_B, nBytes)); CHECK(cudaMalloc((float**)&d_C, nBytes)); // copy data from host to device CHECK(cudaMemcpy(d_A, h_A, nBytes, cudaMemcpyHostToDevice)); CHECK(cudaMemcpy(d_B, h_A, nBytes, cudaMemcpyHostToDevice)); // kernel 1: double iStart = seconds(); warmup<<<grid, block>>>(d_A, d_B, d_C, nElem, offset); CHECK(cudaDeviceSynchronize()); double iElaps = seconds() - iStart; printf("warmup <<< %4d, %4d >>> offset %4d elapsed %f sec\n", grid.x, block.x, offset, iElaps); CHECK(cudaGetLastError()); CHECK(cudaMemset(d_C, 0x00, nBytes)); // kernel 1 iStart = seconds(); readOffset<<<grid, block>>>(d_A, d_B, d_C, nElem, offset); CHECK(cudaDeviceSynchronize()); iElaps = seconds() - iStart; printf("readOffset <<< %4d, %4d >>> offset %4d elapsed %f sec\n", grid.x, block.x, offset, iElaps); CHECK(cudaGetLastError()); // copy kernel result back to host side and check device results CHECK(cudaMemcpy(gpuRef, d_C, nBytes, cudaMemcpyDeviceToHost)); checkResult(hostRef, gpuRef, nElem-offset); CHECK(cudaMemset(d_C, 0x00, nBytes)); // kernel 2 iStart = seconds(); readOffsetUnroll2<<<grid.x/2, block>>>(d_A, d_B, d_C, nElem, offset); CHECK(cudaDeviceSynchronize()); iElaps = seconds() - iStart; printf("unroll2 <<< %4d, %4d >>> offset %4d elapsed %f sec\n", grid.x / 2, block.x, offset, iElaps); CHECK(cudaGetLastError()); // copy kernel result back to host side and check device results CHECK(cudaMemcpy(gpuRef, d_C, nBytes, cudaMemcpyDeviceToHost)); checkResult(hostRef, gpuRef, nElem - offset); CHECK(cudaMemset(d_C, 0x00, nBytes)); // kernel 3 iStart = seconds(); readOffsetUnroll4<<<grid.x / 4, block>>>(d_A, d_B, d_C, nElem, offset); CHECK(cudaDeviceSynchronize()); iElaps = seconds() - iStart; printf("unroll4 <<< %4d, %4d >>> offset %4d elapsed %f sec\n", grid.x / 4, block.x, offset, iElaps); CHECK(cudaGetLastError()); // copy kernel result back to host side and check device results CHECK(cudaMemcpy(gpuRef, d_C, nBytes, cudaMemcpyDeviceToHost)); checkResult(hostRef, gpuRef, nElem - offset); CHECK(cudaMemset(d_C, 0x00, nBytes)); // free host and device memory CHECK(cudaFree(d_A)); CHECK(cudaFree(d_B)); CHECK(cudaFree(d_C)); free(h_A); free(h_B); // reset device CHECK(cudaDeviceReset()); return EXIT_SUCCESS; }
运行结果性能对比:
-bash-4.1$ ./readSegmentUnroll ./readSegmentUnroll starting reduction at device 0: Tesla K40c with array size 1048576 warmup <<< 2048, 512 >>> offset 0 elapsed 0.000179 sec readOffset <<< 2048, 512 >>> offset 0 elapsed 0.000101 sec unroll2 <<< 1024, 512 >>> offset 0 elapsed 0.000099 sec unroll4 <<< 512, 512 >>> offset 0 elapsed 0.000100 sec -bash-4.1$ ./readSegmentUnroll 11 ./readSegmentUnroll starting reduction at device 0: Tesla K40c with array size 1048576 warmup <<< 2048, 512 >>> offset 11 elapsed 0.000187 sec readOffset <<< 2048, 512 >>> offset 11 elapsed 0.000106 sec unroll2 <<< 1024, 512 >>> offset 11 elapsed 0.000105 sec unroll4 <<< 512, 512 >>> offset 11 elapsed 0.000110 sec -bash-4.1$ ./readSegmentUnroll 128 ./readSegmentUnroll starting reduction at device 0: Tesla K40c with array size 1048576 warmup <<< 2048, 512 >>> offset 128 elapsed 0.000209 sec readOffset <<< 2048, 512 >>> offset 128 elapsed 0.000100 sec unroll2 <<< 1024, 512 >>> offset 128 elapsed 0.000101 sec unroll4 <<< 512, 512 >>> offset 128 elapsed 0.000098 sec -bash-4.1$
相关文章推荐
- (CUDA 编程7).CUDA内存访问(一)提高篇------按部就班
- 【并行计算-CUDA开发】有关CUDA当中global memory如何实现合并访问跟内存对齐相关的问题
- [菜鸟每天来段CUDA_C] 利用页锁定内存提高运算效率
- 7. CUDA内存访问(一)提高篇------按部就班 ------GPU的革命
- 顺序访问内存,提高程序性能
- 利用Cache缓存数据DataTable数据提高大数据量访问性能
- 转载:走向DBA[MSSQL篇] - 从SQL语句的角度提高数据库的访问性能
- 使用内存映射文件来提高你程序的性能
- 利用Cache缓存数据DataTable数据提高大数据量访问性能
- 走向DBA[MSSQL篇] - 从SQL语句的角度提高数据库的访问性能
- 从CPU角度看内存访问对齐
- 走向DBA[MSSQL篇] - 从SQL语句的角度提高数据库的访问性能
- 使用内存映射文件来提高你程序的性能
- 程序性能优化探讨(1)——周期计数器与循环展开
- 【SqlServer2005+ 查询优化】MSSQL优化SQL语句 提高数据库的访问性能
- 如何提高网站首页的访问性能
- 优化网站性能 提高网站速度访问速度的14条实践
- 优化网站性能 提高网站速度访问速度的14条实践
- 优化网站性能 提高网站速度访问速度的14条实践
- cxf怎样提高webservice性能,及访问速度调优