您的位置:首页 > 编程语言 > C语言/C++

C++ AMP同CUDA之间的性能比较

2016-02-03 16:18 357 查看
Date:2016-02-03

Author:kagula

Environment:

[1]Win10

[2]VS2013 Update5

[3]Cuda 7.5

从Nvidia官网下载的cuda_7.5.18_windows.exe,cudatoolkit_3.1_win_64.exe
[4]Core i7-4790k + GTX960

测试方式

两个500阶矩阵相乘。

Corei7-4790K+GTX960 测试结果(GPU Boost模式打开情况下)

CPU:          217.866ms
C++ AMP:     35.1236ms
CUDA:              1.969ms

Corei5-2500K+Quadro K600测试结果

CPU:             252.744ms
C++ AMP:     73.1175ms
CUDA:           19.706ms

       CUDA由于对threads做了分块优化,我这里写的cpu和amp代码没有优化,所以性能差距很大。

程序提示计算结果有错误,可能需要关闭GPU Boost以改进GPU计算的正确性。

这里没有进一步测试。

测试用到的代码段

C++源代码

/**
 * Multiplies two square integer matrices on the CPU (naive triple loop).
 *
 * @param count  Order of the matrices (number of rows and of columns).
 * @param ppA    Left operand, indexed as ppA[row][col].
 * @param ppB    Right operand, indexed as ppB[row][col].
 * @param pppR   Out-parameter. Receives a newly allocated array of `count`
 *               row pointers, each pointing at a newly allocated int[count].
 *               The caller owns the result and must delete[] every row and
 *               then the row-pointer array.
 */
void matrixMultiplication(int count, int **ppA, int **ppB, int ***pppR){
    // Allocate the result as an array of independently allocated rows.
    int **ppMatrix = new int*[count];
    *pppR = ppMatrix;

    for (int row = 0; row < count; row++){
        ppMatrix[row] = new int[count];
    }

    // result[row][col] = sum over k of A[row][k] * B[k][col]
    for (int row = 0; row < count; row++)
    {
        for (int col = 0; col < count; col++)
        {
            // Accumulate in a local so the result cell is written once.
            int sum = 0;
            for (int k = 0; k < count; k++)
            {
                // The summation index k walks A's row and B's column.
                sum += ppA[row][k] * ppB[k][col];
            }
            ppMatrix[row][col] = sum;
        }
    }
}


C++AMP源代码

float Matrix_AMP()
{
float score = .0f;

int *a = new int[MATRIX_ORDER*MATRIX_ORDER];
int *b = new int[MATRIX_ORDER*MATRIX_ORDER];
int *r = new int[MATRIX_ORDER*MATRIX_ORDER];

for (int row = 0; row < MATRIX_ORDER; row++)
{
for (int col = 0; col < MATRIX_ORDER; col++)
{
a[row*MATRIX_ORDER + col] = row*MATRIX_ORDER + col;
b[row*MATRIX_ORDER + col] = row*MATRIX_ORDER + col;
r[row*MATRIX_ORDER + col] = 0;
}
}

startTiming();

//amp.begin
array_view<const int, 2> src(MATRIX_ORDER, MATRIX_ORDER, a);
array_view<const int, 2> dst(MATRIX_ORDER, MATRIX_ORDER, b);
array_view<int, 2> result(MATRIX_ORDER, MATRIX_ORDER, r);
result.discard_data();

parallel_for_each(
result.extent,
[=](index<2> idx) restrict(amp)
{
const unsigned int row = idx[0];
const unsigned int col = idx[1];
int r = 0;

for (unsigned int i = 0; i < MATRIX_ORDER;i++)
{
r += src[row][i]*dst[i][col];
}
result[idx] = r;
}
);

result.synchronize();
//amp.end

score = (float)stopTiming();

delete a;
delete b;
delete r;

//以毫秒为单位,返回耗时。
return score;
}


CUDA源代码

源于Nvidia自带的sample,只列出修改部分

/**
 * Launches matrixMulCUDA once, selecting the <16> instantiation when
 * block_size is 16 and the <32> instantiation otherwise, then checks for a
 * launch error. Prints a message and exits the process if the launch failed.
 */
static void launchMatrixMul(int block_size, const dim3 &grid, const dim3 &threads,
                            float *d_C, float *d_A, float *d_B,
                            unsigned int wA, unsigned int wB)
{
    if (block_size == 16)
    {
        matrixMulCUDA<16><<< grid, threads >>>(d_C, d_A, d_B, wA, wB);
    }
    else
    {
        matrixMulCUDA<32><<< grid, threads >>>(d_C, d_A, d_B, wA, wB);
    }

    // A kernel launch returns nothing; configuration errors are only visible
    // through cudaGetLastError().
    cudaError_t error = cudaGetLastError();

    if (error != cudaSuccess)
    {
        fprintf(stderr, "Failed to launch matrixMulCUDA kernel (error code %s)!\n", cudaGetErrorString(error));
        exit(EXIT_FAILURE);
    }
}

/**
 * Run a simple test of matrix multiplication using CUDA.
 *
 * Fills A with 1.0f and B with 0.01f on the host, multiplies them on the
 * device with matrixMulCUDA and prints the elapsed time and derived GFlop/s.
 *
 * Timing: the start event is recorded before the device allocations and the
 * stop event after the device-to-host copy, so the reported time covers
 * cudaMalloc, both host-to-device copies, the warm-up launch, the timed
 * launch(es) and the result copy.
 *
 * The result verification is disabled (commented out), so `correct` is never
 * cleared and the function returns EXIT_SUCCESS whenever it returns at all.
 * Any CUDA API failure that is checked terminates the process via exit().
 *
 * @param argc       Unused.
 * @param argv       Unused.
 * @param block_size Threads per block edge; 16 selects matrixMulCUDA<16>,
 *                   any other value selects matrixMulCUDA<32>.
 * @param dimsA      Dimensions of A (x = width, y = height).
 * @param dimsB      Dimensions of B (x = width, y = height).
 * @return EXIT_SUCCESS (see note on the disabled verification above).
 */
int matrixMultiply(int argc, char **argv, int block_size, dim3 &dimsA, dim3 &dimsB)
{
    // Allocate host memory for matrices A and B.
    // Host buffers are sized in elements; mem_size_* is the byte count used
    // for the device allocations and copies.
    unsigned int size_A = dimsA.x * dimsA.y;
    unsigned int mem_size_A = sizeof(float) * size_A;
    float *h_A = new float[size_A];
    unsigned int size_B = dimsB.x * dimsB.y;
    unsigned int mem_size_B = sizeof(float) * size_B;
    float *h_B = new float[size_B];

    // Initialize host memory
    const float valB = 0.01f;
    constantInit(h_A, size_A, 1.0f);
    constantInit(h_B, size_B, valB);

    // Device pointers, allocated below inside the timed region
    float *d_A, *d_B, *d_C;

    // Allocate host matrix C.
    // No NULL check: a plain new-expression throws on failure instead of
    // returning a null pointer.
    dim3 dimsC(dimsB.x, dimsA.y, 1);
    unsigned int size_C = dimsC.x * dimsC.y;
    unsigned int mem_size_C = size_C * sizeof(float);
    float *h_C = new float[size_C];

    cudaError_t error;

    // Allocate CUDA events that we'll use for timing
    cudaEvent_t start;
    error = cudaEventCreate(&start);

    if (error != cudaSuccess)
    {
        fprintf(stderr, "Failed to create start event (error code %s)!\n", cudaGetErrorString(error));
        exit(EXIT_FAILURE);
    }

    // Record the start event (before allocation and transfers, so that they
    // are part of the measured time)
    error = cudaEventRecord(start, NULL);

    if (error != cudaSuccess)
    {
        fprintf(stderr, "Failed to record start event (error code %s)!\n", cudaGetErrorString(error));
        exit(EXIT_FAILURE);
    }

    error = cudaMalloc((void **) &d_A, mem_size_A);

    if (error != cudaSuccess)
    {
        printf("cudaMalloc d_A returned error %s (code %d), line(%d)\n", cudaGetErrorString(error), error, __LINE__);
        exit(EXIT_FAILURE);
    }

    error = cudaMalloc((void **) &d_B, mem_size_B);

    if (error != cudaSuccess)
    {
        printf("cudaMalloc d_B returned error %s (code %d), line(%d)\n", cudaGetErrorString(error), error, __LINE__);
        exit(EXIT_FAILURE);
    }

    error = cudaMalloc((void **) &d_C, mem_size_C);

    if (error != cudaSuccess)
    {
        printf("cudaMalloc d_C returned error %s (code %d), line(%d)\n", cudaGetErrorString(error), error, __LINE__);
        exit(EXIT_FAILURE);
    }

    // copy host memory to device
    error = cudaMemcpy(d_A, h_A, mem_size_A, cudaMemcpyHostToDevice);

    if (error != cudaSuccess)
    {
        printf("cudaMemcpy (d_A,h_A) returned error %s (code %d), line(%d)\n", cudaGetErrorString(error), error, __LINE__);
        exit(EXIT_FAILURE);
    }

    error = cudaMemcpy(d_B, h_B, mem_size_B, cudaMemcpyHostToDevice);

    if (error != cudaSuccess)
    {
        printf("cudaMemcpy (d_B,h_B) returned error %s (code %d), line(%d)\n", cudaGetErrorString(error), error, __LINE__);
        exit(EXIT_FAILURE);
    }

    // Setup execution parameters.
    // NOTE(review): the integer divisions truncate, so dimensions that are not
    // multiples of block_size leave a partial tile uncovered. matrixMulCUDA is
    // not visible here - confirm whether it tolerates such sizes.
    dim3 threads(block_size, block_size);
    dim3 grid(dimsB.x / threads.x, dimsA.y / threads.y);

    printf("Computing result using CUDA Kernel...\n");

    // Performs warmup operation using matrixMul CUDA kernel
    launchMatrixMul(block_size, grid, threads, d_C, d_A, d_B, dimsA.x, dimsB.x);

    printf("done\n");

    // Wait for the warm-up launch; errors raised while the kernel ran are
    // reported by this call.
    error = cudaDeviceSynchronize();

    if (error != cudaSuccess)
    {
        fprintf(stderr, "Failed to synchronize after warmup kernel (error code %s)!\n", cudaGetErrorString(error));
        exit(EXIT_FAILURE);
    }

    cudaEvent_t stop;
    error = cudaEventCreate(&stop);

    if (error != cudaSuccess)
    {
        fprintf(stderr, "Failed to create stop event (error code %s)!\n", cudaGetErrorString(error));
        exit(EXIT_FAILURE);
    }

    // Execute the kernel
    int nIter = 1;

    for (int j = 0; j < nIter; j++)
    {
        launchMatrixMul(block_size, grid, threads, d_C, d_A, d_B, dimsA.x, dimsB.x);
    }

    // Copy result from device to host
    error = cudaMemcpy(h_C, d_C, mem_size_C, cudaMemcpyDeviceToHost);

    if (error != cudaSuccess)
    {
        printf("cudaMemcpy (h_C,d_C) returned error %s (code %d), line(%d)\n", cudaGetErrorString(error), error, __LINE__);
        exit(EXIT_FAILURE);
    }

    // Record the stop event
    error = cudaEventRecord(stop, NULL);

    if (error != cudaSuccess)
    {
        fprintf(stderr, "Failed to record stop event (error code %s)!\n", cudaGetErrorString(error));
        exit(EXIT_FAILURE);
    }

    // Wait for the stop event to complete
    error = cudaEventSynchronize(stop);

    if (error != cudaSuccess)
    {
        fprintf(stderr, "Failed to synchronize on the stop event (error code %s)!\n", cudaGetErrorString(error));
        exit(EXIT_FAILURE);
    }

    float msecTotal = 0.0f;
    error = cudaEventElapsedTime(&msecTotal, start, stop);

    if (error != cudaSuccess)
    {
        fprintf(stderr, "Failed to get time elapsed between events (error code %s)!\n", cudaGetErrorString(error));
        exit(EXIT_FAILURE);
    }

    // The events are no longer needed once the elapsed time has been read.
    cudaEventDestroy(start);
    cudaEventDestroy(stop);

    // Compute and print the performance
    float msecPerMatrixMul = msecTotal / nIter;
    double flopsPerMatrixMul = 2.0 * (double)dimsA.x * (double)dimsA.y * (double)dimsB.x;
    double gigaFlops = (flopsPerMatrixMul * 1.0e-9f) / (msecPerMatrixMul / 1000.0f);
    printf(
        "Performance= %.2f GFlop/s, Time= %.3f msec, Size= %.0f Ops, WorkgroupSize= %u threads/block\n",
        gigaFlops,
        msecPerMatrixMul,
        flopsPerMatrixMul,
        threads.x * threads.y);

    printf("Checking computed result for correctness: ");
    bool correct = true;

    // Verification is disabled below; `correct` therefore stays true.
    // test relative error by the formula
    //     |<x, y>_cpu - <x,y>_gpu|/<|x|, |y|>  < eps
    //double eps = 1.e-6 ; // machine zero

    //for (int i = 0; i < (int)(dimsC.x * dimsC.y); i++)
    //{
    //    double abs_err = fabs(h_C[i] - (dimsA.x * valB));
    //    double dot_length = dimsA.x;
    //    double abs_val = fabs(h_C[i]);
    //    double rel_err = abs_err/abs_val/dot_length ;

    //    if (rel_err > eps)
    //    {
    //        printf("Error! Matrix[%05d]=%.8f, ref=%.8f error term is > %E\n", i, h_C[i], dimsA.x*valB, eps);
    //        correct = false;
    //    }
    //}

    //printf("%s\n", correct ? "Result = PASS" : "Result = FAIL");

    // Clean up memory (new[] must be paired with delete[])
    delete[] h_A;
    delete[] h_B;
    delete[] h_C;
    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);

    printf("\nNOTE: The CUDA Samples are not meant for performance measurements. Results may vary when GPU Boost is enabled.\n");

    // cudaDeviceReset causes the driver to clean up all state. While
    // not mandatory in normal operation, it is good practice.  It is also
    // needed to ensure correct operation when the application is being
    // profiled. Calling cudaDeviceReset causes all profile data to be
    // flushed before the application exits
    cudaDeviceReset();

    return correct ? EXIT_SUCCESS : EXIT_FAILURE;
}

参考资料

[1]https://developer.nvidia.com/cuda-zone
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: