您的位置:首页 > 其它

一个不错的关于CPU和GPU(CUDA)的性能比较讨论话题

2008-10-28 22:18 639 查看
http://topic.csdn.net/u/20081027/23/67ff3857-3c71-4d5c-acf6-095f3497c7a9.html
这里是今天的一个论坛的一个帖子,大家可以讨论一下:)
1.哪些程序适合用cpu来做,哪些适合用gpu来做
2.如果用gpu来做,需要注意哪些东西
3.如果需要优化,需要哪些思路:)

在lz的代码的基础上做了一些变化,大家可以自己测试一下,就知道哪些工作适合用cpu做,哪些适合用gpu来做。
这里面的LOOP_ADD_TIME 从1->10->100->1000->10000....
大家可以做一个测试,看看最后的效果是怎么样的,可以画一个曲线图出来:)

过一段时间,还可以把这个代码再修改一下,添加更多的内容进去,再看看两者的效果怎么样:)
C/C++ code

#include <stdio.h>
#include <assert.h>
#include <time.h>
#include <cutil.h>

// Simple utility function to check for CUDA runtime errors
//void checkCUDAError(const char* msg)
// Number of additions applied to every array element, both in the GPU kernel
// and in the CPU loops: element i ends up as i + (1 + 2 + ... + LOOP_ADD_TIME).
// Raise it (1, 10, 100, 1000, ...) to vary the amount of work per element
// when comparing the CPU and GPU timings.
#define LOOP_ADD_TIME 100

// Part 2 of 2: implement the kernel
// Adds 1 + 2 + ... + LOOP_ADD_TIME to one element of d_a per thread.
//
// Launch layout: 1-D grid of 1-D blocks. Each thread handles the element at
// index blockDim.x * blockIdx.x + threadIdx.x. There is no bounds check, so
// the launch must not create more threads than d_a has elements.
//
// NOTE(review): the name looks like a leftover from an array-reversal sample;
// nothing in this kernel reverses anything.
__global__ void reverseArrayBlock(int* d_a)
{
    // Flat global index of the element owned by this thread.
    const int element = blockDim.x * blockIdx.x + threadIdx.x;

    // Apply the increments one at a time, directly on the array element.
    int step = 1;
    while (step <= LOOP_ADD_TIME)
    {
        d_a[element] += step;
        ++step;
    }
}

// Runs the add-loop benchmark on the GPU and checks the result on the host.
//
// Fills a host array with h_a[i] = i, then times (with clock()) the sequence
// host-to-device copy, reverseArrayBlock kernel, device-to-host copy, and
// prints the elapsed time in milliseconds. Afterwards every element is
// compared with i + (1 + 2 + ... + LOOP_ADD_TIME).
//
// Returns 0 on success; 1 if host allocation fails, the kernel launch is
// rejected, or any element differs from the expected value.
int gpu_test()
{
    clock_t start, finish;
    double duration;

    // 512 * 21056 = 10,780,672 ints (about 41 MB).
    const int dimA = 512 * 21056;

    // define grid and block size
    const int numThreadsPerBlock = 512;

    // The kernel has no bounds check and gets exactly one thread per element,
    // so the array length has to be a whole number of blocks.
    assert(dimA % numThreadsPerBlock == 0);
    const int numBlocks = dimA / numThreadsPerBlock;
    printf("blocks: %d\n", numBlocks);

    // Size the buffers from the element count that the loops below iterate
    // over, so the two can never disagree.
    const size_t memSize = (size_t)dimA * sizeof(int);

    // allocate host and device memory
    int *h_a = (int *) malloc(memSize);
    if (h_a == NULL)
    {
        fprintf(stderr, "gpu_test: failed to allocate %lu bytes of host memory\n",
                (unsigned long)memSize);
        return 1;
    }

    int *d_a = NULL;
    CUDA_SAFE_CALL(cudaMalloc((void **) &d_a, memSize));

    // Initialize input array on host
    for (int i = 0; i < dimA; ++i)
    {
        h_a[i] = i;
    }

    // NOTE(review): clock() reports processor time, which may differ from
    // wall-clock time while the host waits on the device - confirm on the
    // target platform, or switch to a wall-clock/event timer.
    start = clock();

    // Copy host array to device array
    CUDA_SAFE_CALL(cudaMemcpy(d_a, h_a, memSize, cudaMemcpyHostToDevice));

    // launch kernel
    dim3 dimGrid(numBlocks);
    dim3 dimBlock(numThreadsPerBlock);
    reverseArrayBlock<<<dimGrid, dimBlock>>>(d_a);

    // A kernel launch returns no status of its own; query it explicitly so a
    // rejected launch configuration is not mistaken for a fast run.
    cudaError_t launchStatus = cudaGetLastError();
    if (launchStatus != cudaSuccess)
    {
        fprintf(stderr, "gpu_test: kernel launch failed: %s\n",
                cudaGetErrorString(launchStatus));
        cudaFree(d_a);
        free(h_a);
        return 1;
    }

    // device to host copy (blocking, so the kernel has finished before the
    // timer is stopped)
    CUDA_SAFE_CALL(cudaMemcpy(h_a, d_a, memSize, cudaMemcpyDeviceToHost));

    finish = clock();
    duration = (double)(finish - start) * 1000 / CLOCKS_PER_SEC;
    printf("gpu time is %f ms\n", duration);

    // Every element receives the same total increment, so compute it once
    // instead of re-running the inner loop per element. Unsigned arithmetic
    // keeps wraparound well defined for very large LOOP_ADD_TIME values.
    unsigned int increment = 0;
    for (int k = 1; k <= LOOP_ADD_TIME; k++)
    {
        increment += (unsigned int)k;
    }

    // Verify the GPU result; report the first mismatch and a total instead of
    // printing one line per bad element.
    int mismatches = 0;
    for (int j = 0; j < dimA; ++j)
    {
        const unsigned int expected = (unsigned int)j + increment;
        if ((unsigned int)h_a[j] != expected)
        {
            if (mismatches == 0)
            {
                printf("error! first mismatch at index %d: got %d, expected %d\n",
                       j, h_a[j], (int)expected);
            }
            ++mismatches;
        }
    }
    if (mismatches != 0)
    {
        printf("error! %d of %d elements differ from the CPU reference\n",
               mismatches, dimA);
    }

    // free host memory
    free(h_a);

    // free device memory
    CUDA_SAFE_CALL(cudaFree(d_a));

    return mismatches == 0 ? 0 : 1;
}

////////////////////////////////////////////////////////////////////////////////
// CPU benchmark
////////////////////////////////////////////////////////////////////////////////
// Runs the add-loop benchmark on the CPU.
//
// Fills a host array with h_a[i] = i, then times (with clock()) a loop that
// adds 1, 2, ..., LOOP_ADD_TIME to every element, and prints the elapsed time
// in milliseconds.
//
// Returns 0 on success, 1 if the host array cannot be allocated.
int cpu_test()
{
    clock_t start, finish;
    double duration;

    // 512 * 21056 = 10,780,672 ints (about 41 MB).
    const int dimA = 512 * 21056;

    // allocate host memory, sized from the same element count the loops use
    const size_t memSize = (size_t)dimA * sizeof(int);
    int *h_a = (int *) malloc(memSize);
    if (h_a == NULL)
    {
        fprintf(stderr, "cpu_test: failed to allocate %lu bytes of host memory\n",
                (unsigned long)memSize);
        return 1;
    }

    // Initialize input array on host
    for (int i = 0; i < dimA; ++i)
    {
        h_a[i] = i;
    }
    printf("\n");

    // NOTE(review): h_a is never read after the timed loop, so an optimizing
    // compiler may shrink or remove the work being measured - check the
    // optimization level used when comparing against the GPU timing.
    start = clock();
    for (int j = 0; j < dimA; ++j)
    {
        for (int k = 1; k <= LOOP_ADD_TIME; k++)
        {
            h_a[j] += k;
        }
    }
    finish = clock();

    duration = (double)(finish - start) * 1000 / CLOCKS_PER_SEC;
    printf("cpu time is %f ms\n", duration);

    // free host memory
    free(h_a);

    return 0;
}
////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
// Program entry point: runs the GPU benchmark, then the CPU benchmark.
//
// CUT_DEVICE_INIT and CUT_EXIT come from cutil.h; presumably they set up the
// CUDA device and handle the end-of-program exit - confirm against the cutil
// version in use.
int main(int argc, char** argv)
{
    CUT_DEVICE_INIT(argc, argv);

    // The benchmarks' return values are not used here.
    (void) gpu_test();
    (void) cpu_test();

    CUT_EXIT(argc, argv);
}
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息