下面代码中的 LOOP_ADD_TIME 依次取 1 -> 10 -> 100 -> 1000 -> 10000 ……

C/C++ code

#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

#include <cutil.h>

// Number of successive additions applied to every array element: both the
// kernel and the CPU loops in this file add 1, 2, ..., LOOP_ADD_TIME to each
// element, so this constant scales the work done per element.
#define LOOP_ADD_TIME 100

// Part 2 of 2: implement the kernel
// Adds 1 + 2 + ... + LOOP_ADD_TIME to one element of d_a per thread.
// Despite its name, this kernel does not reverse anything.
//
// Launch layout: 1D grid of 1D blocks; the thread with flat index
// blockIdx.x * blockDim.x + threadIdx.x updates d_a[that index].
// Precondition: there is no bounds check, so d_a must hold at least
// gridDim.x * blockDim.x ints.
__global__ void reverseArrayBlock(int *d_a)
{
    const int idx = threadIdx.x + blockIdx.x * blockDim.x;

    // Accumulate in a local and store once; the additions happen in the
    // same order (1, 2, ..., LOOP_ADD_TIME) as an in-place update would.
    int value = d_a[idx];
    for (int step = 1; step <= LOOP_ADD_TIME; ++step) {
        value += step;
    }
    d_a[idx] = value;
}

// Runs the add-loop on the GPU, reports the elapsed time, and verifies the
// result against the same computation done on the host.
//
// The timed region covers the host-to-device copy, the kernel launch and the
// device-to-host copy (which blocks until the kernel has finished).
//
// Returns 0 after the run completes (mismatches are reported on stdout),
// or 1 if a host allocation fails.
int gpu_test()
{
    clock_t start, finish;
    double duration;

    // 512 * 21056 = 10,780,672 ints (about 41 MB per buffer).
    const int dimA = 512 * 21056;

    // define grid and block size
    const int numThreadsPerBlock = 512;

    // The kernel has no bounds check and the buffers are sized as
    // numBlocks * numThreadsPerBlock, so the element count must divide evenly.
    assert(dimA % numThreadsPerBlock == 0);
    const int numBlocks = dimA / numThreadsPerBlock;
    printf("blocks: %d\n", numBlocks);

    // allocate host memory: h_a receives the GPU result, h_a2 the CPU reference
    const size_t memSize = (size_t)numBlocks * numThreadsPerBlock * sizeof(int);
    int *h_a = (int *)malloc(memSize);
    int *h_a2 = (int *)malloc(memSize);
    if (h_a == NULL || h_a2 == NULL) {
        fprintf(stderr, "gpu_test: host allocation of %lu bytes failed\n",
                (unsigned long)memSize);
        free(h_a);
        free(h_a2);
        return 1;
    }

    // allocate device memory
    int *d_a = NULL;
    CUDA_SAFE_CALL(cudaMalloc((void **)&d_a, memSize));

    // Initialize input array on host
    for (int i = 0; i < dimA; ++i) {
        h_a[i] = i;
    }

    start = clock();

    // Copy host array to device array
    CUDA_SAFE_CALL(cudaMemcpy(d_a, h_a, memSize, cudaMemcpyHostToDevice));

    // launch kernel
    dim3 dimGrid(numBlocks);
    dim3 dimBlock(numThreadsPerBlock);
    reverseArrayBlock<<<dimGrid, dimBlock>>>(d_a);
    // A kernel launch returns no status; pick up launch-configuration errors here.
    CUDA_SAFE_CALL(cudaGetLastError());

    // device to host copy
    CUDA_SAFE_CALL(cudaMemcpy(h_a, d_a, memSize, cudaMemcpyDeviceToHost));

    finish = clock();
    duration = (double)(finish - start) * 1000 / CLOCKS_PER_SEC;
    printf("gpu time is %f ms\n", duration);

    // Build the CPU reference with the same sequence of additions.
    for (int i = 0; i < dimA; ++i) {
        h_a2[i] = i;
    }
    for (int j = 0; j < dimA; ++j) {
        for (int k = 1; k <= LOOP_ADD_TIME; k++) {
            h_a2[j] += k;
        }
    }

    // Compare GPU output with the reference, element by element.
    for (int j = 0; j < dimA; ++j) {
        if (h_a[j] != h_a2[j]) {
            printf("error!\n");
        }
    }

    // free device memory
    CUDA_SAFE_CALL(cudaFree(d_a));

    // free host memory
    free(h_a2);
    free(h_a);

    return 0;
}


// Program main
// Runs the add-loop on the host only and reports the elapsed time.
//
// Every element of a 512 * 21056-int array is initialised to its index and
// then has 1, 2, ..., LOOP_ADD_TIME added to it; only the add loops are timed.
//
// Returns 0 on success, or 1 if the host allocation fails.
int cpu_test()
{
    clock_t start, finish;
    double duration;

    // 512 * 21056 = 10,780,672 ints (about 41 MB).
    const int dimA = 512 * 21056;

    // allocate host memory
    const size_t memSize = (size_t)dimA * sizeof(int);
    int *h_a = (int *)malloc(memSize);
    if (h_a == NULL) {
        fprintf(stderr, "cpu_test: host allocation of %lu bytes failed\n",
                (unsigned long)memSize);
        return 1;
    }

    // Initialize input array on host
    for (int i = 0; i < dimA; ++i) {
        h_a[i] = i;
    }

    // NOTE(review): the results in h_a are never read afterwards, so an
    // optimising compiler may remove part or all of this loop - confirm the
    // build flags before trusting the reported time.
    start = clock();
    for (int j = 0; j < dimA; ++j) {
        for (int k = 1; k <= LOOP_ADD_TIME; k++) {
            h_a[j] += k;
        }
    }
    finish = clock();

    duration = (double)(finish - start) * 1000 / CLOCKS_PER_SEC;
    printf("cpu time is %f ms\n", duration);

    // free host memory
    free(h_a);

    return 0;
}

// Program main
// Program entry point: initialises the CUDA device, runs the GPU benchmark
// followed by the CPU benchmark, then exits through the cutil helper.
int main(int argc, char **argv)
{
    // CUT_DEVICE_INIT / CUT_EXIT are macros supplied by cutil.h.
    CUT_DEVICE_INIT(argc, argv);

    // Run both timings so their printed results can be compared; without
    // these calls the program would measure nothing.
    gpu_test();
    cpu_test();

    CUT_EXIT(argc, argv);
}
