// Source page breadcrumb: Home > Other
//
// CUDA study: implementing Matrix Parallel Add with shared memory
//
// Posted 2014-05-13 23:56, 197 views
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include <iostream>

#include "cuda_runtime.h"
#include "device_launch_parameters.h"

using namespace std;

// Number of int elements in each buffer allocated by main (2048 * 2048).
#define N (2048*2048)
// 256 threads (16*16); not referenced by any other code visible in this file.
#define THREADS_PER_BLOCK 256  //16*16
// Side length of the square __shared__ tiles declared in the Add kernel.
#define TILE_WIDTH 16
// Row stride, in elements, that the Add kernel uses to flatten a 2D position.
#define width 2048

/**
 * Element-wise sum of two row-major int matrices with `width` columns and
 * `width` rows: dev_c[i] = dev_a[i] + dev_b[i].
 *
 * Launch requirements:
 *   - 2D grid of 2D blocks; blockDim.x and blockDim.y must not exceed
 *     TILE_WIDTH, because the static shared tiles are indexed by threadIdx.
 *   - The grid may overshoot the matrix; threads that fall outside the
 *     width x width range do nothing.
 *   - Uses two static TILE_WIDTH x TILE_WIDTH int tiles of shared memory;
 *     no dynamic shared memory is required.
 *
 * @param dev_a  device pointer to the first input matrix
 * @param dev_b  device pointer to the second input matrix
 * @param dev_c  device pointer to the output matrix
 */
__global__ void Add(int *dev_a, int *dev_b,int *dev_c)
{
    __shared__ int A[TILE_WIDTH][TILE_WIDTH];
    __shared__ int B[TILE_WIDTH][TILE_WIDTH];

    const int tx = threadIdx.x;
    const int ty = threadIdx.y;

    // Global 2D position of this thread inside the matrix.
    const int col = blockIdx.x * blockDim.x + tx;
    const int row = blockIdx.y * blockDim.y + ty;

    // Guard against grids that do not divide the matrix evenly.
    const bool inside = (row < width) && (col < width);

    // Flat row-major offset; only formed for in-range threads so the
    // multiplication cannot overflow for threads far outside the matrix.
    const int i = inside ? (row * width + col) : 0;

    // Stage both operands in the block's shared tiles.
    if (inside) {
        A[ty][tx] = dev_a[i];
        B[ty][tx] = dev_b[i];
    }

    // The barrier is kept outside the guarded branches so that every thread
    // of the block reaches it, including threads that are out of range.
    __syncthreads();

    if (inside) {
        dev_c[i] = A[ty][tx] + B[ty][tx];
    }
}

// Prints the CUDA error string for a failed runtime call and terminates the
// program; does nothing when status is cudaSuccess.
static void checkCuda(cudaError_t status, const char *what)
{
    if (status != cudaSuccess) {
        fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

/**
 * Fills two N-element host arrays with random ints, adds them on the GPU
 * with the Add kernel, times the kernel with CUDA events, and verifies the
 * result on the host.
 *
 * @return 0 when every element matches the host reference, 1 otherwise.
 */
int main( void ) {

    const size_t size = (size_t)N * sizeof( int ); // bytes for N integers

    // host copies of a, b, c
    int *a = (int*)malloc( size );
    int *b = (int*)malloc( size );
    int *c = (int*)malloc( size );
    if (a == NULL || b == NULL || c == NULL) {
        fprintf(stderr, "host allocation of %lu bytes failed\n", (unsigned long)size);
        free( a );
        free( b );
        free( c );
        return 1;
    }

    // Halve each random value so a[i] + b[i] can never exceed INT_MAX
    // (RAND_MAX may equal INT_MAX, and signed overflow is undefined).
    for (int i = 0; i < N; ++i)
    {
        a[i] = rand() / 2;
        b[i] = rand() / 2;
    }

    // device copies of a, b, c
    int *dev_a = NULL, *dev_b = NULL, *dev_c = NULL;
    checkCuda(cudaMalloc( (void**)&dev_a, size ), "cudaMalloc(dev_a)");
    checkCuda(cudaMalloc( (void**)&dev_b, size ), "cudaMalloc(dev_b)");
    checkCuda(cudaMalloc( (void**)&dev_c, size ), "cudaMalloc(dev_c)");

    // copy inputs to device
    checkCuda(cudaMemcpy( dev_a, a, size, cudaMemcpyHostToDevice), "copy a to device");
    checkCuda(cudaMemcpy( dev_b, b, size, cudaMemcpyHostToDevice), "copy b to device");

    // One TILE_WIDTH x TILE_WIDTH block per tile; ceil-div so the grid
    // always covers the full width x width matrix.
    const unsigned int tiles = (width + TILE_WIDTH - 1) / TILE_WIDTH;
    const dim3 dimBlock(TILE_WIDTH, TILE_WIDTH, 1);
    const dim3 dimGrid(tiles, tiles, 1);

    cudaEvent_t timeStartEvent, timeEndEvent;
    checkCuda(cudaEventCreate( &timeStartEvent ), "cudaEventCreate(start)");
    checkCuda(cudaEventCreate( &timeEndEvent ), "cudaEventCreate(end)");

    // The events bracket only the kernel, so the reported time excludes the
    // result copy, host-side verification and cleanup.
    checkCuda(cudaEventRecord( timeStartEvent, 0 ), "cudaEventRecord(start)");

    // The kernel declares its shared tiles statically, so no dynamic
    // shared memory is requested at launch.
    Add<<< dimGrid, dimBlock >>>( dev_a, dev_b, dev_c);
    checkCuda(cudaGetLastError(), "Add kernel launch");

    checkCuda(cudaEventRecord( timeEndEvent, 0 ), "cudaEventRecord(end)");
    checkCuda(cudaEventSynchronize( timeEndEvent ), "cudaEventSynchronize(end)");

    float elapsedTime = 0 ;
    checkCuda(cudaEventElapsedTime( &elapsedTime, timeStartEvent, timeEndEvent ),
              "cudaEventElapsedTime");

    // copy device result back to host copy of c
    checkCuda(cudaMemcpy( c, dev_c, size, cudaMemcpyDeviceToHost), "copy c to host");

    // Verify on the host with an explicit check, which stays active even
    // when assertions are compiled out with NDEBUG.
    int mismatches = 0;
    for (int i = 0; i < N; i++)
    {
        if (c[i] != a[i] + b[i]) {
            if (mismatches == 0) {
                fprintf(stderr, "mismatch at %d: got %d, expected %d\n",
                        i, c[i], a[i] + b[i]);
            }
            ++mismatches;
        }
    }

    free( a );
    free( b );
    free( c );
    checkCuda(cudaFree( dev_a ), "cudaFree(dev_a)");
    checkCuda(cudaFree( dev_b ), "cudaFree(dev_b)");
    checkCuda(cudaFree( dev_c ), "cudaFree(dev_c)");

    checkCuda(cudaEventDestroy( timeStartEvent ), "cudaEventDestroy(start)");
    checkCuda(cudaEventDestroy( timeEndEvent ), "cudaEventDestroy(end)");

    std::cout<< "elapsedTime is  " << elapsedTime << " ms. ";

    return mismatches == 0 ? 0 : 1;
}
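// Content comes from user submissions and web aggregation; its accuracy is not guaranteed. If any content infringes your rights, contact the administrator to have it handled. Click here to send me a message.
// Tags: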