您的位置:首页 > 运维架构 > 网站架构

first.cu 关于CUDA的第一个程序,备注理解架构

2012-05-14 22:35 495 查看
// includes, system
#include <stdio.h>
#include <stdlib.h>

#define RADIUS 1
#define BLOCKDIM 16
#define N 13

__global__ static void set_global_idx(int n, int *d_a)
{
int idx = blockIdx.x*blockDim.x + threadIdx.x;
if (idx < n)
d_a[idx] = idx;

}

__global__ static void set_block_idx(int n, int *d_a)
{
int idx = blockIdx.x*blockDim.x + threadIdx.x;
if (idx < n)
d_a[idx] = blockIdx.x;

}

__global__ static void set_thread_idx(int n, int *d_a)
{
int idx = blockIdx.x*blockDim.x + threadIdx.x;
if (idx < n)
d_a[idx] = threadIdx.x;

}

__global__ static void stencil_naive(int n, int *in, int *out)
{
int idx = blockIdx.x*blockDim.x + threadIdx.x;
int i;
int value = 0;
if (idx < n) {
for (i=-RADIUS; i<=RADIUS; ++i)
{
value += in[idx+i+RADIUS];
}
}
out[idx] = value;

}

__global__ static void stencil(int n, int *in, int *out)
{
__shared__ int shared[BLOCKDIM + 2*RADIUS];
int idx = blockIdx.x*blockDim.x + threadIdx.x;
int idx_local = threadIdx.x + RADIUS;
shared[idx_local] = in[idx + RADIUS];
int i;
if (idx < n) {
if( threadIdx.x < RADIUS)
{
shared[idx_local - RADIUS] = in[idx ];
shared[idx_local + BLOCKDIM] = in[idx + BLOCKDIM + RADIUS];
}
}
__syncthreads();
int value = 0;
for (i=-RADIUS; i<=RADIUS; i++)
{
value += shared[idx_local + i];
}
out[idx] = value;

}

__global__ static void matrix_multiply_naive(int n, int *in_a, int *in_b, int *out)
{
int tmp = gridDim.x*blockDim.x;
int row = threadIdx.x + blockIdx.x*blockDim.x;
int col = threadIdx.y + blockIdx.y*blockDim.y;
int idx = row*tmp + col;
int sum = 0;
if (idx < n)
{
for (int i=0; i<N; i++)
{
sum += in_a[row*N+i]*in_b[i*N+col];
}
}
out[row*N+col] = sum;

/*int row = threadIdx.x + blockIdx.x*blockDim.x;
int col = threadIdx.y + blockIdx.y*blockDim.y;
int idx = row*gridDim.x*blockDim.x + col;
if (idx < n)
{
out[idx] = idx;
}*/


}

// Start the main SDK sample here
int main(int argc, char** argv)
{
printf(" ###########################\n");
printf(" #test for cuda toolkit ...#\n");
printf(" ###########################\n");

int dim = 16;
int mem_size = dim*sizeof(int);

int *d_a, *h_a, *d_b;
h_a = (int*)malloc(mem_size);
cudaMalloc((void**)&d_a, mem_size);
cudaMalloc((void**)&d_b, mem_size);

if ( 0==h_a || 0==d_a)
{
printf("could not allocate memory\n");
return 1;
}
cudaMemset(d_a, 0, mem_size);
cudaMemcpy(h_a, d_a, mem_size, cudaMemcpyDeviceToHost);

for (int i=0; i<dim; ++i)
{
printf("%2d ", h_a[i]);
}
printf("\n");

int dim_block = 3;
int dim_grid = (dim + dim_block -1)/dim_block;
set_global_idx<<<dim_grid, dim_block>>> (dim, d_a);
cudaMemcpy(h_a, d_a, mem_size, cudaMemcpyDeviceToHost);

for (int i=0; i<dim; ++i)
{
printf("%2d ", h_a[i]);
}
printf("\n");

set_block_idx<<<dim_grid, dim_block>>> (dim, d_a);
cudaMemcpy(h_a, d_a, mem_size, cudaMemcpyDeviceToHost);

for (int i=0; i<dim; ++i)
{
printf("%2d ", h_a[i]);
}
printf("\n");

set_thread_idx<<<dim_grid, dim_block>>> (dim, d_a);
cudaMemcpy(d_b, d_a, mem_size, cudaMemcpyDeviceToDevice);
cudaMemcpy(h_a, d_b, mem_size, cudaMemcpyDeviceToHost);

for (int i=0; i<dim; ++i)
{
printf("%2d ", h_a[i]);
}
printf("\n");

free(h_a);
cudaFree(d_a);
cudaFree(d_b);

///////////////////////////////////////////////////////////////
printf("\n");
printf(" #######################\n");
printf(" #stencil test ...... #\n");
printf(" #######################\n");
dim = 80;
int dim_pro = dim + RADIUS*2;
int mem_size_pro = sizeof(int)*dim_pro;
mem_size = sizeof(int)*dim;
h_a = (int*)malloc(mem_size_pro);
memset(h_a, 0, mem_size_pro);
int *h_b;
cudaMalloc((void**)&d_a, mem_size_pro);
cudaMalloc((void**)&d_b, mem_size);
for (int i=0; i<dim; i++)
{
h_a[i+RADIUS] = i;
}
h_b = &(h_a[RADIUS]);
printf(" original vector ... \n");
for (int i=0; i<dim; ++i)
{
if (i%10 == 0)
printf("\n");
printf("%5d ", h_b[i]);
}
printf("\n\n\n");
cudaMemcpy(d_a, h_a, mem_size_pro, cudaMemcpyHostToDevice);
dim_grid = (dim + BLOCKDIM -1)/BLOCKDIM;
//cudaMemset(d_b, 0, mem_size);
stencil<<<dim_grid, BLOCKDIM>>> (dim, d_a, d_b);
//stencil_naive<<<dim_grid, BLOCKDIM>>> (dim, d_a, d_b);
cudaMemcpy(h_b, d_b, mem_size, cudaMemcpyDeviceToHost);


printf(" after stencil ... \n");
for (int i=0; i<dim; ++i)
{
if (i%10 == 0)
printf("\n");
printf("%5d ", h_b[i]);
}
printf("\n");

free(h_a);
cudaFree(d_a);
cudaFree(d_b);

///////////////////////////////////////////////////////////////
printf("\n");
printf(" ##########################\n");
printf(" #matrix multiply ...... #\n");
printf(" ##########################\n");

int row = N;
int col = N;
dim = row*col;
mem_size = dim*sizeof(int);
h_a = (int*)malloc(mem_size);
int *d_c;
cudaMalloc((void**)&d_a, mem_size);
cudaMalloc((void**)&d_b, mem_size);
cudaMalloc((void**)&d_c, mem_size);
printf(" matrix A & B : \n");
for (int i=0; i<dim; ++i)
{
h_a[i] = 1;
if (i%row == 0)
{
printf("\n");
}
printf("%5d ", h_a[i]);
}
cudaMemcpy(d_a, h_a, mem_size, cudaMemcpyHostToDevice);
cudaMemcpy(d_b, h_a, mem_size, cudaMemcpyHostToDevice);
dim_block = 2;
dim3 block(dim_block, dim_block);
dim3 grid((row+dim_block-1)/dim_block, (col+dim_block-1)/dim_block);
cudaMemset(d_c, 0, mem_size);
matrix_multiply_naive<<<grid, block>>> (dim, d_a, d_b, d_c);

//memset(h_a, 0,mem_size);
cudaMemcpy(h_a, d_c, mem_size, cudaMemcpyDeviceToHost);
printf("\n matrix C : \n");
for (int i=0; i<dim; ++i)
{
if (i%row == 0)
{
printf("\n");
}
printf("%5d ", h_a[i]);
}


return 0;
}

——————————————————————————————————————————————————————————————————

#include <stdio.h>
#include <stdlib.h>
#define RADIUS 3
#define BLOCK_SIZE 16

__global__ void stencil(int* in, int* out)

{
__shared__ int shared[BLOCK_SIZE + 2 * RADIUS];
int globIdx = blockIdx.x * blockDim.x + threadIdx.x;
int locIdx = threadIdx.x + RADIUS;
shared[locIdx] = in[globIdx];

if (threadIdx.x < RADIUS)
{
shared[locIdx – RADIUS] = in[globIdx – RADIUS];
shared[locIdx + BLOCK_DIMX] = in[globIdx + BLOCK_SIZE];
}
__syncthreads();

int value = 0;
for (offset = - RADIUS; offset <= RADIUS; offset++)
value += shared[locIdx + offset];
out[globIdx] = value;

}

void checkCUDAError(const char *msg)
{
cudaError_t err = cudaGetLastError();
if( cudaSuccess != err)
{
fprintf(stderr, "Cuda error: %s: %s.\n", msg, cudaGetErrorString( err) );
exit(-1);
}
}

int main( int argc, char** argv)
{
int *h_a;
int *d_a;
int numBlocks = 16;
int numThreadsPerBlock = 16;

size_t memSize = numBlocks * numThreadsPerBlock * sizeof(int);
h_a = (int *) malloc(memSize);
cudaMalloc((void**)&d_a, memSize );

if( 0==h_a || 0==d_a )
{
printf("couldn't allocate memory\n");
return 1;
}

dim3 dimGrid(16,16);
dim3 dimBlock(16);
stencil<<< dimGrid,dimBlock>>>(RADIUS);

cudaThreadSynchronize();

checkCUDAError("kernel execution");

cudaMemcpy(h_a, d_a, memSize, cudaMemcpyDeviceToHost);

checkCUDAError("cudaMemcpy");

for (int i = 0; i < 16 ; i++)
{
for (int j = 0; j < 16 ; j++)
{
assert(h_a[i * numThreadsPerBlock + j] == 1000 * i + j);
}
}

cudaFree(d_a);
free(h_a);
printf("Correct!\n");
return 0;
}
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: