first.cu:关于CUDA的第一个程序(附备注,帮助理解架构)
2012-05-14 22:35
495 查看
// includes, system
#include <stdio.h>
#include <stdlib.h>
#define RADIUS 1
#define BLOCKDIM 16
#define N 13
// Stores each thread's global 1D index into the matching slot of d_a.
//
// Launch layout: 1D grid of 1D blocks (only the .x components are used).
// Threads whose global index is >= n do nothing, so the grid may overshoot n.
//
//   n   - number of elements of d_a that may be written
//   d_a - device buffer with at least n ints
__global__ static void set_global_idx(int n, int *d_a)
{
    const int gid = threadIdx.x + blockDim.x * blockIdx.x;
    if (gid >= n)
    {
        return;  // tail thread of the last block: nothing to write
    }
    d_a[gid] = gid;
}
// Stores, for every element, the index of the block whose thread handled it.
//
// Launch layout: 1D grid of 1D blocks (only the .x components are used).
// Threads whose global index is >= n do nothing, so the grid may overshoot n.
//
//   n   - number of elements of d_a that may be written
//   d_a - device buffer with at least n ints
__global__ static void set_block_idx(int n, int *d_a)
{
    const int gid = threadIdx.x + blockDim.x * blockIdx.x;
    if (gid >= n)
    {
        return;  // tail thread of the last block: nothing to write
    }
    d_a[gid] = blockIdx.x;
}
// Stores, for every element, the in-block index of the thread that handled it.
//
// Launch layout: 1D grid of 1D blocks (only the .x components are used).
// Threads whose global index is >= n do nothing, so the grid may overshoot n.
//
//   n   - number of elements of d_a that may be written
//   d_a - device buffer with at least n ints
__global__ static void set_thread_idx(int n, int *d_a)
{
    const int gid = threadIdx.x + blockDim.x * blockIdx.x;
    if (gid >= n)
    {
        return;  // tail thread of the last block: nothing to write
    }
    d_a[gid] = threadIdx.x;
}
// 1D stencil computed straight from global memory: out[idx] is the sum of the
// 2*RADIUS+1 consecutive input values in[idx] .. in[idx + 2*RADIUS].
//
// Launch layout: 1D grid of 1D blocks (only the .x components are used).
//
//   n   - number of output elements
//   in  - device buffer; elements 0 .. n + 2*RADIUS - 1 are read, so it must
//         hold at least n + 2*RADIUS ints
//   out - device buffer with at least n ints
//
// Threads with idx >= n return without touching memory. The previous version
// stored to out[idx] outside the bounds check, so tail threads wrote past the
// end of `out` whenever the grid overshot n.
__global__ static void stencil_naive(int n, int *in, int *out)
{
    int idx = blockIdx.x*blockDim.x + threadIdx.x;
    if (idx >= n)
    {
        return;
    }

    int value = 0;
    for (int i = -RADIUS; i <= RADIUS; ++i)
    {
        // The +RADIUS shift maps the window onto in[idx] .. in[idx + 2*RADIUS].
        value += in[idx + i + RADIUS];
    }
    out[idx] = value;
}
// 1D stencil using a shared-memory tile: out[idx] is the sum of the
// 2*RADIUS+1 consecutive input values in[idx] .. in[idx + 2*RADIUS].
//
// Launch layout: 1D grid of 1D blocks with RADIUS <= blockDim.x <= BLOCKDIM
// (the tile is statically sized for BLOCKDIM threads plus two halos, and the
// first RADIUS threads of each block load the halos).
// Shared memory: (BLOCKDIM + 2*RADIUS) ints, statically allocated.
//
//   n   - number of output elements
//   in  - device buffer expected to hold n + 2*RADIUS ints; reads beyond that
//         length are replaced by 0
//   out - device buffer with at least n ints
//
// Fixes over the previous version:
//   * every global read is bounds-checked against the padded input length,
//   * halo loads no longer depend on `idx < n`, so a partial last block cannot
//     leave tile slots uninitialised that valid threads later read,
//   * out[idx] is only written for idx < n.
__global__ static void stencil(int n, int *in, int *out)
{
    __shared__ int shared[BLOCKDIM + 2*RADIUS];

    const int padded_len = n + 2*RADIUS;
    const int block_width = blockDim.x;
    const int idx = blockIdx.x*block_width + threadIdx.x;
    const int idx_local = threadIdx.x + RADIUS;

    // Tile slot s of this block mirrors in[block_start + s].
    const int centre = idx + RADIUS;
    shared[idx_local] = (centre < padded_len) ? in[centre] : 0;

    if (threadIdx.x < RADIUS)
    {
        const int left = idx;
        const int right = idx + block_width + RADIUS;
        shared[idx_local - RADIUS] = (left < padded_len) ? in[left] : 0;
        shared[idx_local + block_width] = (right < padded_len) ? in[right] : 0;
    }

    // Reached by every thread of the block: the barrier is outside any branch.
    __syncthreads();

    if (idx < n)
    {
        int value = 0;
        for (int i = -RADIUS; i <= RADIUS; i++)
        {
            value += shared[idx_local + i];
        }
        out[idx] = value;
    }
}
// Naive N x N integer matrix product out = in_a * in_b, one thread per output
// element, all matrices stored row-major with row length N.
//
// Launch layout: 2D grid of 2D blocks; the x dimension indexes rows and the
// y dimension indexes columns. The grid may overshoot N in either dimension.
//
//   n    - number of ints available in `out`; elements at or beyond n are
//          never written
//   in_a - device buffer, N*N ints
//   in_b - device buffer, N*N ints
//   out  - device buffer, at least min(n, N*N) ints
//
// The previous version guarded on row*(gridDim.x*blockDim.x)+col < n, which
// neither keeps row/col inside the matrix nor matches the row*N+col layout:
// threads with col == N read in_b out of range and overwrote the first element
// of the next row, threads with row == N wrote past the end of `out`, and
// valid elements of the last row were skipped and stored as 0.
__global__ static void matrix_multiply_naive(int n, int *in_a, int *in_b, int *out)
{
    int row = threadIdx.x + blockIdx.x*blockDim.x;
    int col = threadIdx.y + blockIdx.y*blockDim.y;
    if (row >= N || col >= N)
    {
        return;  // thread lies outside the matrix
    }

    int idx = row*N + col;
    if (idx >= n)
    {
        return;  // caller's output buffer is shorter than N*N
    }

    int sum = 0;
    for (int i = 0; i < N; i++)
    {
        sum += in_a[row*N + i]*in_b[i*N + col];
    }
    out[idx] = sum;
}
// Abort with a diagnostic when a CUDA runtime call did not return cudaSuccess.
static void check_cuda(cudaError_t err, const char *what)
{
    if (err != cudaSuccess)
    {
        fprintf(stderr, "CUDA error (%s): %s\n", what, cudaGetErrorString(err));
        exit(1);
    }
}

// Print `count` ints on a single line ("%2d " each), then a newline.
static void print_ints_inline(const int *values, int count)
{
    for (int i = 0; i < count; ++i)
    {
        printf("%2d ", values[i]);
    }
    printf("\n");
}

// Print `count` ints as "%5d ", emitting a newline before every
// `per_line`-th value. No trailing newline is printed.
static void print_ints_wrapped(const int *values, int count, int per_line)
{
    for (int i = 0; i < count; ++i)
    {
        if (i % per_line == 0)
        {
            printf("\n");
        }
        printf("%5d ", values[i]);
    }
}

// Start the main SDK sample here
//
// Runs three demos in sequence and prints their results:
//   1. the index kernels on a 16-element vector,
//   2. the shared-memory stencil on an 80-element vector with a zero halo,
//   3. the naive N x N matrix multiply on all-ones matrices.
// Returns 0 on success, 1 if host memory cannot be allocated; any CUDA
// runtime failure terminates the process through check_cuda().
int main(int argc, char** argv)
{
    (void)argc;
    (void)argv;

    printf(" ###########################\n");
    printf(" #test for cuda toolkit ...#\n");
    printf(" ###########################\n");
    int dim = 16;
    int mem_size = dim*sizeof(int);
    int *d_a = NULL;
    int *d_b = NULL;
    int *h_a = (int*)malloc(mem_size);
    if (h_a == NULL)
    {
        printf("could not allocate memory\n");
        return 1;
    }
    check_cuda(cudaMalloc((void**)&d_a, mem_size), "cudaMalloc d_a");
    check_cuda(cudaMalloc((void**)&d_b, mem_size), "cudaMalloc d_b");

    // Show the zero-initialised device buffer first.
    check_cuda(cudaMemset(d_a, 0, mem_size), "cudaMemset d_a");
    check_cuda(cudaMemcpy(h_a, d_a, mem_size, cudaMemcpyDeviceToHost), "copy d_a to host");
    print_ints_inline(h_a, dim);

    // 3 threads per block does not divide 16, so the grid is rounded up and
    // the kernels' own bounds checks discard the surplus threads.
    int dim_block = 3;
    int dim_grid = (dim + dim_block -1)/dim_block;

    set_global_idx<<<dim_grid, dim_block>>> (dim, d_a);
    check_cuda(cudaGetLastError(), "set_global_idx launch");
    check_cuda(cudaMemcpy(h_a, d_a, mem_size, cudaMemcpyDeviceToHost), "copy d_a to host");
    print_ints_inline(h_a, dim);

    set_block_idx<<<dim_grid, dim_block>>> (dim, d_a);
    check_cuda(cudaGetLastError(), "set_block_idx launch");
    check_cuda(cudaMemcpy(h_a, d_a, mem_size, cudaMemcpyDeviceToHost), "copy d_a to host");
    print_ints_inline(h_a, dim);

    // This round also exercises a device-to-device copy before reading back.
    set_thread_idx<<<dim_grid, dim_block>>> (dim, d_a);
    check_cuda(cudaGetLastError(), "set_thread_idx launch");
    check_cuda(cudaMemcpy(d_b, d_a, mem_size, cudaMemcpyDeviceToDevice), "copy d_a to d_b");
    check_cuda(cudaMemcpy(h_a, d_b, mem_size, cudaMemcpyDeviceToHost), "copy d_b to host");
    print_ints_inline(h_a, dim);

    free(h_a);
    check_cuda(cudaFree(d_a), "cudaFree d_a");
    check_cuda(cudaFree(d_b), "cudaFree d_b");
    ///////////////////////////////////////////////////////////////
    printf("\n");
    printf(" #######################\n");
    printf(" #stencil test ...... #\n");
    printf(" #######################\n");
    dim = 80;
    // The input carries RADIUS extra zero elements on each side of the data.
    int dim_pro = dim + RADIUS*2;
    int mem_size_pro = sizeof(int)*dim_pro;
    mem_size = sizeof(int)*dim;
    h_a = (int*)malloc(mem_size_pro);
    if (h_a == NULL)
    {
        printf("could not allocate memory\n");
        return 1;
    }
    memset(h_a, 0, mem_size_pro);
    check_cuda(cudaMalloc((void**)&d_a, mem_size_pro), "cudaMalloc d_a");
    check_cuda(cudaMalloc((void**)&d_b, mem_size), "cudaMalloc d_b");
    for (int i=0; i<dim; i++)
    {
        h_a[i+RADIUS] = i;
    }
    // h_b views the dim payload elements of h_a, skipping the left halo.
    int *h_b = &(h_a[RADIUS]);
    printf(" original vector ... \n");
    print_ints_wrapped(h_b, dim, 10);
    printf("\n\n\n");
    check_cuda(cudaMemcpy(d_a, h_a, mem_size_pro, cudaMemcpyHostToDevice), "copy h_a to d_a");
    dim_grid = (dim + BLOCKDIM -1)/BLOCKDIM;
    stencil<<<dim_grid, BLOCKDIM>>> (dim, d_a, d_b);
    //stencil_naive<<<dim_grid, BLOCKDIM>>> (dim, d_a, d_b);
    check_cuda(cudaGetLastError(), "stencil launch");
    check_cuda(cudaMemcpy(h_b, d_b, mem_size, cudaMemcpyDeviceToHost), "copy d_b to host");
    printf(" after stencil ... \n");
    print_ints_wrapped(h_b, dim, 10);
    printf("\n");
    free(h_a);
    check_cuda(cudaFree(d_a), "cudaFree d_a");
    check_cuda(cudaFree(d_b), "cudaFree d_b");
    ///////////////////////////////////////////////////////////////
    printf("\n");
    printf(" ##########################\n");
    printf(" #matrix multiply ...... #\n");
    printf(" ##########################\n");
    int row = N;
    int col = N;
    dim = row*col;
    mem_size = dim*sizeof(int);
    h_a = (int*)malloc(mem_size);
    if (h_a == NULL)
    {
        printf("could not allocate memory\n");
        return 1;
    }
    int *d_c = NULL;
    check_cuda(cudaMalloc((void**)&d_a, mem_size), "cudaMalloc d_a");
    check_cuda(cudaMalloc((void**)&d_b, mem_size), "cudaMalloc d_b");
    check_cuda(cudaMalloc((void**)&d_c, mem_size), "cudaMalloc d_c");
    printf(" matrix A & B : \n");
    for (int i=0; i<dim; ++i)
    {
        h_a[i] = 1;
    }
    print_ints_wrapped(h_a, dim, row);
    // A and B are both the all-ones matrix, so one host buffer feeds both.
    check_cuda(cudaMemcpy(d_a, h_a, mem_size, cudaMemcpyHostToDevice), "copy h_a to d_a");
    check_cuda(cudaMemcpy(d_b, h_a, mem_size, cudaMemcpyHostToDevice), "copy h_a to d_b");
    dim_block = 2;
    dim3 block(dim_block, dim_block);
    dim3 grid((row+dim_block-1)/dim_block, (col+dim_block-1)/dim_block);
    check_cuda(cudaMemset(d_c, 0, mem_size), "cudaMemset d_c");
    matrix_multiply_naive<<<grid, block>>> (dim, d_a, d_b, d_c);
    check_cuda(cudaGetLastError(), "matrix_multiply_naive launch");
    check_cuda(cudaMemcpy(h_a, d_c, mem_size, cudaMemcpyDeviceToHost), "copy d_c to host");
    printf("\n matrix C : \n");
    print_ints_wrapped(h_a, dim, row);
    printf("\n");  // terminate the last matrix row

    // Release the buffers of the final demo as well.
    free(h_a);
    check_cuda(cudaFree(d_a), "cudaFree d_a");
    check_cuda(cudaFree(d_b), "cudaFree d_b");
    check_cuda(cudaFree(d_c), "cudaFree d_c");
    return 0;
}
——————————————————————————————————————————————————————————————————
#include <stdio.h>
#include <stdlib.h>
#define RADIUS 3
#define BLOCK_SIZE 16
// 1D stencil using a shared-memory tile: out[g] is the sum of
// in[g - RADIUS] .. in[g + RADIUS], where g is the thread's global index.
//
// Launch layout: 1D grid of 1D blocks with blockDim.x == BLOCK_SIZE (the tile
// and the right-halo offset are both sized with BLOCK_SIZE).
// Shared memory: (BLOCK_SIZE + 2*RADIUS) ints, statically allocated.
//
// There are no bounds checks. With T = gridDim.x * blockDim.x threads the
// kernel reads in[-RADIUS] .. in[T + RADIUS - 1] and writes out[0] .. out[T-1],
// so `in` must point RADIUS elements past the start of a buffer that is padded
// by RADIUS elements on both sides.
//
// Fixes over the previous version, which did not compile: the en-dash
// characters used as minus signs are now '-', the undefined BLOCK_DIMX is
// replaced by BLOCK_SIZE, and the loop variable `offset` is declared.
__global__ void stencil(int* in, int* out)
{
    __shared__ int shared[BLOCK_SIZE + 2 * RADIUS];
    int globIdx = blockIdx.x * blockDim.x + threadIdx.x;
    int locIdx = threadIdx.x + RADIUS;

    // Each thread loads its own element; the first RADIUS threads of the
    // block additionally load the left and right halos.
    shared[locIdx] = in[globIdx];
    if (threadIdx.x < RADIUS)
    {
        shared[locIdx - RADIUS] = in[globIdx - RADIUS];
        shared[locIdx + BLOCK_SIZE] = in[globIdx + BLOCK_SIZE];
    }

    // Every thread reaches this barrier; the tile is complete afterwards.
    __syncthreads();

    int value = 0;
    for (int offset = -RADIUS; offset <= RADIUS; offset++)
    {
        value += shared[locIdx + offset];
    }
    out[globIdx] = value;
}
// Reads (and clears) the CUDA runtime's last error. When it is anything other
// than cudaSuccess, prints `msg` together with the runtime's error string to
// stderr and terminates the process with exit(-1); otherwise returns silently.
void checkCUDAError(const char *msg)
{
    const cudaError_t status = cudaGetLastError();
    if (status == cudaSuccess)
    {
        return;
    }
    fprintf(stderr, "Cuda error: %s: %s.\n", msg, cudaGetErrorString(status));
    exit(-1);
}
// Runs the stencil kernel on numBlocks * BLOCK_SIZE elements and verifies the
// result against a host-side reference.
//
// The previous version launched the kernel with a single integer argument
// (the kernel takes two device pointers), used a 16x16 grid for a buffer of
// 256 elements, never provided an input buffer, and asserted values
// (1000*i + j) that the stencil does not produce. It also called the
// deprecated cudaThreadSynchronize().
//
// Returns 0 when every output matches, 1 on allocation failure or mismatch.
int main( int argc, char** argv)
{
    (void)argc;
    (void)argv;

    const int numBlocks = 16;
    const int numThreadsPerBlock = BLOCK_SIZE;  // the kernel's tile is sized for this
    const int numElements = numBlocks * numThreadsPerBlock;
    const int paddedElements = numElements + 2 * RADIUS;
    const size_t memSize = numElements * sizeof(int);
    const size_t paddedMemSize = paddedElements * sizeof(int);

    int *h_in = (int *) malloc(paddedMemSize);  // input with a RADIUS-wide halo on each side
    int *h_a = (int *) malloc(memSize);         // output read back from the device
    int *d_in = 0;
    int *d_a = 0;
    if( 0==h_in || 0==h_a )
    {
        printf("couldn't allocate memory\n");
        free(h_in);
        free(h_a);
        return 1;
    }
    cudaMalloc((void**)&d_in, paddedMemSize);
    cudaMalloc((void**)&d_a, memSize);
    if( 0==d_in || 0==d_a )
    {
        printf("couldn't allocate memory\n");
        cudaFree(d_in);
        cudaFree(d_a);
        free(h_in);
        free(h_a);
        return 1;
    }

    // Zero halo, payload element i holds the value i.
    for (int i = 0; i < paddedElements; i++)
    {
        h_in[i] = 0;
    }
    for (int i = 0; i < numElements; i++)
    {
        h_in[RADIUS + i] = i;
    }
    cudaMemcpy(d_in, h_in, paddedMemSize, cudaMemcpyHostToDevice);
    checkCUDAError("cudaMemcpy host to device");

    dim3 dimGrid(numBlocks);
    dim3 dimBlock(numThreadsPerBlock);
    // The kernel reads in[-RADIUS] onwards, so hand it a pointer to the
    // first payload element rather than to the start of the padded buffer.
    stencil<<< dimGrid,dimBlock>>>(d_in + RADIUS, d_a);
    cudaDeviceSynchronize();
    checkCUDAError("kernel execution");
    cudaMemcpy(h_a, d_a, memSize, cudaMemcpyDeviceToHost);
    checkCUDAError("cudaMemcpy");

    // Host reference: output i is the sum of padded inputs i .. i + 2*RADIUS.
    int mismatches = 0;
    for (int i = 0; i < numElements; i++)
    {
        int expected = 0;
        for (int k = 0; k <= 2 * RADIUS; k++)
        {
            expected += h_in[i + k];
        }
        if (h_a[i] != expected)
        {
            fprintf(stderr, "Mismatch at %d: got %d, expected %d\n", i, h_a[i], expected);
            mismatches++;
        }
    }

    cudaFree(d_in);
    cudaFree(d_a);
    free(h_in);
    free(h_a);
    if (mismatches != 0)
    {
        return 1;
    }
    printf("Correct!\n");
    return 0;
}
#include <stdio.h>
#include <stdlib.h>
#define RADIUS 1
#define BLOCKDIM 16
#define N 13
// Stores each thread's global 1D index into the matching slot of d_a.
//
// Launch layout: 1D grid of 1D blocks (only the .x components are used).
// Threads whose global index is >= n do nothing, so the grid may overshoot n.
//
//   n   - number of elements of d_a that may be written
//   d_a - device buffer with at least n ints
__global__ static void set_global_idx(int n, int *d_a)
{
    const int gid = threadIdx.x + blockDim.x * blockIdx.x;
    if (gid >= n)
    {
        return;  // tail thread of the last block: nothing to write
    }
    d_a[gid] = gid;
}
// Stores, for every element, the index of the block whose thread handled it.
//
// Launch layout: 1D grid of 1D blocks (only the .x components are used).
// Threads whose global index is >= n do nothing, so the grid may overshoot n.
//
//   n   - number of elements of d_a that may be written
//   d_a - device buffer with at least n ints
__global__ static void set_block_idx(int n, int *d_a)
{
    const int gid = threadIdx.x + blockDim.x * blockIdx.x;
    if (gid >= n)
    {
        return;  // tail thread of the last block: nothing to write
    }
    d_a[gid] = blockIdx.x;
}
// Stores, for every element, the in-block index of the thread that handled it.
//
// Launch layout: 1D grid of 1D blocks (only the .x components are used).
// Threads whose global index is >= n do nothing, so the grid may overshoot n.
//
//   n   - number of elements of d_a that may be written
//   d_a - device buffer with at least n ints
__global__ static void set_thread_idx(int n, int *d_a)
{
    const int gid = threadIdx.x + blockDim.x * blockIdx.x;
    if (gid >= n)
    {
        return;  // tail thread of the last block: nothing to write
    }
    d_a[gid] = threadIdx.x;
}
// 1D stencil computed straight from global memory: out[idx] is the sum of the
// 2*RADIUS+1 consecutive input values in[idx] .. in[idx + 2*RADIUS].
//
// Launch layout: 1D grid of 1D blocks (only the .x components are used).
//
//   n   - number of output elements
//   in  - device buffer; elements 0 .. n + 2*RADIUS - 1 are read, so it must
//         hold at least n + 2*RADIUS ints
//   out - device buffer with at least n ints
//
// Threads with idx >= n return without touching memory. The previous version
// stored to out[idx] outside the bounds check, so tail threads wrote past the
// end of `out` whenever the grid overshot n.
__global__ static void stencil_naive(int n, int *in, int *out)
{
    int idx = blockIdx.x*blockDim.x + threadIdx.x;
    if (idx >= n)
    {
        return;
    }

    int value = 0;
    for (int i = -RADIUS; i <= RADIUS; ++i)
    {
        // The +RADIUS shift maps the window onto in[idx] .. in[idx + 2*RADIUS].
        value += in[idx + i + RADIUS];
    }
    out[idx] = value;
}
// 1D stencil using a shared-memory tile: out[idx] is the sum of the
// 2*RADIUS+1 consecutive input values in[idx] .. in[idx + 2*RADIUS].
//
// Launch layout: 1D grid of 1D blocks with RADIUS <= blockDim.x <= BLOCKDIM
// (the tile is statically sized for BLOCKDIM threads plus two halos, and the
// first RADIUS threads of each block load the halos).
// Shared memory: (BLOCKDIM + 2*RADIUS) ints, statically allocated.
//
//   n   - number of output elements
//   in  - device buffer expected to hold n + 2*RADIUS ints; reads beyond that
//         length are replaced by 0
//   out - device buffer with at least n ints
//
// Fixes over the previous version:
//   * every global read is bounds-checked against the padded input length,
//   * halo loads no longer depend on `idx < n`, so a partial last block cannot
//     leave tile slots uninitialised that valid threads later read,
//   * out[idx] is only written for idx < n.
__global__ static void stencil(int n, int *in, int *out)
{
    __shared__ int shared[BLOCKDIM + 2*RADIUS];

    const int padded_len = n + 2*RADIUS;
    const int block_width = blockDim.x;
    const int idx = blockIdx.x*block_width + threadIdx.x;
    const int idx_local = threadIdx.x + RADIUS;

    // Tile slot s of this block mirrors in[block_start + s].
    const int centre = idx + RADIUS;
    shared[idx_local] = (centre < padded_len) ? in[centre] : 0;

    if (threadIdx.x < RADIUS)
    {
        const int left = idx;
        const int right = idx + block_width + RADIUS;
        shared[idx_local - RADIUS] = (left < padded_len) ? in[left] : 0;
        shared[idx_local + block_width] = (right < padded_len) ? in[right] : 0;
    }

    // Reached by every thread of the block: the barrier is outside any branch.
    __syncthreads();

    if (idx < n)
    {
        int value = 0;
        for (int i = -RADIUS; i <= RADIUS; i++)
        {
            value += shared[idx_local + i];
        }
        out[idx] = value;
    }
}
// Naive N x N integer matrix product out = in_a * in_b, one thread per output
// element, all matrices stored row-major with row length N.
//
// Launch layout: 2D grid of 2D blocks; the x dimension indexes rows and the
// y dimension indexes columns. The grid may overshoot N in either dimension.
//
//   n    - number of ints available in `out`; elements at or beyond n are
//          never written
//   in_a - device buffer, N*N ints
//   in_b - device buffer, N*N ints
//   out  - device buffer, at least min(n, N*N) ints
//
// The previous version guarded on row*(gridDim.x*blockDim.x)+col < n, which
// neither keeps row/col inside the matrix nor matches the row*N+col layout:
// threads with col == N read in_b out of range and overwrote the first element
// of the next row, threads with row == N wrote past the end of `out`, and
// valid elements of the last row were skipped and stored as 0.
__global__ static void matrix_multiply_naive(int n, int *in_a, int *in_b, int *out)
{
    int row = threadIdx.x + blockIdx.x*blockDim.x;
    int col = threadIdx.y + blockIdx.y*blockDim.y;
    if (row >= N || col >= N)
    {
        return;  // thread lies outside the matrix
    }

    int idx = row*N + col;
    if (idx >= n)
    {
        return;  // caller's output buffer is shorter than N*N
    }

    int sum = 0;
    for (int i = 0; i < N; i++)
    {
        sum += in_a[row*N + i]*in_b[i*N + col];
    }
    out[idx] = sum;
}
// Abort with a diagnostic when a CUDA runtime call did not return cudaSuccess.
static void check_cuda(cudaError_t err, const char *what)
{
    if (err != cudaSuccess)
    {
        fprintf(stderr, "CUDA error (%s): %s\n", what, cudaGetErrorString(err));
        exit(1);
    }
}

// Print `count` ints on a single line ("%2d " each), then a newline.
static void print_ints_inline(const int *values, int count)
{
    for (int i = 0; i < count; ++i)
    {
        printf("%2d ", values[i]);
    }
    printf("\n");
}

// Print `count` ints as "%5d ", emitting a newline before every
// `per_line`-th value. No trailing newline is printed.
static void print_ints_wrapped(const int *values, int count, int per_line)
{
    for (int i = 0; i < count; ++i)
    {
        if (i % per_line == 0)
        {
            printf("\n");
        }
        printf("%5d ", values[i]);
    }
}

// Start the main SDK sample here
//
// Runs three demos in sequence and prints their results:
//   1. the index kernels on a 16-element vector,
//   2. the shared-memory stencil on an 80-element vector with a zero halo,
//   3. the naive N x N matrix multiply on all-ones matrices.
// Returns 0 on success, 1 if host memory cannot be allocated; any CUDA
// runtime failure terminates the process through check_cuda().
int main(int argc, char** argv)
{
    (void)argc;
    (void)argv;

    printf(" ###########################\n");
    printf(" #test for cuda toolkit ...#\n");
    printf(" ###########################\n");
    int dim = 16;
    int mem_size = dim*sizeof(int);
    int *d_a = NULL;
    int *d_b = NULL;
    int *h_a = (int*)malloc(mem_size);
    if (h_a == NULL)
    {
        printf("could not allocate memory\n");
        return 1;
    }
    check_cuda(cudaMalloc((void**)&d_a, mem_size), "cudaMalloc d_a");
    check_cuda(cudaMalloc((void**)&d_b, mem_size), "cudaMalloc d_b");

    // Show the zero-initialised device buffer first.
    check_cuda(cudaMemset(d_a, 0, mem_size), "cudaMemset d_a");
    check_cuda(cudaMemcpy(h_a, d_a, mem_size, cudaMemcpyDeviceToHost), "copy d_a to host");
    print_ints_inline(h_a, dim);

    // 3 threads per block does not divide 16, so the grid is rounded up and
    // the kernels' own bounds checks discard the surplus threads.
    int dim_block = 3;
    int dim_grid = (dim + dim_block -1)/dim_block;

    set_global_idx<<<dim_grid, dim_block>>> (dim, d_a);
    check_cuda(cudaGetLastError(), "set_global_idx launch");
    check_cuda(cudaMemcpy(h_a, d_a, mem_size, cudaMemcpyDeviceToHost), "copy d_a to host");
    print_ints_inline(h_a, dim);

    set_block_idx<<<dim_grid, dim_block>>> (dim, d_a);
    check_cuda(cudaGetLastError(), "set_block_idx launch");
    check_cuda(cudaMemcpy(h_a, d_a, mem_size, cudaMemcpyDeviceToHost), "copy d_a to host");
    print_ints_inline(h_a, dim);

    // This round also exercises a device-to-device copy before reading back.
    set_thread_idx<<<dim_grid, dim_block>>> (dim, d_a);
    check_cuda(cudaGetLastError(), "set_thread_idx launch");
    check_cuda(cudaMemcpy(d_b, d_a, mem_size, cudaMemcpyDeviceToDevice), "copy d_a to d_b");
    check_cuda(cudaMemcpy(h_a, d_b, mem_size, cudaMemcpyDeviceToHost), "copy d_b to host");
    print_ints_inline(h_a, dim);

    free(h_a);
    check_cuda(cudaFree(d_a), "cudaFree d_a");
    check_cuda(cudaFree(d_b), "cudaFree d_b");
    ///////////////////////////////////////////////////////////////
    printf("\n");
    printf(" #######################\n");
    printf(" #stencil test ...... #\n");
    printf(" #######################\n");
    dim = 80;
    // The input carries RADIUS extra zero elements on each side of the data.
    int dim_pro = dim + RADIUS*2;
    int mem_size_pro = sizeof(int)*dim_pro;
    mem_size = sizeof(int)*dim;
    h_a = (int*)malloc(mem_size_pro);
    if (h_a == NULL)
    {
        printf("could not allocate memory\n");
        return 1;
    }
    memset(h_a, 0, mem_size_pro);
    check_cuda(cudaMalloc((void**)&d_a, mem_size_pro), "cudaMalloc d_a");
    check_cuda(cudaMalloc((void**)&d_b, mem_size), "cudaMalloc d_b");
    for (int i=0; i<dim; i++)
    {
        h_a[i+RADIUS] = i;
    }
    // h_b views the dim payload elements of h_a, skipping the left halo.
    int *h_b = &(h_a[RADIUS]);
    printf(" original vector ... \n");
    print_ints_wrapped(h_b, dim, 10);
    printf("\n\n\n");
    check_cuda(cudaMemcpy(d_a, h_a, mem_size_pro, cudaMemcpyHostToDevice), "copy h_a to d_a");
    dim_grid = (dim + BLOCKDIM -1)/BLOCKDIM;
    stencil<<<dim_grid, BLOCKDIM>>> (dim, d_a, d_b);
    //stencil_naive<<<dim_grid, BLOCKDIM>>> (dim, d_a, d_b);
    check_cuda(cudaGetLastError(), "stencil launch");
    check_cuda(cudaMemcpy(h_b, d_b, mem_size, cudaMemcpyDeviceToHost), "copy d_b to host");
    printf(" after stencil ... \n");
    print_ints_wrapped(h_b, dim, 10);
    printf("\n");
    free(h_a);
    check_cuda(cudaFree(d_a), "cudaFree d_a");
    check_cuda(cudaFree(d_b), "cudaFree d_b");
    ///////////////////////////////////////////////////////////////
    printf("\n");
    printf(" ##########################\n");
    printf(" #matrix multiply ...... #\n");
    printf(" ##########################\n");
    int row = N;
    int col = N;
    dim = row*col;
    mem_size = dim*sizeof(int);
    h_a = (int*)malloc(mem_size);
    if (h_a == NULL)
    {
        printf("could not allocate memory\n");
        return 1;
    }
    int *d_c = NULL;
    check_cuda(cudaMalloc((void**)&d_a, mem_size), "cudaMalloc d_a");
    check_cuda(cudaMalloc((void**)&d_b, mem_size), "cudaMalloc d_b");
    check_cuda(cudaMalloc((void**)&d_c, mem_size), "cudaMalloc d_c");
    printf(" matrix A & B : \n");
    for (int i=0; i<dim; ++i)
    {
        h_a[i] = 1;
    }
    print_ints_wrapped(h_a, dim, row);
    // A and B are both the all-ones matrix, so one host buffer feeds both.
    check_cuda(cudaMemcpy(d_a, h_a, mem_size, cudaMemcpyHostToDevice), "copy h_a to d_a");
    check_cuda(cudaMemcpy(d_b, h_a, mem_size, cudaMemcpyHostToDevice), "copy h_a to d_b");
    dim_block = 2;
    dim3 block(dim_block, dim_block);
    dim3 grid((row+dim_block-1)/dim_block, (col+dim_block-1)/dim_block);
    check_cuda(cudaMemset(d_c, 0, mem_size), "cudaMemset d_c");
    matrix_multiply_naive<<<grid, block>>> (dim, d_a, d_b, d_c);
    check_cuda(cudaGetLastError(), "matrix_multiply_naive launch");
    check_cuda(cudaMemcpy(h_a, d_c, mem_size, cudaMemcpyDeviceToHost), "copy d_c to host");
    printf("\n matrix C : \n");
    print_ints_wrapped(h_a, dim, row);
    printf("\n");  // terminate the last matrix row

    // Release the buffers of the final demo as well.
    free(h_a);
    check_cuda(cudaFree(d_a), "cudaFree d_a");
    check_cuda(cudaFree(d_b), "cudaFree d_b");
    check_cuda(cudaFree(d_c), "cudaFree d_c");
    return 0;
}
——————————————————————————————————————————————————————————————————
#include <stdio.h>
#include <stdlib.h>
#define RADIUS 3
#define BLOCK_SIZE 16
// 1D stencil using a shared-memory tile: out[g] is the sum of
// in[g - RADIUS] .. in[g + RADIUS], where g is the thread's global index.
//
// Launch layout: 1D grid of 1D blocks with blockDim.x == BLOCK_SIZE (the tile
// and the right-halo offset are both sized with BLOCK_SIZE).
// Shared memory: (BLOCK_SIZE + 2*RADIUS) ints, statically allocated.
//
// There are no bounds checks. With T = gridDim.x * blockDim.x threads the
// kernel reads in[-RADIUS] .. in[T + RADIUS - 1] and writes out[0] .. out[T-1],
// so `in` must point RADIUS elements past the start of a buffer that is padded
// by RADIUS elements on both sides.
//
// Fixes over the previous version, which did not compile: the en-dash
// characters used as minus signs are now '-', the undefined BLOCK_DIMX is
// replaced by BLOCK_SIZE, and the loop variable `offset` is declared.
__global__ void stencil(int* in, int* out)
{
    __shared__ int shared[BLOCK_SIZE + 2 * RADIUS];
    int globIdx = blockIdx.x * blockDim.x + threadIdx.x;
    int locIdx = threadIdx.x + RADIUS;

    // Each thread loads its own element; the first RADIUS threads of the
    // block additionally load the left and right halos.
    shared[locIdx] = in[globIdx];
    if (threadIdx.x < RADIUS)
    {
        shared[locIdx - RADIUS] = in[globIdx - RADIUS];
        shared[locIdx + BLOCK_SIZE] = in[globIdx + BLOCK_SIZE];
    }

    // Every thread reaches this barrier; the tile is complete afterwards.
    __syncthreads();

    int value = 0;
    for (int offset = -RADIUS; offset <= RADIUS; offset++)
    {
        value += shared[locIdx + offset];
    }
    out[globIdx] = value;
}
// Reads (and clears) the CUDA runtime's last error. When it is anything other
// than cudaSuccess, prints `msg` together with the runtime's error string to
// stderr and terminates the process with exit(-1); otherwise returns silently.
void checkCUDAError(const char *msg)
{
    const cudaError_t status = cudaGetLastError();
    if (status == cudaSuccess)
    {
        return;
    }
    fprintf(stderr, "Cuda error: %s: %s.\n", msg, cudaGetErrorString(status));
    exit(-1);
}
// Runs the stencil kernel on numBlocks * BLOCK_SIZE elements and verifies the
// result against a host-side reference.
//
// The previous version launched the kernel with a single integer argument
// (the kernel takes two device pointers), used a 16x16 grid for a buffer of
// 256 elements, never provided an input buffer, and asserted values
// (1000*i + j) that the stencil does not produce. It also called the
// deprecated cudaThreadSynchronize().
//
// Returns 0 when every output matches, 1 on allocation failure or mismatch.
int main( int argc, char** argv)
{
    (void)argc;
    (void)argv;

    const int numBlocks = 16;
    const int numThreadsPerBlock = BLOCK_SIZE;  // the kernel's tile is sized for this
    const int numElements = numBlocks * numThreadsPerBlock;
    const int paddedElements = numElements + 2 * RADIUS;
    const size_t memSize = numElements * sizeof(int);
    const size_t paddedMemSize = paddedElements * sizeof(int);

    int *h_in = (int *) malloc(paddedMemSize);  // input with a RADIUS-wide halo on each side
    int *h_a = (int *) malloc(memSize);         // output read back from the device
    int *d_in = 0;
    int *d_a = 0;
    if( 0==h_in || 0==h_a )
    {
        printf("couldn't allocate memory\n");
        free(h_in);
        free(h_a);
        return 1;
    }
    cudaMalloc((void**)&d_in, paddedMemSize);
    cudaMalloc((void**)&d_a, memSize);
    if( 0==d_in || 0==d_a )
    {
        printf("couldn't allocate memory\n");
        cudaFree(d_in);
        cudaFree(d_a);
        free(h_in);
        free(h_a);
        return 1;
    }

    // Zero halo, payload element i holds the value i.
    for (int i = 0; i < paddedElements; i++)
    {
        h_in[i] = 0;
    }
    for (int i = 0; i < numElements; i++)
    {
        h_in[RADIUS + i] = i;
    }
    cudaMemcpy(d_in, h_in, paddedMemSize, cudaMemcpyHostToDevice);
    checkCUDAError("cudaMemcpy host to device");

    dim3 dimGrid(numBlocks);
    dim3 dimBlock(numThreadsPerBlock);
    // The kernel reads in[-RADIUS] onwards, so hand it a pointer to the
    // first payload element rather than to the start of the padded buffer.
    stencil<<< dimGrid,dimBlock>>>(d_in + RADIUS, d_a);
    cudaDeviceSynchronize();
    checkCUDAError("kernel execution");
    cudaMemcpy(h_a, d_a, memSize, cudaMemcpyDeviceToHost);
    checkCUDAError("cudaMemcpy");

    // Host reference: output i is the sum of padded inputs i .. i + 2*RADIUS.
    int mismatches = 0;
    for (int i = 0; i < numElements; i++)
    {
        int expected = 0;
        for (int k = 0; k <= 2 * RADIUS; k++)
        {
            expected += h_in[i + k];
        }
        if (h_a[i] != expected)
        {
            fprintf(stderr, "Mismatch at %d: got %d, expected %d\n", i, h_a[i], expected);
            mismatches++;
        }
    }

    cudaFree(d_in);
    cudaFree(d_a);
    free(h_in);
    free(h_a);
    if (mismatches != 0)
    {
        return 1;
    }
    printf("Correct!\n");
    return 0;
}
相关文章推荐
- 详解第一个CUDA程序kernel.cu
- 《c primer plus》第一个程序first.c的编辑,编译与输出
- 理解CUDA并行程序的规约思想
- 第12周-阅读、修改和运行关于交通工具类的程序,回答问题,以理解相关技术方法
- 现在理解的程序架构
- 深入理解CUDA线程层次以及关于设置线程数的思考
- Tesla架构下的CUDA程序优化
- SQL SERVER 2005/2008 中关于架构的理解(一)
- CUDA从入门到精通(二):第一个CUDA程序
- 关于程序执行、内存资源的理解
- 关于Ubuntu 16.04 安装CUDA和cuDnn环境变量的设置
- 第十三周项目二 阅读修改关于交通工具类的程序,回答问题,以理解相关内容(2)
- 关于lua中文教程上C API第一个lua与c交互示例程序编译 整理
- 第一个CUDA程序
- 关于微软的OBA架构理解(欢迎大家批评指正^_^)
- 深入理解CUDA线程层次以及关于设置线程数的思考
- 我人生中第一个真正的CUDA程序
- 一、编译,运行 并且 理解 你的 第一个 c程序
- [Vue][自定义事件]关于组件prop双向绑定的理解和程序调试
- SQL SERVER 2005/2008 中关于架构的理解(一)