您的位置:首页 > 产品设计 > UI/UE

cuda 常量内存示例,头文件不知道有什么诡异问题

2016-03-24 15:43 1206 查看
#include "cuda_runtime.h"

#include "device_launch_parameters.h"

#include "device_functions.h"

#include "device_atomic_functions.h"

#include <iostream>

#include <stdio.h>

#include "crt\func_macro.h"

#include "assert.h"

#include "conio.h"

using namespace std;

// CUDA_CALL: wrap every CUDA runtime call; on failure print the error string
// and code, reset the device, then abort via assert.
// Fix: wrapped in do{...}while(0) so the macro expands to a single statement
// and is safe inside un-braced if/else (the original bare {...} breaks
// `if (c) CUDA_CALL(x); else ...`).
// NOTE(review): assert(0) compiles away under NDEBUG, so a release build
// would continue past the error — consider exit(EXIT_FAILURE) instead.
#define CUDA_CALL(x) do{const cudaError_t a=(x);if(a!=cudaSuccess){printf("\nCUDAError:%s(err_num= %d) \n",cudaGetErrorString(a),a);cudaDeviceReset();assert(0);}}while(0)

// Number of iterations of the bitwise-op loop inside each benchmark kernel.
#define KERNEL_LOOP 65536

// Benchmark operands placed in GPU constant memory. The _const kernel reads
// these four values; the _literal kernel uses the same bit patterns as
// immediates, so the two kernels compute identical results and differ only
// in where the operands come from.
__constant__ static const int const_data_01 = 0x55555555;

__constant__ static const int const_data_02 = 0x77777777;

__constant__ static const int const_data_03 = 0x33333333;

__constant__ static const int const_data_04 = 0x11111111;

// Benchmark kernel: each thread repeats four bitwise ops whose operands are
// compile-time literals, KERNEL_LOOP times, then stores the final value at
// its own index. Expects a 1-D launch covering at least num_elements threads;
// out-of-range threads return immediately.
__global__ void const_test_gpu_literal(int * const data, const int num_elements)
{
    const int tid = (blockDim.x * blockIdx.x) + threadIdx.x;
    if (tid >= num_elements)
        return;

    int value = 0x55555555;
    for (int pass = 0; pass < KERNEL_LOOP; pass++)
    {
        value ^= 0x55555555;
        value |= 0x77777777;
        value &= 0x33333333;
        value |= 0x11111111;
    }
    data[tid] = value;
}

// Benchmark kernel: identical computation to const_test_gpu_literal, but the
// four operands are read from __constant__ memory (const_data_01..04) instead
// of being immediates. Expects a 1-D launch covering at least num_elements
// threads; out-of-range threads return immediately.
__global__ void const_test_gpu_const(int * const data, const int num_elements)
{
    const int tid = (blockDim.x * blockIdx.x) + threadIdx.x;
    if (tid >= num_elements)
        return;

    int value = const_data_01;
    for (int pass = 0; pass < KERNEL_LOOP; pass++)
    {
        value ^= const_data_01;
        value |= const_data_02;
        value &= const_data_03;
        value |= const_data_04;
    }
    data[tid] = value;
}

// Block until the user presses a key so the console window stays open.
// Fix: the original stored getch()'s result in a local `ch` that was never
// read (unused-variable warning); the result is now explicitly discarded.
// NOTE(review): getch() comes from the non-standard <conio.h> (MSVC spells
// it _getch) — this helper is Windows-console specific.
__host__ void wait_exit(void)
{
    printf("\nPress any key to exit");
    (void)getch();
}

// If a CUDA error is pending, print it as "<prefix><error string><postfix>",
// reset the device, wait for a keypress, and terminate the process.
// cudaPeekAtLastError() inspects the sticky error without clearing it;
// cudaGetLastError() then retrieves (and clears) the same error for printing.
__host__ void cuda_error_check(
    const char * prefix,
    const char * postfix)
{
    if (cudaPeekAtLastError() == cudaSuccess)
        return;

    printf("\n%s%s%s", prefix, cudaGetErrorString(cudaGetLastError()), postfix);
    cudaDeviceReset();
    wait_exit();
    exit(1);
}

// Benchmark driver: for every CUDA device, run max_runs timed passes of the
// literal-operand kernel and the constant-memory kernel and report which was
// faster. Each kernel gets an untimed warm-up launch so one-time startup cost
// is excluded from the measurement.
//
// Fixes relative to the original:
//  * the second timed pair launched const_test_gpu_literal again, so the
//    "constant" timing never actually measured const_test_gpu_const;
//  * the printf difference argument used assignment (delta_time1 = delta_time2)
//    instead of subtraction, clobbering a timing before printing it, and the
//    Const/Literal labels were paired with the wrong variables;
//  * data_gpu was cudaMalloc'd every run but never freed (leaked each run).
__host__ void gpu_kernel(void)
{
    const int num_elements = (128 * 1024);
    const int num_threads = 256;
    // Ceil-division so the grid covers a num_elements not divisible by block size.
    const int num_blocks = (num_elements + (num_threads - 1)) / num_threads;
    const int num_bytes = num_elements * sizeof(int);
    int max_device_num;
    const int max_runs = 6;

    CUDA_CALL(cudaGetDeviceCount(&max_device_num));

    for (int device_num = 0; device_num < max_device_num; device_num++)
    {
        CUDA_CALL(cudaSetDevice(device_num));

        for (int num_test = 0; num_test < max_runs; num_test++)
        {
            int *data_gpu;
            cudaEvent_t kernel_start1, kernel_stop1;
            cudaEvent_t kernel_start2, kernel_stop2;
            float delta_time1 = 0.0F, delta_time2 = 0.0F;
            struct cudaDeviceProp device_prop;
            char device_prefix[261];

            CUDA_CALL(cudaMalloc(&data_gpu, num_bytes));
            CUDA_CALL(cudaEventCreate(&kernel_start1));
            CUDA_CALL(cudaEventCreate(&kernel_start2));
            // Blocking-sync stop events: the host thread sleeps instead of
            // spinning while waiting in cudaEventSynchronize.
            CUDA_CALL(cudaEventCreateWithFlags(&kernel_stop1, cudaEventBlockingSync));
            CUDA_CALL(cudaEventCreateWithFlags(&kernel_stop2, cudaEventBlockingSync));
            CUDA_CALL(cudaGetDeviceProperties(&device_prop, device_num));
            sprintf(device_prefix, "ID:%d %s:", device_num, device_prop.name);

            // --- literal-operand kernel: warm-up, then timed run ---
            const_test_gpu_literal << <num_blocks, num_threads >> >(data_gpu, num_elements);
            cuda_error_check("Error ", "return from literal startup kernel");

            CUDA_CALL(cudaEventRecord(kernel_start1, 0));
            const_test_gpu_literal << <num_blocks, num_threads >> >(data_gpu, num_elements);
            cuda_error_check("Error ", "return from literal runtime kernel");
            CUDA_CALL(cudaEventRecord(kernel_stop1, 0));
            CUDA_CALL(cudaEventSynchronize(kernel_stop1));
            CUDA_CALL(cudaEventElapsedTime(&delta_time1, kernel_start1, kernel_stop1));

            // --- constant-memory kernel: warm-up, then timed run ---
            // BUG FIX: the original launched const_test_gpu_literal here too.
            const_test_gpu_const << <num_blocks, num_threads >> >(data_gpu, num_elements);
            cuda_error_check("Error ", "return from constant startup kernel");

            CUDA_CALL(cudaEventRecord(kernel_start2, 0));
            const_test_gpu_const << <num_blocks, num_threads >> >(data_gpu, num_elements);
            cuda_error_check("Error ", "return from constant runtime kernel");
            CUDA_CALL(cudaEventRecord(kernel_stop2, 0));
            CUDA_CALL(cudaEventSynchronize(kernel_stop2));
            CUDA_CALL(cudaEventElapsedTime(&delta_time2, kernel_start2, kernel_stop2));

            // delta_time1 = literal kernel, delta_time2 = constant kernel.
            if (delta_time1 > delta_time2)
            {
                printf("\n%sConst version is faster by: %.2fms (Const=%.2fms vs. Literal=%.2fms)",
                       device_prefix, delta_time1 - delta_time2, delta_time2, delta_time1);
            }
            else
            {
                printf("\n%sLiteral version is faster by: %.2fms (Const=%.2fms vs. Literal=%.2fms)",
                       device_prefix, delta_time2 - delta_time1, delta_time2, delta_time1);
            }

            CUDA_CALL(cudaFree(data_gpu)); // BUG FIX: was leaked every run
            CUDA_CALL(cudaEventDestroy(kernel_start1));
            CUDA_CALL(cudaEventDestroy(kernel_start2));
            CUDA_CALL(cudaEventDestroy(kernel_stop1));
            CUDA_CALL(cudaEventDestroy(kernel_stop2));
        }

        CUDA_CALL(cudaDeviceReset());
        printf("\n");
    }

    wait_exit();
}

// Program entry point: run the constant-memory benchmark, then exit cleanly.
int main(void)
{
    gpu_kernel();
    return 0;
}
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: