// Site location: Home > Other
//
// CUDA, day 2: shared memory - sorting algorithm
//
// Posted 2016-03-17 09:32, 169 views
#include "cuda_runtime.h"

#include "device_launch_parameters.h"

#include <iostream>

#include <stdio.h>

using namespace std;

// Shorthand for unsigned int. Declared as a typedef rather than a macro so
// the name obeys normal scoping rules and behaves like a real type name
// (for example, u32(5) is a valid cast). func1 and func2 in this file walk
// 32 bit positions, so they assume unsigned int is 32 bits wide.
typedef unsigned int u32;

// Forward declaration; the definition appears after main().
// Bit-by-bit radix sort of the elements of sort_tmp found at offset tid with
// stride num_lists, using sort_tmp_0 and sort_tmp_1 as scratch space for the
// elements whose current bit is clear and set, respectively.
__device__ void func1(u32 * const sort_tmp,
                      const u32 num_lists,
                      const u32 num_elements,
                      const u32 tid,
                      u32 * const sort_tmp_0,
                      u32 * const sort_tmp_1);

// Forward declaration; the definition appears after main().
// Radix-sort variant that needs a single scratch buffer: elements whose
// current bit is set are staged in sort_tmp_0, while elements whose current
// bit is clear are compacted within sort_tmp itself.
__device__ void func2(u32 * const sort_tmp,
                      const u32 num_lists,
                      const u32 num_elements,
                      const u32 tid,
                      u32 * const sort_tmp_0);

// Host entry point. It launches nothing and calls neither func1 nor func2;
// the program simply exits with status 0.
int main()
{
    return 0;
}

// Sorts, in place and in ascending order, one interleaved sub-list of
// sort_tmp with a least-significant-bit-first binary radix sort over 32 bits.
//
// The sub-list handled by a call is the elements at indices
//   tid, tid + num_lists, tid + 2 * num_lists, ...   (all below num_elements).
// Every slot this function reads or writes, in any of the three buffers, has
// that same form, so calls made with distinct tid values in [0, num_lists)
// never touch the same slot.
//
// Parameters:
//   sort_tmp     - data to sort; overwritten with the sorted sub-list.
//   num_lists    - stride between consecutive elements of one sub-list.
//                  A value of 0 is treated as "nothing to sort".
//   num_elements - total number of entries in sort_tmp.
//   tid          - offset of the sub-list to sort. Presumably the calling
//                  thread's index, expected to be < num_lists -- confirm
//                  against the caller.
//   sort_tmp_0   - scratch for elements whose current bit is 0. Indexed like
//                  sort_tmp, so it must hold at least num_elements entries.
//   sort_tmp_1   - scratch for elements whose current bit is 1; same size
//                  requirement as sort_tmp_0.
//
// No barrier is issued here. If other threads read this sub-list afterwards,
// the caller is responsible for synchronizing first.
__device__ void func1(
    u32 * const sort_tmp,
    const u32 num_lists,
    const u32 num_elements,
    const u32 tid,
    u32 * const sort_tmp_0,
    u32 * const sort_tmp_1)
{
    // A zero stride would never advance the scan loop below.
    if (num_lists == 0)
    {
        return;
    }

    for (u32 bit = 0; bit < 32; bit++)
    {
        // Unsigned literal: shifting a signed 1 left by 31 would land in the
        // sign bit. The mask depends only on the pass, so compute it once.
        const u32 bit_mask = (1u << bit);

        // Write cursors into the two scratch lists; they advance by the same
        // stride as the source so the scratch data stays interleaved.
        u32 base_cnt_0 = 0;
        u32 base_cnt_1 = 0;

        // Partition this sub-list by the current bit. Bounding the real index
        // (not the stride base) keeps a ragged tail, where num_elements is
        // not a multiple of num_lists, from reading past the end.
        for (u32 idx = tid; idx < num_elements; idx += num_lists)
        {
            const u32 elem = sort_tmp[idx];

            if ((elem & bit_mask) != 0)
            {
                sort_tmp_1[base_cnt_1 + tid] = elem;
                base_cnt_1 += num_lists;
            }
            else
            {
                sort_tmp_0[base_cnt_0 + tid] = elem;
                base_cnt_0 += num_lists;
            }
        }

        // Copy back the zero-bit elements first ...
        for (u32 i = 0; i < base_cnt_0; i += num_lists)
        {
            sort_tmp[i + tid] = sort_tmp_0[i + tid];
        }

        // ... then the one-bit elements directly after them. Relative order
        // inside each group is preserved, which is what makes the
        // bit-by-bit passes add up to a full sort.
        for (u32 i = 0; i < base_cnt_1; i += num_lists)
        {
            sort_tmp[base_cnt_0 + i + tid] = sort_tmp_1[i + tid];
        }
    }
}

// Sorts, in place and in ascending order, one interleaved sub-list of
// sort_tmp with a least-significant-bit-first binary radix sort over 32 bits,
// using a single scratch buffer.
//
// The sub-list handled by a call is the elements at indices
//   tid, tid + num_lists, tid + 2 * num_lists, ...   (all below num_elements).
// On each pass the elements whose current bit is clear are compacted towards
// the front of the sub-list inside sort_tmp itself, the elements whose bit is
// set are staged in the scratch buffer, and the staged elements are then
// appended after the compacted ones.
//
// Parameters:
//   sort_tmp     - data to sort; overwritten with the sorted sub-list.
//   num_lists    - stride between consecutive elements of one sub-list.
//                  A value of 0 is treated as "nothing to sort".
//   num_elements - total number of entries in sort_tmp.
//   tid          - offset of the sub-list to sort. Presumably the calling
//                  thread's index, expected to be < num_lists -- confirm
//                  against the caller.
//   sort_tmp_0   - scratch buffer. Despite its name it stages the elements
//                  whose current bit is 1. Indexed like sort_tmp, so it must
//                  hold at least num_elements entries.
//
// No barrier is issued here. If other threads read this sub-list afterwards,
// the caller is responsible for synchronizing first.
__device__ void func2(
    u32 * const sort_tmp,
    const u32 num_lists,
    const u32 num_elements,
    const u32 tid,
    u32 * const sort_tmp_0)
{
    // A zero stride would never advance the scan loop below.
    if (num_lists == 0)
    {
        return;
    }

    for (u32 bit = 0; bit < 32; bit++)
    {
        // Unsigned literal: shifting a signed 1 left by 31 would land in the
        // sign bit.
        const u32 bit_mask = (1u << bit);

        // base_cnt_0: write cursor for zero-bit elements inside sort_tmp.
        // base_cnt_1: write cursor for one-bit elements inside sort_tmp_0.
        u32 base_cnt_0 = 0;
        u32 base_cnt_1 = 0;

        // Bounding the real index (not the stride base) keeps a ragged tail,
        // where num_elements is not a multiple of num_lists, from reading
        // past the end.
        for (u32 idx = tid; idx < num_elements; idx += num_lists)
        {
            const u32 elem = sort_tmp[idx];

            if ((elem & bit_mask) != 0)
            {
                sort_tmp_0[base_cnt_1 + tid] = elem;
                base_cnt_1 += num_lists;
            }
            else
            {
                // In-place compaction is safe: the write slot is never ahead
                // of the slot that was just read into elem.
                sort_tmp[base_cnt_0 + tid] = elem;
                base_cnt_0 += num_lists;
            }
        }

        // Append the staged one-bit elements after the zero-bit ones. The
        // bound is the number of staged elements (base_cnt_1); bounding by
        // base_cnt_0 would drop set-bit elements or copy stale scratch slots.
        for (u32 i = 0; i < base_cnt_1; i += num_lists)
        {
            sort_tmp[base_cnt_0 + i + tid] = sort_tmp_0[i + tid];
        }
    }
}
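// Content comes from user submissions and web aggregation; its accuracy is not guaranteed. If anything here infringes your rights, contact the site administrator to have it handled. (Click here to send me a message.)
// Tags: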