您的位置:首页 > 运维架构

OpenCL实现矩阵转置

2016-10-04 14:57 417 查看
本例子主要参考OpenCL实战书中提到的方法。

矩阵转置就是行列互换,但是作为并行计算的转置,首先考虑分块,由不同的工作项同时对不同的矩阵块进行行列互换,以此来提高矩阵转置的效率。具体操作如下:



这是一个8*8的矩阵,将其分为2*2的矩阵块。一共有16个矩阵块,分为对角线上的矩阵块和非对角线上的矩阵块。

对角线上的矩阵块转置以后就是矩阵块内部行列互换,如图红色十字线所示;非对角线上的矩阵块是与自己对称的矩阵块行列互换,如图中黄色数字(黄色横线标出)与黄色数字(黄色竖线标出)互换,蓝色数字与蓝色数字互换。

所以我们需要的工作项个数为上三角矩阵块的个数,即4*(4+1)/2=10个;获得对角线数据的工作项只需要负责块内部行列互换;计算非对角线矩阵块的工作项需要获得两个对称的矩阵块,完成块间的行列互换。

host.c

#define PROGRAM_FILE "kernel.cl"
#define KERNEL_FUNC "transpose"

#define MATRIX_DIM 64

#include<stdio.h>
#include<stdlib.h>
#include<string.h>
#include<CL/cl.h>
#pragma warning( disable : 4996 )
int main() {
cl_platform_id platform;
cl_device_id device;
cl_context context;
cl_program program;
size_t log_size;
char *log_buffer;
cl_kernel kernel;
size_t error=0;
error=clGetPlatformIDs(1, &platform, NULL);
if (error != CL_SUCCESS) {
printf("Couldn't check avalable platform!\n");
return -1;
}
error = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL);
if (error != CL_SUCCESS) {
printf("Couldn't check avalable device of GPU!\n");
return -1;
}
context = clCreateContext(NULL, 1, &device, NULL, NULL, &error);
if (error != CL_SUCCESS) {
printf("Couldn't create context for the device!\n");
return -1;
}
FILE *fp;
char *program_buffer;
size_t program_size;
fp = fopen(PROGRAM_FILE, "rb");
if (fp == NULL) {
perror("Couldn't find the program file");
return -1;
}
fseek(fp, 0, SEEK_END);
program_size = ftell(fp);
//fseek(fp, 0, SEEK_SET);
rewind(fp);
program_buffer = (char *)malloc(program_size + 1);
program_buffer[program_size] = '\0';
error=fread(program_buffer, sizeof(char), program_size, fp);
fclose(fp);
//error=feof(fp);
//error = ferror(fp);
if (error == 0) {
printf("Read the kernel.cl failed!\n");
return -1;
}

program=clCreateProgramWithSource(context, 1, (const char **)&program_buffer, &program_size, &error);
if (error != CL_SUCCESS) {
printf("Create program failed!\n");
getchar();
return -1;
}
//  free(program_buffer);
error=clBuildProgram(program, 1, &device, NULL, NULL, NULL);
if (error != CL_SUCCESS) {
clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size);
log_buffer = (char *)malloc(log_size+1);
clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, log_size + 1, log_buffer, NULL);
printf("%s\n", log_buffer);
free(log_buffer);
}
kernel=clCreateKernel(program, KERNEL_FUNC, &error);
if (error != CL_SUCCESS) {
printf("Couldn't create avalable kernel!\n");
return -1;
}
cl_command_queue queue=clCreateCommandQueue(context, device, NULL, &error);
if (error != CL_SUCCESS) {
printf("Create command queue failed!\n");
return -1;
}
//初始化输入参数
float data[MATRIX_DIM][MATRIX_DIM];
size_t global_size;
size_t local_size;
size_t size = 16;
cl_ulong mem_size;
for (int i = 0; i < MATRIX_DIM; i++) {
for (int j = 0; j < MATRIX_DIM; j++) {
data[i][j] = 1.0f*i*MATRIX_DIM + j;
}
}
global_size = (MATRIX_DIM / 4)* (MATRIX_DIM / 4 + 1) / 2;
clGetDeviceInfo(device, CL_DEVICE_LOCAL_MEM_SIZE,
sizeof(mem_size), &mem_size, NULL);
local_size = global_size;
cl_mem data_buffer = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, sizeof(data), data, &error);
error = clSetKernelArg(kernel, 0, sizeof(cl_mem), &data_buffer);
error |= clSetKernelArg(kernel, 1, (size_t)mem_size, NULL);
error |= clSetKernelArg(kernel, 2, sizeof(size), &size);
if (error != CL_SUCCESS) {
printf("Couldn't set a kernel argument!\n");
return -1;
}
error = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global_size, &local_size, 0, NULL, NULL);
if (error != CL_SUCCESS) {
printf("Couldn't enqueue the kernel!\n");
return -1;
}
error = clEnqueueReadBuffer(queue, data_buffer, CL_TRUE, 0, sizeof(data), data, 0, NULL, NULL);
if (error != CL_SUCCESS) {
printf("Couldn't read the buffer!\n");
return -1;
}
int check = 1;
for (int i = 0; i<MATRIX_DIM; i++) {
for (int j = 0; j<MATRIX_DIM; j++) {
if (data[i][j] != 1.0*j*MATRIX_DIM + i) {
check = 0;
break;
}
}
}
if (check)
printf("Transpose check succeeded.\n");
else
printf("Transpose check failed.\n");

//释放资源
clReleaseMemObject(data_buffer);
clReleaseKernel(kernel);
clReleaseCommandQueue(queue);
clReleaseProgram(program);
clReleaseContext(context);
return 0;
}


kernel.cl

__kernel void transpose(__global float4 *g_mat,
__local float4 *l_mat, uint size) {

__global float4 *src, *dst;

int col = get_global_id(0);
int row = 0;
//确定行号
while (col >= size) {
col -= size--;
row++;
}
//确定列号
col += row;
size += row;

src = g_mat + row * size * 4 + col;
l_mat += get_local_id(0) * 8;
l_mat[0] = src[0];
l_mat[1] = src[size];
l_mat[2] = src[2 * size];
l_mat[3] = src[3 * size];
//对角线上的数据
if (row == col) {
//行列互换
src[0] =
(float4)(l_mat[0].x, l_mat[1].x, l_mat[2].x, l_mat[3].x);
src[size] =
(float4)(l_mat[0].y, l_mat[1].y, l_mat[2].y, l_mat[3].y);
src[2 * size] =
(float4)(l_mat[0].z, l_mat[1].z, l_mat[2].z, l_mat[3].z);
src[3 * size] =
(float4)(l_mat[0].w, l_mat[1].w, l_mat[2].w, l_mat[3].w);
}
//非对角线数据
else {
dst = g_mat + col * size * 4 + row;
l_mat[4] = dst[0];
l_mat[5] = dst[size];
l_mat[6] = dst[2 * size];
l_mat[7] = dst[3 * size];

dst[0] =
(float4)(l_mat[0].x, l_mat[1].x, l_mat[2].x, l_mat[3].x);
dst[size] =
(float4)(l_mat[0].y, l_mat[1].y, l_mat[2].y, l_mat[3].y);
dst[2 * size] =
(float4)(l_mat[0].z, l_mat[1].z, l_mat[2].z, l_mat[3].z);
dst[3 * size] =
(float4)(l_mat[0].w, l_mat[1].w, l_mat[2].w, l_mat[3].w);

src[0] =
(float4)(l_mat[4].x, l_mat[5].x, l_mat[6].x, l_mat[7].x);
src[size] =
(float4)(l_mat[4].y, l_mat[5].y, l_mat[6].y, l_mat[7].y);
src[2 * size] =
(float4)(l_mat[4].z, l_mat[5].z, l_mat[6].z, l_mat[7].z);
src[3 * size] =
(float4)(l_mat[4].w, l_mat[5].w, l_mat[6].w, l_mat[7].w);
}
}


矩阵转置方法二

http://blog.csdn.net/u011028771/article/details/52781766
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签:  opencl 并行计算