您的位置:首页 > 其它

GPU计算矩阵相乘(未优化)

2012-07-28 14:23 453 查看
#include <stdio.h>
#include <stdlib.h>

#define LEN     10
#define BLOCK   10
#define THREAD  32

// Naive integer matrix multiplication: C = A * B.
//
// A, B and C are device pointers to LEN x LEN matrices stored row-major in
// flat arrays of LEN * LEN ints (element (i, j) lives at index i * LEN + j).
//
// Launch layout: 1D grid of 1D blocks; only the x dimension is used. Each
// thread starts at its flat global index and advances by the total number of
// launched threads, so every output element is computed no matter how many
// blocks/threads the caller picked (previously elements beyond
// gridDim.x * blockDim.x were silently left unwritten).
//
// No shared memory is used. C must not overlap A or B, since other threads
// read A and B while C is being written.
__global__ void multi_0(int* A, int* B, int* C)
{
    const int total  = LEN * LEN;
    const int stride = gridDim.x * blockDim.x;

    for (int id = blockIdx.x * blockDim.x + threadIdx.x; id < total; id += stride)
    {
        // Recover the (row, column) of the output element from the flat index.
        const int i = id / LEN;
        const int j = id % LEN;

        // Dot product of row i of A with column j of B.
        int sum = 0;
        for (int k = 0; k < LEN; ++k)
        {
            sum += A[i * LEN + k] * B[k * LEN + j];
        }
        C[id] = sum;
    }
}

// Terminates the program with a diagnostic if a CUDA runtime call failed.
// `what` names the operation being checked and is echoed in the message.
static void checkCuda(cudaError_t status, const char* what)
{
    if (status != cudaSuccess)
    {
        fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

// Host driver: fills two LEN x LEN matrices with ones, multiplies them on the
// GPU with multi_0, prints the product as a tab-separated grid, then waits
// for a key press before exiting.
//
// Returns 0 on success; exits with a non-zero status if a host allocation or
// any CUDA call fails.
int main()
{
    int *A = NULL, *B = NULL, *C = NULL;   // device buffers

    const int    count   = LEN * LEN;
    const size_t memSize = (size_t)count * sizeof(int);

    // Host buffers.
    int* a = (int*)malloc(memSize);
    int* b = (int*)malloc(memSize);
    int* c = (int*)malloc(memSize);
    if (a == NULL || b == NULL || c == NULL)
    {
        fprintf(stderr, "host allocation of %lu bytes failed\n", (unsigned long)memSize);
        free(a);
        free(b);
        free(c);
        return EXIT_FAILURE;
    }

    checkCuda(cudaMalloc((void**)&A, memSize), "cudaMalloc(A)");
    checkCuda(cudaMalloc((void**)&B, memSize), "cudaMalloc(B)");
    checkCuda(cudaMalloc((void**)&C, memSize), "cudaMalloc(C)");

    for (int i = 0; i < count; ++i)
    {
        // Alternative (disabled) test pattern: a[i] = i + 1; b[i] = (i << 1) + 1;
        // Note the parentheses: `i<<1 + 1` would parse as `i << 2`.
        a[i] = 1;
        b[i] = 1;
    }

    checkCuda(cudaMemcpy(A, a, memSize, cudaMemcpyHostToDevice), "copy A to device");
    checkCuda(cudaMemcpy(B, b, memSize, cudaMemcpyHostToDevice), "copy B to device");

    // Size the grid from the problem (ceil-div) instead of a fixed block
    // count, so every output element has a thread even if LEN changes.
    const int blocks = (count + THREAD - 1) / THREAD;
    multi_0<<<blocks, THREAD>>>(A, B, C);
    checkCuda(cudaGetLastError(), "multi_0 launch");          // bad launch configuration
    checkCuda(cudaDeviceSynchronize(), "multi_0 execution");  // faults raised inside the kernel

    checkCuda(cudaMemcpy(c, C, memSize, cudaMemcpyDeviceToHost), "copy C to host");

    for (int i = 0; i < LEN; ++i)
    {
        for (int j = 0; j < LEN; ++j)
        {
            printf("%d\t", c[i * LEN + j]);
        }
        printf("\n");
    }

    free(a);
    free(b);
    free(c);
    checkCuda(cudaFree(A), "cudaFree(A)");
    checkCuda(cudaFree(B), "cudaFree(B)");
    checkCuda(cudaFree(C), "cudaFree(C)");

    // Keep the console window open until the user presses a key.
    getchar();
    return 0;
}


 
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签:  优化 thread include c