CUDA 矩阵乘法
2013-12-04 16:33
211 查看
原始矩阵乘法
#include <stdio.h>
#include <cuda.h>
#define BLOCK_SIZE 1
// Descriptor for a dense matrix of floats.
// The code in this file addresses element (row, col) as
// elements[row * width + col], i.e. rows are stored one after another.
typedef struct {
    int    width;     // number of columns (row stride, in elements)
    int    height;    // number of rows
    float* elements;  // pointer to the first of width * height values
} Matrix;
__global__ void MatMulKernel(const Matrix, const Matrix, Matrix);

// Computes C = A * B on the GPU.
//
// A, B and C describe host matrices; A.elements and B.elements are read and
// C.elements is overwritten with the product. The shapes must satisfy
// A.width == B.height, C.height == A.height and C.width == B.width.
//
// The function returns void, so failures (bad shapes or any CUDA error) are
// reported on stderr and leave C.elements without a valid result.
void MatMul(const Matrix A, const Matrix B, Matrix C)
{
    // Reject negative sizes and shapes for which the product is undefined;
    // launching anyway would read or write outside the device buffers.
    if (A.width < 0 || A.height < 0 || B.width < 0 || B.height < 0 ||
        A.width != B.height || C.height != A.height || C.width != B.width) {
        fprintf(stderr,
                "MatMul: incompatible shapes (%dx%d) * (%dx%d) -> (%dx%d)\n",
                A.height, A.width, B.height, B.width, C.height, C.width);
        return;
    }
    // Nothing to compute, and a grid with a zero dimension is not launchable.
    if (C.width == 0 || C.height == 0)
        return;

    // Widen to size_t before multiplying so large matrices do not overflow int.
    const size_t bytesA = (size_t)A.width * (size_t)A.height * sizeof(float);
    const size_t bytesB = (size_t)B.width * (size_t)B.height * sizeof(float);
    const size_t bytesC = (size_t)C.width * (size_t)C.height * sizeof(float);

    // Device-side descriptors. Pointers start out null so that the cleanup at
    // the end is valid no matter which step failed.
    Matrix d_A, d_B, d_C;
    d_A.width = A.width; d_A.height = A.height; d_A.elements = NULL;
    d_B.width = B.width; d_B.height = B.height; d_B.elements = NULL;
    d_C.width = C.width; d_C.height = C.height; d_C.elements = NULL;

    // Each step runs only if every previous one succeeded.
    cudaError_t err = cudaMalloc((void**)&d_A.elements, bytesA);
    if (err == cudaSuccess)
        err = cudaMemcpy(d_A.elements, A.elements, bytesA,
                         cudaMemcpyHostToDevice);
    if (err == cudaSuccess)
        err = cudaMalloc((void**)&d_B.elements, bytesB);
    if (err == cudaSuccess)
        err = cudaMemcpy(d_B.elements, B.elements, bytesB,
                         cudaMemcpyHostToDevice);
    if (err == cudaSuccess)
        err = cudaMalloc((void**)&d_C.elements, bytesC);

    if (err == cudaSuccess) {
        // Round the grid up so the tail of C is covered when a dimension is
        // not a multiple of BLOCK_SIZE; the kernel discards the surplus threads.
        dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);
        dim3 dimGrid((C.width + dimBlock.x - 1) / dimBlock.x,
                     (C.height + dimBlock.y - 1) / dimBlock.y);
        MatMulKernel<<<dimGrid, dimBlock>>>(d_A, d_B, d_C);
        // A launch does not return a status; an invalid configuration
        // (for example a grid that is too large) is reported here.
        err = cudaGetLastError();
    }

    // Blocking copy of the result back to the host; errors raised while the
    // kernel was running are reported by this call.
    if (err == cudaSuccess)
        err = cudaMemcpy(C.elements, d_C.elements, bytesC,
                         cudaMemcpyDeviceToHost);

    if (err != cudaSuccess)
        fprintf(stderr, "MatMul: CUDA error: %s\n", cudaGetErrorString(err));

    // Always release device memory; cudaFree on a null pointer does nothing.
    cudaFree(d_A.elements);
    cudaFree(d_B.elements);
    cudaFree(d_C.elements);
}

// Matrix multiplication kernel called by MatMul().
//
// Launch layout: a 2D grid of 2D blocks in which x indexes columns of C and
// y indexes rows of C. Each thread computes one element of C by accumulating
// the dot product of one row of A with one column of B. No shared memory is
// used. A.elements, B.elements and C.elements must be device pointers.
__global__ void MatMulKernel(Matrix A, Matrix B, Matrix C)
{
    const int row = blockIdx.y * blockDim.y + threadIdx.y;
    const int col = blockIdx.x * blockDim.x + threadIdx.x;

    // Threads that fall outside C (grid rounded up) have nothing to do.
    if (row >= C.height || col >= C.width)
        return;

    float Cvalue = 0.0f;
    // Offsets are computed in size_t so row * width cannot overflow int.
    for (int e = 0; e < A.width; ++e)
        Cvalue += A.elements[(size_t)row * A.width + e]
                * B.elements[(size_t)e * B.width + col];
    C.elements[(size_t)row * C.width + col] = Cvalue;
}
优化算法（注：下面的代码仍是每个线程计算 C 的一个元素的朴素实现，并未使用共享内存分块）
#include <stdio.h>
#include <cuda.h>
#define BLOCK_SIZE 1
// Descriptor for a dense matrix of floats.
// The code in this file addresses element (row, col) as
// elements[row * width + col], i.e. rows are stored one after another.
typedef struct {
    int    width;     // number of columns (row stride, in elements)
    int    height;    // number of rows
    float* elements;  // pointer to the first of width * height values
} Matrix;
__global__ void MatMulKernel(const Matrix, const Matrix, Matrix);

// Computes C = A * B on the GPU.
//
// A, B and C describe host matrices; A.elements and B.elements are read and
// C.elements is overwritten with the product. The shapes must satisfy
// A.width == B.height, C.height == A.height and C.width == B.width.
//
// The function returns void, so failures (bad shapes or any CUDA error) are
// reported on stderr and leave C.elements without a valid result.
void MatMul(const Matrix A, const Matrix B, Matrix C)
{
    // Reject negative sizes and shapes for which the product is undefined;
    // launching anyway would read or write outside the device buffers.
    if (A.width < 0 || A.height < 0 || B.width < 0 || B.height < 0 ||
        A.width != B.height || C.height != A.height || C.width != B.width) {
        fprintf(stderr,
                "MatMul: incompatible shapes (%dx%d) * (%dx%d) -> (%dx%d)\n",
                A.height, A.width, B.height, B.width, C.height, C.width);
        return;
    }
    // Nothing to compute, and a grid with a zero dimension is not launchable.
    if (C.width == 0 || C.height == 0)
        return;

    // Widen to size_t before multiplying so large matrices do not overflow int.
    const size_t bytesA = (size_t)A.width * (size_t)A.height * sizeof(float);
    const size_t bytesB = (size_t)B.width * (size_t)B.height * sizeof(float);
    const size_t bytesC = (size_t)C.width * (size_t)C.height * sizeof(float);

    // Device-side descriptors. Pointers start out null so that the cleanup at
    // the end is valid no matter which step failed.
    Matrix d_A, d_B, d_C;
    d_A.width = A.width; d_A.height = A.height; d_A.elements = NULL;
    d_B.width = B.width; d_B.height = B.height; d_B.elements = NULL;
    d_C.width = C.width; d_C.height = C.height; d_C.elements = NULL;

    // Each step runs only if every previous one succeeded.
    cudaError_t err = cudaMalloc((void**)&d_A.elements, bytesA);
    if (err == cudaSuccess)
        err = cudaMemcpy(d_A.elements, A.elements, bytesA,
                         cudaMemcpyHostToDevice);
    if (err == cudaSuccess)
        err = cudaMalloc((void**)&d_B.elements, bytesB);
    if (err == cudaSuccess)
        err = cudaMemcpy(d_B.elements, B.elements, bytesB,
                         cudaMemcpyHostToDevice);
    if (err == cudaSuccess)
        err = cudaMalloc((void**)&d_C.elements, bytesC);

    if (err == cudaSuccess) {
        // Round the grid up so the tail of C is covered when a dimension is
        // not a multiple of BLOCK_SIZE; the kernel discards the surplus threads.
        dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);
        dim3 dimGrid((C.width + dimBlock.x - 1) / dimBlock.x,
                     (C.height + dimBlock.y - 1) / dimBlock.y);
        MatMulKernel<<<dimGrid, dimBlock>>>(d_A, d_B, d_C);
        // A launch does not return a status; an invalid configuration
        // (for example a grid that is too large) is reported here.
        err = cudaGetLastError();
    }

    // Blocking copy of the result back to the host; errors raised while the
    // kernel was running are reported by this call.
    if (err == cudaSuccess)
        err = cudaMemcpy(C.elements, d_C.elements, bytesC,
                         cudaMemcpyDeviceToHost);

    if (err != cudaSuccess)
        fprintf(stderr, "MatMul: CUDA error: %s\n", cudaGetErrorString(err));

    // Always release device memory; cudaFree on a null pointer does nothing.
    cudaFree(d_A.elements);
    cudaFree(d_B.elements);
    cudaFree(d_C.elements);
}

// Matrix multiplication kernel called by MatMul().
//
// Launch layout: a 2D grid of 2D blocks in which x indexes columns of C and
// y indexes rows of C. Each thread computes one element of C by accumulating
// the dot product of one row of A with one column of B. No shared memory is
// used. A.elements, B.elements and C.elements must be device pointers.
__global__ void MatMulKernel(Matrix A, Matrix B, Matrix C)
{
    const int row = blockIdx.y * blockDim.y + threadIdx.y;
    const int col = blockIdx.x * blockDim.x + threadIdx.x;

    // Threads that fall outside C (grid rounded up) have nothing to do.
    if (row >= C.height || col >= C.width)
        return;

    float Cvalue = 0.0f;
    // Offsets are computed in size_t so row * width cannot overflow int.
    for (int e = 0; e < A.width; ++e)
        Cvalue += A.elements[(size_t)row * A.width + e]
                * B.elements[(size_t)e * B.width + col];
    C.elements[(size_t)row * C.width + col] = Cvalue;
}
优化算法
相关文章推荐
- 计步器
- ubuntu下卸载openjdk并安装sunjdk(修改java版本/环境变量)
- spring mvc多视图解释,配置详解
- oracle表空间和用户
- 数组的冒泡排序.
- 随机取数
- WHAT I WANT TO DO ?
- 图解SVD分解
- Algorithms - 离散概率值(discrete) 和 重置\洗牌(shuffle) 算法 及 代码
- 第十五周项目3--带姓名的成绩排序(按成绩升序排列)
- 最大的那个值
- linux 添加PATH环境变量
- Tomcat访问日志
- Algorithms - 离散概率值(discrete) 和 重置\洗牌(shuffle) 算法 及 代码
- 解密:顺丰是一家什么样的公司?
- SharePoint 门户添加内网域名
- 一些xmpp插件安装,
- 使用CocoaPods来做iOS程序的包依赖管理
- qtcreator inet_addr
- CScrollView的使用