您的位置：首页 > 编程语言

【CUDA并行编程之四】矩阵相乘

2015-06-24 09:18 531 查看

前面介绍了基本的Cuda编程的相关知识，那么这一篇在此基础之上来看看GPU在处理数据计算上的高效能，我们拿矩阵相乘来作为例子。

1.CPU上执行矩阵相乘以及性能。

在CPU上进行矩阵相乘运算的代码：

mat_mul.cc:

[cpp] view
plaincopy

//a[i]*b[i] + c[i] = d[i]

#include<iostream>

#include<vector>

#include<map>

#include<fstream>

#include"wtime.h"

using namespace std;

const int N = 320;

//矩阵有两种表达的方法用二维矩阵或者用一维矩阵表示

int a[N+1][N+1],b[N+1][N+1],c[N+1][N+1],d[N+1][N+1];

int aa[(N+1)*(N+1)],bb[(N+1)*(N+1)],cc[(N+1)*(N+1)],dd[(N+1)*(N+1)];

void init()

{

for(int i=0;i<N;i++)

for(int j=0;j<N;j++)

{

a[i][j] = 1;

b[i][j] = 2;

c[i][j] = 3;

}

}

void init1()

{

for(int i=0;i<N;i++)

for(int j=0;j<N;j++)

{

aa[i*N+j] = 1;

bb[i*N+j] = 2;

cc[i*N+j] = 3;

}

}

void mul()

{

for(int i=0;i<N;i++)

for(int j=0;j<N;j++)

{

for(int k=0;k<N;k++)

{

d[i][j] += a[i][k] * b[k][j];

}

d[i][j] += c[i][j];

}

}

void mul1()

{

for(int i=0;i<N;i++)

for(int j=0;j<N;j++)

{

for(int k=0;k<N;k++)

{

dd[i*N+j] += aa[i*N+k] * bb[k*N+j];

}

dd[N*i+j] += cc[N*i+j];

}

}

void print()

{

ofstream fout;

fout.open("result.txt");

if(!fout)

{

perror("can not open the file");

}

for(int i=0;i<N;i++)

{

for(int j=0;j<N;j++)

{

fout<<d[i][j]<<" ";

}

fout<<endl;

}

fout.close();

}

int main()

{

init1();

double t = wtime();

mul1();

t = wtime()-t;

printf("computation timing = %10.10f sec\n",t);

//print();

return 0;

}

wtime.h:

[cpp] view
plaincopy

#ifndef _WTIME_

#define _WTIME_

double wtime();

#endif

wtime.cc:

[cpp] view
plaincopy

#include <stdio.h>

#include <sys/time.h>

#include <iostream>

#include <cstdlib>

double wtime(void)

{

double now_time;

struct timeval etstart;

struct timezone tzp;

if(gettimeofday(&etstart,&tzp)==-1)

{

perror("Error:calling gettimeofday() not successfully.\n");

}

now_time = ( (double)etstart.tv_sec ) + ((double)etstart.tv_usec) / 1000000.0;

return now_time;

}

#if 0

int main()

{

double time;

time = wtime();

printf("time of day = %10.4f\n",time);

return 0;

}

#endif

makefile:

[cpp] view
plaincopy

target:

g++ mat_mul.cc wtime.cc

./a.out

结果：

2.GPU上执行矩阵相乘以及性能。

代码：

cuda_mat_mul_v1.cu:

[cpp] view
plaincopy

//matrix multiplication with global memory

#include<iostream>

#include<fstream>

#include "wtime.h"

using namespace std;

const int BLOCK_SIZE = 16;

const int GRID_SIZE = 20;

//D = A * B + C;

__global__ void mat_mul(int *da,int *db,int *dc,int *dd,int N)

{

int row = blockIdx.y * blockDim.y + threadIdx.y;

int col = blockIdx.x * blockDim.x + threadIdx.x;

int sum = 0;

for(int i=0;i<N;i++)

{

sum += da[row*N + i] * db[row*i+col];

}

dd[row*N + col] = sum + dc[row*N + col];

}

int main()

{

int N = BLOCK_SIZE * GRID_SIZE;

int *ha,*hb,*hc,*hd;

int *da,*db,*dc,*dd;

double time;

ha = new int[N*N];

hb = new int[N*N];

hc = new int[N*N];

hd = new int[N*N];

cudaError_t err;

//initialize

for(int i=0;i<N;i++)

for(int j=0;j<N;j++)

{

ha[i*N+j] = 1;

hb[i*N+j] = 2;

hc[i*N+j] = 3;

}

//malloc</strong>

cudaMalloc(&da,N*N*sizeof(int));

cudaMalloc(&db,N*N*sizeof(int));

cudaMalloc(&dc,N*N*sizeof(int));

err = cudaMalloc(&dd,N*N*sizeof(int));

printf("Cuda Malloc C : %s\n",cudaGetErrorString(err));

//host to device

cudaMemcpy(da,ha,N*N*sizeof(int),cudaMemcpyHostToDevice);

cudaMemcpy(db,hb,N*N*sizeof(int),cudaMemcpyHostToDevice);

cudaMemcpy(dc,hc,N*N*sizeof(int),cudaMemcpyHostToDevice);

cudaMemcpy(dd,hd,N*N*sizeof(int),cudaMemcpyHostToDevice);

dim3 threadBlock(BLOCK_SIZE,BLOCK_SIZE);

dim3 grid(GRID_SIZE,GRID_SIZE);

//kernel

time = wtime();

mat_mul<<<grid,threadBlock>>>(da,db,dc,dd,N);

printf("Computation time is %10.10f\n",wtime()-time);

//device to host

cudaMemcpy(hd,dd,N*N*sizeof(int),cudaMemcpyDeviceToHost);

//print result to file

ofstream fout;

fout.open("result_v1.txt");

if(!fout)

{

cerr<<"open the file error"<<endl;

exit(-1);

}

for(int i=0;i<N;i++)

{

for(int j=0;j<N;j++)

{

fout<<hd[i*N+j]<<" ";

}

fout<<endl;

}

delete []ha;delete []hb;delete []hc;delete []hd;

cudaFree(da);cudaFree(db);cudaFree(dc);cudaFree(dd);

return 0;

}

cuda_wtime.cu:

[cpp] view
plaincopy

#ifndef _WTIME_

#define _WTIME_

double wtime();

#endif

cuda_wtime.cu:

[cpp] view
plaincopy

cu:

nvcc cuda_mat_mul_v1.cu cuda_wtime.cu

./a.out

结果：

3.计算性能对比：

矩阵大小	*16001600**	1200*1200	800*800	320*320
串行时间/s	30.9	11.49865	2.597987	0.162311
并行时间	grid=100/block=16	grid=75/block=16	grid=50/block=16	grid=20/block=16
kernel执行时间/s	0.0000319	0.0000309944	0.0000309944	0.0000231266
并行计算总时间（分配内存加+数据拷贝+计算）/s	0.70796	0.439213	0.310214	0.237676

可见，在矩阵规模大的时候非常明显的体现出了GPU强大的计算能力。

注明出处：/article/2003002.html

内容来自用户分享和网络整理，不保证内容的准确性，如有侵权内容，可联系管理员处理

标签：

相关文章推荐

新的分享

章节导航