
[CUDA Parallel Programming, Part 4] Matrix Multiplication

2015-06-24 09:18
The previous posts covered the basics of CUDA programming. Building on that, this post looks at how efficiently the GPU handles numerical computation, using matrix multiplication as the example.
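In matrix form, the operation computed throughout this post is D = A * B + C; element by element,

    d_{ij} = \sum_{k=0}^{N-1} a_{ik} \, b_{kj} + c_{ij},

which is exactly what the loops in the code below implement.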

1. Matrix multiplication on the CPU and its performance

Code for matrix multiplication on the CPU:

mat_mul.cc:


// D = A * B + C (each element d[i][j] is the dot product of row i of A and
// column j of B, plus c[i][j])
#include <iostream>
#include <vector>
#include <map>
#include <fstream>
#include <cstdio>
#include "wtime.h"

using namespace std;

const int N = 320;

// A matrix can be stored two ways: as a 2D array, or flattened into a 1D array
// where element (i,j) lives at index i*N + j.
int a[N+1][N+1], b[N+1][N+1], c[N+1][N+1], d[N+1][N+1];
int aa[(N+1)*(N+1)], bb[(N+1)*(N+1)], cc[(N+1)*(N+1)], dd[(N+1)*(N+1)];

void init()
{
    for(int i = 0; i < N; i++)
        for(int j = 0; j < N; j++)
        {
            a[i][j] = 1;
            b[i][j] = 2;
            c[i][j] = 3;
        }
}

void init1()
{
    for(int i = 0; i < N; i++)
        for(int j = 0; j < N; j++)
        {
            aa[i*N+j] = 1;
            bb[i*N+j] = 2;
            cc[i*N+j] = 3;
        }
}

void mul()
{
    for(int i = 0; i < N; i++)
        for(int j = 0; j < N; j++)
        {
            for(int k = 0; k < N; k++)
            {
                d[i][j] += a[i][k] * b[k][j];
            }
            d[i][j] += c[i][j];
        }
}

void mul1()
{
    for(int i = 0; i < N; i++)
        for(int j = 0; j < N; j++)
        {
            for(int k = 0; k < N; k++)
            {
                dd[i*N+j] += aa[i*N+k] * bb[k*N+j];
            }
            dd[N*i+j] += cc[N*i+j];
        }
}

void print()
{
    ofstream fout;
    fout.open("result.txt");
    if(!fout)
    {
        perror("can not open the file");
    }
    for(int i = 0; i < N; i++)
    {
        for(int j = 0; j < N; j++)
        {
            fout << d[i][j] << " ";
        }
        fout << endl;
    }
    fout.close();
}

int main()
{
    init1();

    double t = wtime();
    mul1();
    t = wtime() - t;
    printf("computation timing = %10.10f sec\n", t);

    //print();

    return 0;
}
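One note on the flattened version: the inner loop of mul1() reads bb with stride N (column by column), which is unfriendly to the cache. A cache-friendlier variant is sketched below; it is my illustration rather than part of the original post (the name mul1_ikj is made up), and it assumes the same global arrays. Moving k to the middle loop lets both bb and dd be traversed row-wise:

void mul1_ikj()
{
    for(int i = 0; i < N; i++)
    {
        for(int k = 0; k < N; k++)
        {
            int aik = aa[i*N + k];                 // reuse one element of aa across the inner loop
            for(int j = 0; j < N; j++)
                dd[i*N + j] += aik * bb[k*N + j];  // row-wise access to bb and dd
        }
        for(int j = 0; j < N; j++)
            dd[i*N + j] += cc[i*N + j];            // add C once per element of the row
    }
}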

wtime.h:

#ifndef _WTIME_

#define _WTIME_



double wtime();



#endif

wtime.cc:

#include <stdio.h>
#include <sys/time.h>
#include <iostream>
#include <cstdlib>

double wtime(void)
{
    double now_time;
    struct timeval etstart;
    struct timezone tzp;

    if(gettimeofday(&etstart, &tzp) == -1)
    {
        perror("Error: gettimeofday() failed.\n");
    }

    now_time = ((double)etstart.tv_sec) + ((double)etstart.tv_usec) / 1000000.0;

    return now_time;
}

#if 0
int main()
{
    double time;
    time = wtime();

    printf("time of day = %10.4f\n", time);

    return 0;
}
#endif

makefile:

target:
	g++ mat_mul.cc wtime.cc
	./a.out

Result: (the timing output was shown as a screenshot; the measured serial times appear in the table in Section 3.)

2. Matrix multiplication on the GPU and its performance


Code:

cuda_mat_mul_v1.cu:

//matrix multiplication with global memory
#include <iostream>
#include <fstream>
#include <cstdio>
#include <cstdlib>
#include "wtime.h"

using namespace std;

const int BLOCK_SIZE = 16;
const int GRID_SIZE = 20;

//D = A * B + C; each thread computes one element of D
__global__ void mat_mul(int *da, int *db, int *dc, int *dd, int N)
{
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;

    int sum = 0;
    for(int i = 0; i < N; i++)
    {
        sum += da[row*N + i] * db[i*N + col];
    }
    dd[row*N + col] = sum + dc[row*N + col];
}

int main()
{
    int N = BLOCK_SIZE * GRID_SIZE;   // the grid of blocks covers the whole N x N matrix
    int *ha, *hb, *hc, *hd;
    int *da, *db, *dc, *dd;
    double time;
    ha = new int[N*N];
    hb = new int[N*N];
    hc = new int[N*N];
    hd = new int[N*N];
    cudaError_t err;

    //initialize
    for(int i = 0; i < N; i++)
        for(int j = 0; j < N; j++)
        {
            ha[i*N+j] = 1;
            hb[i*N+j] = 2;
            hc[i*N+j] = 3;
        }

    //malloc
    cudaMalloc(&da, N*N*sizeof(int));
    cudaMalloc(&db, N*N*sizeof(int));
    cudaMalloc(&dc, N*N*sizeof(int));
    err = cudaMalloc(&dd, N*N*sizeof(int));
    printf("Cuda Malloc C : %s\n", cudaGetErrorString(err));

    //host to device
    cudaMemcpy(da, ha, N*N*sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(db, hb, N*N*sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(dc, hc, N*N*sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(dd, hd, N*N*sizeof(int), cudaMemcpyHostToDevice);  // dd is fully overwritten by the kernel

    dim3 threadBlock(BLOCK_SIZE, BLOCK_SIZE);
    dim3 grid(GRID_SIZE, GRID_SIZE);
    //kernel (the launch is asynchronous, so this host timer mostly measures
    //launch overhead unless a synchronization is added before reading it)
    time = wtime();
    mat_mul<<<grid, threadBlock>>>(da, db, dc, dd, N);
    printf("Computation time is %10.10f\n", wtime() - time);

    //device to host
    cudaMemcpy(hd, dd, N*N*sizeof(int), cudaMemcpyDeviceToHost);

    //print result to file
    ofstream fout;
    fout.open("result_v1.txt");
    if(!fout)
    {
        cerr << "open the file error" << endl;
        exit(-1);
    }
    for(int i = 0; i < N; i++)
    {
        for(int j = 0; j < N; j++)
        {
            fout << hd[i*N+j] << " ";
        }
        fout << endl;
    }

    delete []ha; delete []hb; delete []hc; delete []hd;
    cudaFree(da); cudaFree(db); cudaFree(dc); cudaFree(dd);

    return 0;
}
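The kernel above reads every operand directly from global memory, as its first comment says. A standard next step is to stage BLOCK_SIZE x BLOCK_SIZE tiles of A and B in shared memory and reuse them within a block. The sketch below is my illustration, not code from this post; the name mat_mul_tiled is made up, and it assumes the same BLOCK_SIZE constant and the same grid/threadBlock launch configuration (so N is a multiple of BLOCK_SIZE):

__global__ void mat_mul_tiled(int *da, int *db, int *dc, int *dd, int N)
{
    __shared__ int sa[BLOCK_SIZE][BLOCK_SIZE];   // tile of A for this block
    __shared__ int sb[BLOCK_SIZE][BLOCK_SIZE];   // tile of B for this block

    int row = blockIdx.y * BLOCK_SIZE + threadIdx.y;
    int col = blockIdx.x * BLOCK_SIZE + threadIdx.x;

    int sum = 0;
    for(int t = 0; t < N / BLOCK_SIZE; t++)
    {
        // each thread loads one element of the current A tile and B tile
        sa[threadIdx.y][threadIdx.x] = da[row * N + t * BLOCK_SIZE + threadIdx.x];
        sb[threadIdx.y][threadIdx.x] = db[(t * BLOCK_SIZE + threadIdx.y) * N + col];
        __syncthreads();

        // partial dot product over this tile
        for(int k = 0; k < BLOCK_SIZE; k++)
            sum += sa[threadIdx.y][k] * sb[k][threadIdx.x];
        __syncthreads();
    }
    dd[row * N + col] = sum + dc[row * N + col];
}

It is launched exactly like mat_mul, e.g. mat_mul_tiled<<<grid,threadBlock>>>(da,db,dc,dd,N);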

cuda_wtime.cu:

#include <stdio.h>
#include <sys/time.h>
#include <iostream>
#include <cstdlib>

double wtime(void)
{
    double now_time;
    struct timeval etstart;
    struct timezone tzp;

    if(gettimeofday(&etstart, &tzp) == -1)
    {
        perror("Error: gettimeofday() failed.\n");
    }

    now_time = ((double)etstart.tv_sec) + ((double)etstart.tv_usec) / 1000000.0;

    return now_time;
}

#if 0
int main()
{
    double time;
    time = wtime();

    printf("time of day = %10.4f\n", time);

    return 0;
}
#endif

wtime.h:

#ifndef _WTIME_

#define _WTIME_



double wtime();



#endif


makefile:

cu:
	nvcc cuda_mat_mul_v1.cu cuda_wtime.cu
	./a.out

Result: (the timing output was shown as a screenshot; the measured parallel times appear in the table in Section 3.)

3. Performance comparison

Matrix size                                      1600*1600          1200*1200         800*800           320*320
Serial time /s                                   30.9               11.49865          2.597987          0.162311
Launch configuration                             grid=100/block=16  grid=75/block=16  grid=50/block=16  grid=20/block=16
Kernel execution time /s                         0.0000319          0.0000309944      0.0000309944      0.0000231266
Total parallel time (alloc + copy + compute) /s  0.70796            0.439213          0.310214          0.237676
As the comparison shows, the GPU's computational power becomes very apparent once the matrix size is large.
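One caveat when reading the kernel-time row: a kernel launch is asynchronous, so a host timer read right after the launch (as in cuda_mat_mul_v1.cu) largely captures launch overhead rather than the full kernel run. A sketch of device-side timing with CUDA events, assuming the same mat_mul kernel, grid and threadBlock as above:

// sketch only; not part of the original post
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);

cudaEventRecord(start);
mat_mul<<<grid, threadBlock>>>(da, db, dc, dd, N);
cudaEventRecord(stop);
cudaEventSynchronize(stop);              // wait until the kernel has finished

float ms = 0.0f;
cudaEventElapsedTime(&ms, start, stop);  // elapsed time in milliseconds
printf("kernel time = %f ms\n", ms);

cudaEventDestroy(start);
cudaEventDestroy(stop);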
