【CUDA并行编程之四】矩阵相乘
2015-06-24 09:18
531 查看
前面介绍了基本的Cuda编程的相关知识,那么这一篇在此基础之上来看看GPU在处理数据计算上的高效能,我们拿矩阵相乘来作为例子。
1.CPU上执行矩阵相乘以及性能。
在CPU上进行矩阵相乘运算的代码:
mat_mul.cc:
[cpp] view
plaincopy
//a[i]*b[i] + c[i] = d[i]
#include<iostream>
#include<vector>
#include<map>
#include<fstream>
#include"wtime.h"
using namespace std;
const int N = 320;
//矩阵有两种表达的方法用二维矩阵或者用一维矩阵表示
int a[N+1][N+1],b[N+1][N+1],c[N+1][N+1],d[N+1][N+1];
int aa[(N+1)*(N+1)],bb[(N+1)*(N+1)],cc[(N+1)*(N+1)],dd[(N+1)*(N+1)];
void init()
{
for(int i=0;i<N;i++)
for(int j=0;j<N;j++)
{
a[i][j] = 1;
b[i][j] = 2;
c[i][j] = 3;
}
}
void init1()
{
for(int i=0;i<N;i++)
for(int j=0;j<N;j++)
{
aa[i*N+j] = 1;
bb[i*N+j] = 2;
cc[i*N+j] = 3;
}
}
void mul()
{
for(int i=0;i<N;i++)
for(int j=0;j<N;j++)
{
for(int k=0;k<N;k++)
{
d[i][j] += a[i][k] * b[k][j];
}
d[i][j] += c[i][j];
}
}
void mul1()
{
for(int i=0;i<N;i++)
for(int j=0;j<N;j++)
{
for(int k=0;k<N;k++)
{
dd[i*N+j] += aa[i*N+k] * bb[k*N+j];
}
dd[N*i+j] += cc[N*i+j];
}
}
void print()
{
ofstream fout;
fout.open("result.txt");
if(!fout)
{
perror("can not open the file");
}
for(int i=0;i<N;i++)
{
for(int j=0;j<N;j++)
{
fout<<d[i][j]<<" ";
}
fout<<endl;
}
fout.close();
}
int main()
{
init1();
double t = wtime();
mul1();
t = wtime()-t;
printf("computation timing = %10.10f sec\n",t);
//print();
return 0;
}
wtime.h:
[cpp] view
plaincopy
#ifndef _WTIME_
#define _WTIME_
double wtime();
#endif
wtime.cc:
[cpp] view
plaincopy
#include <stdio.h>
#include <sys/time.h>
#include <iostream>
#include <cstdlib>
double wtime(void)
{
double now_time;
struct timeval etstart;
struct timezone tzp;
if(gettimeofday(&etstart,&tzp)==-1)
{
perror("Error:calling gettimeofday() not successfully.\n");
}
now_time = ( (double)etstart.tv_sec ) + ((double)etstart.tv_usec) / 1000000.0;
return now_time;
}
#if 0
int main()
{
double time;
time = wtime();
printf("time of day = %10.4f\n",time);
return 0;
}
#endif
makefile:
[cpp] view
plaincopy
target:
g++ mat_mul.cc wtime.cc
./a.out
结果:
2.GPU上执行矩阵相乘以及性能。
代码:
cuda_mat_mul_v1.cu:
[cpp] view
plaincopy
//matrix multiplication with global memory
#include<iostream>
#include<fstream>
#include "wtime.h"
using namespace std;
const int BLOCK_SIZE = 16;
const int GRID_SIZE = 20;
//D = A * B + C;
__global__ void mat_mul(int *da,int *db,int *dc,int *dd,int N)
{
int row = blockIdx.y * blockDim.y + threadIdx.y;
int col = blockIdx.x * blockDim.x + threadIdx.x;
int sum = 0;
for(int i=0;i<N;i++)
{
sum += da[row*N + i] * db[row*i+col];
}
dd[row*N + col] = sum + dc[row*N + col];
}
int main()
{
int N = BLOCK_SIZE * GRID_SIZE;
int *ha,*hb,*hc,*hd;
int *da,*db,*dc,*dd;
double time;
ha = new int[N*N];
hb = new int[N*N];
hc = new int[N*N];
hd = new int[N*N];
cudaError_t err;
//initialize
for(int i=0;i<N;i++)
for(int j=0;j<N;j++)
{
ha[i*N+j] = 1;
hb[i*N+j] = 2;
hc[i*N+j] = 3;
}
//malloc</strong>
cudaMalloc(&da,N*N*sizeof(int));
cudaMalloc(&db,N*N*sizeof(int));
cudaMalloc(&dc,N*N*sizeof(int));
err = cudaMalloc(&dd,N*N*sizeof(int));
printf("Cuda Malloc C : %s\n",cudaGetErrorString(err));
//host to device
cudaMemcpy(da,ha,N*N*sizeof(int),cudaMemcpyHostToDevice);
cudaMemcpy(db,hb,N*N*sizeof(int),cudaMemcpyHostToDevice);
cudaMemcpy(dc,hc,N*N*sizeof(int),cudaMemcpyHostToDevice);
cudaMemcpy(dd,hd,N*N*sizeof(int),cudaMemcpyHostToDevice);
dim3 threadBlock(BLOCK_SIZE,BLOCK_SIZE);
dim3 grid(GRID_SIZE,GRID_SIZE);
//kernel
time = wtime();
mat_mul<<<grid,threadBlock>>>(da,db,dc,dd,N);
printf("Computation time is %10.10f\n",wtime()-time);
//device to host
cudaMemcpy(hd,dd,N*N*sizeof(int),cudaMemcpyDeviceToHost);
//print result to file
ofstream fout;
fout.open("result_v1.txt");
if(!fout)
{
cerr<<"open the file error"<<endl;
exit(-1);
}
for(int i=0;i<N;i++)
{
for(int j=0;j<N;j++)
{
fout<<hd[i*N+j]<<" ";
}
fout<<endl;
}
delete []ha;delete []hb;delete []hc;delete []hd;
cudaFree(da);cudaFree(db);cudaFree(dc);cudaFree(dd);
return 0;
}
cuda_wtime.cu:
[cpp] view
plaincopy
#include <stdio.h>
#include <sys/time.h>
#include <iostream>
#include <cstdlib>
double wtime(void)
{
double now_time;
struct timeval etstart;
struct timezone tzp;
if(gettimeofday(&etstart,&tzp)==-1)
{
perror("Error:calling gettimeofday() not successfully.\n");
}
now_time = ( (double)etstart.tv_sec ) + ((double)etstart.tv_usec) / 1000000.0;
return now_time;
}
#if 0
int main()
{
double time;
time = wtime();
printf("time of day = %10.4f\n",time);
return 0;
}
#endif
wtime.h:
[cpp] view
plaincopy
#ifndef _WTIME_
#define _WTIME_
double wtime();
#endif
cuda_wtime.cu:
[cpp] view
plaincopy
#include <stdio.h>
#include <sys/time.h>
#include <iostream>
#include <cstdlib>
double wtime(void)
{
double now_time;
struct timeval etstart;
struct timezone tzp;
if(gettimeofday(&etstart,&tzp)==-1)
{
perror("Error:calling gettimeofday() not successfully.\n");
}
now_time = ( (double)etstart.tv_sec ) + ((double)etstart.tv_usec) / 1000000.0;
return now_time;
}
#if 0
int main()
{
double time;
time = wtime();
printf("time of day = %10.4f\n",time);
return 0;
}
#endif
makefile:
[cpp] view
plaincopy
cu:
nvcc cuda_mat_mul_v1.cu cuda_wtime.cu
./a.out
结果:
3.计算性能对比:
可见,在矩阵规模大的时候非常明显的体现出了GPU强大的计算能力。
注明出处:/article/2003002.html
1.CPU上执行矩阵相乘以及性能。
在CPU上进行矩阵相乘运算的代码:
mat_mul.cc:
[cpp] view
plaincopy
//a[i]*b[i] + c[i] = d[i]
#include<iostream>
#include<vector>
#include<map>
#include<fstream>
#include"wtime.h"
using namespace std;
const int N = 320;
//矩阵有两种表达的方法用二维矩阵或者用一维矩阵表示
int a[N+1][N+1],b[N+1][N+1],c[N+1][N+1],d[N+1][N+1];
int aa[(N+1)*(N+1)],bb[(N+1)*(N+1)],cc[(N+1)*(N+1)],dd[(N+1)*(N+1)];
void init()
{
for(int i=0;i<N;i++)
for(int j=0;j<N;j++)
{
a[i][j] = 1;
b[i][j] = 2;
c[i][j] = 3;
}
}
void init1()
{
for(int i=0;i<N;i++)
for(int j=0;j<N;j++)
{
aa[i*N+j] = 1;
bb[i*N+j] = 2;
cc[i*N+j] = 3;
}
}
void mul()
{
for(int i=0;i<N;i++)
for(int j=0;j<N;j++)
{
for(int k=0;k<N;k++)
{
d[i][j] += a[i][k] * b[k][j];
}
d[i][j] += c[i][j];
}
}
void mul1()
{
for(int i=0;i<N;i++)
for(int j=0;j<N;j++)
{
for(int k=0;k<N;k++)
{
dd[i*N+j] += aa[i*N+k] * bb[k*N+j];
}
dd[N*i+j] += cc[N*i+j];
}
}
void print()
{
ofstream fout;
fout.open("result.txt");
if(!fout)
{
perror("can not open the file");
}
for(int i=0;i<N;i++)
{
for(int j=0;j<N;j++)
{
fout<<d[i][j]<<" ";
}
fout<<endl;
}
fout.close();
}
int main()
{
init1();
double t = wtime();
mul1();
t = wtime()-t;
printf("computation timing = %10.10f sec\n",t);
//print();
return 0;
}
wtime.h:
[cpp] view
plaincopy
#ifndef _WTIME_
#define _WTIME_
double wtime();
#endif
wtime.cc:
[cpp] view
plaincopy
#include <stdio.h>
#include <sys/time.h>
#include <iostream>
#include <cstdlib>
double wtime(void)
{
double now_time;
struct timeval etstart;
struct timezone tzp;
if(gettimeofday(&etstart,&tzp)==-1)
{
perror("Error:calling gettimeofday() not successfully.\n");
}
now_time = ( (double)etstart.tv_sec ) + ((double)etstart.tv_usec) / 1000000.0;
return now_time;
}
#if 0
int main()
{
double time;
time = wtime();
printf("time of day = %10.4f\n",time);
return 0;
}
#endif
makefile:
[cpp] view
plaincopy
target:
g++ mat_mul.cc wtime.cc
./a.out
结果:
2.GPU上执行矩阵相乘以及性能。
代码:
cuda_mat_mul_v1.cu:
[cpp] view
plaincopy
//matrix multiplication with global memory
#include<iostream>
#include<fstream>
#include "wtime.h"
using namespace std;
const int BLOCK_SIZE = 16;
const int GRID_SIZE = 20;
//D = A * B + C;
__global__ void mat_mul(int *da,int *db,int *dc,int *dd,int N)
{
int row = blockIdx.y * blockDim.y + threadIdx.y;
int col = blockIdx.x * blockDim.x + threadIdx.x;
int sum = 0;
for(int i=0;i<N;i++)
{
sum += da[row*N + i] * db[row*i+col];
}
dd[row*N + col] = sum + dc[row*N + col];
}
int main()
{
int N = BLOCK_SIZE * GRID_SIZE;
int *ha,*hb,*hc,*hd;
int *da,*db,*dc,*dd;
double time;
ha = new int[N*N];
hb = new int[N*N];
hc = new int[N*N];
hd = new int[N*N];
cudaError_t err;
//initialize
for(int i=0;i<N;i++)
for(int j=0;j<N;j++)
{
ha[i*N+j] = 1;
hb[i*N+j] = 2;
hc[i*N+j] = 3;
}
//malloc</strong>
cudaMalloc(&da,N*N*sizeof(int));
cudaMalloc(&db,N*N*sizeof(int));
cudaMalloc(&dc,N*N*sizeof(int));
err = cudaMalloc(&dd,N*N*sizeof(int));
printf("Cuda Malloc C : %s\n",cudaGetErrorString(err));
//host to device
cudaMemcpy(da,ha,N*N*sizeof(int),cudaMemcpyHostToDevice);
cudaMemcpy(db,hb,N*N*sizeof(int),cudaMemcpyHostToDevice);
cudaMemcpy(dc,hc,N*N*sizeof(int),cudaMemcpyHostToDevice);
cudaMemcpy(dd,hd,N*N*sizeof(int),cudaMemcpyHostToDevice);
dim3 threadBlock(BLOCK_SIZE,BLOCK_SIZE);
dim3 grid(GRID_SIZE,GRID_SIZE);
//kernel
time = wtime();
mat_mul<<<grid,threadBlock>>>(da,db,dc,dd,N);
printf("Computation time is %10.10f\n",wtime()-time);
//device to host
cudaMemcpy(hd,dd,N*N*sizeof(int),cudaMemcpyDeviceToHost);
//print result to file
ofstream fout;
fout.open("result_v1.txt");
if(!fout)
{
cerr<<"open the file error"<<endl;
exit(-1);
}
for(int i=0;i<N;i++)
{
for(int j=0;j<N;j++)
{
fout<<hd[i*N+j]<<" ";
}
fout<<endl;
}
delete []ha;delete []hb;delete []hc;delete []hd;
cudaFree(da);cudaFree(db);cudaFree(dc);cudaFree(dd);
return 0;
}
cuda_wtime.cu:
[cpp] view
plaincopy
#include <stdio.h>
#include <sys/time.h>
#include <iostream>
#include <cstdlib>
double wtime(void)
{
double now_time;
struct timeval etstart;
struct timezone tzp;
if(gettimeofday(&etstart,&tzp)==-1)
{
perror("Error:calling gettimeofday() not successfully.\n");
}
now_time = ( (double)etstart.tv_sec ) + ((double)etstart.tv_usec) / 1000000.0;
return now_time;
}
#if 0
int main()
{
double time;
time = wtime();
printf("time of day = %10.4f\n",time);
return 0;
}
#endif
wtime.h:
[cpp] view
plaincopy
#ifndef _WTIME_
#define _WTIME_
double wtime();
#endif
cuda_wtime.cu:
[cpp] view
plaincopy
#include <stdio.h>
#include <sys/time.h>
#include <iostream>
#include <cstdlib>
double wtime(void)
{
double now_time;
struct timeval etstart;
struct timezone tzp;
if(gettimeofday(&etstart,&tzp)==-1)
{
perror("Error:calling gettimeofday() not successfully.\n");
}
now_time = ( (double)etstart.tv_sec ) + ((double)etstart.tv_usec) / 1000000.0;
return now_time;
}
#if 0
int main()
{
double time;
time = wtime();
printf("time of day = %10.4f\n",time);
return 0;
}
#endif
makefile:
[cpp] view
plaincopy
cu:
nvcc cuda_mat_mul_v1.cu cuda_wtime.cu
./a.out
结果:
3.计算性能对比:
矩阵大小 | 1600*1600 | 1200*1200 | 800*800 | 320*320 |
串行时间/s | 30.9 | 11.49865 | 2.597987 | 0.162311 |
并行时间 | grid=100/block=16 | grid=75/block=16 | grid=50/block=16 | grid=20/block=16 |
kernel执行时间/s | 0.0000319 | 0.0000309944 | 0.0000309944 | 0.0000231266 |
并行计算总时间(分配内存加+数据拷贝+计算)/s | 0.70796 | 0.439213 | 0.310214 | 0.237676 |
注明出处:/article/2003002.html
相关文章推荐
- YTU-OJ-分数类的四则运算【C++】
- php-fpm配置笔记
- 一个简单的Spring MVC的例子
- MySQL python组件安装
- libevent源代码分析之优先级
- YTU-OJ-实现复数类中的加运算符重载【C++运算符重载】
- php判断是不是ajax访问
- 【CUDA并行编程之三】Cuda矢量求和运算
- 如何查看Eclipse是32位还是64位?
- 【Cuda并行编程之二】Cuda Memory Hierarchy_Cuda内存层次结构
- C++中,如何在标准库的std::string和常用库(Qt,VC等)的QString之间进行选择?
- java/lang/NoClassDefFoundError: java/lang/invoke/MethodHandle的解决办法
- JAVA 遍历文件夹下的所有文件(递归调用和非递归调用)
- 解决32位Eclipse和64位Eclipse在64位win7系统上运行问题
- Python中读取json模块
- Eclipse配置C/C++开发环境
- C#中计算时间差
- Android 开发环境配置图文教程(jdk+eclipse+android sdk)
- 多核平台下的JAVA优化
- 写入TXT文件