An iterative FFT algorithm in MPI, based on the pseudocode in 并行计算——结构·算法·编程 (Parallel Computing: Structures, Algorithms, Programming)
2012-09-13 16:21
#include "mpi.h"
#include <stdio.h>
#include <stdlib.h>
#include <math.h>

#define T 0

typedef struct {
    double real;
    double img;
} com;

double PI;

//don't use an omega array: not every node needs a whole copy of omega, not efficient
static inline void cp(com f,com *t);//copy complex
static inline void add(com a,com b,com* r);
static inline void mul(com a,com b,com* r);
static inline void sub(com a,com b,com* r);
int br(int src,int size);//bit reverse

void show(com a)
{
    printf("%.4f %.4f \n",a.real,a.img);
}

int main(int argc,char *argv[])
{
    PI=atan(1)*4;
    int k,n;//process id and total number of processes
    MPI_Init(&argc,&argv);
    MPI_Comm_rank(MPI_COMM_WORLD,&k);
    MPI_Comm_size(MPI_COMM_WORLD,&n);
    //mpi communication objects
    MPI_Request isReq;
    MPI_Request reReq;
    MPI_Status s;
    //data
    com a;//input
    a.real=k+1;
    a.img=0;
    com c;//temp
    cp(a,&c);
    com w;//omega
    com r;//receive
    int l=log(n)/log(2);
    for(int h=l-1;h>=0;--h)
    {
        int p=pow(2,h);
        int q=n/p;
        if(k%p==k%(2*p))
        {
            MPI_Issend(&c,2,MPI_DOUBLE,k+p,T,MPI_COMM_WORLD,&isReq);
            MPI_Irecv(&r,2,MPI_DOUBLE,k+p,T,MPI_COMM_WORLD,&reReq);
            int time=p*(br(k,l)%q);//compute while receiving and sending
            w.real=cos(2*PI*time/n);
            w.img=sin(2*PI*time/n);
            MPI_Wait(&reReq,&s);
            mul(r,w,&r);
            MPI_Wait(&isReq,&s);
            add(c,r,&c);
        }
        else
        {
            MPI_Issend(&c,2,MPI_DOUBLE,k-p,T,MPI_COMM_WORLD,&isReq);
            MPI_Irecv(&r,2,MPI_DOUBLE,k-p,T,MPI_COMM_WORLD,&reReq);
            int time=p*(br(k-p,l)%q);//compute while receiving and sending
            w.real=cos(2*PI*time/n);
            w.img=sin(2*PI*time/n);
            MPI_Wait(&reReq,&s);
            MPI_Wait(&isReq,&s);
            mul(c,w,&c);//c is the send buffer: can't modify it until the send has completed
            sub(r,c,&c);
        }
        MPI_Barrier(MPI_COMM_WORLD);
    }
    MPI_Issend(&c,2,MPI_DOUBLE,br(k,l),T,MPI_COMM_WORLD,&isReq);
    MPI_Recv(&a,2,MPI_DOUBLE,MPI_ANY_SOURCE,T,MPI_COMM_WORLD,&s);
    printf("b%d:%.4f %.4f \n",k,a.real,a.img);
    MPI_Wait(&isReq,&s);
    MPI_Finalize();
    return 0;
}

void cp(com f,com *t)
{
    t->real=f.real;
    t->img=f.img;
}

void add(com a,com b,com *c)
{
    c->real=a.real+b.real;
    c->img=a.img+b.img;
}

void mul(com a,com b,com *c)
{
    c->real=a.real*b.real-a.img*b.img;
    c->img=a.real*b.img+a.img*b.real;
}

void sub(com a,com b,com *c)
{
    c->real=a.real-b.real;
    c->img=a.img-b.img;
}

int br(int src,int size)
{
    int tmp=src;
    int des=0;
    for(int i=size-1;i>=0;--i)
    {
        des=((tmp&1)<<i)|des;
        tmp=tmp>>1;
    }
    return des;
}
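The program above takes no input: every process initializes its single complex value to rank+1, and the run must use a power-of-two number of processes because the data size equals the process count; the final exchange with the bit-reversed rank puts the results back in natural order. As a sanity check the output can be compared against a direct O(n^2) DFT with the same sign convention as the cos/sin twiddles above, i.e. b_j = sum_k a_k * e^{+2*pi*i*j*k/n}. The standalone checker below is my own addition; the constant n = 8 is only an example and must match the number of MPI processes used.

#include <stdio.h>
#include <math.h>

int main(void)
{
    const int n=8;//must equal the number of MPI processes used above
    double PI=atan(1)*4;
    for(int j=0;j<n;++j)
    {
        double re=0,im=0;
        for(int kk=0;kk<n;++kk)
        {
            double ar=kk+1;//same test input as the MPI program: rank+1, imaginary part 0
            double wr=cos(2*PI*j*kk/n);
            double wi=sin(2*PI*j*kk/n);
            re+=ar*wr;
            im+=ar*wi;
        }
        printf("b%d:%.4f %.4f \n",j,re,im);
    }
    return 0;
}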
The computation differs somewhat from the book's serial algorithm: it balances the work across the nodes, which is quite elegant. The implementation also tries to overlap communication with computation as much as possible to cut the overall running time. It is adapted from the serial FFT in another post of mine.
No omega array is used; each twiddle factor is computed right where it is needed. Every node uses at most log(n) omegas, whereas precomputing the whole array would mean computing n of them.
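As a sketch of how few factors that is, the snippet below (my own addition; the helper omega and the example node k = 5 of n = 8 are illustrative only) reproduces the per-round twiddle index from the code above and evaluates it on demand: one factor per round, so log2(n) factors per node in total, versus the n entries a precomputed array would need.

#include <stdio.h>
#include <math.h>

typedef struct { double real; double img; } com;

int br(int src,int size);//bit reverse, same as in the programs above

//compute the twiddle factor w = e^{2*pi*i*t/n} on demand instead of reading a precomputed table
com omega(int t,int n)
{
    double PI=atan(1)*4;
    com w;
    w.real=cos(2*PI*t/n);
    w.img=sin(2*PI*t/n);
    return w;
}

int main(void)
{
    int n=8,l=3,k=5;//example: node k = 5 of n = 8, l = log2(n) rounds
    for(int h=l-1;h>=0;--h)
    {
        int p=pow(2,h);
        int q=n/p;
        int t=p*(br(k%p==k%(2*p)?k:k-p,l)%q);//index used by node k in round h
        com w=omega(t,n);
        printf("round %d: w = %.4f %+.4fi\n",h,w.real,w.img);
    }
    return 0;
}

int br(int src,int size)
{
    int tmp=src;
    int des=0;
    for(int i=size-1;i>=0;--i)
    {
        des=((tmp&1)<<i)|des;
        tmp=tmp>>1;
    }
    return des;
}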
The order of computation and communication differs slightly between the upper and lower complex values of a butterfly, because otherwise a write-after-read error could occur on the send buffer.
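Concretely, the lower half's send buffer c is also its output buffer, so its MPI_Wait on the send request has to come before c is overwritten; the upper half can start working on the received value r as soon as the receive finishes. A minimal two-process sketch of the two orderings (my own addition, not from the original post; it reuses the same Issend/Irecv/Wait pattern as the code above):

#include "mpi.h"
#include <stdio.h>

typedef struct { double real; double img; } com;

int main(int argc,char *argv[])
{
    int k,n;
    MPI_Init(&argc,&argv);
    MPI_Comm_rank(MPI_COMM_WORLD,&k);
    MPI_Comm_size(MPI_COMM_WORLD,&n);
    if(n!=2)
    {
        if(0==k) printf("run with exactly 2 processes\n");
        MPI_Finalize();
        return 1;
    }
    com c={k+1.0,0.0},r;
    com w={1.0,0.0};//the size-2 butterfly uses w = e^0 = 1
    MPI_Request isReq,reReq;
    MPI_Status s;
    MPI_Issend(&c,2,MPI_DOUBLE,1-k,0,MPI_COMM_WORLD,&isReq);
    MPI_Irecv(&r,2,MPI_DOUBLE,1-k,0,MPI_COMM_WORLD,&reReq);
    if(0==k)
    {//upper half: may work on r while the send of c is still in flight
        MPI_Wait(&reReq,&s);
        double rr=r.real*w.real-r.img*w.img;
        double ri=r.real*w.img+r.img*w.real;
        MPI_Wait(&isReq,&s);
        c.real+=rr; c.img+=ri;//c = c + w*r
    }
    else
    {//lower half: must finish the send before touching c
        MPI_Wait(&reReq,&s);
        MPI_Wait(&isReq,&s);
        double cr=c.real*w.real-c.img*w.img;
        double ci=c.real*w.img+c.img*w.real;
        c.real=r.real-cr; c.img=r.img-ci;//c = r - w*c
    }
    printf("rank %d: %.4f %.4f\n",k,c.real,c.img);
    MPI_Finalize();
    return 0;
}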
The improved version below can be used as long as the data size ds and the process count ps satisfy ds % ps == 0 (in the code: n and size, so that psize = n/size is exact).
It is an all-communication algorithm: the data are redistributed globally in every round, which may make the communication overhead excessive.
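To put a rough number on that overhead: each round's MPI_Scatter pushes 2*n doubles out of rank 0 and the matching MPI_Gather pulls 2*n doubles back (counting rank 0's copy to itself), on top of the n point-to-point butterfly messages of 2 doubles each; over the log2(n) rounds roughly 4*n*log2(n) doubles are funnelled through rank 0 from the collectives alone. The first, fully distributed version never routes data through a single root during the rounds, which is exactly the overhead being traded away here.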
#include "mpi.h"
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <sys/stat.h>
#include <memory.h>

typedef struct {
    double real;
    double img;
} com;

double PI;

int readBinary(char* file,void *buf,int fsize);//not named "read": that would shadow the libc read and get mis-called
int writeBinary(char *file,com *array,int size);
//don't use an omega array: not every node needs a whole copy of omega, not efficient
static inline void cp(com f,com *t);//copy complex
static inline void add(com a,com b,com* r);
static inline void mul(com a,com b,com* r);
static inline void sub(com a,com b,com* r);
int br(int src,int size);//bit reverse

void show(com a)
{
    printf("%.4f %.4f \n",a.real,a.img);
}

int main(int argc,char *argv[])
{
    if(argc<3)
    {
        printf("usage: fft <infile> <outfile>\n");
        return 1;
    }
    PI=atan(1)*4;
    int self,size;//process id and total number of processes
    MPI_Init(&argc,&argv);
    MPI_Comm_rank(MPI_COMM_WORLD,&self);
    MPI_Comm_size(MPI_COMM_WORLD,&size);
    int fsize,n;
    void *buf=NULL;
    com *in=NULL;
    if(0==self)
    {
        printf("start \n");
        struct stat st;
        stat(argv[1],&st);
        fsize=st.st_size;
        buf=malloc(fsize);
        int nd=readBinary(argv[1],buf,fsize);//number of doubles in the file, or -1 on error
        n=(nd<0)?-1:nd/2;//n stands for the total number of complex values
        if(n>0)
        {
            in=(com*)malloc(n*sizeof(com));
            memcpy(in,((int*)buf+1),n*sizeof(com));
        }
    }
    MPI_Bcast(&n,1,MPI_INT,0,MPI_COMM_WORLD);//every process should know the total size
    if(-1==n)
    {
        printf("error reading \n");
        MPI_Finalize();
        return 1;
    }
    int psize=n/size;
    //mpi communication objects
    MPI_Request isReq[psize];
    MPI_Request reReq[psize];
    MPI_Status s[psize];
    //data
    com a[psize];//input
    com w;//omega
    com r[psize];//receive
    int l=log(n)/log(2);
    // void *buffer=malloc(fsize);
    // MPI_Buffer_attach(buffer,fsize);
    com c[psize];//temp
    for(int h=l-1;h>=0;--h)
    {
        //initialize data each round
        MPI_Scatter(in,psize*2,MPI_DOUBLE,a,psize*2,MPI_DOUBLE,0,MPI_COMM_WORLD);
        MPI_Barrier(MPI_COMM_WORLD);
        memcpy(c,a,psize*sizeof(com));
        //calculate
        int p=pow(2,h);
        int q=n/p;
        int k;
        for(k=self*psize;k<self*psize+psize;++k)
        {//post all communications first
            if(k%p==k%(2*p))
            {//tag is always the sending row number
                //printf("self %d row %d dest row %d dest %d \n",self,k+self,k+p,(k+p)/psize);
                MPI_Issend(&c[k-self*psize],2,MPI_DOUBLE,(k+p)/psize,k,MPI_COMM_WORLD,&isReq[k-self*psize]);
                MPI_Irecv(&r[k-self*psize],2,MPI_DOUBLE,(k+p)/psize,k+p,MPI_COMM_WORLD,&reReq[k-self*psize]);
            }
            else
            {
                MPI_Issend(&c[k-self*psize],2,MPI_DOUBLE,(k-p)/psize,k,MPI_COMM_WORLD,&isReq[k-self*psize]);
                MPI_Irecv(&r[k-self*psize],2,MPI_DOUBLE,(k-p)/psize,k-p,MPI_COMM_WORLD,&reReq[k-self*psize]);
            }
        }
        MPI_Barrier(MPI_COMM_WORLD);
        for(k=self*psize;k<self*psize+psize;++k)
        {//wait and calculate
            if(k%p==k%(2*p))
            {
                int time=p*(br(k,l)%q);//compute while receiving and sending
                w.real=cos(2*PI*time/n);
                w.img=sin(2*PI*time/n);
                MPI_Wait(&reReq[k-self*psize],&s[k-self*psize]);
                mul(r[k-self*psize],w,&r[k-self*psize]);
                MPI_Wait(&isReq[k-self*psize],&s[k-self*psize]);
                add(c[k-self*psize],r[k-self*psize],&c[k-self*psize]);
            }
            else
            {
                int time=p*(br(k-p,l)%q);//compute while receiving and sending
                w.real=cos(2*PI*time/n);
                w.img=sin(2*PI*time/n);
                MPI_Wait(&reReq[k-self*psize],&s[k-self*psize]);
                MPI_Wait(&isReq[k-self*psize],&s[k-self*psize]);
                mul(c[k-self*psize],w,&c[k-self*psize]);//c is the send buffer: can't modify it until the send has completed
                sub(r[k-self*psize],c[k-self*psize],&c[k-self*psize]);
            }
        }
        MPI_Gather(c,2*psize,MPI_DOUBLE,in,2*psize,MPI_DOUBLE,0,MPI_COMM_WORLD);
    }
    //reverse all data
    for(int k=self*psize;k<self*psize+psize;++k)
    {//post all communications first
        //tag is always the sending row number
        int brv=br(k,l);
        MPI_Issend(&c[k-self*psize],2,MPI_DOUBLE,brv/psize,k,MPI_COMM_WORLD,&isReq[k-self*psize]);
        MPI_Irecv(&r[k-self*psize],2,MPI_DOUBLE,brv/psize,brv,MPI_COMM_WORLD,&reReq[k-self*psize]);
    }
    MPI_Waitall(psize,isReq,s);
    MPI_Waitall(psize,reReq,s);
    MPI_Gather(r,psize*2,MPI_DOUBLE,in,psize*2,MPI_DOUBLE,0,MPI_COMM_WORLD);
    if(0==self)
    {
        for(int i=0;i<n;++i)
        {
            printf("b%d:%.4f %.4f \n",i,in[i].real,in[i].img);
        }
        writeBinary(argv[2],in,n);
    }
    // MPI_Buffer_detach(buffer,&fsize);
    // free(buffer);
    if(0==self)
    {
        free(in);
        free(buf);
    }
    MPI_Finalize();
    return 0;
}

int readBinary(char* file,void *buf,int fsize)
{
    FILE *in;
    if(!(in=fopen(file,"r")))
    {
        printf("can't open \n");
        return -1;
    }
    fread(buf,sizeof(char),fsize,in);
    int size=((int*)buf)[0];
    fclose(in);
    return size;
}

int writeBinary(char *file,com *array,int size)
{
    FILE *out;
    if(!(out=fopen(file,"w")))
    {
        printf("can't open \n");
        return -1;
    }
    int bsize=sizeof(int)+size*sizeof(com);
    void *buf=malloc(bsize);
    ((int*)buf)[0]=2*size;
    memcpy(((int*)buf+1),array,size*sizeof(com));
    fwrite(buf,sizeof(char),bsize,out);
    free(buf);
    fclose(out);
    return 0;
}

void cp(com f,com *t)
{
    t->real=f.real;
    t->img=f.img;
}

void add(com a,com b,com *c)
{
    c->real=a.real+b.real;
    c->img=a.img+b.img;
}

void mul(com a,com b,com *c)
{
    c->real=a.real*b.real-a.img*b.img;
    c->img=a.real*b.img+a.img*b.real;
}

void sub(com a,com b,com *c)
{
    c->real=a.real-b.real;
    c->img=a.img-b.img;
}

int br(int src,int size)
{
    int tmp=src;
    int des=0;
    for(int i=size-1;i>=0;--i)
    {
        des=((tmp&1)<<i)|des;
        tmp=tmp>>1;
    }
    return des;
}
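The on-disk format is implied by readBinary and writeBinary: a leading int holding the number of doubles (2*n), immediately followed by the n complex values as real/imaginary double pairs. The small generator below is my own addition (the file name testdata.bin and the default n = 8 are arbitrary); it writes the same 1..n ramp as the first program, so the two versions can be compared. Both programs build with mpicc (link -lm for the math calls) and run under mpirun with a process count that divides n, e.g. mpirun -np 4 ./fft testdata.bin out.bin, where the binary names are placeholders.

#include <stdio.h>
#include <stdlib.h>

typedef struct { double real; double img; } com;

//write n complex values (1,0),(2,0),...,(n,0) in the layout readBinary expects:
//[int: number of doubles = 2*n][2*n doubles]
int main(int argc,char *argv[])
{
    int n=(argc>1)?atoi(argv[1]):8;//should be a power of two and divisible by the process count
    FILE *out=fopen("testdata.bin","w");
    if(!out)
    {
        printf("can't open \n");
        return 1;
    }
    int count=2*n;
    fwrite(&count,sizeof(int),1,out);
    for(int i=0;i<n;++i)
    {
        com v={i+1.0,0.0};
        fwrite(&v,sizeof(com),1,out);
    }
    fclose(out);
    return 0;
}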