学习BLAS库 -- DDOT
2015-08-06 21:59
495 查看
DDOT( N, X, INCX, Y, INCY)
函数名释义:
D - Double precision
DOT - Dot product
功能:
computes the dot product of two double precision vectors.
dot <- X^{T}Y
实现:
FORTRAN语言版DDOT函数代码:
由FORTRAN语言版翻译的C语言版DDOT函数代码:
VC 编译器生成的DDOT函数汇编语言版代码
编译参数:
启动编译器优化选项
编译参数:
VC 编译器生成的DDOT函数汇编语言速度优化版代码
GotoBLAS2库中DOT函数汇编语言代码
使用方法:
测试DDOT函数
未开启速度优化选项时,编译得到的程序运行时间约为 0.005 秒。
开启速度优化选项时,编译得到的程序运行时间约为 0.0005 秒。
开启编译器优化选项可使计算效率提高约 10 倍。
测试环境:
Visual Studio Express 2010
Operating System:Windows 7, 64-bit
CPU:Intel Core(TM) CPU 2.90GHz
Memory: 4.00GB
Hard disk: 500G
参考文献
[1] http://www.applied-mathematics.net/miniSSEL1BLAS/miniSSEL1BLAS.pdf
[2] https://software.intel.com/en-us/articles/use-intriniscs/
[3] Hadi Brais. Compilers - What Every Programmer Should Know About Compiler Optimizations. https://msdn.microsoft.com/en-us/magazine/dn904673.aspx. February 2015.
[4] Koushik Ghosh. Writing Efficient C and C Code Optimization. http://www.codeproject.com/Articles/6154/Writing-Efficient-C-and-C-Code-Optimization. 26 Feb 2004.
[5] http://sci.tuomastonteri.fi/programming/sse
[6] https://www.kernel.org/pub/linux/kernel/people/geoff/cell/ps3-linux-docs/CellProgrammingTutorial/BasicsOfSIMDProgramming.html
函数名释义:
D - Double precision
DOT - Dot product
功能:
computes the dot product of two double precision vectors.
dot <- X^{T}Y
实现:
FORTRAN语言版DDOT函数代码:
*
*  DDOT returns the sum of DX(IX)*DY(IY) over N element pairs, where
*  IX and IY advance by INCX and INCY respectively.
*
*  Arguments (as used by the code below):
*     N     - number of element pairs; the result is 0.0d0 when N.LE.0
*     DX    - first vector, read only
*     INCX  - index step between consecutive elements taken from DX
*     DY    - second vector, read only
*     INCY  - index step between consecutive elements taken from DY
*
*  For a negative increment the start index is (-N+1)*INC + 1, so the
*  walk begins at the high end of the vector and finishes at element 1.
*
      DOUBLE PRECISION FUNCTION DDOT(N,DX,INCX,DY,INCY)
*     .. Scalar Arguments ..
      INTEGER INCX,INCY,N
*     ..
*     .. Array Arguments ..
      DOUBLE PRECISION DX(*),DY(*)
*     ..
*
*  Purpose
*  =======
*
*     forms the dot product of two vectors.
*     uses unrolled loops for increments equal to one.
*     jack dongarra, linpack, 3/11/78.
*     modified 12/3/93, array(1) declarations changed to array(*)
*
*
*     .. Local Scalars ..
      DOUBLE PRECISION DTEMP
      INTEGER I,IX,IY,M,MP1
*     ..
*     .. Intrinsic Functions ..
      INTRINSIC MOD
*     ..
      DDOT = 0.0d0
      DTEMP = 0.0d0
      IF (N.LE.0) RETURN
      IF (INCX.EQ.1 .AND. INCY.EQ.1) GO TO 20
*
*        code for unequal increments or equal increments
*          not equal to 1
*
      IX = 1
      IY = 1
      IF (INCX.LT.0) IX = (-N+1)*INCX + 1
      IF (INCY.LT.0) IY = (-N+1)*INCY + 1
      DO 10 I = 1,N
          DTEMP = DTEMP + DX(IX)*DY(IY)
          IX = IX + INCX
          IY = IY + INCY
   10 CONTINUE
      DDOT = DTEMP
      RETURN
*
*        code for both increments equal to 1
*
*
*        clean-up loop
*
*        The first M = MOD(N,5) products are accumulated one at a time
*        so that the remaining element count is a multiple of 5.
*
   20 M = MOD(N,5)
      IF (M.EQ.0) GO TO 40
      DO 30 I = 1,M
          DTEMP = DTEMP + DX(I)*DY(I)
   30 CONTINUE
      IF (N.LT.5) GO TO 60
*
*        main loop: five products per iteration, starting at M + 1
*
   40 MP1 = M + 1
      DO 50 I = MP1,N,5
          DTEMP = DTEMP + DX(I)*DY(I) + DX(I+1)*DY(I+1) +
     +            DX(I+2)*DY(I+2) + DX(I+3)*DY(I+3) + DX(I+4)*DY(I+4)
   50 CONTINUE
   60 DDOT = DTEMP
      RETURN
      END
由FORTRAN语言版翻译的C语言版DDOT函数代码:
/*
 * cblas_ddot.c
 *
 * C translation of the reference BLAS routine DDOT.
 */

/*
 * Computes the dot product of two double-precision vectors.
 *
 * Parameters:
 *   N     number of element pairs to multiply; N <= 0 yields 0.0.
 *   X     first vector (read only).
 *   incX  index step between consecutive elements taken from X.
 *   Y     second vector (read only).
 *   incY  index step between consecutive elements taken from Y.
 *
 * Returns:
 *   sum over k = 0 .. N-1 of X[ix0 + k*incX] * Y[iy0 + k*incY], where the
 *   start offset is 0 for a non-negative increment and (1 - N) * inc for a
 *   negative one (the vector is then walked from its high end down to
 *   index 0).
 *
 * The caller must make sure every index visited lies inside the arrays.
 */
double cblas_ddot(const int N, const double *X, const int incX,
                  const double *Y, const int incY)
{
    double result = 0.0;
    int i;
    int m;

    if (N <= 0) {
        return result;
    }

    if ((incX != 1) || (incY != 1)) {
        /*
         * General strides.  The Fortran original starts at index 1 because
         * Fortran arrays are 1-based; C arrays are 0-based, so the start
         * offsets here are the Fortran ones minus one.  Starting at 1 would
         * skip element 0 and read one element past the intended range.
         */
        int ix = (incX < 0) ? (1 - N) * incX : 0;
        int iy = (incY < 0) ? (1 - N) * incY : 0;

        for (i = 0; i < N; i++) {
            result += Y[iy] * X[ix];
            ix += incX;
            iy += incY;
        }
        return result;
    }

    /*
     * Unit strides: handle the first N % 5 elements one by one so that the
     * remaining count is a multiple of 5 ...
     */
    m = N % 5;
    for (i = 0; i < m; i++) {
        result += Y[i] * X[i];
    }

    /*
     * ... then accumulate five products per iteration.  When N < 5 the
     * loop body never runs because i starts at m == N.
     */
    for (i = m; i < N; i += 5) {
        result += (Y[i] * X[i]
                   + Y[i + 1] * X[i + 1]
                   + Y[i + 2] * X[i + 2]
                   + Y[i + 3] * X[i + 3]
                   + Y[i + 4] * X[i + 4]);
    }

    return result;
}
VC 编译器生成的DDOT函数汇编语言版代码
编译参数:
cl main.c
; Unoptimized MSVC listing of cblas_ddot (32-bit x86).
;
; Every local and argument lives in the ebp-based stack frame at the offsets
; declared below and is reloaded from memory around each use.  All
; floating-point work uses x87 instructions (fld / fmul / fadd / fstp), and
; the running sum is written back to _result$ after every product.  The
; function returns with the result loaded on the x87 stack (the fld before
; each jump to $LN17).
_TEXT SEGMENT
_iy$4493 = -32 ; size = 4
_ix$4492 = -28 ; size = 4
_m$ = -24 ; size = 4
_mp1$ = -20 ; size = 4
_result$ = -16 ; size = 8
_i$ = -4 ; size = 4
_N$ = 8 ; size = 4
_X$ = 12 ; size = 4
_incX$ = 16 ; size = 4
_Y$ = 20 ; size = 4
_incY$ = 24 ; size = 4
_cblas_ddot PROC
; Line 15
    push ebp
    mov ebp, esp
    sub esp, 32 ; 00000020H
    push esi
; Line 18
    mov DWORD PTR _mp1$[ebp], 0
; Line 19
    fldz
    fstp QWORD PTR _result$[ebp]
; Line 20
; N < 0: fall through and return the zero-initialised result.
    cmp DWORD PTR _N$[ebp], 0
    jge SHORT $LN16@cblas_ddot
; Line 22
    fld QWORD PTR _result$[ebp]
    jmp $LN17@cblas_ddot
$LN16@cblas_ddot:
; Line 26
; Both increments equal to 1 -> unit-stride code at $LN15.
    cmp DWORD PTR _incX$[ebp], 1
    jne SHORT $LN14@cblas_ddot
    cmp DWORD PTR _incY$[ebp], 1
    je $LN15@cblas_ddot
$LN14@cblas_ddot:
; General-stride path.
; NOTE(review): ix and iy are initialised to 1, so the first elements loaded
; below are at byte offset 8 (index 1), not offset 0 -- confirm this is
; intended for 0-based arrays.
; Line 28
    mov DWORD PTR _ix$4492[ebp], 1
; Line 29
    mov DWORD PTR _iy$4493[ebp], 1
; Line 30
    cmp DWORD PTR _incX$[ebp], 0
    jge SHORT $LN13@cblas_ddot
; Line 32
; ix = (1 - N) * incX + 1
    mov eax, 1
    sub eax, DWORD PTR _N$[ebp]
    imul eax, DWORD PTR _incX$[ebp]
    add eax, 1
    mov DWORD PTR _ix$4492[ebp], eax
$LN13@cblas_ddot:
; Line 34
    cmp DWORD PTR _incY$[ebp], 0
    jge SHORT $LN12@cblas_ddot
; Line 36
; iy = (1 - N) * incY + 1
    mov ecx, 1
    sub ecx, DWORD PTR _N$[ebp]
    imul ecx, DWORD PTR _incY$[ebp]
    add ecx, 1
    mov DWORD PTR _iy$4493[ebp], ecx
$LN12@cblas_ddot:
; Line 38
; for (i = 0; i < N; i++): $LN10 is the increment, $LN11 the condition.
    mov DWORD PTR _i$[ebp], 0
    jmp SHORT $LN11@cblas_ddot
$LN10@cblas_ddot:
    mov edx, DWORD PTR _i$[ebp]
    add edx, 1
    mov DWORD PTR _i$[ebp], edx
$LN11@cblas_ddot:
    mov eax, DWORD PTR _i$[ebp]
    cmp eax, DWORD PTR _N$[ebp]
    jge SHORT $LN9@cblas_ddot
; Line 40
; result += Y[iy] * X[ix]
    mov ecx, DWORD PTR _iy$4493[ebp]
    mov edx, DWORD PTR _Y$[ebp]
    mov eax, DWORD PTR _ix$4492[ebp]
    mov esi, DWORD PTR _X$[ebp]
    fld QWORD PTR [edx+ecx*8]
    fmul QWORD PTR [esi+eax*8]
    fadd QWORD PTR _result$[ebp]
    fstp QWORD PTR _result$[ebp]
; Line 41
    mov ecx, DWORD PTR _ix$4492[ebp]
    add ecx, DWORD PTR _incX$[ebp]
    mov DWORD PTR _ix$4492[ebp], ecx
; Line 42
    mov edx, DWORD PTR _iy$4493[ebp]
    add edx, DWORD PTR _incY$[ebp]
    mov DWORD PTR _iy$4493[ebp], edx
; Line 43
    jmp SHORT $LN10@cblas_ddot
$LN9@cblas_ddot:
; Line 44
    fld QWORD PTR _result$[ebp]
    jmp $LN17@cblas_ddot
$LN15@cblas_ddot:
; Unit-stride path.
; Line 47
; m = N % 5: idiv leaves the remainder in edx.
    mov eax, DWORD PTR _N$[ebp]
    cdq
    mov ecx, 5
    idiv ecx
    mov DWORD PTR _m$[ebp], edx
; Line 49
    cmp DWORD PTR _m$[ebp], 0
    je SHORT $LN4@cblas_ddot
; Line 51
; Clean-up loop: for (i = 0; i < m; i++)
    mov DWORD PTR _i$[ebp], 0
    jmp SHORT $LN7@cblas_ddot
$LN6@cblas_ddot:
    mov edx, DWORD PTR _i$[ebp]
    add edx, 1
    mov DWORD PTR _i$[ebp], edx
$LN7@cblas_ddot:
    mov eax, DWORD PTR _i$[ebp]
    cmp eax, DWORD PTR _m$[ebp]
    jge SHORT $LN5@cblas_ddot
; Line 53
; result += Y[i] * X[i]
    mov ecx, DWORD PTR _i$[ebp]
    mov edx, DWORD PTR _Y$[ebp]
    mov eax, DWORD PTR _i$[ebp]
    mov esi, DWORD PTR _X$[ebp]
    fld QWORD PTR [edx+ecx*8]
    fmul QWORD PTR [esi+eax*8]
    fadd QWORD PTR _result$[ebp]
    fstp QWORD PTR _result$[ebp]
; Line 54
    jmp SHORT $LN6@cblas_ddot
$LN5@cblas_ddot:
; Line 55
; N < 5: the clean-up loop already covered every element.
    cmp DWORD PTR _N$[ebp], 5
    jge SHORT $LN4@cblas_ddot
; Line 57
    fld QWORD PTR _result$[ebp]
    jmp $LN17@cblas_ddot
$LN4@cblas_ddot:
; Line 61
    mov ecx, DWORD PTR _m$[ebp]
    mov DWORD PTR _mp1$[ebp], ecx
; Line 63
; Main loop: for (i = mp1; i < N; i += 5)
    mov edx, DWORD PTR _mp1$[ebp]
    mov DWORD PTR _i$[ebp], edx
    jmp SHORT $LN3@cblas_ddot
$LN2@cblas_ddot:
    mov eax, DWORD PTR _i$[ebp]
    add eax, 5
    mov DWORD PTR _i$[ebp], eax
$LN3@cblas_ddot:
    mov ecx, DWORD PTR _i$[ebp]
    cmp ecx, DWORD PTR _N$[ebp]
    jge SHORT $LN1@cblas_ddot
; Line 66
; Five products at element offsets +0, +8, +16, +24 and +32 bytes are summed
; on the x87 stack, then added to result.  i, X and Y are reloaded from the
; frame before every product.
    mov edx, DWORD PTR _i$[ebp]
    mov eax, DWORD PTR _Y$[ebp]
    mov ecx, DWORD PTR _i$[ebp]
    mov esi, DWORD PTR _X$[ebp]
    fld QWORD PTR [eax+edx*8]
    fmul QWORD PTR [esi+ecx*8]
    mov edx, DWORD PTR _i$[ebp]
    mov eax, DWORD PTR _Y$[ebp]
    mov ecx, DWORD PTR _i$[ebp]
    mov esi, DWORD PTR _X$[ebp]
    fld QWORD PTR [eax+edx*8+8]
    fmul QWORD PTR [esi+ecx*8+8]
    faddp ST(1), ST(0)
    mov edx, DWORD PTR _i$[ebp]
    mov eax, DWORD PTR _Y$[ebp]
    mov ecx, DWORD PTR _i$[ebp]
    mov esi, DWORD PTR _X$[ebp]
    fld QWORD PTR [eax+edx*8+16]
    fmul QWORD PTR [esi+ecx*8+16]
    faddp ST(1), ST(0)
    mov edx, DWORD PTR _i$[ebp]
    mov eax, DWORD PTR _Y$[ebp]
    mov ecx, DWORD PTR _i$[ebp]
    mov esi, DWORD PTR _X$[ebp]
    fld QWORD PTR [eax+edx*8+24]
    fmul QWORD PTR [esi+ecx*8+24]
    faddp ST(1), ST(0)
    mov edx, DWORD PTR _i$[ebp]
    mov eax, DWORD PTR _Y$[ebp]
    mov ecx, DWORD PTR _i$[ebp]
    mov esi, DWORD PTR _X$[ebp]
    fld QWORD PTR [eax+edx*8+32]
    fmul QWORD PTR [esi+ecx*8+32]
    faddp ST(1), ST(0)
    fadd QWORD PTR _result$[ebp]
    fstp QWORD PTR _result$[ebp]
; Line 71
    jmp $LN2@cblas_ddot
$LN1@cblas_ddot:
; Line 73
    fld QWORD PTR _result$[ebp]
$LN17@cblas_ddot:
; Line 75
; Common exit: restore esi and the caller's frame.
    pop esi
    mov esp, ebp
    pop ebp
    ret 0
_cblas_ddot ENDP
_TEXT ENDS
启动编译器优化选项
编译参数:
cl main.c /FA /O2 /arch:SSE2
VC 编译器生成的DDOT函数汇编语言速度优化版代码
; Speed-optimized MSVC listing of cblas_ddot (32-bit x86).
;
; What the instructions below show:
;   * no ebp frame: arguments and locals are addressed relative to esp;
;   * the running sum is kept in xmm1 and the arithmetic uses scalar
;     double-precision SSE2 instructions (movsd / mulsd / addsd) -- no packed
;     instructions (mulpd / addpd) appear in this listing;
;   * N % 5 is computed with a multiply by 66666667H plus shifts, not idiv;
;   * the clean-up loop and the general-stride loop are each unrolled four
;     times, with a one-element-per-iteration loop for what is left;
;   * the result is handed back on the x87 stack (fld before each ret).
; COMDAT _cblas_ddot
_TEXT SEGMENT
_i$ = -12 ; size = 4
_result$ = -8 ; size = 8
tv1489 = 8 ; size = 4
tv683 = 8 ; size = 4
_N$ = 8 ; size = 4
_X$ = 12 ; size = 4
_incX$ = 16 ; size = 4
_Y$ = 20 ; size = 4
_incY$ = 24 ; size = 4
_cblas_ddot PROC ; COMDAT
; Line 15
    sub esp, 12 ; 0000000cH
; Line 19
; xmm1 = 0.0 (the accumulator)
    xorps xmm1, xmm1
    push edi
; Line 20
; edi = N; a negative N returns 0.0 immediately.
    mov edi, DWORD PTR _N$[esp+12]
    movsd QWORD PTR _result$[esp+16], xmm1
    test edi, edi
    jns SHORT $LN16@cblas_ddot
; Line 22
    fldz
    pop edi
; Line 75
    add esp, 12 ; 0000000cH
    ret 0
$LN16@cblas_ddot:
; Line 26
; edx = incX, esi = incY; anything other than (1, 1) goes to $LN14.
    mov edx, DWORD PTR _incX$[esp+12]
    push ebx
    push ebp
    push esi
    mov esi, DWORD PTR _incY$[esp+24]
    cmp edx, 1
    jne $LN14@cblas_ddot
    cmp esi, 1
    jne $LN14@cblas_ddot
; Line 49
; ebp = Y.  ebx = N - 5 * (N / 5), i.e. m = N % 5, with the quotient obtained
; from the high half of N * 66666667H.
    mov ebp, DWORD PTR _Y$[esp+24]
    mov eax, 1717986919 ; 66666667H
    imul edi
    sar edx, 1
    mov eax, edx
    shr eax, 31 ; 0000001fH
    add eax, edx
    lea eax, DWORD PTR [eax+eax*4]
    mov ebx, edi
    sub ebx, eax
    je $LN32@cblas_ddot
; Line 51
; Clean-up loop over the first m elements; esi is the element index.
; The four-way unrolled form is used only when m >= 4.
    xor esi, esi
    cmp ebx, 4
    jl SHORT $LC26@cblas_ddot
    mov edx, DWORD PTR _X$[esp+24]
    lea ecx, DWORD PTR [edx+24]
; Line 53
; tv683 = X - Y (byte distance), so [esi+eax] below addresses X through a
; pointer into Y.
    sub edx, ebp
    mov DWORD PTR tv683[esp+24], edx
    lea edx, DWORD PTR [ebx-4]
    shr edx, 2
    inc edx
    lea esi, DWORD PTR [edx*4]
    mov DWORD PTR _i$[esp+28], esi
    mov esi, DWORD PTR tv683[esp+24]
    lea eax, DWORD PTR [ebp+8]
    npad 9
$LL27@cblas_ddot:
; Four products per iteration, accumulated alternately in xmm0 and xmm1.
    movsd xmm0, QWORD PTR [eax-8]
    mulsd xmm0, QWORD PTR [ecx-24]
    addsd xmm0, xmm1
    movsd xmm1, QWORD PTR [esi+eax]
    mulsd xmm1, QWORD PTR [eax]
    addsd xmm0, xmm1
    movsd xmm1, QWORD PTR [eax+8]
    mulsd xmm1, QWORD PTR [ecx-8]
    addsd xmm0, xmm1
    movsd xmm1, QWORD PTR [eax+16]
    mulsd xmm1, QWORD PTR [ecx]
    add eax, 32 ; 00000020H
    add ecx, 32 ; 00000020H
    dec edx
    addsd xmm1, xmm0
    jne SHORT $LL27@cblas_ddot
    mov esi, DWORD PTR _i$[esp+28]
    movsd QWORD PTR _result$[esp+28], xmm1
$LC26@cblas_ddot:
; Line 51
; Remaining clean-up elements (index esi up to m), one per iteration.
    cmp esi, ebx
    jge SHORT $LN31@cblas_ddot
    mov edx, DWORD PTR _X$[esp+24]
    mov ecx, ebx
    sub edx, ebp
    lea eax, DWORD PTR [ebp+esi*8]
    sub ecx, esi
    npad 5
$LC7@cblas_ddot:
; Line 53
    movsd xmm0, QWORD PTR [eax+edx]
    mulsd xmm0, QWORD PTR [eax]
    add eax, 8
    dec ecx
    addsd xmm1, xmm0
    jne SHORT $LC7@cblas_ddot
    movsd QWORD PTR _result$[esp+28], xmm1
$LN31@cblas_ddot:
; Line 55
    cmp edi, 5
; Line 57
    jl $LN28@cblas_ddot
$LN32@cblas_ddot:
; Line 63
; Main loop setup: skipped entirely when m >= N.
    cmp ebx, edi
    jge $LN28@cblas_ddot
    mov eax, DWORD PTR _X$[esp+24]
    lea esi, DWORD PTR [eax+ebx*8+24]
    sub eax, ebp
    sub edi, ebx
    mov DWORD PTR tv683[esp+24], eax
    dec edi
; edx = trip count; presumably (N - m - 1) / 5 + 1, with the division done as
; a multiply by cccccccdH followed by a shift -- TODO confirm.
    mov eax, -858993459 ; cccccccdH
    mul edi
    shr edx, 2
    lea ecx, DWORD PTR [ebp+ebx*8+8]
    inc edx
    npad 6
$LL3@cblas_ddot:
; Line 66
; Five products are summed in xmm0, then added to the accumulator xmm1;
; both pointers advance by 40 bytes (5 doubles) per iteration.
    mov eax, DWORD PTR tv683[esp+24]
    movsd xmm0, QWORD PTR [ecx-8]
    mulsd xmm0, QWORD PTR [esi-24]
    movsd xmm2, QWORD PTR [eax+ecx]
    mulsd xmm2, QWORD PTR [ecx]
    addsd xmm0, xmm2
    movsd xmm2, QWORD PTR [ecx+8]
    mulsd xmm2, QWORD PTR [esi-8]
    addsd xmm0, xmm2
    movsd xmm2, QWORD PTR [ecx+16]
    mulsd xmm2, QWORD PTR [esi]
    addsd xmm0, xmm2
    movsd xmm2, QWORD PTR [ecx+24]
    mulsd xmm2, QWORD PTR [esi+8]
    add ecx, 40 ; 00000028H
    add esi, 40 ; 00000028H
    dec edx
    addsd xmm0, xmm2
    addsd xmm1, xmm0
    jne SHORT $LL3@cblas_ddot
; Line 40
    pop esi
    pop ebp
    movsd QWORD PTR _result$[esp+20], xmm1
; Line 44
; Spill xmm1 to memory and reload it on the x87 stack as the return value.
    fld QWORD PTR _result$[esp+20]
    pop ebx
    pop edi
; Line 75
    add esp, 12 ; 0000000cH
    ret 0
$LN14@cblas_ddot:
; General-stride path: eax = ix, ecx = iy, both starting at 1.
; NOTE(review): a start index of 1 means element 0 is never loaded --
; confirm this is intended for 0-based arrays.
; Line 28
    mov eax, 1
; Line 29
    mov ecx, eax
; Line 30
    test edx, edx
    jns SHORT $LN13@cblas_ddot
; Line 32
; ix = (1 - N) * incX + 1
    sub eax, edi
    imul eax, edx
    inc eax
$LN13@cblas_ddot:
; Line 34
    test esi, esi
    jns SHORT $LN12@cblas_ddot
; Line 36
; iy = (1 - N) * incY + 1
    sub ecx, edi
    imul ecx, esi
    inc ecx
$LN12@cblas_ddot:
; Line 38
; ebx = Y, ebp = loop index.  The four-way unrolled loop runs only when
; N >= 4; tv1489 holds its trip count.
    mov ebx, DWORD PTR _Y$[esp+24]
    xor ebp, ebp
    cmp edi, 4
    jl SHORT $LC29@cblas_ddot
    lea ebp, DWORD PTR [edi-4]
    shr ebp, 2
    inc ebp
    mov DWORD PTR tv1489[esp+24], ebp
    add ebp, ebp
    add ebp, ebp
    mov DWORD PTR _i$[esp+28], ebp
    npad 4
$LL30@cblas_ddot:
; Line 40
; ebp is reused here to hold X.
    mov ebp, DWORD PTR _X$[esp+24]
    movsd xmm0, QWORD PTR [ebp+eax*8]
    mulsd xmm0, QWORD PTR [ebx+ecx*8]
    addsd xmm0, xmm1
; Line 41
    add eax, edx
    movsd xmm1, QWORD PTR [ebp+eax*8]
; Line 42
    add ecx, esi
    mulsd xmm1, QWORD PTR [ebx+ecx*8]
    addsd xmm1, xmm0
    add eax, edx
    movsd xmm0, QWORD PTR [ebp+eax*8]
    add ecx, esi
    mulsd xmm0, QWORD PTR [ebx+ecx*8]
    add eax, edx
    addsd xmm0, xmm1
    movsd xmm1, QWORD PTR [ebp+eax*8]
    add ecx, esi
    mulsd xmm1, QWORD PTR [ebx+ecx*8]
    add eax, edx
    add ecx, esi
    dec DWORD PTR tv1489[esp+24]
    addsd xmm1, xmm0
    jne SHORT $LL30@cblas_ddot
; Line 40
    mov ebp, DWORD PTR _i$[esp+28]
    movsd QWORD PTR _result$[esp+28], xmm1
$LC29@cblas_ddot:
; Line 38
; Remainder of the general-stride loop.  The three doublings of esi and edx
; turn the element increments into byte increments (x8); ecx and eax become
; pointers to the current Y and X elements.
    cmp ebp, edi
    jge SHORT $LN28@cblas_ddot
    add esi, esi
    add edx, edx
    add esi, esi
    add edx, edx
    lea ecx, DWORD PTR [ebx+ecx*8]
    mov ebx, DWORD PTR _X$[esp+24]
    add esi, esi
    add edx, edx
    lea eax, DWORD PTR [ebx+eax*8]
    sub edi, ebp
    npad 4
$LC11@cblas_ddot:
; Line 40
    movsd xmm0, QWORD PTR [ecx]
    mulsd xmm0, QWORD PTR [eax]
; Line 41
    add eax, edx
; Line 42
    add ecx, esi
    dec edi
    addsd xmm1, xmm0
    jne SHORT $LC11@cblas_ddot
; Line 40
    movsd QWORD PTR _result$[esp+28], xmm1
$LN28@cblas_ddot:
; Line 44
; Shared exit: return the stored result on the x87 stack.
    fld QWORD PTR _result$[esp+28]
    pop esi
    pop ebp
    pop ebx
    pop edi
; Line 75
    add esp, 12 ; 0000000cH
    ret 0
_cblas_ddot ENDP
_TEXT ENDS
GotoBLAS2库中DOT函数汇编语言代码
/*********************************************************************/
/*                                                                   */
/*             Optimized BLAS libraries                              */
/*                     By Kazushige Goto <kgoto@tacc.utexas.edu>     */
/*                                                                   */
/* Copyright (c) The University of Texas, 2009. All rights reserved. */
/* UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING  */
/* THIS SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF      */
/* MERCHANTABILITY, FITNESS FOR ANY PARTICULAR PURPOSE,              */
/* NON-INFRINGEMENT AND WARRANTIES OF PERFORMANCE, AND ANY WARRANTY  */
/* THAT MIGHT OTHERWISE ARISE FROM COURSE OF DEALING OR USAGE OF     */
/* TRADE. NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH RESPECT TO   */
/* THE USE OF THE SOFTWARE OR DOCUMENTATION.                         */
/* Under no circumstances shall University be liable for incidental, */
/* special, indirect, direct or consequential damages or loss of     */
/* profits, interruption of business, or related expenses which may  */
/* arise from use of Software or Documentation, including but not    */
/* limited to those resulting from defects in Software and/or        */
/* Documentation, or loss or inaccuracy of data of any kind.         */
/*********************************************************************/

/*
 * x87 dot-product kernel (32-bit x86, AT&T syntax, run through the C
 * preprocessor).
 *
 * Four zeros are pushed on the x87 stack and used as four independent
 * partial sums; the loops add one product into each of them in turn, and
 * the three faddp instructions at .L27 fold them into the single value left
 * in %st(0) when the routine returns.
 *
 * PROLOGUE, EPILOGUE, PROFCODE, EMMS, ALIGN_3, SIZE, FLD and FMUL are not
 * defined in this file; presumably they come from common.h -- TODO confirm.
 */

#define ASSEMBLER
#include "common.h"

/* Three registers are pushed below, hence 12 bytes between %esp and the
   return address. */
#define STACK 12
#define ARGS 0

/* Stack locations of the five arguments after the pushes. */
#define STACK_N 4 + STACK + ARGS(%esp)
#define STACK_X 8 + STACK + ARGS(%esp)
#define STACK_INCX 12 + STACK + ARGS(%esp)
#define STACK_Y 16 + STACK + ARGS(%esp)
#define STACK_INCY 20 + STACK + ARGS(%esp)

/* Register assignment. */
#define N %ebx
#define X %esi
#define INCX %ecx
#define Y %edi
#define INCY %edx

    PROLOGUE

    pushl %edi
    pushl %esi
    pushl %ebx

    PROFCODE

#if defined(F_INTERFACE_GFORT) || defined(F_INTERFACE_G95)
    EMMS
#endif

    movl STACK_N, N
    movl STACK_X, X
    movl STACK_INCX, INCX
    movl STACK_Y, Y
    movl STACK_INCY, INCY

/* With F_INTERFACE the scalar arguments arrive as pointers and are
   dereferenced here. */
#ifdef F_INTERFACE
    movl (N),N
    movl (INCX),INCX
    movl (INCY),INCY
#endif

/* Scale both increments by SIZE so they can be added directly to the
   X and Y pointers. */
    leal (, INCX, SIZE), INCX
    leal (, INCY, SIZE), INCY

/* Four partial sums, all starting at zero. */
    fldz
    fldz
    fldz
    fldz

/* Any increment other than one element goes to the strided code at .L14. */
    cmpl $SIZE, INCX
    jne .L14
    cmpl $SIZE, INCY
    jne .L14

/* Unit stride: N >> 2 iterations of four products each. */
    movl N, %eax
    sarl $2, %eax
    jle .L15
    ALIGN_3

.L16:
    FLD 0 * SIZE(X)
    FMUL 0 * SIZE(Y)
    faddp %st,%st(1)
    FLD 1 * SIZE(X)
    FMUL 1 * SIZE(Y)
    faddp %st,%st(2)
    FLD 2 * SIZE(X)
    FMUL 2 * SIZE(Y)
    faddp %st,%st(3)
    FLD 3 * SIZE(X)
    FMUL 3 * SIZE(Y)
    faddp %st,%st(4)
    addl $4 * SIZE, X
    addl $4 * SIZE, Y
    decl %eax
    jg .L16
    ALIGN_3

/* Unit stride: the remaining N & 3 elements, one product per iteration. */
.L15:
    movl N, %eax
    andl $3, %eax
    jle .L27
    ALIGN_3

.L22:
    FLD (X)
    addl $SIZE, X
    FMUL (Y)
    addl $SIZE, Y
    faddp %st,%st(1)
    decl %eax
    jg .L22
    jmp .L27
    ALIGN_3

/* Strided path.  With F_INTERFACE, a negative increment first moves the
   pointer by -(N - 1) * increment so that the walk starts at the far end
   of the vector. */
.L14:
#ifdef F_INTERFACE
    testl INCX, INCX
    jge .L28

    movl N, %eax
    decl %eax
    imull INCX, %eax
    subl %eax, X
    ALIGN_3

.L28:
    testl INCY, INCY
    jge .L29

    movl N, %eax
    decl %eax
    imull INCY, %eax
    subl %eax, Y
    ALIGN_3
.L29:
#endif

/* Strided: N >> 2 iterations of four products each. */
    movl N, %eax
    sarl $2, %eax
    jle .L30
    ALIGN_3

.L31:
    FLD (X)
    addl INCX, X
    FMUL (Y)
    addl INCY, Y
    faddp %st,%st(1)
    FLD (X)
    addl INCX, X
    FMUL (Y)
    addl INCY, Y
    faddp %st,%st(2)
    FLD (X)
    addl INCX, X
    FMUL (Y)
    addl INCY, Y
    faddp %st,%st(3)
    FLD (X)
    addl INCX, X
    FMUL (Y)
    addl INCY, Y
    faddp %st,%st(4)
    decl %eax
    jg .L31
    ALIGN_3

/* Strided: the remaining N & 3 elements. */
.L30:
    movl N, %eax
    andl $3, %eax
    jle .L27
    ALIGN_3

.L37:
    FLD (X)
    addl INCX, X
    FMUL (Y)
    addl INCY, Y
    faddp %st, %st(1)
    decl %eax
    jg .L37
    ALIGN_3

/* Fold the four partial sums into one and restore the saved registers. */
.L27:
    faddp %st,%st(2)
    faddp %st,%st(2)
    faddp %st,%st(1)

    popl %ebx
    popl %esi
    popl %edi
    ret

    EPILOGUE
使用方法:
测试DDOT函数
/*
 * Executes the RDTSC instruction through MSVC inline assembly.
 *
 * NOTE(review): the function has no return statement; it presumably relies
 * on the compiler returning whatever RDTSC left in EAX (the low 32 bits of
 * the counter) -- confirm against the MSVC inline assembler documentation.
 */
unsigned int longGetCycleCount()
{
    __asm RDTSC
}

/*
 * Calls cblas_ddot 100000 times on two fixed 19-element vectors with unit
 * increments, prints the last result and reports SUCCESS when it is within
 * 0.0001 of 12.1.
 *
 * The entries of X sum to 1 and every entry of Y is 12.1, so the expected
 * dot product is 12.1.
 */
void test_ddot()
{
    int i;
    int n = 19;
    int inc1 = 1;
    int inc2 = 1;
    double result;
    double X[] = { 1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1 };
    double Y[] = { 12.1, 12.1, 12.1, 12.1, 12.1, 12.1, 12.1, 12.1, 12.1, 12.1, 12.1, 12.1, 12.1, 12.1, 12.1, 12.1, 12.1, 12.1, 12.1 };
    char functionName[] = "ddot";

    printf("================================================ \n");
    printf("Testing BLAS library function -- %s \n", functionName);

    /* The call is repeated only to make the run long enough to time;
       every iteration computes the same value. */
    for( i = 0; i < 100000; i++ )
    {
        result = cblas_ddot(n, X, inc1, Y, inc2);
    }

    printf("%s of the arraies is %f \n", functionName, result);

    if (fabs(12.1 - result) < 0.0001)
    {
        printf("%s test SUCCESS.\n", functionName);
    }
    else
    {
        printf("[Fail] %s test failed.\n", functionName);
    }

    printf("\n");
    return;
}

/*
 * Times one run of test_ddot() and prints the elapsed time.
 *
 * The counter difference is divided by 2900000000; presumably this is the
 * clock rate in Hz of the 2.90 GHz test machine, which makes the printed
 * value a time in seconds -- adjust for other CPUs.
 * NOTE(review): start and end are 32-bit, so the difference is only
 * meaningful while the timed section stays below 2^32 counter ticks.
 */
int main( void )
{
    unsigned int start,end;
    double cost;

    start = longGetCycleCount();
    test_ddot() ;
    end = longGetCycleCount();

    cost = (double) (end-start) / 2900000000 ;
    printf("%f \n",cost);

    return 0;
}
未开启速度优化选项时,编译得到的程序运行时间约为 0.005 秒。
开启速度优化选项时,编译得到的程序运行时间约为 0.0005 秒。
开启编译器优化选项可使计算效率提高约 10 倍。
测试环境:
Visual Studio Express 2010
Operating System:Windows 7, 64-bit
CPU:Intel Core(TM) CPU 2.90GHz
Memory: 4.00GB
Hard disk: 500G
参考文献
[1] http://www.applied-mathematics.net/miniSSEL1BLAS/miniSSEL1BLAS.pdf
[2] https://software.intel.com/en-us/articles/use-intriniscs/
[3] Hadi Brais. Compilers - What Every Programmer Should Know About Compiler Optimizations. https://msdn.microsoft.com/en-us/magazine/dn904673.aspx. February 2015.
[4] Koushik Ghosh. Writing Efficient C and C Code Optimization. http://www.codeproject.com/Articles/6154/Writing-Efficient-C-and-C-Code-Optimization. 26 Feb 2004.
[5] http://sci.tuomastonteri.fi/programming/sse
[6] https://www.kernel.org/pub/linux/kernel/people/geoff/cell/ps3-linux-docs/CellProgrammingTutorial/BasicsOfSIMDProgramming.html
相关文章推荐
- JAVA技术交流群
- C语言-常用知识和技巧
- 设置线程的优先级
- codeforces 567c
- mysql 高可用6
- swift2.0学习之类型转换
- macbook Yosemite 10.10.4 编译Android 5.0
- Python图像处理(16):图像金字塔
- Process.start: how to get the output?
- Struts2知多少(2) Struts2 是什么
- 第一篇Dojo的碎碎念
- 指针和数组的区别
- “闯红灯、戴绿帽”的最大问题并非“绿帽子”
- hdu 1085 - Holding Bin-Laden Captive!
- Spark编译与部署
- 问题:FF中把UL下的LI设为左浮动UL的背景色就没有了?
- Fragment的进一步使用(一)
- maven学习
- (素材源码)swanzhu学IOS(二)UI之_NSTimer
- DataTemplate和ControlTemplate联系与区别