您的位置:首页 > 其它

学习BLAS库 -- DDOT

2015-08-06 21:59 495 查看
DDOT( N, X, INCX, Y, INCY)

函数名释义

D - Double precision

DOT - Dot production

功能

computes the dot product of two double precision vectors.

dot <- X^{T}Y

实现

FORTRAN语言版DDOT函数代码:

DOUBLE PRECISION FUNCTION DDOT(N,DX,INCX,DY,INCY)
*     .. Scalar Arguments ..
INTEGER INCX,INCY,N
*     ..
*     .. Array Arguments ..
DOUBLE PRECISION DX(*),DY(*)
*     ..
*
*  Purpose
*  =======
*
*     forms the dot product of two vectors.
*     uses unrolled loops for increments equal to one.
*     jack dongarra, linpack, 3/11/78.
*     modified 12/3/93, array(1) declarations changed to array(*)
*
*
*     .. Local Scalars ..
DOUBLE PRECISION DTEMP
INTEGER I,IX,IY,M,MP1
*     ..
*     .. Intrinsic Functions ..
INTRINSIC MOD
*     ..
DDOT = 0.0d0
DTEMP = 0.0d0
IF (N.LE.0) RETURN
IF (INCX.EQ.1 .AND. INCY.EQ.1) GO TO 20
*
*        code for unequal increments or equal increments
*          not equal to 1
*
IX = 1
IY = 1
IF (INCX.LT.0) IX = (-N+1)*INCX + 1
IF (INCY.LT.0) IY = (-N+1)*INCY + 1
DO 10 I = 1,N
DTEMP = DTEMP + DX(IX)*DY(IY)
IX = IX + INCX
IY = IY + INCY
10 CONTINUE
DDOT = DTEMP
RETURN
*
*        code for both increments equal to 1
*
*
*        clean-up loop
*
20 M = MOD(N,5)
IF (M.EQ.0) GO TO 40
DO 30 I = 1,M
DTEMP = DTEMP + DX(I)*DY(I)
30 CONTINUE
IF (N.LT.5) GO TO 60
40 MP1 = M + 1
DO 50 I = MP1,N,5
DTEMP = DTEMP + DX(I)*DY(I) + DX(I+1)*DY(I+1) +
+            DX(I+2)*DY(I+2) + DX(I+3)*DY(I+3) + DX(I+4)*DY(I+4)
50 CONTINUE
60 DDOT = DTEMP
RETURN
END


由FORTRAN语言版翻译的C语言版DDOT函数代码:

/*
* cblas_ddot.c
*
* The program is a C version of ddot.
* All right reserved.
*/
double cblas_ddot(const int N, const double *X, const int incX,
const double *Y, const int incY)
{
int i;
int m;
int mp1;
double result = 0.0;
if (N < 0)
{
return result;
}

if ((incX != 1) || (incY != 1))
{
int ix = 1;
int iy = 1;
if (incX < 0)
{
ix = (1 - N) * incX + 1;
}
if (incY < 0)
{
iy = (1 - N) * incY + 1;
}
for (i = 0; i < N; i++)
{
result += Y[iy] * X[ix];
ix = ix + incX;
iy = iy + incY;
}
return result;
}

m = (N % 5);
if (0 != m)
{
for (i = 0; i < m; i++)
{
result += Y[i] * X[i];
}
if (N < 5)
{
return result;
}
}

mp1 = m ;
for (i = mp1; i < N; i += 5)
{
result += (Y[i] * X[i] + Y[i + 1] * X[i + 1] + Y[i + 2] * X[i + 2]
+ Y[i + 3] * X[i + 3] + Y[i + 4] * X[i + 4]);

}
return result;

}


VC 编译器生成的DDOT函数汇编语言版代码

编译参数:

cl main.c


_TEXT	SEGMENT
_iy$4493 = -32						; size = 4
_ix$4492 = -28						; size = 4
_m$ = -24						; size = 4
_mp1$ = -20						; size = 4
_result$ = -16						; size = 8
_i$ = -4						; size = 4
_N$ = 8							; size = 4
_X$ = 12						; size = 4
_incX$ = 16						; size = 4
_Y$ = 20						; size = 4
_incY$ = 24						; size = 4
_cblas_ddot PROC
; Line 15
push	ebp
mov	ebp, esp
sub	esp, 32					; 00000020H
push	esi
; Line 18
mov	DWORD PTR _mp1$[ebp], 0
; Line 19
fldz
fstp	QWORD PTR _result$[ebp]
; Line 20
cmp	DWORD PTR _N$[ebp], 0
jge	SHORT $LN16@cblas_ddot
; Line 22
fld	QWORD PTR _result$[ebp]
jmp	$LN17@cblas_ddot
$LN16@cblas_ddot:
; Line 26
cmp	DWORD PTR _incX$[ebp], 1
jne	SHORT $LN14@cblas_ddot
cmp	DWORD PTR _incY$[ebp], 1
je	$LN15@cblas_ddot
$LN14@cblas_ddot:
; Line 28
mov	DWORD PTR _ix$4492[ebp], 1
; Line 29
mov	DWORD PTR _iy$4493[ebp], 1
; Line 30
cmp	DWORD PTR _incX$[ebp], 0
jge	SHORT $LN13@cblas_ddot
; Line 32
mov	eax, 1
sub	eax, DWORD PTR _N$[ebp]
imul	eax, DWORD PTR _incX$[ebp]
add	eax, 1
mov	DWORD PTR _ix$4492[ebp], eax
$LN13@cblas_ddot:
; Line 34
cmp	DWORD PTR _incY$[ebp], 0
jge	SHORT $LN12@cblas_ddot
; Line 36
mov	ecx, 1
sub	ecx, DWORD PTR _N$[ebp]
imul	ecx, DWORD PTR _incY$[ebp]
add	ecx, 1
mov	DWORD PTR _iy$4493[ebp], ecx
$LN12@cblas_ddot:
; Line 38
mov	DWORD PTR _i$[ebp], 0
jmp	SHORT $LN11@cblas_ddot
$LN10@cblas_ddot:
mov	edx, DWORD PTR _i$[ebp]
add	edx, 1
mov	DWORD PTR _i$[ebp], edx
$LN11@cblas_ddot:
mov	eax, DWORD PTR _i$[ebp]
cmp	eax, DWORD PTR _N$[ebp]
jge	SHORT $LN9@cblas_ddot
; Line 40
mov	ecx, DWORD PTR _iy$4493[ebp]
mov	edx, DWORD PTR _Y$[ebp]
mov	eax, DWORD PTR _ix$4492[ebp]
mov	esi, DWORD PTR _X$[ebp]
fld	QWORD PTR [edx+ecx*8]
fmul	QWORD PTR [esi+eax*8]
fadd	QWORD PTR _result$[ebp]
fstp	QWORD PTR _result$[ebp]
; Line 41
mov	ecx, DWORD PTR _ix$4492[ebp]
add	ecx, DWORD PTR _incX$[ebp]
mov	DWORD PTR _ix$4492[ebp], ecx
; Line 42
mov	edx, DWORD PTR _iy$4493[ebp]
add	edx, DWORD PTR _incY$[ebp]
mov	DWORD PTR _iy$4493[ebp], edx
; Line 43
jmp	SHORT $LN10@cblas_ddot
$LN9@cblas_ddot:
; Line 44
fld	QWORD PTR _result$[ebp]
jmp	$LN17@cblas_ddot
$LN15@cblas_ddot:
; Line 47
mov	eax, DWORD PTR _N$[ebp]
cdq
mov	ecx, 5
idiv	ecx
mov	DWORD PTR _m$[ebp], edx
; Line 49
cmp	DWORD PTR _m$[ebp], 0
je	SHORT $LN4@cblas_ddot
; Line 51
mov	DWORD PTR _i$[ebp], 0
jmp	SHORT $LN7@cblas_ddot
$LN6@cblas_ddot:
mov	edx, DWORD PTR _i$[ebp]
add	edx, 1
mov	DWORD PTR _i$[ebp], edx
$LN7@cblas_ddot:
mov	eax, DWORD PTR _i$[ebp]
cmp	eax, DWORD PTR _m$[ebp]
jge	SHORT $LN5@cblas_ddot
; Line 53
mov	ecx, DWORD PTR _i$[ebp]
mov	edx, DWORD PTR _Y$[ebp]
mov	eax, DWORD PTR _i$[ebp]
mov	esi, DWORD PTR _X$[ebp]
fld	QWORD PTR [edx+ecx*8]
fmul	QWORD PTR [esi+eax*8]
fadd	QWORD PTR _result$[ebp]
fstp	QWORD PTR _result$[ebp]
; Line 54
jmp	SHORT $LN6@cblas_ddot
$LN5@cblas_ddot:
; Line 55
cmp	DWORD PTR _N$[ebp], 5
jge	SHORT $LN4@cblas_ddot
; Line 57
fld	QWORD PTR _result$[ebp]
jmp	$LN17@cblas_ddot
$LN4@cblas_ddot:
; Line 61
mov	ecx, DWORD PTR _m$[ebp]
mov	DWORD PTR _mp1$[ebp], ecx
; Line 63
mov	edx, DWORD PTR _mp1$[ebp]
mov	DWORD PTR _i$[ebp], edx
jmp	SHORT $LN3@cblas_ddot
$LN2@cblas_ddot:
mov	eax, DWORD PTR _i$[ebp]
add	eax, 5
mov	DWORD PTR _i$[ebp], eax
$LN3@cblas_ddot:
mov	ecx, DWORD PTR _i$[ebp]
cmp	ecx, DWORD PTR _N$[ebp]
jge	SHORT $LN1@cblas_ddot
; Line 66
mov	edx, DWORD PTR _i$[ebp]
mov	eax, DWORD PTR _Y$[ebp]
mov	ecx, DWORD PTR _i$[ebp]
mov	esi, DWORD PTR _X$[ebp]
fld	QWORD PTR [eax+edx*8]
fmul	QWORD PTR [esi+ecx*8]
mov	edx, DWORD PTR _i$[ebp]
mov	eax, DWORD PTR _Y$[ebp]
mov	ecx, DWORD PTR _i$[ebp]
mov	esi, DWORD PTR _X$[ebp]
fld	QWORD PTR [eax+edx*8+8]
fmul	QWORD PTR [esi+ecx*8+8]
faddp	ST(1), ST(0)
mov	edx, DWORD PTR _i$[ebp]
mov	eax, DWORD PTR _Y$[ebp]
mov	ecx, DWORD PTR _i$[ebp]
mov	esi, DWORD PTR _X$[ebp]
fld	QWORD PTR [eax+edx*8+16]
fmul	QWORD PTR [esi+ecx*8+16]
faddp	ST(1), ST(0)
mov	edx, DWORD PTR _i$[ebp]
mov	eax, DWORD PTR _Y$[ebp]
mov	ecx, DWORD PTR _i$[ebp]
mov	esi, DWORD PTR _X$[ebp]
fld	QWORD PTR [eax+edx*8+24]
fmul	QWORD PTR [esi+ecx*8+24]
faddp	ST(1), ST(0)
mov	edx, DWORD PTR _i$[ebp]
mov	eax, DWORD PTR _Y$[ebp]
mov	ecx, DWORD PTR _i$[ebp]
mov	esi, DWORD PTR _X$[ebp]
fld	QWORD PTR [eax+edx*8+32]
fmul	QWORD PTR [esi+ecx*8+32]
faddp	ST(1), ST(0)
fadd	QWORD PTR _result$[ebp]
fstp	QWORD PTR _result$[ebp]
; Line 71
jmp	$LN2@cblas_ddot
$LN1@cblas_ddot:
; Line 73
fld	QWORD PTR _result$[ebp]
$LN17@cblas_ddot:
; Line 75
pop	esi
mov	esp, ebp
pop	ebp
ret	0
_cblas_ddot ENDP
_TEXT	ENDS


启动编译器优化选项

编译参数:

cl main.c /FA /O2 /arch:SSE2


VC 编译器生成的DDOT函数汇编语言速度优化版代码

;	COMDAT _cblas_ddot
_TEXT	SEGMENT
_i$ = -12						; size = 4
_result$ = -8						; size = 8
tv1489 = 8						; size = 4
tv683 = 8						; size = 4
_N$ = 8							; size = 4
_X$ = 12						; size = 4
_incX$ = 16						; size = 4
_Y$ = 20						; size = 4
_incY$ = 24						; size = 4
_cblas_ddot PROC					; COMDAT
; Line 15
sub	esp, 12					; 0000000cH
; Line 19
xorps	xmm1, xmm1
push	edi
; Line 20
mov	edi, DWORD PTR _N$[esp+12]
movsd	QWORD PTR _result$[esp+16], xmm1
test	edi, edi
jns	SHORT $LN16@cblas_ddot
; Line 22
fldz
pop	edi
; Line 75
add	esp, 12					; 0000000cH
ret	0
$LN16@cblas_ddot:
; Line 26
mov	edx, DWORD PTR _incX$[esp+12]
push	ebx
push	ebp
push	esi
mov	esi, DWORD PTR _incY$[esp+24]
cmp	edx, 1
jne	$LN14@cblas_ddot
cmp	esi, 1
jne	$LN14@cblas_ddot
; Line 49
mov	ebp, DWORD PTR _Y$[esp+24]
mov	eax, 1717986919				; 66666667H
imul	edi
sar	edx, 1
mov	eax, edx
shr	eax, 31					; 0000001fH
add	eax, edx
lea	eax, DWORD PTR [eax+eax*4]
mov	ebx, edi
sub	ebx, eax
je	$LN32@cblas_ddot
; Line 51
xor	esi, esi
cmp	ebx, 4
jl	SHORT $LC26@cblas_ddot
mov	edx, DWORD PTR _X$[esp+24]
lea	ecx, DWORD PTR [edx+24]
; Line 53
sub	edx, ebp
mov	DWORD PTR tv683[esp+24], edx
lea	edx, DWORD PTR [ebx-4]
shr	edx, 2
inc	edx
lea	esi, DWORD PTR [edx*4]
mov	DWORD PTR _i$[esp+28], esi
mov	esi, DWORD PTR tv683[esp+24]
lea	eax, DWORD PTR [ebp+8]
npad	9
$LL27@cblas_ddot:
movsd	xmm0, QWORD PTR [eax-8]
mulsd	xmm0, QWORD PTR [ecx-24]
addsd	xmm0, xmm1
movsd	xmm1, QWORD PTR [esi+eax]
mulsd	xmm1, QWORD PTR [eax]
addsd	xmm0, xmm1
movsd	xmm1, QWORD PTR [eax+8]
mulsd	xmm1, QWORD PTR [ecx-8]
addsd	xmm0, xmm1
movsd	xmm1, QWORD PTR [eax+16]
mulsd	xmm1, QWORD PTR [ecx]
add	eax, 32					; 00000020H
add	ecx, 32					; 00000020H
dec	edx
addsd	xmm1, xmm0
jne	SHORT $LL27@cblas_ddot
mov	esi, DWORD PTR _i$[esp+28]
movsd	QWORD PTR _result$[esp+28], xmm1
$LC26@cblas_ddot:
; Line 51
cmp	esi, ebx
jge	SHORT $LN31@cblas_ddot
mov	edx, DWORD PTR _X$[esp+24]
mov	ecx, ebx
sub	edx, ebp
lea	eax, DWORD PTR [ebp+esi*8]
sub	ecx, esi
npad	5
$LC7@cblas_ddot:
; Line 53
movsd	xmm0, QWORD PTR [eax+edx]
mulsd	xmm0, QWORD PTR [eax]
add	eax, 8
dec	ecx
addsd	xmm1, xmm0
jne	SHORT $LC7@cblas_ddot
movsd	QWORD PTR _result$[esp+28], xmm1
$LN31@cblas_ddot:
; Line 55
cmp	edi, 5
; Line 57
jl	$LN28@cblas_ddot
$LN32@cblas_ddot:
; Line 63
cmp	ebx, edi
jge	$LN28@cblas_ddot
mov	eax, DWORD PTR _X$[esp+24]
lea	esi, DWORD PTR [eax+ebx*8+24]
sub	eax, ebp
sub	edi, ebx
mov	DWORD PTR tv683[esp+24], eax
dec	edi
mov	eax, -858993459				; cccccccdH
mul	edi
shr	edx, 2
lea	ecx, DWORD PTR [ebp+ebx*8+8]
inc	edx
npad	6
$LL3@cblas_ddot:
; Line 66
mov	eax, DWORD PTR tv683[esp+24]
movsd	xmm0, QWORD PTR [ecx-8]
mulsd	xmm0, QWORD PTR [esi-24]
movsd	xmm2, QWORD PTR [eax+ecx]
mulsd	xmm2, QWORD PTR [ecx]
addsd	xmm0, xmm2
movsd	xmm2, QWORD PTR [ecx+8]
mulsd	xmm2, QWORD PTR [esi-8]
addsd	xmm0, xmm2
movsd	xmm2, QWORD PTR [ecx+16]
mulsd	xmm2, QWORD PTR [esi]
addsd	xmm0, xmm2
movsd	xmm2, QWORD PTR [ecx+24]
mulsd	xmm2, QWORD PTR [esi+8]
add	ecx, 40					; 00000028H
add	esi, 40					; 00000028H
dec	edx
addsd	xmm0, xmm2
addsd	xmm1, xmm0
jne	SHORT $LL3@cblas_ddot
; Line 40
pop	esi
pop	ebp
movsd	QWORD PTR _result$[esp+20], xmm1
; Line 44
fld	QWORD PTR _result$[esp+20]
pop	ebx
pop	edi
; Line 75
add	esp, 12					; 0000000cH
ret	0
$LN14@cblas_ddot:
; Line 28
mov	eax, 1
; Line 29
mov	ecx, eax
; Line 30
test	edx, edx
jns	SHORT $LN13@cblas_ddot
; Line 32
sub	eax, edi
imul	eax, edx
inc	eax
$LN13@cblas_ddot:
; Line 34
test	esi, esi
jns	SHORT $LN12@cblas_ddot
; Line 36
sub	ecx, edi
imul	ecx, esi
inc	ecx
$LN12@cblas_ddot:
; Line 38
mov	ebx, DWORD PTR _Y$[esp+24]
xor	ebp, ebp
cmp	edi, 4
jl	SHORT $LC29@cblas_ddot
lea	ebp, DWORD PTR [edi-4]
shr	ebp, 2
inc	ebp
mov	DWORD PTR tv1489[esp+24], ebp
add	ebp, ebp
add	ebp, ebp
mov	DWORD PTR _i$[esp+28], ebp
npad	4
$LL30@cblas_ddot:
; Line 40
mov	ebp, DWORD PTR _X$[esp+24]
movsd	xmm0, QWORD PTR [ebp+eax*8]
mulsd	xmm0, QWORD PTR [ebx+ecx*8]
addsd	xmm0, xmm1
; Line 41
add	eax, edx
movsd	xmm1, QWORD PTR [ebp+eax*8]
; Line 42
add	ecx, esi
mulsd	xmm1, QWORD PTR [ebx+ecx*8]
addsd	xmm1, xmm0
add	eax, edx
movsd	xmm0, QWORD PTR [ebp+eax*8]
add	ecx, esi
mulsd	xmm0, QWORD PTR [ebx+ecx*8]
add	eax, edx
addsd	xmm0, xmm1
movsd	xmm1, QWORD PTR [ebp+eax*8]
add	ecx, esi
mulsd	xmm1, QWORD PTR [ebx+ecx*8]
add	eax, edx
add	ecx, esi
dec	DWORD PTR tv1489[esp+24]
addsd	xmm1, xmm0
jne	SHORT $LL30@cblas_ddot
; Line 40
mov	ebp, DWORD PTR _i$[esp+28]
movsd	QWORD PTR _result$[esp+28], xmm1
$LC29@cblas_ddot:
; Line 38
cmp	ebp, edi
jge	SHORT $LN28@cblas_ddot
add	esi, esi
add	edx, edx
add	esi, esi
add	edx, edx
lea	ecx, DWORD PTR [ebx+ecx*8]
mov	ebx, DWORD PTR _X$[esp+24]
add	esi, esi
add	edx, edx
lea	eax, DWORD PTR [ebx+eax*8]
sub	edi, ebp
npad	4
$LC11@cblas_ddot:
; Line 40
movsd	xmm0, QWORD PTR [ecx]
mulsd	xmm0, QWORD PTR [eax]
; Line 41
add	eax, edx
; Line 42
add	ecx, esi
dec	edi
addsd	xmm1, xmm0
jne	SHORT $LC11@cblas_ddot
; Line 40
movsd	QWORD PTR _result$[esp+28], xmm1
$LN28@cblas_ddot:
; Line 44
fld	QWORD PTR _result$[esp+28]
pop	esi
pop	ebp
pop	ebx
pop	edi
; Line 75
add	esp, 12					; 0000000cH
ret	0
_cblas_ddot ENDP
_TEXT	ENDS


GotoBLAS2库中DOT函数汇编语言代码

/*********************************************************************/
/*                                                                   */
/*             Optimized BLAS libraries                              */
/*                     By Kazushige Goto <kgoto@tacc.utexas.edu>     */
/*                                                                   */
/* Copyright (c) The University of Texas, 2009. All rights reserved. */
/* UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING  */
/* THIS SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF      */
/* MERCHANTABILITY, FITNESS FOR ANY PARTICULAR PURPOSE,              */
/* NON-INFRINGEMENT AND WARRANTIES OF PERFORMANCE, AND ANY WARRANTY  */
/* THAT MIGHT OTHERWISE ARISE FROM COURSE OF DEALING OR USAGE OF     */
/* TRADE. NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH RESPECT TO   */
/* THE USE OF THE SOFTWARE OR DOCUMENTATION.                         */
/* Under no circumstances shall University be liable for incidental, */
/* special, indirect, direct or consequential damages or loss of     */
/* profits, interruption of business, or related expenses which may  */
/* arise from use of Software or Documentation, including but not    */
/* limited to those resulting from defects in Software and/or        */
/* Documentation, or loss or inaccuracy of data of any kind.         */
/*********************************************************************/

#define ASSEMBLER
#include "common.h"

#define STACK	12
#define ARGS     0

#define STACK_N		 4 + STACK + ARGS(%esp)
#define STACK_X		 8 + STACK + ARGS(%esp)
#define STACK_INCX	12 + STACK + ARGS(%esp)
#define STACK_Y		16 + STACK + ARGS(%esp)
#define STACK_INCY	20 + STACK + ARGS(%esp)

#define N	%ebx
#define X	%esi
#define INCX	%ecx
#define Y	%edi
#define INCY	%edx

PROLOGUE

pushl	%edi
pushl	%esi
pushl	%ebx

PROFCODE

#if defined(F_INTERFACE_GFORT) || defined(F_INTERFACE_G95)
EMMS
#endif

movl	STACK_N,     N
movl	STACK_X,     X
movl	STACK_INCX,  INCX
movl	STACK_Y,     Y
movl	STACK_INCY,  INCY

#ifdef F_INTERFACE
movl	(N),N
movl	(INCX),INCX
movl	(INCY),INCY
#endif

leal	(, INCX, SIZE), INCX
leal	(, INCY, SIZE), INCY

fldz
fldz
fldz
fldz

cmpl	$SIZE, INCX
jne	.L14
cmpl	$SIZE, INCY
jne	.L14

movl	N, %eax
sarl	$2,   %eax
jle	.L15
ALIGN_3

.L16:
FLD	0 * SIZE(X)
FMUL	0 * SIZE(Y)
faddp	%st,%st(1)
FLD	1 * SIZE(X)
FMUL	1 * SIZE(Y)
faddp	%st,%st(2)
FLD	2 * SIZE(X)
FMUL	2 * SIZE(Y)
faddp	%st,%st(3)
FLD	3 * SIZE(X)
FMUL	3 * SIZE(Y)
faddp	%st,%st(4)
addl	$4 * SIZE, X
addl	$4 * SIZE, Y
decl	%eax
jg	.L16
ALIGN_3

.L15:
movl	N, %eax
andl	$3,   %eax
jle	.L27
ALIGN_3

.L22:
FLD	(X)
addl	$SIZE, X
FMUL	(Y)
addl	$SIZE, Y
faddp	%st,%st(1)
decl	%eax
jg	.L22

jmp	.L27
ALIGN_3

.L14:
#ifdef F_INTERFACE
testl	INCX, INCX
jge	.L28

movl	N, %eax
decl	%eax
imull	INCX, %eax
subl	%eax, X
ALIGN_3

.L28:
testl	INCY, INCY
jge	.L29

movl	N, %eax
decl	%eax
imull	INCY, %eax
subl	%eax, Y
ALIGN_3
.L29:
#endif
movl	N, %eax
sarl	$2,   %eax
jle	.L30
ALIGN_3

.L31:
FLD	(X)
addl	INCX, X
FMUL	(Y)
addl	INCY, Y
faddp	%st,%st(1)

FLD	(X)
addl	INCX, X
FMUL	(Y)
addl	INCY, Y
faddp	%st,%st(2)

FLD	(X)
addl	INCX, X
FMUL	(Y)
addl	INCY, Y
faddp	%st,%st(3)

FLD	(X)
addl	INCX, X
FMUL	(Y)
addl	INCY, Y
faddp	%st,%st(4)

decl	%eax
jg	.L31
ALIGN_3

.L30:
movl	N, %eax
andl	$3,   %eax
jle	.L27
ALIGN_3

.L37:
FLD	(X)
addl	INCX, X
FMUL	(Y)
addl	INCY, Y
faddp	%st, %st(1)
decl	%eax
jg	.L37
ALIGN_3

.L27:
faddp	%st,%st(2)
faddp	%st,%st(2)
faddp	%st,%st(1)

popl	%ebx
popl	%esi
popl	%edi
ret

EPILOGUE


使用方法

测试DDOT函数

unsigned int longGetCycleCount()
{
__asm RDTSC
}

void test_ddot()
{
int i;
int n = 19;
int inc1 = 1;
int inc2 = 1;
double result;
double X[] =
{  1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1 };

double Y[] =
{ 12.1, 12.1, 12.1, 12.1, 12.1, 12.1, 12.1, 12.1, 12.1, 12.1, 12.1, 12.1, 12.1, 12.1, 12.1, 12.1, 12.1, 12.1, 12.1 };

char functionName[] = "ddot";
printf("================================================ \n");
printf("Testing BLAS library function -- %s \n", functionName);

for( i = 0; i < 100000; i++ )
{
result = cblas_ddot(n, X, inc1, Y, inc2);
}

printf("%s  of the arraies is %f \n", functionName, result);

if (fabs(12.1 - result) < 0.0001)
{
printf("%s test SUCCESS.\n", functionName);
}
else
{
printf("[Fail] %s test failed.\n", functionName);
}

printf("\n");
return;
}

int main( void )
{

unsigned int start,end;
double cost;

start = longGetCycleCount();
test_ddot() ;
end = longGetCycleCount();

cost =  (double)  (end-start) / 2900000000 ;
printf("%f \n",cost);

return 0;
}


未开启速度优化选项编译得到的运算时间 为0.005 秒左右。

开启速度优化选项编译得到的运算时间 为0.0005 秒左右。

开启编译器优化选项可使计算效率提高10倍左右。

测试环境

Visual Studio Express 2010

Operating System:Windows 7, 64-bit

CPU:Intel Core(TM) CPU 2.90GHz

Memory: 4.00GB

Hard disk: 500G

参考文献

[1] http://www.applied-mathematics.net/miniSSEL1BLAS/miniSSEL1BLAS.pdf
[2] https://software.intel.com/en-us/articles/use-intriniscs/
[3] Hadi Brais. Compilers - What Every Programmer Should Know About Compiler Optimizations. https://msdn.microsoft.com/en-us/magazine/dn904673.aspx. February 2015.

[4] Koushik Ghosh. Writing Efficient C and C Code Optimization. http://www.codeproject.com/Articles/6154/Writing-Efficient-C-and-C-Code-Optimization. 26 Feb 2004.

[5] http://sci.tuomastonteri.fi/programming/sse

[6] https://www.kernel.org/pub/linux/kernel/people/geoff/cell/ps3-linux-docs/CellProgrammingTutorial/BasicsOfSIMDProgramming.html
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: