您的位置：首页 > 其它

图形图像处理－之－高质量的快速的图像缩放 补充 使用SSE2优化

2011-04-12 20:36 423 查看

图形图像处理－之－高质量的快速的图像缩放 补充 使用SSE2优化


HouSisong@GMail.com

   2011.04.12

tag:
图像缩放,速度优化,线性插值,三次卷积插值,SSE2,scale,bilinear,bicubic,StretchBlt

摘要:

本文章对线性插值和三次卷积插值(bicubic)的实现做了一些新的优化尝试;

使用了SSE2的128bit寄存器及相关指令;并预先建立SSE2用到的缩放系数表;

实现的结果在我的i7电脑上比以前的版本分别快出145%和75%!

线性插值的速度是StretchBlt的13倍!

正文:

(请先看看我的blog里<高质量的快速的图像缩放>的前3篇文章!)

支持SSE2指令集的CPU越来越多,CPU的SSE2实现性能也好了很多(以前不比MMX好多少),

而且软件在64位模式的时候不再支持MMX,所以尝试了SSE2的缩放优化,效果不错!

速度测试说明:

只测试内存数据到内存数据的缩放

测试图片都是800*600缩放到1024*768,单线程;fps表示每秒钟的帧数,值越大表示函数越快.

速度测试对比:
(CPU:i7 920 内存:DDR3 1333 3通道)

(windows)

StretchBlt 近邻取样           869.09 fps

StretchBlt 线性插值            44.46 fps //SetStretchBltMode(dc,4);?

PicZoom0:                       95.69 fps

PicZoom1:                      158.35 fps

PicZoom2:                      332.78 fps

PicZoom3:                     1172.79 fps

PicZoom3_float:                874.13 fps

PicZoom3_Table:               1158.30 fps

PicZoom3_SSE:                 1908.40 fps

PicZoom_Bilinear0:              28.80 fps

PicZoom_Bilinear1:              56.09 fps

PicZoom_Bilinear2:              97.09 fps

PicZoom_Bilinear_Common:       119.83 fps

PicZoom_Bilinear_MMX:          180.12 fps

PicZoom_Bilinear_MMX_Ex:       237.34 fps

PicZoom_ftBilinear_Common:     118.67 fps

PicZoom_ftBilinear_MMX:        213.68 fps

PicZoom_ThreeOrder0:             6.11 fps

PicZoom_ThreeOrder_Common:      25.38 fps

PicZoom_ThreeOrder_MMX:         52.32 fps

(SSE2的实现)

PicZoom_ftBilinearTable_SSE2: 588.24 fps

PicZoom_ThreeOrderTable_SSE2:   93.24 fps

PicZoom_ftBilinearTable_SSE2实现代码如下:

// 64-bit unit used to build the coefficient tables: each entry holds one
// 16-bit weight replicated into all four 16-bit lanes.
typedef UInt64 TMMXData64;
// ftBilinearTable_SSE2: bilinear interpolation of ONE destination pixel.
// Written as a macro so that it expands inline inside the caller's asm block
// (see PicZoom_ftBilinearTable_SSE2) instead of being called.
//
// Register contract on entry:
//   ebx        negative byte offset of the destination pixel (counts up to 0)
//   edi+ebx    address of the destination pixel (output, 4 bytes)
//   edx+ebx    address of the xList entry (source column index)
//   ebp+ebx*4  address of the 16-byte weight pair for this column: (ur x4, u x4)
//   esi        current source line  (PSrcLineColor)
//   ecx        next source line     (PSrcLineColorNext)
//   xmm5       vertical weight for the current line (vr), in all 8 lanes
//   xmm6       vertical weight for the next line (v), in all 8 lanes
//   xmm7       zero
// Clobbers: eax, xmm0, xmm1.
//
// Steps: load two adjacent pixels from each line, widen bytes to words, blend
// vertically (pmullw/paddw), apply the horizontal weights (pmulhw), add the
// left and right halves, pack back to bytes and store one pixel.
#define  ftBilinearTable_SSE2()                     \
asm mov         eax,[edx+ebx]                   \
asm movq        xmm0,qword ptr[esi+eax*4]       \
asm movq        xmm1,qword ptr[ecx+eax*4]       \
asm punpcklbw   xmm0,xmm7                       \
asm punpcklbw   xmm1,xmm7                       \
asm pmullw      xmm0,xmm5                       \
asm pmullw      xmm1,xmm6                       \
asm paddw       xmm0,xmm1                       \
asm pmulhw      xmm0,xmmword ptr [ebp+ebx*4]    \
asm movdqa      xmm1,xmm0                       \
asm punpckhqdq  xmm0,xmm0                       \
asm paddw       xmm0,xmm1                       \
asm packuswb    xmm0,xmm7                       \
asm movd  dword ptr  [edi+ebx],xmm0
// ftBilinearTable_SSE2_expand2: bilinear interpolation of TWO consecutive
// destination pixels per expansion (unrolled variant of ftBilinearTable_SSE2).
// Written as a macro so that it expands inline inside the caller's asm block.
//
// Register contract on entry:
//   ebx            negative byte offset of the first destination pixel
//   edi+ebx        address of the two destination pixels (output, 8 bytes)
//   edx+ebx        address of two consecutive xList entries (source columns)
//   ebp+ebx*4      address of two consecutive 16-byte weight pairs (ur x4, u x4)
//   esi            current source line  (PSrcLineColor)
//   ecx            next source line     (PSrcLineColorNext)
//   xmm5           vertical weight for the current line (vr), in all 8 lanes
//   xmm6           vertical weight for the next line (v), in all 8 lanes
//   xmm7           zero
// Clobbers: eax, xmm0, xmm1, xmm2, xmm3.
//
// xmm0/xmm1 carry the first pixel and xmm2/xmm3 the second. After the weights
// are applied, punpcklqdq/punpckhqdq regroup the left-column and right-column
// contributions of both pixels so that one paddw completes both results.
#define  ftBilinearTable_SSE2_expand2()             \
asm mov         eax,[edx+ebx]                   \
asm movq        xmm0,qword ptr[esi+eax*4]       \
asm movq        xmm1,qword ptr[ecx+eax*4]       \
asm mov         eax,[edx+ebx+4]                 \
asm movq        xmm2,qword ptr[esi+eax*4]       \
asm movq        xmm3,qword ptr[ecx+eax*4]       \
asm punpcklbw   xmm0,xmm7                       \
asm punpcklbw   xmm1,xmm7                       \
asm punpcklbw   xmm2,xmm7                       \
asm punpcklbw   xmm3,xmm7                       \
asm pmullw      xmm0,xmm5                       \
asm pmullw      xmm1,xmm6                       \
asm pmullw      xmm2,xmm5                       \
asm pmullw      xmm3,xmm6                       \
asm paddw       xmm0,xmm1                       \
asm paddw       xmm2,xmm3                       \
asm pmulhw      xmm0,xmmword ptr [ebp+ebx*4]    \
asm pmulhw      xmm2,xmmword ptr [ebp+ebx*4+16] \
asm movdqa      xmm1,xmm0                       \
asm punpcklqdq  xmm0,xmm2                       \
asm punpckhqdq  xmm1,xmm2                       \
asm paddw       xmm0,xmm1                       \
asm packuswb    xmm0,xmm7                       \
asm movq  qword ptr  [edi+ebx],xmm0

// Scales Src into Dst with bilinear interpolation, using SSE2 and per-column
// tables that are computed once and reused for every destination row.
//   Dst  destination picture (written).
//   Src  source picture (read); must be at least 2x2.
// Returns immediately, without writing anything, when Dst has a zero dimension
// or Src is smaller than 2 pixels in either dimension.
void PicZoom_ftBilinearTable_SSE2(const TPixels32Ref& Dst,const TPixels32Ref& Src)
{
if (  (0==Dst.width)||(0==Dst.height)
||(2>Src.width)||(2>Src.height)) return;
// 16.16 fixed-point source step per destination pixel. Using (size-1) keeps
// the right/bottom neighbour read by the interpolation inside the source.
long xrIntFloat_16=((Src.width-1)<<16)/Dst.width;
long yrIntFloat_16=((Src.height-1)<<16)/Dst.height;
long dst_width=Dst.width;
// One allocation holds both tables: uList (two 64-bit entries per column, with
// 15 spare bytes for alignment) followed by xList (one Int32 per column).
UInt8* _bufMem=new UInt8[(dst_width*2*sizeof(TMMXData64)+15)+dst_width*sizeof(Int32)];
TMMXData64* uList=(TMMXData64*)((((ptrdiff_t)_bufMem)+15)>>4<<4); //round up to a 16-byte boundary
Int32* xList=(Int32*)(uList+dst_width*2);
{//init u table
long srcx_16=0;
for (long x=0;x<dst_width*2;x+=2){
// integer source column for destination column x/2
xList[x>>1]=(srcx_16>>16);
// 8-bit horizontal fraction; both weights are doubled here, while the
// vertical weights below are halved
unsigned long u=(srcx_16>>8)&0xFF;
unsigned long ur=(256-u)<<1;
u=u<<1;
// replicate each weight into four 16-bit lanes: entry x+0 is the weight of
// the left pixel (ur), entry x+1 the weight of the right pixel (u)
uList[x+0]=(ur|(ur<<16));
uList[x+0]|=uList[x+0]<<32;
uList[x+1]=u|(u<<16);
uList[x+1]|=uList[x+1]<<32;
srcx_16+=xrIntFloat_16;
}
}
Color32* pDstLine=Dst.pdata;
long srcy_16=0;
asm pxor  xmm7,xmm7 //xmm7=0
for (long y=0;y<Dst.height;++y){
// 8-bit vertical fraction; both weights are halved to 7 bits before being
// multiplied with the 8-bit channels by pmullw in the macros
unsigned long v=(srcy_16>>8) & 0xFF;
unsigned long vr=(256-v)>>1;
v>>=1;
// the two source lines blended for this destination row
Color32* PSrcLineColor= (Color32*)((UInt8*)(Src.pdata)+Src.byte_width*(srcy_16>>16)) ;
Color32* PSrcLineColorNext= (Color32*)((UInt8*)(PSrcLineColor)+Src.byte_width) ;
asm{
// broadcast vr into every 16-bit lane of xmm5 and v into every lane of xmm6
movd        xmm5,vr
movd        xmm6,v
punpcklwd   xmm5,xmm5
punpcklwd   xmm6,xmm6
punpckldq   xmm5,xmm5
punpckldq   xmm6,xmm6
punpcklqdq  xmm5,xmm5
punpcklqdq  xmm6,xmm6

// load every local into a register before ebp is repurposed below
mov       esi,PSrcLineColor
mov       ecx,PSrcLineColorNext
mov       edx,xList //x
mov       ebx,dst_width
mov       edi,pDstLine
push      ebp
mov       ebp,uList
// keep the full width on the stack for the odd-pixel tail
push      ebx

// even part: round the width down to a multiple of 2
and       ebx,(not 1)
test      ebx,ebx
jle     end_loop2

// move the pointers to the end of the even part and index with a negative
// byte offset that counts up to zero (uList uses 16 bytes per pixel)
lea       ebx,[ebx*4]
lea       edi,[edi+ebx]
lea       edx,[edx+ebx]
lea       ebp,[ebp+ebx*4]
neg       ebx
loop2_start:
//call ftBilinearTable_SSE2_expand2
ftBilinearTable_SSE2_expand2()
add       ebx,8
jnz       loop2_start
end_loop2:
// tail: one remaining pixel when the width is odd
pop    ebx
and    ebx,1
test   ebx,ebx
jle    end_write
lea       ebx,[ebx*4]
lea       edi,[edi+ebx]
lea       edx,[edx+ebx]
lea       ebp,[ebp+ebx*4]
neg       ebx
loop1_start:
//call ftBilinearTable_SSE2
ftBilinearTable_SSE2()
add       ebx,4
jnz       loop1_start
end_write:
pop       ebp
}
// advance to the next destination row (byte_width is the row pitch in bytes)
srcy_16+=yrIntFloat_16;
((UInt8*&)pDstLine)+=Dst.byte_width;
}
delete []_bufMem;
}

PicZoom_ThreeOrderTable_SSE2实现代码如下:

// Kernel weight table for the three-order (bicubic) zoom: entry i holds
// SinXDivX(i/256) in fixed point (scale 1<<14), replicated into four 16-bit
// lanes so that it can be fed directly to pmulhw. Covers i = 0 .. 512.
static TMMXData64 SinXDivX_Table64_MMX[(2<<8)+1];
// Helper whose constructor fills SinXDivX_Table64_MMX; a single static
// instance is defined right after the class so the table is filled during
// static initialization.
class _CAutoInti_SinXDivX_Table64_MMX {
private:
    void _Inti_SinXDivX_Table64_MMX()
    {
        for (long i=0;i<=(2<<8);++i)
        {
            // Convert through a signed integer first: SinXDivX is defined
            // elsewhere and presumably returns negative weights for part of
            // its range, and converting a negative floating-point value
            // straight to an unsigned type is undefined behaviour. The
            // signed -> unsigned short step is a well-defined modulo
            // conversion that keeps the two's-complement bit pattern pmulhw
            // (signed multiply) expects.
            const long fixedWeight=(long)(0.5+(1<<14)*SinXDivX(i*(1.0/(256))));
            const unsigned short t=(unsigned short)fixedWeight;
            // replicate the 16-bit weight into all four lanes of the entry
            unsigned long tl=t|(((unsigned long)t)<<16);
            TMMXData64 tll=tl|(((TMMXData64)tl)<<32);
            SinXDivX_Table64_MMX[i]=tll;
        }
    }
public:
    _CAutoInti_SinXDivX_Table64_MMX() { _Inti_SinXDivX_Table64_MMX(); }
};
static _CAutoInti_SinXDivX_Table64_MMX __tmp_CAutoInti_SinXDivX_Table64_MMX;
// _private_ThreeOrderTable_Fast_SSE2_2: weights TWO rows of a 4x4 pixel window.
// Written as a macro so that it expands inline in ThreeOrderTable_Fast_SSE2.
//
// Register contract on entry:
//   eax   address of the first row (4 pixels, 16 bytes)
//   edx   distance in bytes from one row to the next
//   ecx   four horizontal weights (32 bytes, one 64-bit entry per column)
//   ebx   two vertical weights: [ebx] for row eax, [ebx+16] for row eax+edx
//   xmm7  zero
// Result: xmm0 holds the two rows' weighted sum as 16-bit lanes; the low and
// high 64-bit halves still have to be added together by the caller.
// Clobbers: xmm0, xmm1, xmm2, xmm3.
//
// Channels are widened to words and shifted left by 7 before each pmulhw so
// that the high-word product keeps enough fractional bits.
#define  _private_ThreeOrderTable_Fast_SSE2_2() \
asm movq        xmm0,qword ptr [eax]        \
asm movq        xmm1,qword ptr [eax+8]      \
asm movq        xmm2,qword ptr [eax+edx]    \
asm movq        xmm3,qword ptr [eax+edx+8]  \
asm punpcklbw   xmm0,xmm7     \
asm punpcklbw   xmm1,xmm7     \
asm punpcklbw   xmm2,xmm7     \
asm punpcklbw   xmm3,xmm7     \
asm psllw       xmm0,7      \
asm psllw       xmm1,7      \
asm psllw       xmm2,7      \
asm psllw       xmm3,7      \
asm pmulhw      xmm0,xmmword ptr [ecx]  \
asm pmulhw      xmm1,xmmword ptr [ecx+16] \
asm pmulhw      xmm2,xmmword ptr [ecx]  \
asm pmulhw      xmm3,xmmword ptr [ecx+16] \
asm paddsw      xmm0,xmm1     \
asm paddsw      xmm2,xmm3     \
asm pmulhw      xmm0,xmmword ptr [ebx]  \
asm pmulhw      xmm2,xmmword ptr [ebx+16] \
asm paddsw      xmm0,xmm2
// Computes one output pixel from a 4x4 window of source pixels.
//   pixel       top-left pixel of the window
//   byte_width  distance in bytes between consecutive window rows
//   v4          vertical weights: 8 entries, each row's weight stored twice
//   u4          horizontal weights: 4 entries, one per column
// Expects xmm7 to be zero on entry (the callers clear it beforehand).
// The result is left in eax by the final movd; there is no C++ return
// statement, so this relies on the compiler's inline-asm return convention.
must_inline UInt32 ThreeOrderTable_Fast_SSE2(const Color32* pixel,long byte_width,const TMMXData64* v4,const TMMXData64* u4){
asm mov     eax,pixel
asm mov     edx,byte_width
asm mov     ebx,v4
asm mov     ecx,u4
// rows 0 and 1 of the window
//asm call _private_ThreeOrderTable_Fast_SSE2_2
_private_ThreeOrderTable_Fast_SSE2_2();
asm movdqa  xmm6,xmm0
// step two rows down and on to the next pair of vertical weights
asm lea     eax,[eax+edx*2]  //skip two rows (2*byte_width)
asm lea     ebx,[ebx+32]
// rows 2 and 3 of the window
//asm call _private_ThreeOrderTable_Fast_SSE2_2
_private_ThreeOrderTable_Fast_SSE2_2();
asm paddsw      xmm6,xmm0
// add the high 64-bit half onto the low half to finish the horizontal sum
asm movdqa      xmm5,xmm6
asm psrldq      xmm6,8   //byte shift: 8 bytes = 64 bits
asm paddsw      xmm5,xmm6
// drop the 3 fractional bits left over from the fixed-point multiplies,
// then saturate to bytes and move the packed pixel to eax
asm psraw       xmm5,3
asm packuswb    xmm5,xmm7
asm movd        eax,xmm5
}
// Clamps a coordinate to the valid range: values at or below 0 become 0,
// values at or above maxx become maxx, anything in between is returned as is.
must_inline long getSizeBorder(long x,long maxx){
    if (x>0)
        return (x<maxx) ? x : maxx;
    return 0;
}
// Border-safe variant of ThreeOrderTable_Fast_SSE2: copies the 4x4 window
// starting at (x0_sub1, y0_sub1) into a local buffer, clamping every
// coordinate into the picture, and then runs the fast kernel on that buffer.
//   pic               source picture
//   x0_sub1, y0_sub1  top-left corner of the window (may lie outside pic)
//   v4, u4            vertical / horizontal weights, passed through unchanged
must_inline UInt32 ThreeOrderTable_Border_SSE2(const TPixels32Ref& pic,const long x0_sub1,const long y0_sub1,const TMMXData64* v4,const TMMXData64* u4){
    Color32 window[16];
    const long lastRow=pic.height-1;
    const long lastCol=pic.width-1;
    for (long row=0;row<4;++row){
        const long srcY=getSizeBorder(y0_sub1+row,lastRow);
        Color32* srcLine=pic.getLinePixels(srcY);
        Color32* dstRow=&window[row*4];
        for (long col=0;col<4;++col)
            dstRow[col]=srcLine[getSizeBorder(x0_sub1+col,lastCol)];
    }
    // the local window is tightly packed: 4 pixels per row
    return ThreeOrderTable_Fast_SSE2(window,4*sizeof(Color32),v4,u4);
}
// Scales Src into Dst with the table-driven three-order (bicubic) kernel.
// Destination pixels whose 4x4 source window lies entirely inside Src are
// computed by ThreeOrderTable_Fast_SSE2; all others go through
// ThreeOrderTable_Border_SSE2, which clamps the window coordinates.
//   Dst  destination picture (written).
//   Src  source picture (read).
// Returns immediately, without writing anything, when either picture has a
// zero dimension.
void PicZoom_ThreeOrderTable_SSE2(const TPixels32Ref& Dst,const TPixels32Ref& Src)
{
if (  (0==Dst.width)||(0==Dst.height)
||(0==Src.width)||(0==Src.height)) return;
long dst_width=Dst.width;
long dst_height=Dst.height;
// 16.16 fixed-point source step per destination pixel
long xrIntFloat_16=((Src.width)<<16)/dst_width+1;
long yrIntFloat_16=((Src.height)<<16)/dst_height+1;
// starting source coordinate: half a step minus 0.5 (in 16.16 fixed point)
const long csDErrorX=-(1<<15)+(xrIntFloat_16>>1);
const long csDErrorY=-(1<<15)+(yrIntFloat_16>>1);
//compute the borders that need special (clamped) handling
long border_y0=((1<<16)-csDErrorY)/yrIntFloat_16+1;//y0+y*yr>=1; y0=csDErrorY => y>=(1-csDErrorY)/yr
if (border_y0>=dst_height) border_y0=dst_height;
long border_x0=((1<<16)-csDErrorX)/xrIntFloat_16+1;
if (border_x0>=dst_width ) border_x0=dst_width;
long border_y1=(((Src.height-3)<<16)-csDErrorY)/yrIntFloat_16+1; //y0+y*yr<=(height-3) => y<=(height-3-csDErrorY)/yr
if (border_y1<border_y0) border_y1=border_y0;
long border_x1=(((Src.width-3)<<16)-csDErrorX)/xrIntFloat_16+1;;
if (border_x1<border_x0) border_x1=border_x0;
// One allocation holds both tables: uList (four 64-bit weights per column,
// with 15 spare bytes for alignment) followed by xList (one Int32 per column).
UInt8* _bufMem=new UInt8[(dst_width*4*sizeof(TMMXData64)+15)+dst_width*sizeof(Int32)];
TMMXData64* uList=(TMMXData64*)((((ptrdiff_t)_bufMem)+15)>>4<<4); //round up to a 16-byte boundary
Int32* xList=(Int32*)(uList+dst_width*4);
{//init u table
long srcx_16=csDErrorX;
for (long x=0;x<dst_width*4;x+=4){
// leftmost column of the 4-pixel window (integer source x minus 1)
xList[x>>2]=(srcx_16>>16)-1;
// 8-bit horizontal fraction; the four kernel weights are looked up at
// distances 1+u, u, 1-u and 2-u (in 1/256 units)
long u=(srcx_16>>8)&0xFF;
uList[x+0]=SinXDivX_Table64_MMX[256+u];
uList[x+1]=SinXDivX_Table64_MMX[u];
uList[x+2]=SinXDivX_Table64_MMX[256-u];
uList[x+3]=SinXDivX_Table64_MMX[512-u];
srcx_16+=xrIntFloat_16;
}
}
// stack buffer for the vertical weights, with two spare entries so that v4
// can be rounded up to a 16-byte boundary
TMMXData64 _v4[8+2];
TMMXData64* v4=(&_v4[0]); v4=(TMMXData64*)( (((ptrdiff_t)v4)+15)>>4<<4);
// the kernels expect xmm7 to be zero
asm pxor    xmm7,xmm7
Color32* pDstLine=Dst.pdata;
long srcy_16=csDErrorY;
for (long y=0;y<dst_height;++y){
//v table
const long srcy_sub1=(srcy_16>>16)-1;
const long v=(srcy_16>>8)&0xFF;
// each row weight is stored twice so that one 16-byte load covers all 8 lanes
v4[0]=SinXDivX_Table64_MMX[256+v];
v4[1]=v4[0];
v4[2]=SinXDivX_Table64_MMX[v];
v4[3]=v4[2];
v4[4]=SinXDivX_Table64_MMX[256-v];
v4[5]=v4[4];
v4[6]=SinXDivX_Table64_MMX[512-v];
v4[7]=v4[6];
if ((y<border_y0)||(y>=border_y1)){
// whole row is in the top/bottom border: clamp every pixel
for (long x=0;x<dst_width;++x)
pDstLine[x].argb=ThreeOrderTable_Border_SSE2(Src,xList[x],srcy_sub1,v4,&uList[x*4]); //border
}else{
// left border, fast interior, right border
for (long x=0;x<border_x0;++x)
pDstLine[x].argb=ThreeOrderTable_Border_SSE2(Src,xList[x],srcy_sub1,v4,&uList[x*4]);//border
const Color32* pixelLine=Src.getLinePixels(srcy_sub1);
long byte_width=Src.byte_width;
for (long x=border_x0;x<border_x1;++x)
pDstLine[x].argb=ThreeOrderTable_Fast_SSE2(&pixelLine[xList[x]],byte_width,v4,&uList[x*4]);//fast SSE2 path, no clamping
for (long x=border_x1;x<dst_width;++x)
pDstLine[x].argb=ThreeOrderTable_Border_SSE2(Src,xList[x],srcy_sub1,v4,&uList[x*4]);//border
}
// advance to the next destination row (byte_width is the row pitch in bytes)
srcy_16+=yrIntFloat_16;
((UInt8*&)pDstLine)+=Dst.byte_width;
}
delete []_bufMem;
}

内容来自用户分享和网络整理，不保证内容的准确性，如有侵权内容，可联系管理员处理

标签：

相关文章推荐

新的分享

章节导航

图形图像处理－之－高质量的快速的图像缩放 补充 使用SSE2优化

图形图像处理－之－高质量的快速的图像缩放补充使用SSE2优化