
An Attempt at Optimizing memcpy with MMX/SSE

2010-08-18 09:28
Recently I wanted to see whether the performance of the standard memcpy() could be improved, so I experimented with MMX/SSE in the hope of building a faster memcpy of my own.

The code follows (the USE1 variant is borrowed from elsewhere, but its performance is nothing special either):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <sys/time.h>
#define LEN		(100*1024*1024)
#define USE1

class TimeUse{
public:
    TimeUse(const char * cMsg)
    {
        memset(m_cMsg, 0, sizeof(m_cMsg));
        strncpy(m_cMsg, cMsg, sizeof(m_cMsg)-1); /* bound by the buffer, not the input */
        gettimeofday(&tTime1, NULL);
    }
    ~TimeUse()
    {
        gettimeofday(&tTime2, NULL);
        unsigned long ulDiff = (tTime2.tv_sec-tTime1.tv_sec)*1000 + (tTime2.tv_usec-tTime1.tv_usec)/1000;
        printf("%s Use %lu ms\n", m_cMsg, ulDiff);
    }
private:
    struct timeval tTime1, tTime2;
    char m_cMsg[255];
};
#ifdef USE0
/* Copies 100 MB in about 85 ms. 32-bit x86 only: the asm advances the
 * pointers with addl. */
static inline void * memcopy(void *dest, const void *src, int size)
{
    char *to = (char *)dest;
    char *from = (char *)src;
    int len = size;
    int iCount = len/64;

#if 0
    __asm__ __volatile__ (
    "prefetchnta 128(%0)\n"
    : : "r" (from) );
#endif

    /* The x87 FPU state must be saved before the MMX registers are used,
     * and restored (plus emms) before returning to FPU code. */
    if(iCount > 0)
    {
        char *f = from;
        char *t = to;
        int c = iCount;
        __asm__ __volatile__(
        ".lcomm buffer, 108\n"  /* 108-byte fsave area in .bss; this asm must only be instantiated once per file */
        "fsave buffer\n"
        "1:\n"                  /* numbered local label; unlike 'loop:' it stays valid if the function is inlined twice */
        "movq (%0), %%mm0\n"
        "movq 8(%0), %%mm1\n"
        "movq 16(%0), %%mm2\n"
        "movq 24(%0), %%mm3\n"
        "movq 32(%0), %%mm4\n"
        "movq 40(%0), %%mm5\n"
        "movq 48(%0), %%mm6\n"
        "movq 56(%0), %%mm7\n"
        "movntq %%mm0, (%1)\n"  /* non-temporal stores: bypass the cache */
        "movntq %%mm1, 8(%1)\n"
        "movntq %%mm2, 16(%1)\n"
        "movntq %%mm3, 24(%1)\n"
        "movntq %%mm4, 32(%1)\n"
        "movntq %%mm5, 40(%1)\n"
        "movntq %%mm6, 48(%1)\n"
        "movntq %%mm7, 56(%1)\n"
        "addl $64,%0\n"
        "addl $64,%1\n"
        "dec %2\n"
        "jnz 1b\n"
        "sfence\n"              /* order the movntq stores */
        "frstor buffer\n"
        "emms\n"
        : "+a" (f), "+b" (t), "+c" (c)  /* in-out: the asm advances all three */
        : : "memory");
    }
    if (len%64)
    {
        memcpy(to+(len/64)*64, from+(len/64)*64, len%64);
    }
    return dest;
}
#endif
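/* Aside (my addition, not part of the original experiment): the same
 * non-temporal MMX copy can be written with GCC intrinsics, which lets the
 * compiler manage the MMX registers and avoids the constraint pitfalls of
 * hand-written asm. A minimal sketch, assuming 8-byte-aligned buffers and
 * an SSE-capable CPU (compile with -msse); USE0_INTRIN is a hypothetical
 * switch, not wired into main() below. */
#ifdef USE0_INTRIN
#include <mmintrin.h>   /* __m64, _mm_empty */
#include <xmmintrin.h>  /* _mm_stream_pi, _mm_sfence */
static inline void * memcopy(void *dest, const void *src, int size)
{
    const __m64 *s = (const __m64 *)src;
    __m64 *d = (__m64 *)dest;
    int i, blocks = size / 8;
    for (i = 0; i < blocks; i++)
        _mm_stream_pi(d + i, s[i]);  /* movntq: non-temporal 8-byte store */
    _mm_sfence();                    /* order the streaming stores */
    _mm_empty();                     /* emms: hand the registers back to x87 */
    if (size % 8)                    /* scalar tail */
        memcpy((char *)dest + blocks*8, (const char *)src + blocks*8, size % 8);
    return dest;
}
#endif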
#ifdef USE1
/* http://people.redhat.com/mingo/mmx-patches/mmx-2.3.99-A0
 * http://mail-index.netbsd.org/tech-perform/2002/10/23/0004.html */
/* Copies 100 MB in about 85 ms. Assumes size >= 16. */
static inline void * memcopy(void *dest, const void *src, int size)
{
    char *to = (char *)dest;
    char *from = (char *)src;
    int n = size;
    size_t sz;
#define STEP 0x20
#define ALIGN 0x10
    /* Align the destination: copy 16 unaligned bytes, then advance by just
     * enough that 'to' lands on a 16-byte boundary (the overlap is simply
     * copied again by the aligned body). */
    if ((unsigned long)to & (ALIGN-1)) {
        sz = ALIGN - ((unsigned long)to & (ALIGN-1));
        __asm__ __volatile__("movups (%0),%%xmm0\n\t"
                             "movups %%xmm0,(%1)\n\t"
                             :
                             : "r" (from), "r" (to)
                             : "xmm0", "memory");
        n -= sz;
        from += sz;
        to += sz;
    }
    /*
     * If the copy would have tailings, take care of them now instead of
     * later: copy the last 16 bytes (overlapping the aligned body) and
     * round n down to a multiple of 16.
     */
    if (n & (ALIGN-1)) {
        sz = n - ALIGN;
        __asm__ __volatile__("movups (%0),%%xmm0\n\t"
                             "movups %%xmm0,(%1)\n\t"
                             :
                             : "r" (from + sz), "r" (to + sz)
                             : "xmm0", "memory");
        n &= ~(ALIGN-1);
    }
    /*
     * Prefetch the first two cachelines now.
     */
    __asm__ __volatile__("prefetchnta 0x00(%0)\n\t"
                         "prefetchnta 0x20(%0)\n\t"
                         :
                         : "r" (from));

    while (n >= STEP) {
        __asm__ __volatile__(
        "movups 0x00(%0),%%xmm0\n\t"
        "movups 0x10(%0),%%xmm1\n\t"
        "movntps %%xmm0,0x00(%1)\n\t"
        "movntps %%xmm1,0x10(%1)\n\t"
        :
        : "r" (from), "r" (to)
        : "xmm0", "xmm1", "memory");
        from += STEP;
        /*
         * Note: Intermixing the prefetch at *exactly* this point
         * in time has been shown to be the fastest possible.
         * Timing these prefetch instructions is a complete black
         * art with nothing but trial and error showing the way.
         * To that extent, this optimum version was found by using
         * a userland version of this routine that we clocked for
         * lots of runs.  We then fiddled with ordering until we
         * settled on our highest-speed routines.  So, the long
         * and short of this is, don't mess with instruction ordering
         * here or suffer performance penalties you will.
         */
        __asm__ __volatile__(
        "prefetchnta 0x20(%0)\n\t"
        :
        : "r" (from));
        to += STEP;
        n -= STEP;
    }
    /* STEP is 32 but n is only a multiple of 16, so one 16-byte chunk may
     * remain after the loop. */
    if (n) {
        __asm__ __volatile__("movups (%0),%%xmm0\n\t"
                             "movntps %%xmm0,(%1)\n\t"
                             :
                             : "r" (from), "r" (to)
                             : "xmm0", "memory");
    }
    __asm__ __volatile__("sfence" : : : "memory"); /* order the movntps stores */
    return dest;
}
#endif
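/* Aside (my addition): the USE1 idea expressed with SSE intrinsics instead
 * of inline asm - unaligned loads, non-temporal stores, explicit sfence.
 * A sketch only, assuming size >= 16 and a 16-byte-aligned destination
 * (true for the large new[] buffers in this test); USE1_INTRIN is a
 * hypothetical switch, not wired into main() below. */
#ifdef USE1_INTRIN
#include <xmmintrin.h>
static inline void * memcopy(void *dest, const void *src, int size)
{
    char *to = (char *)dest;
    const char *from = (const char *)src;
    int n = size;
    while (n >= 16) {
        __m128 x = _mm_loadu_ps((const float *)from); /* movups */
        _mm_stream_ps((float *)to, x);                /* movntps: 'to' must be 16-byte aligned */
        from += 16; to += 16; n -= 16;
    }
    _mm_sfence();            /* make the streaming stores visible in order */
    if (n)
        memcpy(to, from, n); /* scalar tail */
    return dest;
}
#endif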
#ifdef USE2
/* Copies 100 MB in about 85 ms. */
static inline void * memcopy(void *dest, const void *src, int size)
{
    int i, len = size;
    char *to = (char *)dest;
    char *from = (char *)src;

    /* Warm up the first four cachelines before the loop. */
    __asm__ __volatile__ (
    "prefetchnta (%0)\n"
    "prefetchnta 64(%0)\n"
    "prefetchnta 128(%0)\n"
    "prefetchnta 192(%0)\n"
    : : "r" (from) );
    for(i=0; i<len/64; i++) {
        __asm__ __volatile__ (
        "prefetchnta 168(%0)\n"  /* prefetch distance chosen by trial and error */
        "movq (%0), %%mm0\n"
        "movntq %%mm0, (%1)\n"
        "movq 8(%0), %%mm1\n"
        "movntq %%mm1, 8(%1)\n"
        "movq 16(%0), %%mm2\n"
        "movntq %%mm2, 16(%1)\n"
        "movq 24(%0), %%mm3\n"
        "movntq %%mm3, 24(%1)\n"
        "movq 32(%0), %%mm4\n"
        "movntq %%mm4, 32(%1)\n"
        "movq 40(%0), %%mm5\n"
        "movntq %%mm5, 40(%1)\n"
        "movq 48(%0), %%mm6\n"
        "movntq %%mm6, 48(%1)\n"
        "movq 56(%0), %%mm7\n"
        "movntq %%mm7, 56(%1)\n"
        : : "r" (from), "r" (to) : "memory");
        from += 64;
        to += 64;
    }
    __asm__ __volatile__("sfence\n\t" "emms" : : : "memory"); /* order the streaming stores, leave MMX mode */
    if (len & 63)
        memcpy(to, from, len & 63);
    return dest;
}
#endif
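/* Aside (my addition): head/tail handling in copies like these is easy to
 * get subtly wrong, so it is worth verifying each variant against the
 * source once per run. A trivial helper, using the <string.h> already
 * included above: */
static int verify_copy(const void *dst, const void *src, int size)
{
    return memcmp(dst, src, size) == 0; /* 1 if the copy is byte-exact */
}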
/* Usage: ./test 0 (plain memcpy); ./test 1 (MMX/SSE memcopy) */
int main(int argc, char ** argv)
{
    char *	pcSrc 		= NULL;
    char *	pcDst 		= NULL;
    char *	pcSrc1 		= NULL;
    char *	pcDst1 		= NULL;
    char *	pcSrc2 		= NULL;
    char *	pcDst2 		= NULL;
    int 	iChoice 	= 0;
    float *	pfData 		= NULL;
    if(argc > 2)
    {
        printf("Usage: './test 0' to use memcpy; or './test 1' to use memcopy\n");
        return 1;
    }
    if(argc == 2)
    {
        sscanf(argv[1], "%d", &iChoice);
        printf("your choice is %d\n", iChoice);
    }
    pcSrc = new char[LEN];
    pcDst = new char[LEN];
    pcSrc1 = new char[LEN];
    pcDst1 = new char[LEN];

    pcSrc2 = new char[LEN];
    pcDst2 = new char[LEN];
    pfData = new float[LEN];

    int iLoop = 0;
    while((iLoop++) <= 255)
    {
        //TimeUse t("Loop");
        {
            /* Fill the destination with plain int stores, timed for reference. */
            {
                int * piTemp = (int *)(pcDst);
                TimeUse t("==");

                for(int iTemp=0; iTemp<LEN/4; iTemp++)
                {
                    *piTemp++ = 123;
                }
            }

            {
                TimeUse t("memset");
                memset(pcSrc, iLoop, LEN);
            }
            memset(pcSrc1, iLoop, LEN);
            memset(pcSrc2, iLoop, LEN);
            for(int iLoop2=0; iLoop2<LEN; iLoop2++)
            {
                pfData[iLoop2] = 1.0123456789 + iLoop;
            }
        }
        if(iChoice == 0)
        {
            TimeUse t("memcpy");
            memcpy(pcDst, pcSrc, LEN);
        }
        else
        {
            {
                TimeUse t("memcopy0");
                memcopy(pcDst, pcSrc, LEN);
            }
            {
                TimeUse t("memcopy1");
                memcopy(pcDst1, pcSrc1, LEN);
            }
            {
                TimeUse t("memcopy2");
                memcopy(pcDst2, pcSrc2, LEN);
            }
        }
        usleep(20000);
    }
    return 0;
}
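To build and run this, something like the following should work (my assumptions, not stated in the original: GCC on 32-bit x86 Linux, since the USE0 asm uses addl on pointer registers, and a source file named test.cpp):

g++ -O2 -msse test.cpp -o test
./test       # time plain memcpy
./test 1     # time the MMX/SSE memcopy variants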


Test results:
1. Unoptimized memcpy of 100 MB:
[root@localhost opt]# ./test
memcpy Use 94 ms
memcpy Use 61 ms
memcpy Use 61 ms
memcpy Use 61 ms
memcpy Use 62 ms
memcpy Use 61 ms
memcpy Use 61 ms
memcpy Use 61 ms

2. MMX/SSE-optimized copy of 100 MB:
[root@localhost opt]# ./test 1
your choice is 1
memcopy0 Use 110 ms
memcopy1 Use 110 ms
memcopy2 Use 110 ms

memcopy0 Use 40 ms
memcopy1 Use 42 ms
memcopy2 Use 40 ms
memcopy0 Use 48 ms
memcopy1 Use 40 ms
memcopy2 Use 41 ms
memcopy0 Use 40 ms
memcopy1 Use 40 ms

Preliminary conclusions:
The headroom for optimizing memcpy with MMX/SSE turns out to be modest, and early in a run the "optimized" copies are actually slower than plain memcpy (110 ms vs. 94 ms on the first pass, presumably while the pages are still being faulted in); once warmed up they settle around 40 ms against memcpy's 61 ms.
In principle SSE should beat MMX, and MMX should beat a plain memcpy: the wider registers move more data per instruction, and the non-temporal stores avoid polluting the cache, though they only pay off when the destination is not read again soon. My limited grasp of AT&T assembly may be why I have not yet produced a convincing result.
If anyone has better ideas, I would be glad to hear from you.