您的位置:首页 > 数据库 > Memcache

memcached空指针内存错误与死循环问题分析(memcached dead loop and crash bug! issue #260 and issue #370)

2014-06-27 15:01 567 查看
(由于这是发在memcached邮件列表的,所以只能用一下蹩脚的英文了)

(youshouldreadthediscussaboutissue#260first:https://github.com/memcached/memcached/pull/67)
Thanksfordormandotopullmebacktothedo_item_allocfunction,andialsoreadagaincarefullyabouthowtoreproduceandfixtheissue#260.Itisawfultofindtheextremelypathtoreproducethebut.ButimafraidthereMAYhaveotherproblems,evenatvervion.20,morehigherproblibility.  

reproducestepbystepinv1.4.20:

1)hashexpanding,switchtheitem_locktoglobal_lock;
2)InthreadA,do_item_alloctrytogetanewitem,andtrytogetaITEMfromtheLRUtailer.butitdoesnotlocktheITEM,becauseittrytoholdtheitem_locks[];
3)afterdo_item_allocchecktherefcount(afterlineitems.c:139),otherthreadBtrytoholdtheITEM,andincreasetherefcount,andbelieveitselfholdarefcount;
4)InthreadA,do_item_allocevictedtheITEM,andinitilizetheITEM(resettherefcountto1),asanewonereturningtothecaller;
5)InthreadB,itcallitem_removetodereferencetherefcount,andmaketherefcountto0,sotheITEMisfree!(thethreadAisholdingtheITEM,andtheITMEisinthehashtableyet);
6)So,anyterriblethingmayhappen,includingcrashanddeadloop.

Inversionlowerthan1.4.19,iteasiertooccurrevenifthereisnohashexpanding.Becausethecodeindodo_item_allocasfollowing:

1)ifthecur_hv==cur_hv,(itispossibleatreplaceoperation);
or
2)ifthefirstloophv!=cur_hv,andholdalock;thenthesecondloophv==cur_hv(withoutalock,butthelock'spointerisnotresetinthefirstloop!),thenreleasethelockwrong.

for(;tries>0&&search!=NULL;tries--,search=search->prev){
uint32_thv=hash(ITEM_key(search),search->nkey,0);
/*Attempttohashitemlockthe"search"item.Iflocked,no
*othercallerscanincrtherefcount
*/
/*FIXME:Ithinkweneedtomaskthehvhereforcomparison?*/
if(hv!=cur_hv&&(hold_lock=item_trylock(hv))==NULL)------>
continue;
/*Nowseeiftheitemisrefcountlocked*/
if(refcount_incr(&search->refcount)!=2){
refcount_decr(&search->refcount);
/*Oldrarebugcouldcausearefcountleak.Wehaven'tseen
*itinyears,butweleavethiscodeintopreventfailures
*justincase*/
if(search->time+TAIL_REPAIR_TIME<current_time){
itemstats[id].tailrepairs++;
search->refcount=1;
do_item_unlink_nolock(search,hv);
}
if(hold_lock)
item_trylock_unlock(hold_lock);
continue;
}


Ithinkit'sclearenough,butialsotrytoshowmoredetailasfollowing,andtrytoreproducethephenomenoningdb:

Idon'tknowwhythefunction"assoc_maintenance_thread"writedasfollowing,butotherthreadswillgettheglobal_lockbefore"assoc_maintenance_thread"getitagain.

staticvoid*assoc_maintenance_thread(void*arg){

while(do_run_maintenance_thread){
intii=0;

/*Lockthecache,andbulkmovemultiplebucketstothenew
*hashtable.*/
item_lock_global();---------------->step5)trytogettheglobal_lock,ithavetowaitforotherthreadsto-------------releasethelock.
mutex_lock(&cache_lock);

for(ii=0;ii<hash_bulk_move&&expanding;++ii){
.....
}

mutex_unlock(&cache_lock);
item_unlock_global();

if(!expanding){
/*finishedexpanding.tellallthreadstousefine-grainedlocks*/
switch_item_lock_type(ITEM_LOCK_GRANULAR);
slabs_rebalancer_resume();
/*Wearedoneexpanding..justwaitfornextinvocation*/
mutex_lock(&cache_lock);
started_expanding=false;
pthread_cond_wait(&maintenance_cond,&cache_lock);------------>step1)waithereforexpandingnotify.
/*Beforedoinganything,tellthreadstouseagloballock*/
mutex_unlock(&cache_lock);
slabs_rebalancer_pause();
switch_item_lock_type(ITEM_LOCK_GLOBAL);----------->step2)switchtotheglobal_lock,withoutholdingtheglobal_lock.
----otherthreadwillgettheglobal_lockfirst.itisnotthread-safe.
mutex_lock(&cache_lock);
assoc_expand();---->step3)expandthehashsize,buttheitemsarenotmovedtothenewbuckets.
mutex_unlock(&cache_lock);--->step4)releasethelock,MAYbenotthread-safe.
}
}
returnNULL;
}


andinthefunction"do_item_alloc",usingthefunction"item_trylock"togettheitem'slock.pleaselookcarefullyat"item_trylock",itdirectlyaccess"item_locks",hopinga"no-op"asitsaidinthecomment.theitem(search)isnotlockatall.

So,afterthechecker"if(refcount_incr(&search->refcount)!=2)",otherthreadsmayholdtheitemandincreasetherefcounte,thatwillmake

item*do_item_alloc(char*key,constsize_tnkey,constintflags,
constrel_time_texptime,constintnbytes,
constuint32_tcur_hv){
uint8_tnsuffix;
item*it=NULL;
charsuffix[40];
size_tntotal=item_make_header(nkey+1,flags,nbytes,suffix,&nsuffix);
if(settings.use_cas){
ntotal+=sizeof(uint64_t);
}

unsignedintid=slabs_clsid(ntotal);
if(id==0)
return0;

mutex_lock(&cache_lock);
/*doaquickcheckifwehaveanyexpireditemsinthetail..*/
inttries=5;
inttried_alloc=0;
item*search;
void*hold_lock=NULL;
rel_time_toldest_live=settings.oldest_live;

search=tails[id];
/*Wewalkup*only*forlockeditems.Neversearchingforexpired.
*WasteofCPUforalmostalldeployments*/
for(;tries>0&&search!=NULL;tries--,search=search->prev){
if(search->nbytes==0&&search->nkey==0&&search->it_flags==1){
/*Weareacrawler,ignoreit.*/
tries++;
continue;
}
uint32_thv=hash(ITEM_key(search),search->nkey);
/*Attempttohashitemlockthe"search"item.Iflocked,no
*othercallerscanincrtherefcount
*/
/*Don'taccidentallygrabourselves,orbailifwecan'tquicklock*/
if(hv==cur_hv||(hold_lock=item_trylock(hv))==NULL)--------------->1)item_trylockalwaysgetlockfromitem_locks[],notglobal_lock
continue;
/*Nowseeiftheitemisrefcountlocked*/
if(refcount_incr(&search->refcount)!=2){--------------->2)nowtheserch->refcount==2,meansonlythelru-linkreferencetheitem.
refcount_decr(&search->refcount);
/*Oldrarebugcouldcausearefcountleak.Wehaven'tseen
*itinyears,butweleavethiscodeintopreventfailures
*justincase*/
if(settings.tail_repair_time&&
search->time+settings.tail_repair_time<current_time){
itemstats[id].tailrepairs++;
search->refcount=1;
do_item_unlink_nolock(search,hv);
}
if(hold_lock)
item_trylock_unlock(hold_lock);
continue;
}

/*Expiredorflushed*/
if((search->exptime!=0&&search->exptime<current_time)------------------>3)afterthisline,otherthreadmaygotholdtheitemandincreasetherefcount.
------------------soiftheitemisallocasanewitem(refcountresetto1),thentheotherthread(
-------------------holdtheitem)calldo_item_remove,itwillfreethe"new"item(becausetherefcount-------------------is0).thisisveryhighlypossibleattheitemevictedcase.

||(search->time<=oldest_live&&oldest_live<=current_time)){
itemstats[id].reclaimed++;
if((search->it_flags&ITEM_FETCHED)==0){
itemstats[id].expired_unfetched++;
}
it=search;
slabs_adjust_mem_requested(it->slabs_clsid,ITEM_ntotal(it),ntotal);
do_item_unlink_nolock(it,hv);
/*Initializetheitemblock:*/
it->slabs_clsid=0;
}elseif((it=slabs_alloc(ntotal,id))==NULL){
tried_alloc=1;
if(settings.evict_to_free==0){
itemstats[id].outofmemory++;
}else{
itemstats[id].evicted++;
itemstats[id].evicted_time=current_time-search->time;
if(search->exptime!=0)
itemstats[id].evicted_nonzero++;
if((search->it_flags&ITEM_FETCHED)==0){
itemstats[id].evicted_unfetched++;
}


/*Specialcase.WhenITEM_LOCK_GLOBALmodeisenabled,thisshouldbecomea
*no-op,asit'sonlycalledfromwithintheitemlockifnecessary.
*However,wecan'tmixano-opandthreadswhicharestillsynchronizingto
*GLOBAL.Soinsteadwejustalwaystrytolock.WheninGLOBALmodethis
*turnsintoaneffectiveno-op.Threadsre-synchronizeafterthepowerlevel
*switchsoitshouldstaysafe.
*/
void*item_trylock(uint32_thv){
pthread_mutex_t*lock=&item_locks[hv&hashmask(item_lock_hashpower)];
if(pthread_mutex_trylock(lock)==0){
returnlock;
}
returnNULL;
}


gdbreproduce(partial):

bassoc.c:211
bitem_get
bitems.c:158

(gdb)infothread
IdTargetIdFrame
6Thread0x7ffff4d71700(LWP12647)"memcached"(running)
5Thread0x7ffff5572700(LWP12646)"memcached"(running)
4Thread0x7ffff5d73700(LWP12645)"memcached"(running)
*3Thread0x7ffff6574700(LWP12644)"memcached"(running)
2Thread0x7ffff6d75700(LWP12643)"memcached"(running)
1Thread0x7ffff7fe0740(LWP12642)"memcached"0x00007ffff76ad9a3inepoll_wait()at../sysdeps/unix/syscall-template.S:81

////step1////addanewitem,modifythevalueofhash_itemsto100000,tomakingahashexpanding.

(gdb)thread3
[Switchingtothread3(Thread0x7ffff6574700(LWP12644))]
#0do_item_alloc(key=0x7ffff0026414"e",nkey=1,flags=0,exptime=0,nbytes=3,cur_hv=0)atitems.c:159
159tried_alloc=1;
(gdb)c
Continuing.

Breakpoint4,assoc_insert(it=0x7ffff7f35f70,hv=1187334374)atassoc.c:170
170if(!expanding&&hash_items>(hashsize(hashpower)*3)/2){
(gdb)508it=do_item_get(key,nkey,hv);
phash_items
$4=1000000
Breakpoint1,assoc_maintenance_thread(arg=0x0)atassoc.c:211
211item_lock_global();
(gdb)infothread
IdTargetIdFrame
6Thread0x7ffff4d71700(LWP12647)"memcached"assoc_maintenance_thread(arg=0x0)atassoc.c:211
5Thread0x7ffff5572700(LWP12646)"memcached"(running)
4Thread0x7ffff5d73700(LWP12645)"memcached"(running)
*3Thread0x7ffff6574700(LWP12644)"memcached"(running)
2Thread0x7ffff6d75700(LWP12643)"memcached"(running)
1Thread0x7ffff7fe0740(LWP12642)"memcached"(running)

/////deleteallkeyinthecache.
/////step2///addaitem,key="71912_yhd.serial.product.get_1.0_0",totheslot2,asthefirstitemnow.

////step3///getthekey="71912_yhd.serial.product.get_1.0_0",

///step4//andtrytoaddanewkey="Y_ORDER_225426358_02"totheslot2.

Breakpoint5,do_item_alloc(key=0x7ffff0026414"71912_yhd.serial.product.get_1.0_0",nkey=34,flags=0,exptime=0,nbytes=3,cur_hv=0)atitems.c:92
92constuint32_tcur_hv){
(gdb)n

....

Breakpoint3,item_get(key=0x7fffe0026414"71912_yhd.serial.product.get_1.0_0",nkey=34)atthread.c:506
506hv=hash(key,nkey);
(gdb)infothread
IdTargetIdFrame
6Thread0x7ffff4d71700(LWP12647)"memcached"assoc_maintenance_thread(arg=0x0)atassoc.c:211
5Thread0x7ffff5572700(LWP12646)"memcached"(running)
4Thread0x7ffff5d73700(LWP12645)"memcached"(running)
3Thread0x7ffff6574700(LWP12644)"memcached"do_item_alloc(key=0x7ffff0026414"Y_ORDER_225426358_02",nkey=20,flags=0,exptime=0,nbytes=22,cur_hv=0)atitems.c:147
*2Thread0x7ffff6d75700(LWP12643)"memcached"item_get(key=0x7fffe0026414"71912_yhd.serial.product.get_1.0_0",nkey=34)atthread.c:506
1Thread0x7ffff7fe0740(LWP12642)"memcached"(running)
(gdb)n
507item_lock(hv);
(gdb)
508it=do_item_get(key,nkey,hv);
(gdb)
509item_unlock(hv);

(gdb)thread3
[Switchingtothread3(Thread0x7ffff6574700(LWP12644))]
#0do_item_alloc(key=0x7ffff0026414"Y_ORDER_225426358_02",nkey=20,flags=0,exptime=0,nbytes=22,cur_hv=0)atitems.c:147
147if((search->exptime!=0&&search->exptime<current_time)
(gdb)psearch->refcount
$21=3------->nowtherefcountiswrong.(maybeintheevictedprocessisbetter).
(gdb)bt
#0do_item_alloc(key=0x7ffff0026414"Y_ORDER_225426358_02",nkey=20,flags=0,exptime=0,nbytes=22,cur_hv=0)atitems.c:147
#10x0000000000417c26initem_alloc(key=0x7ffff0026414"Y_ORDER_225426358_02",nkey=20,flags=0,exptime=0,nbytes=22)atthread.c:495
#20x000000000040ace9inprocess_update_command(c=0x7ffff0026200,tokens=0x7ffff6573be0,ntokens=6,comm=2,handle_cas=false)atmemcached.c:3084
#30x000000000040bde2inprocess_command(c=0x7ffff0026200,command=0x7ffff0026410"set")atmemcached.c:3437
#40x000000000040cc47intry_read_command(c=0x7ffff0026200)atmemcached.c:3763
#50x000000000040d958indrive_machine(c=0x7ffff0026200)atmemcached.c:4108
#60x000000000040e4f7inevent_handler(fd=37,which=2,arg=0x7ffff0026200)atmemcached.c:4353
#70x00007ffff7ba3f24inevent_base_loop()from/usr/lib/x86_64-linux-gnu/libevent-2.0.so.5
#80x0000000000417926inworker_libevent(arg=0x635ea0)atthread.c:386
#90x00007ffff7980182instart_thread(arg=0x7ffff6574700)atpthread_create.c:312
#100x00007ffff76ad30dinclone()at../sysdeps/unix/sysv/linux/x86_64/clone.S:111
(gdb)


Connectedto127.0.0.1.
Escapecharacteris'^]'.
seta001
1
STORED
^[[A^[[B
ERROR
setb001
2
STORED
get71912_yhd.serial.product.get_1.0_0

jason@gy:~$telnet127.0.0.111211
Trying127.0.0.1...
Connectedto127.0.0.1.
Escapecharacteris'^]'.
setc001
3
STORED
set71912_yhd.serial.product.get_1.0_0001
3
STORED
delete71912_yhd.serial.product.get_1.0_0
DELETED
set71912_yhd.serial.product.get_1.0_0001
3
STORED
deleteY_ORDER_225426358_02
deletea
deleteb
deletec
NOT_FOUND
DELETED
NOT_FOUND
NOT_FOUND
setY_ORDER_225426358_020020
aaaaaaaaaaaaaaaaaaaa


Iwilltrytowriteatestscripttoreproduceit,andtrytocommitapatchtofixit.

------------------------------------------------

Ihavegotadeaplooponmemcached(v1.4.15,oncentos6.3x86_64),itcan'tbereproduced.Inourproductenvironment,therearehundredsofmemcachedinstancesrunning,andthisbughappend3timesthisyears.whenthebugoccurred,thouandsoftcpconnectionskeepinCLOSE_WAITstatusandreachedthemaximumconnectionnumber,thenclientscann'tconnecttothecacheservers.TheSAhavetorestartthememcachedinstancetorecoverourbusiness,buttherecenttimeigotthechancetocreateacorefile.

FYI:beforestart,youcangetthethememcachedpackage(withadebug-infopackage)at:http://mirrors.htbindustries.org/CentOS/6/x86_64/,andyoucangetthecorefileat:http://pan.baidu.com/s/1kTFTRQf

backtrackingdb.

Thread8(Thread0x7fa8a1ee1700(LWP26053)):
#00x0000003c9e2e7c73inepoll_wait()from/lib64/libc.so.6
#10x000000323cc12e4bin??()from/usr/lib64/libevent-1.4.so.2
#20x000000323cc068c3inevent_base_loop()from/usr/lib64/libevent-1.4.so.2
#30x0000000000406447inmain(argc=<valueoptimizedout>,
argv=<valueoptimizedout>)atmemcached.c:5228

Thread7(Thread0x7fa89e021700(LWP26061)):
#00x0000003c9e60b43cinpthread_cond_wait@@GLIBC_2.3.2()
from/lib64/libpthread.so.0
#10x000000000040cb33inslab_rebalance_thread(arg=<valueoptimizedout>)
atslabs.c:764
#20x0000003c9e607851instart_thread()from/lib64/libpthread.so.0
#30x0000003c9e2e767dinclone()from/lib64/libc.so.6

Thread6(Thread0x7fa89ea22700(LWP26060)):
#00x0000003c9e2ab91dinnanosleep()from/lib64/libc.so.6
#10x0000003c9e2ab790insleep()from/lib64/libc.so.6
#20x000000000040d0adinslab_maintenance_thread(arg=<valueoptimizedout>)
atslabs.c:728
#30x0000003c9e607851instart_thread()from/lib64/libpthread.so.0
#40x0000003c9e2e767dinclone()from/lib64/libc.so.6
---Type<return>tocontinue,orq<return>toquit---

Thread5(Thread0x7fa89f423700(LWP26059)):
#00x0000003c9e60b43cinpthread_cond_wait@@GLIBC_2.3.2()
from/lib64/libpthread.so.0
#10x000000000040fa8dinassoc_maintenance_thread(arg=<valueoptimizedout>)
atassoc.c:251
#20x0000003c9e607851instart_thread()from/lib64/libpthread.so.0
#30x0000003c9e2e767dinclone()from/lib64/libc.so.6

Thread4(Thread0x7fa89fe24700(LWP26058)):
#00x000000000040fd34inassoc_find(key=<valueoptimizedout>,
nkey=<valueoptimizedout>,hv=<valueoptimizedout>)atassoc.c:92
#10x000000000040ef2eindo_item_get(key=0x7fa88008bd34"B920818_0",
nkey=<valueoptimizedout>,hv=3789230535)atitems.c:523
#20x0000000000411076initem_get(key=0x7fa88008bd34"B920818_0",nkey=9)
atthread.c:499
#30x000000000040731einprocess_get_command(c=0x7fa88008bb30,
tokens=0x7fa89fe23bf0,ntokens=<valueoptimizedout>,return_cas=false)
atmemcached.c:2725
#40x0000000000409b63inprocess_command(c=0x7fa88008bb30,
command=<valueoptimizedout>)atmemcached.c:3249
#50x000000000040a7e2intry_read_command(c=0x7fa88008bb30)
atmemcached.c:3504
---Type<return>tocontinue,orq<return>toquit---
#60x000000000040b478indrive_machine(fd=<valueoptimizedout>,
which=<valueoptimizedout>,arg=0x7fa88008bb30)atmemcached.c:3824
#7event_handler(fd=<valueoptimizedout>,which=<valueoptimizedout>,
arg=0x7fa88008bb30)atmemcached.c:4065
#80x000000323cc06b44inevent_base_loop()from/usr/lib64/libevent-1.4.so.2
#90x000000000041070dinworker_libevent(arg=0x1ad5d88)atthread.c:384
#100x0000003c9e607851instart_thread()from/lib64/libpthread.so.0
#110x0000003c9e2e767dinclone()from/lib64/libc.so.6

Thread3(Thread0x7fa8a0825700(LWP26057)):
#00x0000003c9e6094b3inpthread_mutex_trylock()from/lib64/libpthread.so.0
#10x0000000000410d10inmutex_lock(hv=<valueoptimizedout>)
atmemcached.h:493
#2item_lock(hv=<valueoptimizedout>)atthread.c:127
#30x0000000000411069initem_get(
key=0x7fa8985d8944"1368_yhd.orders.get_1.0_visitDateList_0",nkey=39)
atthread.c:498
#40x000000000040731einprocess_get_command(c=0x7fa89829b7e0,
tokens=0x7fa8a0824bf0,ntokens=<valueoptimizedout>,return_cas=false)
atmemcached.c:2725
#50x0000000000409b63inprocess_command(c=0x7fa89829b7e0,
command=<valueoptimizedout>)atmemcached.c:3249
#60x000000000040a7e2intry_read_command(c=0x7fa89829b7e0)
---Type<return>tocontinue,orq<return>toquit---
atmemcached.c:3504
#70x000000000040b478indrive_machine(fd=<valueoptimizedout>,
which=<valueoptimizedout>,arg=0x7fa89829b7e0)atmemcached.c:3824
#8event_handler(fd=<valueoptimizedout>,which=<valueoptimizedout>,
arg=0x7fa89829b7e0)atmemcached.c:4065
#90x000000323cc06b44inevent_base_loop()from/usr/lib64/libevent-1.4.so.2
#100x000000000041070dinworker_libevent(arg=0x1ad2a00)atthread.c:384
#110x0000003c9e607851instart_thread()from/lib64/libpthread.so.0
#120x0000003c9e2e767dinclone()from/lib64/libc.so.6

Thread2(Thread0x7fa8a1226700(LWP26056)):
#00x0000003c9e6094b3inpthread_mutex_trylock()from/lib64/libpthread.so.0
#10x0000000000410d10inmutex_lock(hv=<valueoptimizedout>)
atmemcached.h:493
#2item_lock(hv=<valueoptimizedout>)atthread.c:127
#30x0000000000410d7dinstore_item(item=0x7fa89d0dddb8,comm=2,
c=0x7fa880118a00)atthread.c:598
#40x000000000040bed4incomplete_nread_ascii(fd=<valueoptimizedout>,
which=<valueoptimizedout>,arg=0x7fa880118a00)atmemcached.c:843
#5complete_nread(fd=<valueoptimizedout>,which=<valueoptimizedout>,
arg=0x7fa880118a00)atmemcached.c:2248
#6drive_machine(fd=<valueoptimizedout>,which=<valueoptimizedout>,
arg=0x7fa880118a00)atmemcached.c:3861
---Type<return>tocontinue,orq<return>toquit---
#7event_handler(fd=<valueoptimizedout>,which=<valueoptimizedout>,
arg=0x7fa880118a00)atmemcached.c:4065
#80x000000323cc06b44inevent_base_loop()from/usr/lib64/libevent-1.4.so.2
#90x000000000041070dinworker_libevent(arg=0x1acf678)atthread.c:384
#100x0000003c9e607851instart_thread()from/lib64/libpthread.so.0
#110x0000003c9e2e767dinclone()from/lib64/libc.so.6

Thread1(Thread0x7fa8a1c27700(LWP26055)):
#00x0000003c9e6094b3inpthread_mutex_trylock()from/lib64/libpthread.so.0
#10x0000000000410d10inmutex_lock(hv=<valueoptimizedout>)
atmemcached.h:493
#2item_lock(hv=<valueoptimizedout>)atthread.c:127
#30x0000000000411069initem_get(
key=0x7fa881755f64"4872_yhd.orders.get_1.0_0",nkey=25)atthread.c:498
#40x000000000040731einprocess_get_command(c=0x7fa88049e180,
tokens=0x7fa8a1c26bf0,ntokens=<valueoptimizedout>,return_cas=false)
atmemcached.c:2725
#50x0000000000409b63inprocess_command(c=0x7fa88049e180,
command=<valueoptimizedout>)atmemcached.c:3249
#60x000000000040a7e2intry_read_command(c=0x7fa88049e180)
atmemcached.c:3504
#70x000000000040b478indrive_machine(fd=<valueoptimizedout>,
which=<valueoptimizedout>,arg=0x7fa88049e180)atmemcached.c:3824
---Type<return>tocontinue,orq<return>toquit---
#8event_handler(fd=<valueoptimizedout>,which=<valueoptimizedout>,
arg=0x7fa88049e180)atmemcached.c:4065
#90x000000323cc06b44inevent_base_loop()from/usr/lib64/libevent-1.4.so.2
#100x000000000041070dinworker_libevent(arg=0x1acc2f0)atthread.c:384
#110x0000003c9e607851instart_thread()from/lib64/libpthread.so.0
#120x0000003c9e2e767dinclone()from/lib64/libc.so.6
(gdb)
(gdb)


ViewCode
It'seasytoknowthatthereissomethingwronginthread4,alsoknowthatthethread4gotalockofanitemthenotherthreadisblockbythelock.

(gdb)thread4
[Switchingtothread4(Thread0x7fa89fe24700(LWP26058))]#00x000000000040fd34inassoc_find(key=<valueoptimizedout>,nkey=<valueoptimizedout>,
hv=<valueoptimizedout>)atassoc.c:92
92if((nkey==it->nkey)&&(memcmp(key,ITEM_key(it),nkey)==0)){
(gdb)bt
#00x000000000040fd34inassoc_find(key=<valueoptimizedout>,
nkey=<valueoptimizedout>,hv=<valueoptimizedout>)atassoc.c:92
#10x000000000040ef2eindo_item_get(key=0x7fa88008bd34"B920818_0",
nkey=<valueoptimizedout>,hv=3789230535)atitems.c:523
#20x0000000000411076initem_get(key=0x7fa88008bd34"B920818_0",nkey=9)
atthread.c:499
#30x000000000040731einprocess_get_command(c=0x7fa88008bb30,
tokens=0x7fa89fe23bf0,ntokens=<valueoptimizedout>,return_cas=false)
atmemcached.c:2725
#40x0000000000409b63inprocess_command(c=0x7fa88008bb30,
command=<valueoptimizedout>)atmemcached.c:3249
#50x000000000040a7e2intry_read_command(c=0x7fa88008bb30)
atmemcached.c:3504
#60x000000000040b478indrive_machine(fd=<valueoptimizedout>,
which=<valueoptimizedout>,arg=0x7fa88008bb30)atmemcached.c:3824
#7event_handler(fd=<valueoptimizedout>,which=<valueoptimizedout>,
arg=0x7fa88008bb30)atmemcached.c:4065
#80x000000323cc06b44inevent_base_loop()from/usr/lib64/libevent-1.4.so.2


(gdb)pit
$1=(item*)0x7fa89799e5b0
(gdb)p*it
$2={next=0x7fa89d0646c0,prev=0x7fa8979b0760,h_next=0x7fa89799e5b0,time=31672441,exptime=31758841,nbytes=10,refcount=1,nsuffix=10'\n',
it_flags=3'\003',slabs_clsid=2'\002',nkey=34'"',data=0x7fa89799e5b0}
(gdb)


sotheit->h_nextispointtoitself,thenthedeadloophappend.

ihavecheckallthecodethatmodifythevalueit->h_next,(theassoc_insert/assoc_deleteassoc_maintenance_threadfunction),wheneveritismodified,ithavetogetthelock"cache_lock".soican'tfindthereasonwhyanitempointedtoitself.

moreinfomation:

Inthread4:

sir8-->nkey=9

dir9-->key="B920818_0"
hv=0xe1db11c7,sooffsetinprimary_hashtableis=0x111c7

(gdb)pprimary_hashtable
$22=(item**)0x7fa8840008c

soigetthefirstiteminthehashbucket.

(gdb)x/10g0x00007fa8840008c0+0x8*0x111c7
0x7fa8840896f8:0x00007fa890b46ef0


andthenthedeadloopitem'saddressistheseconditeminthehashbucketlist.yes,thereishashconflict.

(gdb)x/10xg0x00007fa890b46ef0
0x7fa890b46ef0:0x00007fa89d1884f00x00007fa89d1ee2b0
0x7fa890b46f00:0x00007fa89799e5b00x01f3710601f362f6
0x7fa890b46f10:0x03070001000000030x0000000000001501
0x7fa890b46f20:0x000000030ed02eb60x4544524f5f594247
0x7fa890b46f30:0x3632343532325f520x332032305f383533
(gdb)


theiteminthehashbucketlistis:

(gdb)pit
$1=(item*)0x7fa89799e5b0
(gdb)p*it
$2={next=0x7fa89d0646c0,prev=0x7fa8979b0760,h_next=0x7fa89799e5b0,time=31672441,exptime=31758841,nbytes=10,refcount=1,nsuffix=10'\n',
it_flags=3'\003',slabs_clsid=2'\002',nkey=34'"',data=0x7fa89799e5b0}
(gdb)
(gdb)p*(item*)0x00007fa89d1884f0
$12={next=0x7fa890b4f110,prev=0x7fa890b46ef0,h_next=0x0,time=32727785,exptime=32731385,nbytes=3,refcount=1,nsuffix=7'\a',
it_flags=3'\003',slabs_clsid=1'\001',nkey=21'\025',data=0x7fa89d1884f0}
(gdb)

item[1]:"71912_yhd.serial.product.get_1.0_0163848\r\n"nsuffix=10;slab2;nkey=34;nbytes=10

(littleendian)
(gdb)x/100xb0x00007fa89799e5b0
0x7fa89799e5b0:0xc00x460x060x9d0xa80x7f0x000x00
0x7fa89799e5b8:0x600x070x9b0x970xa80x7f0x000x00
0x7fa89799e5c0:0xb00xe50x990x970xa80x7f0x000x00
0x7fa89799e5c8:0x790x480xe30x010xf90x990xe40x01
0x7fa89799e5d0:0x0a0x000x000x000x010x000x0a0x03
0x7fa89799e5d8:0x020x220x000x000x000x000x000x00
0x7fa89799e5e0:0x6d0x330x4f0xd60x020x000x000x00
0x7fa89799e5e8:0x370x310x390x310x320x5f0x790x68
0x7fa89799e5f0:0x640x2e0x730x650x720x690x610x6c
0x7fa89799e5f8:0x2e0x700x720x6f0x640x750x630x74
0x7fa89799e600:0x2e0x670x650x740x5f0x310x2e0x30
0x7fa89799e608:0x5f0x300x200x200x310x360x330x38
0x7fa89799e610:0x340x200x380x0d0x0a0x00[0x000x008\r\n"
0x7fa89799e618:0x000x000x000x000x2d0x0d]0x0a0x0a
0x7fa89799e620:0x080xe10x0d0x0a0x0d0x700x0d0x0a
(key



item[0]:"Y_ORDER_225426358_02321\r\n1\r\n513\r\n"nssuffix=7,slab=1,nkey=21,nbytes=3
item[1]:"71912_yhd.serial.product.get_1.0_0163848\r\n"nsuffix=10;slab2;nkey=34;nbytes=10
whenthedeadloophappened,itislookingforkey"B920818_0".andthekeyisnotinthelist(maybeitis).

itisoddthattheitem[1]havenodata,althoughthenbyteis10!

----------------

iguessthatthebugoccurredwhenhashexpanding.thehashpower=17(thedefaultis16),andhash_items=101025,thehashexpandwhenhash_items>98304.Itisverypossiblethatthedealoophapenedafterexpanding,andthenallthethreadishanged.

(gdb)pexpand_bucket
$16=65536
(gdb)pstats.hash_is_expanding
$17=false
(gdb)phash_items
$18=101025
(gdb)p256*256*3/2
$19=98304

(gdb)pexpanding

$20=false
(gdb)

(gdb)phashpower
$21=17
(gdb)



ican'tnotfindthebuginthecode,soifanyguyshaveanysuggestion,pleasetellme.

thanksalot.
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: