您的位置:首页 > 其它

ceph源码分析--monitor leader选举

2018-01-19 01:29 786 查看
本篇博客主要分为三个部分

1.概述

2.选举的时机

3.选举的过程

1.概述

monitor在运行的过程中,必须存在一个leader节点。众多的更新操作都由leader节点来完成,写命令也会经由peon转发到leader节点处理。

leader的选择是根据rank值来的,rank值小的为leader,而rank值跟IP地址有关。

2.选举的时机

在ceph中有大致三处会引发选举

1.收到quorum exit/enter

2.收到选举消息

3.bootstrap之后

之所以这样排序是因为从简单到复杂。

1)收到quorum exit/enter命令

在这个过程中participating会被改变,导致该monitor不再参与选举或者参与选举

// Excerpt of Monitor::handle_command (elided parts are marked with "····"):
// handling of the "quorum" command and its "exit" / "enter" sub-commands.
// Both sub-commands call start_election(); they differ in whether the
// elector is told to stop or to start participating, and in the order of
// the two calls.
void Monitor::handle_command(MonOpRequestRef op)
{
····
else if (prefix == "quorum") {
string quorumcmd;
cmd_getval(g_ceph_context, cmdmap, "quorumcmd", quorumcmd);
// "quorum exit": call an election first, then stop participating.
if (quorumcmd == "exit") {
start_election();
elector.stop_participating();
rs = "stopped responding to quorum, initiated new election";
r = 0;
}
// "quorum enter": resume participating first, then call an election.
else if (quorumcmd == "enter") {
elector.start_participating();
start_election();
rs = "started responding to quorum, initiated new election";
r = 0;
}
····
}


2)收到选举消息

monitor收到选举消息后

// Excerpt of Monitor::dispatch_op (elided parts are marked with "···"):
// the MSG_MON_ELECTION case. The message is dropped when the session lacks
// the MON_CAP_X capability on "mon"; otherwise it is forwarded to the
// elector, but only while this monitor is neither probing nor synchronizing.
void Monitor::dispatch_op(MonOpRequestRef op)
{
···
case MSG_MON_ELECTION:
op->set_type_election();
//check privileges here for simplicity
if (!op->get_session()->is_capable("mon", MON_CAP_X)) {
dout(0) << "MMonElection received from entity without enough caps!"
<< op->get_session()->caps << dendl;
break;
}
// Election messages are ignored while probing or synchronizing.
if (!is_probing() && !is_synchronizing()) {
elector.dispatch(op);
}
break;
···
}


调用了void Elector::dispatch(MonOpRequestRef op)

在对方发起选举的场景下,这时收到的消息类型是OP_PROPOSE

// Excerpt of Elector::dispatch (elided parts are marked with "···"):
// an OP_PROPOSE election message is routed to handle_propose().
void Elector::dispatch(MonOpRequestRef op)
{
···
case MMonElection::OP_PROPOSE:
handle_propose(op);
return;
···
}


进入void Elector::handle_propose(MonOpRequestRef op)

// Excerpt of Elector::handle_propose (elided parts are marked with "···")
// showing the two places where receiving a propose makes this monitor call
// a new election itself.
void Elector::handle_propose(MonOpRequestRef op)
{
···
// Case 1: the propose carries an epoch older than ours. If we are not in
// an election cycle and the sender is not in our quorum, call an election.
if (m->epoch < epoch) {
// got an "old" propose,
if (epoch % 2 == 0 &&    // in a non-election cycle
mon->quorum.count(from) == 0) {  // from someone outside the quorum
// a mon just started up, call a new election so they can rejoin!
dout(5) << " got propose from old epoch, quorum is " << mon->quorum
<< ", " << m->get_source() << " must have just started" << dendl;
// we may be active; make sure we reset things in the monitor appropriately.
mon->start_election();
} else {
dout(5) << " ignoring old propose" << dendl;
return;
}
···
// Case 2: our rank is lower than the sender's. Two sub-cases:
//  - we already acked someone (asserted to have a lower rank than the
//    sender), so we do not answer this propose;
//  - we have acked nobody, so we should win and call an election
//    unless we are already electing ourselves.
if (mon->rank < from) {
// i would win over them.
if (leader_acked >= 0) {        // we already acked someone
assert(leader_acked < from);  // and they still win, of course
dout(5) << "no, we already acked " << leader_acked << dendl;
} else {
// wait, i should win!
if (!electing_me) {
mon->start_election();
}
}
}
···
}


3)bootstrap之后

在bootstrap中也有两种情况

第一种:单monitor集群

void Monitor::win_standalone_election()

void Monitor::win_standalone_election()
{
dout(1) << "win_standalone_election" << dendl;

// bump election epoch, in case the previous epoch included other
// monitors; we need to be able to make the distinction.
elector.init();
elector.advance_epoch();

rank = monmap->get_rank(name);
assert(rank == 0);
set<int> q;
q.insert(rank);

map<int,Metadata> metadata;
collect_metadata(&metadata[0]);

win_election(elector.get_epoch(), q,
CEPH_FEATURES_ALL,
ceph::features::mon::get_supported(),
metadata);
}


第二种:先probe其他monitor

博客中主要讲解第二种先probe其他monitor

// Excerpt of Monitor::bootstrap (elided parts are marked with "····").
// A single-monitor map with rank 0 wins a standalone election immediately.
// Otherwise an OP_PROBE message is sent to every other monitor in the
// monmap and to every address in extra_probe_peers other than our own.
void Monitor::bootstrap()
{
dout(10) << "bootstrap" << dendl;
····
// Single-monitor cluster: no peers to probe, win right away.
// singleton monitor?
if (monmap->size() == 1 && rank == 0) {
win_standalone_election();
return;
}

reset_probe_timeout();

// i'm outside the quorum
if (monmap->contains(name))
outside_quorum.insert(name);

// probe monitors
dout(10) << "probing other monitors" << dendl;
for (unsigned i = 0; i < monmap->size(); i++) {
if ((int)i != rank)
messenger->send_message(new MMonProbe(monmap->fsid, MMonProbe::OP_PROBE, name, has_ever_joined),
monmap->get_inst(i));
}
// Also probe the extra peers, addressed as MON(-1), skipping ourselves.
for (set<entity_addr_t>::iterator p = extra_probe_peers.begin();
p != extra_probe_peers.end();
++p) {
if (*p != messenger->get_myaddr()) {
entity_inst_t i;
i.name = entity_name_t::MON(-1);
i.addr = *p;
messenger->send_message(new MMonProbe(monmap->fsid, MMonProbe::OP_PROBE, name, has_ever_joined), i);
}
}
}


接收方

// Entry point for MMonProbe messages.
//
// Messages whose fsid differs from our monmap's fsid are logged and dropped.
// Otherwise the message is routed by its op: OP_PROBE to
// handle_probe_probe(), OP_REPLY to handle_probe_reply(), and
// OP_MISSING_FEATURES is only logged. Any other op is silently ignored.
void Monitor::handle_probe(MonOpRequestRef op)
{
  MMonProbe *m = static_cast<MMonProbe*>(op->get_req());
  dout(10) << "handle_probe " << *m << dendl;

  // Both sides must have the same fsid.
  if (m->fsid != monmap->fsid) {
    dout(0) << "handle_probe ignoring fsid " << m->fsid << " != " << monmap->fsid << dendl;
    return;
  }

  const auto probe_op = m->op;
  if (probe_op == MMonProbe::OP_PROBE) {
    // A peer is probing us.
    handle_probe_probe(op);
  } else if (probe_op == MMonProbe::OP_REPLY) {
    // A peer answered one of our probes.
    handle_probe_reply(op);
  } else if (probe_op == MMonProbe::OP_MISSING_FEATURES) {
    derr << __func__ << " missing features, have " << CEPH_FEATURES_ALL
         << ", required " << m->required_features
         << ", missing " << (m->required_features & ~CEPH_FEATURES_ALL)
         << dendl;
  }
}

// Excerpt of Monitor::handle_probe_probe (elided parts are marked with
// "···"; the declaration of `m` is in the elided part).
// Re-bootstraps if the prober's first paxos version is ahead of ours.
// Otherwise it answers with an OP_REPLY carrying our name, quorum, encoded
// monmap and paxos version range, and remembers probers that are not in
// our monmap as extra probe peers.
void Monitor::handle_probe_probe(MonOpRequestRef op)
{
···
if (!is_probing() && !is_synchronizing()) {
// If the probing mon is way ahead of us, we need to re-bootstrap.
// Normally we capture this case when we initially bootstrap, but
// it is possible we pass those checks (we overlap with
// quorum-to-be) but fail to join a quorum before it moves past
// us.  We need to be kicked back to bootstrap so we can
// synchonize, not keep calling elections.
// The sender's data is newer than ours: bootstrap again.
if (paxos->get_version() + 1 < m->paxos_first_version) {
dout(1) << " peer " << m->get_source_addr() << " has first_committed "
<< "ahead of us, re-bootstrapping" << dendl;
bootstrap();
goto out;

}
}

MMonProbe *r;
// Build the reply and send it back on the connection the probe came in on.
r = new MMonProbe(monmap->fsid, MMonProbe::OP_REPLY, name, has_ever_joined);
r->name = name;
r->quorum = quorum;
monmap->encode(r->monmap_bl, m->get_connection()->get_features());
r->paxos_first_version = paxos->get_first_committed();
r->paxos_last_version = paxos->get_version();
m->get_connection()->send_message(r);

// did we discover a peer here?
if (!monmap->contains(m->get_source_addr())) {
dout(1) << " adding peer " << m->get_source_addr()
<< " to list of hints" << dendl;
extra_probe_peers.insert(m->get_source_addr());
}

out:
return;
}


处理probe_reply消息,在这期间会同步monitor的一些数据

// Handle an OP_REPLY answer to one of our probes.
//
// In order, the reply may cause us to:
//  - adopt the peer's monmap and bootstrap again,
//  - rename a "noname-" peer or learn an initial peer's address (and
//    bootstrap again where the code below says so),
//  - start a store sync with the peer when its paxos versions are too far
//    ahead of ours,
//  - call an election or send an MMonJoin when the peer reports a quorum,
//  - or record the peer in outside_quorum and call an election once
//    monmap->size() / 2 + 1 monitors, including ourselves, are in that set.
//
// Replies are ignored unless we are probing or electing; everything after
// the "end discover phase" marker runs only while probing.
void Monitor::handle_probe_reply(MonOpRequestRef op)
{
  MMonProbe *m = static_cast<MMonProbe*>(op->get_req());
  dout(10) << "handle_probe_reply " << m->get_source_inst() << *m << dendl;
  dout(10) << " monmap is " << *monmap << dendl;

  // discover name and addrs during probing or electing states.
  if (!is_probing() && !is_electing()) {
    return;
  }

  // newer map, or they've joined a quorum and we haven't?
  bufferlist mybl;
  monmap->encode(mybl, m->get_connection()->get_features());
  // make sure it's actually different; the checks below err toward
  // taking the other guy's map, which could cause us to loop.
  if (!mybl.contents_equal(m->monmap_bl)) {
    // Decode into an automatic object instead of a raw new/delete pair, so
    // the temporary map is released on every exit path.
    // NOTE(review): this also covers decode() throwing on a malformed
    // buffer -- presumably possible; confirm against MonMap::decode.
    MonMap newmap;
    newmap.decode(m->monmap_bl);
    if (m->has_ever_joined && (newmap.get_epoch() > monmap->get_epoch() ||
                               !has_ever_joined)) {
      dout(10) << " got newer/committed monmap epoch " << newmap.get_epoch()
               << ", mine was " << monmap->get_epoch() << dendl;
      monmap->decode(m->monmap_bl);
      // The peer's monmap replaced ours: bootstrap again.
      bootstrap();
      return;
    }
  }

  // rename peer?
  string peer_name = monmap->get_name(m->get_source_addr());
  if (monmap->get_epoch() == 0 && peer_name.compare(0, 7, "noname-") == 0) {
    dout(10) << " renaming peer " << m->get_source_addr() << " "
             << peer_name << " -> " << m->name << " in my monmap"
             << dendl;
    monmap->rename(peer_name, m->name);

    if (is_electing()) {
      bootstrap();
      return;
    }
  } else {
    dout(10) << " peer name is " << peer_name << dendl;
  }

  // new initial peer?
  if (monmap->get_epoch() == 0 &&
      monmap->contains(m->name) &&
      monmap->get_addr(m->name).is_blank_ip()) {
    dout(1) << " learned initial mon " << m->name << " addr " << m->get_source_addr() << dendl;
    monmap->set_addr(m->name, m->get_source_addr());

    bootstrap();
    return;
  }

  // end discover phase
  if (!is_probing()) {
    return;
  }

  assert(paxos != NULL);

  if (is_synchronizing()) {
    dout(10) << " currently syncing" << dendl;
    return;
  }

  entity_inst_t other = m->get_source_inst();

  if (m->paxos_last_version < sync_last_committed_floor) {
    dout(10) << " peer paxos versions [" << m->paxos_first_version
             << "," << m->paxos_last_version << "] < my sync_last_committed_floor "
             << sync_last_committed_floor << ", ignoring"
             << dendl;
  } else {
    if (paxos->get_version() < m->paxos_first_version &&
        m->paxos_first_version > 1) {  // no need to sync if we're 0 and they start at 1.
      dout(10) << " peer paxos first versions [" << m->paxos_first_version
               << "," << m->paxos_last_version << "]"
               << " vs my version " << paxos->get_version()
               << " (too far ahead)"
               << dendl;
      cancel_probe_timeout();
      sync_start(other, true);
      return;
    }
    if (paxos->get_version() + g_conf->paxos_max_join_drift < m->paxos_last_version) {
      dout(10) << " peer paxos last version " << m->paxos_last_version
               << " vs my version " << paxos->get_version()
               << " (too far ahead)"
               << dendl;
      cancel_probe_timeout();
      sync_start(other, false);
      return;
    }
  }

  // is there an existing quorum?
  if (m->quorum.size()) {
    dout(10) << " existing quorum " << m->quorum << dendl;

    dout(10) << " peer paxos version " << m->paxos_last_version
             << " vs my version " << paxos->get_version()
             << " (ok)"
             << dendl;

    if (monmap->contains(name) &&
        !monmap->get_addr(name).is_blank_ip()) {
      // i'm part of the cluster; just initiate a new election
      start_election();
    } else {
      dout(10) << " ready to join, but i'm not in the monmap or my addr is blank, trying to join" << dendl;
      messenger->send_message(new MMonJoin(monmap->fsid, name, messenger->get_myaddr()),
                              monmap->get_inst(*m->quorum.begin()));
    }
  } else {
    if (monmap->contains(m->name)) {
      dout(10) << " mon." << m->name << " is outside the quorum" << dendl;
      outside_quorum.insert(m->name);
    } else {
      dout(10) << " mostly ignoring mon." << m->name << ", not part of monmap" << dendl;
      return;
    }

    // Strict majority of the monmap.
    unsigned need = monmap->size() / 2 + 1;
    dout(10) << " outside_quorum now " << outside_quorum << ", need " << need << dendl;
    if (outside_quorum.size() >= need) {
      if (outside_quorum.count(name)) {
        // A majority of monitors (us included) is reachable: call an election.
        dout(10) << " that's enough to form a new quorum, calling election" << dendl;
        start_election();
      } else {
        dout(10) << " that's enough to form a new quorum, but it does not include me; waiting" << dendl;
      }
    } else {
      dout(10) << " that's not yet enough for a new quorum, waiting" << dendl;
    }
  }
}


经过这些步骤之后,开始选举

3.选举的过程

选举从void Monitor::start_election()开始

// Start a new monitor election.
// Switches the monitor state to STATE_ELECTING, bumps the election
// counters, writes a cluster-log line and delegates to the elector.
void Monitor::start_election()
{
dout(10) << "start_election" << dendl;
// NOTE(review): presumably blocks until an in-flight paxos write has
// finished and then resets per-quorum monitor state -- confirm against
// the definitions of wait_for_paxos_write() and _reset().
wait_for_paxos_write();
_reset();
state = STATE_ELECTING;

logger->inc(l_mon_num_elections);
logger->inc(l_mon_election_call);

clog->info() << "mon." << name << " calling new monitor election";
elector.call_election();
}


void call_election()

// Call an election: a thin wrapper that forwards to start().
void call_election() {
start();
}


void Elector::start()

void Elector::start()
{
//ceph quorum exit退出之后这个值会置成false
if (!participating) {
dout(0) << "not starting new election -- not participating" << dendl;
return;
}
dout(5) << "start -- can i be leader?" << dendl;
//清空ack的集合
acked_me.clear();
init();

// start by trying to elect me
if (epoch % 2 == 0) {
bump_epoch(epoch+1);  // odd == election cycle
} else {
// do a trivial db write just to ensure it is writeable.
auto t(std::make_shared<MonitorDBStore::Transaction>());
t->put(Monitor::MONITOR_NAME, "election_writeable_test", rand());
int r = mon->store->apply_transaction(t);
assert(r >= 0);
}
start_stamp = ceph_clock_now();
electing_me = true;
acked_me[mon->rank].cluster_features = CEPH_FEATURES_ALL;
acked_me[mon->rank].mon_features = ceph::features::mon::get_supported();
mon->collect_metadata(&acked_me[mon->rank].metadata);
leader_acked = -1;

// bcast to everyone else
for (unsigned i=0; i<mon->monmap->size(); ++i) {
if ((int)i == mon->rank) continue;
//发起选举消息
MMonElection *m =
new MMonElection(MMonElection::OP_PROPOSE, epoch, mon->monmap);
m->mon_features = ceph::features::mon::get_supported();
mon->messenger->send_message(m, mon->monmap->get_inst(i));
}

reset_timer();
}


void Elector::init()

// Load the election epoch from the monitor store.
//
//  - no stored epoch (reads as 0): start at 1, nothing is written back;
//  - odd stored epoch (we stopped mid-election): bump it by one and persist
//    the new value;
//  - even stored epoch: use it as is.
void Elector::init()
{
  epoch = mon->store->get(Monitor::MONITOR_NAME, "election_epoch");

  if (!epoch) {
    dout(1) << "init, first boot, initializing epoch at 1 " << dendl;
    epoch = 1;
    return;
  }

  if (epoch % 2 == 0) {
    dout(1) << "init, last seen epoch " << epoch << dendl;
    return;
  }

  // Odd epoch: an election was in progress when the value was stored.
  dout(1) << "init, last seen epoch " << epoch
          << ", mid-election, bumping" << dendl;
  ++epoch;
  auto txn(std::make_shared<MonitorDBStore::Transaction>());
  txn->put(Monitor::MONITOR_NAME, "election_epoch", epoch);
  mon->store->apply_transaction(txn);
}


void Elector::handle_propose(MonOpRequestRef op)

// Handle an OP_PROPOSE from another monitor that wants to become leader.
//
// Steps:
//  1. Reject (NAK and return) proposers that lack the connection features
//     or the mon features we require.
//  2. Reconcile epochs: adopt a newer epoch; for an older one, either call a
//     new election (we are outside an election cycle and the sender is not
//     in our quorum) or ignore the message.
//  3. Compare ranks:
//       - our rank is lower: if we already acked someone, do nothing;
//         otherwise call an election unless we are already electing
//         ourselves;
//       - their rank is lower or equal: defer to them unless we already
//         acked a monitor with an even lower rank.
void Elector::handle_propose(MonOpRequestRef op)
{
  op->mark_event("elector:handle_propose");
  MMonElection *m = static_cast<MMonElection*>(op->get_req());
  dout(5) << "handle_propose from " << m->get_source() << dendl;
  int from = m->get_source().num();

  assert(m->epoch % 2 == 1); // election
  uint64_t required_features = mon->get_required_features();
  mon_feature_t required_mon_features = mon->get_required_mon_features();

  dout(10) << __func__ << " required features " << required_features
           << " " << required_mon_features
           << ", peer features " << m->get_connection()->get_features()
           << " " << m->mon_features
           << dendl;

  if ((required_features ^ m->get_connection()->get_features()) &
      required_features) {
    dout(5) << " ignoring propose from mon" << from
            << " without required features" << dendl;
    nak_old_peer(op);
    return;
  } else if (!m->mon_features.contains_all(required_mon_features)) {
    // all the features in 'required_mon_features' not in 'm->mon_features'
    mon_feature_t missing = required_mon_features.diff(m->mon_features);
    dout(5) << " ignoring propose from mon." << from
            << " without required mon_features " << missing
            << dendl;
    nak_old_peer(op);
    // Stop here, as the branch above does: without this return we would
    // fall through to the rank comparison and could defer() to (i.e. ack)
    // the very peer we just rejected.
    return;
  } else if (m->epoch > epoch) {
    bump_epoch(m->epoch);
  } else if (m->epoch < epoch) {
    // got an "old" propose,
    if (epoch % 2 == 0 &&    // in a non-election cycle
        // propose from a monitor outside the quorum: call a new election
        mon->quorum.count(from) == 0) {  // from someone outside the quorum
      // a mon just started up, call a new election so they can rejoin!
      dout(5) << " got propose from old epoch, quorum is " << mon->quorum
              << ", " << m->get_source() << " must have just started" << dendl;
      // we may be active; make sure we reset things in the monitor appropriately.
      mon->start_election();
    } else {
      dout(5) << " ignoring old propose" << dendl;
      return;
    }
  }

  if (mon->rank < from) {
    // i would win over them.
    if (leader_acked >= 0) {        // we already acked someone
      assert(leader_acked < from);  // and they still win, of course
      dout(5) << "no, we already acked " << leader_acked << dendl;
    } else {
      // wait, i should win!
      if (!electing_me) {
        mon->start_election();
      }
    }
  } else {
    // they would win over me
    if (leader_acked < 0 ||      // haven't acked anyone yet, or
        leader_acked > from ||   // they would win over who you did ack, or
        leader_acked == from) {  // this is the guy we're already deferring to
      defer(from);
    } else {
      // ignore them!
      dout(5) << "no, we already acked " << leader_acked << dendl;
    }
  }
}


void Elector::defer(int who)

// Defer to monitor rank `who`: give up our own candidacy if we had one,
// record `who` as the monitor we acked, send it an OP_ACK carrying our mon
// features and metadata, and re-arm the election timer with 1.0 extra.
void Elector::defer(int who)
{
dout(5) << "defer to " << who << dendl;

if (electing_me) {
// drop out
acked_me.clear();
electing_me = false;
}

// ack them
leader_acked = who;
ack_stamp = ceph_clock_now();
MMonElection *m = new MMonElection(MMonElection::OP_ACK, epoch, mon->monmap);
m->mon_features = ceph::features::mon::get_supported();
mon->collect_metadata(&m->metadata);

// This field is unused completely in luminous, but jewel uses it to
// determine whether we are a dumpling mon due to some crufty old
// code.  It only needs to see this buffer non-empty, so put
// something useless there.
m->sharing_bl = mon->get_local_commands_bl(mon->get_required_mon_features());

mon->messenger->send_message(m, mon->monmap->get_inst(who));

// set a timer
reset_timer(1.0);  // give the leader some extra time to declare victory
}


选举不超时的情况下

void Elector::handle_ack(MonOpRequestRef op)

// Handle an OP_ACK from a monitor that defers to us.
//
// An ack from a newer epoch makes us adopt that epoch and restart the
// election. Acks from peers lacking the required connection or mon features
// are ignored. While we are electing ourselves the ack is recorded in
// acked_me, and once every monitor in the monmap has acked we declare
// victory immediately instead of waiting for the timer.
void Elector::handle_ack(MonOpRequestRef op)
{
  op->mark_event("elector:handle_ack");
  MMonElection *m = static_cast<MMonElection*>(op->get_req());
  dout(5) << "handle_ack from " << m->get_source() << dendl;
  int from = m->get_source().num();

  assert(m->epoch % 2 == 1); // election
  if (m->epoch > epoch) {
    dout(5) << "woah, that's a newer epoch, i must have rebooted.  bumping and re-starting!" << dendl;
    bump_epoch(m->epoch);
    start();
    return;
  }
  assert(m->epoch == epoch);

  // Connection-level features the acker must have.
  uint64_t required_features = mon->get_required_features();
  if ((required_features ^ m->get_connection()->get_features()) &
      required_features) {
    dout(5) << " ignoring ack from mon" << from
            << " without required features" << dendl;
    return;
  }

  // Monitor-level features the acker must have.
  mon_feature_t required_mon_features = mon->get_required_mon_features();
  if (!m->mon_features.contains_all(required_mon_features)) {
    mon_feature_t missing = required_mon_features.diff(m->mon_features);
    dout(5) << " ignoring ack from mon." << from
            << " without required mon_features " << missing
            << dendl;
    return;
  }

  if (!electing_me) {
    // ignore, i'm deferring already.
    assert(leader_acked >= 0);
    return;
  }

  // thanks
  elector_info_t& acker = acked_me[from];
  acker.cluster_features = m->get_connection()->get_features();
  acker.mon_features = m->mon_features;
  acker.metadata = m->metadata;

  dout(5) << " so far i have {";
  bool first_entry = true;
  for (const auto& entry : acked_me) {
    if (!first_entry)
      *_dout << ",";
    first_entry = false;
    *_dout << " mon." << entry.first << ":"
           << " features " << entry.second.cluster_features
           << " " << entry.second.mon_features;
  }
  *_dout << " }" << dendl;

  // is that _everyone_?
  if (acked_me.size() == mon->monmap->size()) {
    // if yes, shortcut to election finish
    victory();
  }
}


选举超时的情况下

// Election timer callback.
//
// If we are electing ourselves and more than half of the monmap has acked
// us, we win. Otherwise the round failed: restart the election if this
// monitor has ever joined a quorum, or go back to bootstrap if it has not.
void Elector::expire()
{
  dout(5) << "election timer expired" << dendl;

  // did i win? On timeout a strict majority of acks is enough.
  const bool have_majority =
    electing_me &&
    acked_me.size() > (unsigned)(mon->monmap->size() / 2);
  if (have_majority) {
    // i win
    victory();
    return;
  }

  // whoever i deferred to didn't declare victory quickly enough.
  if (!mon->has_ever_joined) {
    mon->bootstrap();
    return;
  }
  start();
}


如果赢得选举

void Elector::victory()

void Elector::victory()
{
leader_acked = -1;
electing_me = false;

uint64_t cluster_features = CEPH_FEATURES_ALL;
mon_feature_t mon_features = ceph::features::mon::get_supported();
set<int> quorum;
map<int,Metadata> metadata;
for (map<int, elector_info_t>::iterator p = acked_me.begin();
p != acked_me.end();
++p) {
quorum.insert(p->first);
cluster_features &= p->second.cluster_features;
mon_features &= p->second.mon_features;
metadata[p->first] = p->second.metadata;
}

cancel_timer();

assert(epoch % 2 == 1);  // election
bump_epoch(epoch+1);     // is over!

// tell everyone!
for (set<int>::iterator p = quorum.begin();
p != quorum.end();
++p) {
if (*p == mon->rank) continue;
MMonElection *m = new MMonElection(MMonElection::OP_VICTORY, epoch,
mon->monmap);
m->quorum = quorum;
m->quorum_features = cluster_features;
m->mon_features = mon_features;
m->sharing_bl = mon->get_local_commands_bl(mon_features);
mon->messenger->send_message(m, mon->monmap->get_inst(*p));
}

// tell monitor
//赢得选举,把自己标为leader
mon->win_election(epoch, quorum,
cluster_features, mon_features, metadata);
}


void Elector::handle_victory(MonOpRequestRef op)

// Handle an OP_VICTORY from the monitor that won the election.
// Asserts that the sender has a lower rank than ours and that the message
// epoch is even. If the epoch is not exactly ours + 1, we adopt it and
// restart the election. Otherwise we adopt the epoch, tell the monitor it
// lost, cancel the election timer and store the commands the winner shared.
void Elector::handle_victory(MonOpRequestRef op)
{
op->mark_event("elector:handle_victory");
MMonElection *m = static_cast<MMonElection*>(op->get_req());
dout(5) << "handle_victory from " << m->get_source()
<< " quorum_features " << m->quorum_features
<< " " << m->mon_features
<< dendl;
int from = m->get_source().num();

assert(from < mon->rank);
assert(m->epoch % 2 == 0);

leader_acked = -1;

// i should have seen this election if i'm getting the victory.
if (m->epoch != epoch + 1) {
dout(5) << "woah, that's a funny epoch, i must have rebooted.  bumping and re-starting!" << dendl;
bump_epoch(m->epoch);
start();
return;
}

bump_epoch(m->epoch);

// they win
// We lost: pass the sender's rank, quorum and features to the monitor.
mon->lose_election(epoch, m->quorum, from,
m->quorum_features, m->mon_features);

// cancel my timer
cancel_timer();

// stash leader's commands
assert(m->sharing_bl.length());
vector<MonCommand> new_cmds;
bufferlist::iterator bi = m->sharing_bl.begin();
MonCommand::decode_vector(new_cmds, bi);
mon->set_leader_commands(new_cmds);
}


初始化leader

void Monitor::win_election(epoch_t epoch, set<int>& active, uint64_t features,const mon_feature_t& mon_features,const map<int,Metadata>& metadata)

// Become leader after winning an election.
//
// Parameters:
//   epoch        - election epoch, passed on to health_monitor->start()
//   active       - ranks forming the new quorum
//   features     - stored as quorum_con_features
//   mon_features - stored as quorum_mon_features; also selects the leader
//                  command set
//   metadata     - per-rank metadata of the quorum members
//
// Must be called while electing (asserted). Sets the state to STATE_LEADER,
// initializes paxos as leader, queues the merged metadata in the pending
// paxos transaction and finishes the election.
void Monitor::win_election(epoch_t epoch, set<int>& active, uint64_t features,const mon_feature_t& mon_features,const map<int,Metadata>& metadata)
{
  dout(10) << __func__ << " epoch " << epoch << " quorum " << active
           << " features " << features
           << " mon_features " << mon_features
           << dendl;
  assert(is_electing());

  // Record the election outcome.
  state = STATE_LEADER;
  leader_since = ceph_clock_now();
  leader = rank;
  quorum = active;
  quorum_con_features = features;
  quorum_mon_features = mon_features;
  pending_metadata = metadata;
  outside_quorum.clear();

  clog->info() << "mon." << name << "@" << rank
               << " won leader election with quorum " << quorum;

  set_leader_commands(get_local_commands(mon_features));

  paxos->leader_init();
  // NOTE: tell monmap monitor first.  This is important for the
  // bootstrap case to ensure that the very first paxos proposal
  // codifies the monmap.  Otherwise any manner of chaos can ensue
  // when monitors are call elections or participating in a paxos
  // round without agreeing on who the participants are.
  monmon()->election_finished();
  _finish_svc_election();
  health_monitor->start(epoch);

  logger->inc(l_mon_election_win);

  // inject new metadata in first transaction.
  {
    // include previous metadata for missing mons (that aren't part of
    // the current quorum).
    map<int,Metadata> merged = metadata;
    // The index is deliberately not named `rank`, to avoid shadowing the
    // member of that name used above.
    for (unsigned r = 0; r < monmap->size(); ++r) {
      if (merged.count(r) == 0 &&
          mon_metadata.count(r)) {
        merged[r] = mon_metadata[r];
      }
    }

    // FIXME: This is a bit sloppy because we aren't guaranteed to submit
    // a new transaction immediately after the election finishes.  We should
    // do that anyway for other reasons, though.
    MonitorDBStore::TransactionRef t = paxos->get_pending_transaction();
    bufferlist bl;
    ::encode(merged, bl);
    t->put(MONITOR_STORE_PREFIX, "last_metadata", bl);
  }

  finish_election();

  // The periodic leader tasks below are only started for a multi-monitor
  // map with a non-zero epoch.
  if (!(monmap->size() > 1) || !(monmap->get_epoch() > 0)) {
    return;
  }
  timecheck_start();
  health_tick_start();
  do_health_to_clog_interval();
  scrub_event_start();
}


输掉选举,把自己初始化成peon

void Monitor::lose_election(epoch_t epoch, set<int> &q, int l,uint64_t features,const mon_feature_t& mon_features)

// Become a peon after losing an election.
//
// Parameters:
//   epoch        - election epoch, passed on to health_monitor->start()
//   q            - ranks forming the new quorum
//   l            - rank of the leader
//   features     - stored as quorum_con_features
//   mon_features - stored as quorum_mon_features
//
// Sets the state to STATE_PEON, initializes paxos as peon and finishes the
// election. When the quorum has CEPH_FEATURE_MON_METADATA but not
// SERVER_LUMINOUS, our metadata is additionally sent to the leader.
void Monitor::lose_election(epoch_t epoch, set<int> &q, int l,
uint64_t features,
const mon_feature_t& mon_features)
{
state = STATE_PEON;
// We are not the leader, so leader_since is reset to a default utime_t.
leader_since = utime_t();
leader = l;
quorum = q;
outside_quorum.clear();
quorum_con_features = features;
quorum_mon_features = mon_features;
dout(10) << "lose_election, epoch " << epoch << " leader is mon" << leader
<< " quorum is " << quorum << " features are " << quorum_con_features
<< " mon_features are " << quorum_mon_features
<< dendl;

paxos->peon_init();
_finish_svc_election();
health_monitor->start(epoch);

logger->inc(l_mon_election_lose);

finish_election();

if ((quorum_con_features & CEPH_FEATURE_MON_METADATA) &&
!HAVE_FEATURE(quorum_con_features, SERVER_LUMINOUS)) {
// for pre-luminous mons only
Metadata sys_info;
collect_metadata(&sys_info);
messenger->send_message(new MMonMetadata(sys_info),
monmap->get_inst(get_leader()));
}
}


选举结束void Monitor::finish_election()

// Common tail of win_election() and lose_election().
// Applies quorum and monmap features to the compat set, finishes any
// running timecheck, clears exited_quorum, completes the contexts waiting
// for a quorum, resends routed requests and refreshes the loggers. If the
// monmap knows our address under a different name, an MMonJoin with our
// name is sent to the first member of the quorum.
void Monitor::finish_election()
{
apply_quorum_to_compatset_features();
apply_monmap_to_compatset_features();
timecheck_finish();
// Reset to a default utime_t now that the election has finished.
exited_quorum = utime_t();
finish_contexts(g_ceph_context, waitfor_quorum);
finish_contexts(g_ceph_context, maybe_wait_for_quorum);
resend_routed_requests();
update_logger();
register_cluster_logger();

// am i named properly?
string cur_name = monmap->get_name(messenger->get_myaddr());
if (cur_name != name) {
dout(10) << " renaming myself from " << cur_name << " -> " << name << dendl;
messenger->send_message(new MMonJoin(monmap->fsid, name, messenger->get_myaddr()),
monmap->get_inst(*quorum.begin()));
}
}


选举的基本流程到这里就介绍完了

关于选举的部分还遗留着几个问题

1.handle_nak没有介绍

2.完成选举之后的初始化(paxos)

3.为什么收到集群外的选举需要重新发起选举?
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: