/*- * See the file LICENSE for redistribution information. * * Copyright (c) 2005, 2010 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ #include "db_config.h" #include "db_int.h" static void __repmgr_config_elect __P((ENV *, u_int32_t, u_int32_t *, u_int32_t *)); static db_timeout_t __repmgr_compute_response_time __P((ENV *)); static int __repmgr_elect __P((ENV *, u_int32_t, u_int32_t, db_timespec *)); static int __repmgr_elect_main __P((ENV *, REPMGR_RUNNABLE *)); static void *__repmgr_elect_thread __P((void *)); /* * Starts an election thread. * * PUBLIC: int __repmgr_init_election __P((ENV *, u_int32_t)); * * !!! * Caller must hold mutex. */ int __repmgr_init_election(env, flags) ENV *env; u_int32_t flags; { DB_REP *db_rep; REPMGR_RUNNABLE *th; int ret; u_int i, new_size; COMPQUIET(th, NULL); db_rep = env->rep_handle; if (db_rep->finished) { RPRINT(env, (env, DB_VERB_REPMGR_MISC, "ignoring elect thread request %#lx; repmgr is finished", (u_long)flags)); return (0); } /* Find an available slot, indexed by 'i'; allocate more if needed. */ for (i = 0; i < db_rep->aelect_threads; i++) { th = db_rep->elect_threads[i]; if (th == NULL) break; if (th->finished) { if ((ret = __repmgr_thread_join(th)) != 0) return (ret); /* Reuse the space in a moment. */ break; } } if (i == db_rep->aelect_threads) { new_size = db_rep->aelect_threads + 1; if ((ret = __os_realloc(env, sizeof(REPMGR_RUNNABLE*) * new_size, &db_rep->elect_threads)) != 0) return (ret); db_rep->aelect_threads = new_size; STAT(db_rep->region->mstat.st_max_elect_threads = new_size); th = db_rep->elect_threads[i] = NULL; } if (th == NULL && (ret = __os_malloc(env, sizeof(REPMGR_RUNNABLE), &th)) != 0) return (ret); th->run = __repmgr_elect_thread; th->args.flags = flags; if ((ret = __repmgr_thread_start(env, th)) == 0) STAT(db_rep->region->mstat.st_elect_threads++); else { __os_free(env, th); th = NULL; } db_rep->elect_threads[i] = th; return (ret); } static void * __repmgr_elect_thread(argsp) void *argsp; { REPMGR_RUNNABLE *th; ENV *env; int ret; th = argsp; env = th->env; RPRINT(env, (env, DB_VERB_REPMGR_MISC, "starting election thread")); if ((ret = __repmgr_elect_main(env, th)) != 0) { __db_err(env, ret, "election thread failed"); __repmgr_thread_failure(env, ret); } RPRINT(env, (env, DB_VERB_REPMGR_MISC, "election thread is exiting")); th->finished = TRUE; return (NULL); } static int __repmgr_elect_main(env, th) ENV *env; REPMGR_RUNNABLE *th; { DB_REP *db_rep; REP *rep; #ifdef DB_WIN32 DWORD duration; db_timeout_t t; #else struct timespec deadline; #endif db_timespec failtime, now, repstart_time, target, wait_til; db_timeout_t response_time; u_int32_t flags, nsites, nvotes; int done_repstart, ret, suppress_election; enum { ELECTION, REPSTART } action; COMPQUIET(action, ELECTION); db_rep = env->rep_handle; rep = db_rep->region; flags = th->args.flags; if (LF_ISSET(ELECT_F_EVENT_NOTIFY)) DB_EVENT(env, DB_EVENT_REP_MASTER_FAILURE, NULL); /* * As a freshly started thread, lay claim to the title of being * "preferred". If an older thread is sleeping for retry, when it wakes * up it will relinquish its role (since there's no need for multiple * threads to sleep and retry). */ LOCK_MUTEX(db_rep->mutex); db_rep->preferred_elect_thr = th; UNLOCK_MUTEX(db_rep->mutex); /* * The 'done_repstart' flag keeps track of which was our most recent * operation (repstart or election), so that we can alternate * appropriately. There are a few different ways this thread can be * invoked, and all but one specify some form of immediate election be * called. The one exception is at initial start-up, where we * first probe for a master by sending out rep_start(CLIENT) calls. */ if (LF_ISSET(ELECT_F_IMMED)) { __repmgr_config_elect(env, flags, &nsites, &nvotes); /* * When the election succeeds, we've successfully completed * everything we need to do. If it fails in an unexpected way, * we abort all processing as usual. The only time we need to * stay in here and do some more work is on DB_REP_UNAVAIL, * in which case we want to wait a while and retry later. */ if ((ret = __repmgr_elect(env, nsites, nvotes, &failtime)) != DB_REP_UNAVAIL) goto out; done_repstart = FALSE; } else { /* * We didn't really have an election failure, because in this * case we haven't even done an election yet. But the timing * we want turns out the same: we want to wait for the election * retry time and then call for an election if nothing else * interesting happens before then. */ __os_gettime(env, &failtime, 1); /* * Although we didn't do a repstart in this thread, we know that * our caller did one just before creating the thread. */ done_repstart = TRUE; } LOCK_MUTEX(db_rep->mutex); for (;;) { ret = 0; if (db_rep->finished) goto unlock; /* * If we've become the master (which could happen after an * election in another election thread), or we find we have a * working connection to a known master, then we're quite * content: that's really the essential purpose of this whole * thread. */ if (__repmgr_master_is_known(env)) goto unlock; /* * When circumstances force us to do an immediate election, we * may be forced to create multiple threads in order to do so. * But we certainly don't need multiple threads sleeping, * alternating and retrying. The "preferred election thread" is * the one that has the authority and responsibility to * persevere until our work is done. Note that this role can * switch from one thread to another, depending on the timing of * events. In particular, when an election fails the thread * that got the failure becomes the chosen one that will remain * to avenge the failure. */ if (db_rep->preferred_elect_thr != th) goto unlock; timespecclear(&wait_til); __os_gettime(env, &now, 1); /* * See if it's time to retry the operation. Normally it's an * election we're interested in retrying. But we refrain from * calling for elections if so configured. */ suppress_election = LF_ISSET(ELECT_F_STARTUP) ? db_rep->init_policy == DB_REP_CLIENT : !FLD_ISSET(rep->config, REP_C_ELECTIONS); repstart_time = db_rep->repstart_time; target = suppress_election ? repstart_time : failtime; TIMESPEC_ADD_DB_TIMEOUT(&target, rep->election_retry_wait); if (timespeccmp(&now, &target, >=)) { /* * We've surpassed our target retry time. * However, elections should generally alternate with * rep_start calls, so do that if we haven't done one * since the last election. */ action = suppress_election ? REPSTART : (done_repstart ? ELECTION : REPSTART); } else if (db_rep->new_connection) { /* Seen a recent new connection, let's do rep_start. */ action = REPSTART; } else wait_til = target; if (!timespecisset(&wait_til)) { response_time = __repmgr_compute_response_time(env); target = repstart_time; TIMESPEC_ADD_DB_TIMEOUT(&target, response_time); if (timespeccmp(&now, &target, <)) { /* We haven't waited long enough. */ wait_til = target; } } if (timespecisset(&wait_til)) { #ifdef DB_WIN32 timespecsub(&wait_til, &now); DB_TIMESPEC_TO_TIMEOUT(t, &wait_til, TRUE); duration = t / US_PER_MS; if ((ret = SignalObjectAndWait(*db_rep->mutex, db_rep->check_election, duration, FALSE)) != WAIT_OBJECT_0 && ret != WAIT_TIMEOUT) goto out; LOCK_MUTEX(db_rep->mutex); /* * Although there could be multiple threads, only the * "preferred" thread resets the event object. If the * others tried to do so, the preferred thread might * miss the wake-up. Another way of saying this is that * the precise meaning of the check_election event is * that "there may be some election-thread-related work * to do, and the correct thread to do it has not yet * been woken up". */ if (ret == WAIT_OBJECT_0 && db_rep->preferred_elect_thr == th && !ResetEvent(db_rep->check_election)) { ret = GetLastError(); goto unlock; } #else deadline.tv_sec = wait_til.tv_sec; deadline.tv_nsec = wait_til.tv_nsec; if ((ret = pthread_cond_timedwait( &db_rep->check_election, db_rep->mutex, &deadline)) != ETIMEDOUT && ret != 0) goto unlock; #endif continue; } UNLOCK_MUTEX(db_rep->mutex); if (action == ELECTION) { db_rep->new_connection = FALSE; __repmgr_config_elect(env, 0, &nsites, &nvotes); if ((ret = __repmgr_elect(env, nsites, nvotes, &failtime)) != DB_REP_UNAVAIL) goto out; done_repstart = FALSE; LOCK_MUTEX(db_rep->mutex); db_rep->preferred_elect_thr = th; } else { DB_ASSERT(env, action == REPSTART); db_rep->new_connection = FALSE; if ((ret = __repmgr_repstart(env, DB_REP_CLIENT)) != 0) goto out; done_repstart = TRUE; LOCK_MUTEX(db_rep->mutex); __os_gettime(env, &db_rep->repstart_time, 1); } } #ifdef HAVE_STATISTICS /* * We normally don't bother taking a mutex to increment statistics. But * in this case, since we're incrementing and decrementing in pairs, it * could be very weird if we were "off somewhat". For example, we could * get a negative value. And this is not a high-traffic, performance- * critical path. * On the other hand, it suffices to take repmgr's (handle-based) * mutex, rather than the rep mutex which normally protects shared * memory, since all election thread activity must be occurring in the * single listener process, under control of one single rep handle. */ out: LOCK_MUTEX(db_rep->mutex); unlock: rep->mstat.st_elect_threads--; UNLOCK_MUTEX(db_rep->mutex); #else unlock: UNLOCK_MUTEX(db_rep->mutex); out: #endif return (ret); } static db_timeout_t __repmgr_compute_response_time(env) ENV *env; { DB_REP *db_rep; REP *rep; db_timeout_t ato, eto; db_rep = env->rep_handle; rep = db_rep->region; /* * Avoid crowding operations too close together. If we've just recently * done a rep_start, wait a moment in case there's a master out there, * to give it a chance to respond with a NEWMASTER message. This is * particularly an issue at start-up time, when we're likely to have * several "new connection establishment" events bombarding us with lots * of rep_start requests in quick successtion. * * We don't have a separate user configuration for rep_start response, * but it's reasonable to expect it to be similar to either the ack * timeout or the election timeout, whichever is smaller. However, only * consider the ack timeout if all signs point to it being in use. */ ato = rep->ack_timeout; eto = rep->elect_timeout; if (ato > 0 && rep->perm_policy != DB_REPMGR_ACKS_NONE && rep->priority > 0 && ato < eto) return (ato); return (eto); } static void __repmgr_config_elect(env, flags, nsitesp, nvotesp) ENV *env; u_int32_t flags, *nsitesp, *nvotesp; { DB_REP *db_rep; REP *rep; u_int32_t invitation, nsites, nvotes; db_rep = env->rep_handle; nsites = __repmgr_get_nsites(db_rep); /* * With only 2 sites in the group, even a single failure could make it * impossible to get a majority. So, fudge a little, unless the user * really wants strict safety. */ if (nsites == 2 && !FLD_ISSET(db_rep->region->config, REP_C_2SITE_STRICT)) nvotes = 1; else nvotes = ELECTION_MAJORITY(nsites); if (LF_ISSET(ELECT_F_INVITEE)) { /* * We're going to the election party because we were invited by * another site. Accept the other site's suggested value, if * it's reasonable. (I.e., the other site may have wanted to do * a "fast" election after losing contact with the master. If * so, let's not spoil it by imposing our own full nsites count * on it.) */ rep = db_rep->region; invitation = rep->nsites; if (invitation == nsites || invitation == nsites - 1) nsites = invitation; } if (LF_ISSET(ELECT_F_FAST) && nsites > nvotes) { /* * If we're doing an election because we noticed that the master * failed, it's reasonable to expect that the master won't * participate. By not waiting for its vote, we can probably * complete the election faster. But note that we shouldn't * allow this to affect nvotes calculation. * * However, if we have 2 sites, and strict majority is turned * on, now nvotes would be 2, and it doesn't make sense to * rep_elect to see nsites of 1 in that case. So only decrement * nsites if it currently exceeds nvotes. */ nsites--; } /* The rule for leases overrides all of the above. */ if (IS_USING_LEASES(env)) nsites = 0; *nsitesp = nsites; *nvotesp = nvotes; } static int __repmgr_elect(env, nsites, nvotes, failtimep) ENV *env; u_int32_t nsites, nvotes; db_timespec *failtimep; { DB_REP *db_rep; int ret; db_rep = env->rep_handle; switch (ret = __rep_elect_int(env, nsites, nvotes, 0)) { case DB_REP_UNAVAIL: __os_gettime(env, failtimep, 1); DB_EVENT(env, DB_EVENT_REP_ELECTION_FAILED, NULL); break; case 0: if (db_rep->takeover_pending) { db_rep->takeover_pending = FALSE; ret = __repmgr_repstart(env, DB_REP_MASTER); } break; case DB_REP_IGNORE: ret = 0; break; default: __db_err(env, ret, "unexpected election failure"); break; } return (ret); } /* * When turning on elections in an already-running system, check to see if we're * in a state where we need an election (i.e., we would have started one * previously if elections hadn't been turned off), and if so start one. * * PUBLIC: int __repmgr_turn_on_elections __P((ENV *)); */ int __repmgr_turn_on_elections(env) ENV *env; { DB_REP *db_rep; REP *rep; int ret; db_rep = env->rep_handle; rep = db_rep->region; ret = 0; DB_ASSERT(env, REP_ON(env)); LOCK_MUTEX(db_rep->mutex); if (db_rep->selector == NULL || !FLD_ISSET(rep->config, REP_C_ELECTIONS) || __repmgr_master_is_known(env)) goto out; ret = __repmgr_init_election(env, ELECT_F_IMMED); out: UNLOCK_MUTEX(db_rep->mutex); return (ret); }