/*- * See the file LICENSE for redistribution information. * * Copyright (c) 2005, 2010 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ #include "db_config.h" #include "db_int.h" #include "dbinc/mp.h" /* * The functions in this module implement a simple wire protocol for * transmitting messages, both replication messages and our own internal control * messages. The protocol is as follows: * * 1 byte - message type (defined in repmgr.h) * 4 bytes - size of control * 4 bytes - size of rec * ? bytes - control * ? bytes - rec * * where both sizes are 32-bit binary integers in network byte order. * Either control or rec can have zero length, but even in this case the * 4-byte length will be present. * Putting both lengths right up at the front allows us to read in fewer * phases, and allows us to allocate buffer space for both parts (plus a wrapper * struct) at once. */ /* * In sending a message, we first try to send it in-line, in the sending thread, * and without first copying the message, by using scatter/gather I/O, using * iovecs to point to the various pieces of the message. If that all works * without blocking, that's optimal. * If we find that, for a particular connection, we can't send without * blocking, then we must copy the message for sending later in the select() * thread. In the course of doing that, we might as well "flatten" the message, * forming one single buffer, to simplify life. Not only that, once we've gone * to the trouble of doing that, other sites to which we also want to send the * message (in the case of a broadcast), may as well take advantage of the * simplified structure also. * The sending_msg structure below holds it all. Note that this structure, * and the "flat_msg" structure, are allocated separately, because (1) the * flat_msg version is usually not needed; and (2) when a flat_msg is needed, it * will need to live longer than the wrapping sending_msg structure. * Note that, for the broadcast case, where we're going to use this * repeatedly, the iovecs is a template that must be copied, since in normal use * the iovecs pointers and lengths get adjusted after every partial write. */ struct sending_msg { REPMGR_IOVECS iovecs; u_int8_t type; u_int32_t control_size_buf, rec_size_buf; REPMGR_FLAT *fmsg; }; static int final_cleanup __P((ENV *, REPMGR_CONNECTION *, void *)); static int flatten __P((ENV *, struct sending_msg *)); static void remove_connection __P((ENV *, REPMGR_CONNECTION *)); static int __repmgr_close_connection __P((ENV *, REPMGR_CONNECTION *)); static int __repmgr_destroy_connection __P((ENV *, REPMGR_CONNECTION *)); static void setup_sending_msg __P((struct sending_msg *, u_int, const DBT *, const DBT *)); static int __repmgr_send_internal __P((ENV *, REPMGR_CONNECTION *, struct sending_msg *, int)); static int enqueue_msg __P((ENV *, REPMGR_CONNECTION *, struct sending_msg *, size_t)); static REPMGR_SITE *__repmgr_available_site __P((ENV *, int)); static REPMGR_SITE *__repmgr_find_available_peer __P((ENV *)); /* * __repmgr_send -- * The send function for DB_ENV->rep_set_transport. * * PUBLIC: int __repmgr_send __P((DB_ENV *, const DBT *, const DBT *, * PUBLIC: const DB_LSN *, int, u_int32_t)); */ int __repmgr_send(dbenv, control, rec, lsnp, eid, flags) DB_ENV *dbenv; const DBT *control, *rec; const DB_LSN *lsnp; int eid; u_int32_t flags; { DB_REP *db_rep; REP *rep; ENV *env; REPMGR_CONNECTION *conn; REPMGR_SITE *site; u_int available, nclients, needed, npeers_sent, nsites_sent, quorum; int ret, t_ret; env = dbenv->env; db_rep = env->rep_handle; rep = db_rep->region; ret = 0; COMPQUIET(quorum, 0); LOCK_MUTEX(db_rep->mutex); /* * If we're already "finished", we can't send anything. This covers the * case where a bulk buffer is flushed at env close, or perhaps an * unexpected __repmgr_thread_failure. */ if (db_rep->finished) { ret = DB_REP_UNAVAIL; goto out; } /* * Check whether we need to refresh our site address information with * more recent updates from shared memory. */ if (rep->siteinfo_seq > db_rep->siteinfo_seq && (ret = __repmgr_sync_siteaddr(env)) != 0) goto out; if (eid == DB_EID_BROADCAST) { if ((ret = __repmgr_send_broadcast(env, REPMGR_REP_MESSAGE, control, rec, &nsites_sent, &npeers_sent)) != 0) goto out; } else { DB_ASSERT(env, IS_KNOWN_REMOTE_SITE(eid)); /* * Since repmgr's simple c2c implementation doesn't truly manage * staged synchronization it doesn't work well with master * leases. So, disable it during the time when a new master may * be trying to establish its first set of lease grants. */ if (IS_USING_LEASES(env) && !rep->stat.st_startup_complete) LF_CLR(DB_REP_ANYWHERE); /* * If this is a request that can be sent anywhere, then see if * we can send it to our peer (to save load on the master), but * not if it's a rerequest, 'cuz that likely means we tried this * already and failed. */ if ((flags & (DB_REP_ANYWHERE | DB_REP_REREQUEST)) == DB_REP_ANYWHERE && (site = __repmgr_find_available_peer(env)) != NULL) { VPRINT(env, (env, DB_VERB_REPMGR_MISC, "sending request to peer")); } else if ((site = __repmgr_available_site(env, eid)) == NULL) { RPRINT(env, (env, DB_VERB_REPMGR_MISC, "ignoring message sent to unavailable site")); ret = DB_REP_UNAVAIL; goto out; } conn = site->ref.conn; /* Pass the "blockable" argument as TRUE. */ if ((ret = __repmgr_send_one(env, conn, REPMGR_REP_MESSAGE, control, rec, TRUE)) == DB_REP_UNAVAIL && (t_ret = __repmgr_bust_connection(env, conn)) != 0) ret = t_ret; if (ret != 0) goto out; nsites_sent = 1; npeers_sent = site->priority > 0 ? 1 : 0; } /* * Right now, nsites and npeers represent the (maximum) number of sites * we've attempted to begin sending the message to. Of course we * haven't really received any ack's yet. But since we've only sent to * nsites/npeers other sites, that's the maximum number of ack's we * could possibly expect. If even that number fails to satisfy our PERM * policy, there's no point waiting for something that will never * happen. */ if (LF_ISSET(DB_REP_PERMANENT)) { /* Number of sites in the group besides myself. */ nclients = __repmgr_get_nsites(db_rep) - 1; switch (rep->perm_policy) { case DB_REPMGR_ACKS_NONE: needed = 0; COMPQUIET(available, 0); break; case DB_REPMGR_ACKS_ONE: needed = 1; available = nsites_sent; break; case DB_REPMGR_ACKS_ALL: /* Number of sites in the group besides myself. */ needed = nclients; available = nsites_sent; break; case DB_REPMGR_ACKS_ONE_PEER: needed = 1; available = npeers_sent; break; case DB_REPMGR_ACKS_ALL_PEERS: /* * Too hard to figure out "needed", since we're not * keeping track of how many peers we have; so just skip * the optimization in this case. */ needed = 1; available = npeers_sent; break; case DB_REPMGR_ACKS_QUORUM: case DB_REPMGR_ACKS_ALL_AVAILABLE: /* * The minimum number of acks necessary to ensure that * the transaction is durable if an election is held. * * Unless instructed otherwise, our special handling for * 2-site groups means that a client that loses contact * with the master elects itself master (even though * that doesn't constitute a majority). In order to * provide the expected guarantee implied by the * definition of "quorum" we have to fudge the ack * calculation in this case: specifically, we need to * make sure that the client has received it in order * for us to consider it "perm". * * Note that turning the usual strict behavior back on * in a 2-site group results in "0" as the number of * clients needed to ack a txn in order for it to have * arrived at a quorum. This is the correct result, * strange as it may seem! This may well mean that in a * 2-site group the QUORUM policy is rarely the right * choice. */ if (nclients > 1 || FLD_ISSET(db_rep->region->config, REP_C_2SITE_STRICT)) needed = nclients / 2; else needed = 1; if (rep->perm_policy == DB_REPMGR_ACKS_ALL_AVAILABLE) { quorum = needed; needed = available = nsites_sent; } else { available = npeers_sent; quorum = 0; } break; default: COMPQUIET(available, 0); COMPQUIET(needed, 0); (void)__db_unknown_path(env, "__repmgr_send"); break; } if (needed == 0) goto out; if (available < needed) { ret = DB_REP_UNAVAIL; goto out; } /* In ALL_PEERS case, display of "needed" might be confusing. */ VPRINT(env, (env, DB_VERB_REPMGR_MISC, "will await acknowledgement: need %u", needed)); ret = __repmgr_await_ack(env, lsnp, needed); /* * If using ACKS_ALL_AVAILABLE and all possible sites acked, * only return success if we have a quorum minimum available * to ensure data integrity. */ if (ret == 0 && rep->perm_policy == DB_REPMGR_ACKS_ALL_AVAILABLE && available < quorum) ret = DB_REP_UNAVAIL; } out: UNLOCK_MUTEX(db_rep->mutex); if (ret != 0 && LF_ISSET(DB_REP_PERMANENT)) { STAT(db_rep->region->mstat.st_perm_failed++); DB_EVENT(env, DB_EVENT_REP_PERM_FAILED, NULL); } return (ret); } static REPMGR_SITE * __repmgr_available_site(env, eid) ENV *env; int eid; { DB_REP *db_rep; REPMGR_SITE *site; db_rep = env->rep_handle; site = SITE_FROM_EID(eid); if (IS_SITE_HANDSHAKEN(site)) return (site); return (NULL); } /* * Synchronize our list of sites with new information that has been added to the * list in the shared region. * * PUBLIC: int __repmgr_sync_siteaddr __P((ENV *)); */ int __repmgr_sync_siteaddr(env) ENV *env; { DB_REP *db_rep; REP *rep; char *host; u_int added; int ret; db_rep = env->rep_handle; rep = db_rep->region; ret = 0; MUTEX_LOCK(env, rep->mtx_repmgr); if (db_rep->my_addr.host == NULL && rep->my_addr.host != INVALID_ROFF) { host = R_ADDR(env->reginfo, rep->my_addr.host); if ((ret = __repmgr_pack_netaddr(env, host, rep->my_addr.port, NULL, &db_rep->my_addr)) != 0) goto out; } added = db_rep->site_cnt; if ((ret = __repmgr_copy_in_added_sites(env)) == 0) ret = __repmgr_init_new_sites(env, added, db_rep->site_cnt); out: MUTEX_UNLOCK(env, rep->mtx_repmgr); return (ret); } /* * Sends message to all sites with which we currently have an active * connection. Sets result parameters according to how many sites we attempted * to begin sending to, even if we did nothing more than queue it for later * delivery. * * !!! * Caller must hold env->mutex. * PUBLIC: int __repmgr_send_broadcast __P((ENV *, u_int, * PUBLIC: const DBT *, const DBT *, u_int *, u_int *)); */ int __repmgr_send_broadcast(env, type, control, rec, nsitesp, npeersp) ENV *env; u_int type; const DBT *control, *rec; u_int *nsitesp, *npeersp; { DB_REP *db_rep; struct sending_msg msg; REPMGR_CONNECTION *conn; REPMGR_SITE *site; u_int eid, nsites, npeers; int ret; static const u_int version_max_msg_type[] = { 0, REPMGR_MAX_V1_MSG_TYPE, REPMGR_MAX_V2_MSG_TYPE, REPMGR_MAX_V3_MSG_TYPE }; db_rep = env->rep_handle; /* * Sending a broadcast is quick, because we allow no blocking. So it * shouldn't much matter. But just in case, take the timestamp before * sending, so that if anything we err on the side of keeping clients * placated (i.e., possibly sending a heartbeat slightly more frequently * than necessary). */ __os_gettime(env, &db_rep->last_bcast, 1); setup_sending_msg(&msg, type, control, rec); nsites = npeers = 0; /* Send to (only the main connection with) every site. */ for (eid = 0; eid < db_rep->site_cnt; eid++) { if ((site = __repmgr_available_site(env, (int)eid)) == NULL) continue; conn = site->ref.conn; DB_ASSERT(env, IS_VALID_EID(conn->eid) && conn->version > 0 && conn->version <= DB_REPMGR_VERSION); /* * Skip if the type of message we're sending is beyond the range * of known message types for this connection's version. * * !!! * Don't be misled by the apparent generality of this simple * test. It works currently, because the only kinds of messages * that we broadcast are REP_MESSAGE and HEARTBEAT. But in the * future other kinds of messages might require more intricate * per-connection-version customization (for example, * per-version message format conversion, addition of new * fields, etc.). */ if (type > version_max_msg_type[conn->version]) continue; /* * Broadcast messages are either application threads committing * transactions, or replication status message that we can * afford to lose. So don't allow blocking for them (pass * "blockable" argument as FALSE). */ if ((ret = __repmgr_send_internal(env, conn, &msg, FALSE)) == 0) { site = SITE_FROM_EID(conn->eid); nsites++; if (site->priority > 0) npeers++; } else if (ret == DB_TIMEOUT) { /* * Couldn't send because of a full output queue. * Incrementing counters would be wrong, but it's * otherwise OK in the sense that the connection isn't * definitively known to be broken, and rep protocol * always allows us to drop a message if we have to. */ ret = 0; } else if (ret == DB_REP_UNAVAIL) { if ((ret = __repmgr_bust_connection(env, conn)) != 0) return (ret); } else return (ret); } *nsitesp = nsites; *npeersp = npeers; return (0); } /* * __repmgr_send_one -- * Send a message to a site, or if you can't just yet, make a copy of it * and arrange to have it sent later. 'rec' may be NULL, in which case we send * a zero length and no data. * * !!! * Note that the mutex should be held through this call. * It doubles as a synchronizer to make sure that two threads don't * intersperse writes that are part of two single messages. * * PUBLIC: int __repmgr_send_one __P((ENV *, REPMGR_CONNECTION *, * PUBLIC: u_int, const DBT *, const DBT *, int)); */ int __repmgr_send_one(env, conn, msg_type, control, rec, blockable) ENV *env; REPMGR_CONNECTION *conn; u_int msg_type; const DBT *control, *rec; int blockable; { struct sending_msg msg; int ret; setup_sending_msg(&msg, msg_type, control, rec); if ((ret = __repmgr_send_internal(env, conn, &msg, blockable)) == DB_TIMEOUT && !blockable) ret = 0; return (ret); } /* * Attempts a "best effort" to send a message on the given site. If there is an * excessive backlog of message already queued on the connection, what shall we * do? If the caller doesn't mind blocking, we'll wait (a limited amount of * time) for the queue to drain. Otherwise we'll simply drop the message. This * is always allowed by the replication protocol. But in the case of a * multi-message response to a request like PAGE_REQ, LOG_REQ or ALL_REQ we * almost always get a flood of messages that instantly fills our queue, so * blocking improves performance (by avoiding the need for the client to * re-request). * * How long shall we wait? We could of course create a new timeout * configuration type, so that the application could set it directly. But that * would start to overwhelm the user with too many choices to think about. We * already have an ACK timeout, which is the user's estimate of how long it * should take to send a message to the client, have it be processed, and return * a message back to us. We multiply that by the queue size, because that's how * many messages have to be swallowed up by the client before we're able to * start sending again (at least to a rough approximation). */ static int __repmgr_send_internal(env, conn, msg, blockable) ENV *env; REPMGR_CONNECTION *conn; struct sending_msg *msg; int blockable; { DB_REP *db_rep; REP *rep; REPMGR_IOVECS iovecs; SITE_STRING_BUFFER buffer; db_timeout_t drain_to; int ret; size_t nw; size_t total_written; db_rep = env->rep_handle; rep = db_rep->region; DB_ASSERT(env, conn->state != CONN_CONNECTING && conn->state != CONN_DEFUNCT); if (!STAILQ_EMPTY(&conn->outbound_queue)) { /* * Output to this site is currently owned by the select() * thread, so we can't try sending in-line here. We can only * queue the msg for later. */ VPRINT(env, (env, DB_VERB_REPMGR_MISC, "msg to %s to be queued", __repmgr_format_eid_loc(env->rep_handle, conn->eid, buffer))); if (conn->out_queue_length >= OUT_QUEUE_LIMIT && blockable && conn->state != CONN_CONGESTED) { VPRINT(env, (env, DB_VERB_REPMGR_MISC, "block msg thread, await queue space")); if ((drain_to = rep->ack_timeout) == 0) drain_to = DB_REPMGR_DEFAULT_ACK_TIMEOUT; VPRINT(env, (env, DB_VERB_REPMGR_MISC, "will await drain")); conn->blockers++; ret = __repmgr_await_drain(env, conn, drain_to * OUT_QUEUE_LIMIT); conn->blockers--; VPRINT(env, (env, DB_VERB_REPMGR_MISC, "drain returned %d (%d,%d)", ret, db_rep->finished, conn->out_queue_length)); if (db_rep->finished) return (DB_TIMEOUT); if (ret != 0) return (ret); if (STAILQ_EMPTY(&conn->outbound_queue)) goto empty; } if (conn->out_queue_length < OUT_QUEUE_LIMIT) return (enqueue_msg(env, conn, msg, 0)); else { RPRINT(env, (env, DB_VERB_REPMGR_MISC, "queue limit exceeded")); STAT(env->rep_handle-> region->mstat.st_msgs_dropped++); return (DB_TIMEOUT); } } empty: /* * Send as much data to the site as we can, without blocking. Keep * writing as long as we're making some progress. Make a scratch copy * of iovecs for our use, since we destroy it in the process of * adjusting pointers after each partial I/O. */ memcpy(&iovecs, &msg->iovecs, sizeof(iovecs)); total_written = 0; while ((ret = __repmgr_writev(conn->fd, &iovecs.vectors[iovecs.offset], iovecs.count-iovecs.offset, &nw)) == 0) { total_written += nw; if (__repmgr_update_consumed(&iovecs, nw)) /* all written */ return (0); } switch (ret) { case WOULDBLOCK: #if defined(DB_REPMGR_EAGAIN) && DB_REPMGR_EAGAIN != WOULDBLOCK case DB_REPMGR_EAGAIN: #endif break; default: #ifdef EBADF DB_ASSERT(env, ret != EBADF); #endif __db_err(env, ret, "socket writing failure"); STAT(env->rep_handle->region->mstat.st_connection_drop++); return (DB_REP_UNAVAIL); } VPRINT(env, (env, DB_VERB_REPMGR_MISC, "wrote only %lu bytes to %s", (u_long)total_written, __repmgr_format_eid_loc(env->rep_handle, conn->eid, buffer))); /* * We can't send any more without blocking: queue (a pointer to) a * "flattened" copy of the message, so that the select() thread will * finish sending it later. */ if ((ret = enqueue_msg(env, conn, msg, total_written)) != 0) return (ret); STAT(env->rep_handle->region->mstat.st_msgs_queued++); /* * Wake the main select thread so that it can discover that it has * received ownership of this connection. Note that we didn't have to * do this in the previous case (above), because the non-empty queue * implies that the select() thread is already managing ownership of * this connection. */ #ifdef DB_WIN32 if (WSAEventSelect(conn->fd, conn->event_object, FD_READ|FD_WRITE|FD_CLOSE) == SOCKET_ERROR) { ret = net_errno; __db_err(env, ret, "can't add FD_WRITE event bit"); return (ret); } #endif return (__repmgr_wake_main_thread(env)); } /* * PUBLIC: int __repmgr_is_permanent __P((ENV *, const DB_LSN *, u_int)); * * Count up how many sites have ack'ed the given LSN. Returns TRUE if enough * sites have ack'ed; FALSE otherwise. * * !!! * Caller must hold the mutex. */ int __repmgr_is_permanent(env, lsnp, needed) ENV *env; const DB_LSN *lsnp; u_int needed; { DB_REP *db_rep; REP *rep; REPMGR_SITE *site; u_int eid, nsites, npeers; int is_perm, has_missing_peer; db_rep = env->rep_handle; rep = db_rep->region; if (rep->perm_policy == DB_REPMGR_ACKS_NONE) return (TRUE); nsites = npeers = 0; has_missing_peer = FALSE; for (eid = 0; eid < db_rep->site_cnt; eid++) { site = SITE_FROM_EID(eid); if (!F_ISSET(site, SITE_HAS_PRIO)) { /* * Never connected to this site: since we can't know * whether it's a peer, assume the worst. */ has_missing_peer = TRUE; continue; } if (LOG_COMPARE(&site->max_ack, lsnp) >= 0) { nsites++; if (site->priority > 0) npeers++; } else { /* This site hasn't ack'ed the message. */ if (site->priority > 0) has_missing_peer = TRUE; } } switch (rep->perm_policy) { case DB_REPMGR_ACKS_ALL: case DB_REPMGR_ACKS_ALL_AVAILABLE: case DB_REPMGR_ACKS_ONE: is_perm = (nsites >= needed); break; case DB_REPMGR_ACKS_ONE_PEER: case DB_REPMGR_ACKS_QUORUM: is_perm = (npeers >= needed); break; case DB_REPMGR_ACKS_ALL_PEERS: if (db_rep->site_cnt < __repmgr_get_nsites(db_rep) - 1) { /* Assume missing site might be a peer. */ has_missing_peer = TRUE; } is_perm = !has_missing_peer; break; default: is_perm = FALSE; (void)__db_unknown_path(env, "__repmgr_is_permanent"); } return (is_perm); } /* * Abandons a connection, to recover from an error. Takes necessary recovery * action. Note that we don't actually close and clean up the connection here; * that happens later, in the select() thread main loop. See further * explanation at function __repmgr_disable_connection(). * * PUBLIC: int __repmgr_bust_connection __P((ENV *, REPMGR_CONNECTION *)); * * !!! * Caller holds mutex. */ int __repmgr_bust_connection(env, conn) ENV *env; REPMGR_CONNECTION *conn; { DB_REP *db_rep; REP *rep; REPMGR_SITE *site; u_int32_t flags; int connecting, ret, subordinate_conn, eid; db_rep = env->rep_handle; rep = db_rep->region; ret = 0; eid = conn->eid; connecting = (conn->state == CONN_CONNECTING); __repmgr_disable_connection(env, conn); /* * Any sort of connection, in any active state, could produce an error. * But when we're done DEFUNCT-ifying it here it should end up on the * orphans list. So, move it if it's not already there. */ if (IS_VALID_EID(eid)) { site = SITE_FROM_EID(eid); subordinate_conn = (conn != site->ref.conn); /* Note: schedule_connection_attempt wakes the main thread. */ if (!subordinate_conn && (ret = __repmgr_schedule_connection_attempt(env, (u_int)eid, FALSE)) != 0) return (ret); /* * If the failed connection was the one between us and the * master, assume that the master may have failed, and call for * an election. But only do this for the connection to the main * master process, not a subordinate one. And only do it if * we're our site's main process, not a subordinate one. And * only do it if the connection had managed to progress beyond * the "connecting" state, because otherwise it was just a * reconnection attempt that may have found the site unreachable * or the master process not running. And skip it if the * application has configured us not to do elections. */ if (!IS_SUBORDINATE(db_rep) && !subordinate_conn && !connecting && eid == rep->master_id) { /* * Even if we're not doing elections, defer the event * notification to later execution in the election * thread. We don't want to fire an event in the select * thread, and certainly not while holding the mutex. */ flags = ELECT_F_EVENT_NOTIFY; if (FLD_ISSET(db_rep->region->config, REP_C_ELECTIONS)) LF_SET(ELECT_F_IMMED | ELECT_F_FAST); else RPRINT(env, (env, DB_VERB_REPMGR_MISC, "Master failure, but no elections")); if ((ret = __repmgr_init_election(env, flags)) != 0) return (ret); } /* * If we're the master site, and we lose a main connection to a * client (whether we're the main replication process or a * subordinate process), then the client is going to have * trouble receiving live log records from us. So, set the * temporary log archive block timer, to give the client a * fighting chance to restart/recover/reconnect. (We don't care * about the client's subordinate connections to us -- i.e., * connections with a subordinate process at the client site -- * because those sites can only be reading, not applying updates * from us.) */ if (!subordinate_conn && !connecting && rep->master_id == SELF_EID) { RPRINT(env, (env, DB_VERB_REPMGR_MISC, "Repmgr: bust connection. Block archive")); MASTER_UPDATE(env, (REGENV *)env->reginfo->primary); } } else { /* * The connection was not marked with a valid EID, so we know it * must have been an incoming connection in the very early * stages. Obviously it's correct for us to avoid the * site-specific recovery steps above. But even if we have just * learned which site the connection came from, and are killing * it because it's redundant, it means we already have a * perfectly fine connection, and so -- again -- it makes sense * for us to be skipping scheduling a reconnection, and checking * for a master crash. * * One way or another, make sure the main thread is poked, so * that we do the deferred clean-up. */ ret = __repmgr_wake_main_thread(env); } return (ret); } /* * Remove a connection from the possibility of any further activity, making sure * it ends up on the main connections list, so that it will be cleaned up at the * next opportunity in the select() thread. * * Various threads write onto TCP/IP sockets, and an I/O error could occur at * any time. However, only the dedicated "select()" thread may close the socket * file descriptor, because under POSIX we have to drop our mutex and then call * select() as two distinct (non-atomic) operations. * * To simplify matters, there is a single place in the select thread where we * close and clean up after any defunct connection. Even if the I/O error * happens in the select thread we follow this convention. * * When an error occurs, we disable the connection (mark it defunct so that no * one else will try to use it, and so that the select thread will find it and * clean it up), and then usually take some additional recovery action: schedule * a connection retry for later, and possibly call for an election if it was a * connection to the master. (This happens in the function * __repmgr_bust_connection.) But sometimes we don't want to do the recovery * part; just the disabling part. * * PUBLIC: void __repmgr_disable_connection __P((ENV *, REPMGR_CONNECTION *)); */ void __repmgr_disable_connection(env, conn) ENV *env; REPMGR_CONNECTION *conn; { DB_REP *db_rep; REPMGR_SITE *site; int eid; db_rep = env->rep_handle; eid = conn->eid; if (IS_VALID_EID(eid)) { site = SITE_FROM_EID(eid); if (conn != site->ref.conn) /* It's a subordinate connection. */ TAILQ_REMOVE(&site->sub_conns, conn, entries); TAILQ_INSERT_TAIL(&db_rep->connections, conn, entries); } conn->state = CONN_DEFUNCT; conn->eid = -1; } /* * PUBLIC: int __repmgr_cleanup_connection __P((ENV *, REPMGR_CONNECTION *)); * * !!! * Idempotent. This can be called repeatedly as blocking message threads (of * which there could be multiples) wake up in case of error on the connection. */ int __repmgr_cleanup_connection(env, conn) ENV *env; REPMGR_CONNECTION *conn; { DB_REP *db_rep; int ret; db_rep = env->rep_handle; if ((ret = __repmgr_close_connection(env, conn)) != 0) goto out; /* * If there's a blocked message thread waiting, we mustn't yank the * connection struct out from under it. Instead, just wake it up. * We'll get another chance to come back through here soon. */ if (conn->blockers > 0) { ret = __repmgr_signal(&conn->drained); goto out; } DB_ASSERT(env, !IS_VALID_EID(conn->eid) && conn->state == CONN_DEFUNCT); TAILQ_REMOVE(&db_rep->connections, conn, entries); ret = __repmgr_destroy_connection(env, conn); out: return (ret); } static void remove_connection(env, conn) ENV *env; REPMGR_CONNECTION *conn; { DB_REP *db_rep; REPMGR_SITE *site; db_rep = env->rep_handle; if (IS_VALID_EID(conn->eid)) { site = SITE_FROM_EID(conn->eid); if (site->state == SITE_CONNECTED && conn == site->ref.conn) { /* Not on any list, so no need to do anything. */ } else TAILQ_REMOVE(&site->sub_conns, conn, entries); } else TAILQ_REMOVE(&db_rep->connections, conn, entries); } static int __repmgr_close_connection(env, conn) ENV *env; REPMGR_CONNECTION *conn; { int ret; DB_ASSERT(env, conn->state == CONN_DEFUNCT || env->rep_handle->finished); ret = 0; if (conn->fd != INVALID_SOCKET) { ret = closesocket(conn->fd); conn->fd = INVALID_SOCKET; if (ret == SOCKET_ERROR) { ret = net_errno; __db_err(env, ret, "closing socket"); } #ifdef DB_WIN32 if (!WSACloseEvent(conn->event_object) && ret == 0) ret = net_errno; #endif } return (ret); } static int __repmgr_destroy_connection(env, conn) ENV *env; REPMGR_CONNECTION *conn; { QUEUED_OUTPUT *out; REPMGR_FLAT *msg; DBT *dbt; int ret; /* * Deallocate any input and output buffers we may have. */ if (conn->reading_phase == DATA_PHASE) { if (conn->msg_type == REPMGR_REP_MESSAGE) __os_free(env, conn->input.rep_message); else { dbt = &conn->input.repmgr_msg.cntrl; if (dbt->size > 0) __os_free(env, dbt->data); dbt = &conn->input.repmgr_msg.rec; if (dbt->size > 0) __os_free(env, dbt->data); } } while (!STAILQ_EMPTY(&conn->outbound_queue)) { out = STAILQ_FIRST(&conn->outbound_queue); STAILQ_REMOVE_HEAD(&conn->outbound_queue, entries); msg = out->msg; if (--msg->ref_count <= 0) __os_free(env, msg); __os_free(env, out); } ret = __repmgr_free_cond(&conn->drained); __os_free(env, conn); return (ret); } static int enqueue_msg(env, conn, msg, offset) ENV *env; REPMGR_CONNECTION *conn; struct sending_msg *msg; size_t offset; { QUEUED_OUTPUT *q_element; int ret; if (msg->fmsg == NULL && ((ret = flatten(env, msg)) != 0)) return (ret); if ((ret = __os_malloc(env, sizeof(QUEUED_OUTPUT), &q_element)) != 0) return (ret); q_element->msg = msg->fmsg; msg->fmsg->ref_count++; /* encapsulation would be sweeter */ q_element->offset = offset; /* Put it on the connection's outbound queue. */ STAILQ_INSERT_TAIL(&conn->outbound_queue, q_element, entries); conn->out_queue_length++; return (0); } /* * Either "control" or "rec" (or both) may be NULL, in which case we treat it * like a zero-length DBT. */ static void setup_sending_msg(msg, type, control, rec) struct sending_msg *msg; u_int type; const DBT *control, *rec; { u_int32_t control_size, rec_size; /* * The wire protocol is documented in a comment at the top of this * module. */ __repmgr_iovec_init(&msg->iovecs); msg->type = type; __repmgr_add_buffer(&msg->iovecs, &msg->type, sizeof(msg->type)); control_size = control == NULL ? 0 : control->size; msg->control_size_buf = htonl(control_size); __repmgr_add_buffer(&msg->iovecs, &msg->control_size_buf, sizeof(msg->control_size_buf)); rec_size = rec == NULL ? 0 : rec->size; msg->rec_size_buf = htonl(rec_size); __repmgr_add_buffer( &msg->iovecs, &msg->rec_size_buf, sizeof(msg->rec_size_buf)); if (control->size > 0) __repmgr_add_dbt(&msg->iovecs, control); if (rec_size > 0) __repmgr_add_dbt(&msg->iovecs, rec); msg->fmsg = NULL; } /* * Convert a message stored as iovec pointers to various pieces, into flattened * form, by copying all the pieces, and then make the iovec just point to the * new simplified form. */ static int flatten(env, msg) ENV *env; struct sending_msg *msg; { u_int8_t *p; size_t msg_size; int i, ret; DB_ASSERT(env, msg->fmsg == NULL); msg_size = msg->iovecs.total_bytes; if ((ret = __os_malloc(env, sizeof(*msg->fmsg) + msg_size, &msg->fmsg)) != 0) return (ret); msg->fmsg->length = msg_size; msg->fmsg->ref_count = 0; p = &msg->fmsg->data[0]; for (i = 0; i < msg->iovecs.count; i++) { memcpy(p, msg->iovecs.vectors[i].iov_base, msg->iovecs.vectors[i].iov_len); p = &p[msg->iovecs.vectors[i].iov_len]; } __repmgr_iovec_init(&msg->iovecs); __repmgr_add_buffer(&msg->iovecs, &msg->fmsg->data[0], msg_size); return (0); } /* * PUBLIC: REPMGR_SITE *__repmgr_find_site __P((ENV *, const char *, u_int)); */ REPMGR_SITE * __repmgr_find_site(env, host, port) ENV *env; const char *host; u_int port; { DB_REP *db_rep; REPMGR_SITE *site; u_int i; db_rep = env->rep_handle; for (i = 0; i < db_rep->site_cnt; i++) { site = &db_rep->sites[i]; if (strcmp(site->net_addr.host, host) == 0 && site->net_addr.port == port) return (site); } return (NULL); } /* * Scan the list of remote sites, returning the first one that is a peer, * is not the current master, and is available. */ static REPMGR_SITE * __repmgr_find_available_peer(env) ENV *env; { DB_REP *db_rep; REP *rep; REPMGR_SITE *site; u_int i; db_rep = env->rep_handle; rep = db_rep->region; for (i = 0; i < db_rep->site_cnt; i++) { site = &db_rep->sites[i]; if (F_ISSET(site, SITE_IS_PEER) && EID_FROM_SITE(site) != rep->master_id && IS_SITE_AVAILABLE(site)) return (site); } return (NULL); } /* * Initialize the fields of a (given) netaddr structure, with the given values. * We copy the host name, but take ownership of the ADDRINFO buffer. * * All inputs are assumed to have been already validated. * * PUBLIC: int __repmgr_pack_netaddr __P((ENV *, const char *, * PUBLIC: u_int, ADDRINFO *, repmgr_netaddr_t *)); */ int __repmgr_pack_netaddr(env, host, port, list, addr) ENV *env; const char *host; u_int port; ADDRINFO *list; repmgr_netaddr_t *addr; { int ret; DB_ASSERT(env, host != NULL); if ((ret = __os_strdup(env, host, &addr->host)) != 0) return (ret); addr->port = (u_int16_t)port; addr->address_list = list; addr->current = NULL; return (0); } /* * PUBLIC: int __repmgr_getaddr __P((ENV *, * PUBLIC: const char *, u_int, int, ADDRINFO **)); */ int __repmgr_getaddr(env, host, port, flags, result) ENV *env; const char *host; u_int port; int flags; /* Matches struct addrinfo declaration. */ ADDRINFO **result; { ADDRINFO *answer, hints; char buffer[10]; /* 2**16 fits in 5 digits. */ /* * Ports are really 16-bit unsigned values, but it's too painful to * push that type through the API. */ if (port > UINT16_MAX) { __db_errx(env, "port %u larger than max port %u", port, UINT16_MAX); return (EINVAL); } memset(&hints, 0, sizeof(hints)); hints.ai_family = AF_UNSPEC; hints.ai_socktype = SOCK_STREAM; hints.ai_flags = flags; (void)snprintf(buffer, sizeof(buffer), "%u", port); /* * Although it's generally bad to discard error information, the return * code from __os_getaddrinfo is undependable. Our callers at least * would like to be able to distinguish errors in getaddrinfo (which we * want to consider to be re-tryable), from other failure (e.g., EINVAL, * above). */ if (__os_getaddrinfo(env, host, port, buffer, &hints, &answer) != 0) return (DB_REP_UNAVAIL); *result = answer; return (0); } /* * Adds a new site to our array of known sites (unless it already exists), * and schedules it for immediate connection attempt. Whether it exists or not, * we set newsitep, either to the already existing site, or to the newly created * site. Unless newsitep is passed in as NULL, which is allowed. * * PUBLIC: int __repmgr_add_site __P((ENV *, * PUBLIC: const char *, u_int, REPMGR_SITE **, u_int32_t, int)); * * !!! * Caller is expected to hold db_rep->mutex on entry. */ int __repmgr_add_site(env, host, port, sitep, flags, from_user) ENV *env; const char *host; u_int port; REPMGR_SITE **sitep; u_int32_t flags; int from_user; { int peer, state; state = SITE_IDLE; peer = LF_ISSET(DB_REPMGR_PEER); return (__repmgr_add_site_int(env, host, port, sitep, peer, state, from_user)); } /* * PUBLIC: int __repmgr_add_site_int __P((ENV *, * PUBLIC: const char *, u_int, REPMGR_SITE **, int, int, int)); */ int __repmgr_add_site_int(env, host, port, sitep, peer, state, from_user) ENV *env; const char *host; u_int port; REPMGR_SITE **sitep; int peer, state, from_user; { DB_REP *db_rep; REP *rep; DB_THREAD_INFO *ip; REPMGR_SITE *site; u_int base; int eid, locked, pre_exist, ret, t_ret, touched; db_rep = env->rep_handle; rep = db_rep->region; COMPQUIET(site, NULL); COMPQUIET(pre_exist, 0); eid = DB_EID_INVALID; touched = FALSE; /* Make sure we're up to date before adding to our local list. */ ENV_ENTER(env, ip); MUTEX_LOCK(env, rep->mtx_repmgr); locked = TRUE; base = db_rep->site_cnt; if ((ret = __repmgr_copy_in_added_sites(env)) != 0) goto out; /* Once we're this far, we're committed to doing init_new_sites. */ /* If it's still not found, now it's safe to add it. */ if ((site = __repmgr_find_site(env, host, port)) == NULL) { pre_exist = FALSE; /* * Store both locally and in shared region. */ if ((ret = __repmgr_new_site(env, &site, host, port, state, peer)) != 0) goto init; eid = EID_FROM_SITE(site); DB_ASSERT(env, (u_int)eid == db_rep->site_cnt - 1); if ((ret = __repmgr_share_netaddrs(env, rep, (u_int)eid, db_rep->site_cnt)) != 0) { /* * Rescind the added local slot. */ db_rep->site_cnt--; __repmgr_cleanup_netaddr(env, &site->net_addr); } } else if (from_user) { /* * A user-supplied existing site might have peer information * to share, but an existing site from internal repmgr paths * will not. */ pre_exist = TRUE; eid = EID_FROM_SITE(site); if (peer && !F_ISSET(site, SITE_IS_PEER)) { F_SET(site, SITE_IS_PEER); touched = TRUE; } else if (!peer && F_ISSET(site, SITE_IS_PEER)) { F_CLR(site, SITE_IS_PEER); touched = TRUE; } if (touched) /* * Only share peer information, not the site itself. * Pass the same value for start and limit to avoid * creating any new sites but still share peer values. */ ret = __repmgr_share_netaddrs(env, rep, (u_int)eid, (u_int)eid); } /* * Bump sequence count if this is a new site or if we updated any * peer value. */ if (!pre_exist || touched) db_rep->siteinfo_seq = ++rep->siteinfo_seq; init: MUTEX_UNLOCK(env, rep->mtx_repmgr); locked = FALSE; /* * Initialize all new sites (including the ones we snarfed via * copy_in_added_sites), even if it doesn't include a pre_existing one. * But if the new one is already connected, it doesn't need this * initialization, so skip over that one (which we accomplish by making * two calls with sub-ranges). */ if (state != SITE_CONNECTED || eid == DB_EID_INVALID) t_ret = __repmgr_init_new_sites(env, base, db_rep->site_cnt); else if ((t_ret = __repmgr_init_new_sites(env, base, (u_int)eid)) == 0) t_ret = __repmgr_init_new_sites(env, (u_int)(eid+1), db_rep->site_cnt); if (t_ret != 0 && ret == 0) ret = t_ret; out: if (locked) MUTEX_UNLOCK(env, rep->mtx_repmgr); ENV_LEAVE(env, ip); if (ret == 0) { if (sitep != NULL) *sitep = site; if (pre_exist) ret = EEXIST; } return (ret); } /* * Initialize a socket for listening. Sets a file descriptor for the socket, * ready for an accept() call in a thread that we're happy to let block. * * PUBLIC: int __repmgr_listen __P((ENV *)); */ int __repmgr_listen(env) ENV *env; { ADDRINFO *ai; DB_REP *db_rep; char *why; int sockopt, ret; socket_t s; db_rep = env->rep_handle; /* Use OOB value as sentinel to show no socket open. */ s = INVALID_SOCKET; if ((ai = ADDR_LIST_FIRST(&db_rep->my_addr)) == NULL) { if ((ret = __repmgr_getaddr(env, db_rep->my_addr.host, db_rep->my_addr.port, AI_PASSIVE, &ai)) == 0) ADDR_LIST_INIT(&db_rep->my_addr, ai); else return (ret); } /* * Given the assert is correct, we execute the loop at least once, which * means 'why' will have been set by the time it's needed. But of * course lint doesn't know about DB_ASSERT. */ COMPQUIET(why, ""); DB_ASSERT(env, ai != NULL); for (; ai != NULL; ai = ADDR_LIST_NEXT(&db_rep->my_addr)) { if ((s = socket(ai->ai_family, ai->ai_socktype, ai->ai_protocol)) == INVALID_SOCKET) { why = "can't create listen socket"; continue; } /* * When testing, it's common to kill and restart regularly. On * some systems, this causes bind to fail with "address in use" * errors unless this option is set. */ sockopt = 1; if (setsockopt(s, SOL_SOCKET, SO_REUSEADDR, (sockopt_t)&sockopt, sizeof(sockopt)) != 0) { why = "can't set REUSEADDR socket option"; break; } if (bind(s, ai->ai_addr, (socklen_t)ai->ai_addrlen) != 0) { why = "can't bind socket to listening address"; (void)closesocket(s); s = INVALID_SOCKET; continue; } if (listen(s, 5) != 0) { why = "listen()"; break; } if ((ret = __repmgr_set_nonblocking(s)) != 0) { __db_err(env, ret, "can't unblock listen socket"); goto clean; } db_rep->listen_fd = s; return (0); } ret = net_errno; __db_err(env, ret, "%s", why); clean: if (s != INVALID_SOCKET) (void)closesocket(s); return (ret); } /* * PUBLIC: int __repmgr_net_close __P((ENV *)); */ int __repmgr_net_close(env) ENV *env; { DB_REP *db_rep; REP *rep; int ret; db_rep = env->rep_handle; rep = db_rep->region; ret = __repmgr_each_connection(env, final_cleanup, NULL, FALSE); if (db_rep->listen_fd != INVALID_SOCKET) { if (closesocket(db_rep->listen_fd) == SOCKET_ERROR && ret == 0) ret = net_errno; db_rep->listen_fd = INVALID_SOCKET; rep->listener = 0; } return (ret); } static int final_cleanup(env, conn, unused) ENV *env; REPMGR_CONNECTION *conn; void *unused; { int ret, t_ret; COMPQUIET(unused, NULL); remove_connection(env, conn); ret = __repmgr_close_connection(env, conn); if ((t_ret = __repmgr_destroy_connection(env, conn)) != 0 && ret == 0) ret = t_ret; return (ret); } /* * PUBLIC: void __repmgr_net_destroy __P((ENV *, DB_REP *)); */ void __repmgr_net_destroy(env, db_rep) ENV *env; DB_REP *db_rep; { REPMGR_RETRY *retry; REPMGR_SITE *site; u_int i; __repmgr_cleanup_netaddr(env, &db_rep->my_addr); if (db_rep->sites == NULL) return; while (!TAILQ_EMPTY(&db_rep->retries)) { retry = TAILQ_FIRST(&db_rep->retries); TAILQ_REMOVE(&db_rep->retries, retry, entries); __os_free(env, retry); } DB_ASSERT(env, TAILQ_EMPTY(&db_rep->connections)); for (i = 0; i < db_rep->site_cnt; i++) { site = &db_rep->sites[i]; DB_ASSERT(env, TAILQ_EMPTY(&site->sub_conns)); __repmgr_cleanup_netaddr(env, &site->net_addr); } __os_free(env, db_rep->sites); db_rep->sites = NULL; }