/*-
 * See the file LICENSE for redistribution information.
 *
 * Copyright (c) 1996, 2010 Oracle and/or its affiliates.  All rights reserved.
 *
 * $Id$
 */

#include "db_config.h"

#include "db_int.h"
#include "dbinc/db_page.h"
#include "dbinc/fop.h"
#include "dbinc/btree.h"
#include "dbinc/hash.h"
#include "dbinc/mp.h"
#include "dbinc/qam.h"
#include "dbinc/txn.h"

#ifndef lint
static const char copyright[] =
    "Copyright (c) 1996, 2010 Oracle and/or its affiliates. All rights reserved.\n";
#endif

static int	__db_log_corrupt __P((ENV *, DB_LSN *));
static int	__env_init_rec_42 __P((ENV *));
static int	__env_init_rec_43 __P((ENV *));
static int	__env_init_rec_46 __P((ENV *));
static int	__env_init_rec_47 __P((ENV *));
static int	__env_init_rec_48 __P((ENV *));
static int	__log_earliest __P((ENV *, DB_LOGC *, int32_t *, DB_LSN *));

static double	__lsn_diff __P((DB_LSN *, DB_LSN *, DB_LSN *, u_int32_t, int));
static int	__log_backup __P((ENV *, DB_LOGC *, DB_LSN *, DB_LSN*));

/*
 * __db_apprec --
 *	Perform recovery.  If max_lsn is non-NULL, then we are trying
 * to synchronize this system up with another system that has a max
 * LSN of max_lsn, so we need to roll back sufficiently far for that
 * to work.  See __log_backup for details.
 *
 * Parameters:
 *	env	 - environment being recovered.
 *	ip	 - thread information, handed to __db_txnlist_init.
 *	max_lsn	 - NULL for ordinary or timestamp recovery; otherwise the
 *		   LSN to roll back to.
 *	trunclsn - handed unchanged to __log_vtruncate.
 *	update	 - if non-zero and replication is on (REP_ON), the
 *		   environment's rep_timestamp is refreshed.
 *	flags	 - DB_RECOVER_FATAL and DB_NO_CHECKPOINT are examined here.
 *
 * Returns 0 on success (an empty log counts as success), otherwise an
 * error value.  Every exit path goes through the err label, which closes
 * any open log cursor, releases the transaction list, resets
 * dbenv->tx_timestamp to 0 and clears the in-recovery flags.
 *
 * PUBLIC: int __db_apprec __P((ENV *,
 * PUBLIC:     DB_THREAD_INFO *, DB_LSN *, DB_LSN *, int, u_int32_t));
 */
int
__db_apprec(env, ip, max_lsn, trunclsn, update, flags)
	ENV *env;
	DB_THREAD_INFO *ip;
	DB_LSN *max_lsn, *trunclsn;
	int update;
	u_int32_t flags;
{
	DBT data;
	DB_ENV *dbenv;
	DB_LOGC *logc;
	DB_LSN ckp_lsn, first_lsn, last_lsn, lowlsn, lsn, stop_lsn, tlsn;
	DB_LSN *vtrunc_ckp, *vtrunc_lsn;
	DB_TXNHEAD *txninfo;
	DB_TXNREGION *region;
	REGENV *renv;
	REGINFO *infop;
	__txn_ckp_args *ckp_args;
	time_t now, tlow;
	double nfiles;
	u_int32_t hi_txn, log_size, txnid;
	int32_t low;
	int all_recovered, have_rec, progress, ret, t_ret;
	char *p, *pass;
	char t1[CTIME_BUFLEN], t2[CTIME_BUFLEN], time_buf[CTIME_BUFLEN];

	COMPQUIET(nfiles, (double)0.001);

	dbenv = env->dbenv;
	logc = NULL;
	ckp_args = NULL;
	hi_txn = TXN_MAXIMUM;
	txninfo = NULL;
	pass = "initial";
	ZERO_LSN(lsn);

	/*
	 * XXX
	 * Get the log size.  No locking required because we're single-threaded
	 * during recovery.
	 */
	log_size = ((LOG *)env->lg_handle->reginfo.primary)->log_size;

	/*
	 * If we need to, update the env handle timestamp.
	 */
	if (update && REP_ON(env)) {
		infop = env->reginfo;
		renv = infop->primary;
		(void)time(&renv->rep_timestamp);
	}

	/* Set in-recovery flags. */
	F_SET(env->lg_handle, DBLOG_RECOVER);
	region = env->tx_handle->reginfo.primary;
	F_SET(region, TXN_IN_RECOVERY);

	/* Allocate a cursor for the log. */
	if ((ret = __log_cursor(env, &logc)) != 0)
		goto err;

	/*
	 * If the user is specifying recovery to a particular point in time
	 * or to a particular LSN, find the point to start recovery from.
	 */
	ZERO_LSN(lowlsn);
	if (max_lsn != NULL) {
		if ((ret = __log_backup(env, logc, max_lsn, &lowlsn)) != 0)
			goto err;
	} else if (dbenv->tx_timestamp != 0) {
		if ((ret = __log_earliest(env, logc, &low, &lowlsn)) != 0)
			goto err;
		if ((int32_t)dbenv->tx_timestamp < low) {
			/*
			 * Both times are formatted through the shared
			 * time_buf, so each is copied out (and its trailing
			 * newline stripped) before the next is formatted.
			 */
			t1[sizeof(t1) - 1] = '\0';
			(void)strncpy(t1, __os_ctime(
			    &dbenv->tx_timestamp, time_buf), sizeof(t1) - 1);
			if ((p = strchr(t1, '\n')) != NULL)
				*p = '\0';

			t2[sizeof(t2) - 1] = '\0';
			tlow = (time_t)low;
			(void)strncpy(t2, __os_ctime(
			    &tlow, time_buf), sizeof(t2) - 1);
			if ((p = strchr(t2, '\n')) != NULL)
				*p = '\0';

			__db_errx(env,
		    "Invalid recovery timestamp %s; earliest time is %s",
			    t1, t2);
			ret = EINVAL;
			goto err;
		}
	}

	/*
	 * Recovery is done in three passes, preceded by a positioning step
	 * (pass #0):
	 * Pass #0:
	 *	We need to find the position from which we will open files.
	 *	We need to open files beginning with the earlier of the
	 *	most recent checkpoint LSN and a checkpoint LSN before the
	 *	recovery timestamp, if specified.  We need to be before the
	 *	most recent checkpoint LSN because we are going to collect
	 *	information about which transactions were begun before we
	 *	start rolling forward.  Those that were should never be undone
	 *	because queue cannot use LSNs to determine what operations can
	 *	safely be aborted and it cannot rollback operations in
	 *	transactions for which there may be records not processed
	 *	during recovery.  We need to consider earlier points in time
	 *	in case we are recovering to a particular timestamp.
	 *
	 * Pass #1:
	 *	Read forward through the log from the position found in pass 0
	 *	opening and closing files, and recording transactions for which
	 *	we've seen their first record (the transaction's prev_lsn is
	 *	0,0).  At the end of this pass, we know all transactions for
	 *	which we've seen begins and we have the "current" set of files
	 *	open.
	 *
	 * Pass #2:
	 *	Read backward through the log undoing any uncompleted TXNs.
	 *	There are four cases:
	 *	    1.  If doing catastrophic recovery, we read to the
	 *		beginning of the log
	 *	    2.  If we are doing normal recovery, then we have to roll
	 *		back to the most recent checkpoint LSN.
	 *	    3.  If we are recovering to a point in time, then we have
	 *		to roll back to the checkpoint whose ckp_lsn is earlier
	 *		than the specified time.  __log_earliest will figure
	 *		this out for us.
	 *	    4.	If we are recovering back to a particular LSN, then
	 *		we have to roll back to the checkpoint whose ckp_lsn
	 *		is earlier than the max_lsn.  __log_backup will figure
	 *		that out for us.
	 *	In case 3, "uncompleted TXNs" include all those who committed
	 *	after the user's specified timestamp.
	 *
	 * Pass #3:
	 *	Read forward through the log from the LSN found in pass #2,
	 *	redoing any committed TXNs (which committed after any user-
	 *	specified rollback point).  During this pass, checkpoint
	 *	file information is ignored, and file openings and closings
	 *	are redone.
	 *
	 * ckp_lsn   -- lsn of the last checkpoint or the first in the log.
	 * first_lsn -- the lsn where the forward passes begin.
	 * last_lsn  -- the last lsn in the log, used for feedback
	 * lowlsn    -- the lsn we are rolling back to, if we are recovering
	 *		to a point in time.
	 * lsn       -- temporary use lsn.
	 * stop_lsn  -- the point at which forward roll should stop
	 */

	/*
	 * Find out the last lsn, so that we can estimate how far along we
	 * are in recovery.  This will help us determine how much log there
	 * is between the first LSN that we're going to be working with and
	 * the last one.  We assume that each of the three phases takes the
	 * same amount of time (a false assumption) and then use the %-age
	 * of the amount of log traversed to figure out how much of the
	 * pass we've accomplished.
	 *
	 * If we can't find any log records, we're kind of done.
	 */
#ifdef UMRW
	ZERO_LSN(last_lsn);
#endif
	memset(&data, 0, sizeof(data));
	if ((ret = __logc_get(logc, &last_lsn, &data, DB_LAST)) != 0) {
		if (ret == DB_NOTFOUND)
			ret = 0;
		else
			__db_errx(env, "Last log record not found");
		goto err;
	}

	/* Walk backward from the end until a record carries a txnid. */
	do {
		/* txnid is after rectype, which is a u_int32. */
		LOGCOPY_32(env, &txnid,
		    (u_int8_t *)data.data + sizeof(u_int32_t));

		if (txnid != 0)
			break;
	} while ((ret = __logc_get(logc, &lsn, &data, DB_PREV)) == 0);

	/*
	 * There are no transactions, so there is nothing to do unless
	 * we're recovering to an LSN.  If we are, we need to proceed since
	 * we'll still need to do a vtruncate based on information we haven't
	 * yet collected.
	 */
	if (ret == DB_NOTFOUND)
		ret = 0;
	else if (ret != 0)
		goto err;

	hi_txn = txnid;

	/*
	 * Pass #0
	 * Find the LSN from which we begin OPENFILES.
	 *
	 * If this is a catastrophic recovery, or if no checkpoint exists
	 * in the log, the LSN is the first LSN in the log.
	 *
	 * Otherwise, it is the minimum of (1) the LSN in the last checkpoint
	 * and (2) the LSN in the checkpoint before any specified recovery
	 * timestamp or max_lsn.
	 */
	/*
	 * Get the first LSN in the log; it's an initial default
	 * even if this is not a catastrophic recovery.
	 */
	if ((ret = __logc_get(logc, &ckp_lsn, &data, DB_FIRST)) != 0) {
		if (ret == DB_NOTFOUND)
			ret = 0;
		else
			__db_errx(env, "First log record not found");
		goto err;
	}
	first_lsn = ckp_lsn;
	have_rec = 1;

	if (!LF_ISSET(DB_RECOVER_FATAL)) {
		if ((ret = __txn_getckp(env, &ckp_lsn)) == 0 &&
		    (ret = __logc_get(logc, &ckp_lsn, &data, DB_SET)) == 0) {
			/* We have a recent checkpoint.  This is LSN (1). */
			if ((ret = __txn_ckp_read(env,
			    data.data, &ckp_args)) != 0) {
				__db_errx(env,
			    "Invalid checkpoint record at [%lu][%lu]",
				    (u_long)ckp_lsn.file,
				    (u_long)ckp_lsn.offset);
				goto err;
			}
			first_lsn = ckp_args->ckp_lsn;
			__os_free(env, ckp_args);
			have_rec = 0;
		}

		/*
		 * If LSN (2) exists, use it if it's before LSN (1).
		 * (If LSN (1) doesn't exist, first_lsn is the
		 * beginning of the log, so will "win" this check.)
		 *
		 * XXX
		 * In the recovery-to-a-timestamp case, lowlsn is chosen by
		 * __log_earliest, and is the checkpoint LSN of the
		 * *earliest* checkpoint in the unreclaimed log.  I
		 * (krinsky) believe that we could optimize this by looking
		 * instead for the LSN of the *latest* checkpoint before
		 * the timestamp of interest, but I'm not sure that this
		 * is worth doing right now.  (We have to look for lowlsn
		 * and low anyway, to make sure the requested timestamp is
		 * somewhere in the logs we have, and all that's required
		 * is that we pick *some* checkpoint after the beginning of
		 * the logs and before the timestamp.)
		 */
		if ((dbenv->tx_timestamp != 0 || max_lsn != NULL) &&
		    LOG_COMPARE(&lowlsn, &first_lsn) < 0) {
			DB_ASSERT(env, have_rec == 0);
			first_lsn = lowlsn;
		}
	}

	/* Get the record at first_lsn if we don't have it already. */
	if (!have_rec &&
	    (ret = __logc_get(logc, &first_lsn, &data, DB_SET)) != 0) {
		__db_errx(env, "Checkpoint LSN record [%lu][%lu] not found",
		    (u_long)first_lsn.file, (u_long)first_lsn.offset);
		goto err;
	}

	/* nfiles (log files to traverse) is only needed for feedback. */
	if (dbenv->db_feedback != NULL) {
		if (last_lsn.file == first_lsn.file)
			nfiles = (double)
			    (last_lsn.offset - first_lsn.offset) / log_size;
		else
			nfiles = (double)(last_lsn.file - first_lsn.file) +
			    (double)((log_size - first_lsn.offset) +
			    last_lsn.offset) / log_size;
		/* We are going to divide by nfiles; make sure it isn't 0. */
		if (nfiles < 0.001)
			nfiles = 0.001;
	}

	/* Find a low txnid. */
	ret = 0;
	if (hi_txn != 0)
		do {
			/* txnid is after rectype, which is a u_int32. */
			LOGCOPY_32(env, &txnid,
			    (u_int8_t *)data.data + sizeof(u_int32_t));

			if (txnid != 0)
				break;
		} while ((ret = __logc_get(logc, &lsn, &data, DB_NEXT)) == 0);

	/*
	 * There are no transactions and we're not recovering to an LSN (see
	 * above), so there is nothing to do.
	 */
	if (ret == DB_NOTFOUND) {
		if (LOG_COMPARE(&lsn, &last_lsn) != 0)
			ret = __db_log_corrupt(env, &lsn);
		else
			ret = 0;
	}

	/* Reset to the first lsn. */
	if (ret != 0 ||
	    (ret = __logc_get(logc, &first_lsn, &data, DB_SET)) != 0)
		goto err;

	/* Initialize the transaction list. */
	if ((ret = __db_txnlist_init(env, ip,
	    txnid, hi_txn, max_lsn, &txninfo)) != 0)
		goto err;

	/*
	 * Pass #1
	 * Run forward through the log starting at the first relevant lsn.
	 */
	if ((ret = __env_openfiles(env, logc,
	    txninfo, &data, &first_lsn, &last_lsn, nfiles, 1)) != 0)
		goto err;

	/* If there were no transactions, then we can bail out early. */
	if (hi_txn == 0 && max_lsn == NULL) {
		lsn = last_lsn;
		goto done;
	}

	/*
	 * Pass #2.
	 *
	 * We used first_lsn to tell us how far back we need to recover,
	 * use it here.
	 */
	if (FLD_ISSET(dbenv->verbose, DB_VERB_RECOVERY))
		__db_msg(env, "Recovery starting from [%lu][%lu]",
		    (u_long)first_lsn.file, (u_long)first_lsn.offset);

	pass = "backward";
	for (ret = __logc_get(logc, &lsn, &data, DB_LAST);
	    ret == 0 && LOG_COMPARE(&lsn, &first_lsn) >= 0;
	    ret = __logc_get(logc, &lsn, &data, DB_PREV)) {
		if (dbenv->db_feedback != NULL) {
			progress = 34 + (int)(33 * (__lsn_diff(&first_lsn,
			    &last_lsn, &lsn, log_size, 0) / nfiles));
			dbenv->db_feedback(dbenv, DB_RECOVER, progress);
		}

		/* Dispatch on a copy so the loop's lsn is left untouched. */
		tlsn = lsn;
		ret = __db_dispatch(env, &env->recover_dtab,
		    &data, &tlsn, DB_TXN_BACKWARD_ROLL, txninfo);
		if (ret != 0) {
			if (ret != DB_TXN_CKP)
				goto msgerr;
			else
				ret = 0;
		}
	}
	if (ret == DB_NOTFOUND) {
		if (LOG_COMPARE(&lsn, &first_lsn) > 0)
			ret = __db_log_corrupt(env, &lsn);
		else
			ret = 0;
	}
	if (ret != 0)
		goto err;

	/*
	 * Pass #3.  If we are recovering to a timestamp or to an LSN,
	 * we need to make sure that we don't roll-forward beyond that
	 * point because there may be non-transactional operations (e.g.,
	 * closes that would fail).  The last_lsn variable is used for
	 * feedback calculations, but use it to set an initial stopping
	 * point for the forward pass, and then reset appropriately to
	 * derive a real stop_lsn that tells how far the forward pass
	 * should go.
	 */
	pass = "forward";
	stop_lsn = last_lsn;
	if (max_lsn != NULL || dbenv->tx_timestamp != 0)
		stop_lsn = ((DB_TXNHEAD *)txninfo)->maxlsn;

	for (ret = __logc_get(logc, &lsn, &data, DB_NEXT);
	    ret == 0; ret = __logc_get(logc, &lsn, &data, DB_NEXT)) {
		if (dbenv->db_feedback != NULL) {
			progress = 67 + (int)(33 * (__lsn_diff(&first_lsn,
			    &last_lsn, &lsn, log_size, 1) / nfiles));
			dbenv->db_feedback(dbenv, DB_RECOVER, progress);
		}

		tlsn = lsn;
		ret = __db_dispatch(env, &env->recover_dtab,
		    &data, &tlsn, DB_TXN_FORWARD_ROLL, txninfo);
		if (ret != 0) {
			if (ret != DB_TXN_CKP)
				goto msgerr;
			else
				ret = 0;
		}
		/*
		 * If we are recovering to a timestamp or an LSN,
		 * we need to make sure that we don't try to roll
		 * forward beyond the soon-to-be end of log.
		 */
		if (LOG_COMPARE(&lsn, &stop_lsn) >= 0)
			break;
	}
	if (ret == DB_NOTFOUND)
		ret = __db_log_corrupt(env, &lsn);
	if (ret != 0)
		goto err;

	if (max_lsn == NULL)
		region->last_txnid = ((DB_TXNHEAD *)txninfo)->maxid;

done:
	/*
	 * We are going to truncate, so we'd best close the cursor.  The
	 * handle is cleared before the result is checked so that the cleanup
	 * code at err never closes the same cursor a second time.
	 */
	if (logc != NULL) {
		ret = __logc_close(logc);
		logc = NULL;
		if (ret != 0)
			goto err;
	}
	/*
	 * Also flush the cache before truncating the log.  It's recovery,
	 * ignore any application max-write configuration.
	 */
	if ((ret = __memp_sync_int(env, NULL, 0,
	    DB_SYNC_CACHE | DB_SYNC_SUPPRESS_WRITE, NULL, NULL)) != 0)
		goto err;
	if (dbenv->tx_timestamp != 0) {
		/* Run recovery up to this timestamp. */
		region->last_ckp = ((DB_TXNHEAD *)txninfo)->ckplsn;
		vtrunc_lsn = &((DB_TXNHEAD *)txninfo)->maxlsn;
		vtrunc_ckp = &((DB_TXNHEAD *)txninfo)->ckplsn;
	} else if (max_lsn != NULL) {
		/* This is a HA client syncing to the master. */
		if (!IS_ZERO_LSN(((DB_TXNHEAD *)txninfo)->ckplsn))
			region->last_ckp = ((DB_TXNHEAD *)txninfo)->ckplsn;
		else if ((ret =
		    __txn_findlastckp(env, &region->last_ckp, max_lsn)) != 0)
			goto err;
		vtrunc_lsn = max_lsn;
		vtrunc_ckp = &((DB_TXNHEAD *)txninfo)->ckplsn;
	} else {
		/*
		 * The usual case: we recovered the whole (valid) log; clear
		 * out any partial record after the recovery point.
		 */
		vtrunc_lsn = &lsn;
		vtrunc_ckp = &region->last_ckp;
	}
	if ((ret = __log_vtruncate(env, vtrunc_lsn, vtrunc_ckp, trunclsn)) != 0)
		goto err;

	/*
	 * Usually we close all files at the end of recovery, unless there are
	 * prepared transactions or errors in the checkpoint.
	 */
	all_recovered = region->stat.st_nrestores == 0;
	/*
	 * Log a checkpoint here so subsequent recoveries can skip what's been
	 * done; this is unnecessary for HA rep clients, as they do not write
	 * log records.
	 */
	if (max_lsn == NULL && !LF_ISSET(DB_NO_CHECKPOINT) &&
	    (ret = __txn_checkpoint(env,
		0, 0, DB_CKP_INTERNAL | DB_FORCE)) != 0) {
		/*
		 * If there was no space for the checkpoint or flushing db
		 * pages we can still bring the environment up, if only for
		 * read-only access.  We must not close the open files because a
		 * subsequent recovery might still need to redo this portion
		 * of the log [#18590].
		 */
		if (max_lsn == NULL && ret == ENOSPC) {
			if (FLD_ISSET(dbenv->verbose, DB_VERB_RECOVERY))
				__db_msg(env,
		    "Recovery continuing after non-fatal checkpoint error: %s",
				    db_strerror(ret));
			all_recovered = 0;
		}
		else
			goto err;
	}

	if (all_recovered) {
		/* Close all the db files that are open. */
		if ((ret = __dbreg_close_files(env, 0)) != 0)
			goto err;
	} else {
		if ((ret = __dbreg_mark_restored(env)) != 0)
			goto err;
		F_SET(env->lg_handle, DBLOG_OPENFILES);
	}

	if (max_lsn != NULL) {
		/*
		 * Now we need to open files that should be open in order for
		 * client processing to continue.  However, since we've
		 * truncated the log, we need to recompute from where the
		 * openfiles pass should begin.
		 */
		if ((ret = __log_cursor(env, &logc)) != 0)
			goto err;
		if ((ret =
		    __logc_get(logc, &first_lsn, &data, DB_FIRST)) != 0) {
			if (ret == DB_NOTFOUND)
				ret = 0;
			else
				__db_errx(env, "First log record not found");
			goto err;
		}
		if ((ret = __txn_getckp(env, &first_lsn)) == 0 &&
		    (ret = __logc_get(logc, &first_lsn, &data, DB_SET)) == 0) {
			/* We have a recent checkpoint.  This is LSN (1). */
			if ((ret = __txn_ckp_read(env,
			    data.data, &ckp_args)) != 0) {
				__db_errx(env,
			    "Invalid checkpoint record at [%lu][%lu]",
				    (u_long)first_lsn.file,
				    (u_long)first_lsn.offset);
				goto err;
			}
			first_lsn = ckp_args->ckp_lsn;
			__os_free(env, ckp_args);
		}
		if ((ret = __logc_get(logc, &first_lsn, &data, DB_SET)) != 0)
			goto err;
		if ((ret = __env_openfiles(env, logc,
		    txninfo, &data, &first_lsn, max_lsn, nfiles, 1)) != 0)
			goto err;
	} else if (all_recovered) {
		/*
		 * If there are no transactions that need resolution, whether
		 * because they are prepared or because recovery will need to
		 * process them, we need to reset the transaction ID space and
		 * log this fact.
		 */
		if ((ret = __txn_reset(env)) != 0)
			goto err;
	} else {
		if ((ret = __txn_recycle_id(env)) != 0)
			goto err;
	}

	if (FLD_ISSET(dbenv->verbose, DB_VERB_RECOVERY)) {
		(void)time(&now);
		__db_msg(env,
		    "Recovery complete at %.24s", __os_ctime(&now, time_buf));
		__db_msg(env, "%s %lx %s [%lu][%lu]",
		    "Maximum transaction ID",
		    (u_long)(txninfo == NULL ?
			TXN_MINIMUM : ((DB_TXNHEAD *)txninfo)->maxid),
		    "Recovery checkpoint",
		    (u_long)region->last_ckp.file,
		    (u_long)region->last_ckp.offset);
	}

	/* Reached only via goto from a failed dispatch in pass #2 or #3. */
	if (0) {
msgerr:		__db_errx(env,
		    "Recovery function for LSN %lu %lu failed on %s pass",
		    (u_long)lsn.file, (u_long)lsn.offset, pass);
	}

	/* Common exit: the success path falls through to here as well. */
err:	if (logc != NULL && (t_ret = __logc_close(logc)) != 0 && ret == 0)
		ret = t_ret;

	if (txninfo != NULL)
		__db_txnlist_end(env, txninfo);

	dbenv->tx_timestamp = 0;

	F_CLR(env->lg_handle, DBLOG_RECOVER);
	F_CLR(region, TXN_IN_RECOVERY);

	return (ret);
}

/*
 * Figure out how many logfiles we have processed.
If we are moving * forward (is_forward != 0), then we're computing current - low. If * we are moving backward, we are computing high - current. max is * the number of bytes per logfile. */ static double __lsn_diff(low, high, current, max, is_forward) DB_LSN *low, *high, *current; u_int32_t max; int is_forward; { double nf; /* * There are three cases in each direction. If you are in the * same file, then all you need worry about is the difference in * offsets. If you are in different files, then either your offsets * put you either more or less than the integral difference in the * number of files -- we need to handle both of these. */ if (is_forward) { if (current->file == low->file) nf = (double)(current->offset - low->offset) / max; else if (current->offset < low->offset) nf = (double)((current->file - low->file) - 1) + (double)((max - low->offset) + current->offset) / max; else nf = (double)(current->file - low->file) + (double)(current->offset - low->offset) / max; } else { if (current->file == high->file) nf = (double)(high->offset - current->offset) / max; else if (current->offset > high->offset) nf = (double)((high->file - current->file) - 1) + (double) ((max - current->offset) + high->offset) / max; else nf = (double)(high->file - current->file) + (double)(high->offset - current->offset) / max; } return (nf); } /* * __log_backup -- * * This is used to find the earliest log record to process when a client * is trying to sync up with a master whose max LSN is less than this * client's max lsn; we want to roll back everything after that. * * Find the latest checkpoint whose ckp_lsn is less than the max lsn. 
*/ static int __log_backup(env, logc, max_lsn, start_lsn) ENV *env; DB_LOGC *logc; DB_LSN *max_lsn, *start_lsn; { DBT data; DB_LSN lsn; __txn_ckp_args *ckp_args; int ret; memset(&data, 0, sizeof(data)); ckp_args = NULL; if ((ret = __txn_getckp(env, &lsn)) != 0) goto err; while ((ret = __logc_get(logc, &lsn, &data, DB_SET)) == 0) { if ((ret = __txn_ckp_read(env, data.data, &ckp_args)) != 0) return (ret); /* * Follow checkpoints through the log until * we find one with a ckp_lsn less than * or equal max_lsn. */ if (LOG_COMPARE(&ckp_args->ckp_lsn, max_lsn) <= 0) { *start_lsn = ckp_args->ckp_lsn; break; } lsn = ckp_args->last_ckp; /* * If there are no more checkpoints behind us, we're * done. Break with DB_NOTFOUND. */ if (IS_ZERO_LSN(lsn)) { ret = DB_NOTFOUND; break; } __os_free(env, ckp_args); ckp_args = NULL; } if (ckp_args != NULL) __os_free(env, ckp_args); /* * If we walked back through all the checkpoints, * set the cursor on the first log record. */ err: if (IS_ZERO_LSN(*start_lsn) && (ret == 0 || ret == DB_NOTFOUND)) ret = __logc_get(logc, start_lsn, &data, DB_FIRST); return (ret); } /* * __log_earliest -- * * Return the earliest recovery point for the log files present. The * earliest recovery time is the time stamp of the first checkpoint record * whose checkpoint LSN is greater than the first LSN we process. */ static int __log_earliest(env, logc, lowtime, lowlsn) ENV *env; DB_LOGC *logc; int32_t *lowtime; DB_LSN *lowlsn; { __txn_ckp_args *ckpargs; DB_LSN first_lsn, lsn; DBT data; u_int32_t rectype; int cmp, ret; memset(&data, 0, sizeof(data)); /* * Read forward through the log looking for the first checkpoint * record whose ckp_lsn is greater than first_lsn. 
*/ for (ret = __logc_get(logc, &first_lsn, &data, DB_FIRST); ret == 0; ret = __logc_get(logc, &lsn, &data, DB_NEXT)) { LOGCOPY_32(env, &rectype, data.data); if (rectype != DB___txn_ckp) continue; if ((ret = __txn_ckp_read(env, data.data, &ckpargs)) == 0) { cmp = LOG_COMPARE(&ckpargs->ckp_lsn, &first_lsn); *lowlsn = ckpargs->ckp_lsn; *lowtime = ckpargs->timestamp; __os_free(env, ckpargs); if (cmp >= 0) break; } } return (ret); } /* * __env_openfiles -- * Perform the pass of recovery that opens files. This is used * both during regular recovery and an initial call to txn_recover (since * we need files open in order to abort prepared, but not yet committed * transactions). * * See the comments in db_apprec for a detailed description of the * various recovery passes. * * If we are not doing feedback processing (i.e., we are doing txn_recover * processing and in_recovery is zero), then last_lsn can be NULL. * * PUBLIC: int __env_openfiles __P((ENV *, * PUBLIC: DB_LOGC *, void *, DBT *, DB_LSN *, DB_LSN *, double, int)); */ int __env_openfiles(env, logc, txninfo, data, open_lsn, last_lsn, nfiles, in_recovery) ENV *env; DB_LOGC *logc; void *txninfo; DBT *data; DB_LSN *open_lsn, *last_lsn; double nfiles; int in_recovery; { DB_ENV *dbenv; DB_LSN lsn, tlsn; u_int32_t log_size; int progress, ret; dbenv = env->dbenv; /* * XXX * Get the log size. No locking required because we're single-threaded * during recovery. */ log_size = ((LOG *)env->lg_handle->reginfo.primary)->log_size; lsn = *open_lsn; for (;;) { if (in_recovery && dbenv->db_feedback != NULL) { DB_ASSERT(env, last_lsn != NULL); progress = (int)(33 * (__lsn_diff(open_lsn, last_lsn, &lsn, log_size, 1) / nfiles)); dbenv->db_feedback(dbenv, DB_RECOVER, progress); } tlsn = lsn; ret = __db_dispatch(env, &env->recover_dtab, data, &tlsn, in_recovery ? 
DB_TXN_OPENFILES : DB_TXN_POPENFILES, txninfo); if (ret != 0 && ret != DB_TXN_CKP) { __db_errx(env, "Recovery function for LSN %lu %lu failed", (u_long)lsn.file, (u_long)lsn.offset); break; } if ((ret = __logc_get(logc, &lsn, data, DB_NEXT)) != 0) { if (ret == DB_NOTFOUND) { if (last_lsn != NULL && LOG_COMPARE(&lsn, last_lsn) != 0) ret = __db_log_corrupt(env, &lsn); else ret = 0; } break; } } return (ret); } static int __db_log_corrupt(env, lsnp) ENV *env; DB_LSN *lsnp; { __db_errx(env, "Log file corrupt at LSN: [%lu][%lu]", (u_long)lsnp->file, (u_long)lsnp->offset); return (EINVAL); } /* * __env_init_rec -- * * PUBLIC: int __env_init_rec __P((ENV *, u_int32_t)); */ int __env_init_rec(env, version) ENV *env; u_int32_t version; { int ret; /* * We need to prime the recovery table with the current recovery * functions. Then we overwrite only specific entries based on * each previous version we support. */ if ((ret = __bam_init_recover(env, &env->recover_dtab)) != 0) goto err; if ((ret = __crdel_init_recover(env, &env->recover_dtab)) != 0) goto err; if ((ret = __db_init_recover(env, &env->recover_dtab)) != 0) goto err; if ((ret = __dbreg_init_recover(env, &env->recover_dtab)) != 0) goto err; if ((ret = __fop_init_recover(env, &env->recover_dtab)) != 0) goto err; if ((ret = __ham_init_recover(env, &env->recover_dtab)) != 0) goto err; if ((ret = __qam_init_recover(env, &env->recover_dtab)) != 0) goto err; if ((ret = __txn_init_recover(env, &env->recover_dtab)) != 0) goto err; /* * After installing all the current recovery routines, we want to * override them with older versions if we are reading a down rev * log (from a downrev replication master). If a log record is * changed then we must use the previous version for all older * logs. If a record is changed in multiple revisions then the * oldest revision that applies must be used. Therefore we override * the recovery functions in reverse log version order. 
*/ if (version == DB_LOGVERSION) goto done; if ((ret = __env_init_rec_48(env)) != 0) goto err; /* * Patch 2 added __db_pg_trunc but did not replace any log records * so we want to override the same functions as in the original release. */ if (version >= DB_LOGVERSION_48) goto done; if ((ret = __env_init_rec_47(env)) != 0) goto err; if (version == DB_LOGVERSION_47) goto done; if ((ret = __env_init_rec_46(env)) != 0) goto err; /* * There are no log record/recovery differences between 4.4 and 4.5. * The log version changed due to checksum. There are no log recovery * differences between 4.5 and 4.6. The name of the rep_gen in * txn_checkpoint changed (to spare, since we don't use it anymore). */ if (version >= DB_LOGVERSION_44) goto done; if ((ret = __env_init_rec_43(env)) != 0) goto err; if (version == DB_LOGVERSION_43) goto done; if (version != DB_LOGVERSION_42) { __db_errx(env, "Unknown version %lu", (u_long)version); ret = EINVAL; goto err; } ret = __env_init_rec_42(env); done: err: return (ret); } static int __env_init_rec_42(env) ENV *env; { int ret; if ((ret = __db_add_recovery_int(env, &env->recover_dtab, __db_relink_42_recover, DB___db_relink_42)) != 0) goto err; if ((ret = __db_add_recovery_int(env, &env->recover_dtab, __db_pg_alloc_42_recover, DB___db_pg_alloc_42)) != 0) goto err; if ((ret = __db_add_recovery_int(env, &env->recover_dtab, __db_pg_free_42_recover, DB___db_pg_free_42)) != 0) goto err; if ((ret = __db_add_recovery_int(env, &env->recover_dtab, __db_pg_freedata_42_recover, DB___db_pg_freedata_42)) != 0) goto err; #ifdef HAVE_HASH if ((ret = __db_add_recovery_int(env, &env->recover_dtab, __ham_metagroup_42_recover, DB___ham_metagroup_42)) != 0) goto err; if ((ret = __db_add_recovery_int(env, &env->recover_dtab, __ham_groupalloc_42_recover, DB___ham_groupalloc_42)) != 0) goto err; #endif if ((ret = __db_add_recovery_int(env, &env->recover_dtab, __txn_ckp_42_recover, DB___txn_ckp_42)) != 0) goto err; err: return (ret); } static int 
__env_init_rec_43(env) ENV *env; { int ret; if ((ret = __db_add_recovery_int(env, &env->recover_dtab, __bam_relink_43_recover, DB___bam_relink_43)) != 0) goto err; /* * We want to use the 4.2-based txn_regop record. */ if ((ret = __db_add_recovery_int(env, &env->recover_dtab, __txn_regop_42_recover, DB___txn_regop_42)) != 0) goto err; err: return (ret); } static int __env_init_rec_46(env) ENV *env; { int ret; if ((ret = __db_add_recovery_int(env, &env->recover_dtab, __bam_merge_44_recover, DB___bam_merge_44)) != 0) goto err; err: return (ret); } static int __env_init_rec_47(env) ENV *env; { int ret; if ((ret = __db_add_recovery_int(env, &env->recover_dtab, __bam_split_42_recover, DB___bam_split_42)) != 0) goto err; if ((ret = __db_add_recovery_int(env, &env->recover_dtab, __db_pg_sort_44_recover, DB___db_pg_sort_44)) != 0) goto err; if ((ret = __db_add_recovery_int(env, &env->recover_dtab, __fop_create_42_recover, DB___fop_create_42)) != 0) goto err; if ((ret = __db_add_recovery_int(env, &env->recover_dtab, __fop_write_42_recover, DB___fop_write_42)) != 0) goto err; if ((ret = __db_add_recovery_int(env, &env->recover_dtab, __fop_rename_42_recover, DB___fop_rename_42)) != 0) goto err; if ((ret = __db_add_recovery_int(env, &env->recover_dtab, __fop_rename_noundo_46_recover, DB___fop_rename_noundo_46)) != 0) goto err; err: return (ret); } static int __env_init_rec_48(env) ENV *env; { int ret; if ((ret = __db_add_recovery_int(env, &env->recover_dtab, __db_pg_sort_44_recover, DB___db_pg_sort_44)) != 0) goto err; if ((ret = __db_add_recovery_int(env, &env->recover_dtab, __db_addrem_42_recover, DB___db_addrem_42)) != 0) goto err; if ((ret = __db_add_recovery_int(env, &env->recover_dtab, __db_big_42_recover, DB___db_big_42)) != 0) goto err; if ((ret = __db_add_recovery_int(env, &env->recover_dtab, __bam_split_48_recover, DB___bam_split_48)) != 0) goto err; #ifdef HAVE_HASH if ((ret = __db_add_recovery_int(env, &env->recover_dtab, __ham_insdel_42_recover, 
DB___ham_insdel_42)) != 0) goto err; if ((ret = __db_add_recovery_int(env, &env->recover_dtab, __ham_replace_42_recover, DB___ham_replace_42)) != 0) goto err; #endif err: return (ret); }