/*-
* See the file LICENSE for redistribution information.
*
* Copyright (c) 1996,2007 Oracle. All rights reserved.
*
* $Id: mp_alloc.c,v 12.33 2007/06/01 18:32:44 bostic Exp $
*/

#include "db_config.h"

#include "db_int.h"
#include "dbinc/mp.h"
#include "dbinc/txn.h"

static void __memp_bad_buffer __P((DB_ENV *, DB_MPOOL_HASH *));

/*
* __memp_alloc --
* Allocate some space from a cache region.
*
* PUBLIC: int __memp_alloc __P((DB_MPOOL *,
* PUBLIC: REGINFO *, MPOOLFILE *, size_t, roff_t *, void *));
*/
int
__memp_alloc(dbmp, infop, mfp, len, offsetp, retp)
DB_MPOOL *dbmp;
REGINFO *infop;
MPOOLFILE *mfp;
size_t len;
roff_t *offsetp;
void *retp;
{
BH *bhp, *oldest_bhp, *tbhp;
BH_FROZEN_PAGE *frozen_bhp;
DB_ENV *dbenv;
DB_MPOOL_HASH *dbht, *hp, *hp_end, *hp_tmp;
MPOOL *c_mp;
MPOOLFILE *bh_mfp;
size_t freed_space;
u_int32_t buckets, buffers, high_priority, priority;
u_int32_t put_counter, total_buckets;
int aggressive, alloc_freeze, giveup, got_oldest, ret;
u_int8_t *endp;
void *p;
dbenv = dbmp->dbenv;
c_mp = infop->primary;
dbht = R_ADDR(infop, c_mp->htab);
hp_end = &dbht[c_mp->htab_buckets];
buckets = buffers = put_counter = total_buckets = 0;
aggressive = alloc_freeze = giveup = got_oldest = 0;
hp_tmp = NULL;
STAT(c_mp->stat.st_alloc++);
/*
* If we're allocating a buffer, and the one we're discarding is the
* same size, we don't want to waste the time to re-integrate it into
* the shared memory free list. If the DB_MPOOLFILE argument isn't
* NULL, we'll compare the underlying page sizes of the two buffers
* before freeing and re-allocating buffers.
*/
if (mfp != NULL) {
len = SSZA(BH, buf) + mfp->stat.st_pagesize;
/* Add space for alignment padding for MVCC diagnostics. */
MVCC_BHSIZE(mfp, len);
}
MPOOL_REGION_LOCK(dbenv, infop);
/*
* Anything newer than 1/10th of the buffer pool is ignored during
* allocation (unless allocation starts failing).
*/
high_priority = c_mp->lru_count - c_mp->stat.st_pages / 10;
/*
* First we try to allocate from free memory. If that fails, scan the
* buffer pool to find buffers with low priorities. We consider small
* sets of hash buckets each time to limit the amount of work needing
* to be done. This approximates LRU, but not very well. We either
* find a buffer of the same size to use, or we will free 3 times what
* we need in the hopes it will coalesce into a contiguous chunk of the
* right size. In the latter case we branch back here and try again.
*/
alloc: if ((ret = __env_alloc(infop, len, &p)) == 0) {
if (mfp != NULL)
c_mp->stat.st_pages++;
MPOOL_REGION_UNLOCK(dbenv, infop);
/*
* For MVCC diagnostics, align the pointer so that the buffer
* starts on a page boundary.
*/
MVCC_BHALIGN(mfp, p);
found: if (offsetp != NULL)
*offsetp = R_OFFSET(infop, p);
*(void **)retp = p;
/*
* Update the search statistics.
*
* We're not holding the region locked here, so these statistics
* can't be trusted.
*/
#ifdef HAVE_STATISTICS
total_buckets += buckets;
if (total_buckets != 0) {
if (total_buckets > c_mp->stat.st_alloc_max_buckets)
c_mp->stat.st_alloc_max_buckets = total_buckets;
c_mp->stat.st_alloc_buckets += total_buckets;
}
if (buffers != 0) {
if (buffers > c_mp->stat.st_alloc_max_pages)
c_mp->stat.st_alloc_max_pages = buffers;
c_mp->stat.st_alloc_pages += buffers;
}
#endif
return (0);
} else if (giveup || c_mp->stat.st_pages == 0) {
MPOOL_REGION_UNLOCK(dbenv, infop);
__db_errx(dbenv,
"unable to allocate space from the buffer cache");
return (ret);
}
ret = 0;
/*
* We re-attempt the allocation every time we've freed 3 times what
* we need. Reset our free-space counter.
*/
freed_space = 0;
total_buckets += buckets;
buckets = 0;
/*
* Walk the hash buckets and find the next two with potentially useful
* buffers. Free the buffer with the lowest priority from the buckets'
* chains.
*/
for (;;) {
/* All pages have been freed; make one last try. */
if (c_mp->stat.st_pages == 0)
goto alloc;
/* Check for wrap around. */
hp = &dbht[c_mp->last_checked++];
if (hp >= hp_end) {
c_mp->last_checked = 0;
hp = &dbht[c_mp->last_checked++];
}
/*
* The failure mode is when there are too many buffers we can't
* write or there's not enough memory in the system to support
* the number of pinned buffers.
*
* Get aggressive if we've reviewed the entire cache without
* freeing 3 times the needed space. (The code resets the
* counter when we free 3 times the needed space.) Aggressive
* means:
*
* a: set a flag to attempt to flush high priority buffers as
* well as other buffers.
* b: sync the mpool to force out queue extent pages. While we
* might not have enough space for what we want and flushing
* is expensive, why not?
* c: look at a buffer in every hash bucket rather than choose
* the more preferable of two.
* d: start to think about giving up.
*
* If we get here twice, sleep for a second; hopefully some
* other thread will run and free up memory.
*
* Always try to allocate memory too, in case some other thread
* returns its memory to the region.
*
* We have no way of knowing that an allocation can never
* succeed, so fail if no pages are returned to the cache after
* we've been trying for a relatively long time.
*
* !!!
* This test ignores pathological cases like no buffers in the
* system -- we check for that early on, so it isn't possible.
*/
if (buckets++ == c_mp->htab_buckets) {
if (freed_space > 0)
goto alloc;
MPOOL_REGION_UNLOCK(dbenv, infop);
switch (++aggressive) {
case 1:
break;
case 2:
put_counter = c_mp->put_counter;
/* FALLTHROUGH */
case 3:
case 4:
case 5:
case 6:
(void)__memp_sync_int(
dbenv, NULL, 0, DB_SYNC_ALLOC, NULL, NULL);
__os_sleep(dbenv, 1, 0);
break;
default:
aggressive = 1;
if (put_counter == c_mp->put_counter)
giveup = 1;
break;
}
MPOOL_REGION_LOCK(dbenv, infop);
goto alloc;
}
/*
* Skip empty buckets.
*
* We can check for empty buckets before locking as we
* only care if the pointer is zero or non-zero.
*/
if (SH_TAILQ_FIRST(&hp->hash_bucket, __bh) == NULL)
continue;
/*
* Skip buckets that only have pinned pages.
*
* Again, we are doing this without locking. If we misread
* the priority, we might improperly skip a bucket, but that
* is not fatal.
*/
if (hp->hash_priority == UINT32_MAX)
continue;
if (!aggressive) {
/* Adjust if the bucket has not been reset. */
priority = hp->hash_priority;
if (c_mp->lru_reset != 0 &&
c_mp->lru_reset <= hp - dbht)
priority -= MPOOL_BASE_DECREMENT;
/*
* Skip high priority buckets.
*/
if (priority > high_priority)
continue;
/*
* Find two buckets and select the one with the lowest
* priority. Performance testing shows that looking
* at two improves the LRUness and looking at more only
* does a little better.
*/
if (hp_tmp == NULL) {
hp_tmp = hp;
continue;
}
if (c_mp->lru_reset &&
c_mp->lru_reset <= hp_tmp - dbht) {
if (priority > hp_tmp->hash_priority -
MPOOL_BASE_DECREMENT)
hp = hp_tmp;
} else if (priority > hp_tmp->hash_priority)
hp = hp_tmp;
hp_tmp = NULL;
}
/* Unlock the region and lock the hash bucket. */
MPOOL_REGION_UNLOCK(dbenv, infop);
MUTEX_LOCK(dbenv, hp->mtx_hash);
/* Remember the priority of the buffer we're looking for. */
priority = hp->hash_priority;
#ifdef DIAGNOSTIC
__memp_check_order(dbenv, hp);
#endif
/*
* The lowest priority page is first in the bucket, as they are
* maintained in sorted order.
*
* The buffer may have been freed or its priority changed while
* we switched from the region lock to the hash lock. If so,
* we have to restart. We will still take the first buffer on
* the bucket's list, though, if it has a low enough priority.
*/
this_hb: if ((bhp = SH_TAILQ_FIRST(&hp->hash_bucket, __bh)) == NULL)
goto next_hb;
buffers++;
/* Find the associated MPOOLFILE. */
bh_mfp = R_ADDR(dbmp->reginfo, bhp->mf_offset);
/* Select the lowest priority buffer in the chain. */
for (oldest_bhp = bhp, tbhp = SH_CHAIN_PREV(bhp, vc, __bh);
tbhp != NULL;
oldest_bhp = tbhp, tbhp = SH_CHAIN_PREV(tbhp, vc, __bh))
if (tbhp->ref <= bhp->ref &&
tbhp->priority <= bhp->priority)
bhp = tbhp;
/*
* Prefer the last buffer in the chain.
*
* If the oldest buffer isn't obsolete with respect to the
* cached old reader LSN, recalculate the oldest reader LSN
* and retry. There is a tradeoff here, because if we had the
* LSN earlier, we might have found pages to evict, but to get
* it, we need to lock the transaction region.
*/
if (oldest_bhp != bhp && oldest_bhp->ref == 0) {
if (F_ISSET(bhp, BH_FROZEN) &&
!F_ISSET(oldest_bhp, BH_FROZEN))
bhp = oldest_bhp;
else if (BH_OBSOLETE(oldest_bhp, hp->old_reader))
bhp = oldest_bhp;
else if (!got_oldest &&
__txn_oldest_reader(dbenv, &hp->old_reader) == 0) {
got_oldest = 1;
if (BH_OBSOLETE(oldest_bhp, hp->old_reader))
bhp = oldest_bhp;
}
}
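/*
 * Skip the buffer if it is pinned, or if it isn't the oldest
 * version in its chain and (absent aggressive mode) its
 * priority now exceeds the bucket priority we recorded above.
 */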
if (bhp->ref != 0 || (bhp != oldest_bhp &&
!aggressive && bhp->priority > priority))
goto next_hb;
/* If the page is dirty, pin it and write it. */
ret = 0;
if (F_ISSET(bhp, BH_DIRTY)) {
++bhp->ref;
ret = __memp_bhwrite(dbmp, hp, bh_mfp, bhp, 0);
--bhp->ref;
#ifdef HAVE_STATISTICS
if (ret == 0)
++c_mp->stat.st_rw_evict;
#endif
}
#ifdef HAVE_STATISTICS
else
++c_mp->stat.st_ro_evict;
#endif
/*
* Freeze this buffer, if necessary. That is, if the buffer
* itself or the next version created could be read by the
* oldest reader in the system.
*/
if (ret == 0 && bh_mfp->multiversion) {
if (!got_oldest && !SH_CHAIN_HASPREV(bhp, vc) &&
!BH_OBSOLETE(bhp, hp->old_reader)) {
(void)__txn_oldest_reader(dbenv,
&hp->old_reader);
got_oldest = 1;
}
if (SH_CHAIN_HASPREV(bhp, vc) ||
!BH_OBSOLETE(bhp, hp->old_reader)) {
/*
* Before freezing, double-check that we have
* an up-to-date old_reader LSN.
*/
if (!aggressive ||
F_ISSET(bhp, BH_FROZEN) || bhp->ref != 0)
goto next_hb;
ret = __memp_bh_freeze(dbmp,
infop, hp, bhp, &alloc_freeze);
}
}
/*
* If a write fails for any reason, we can't proceed.
*
* We released the hash bucket lock while doing I/O, so another
* thread may have acquired this buffer and incremented the ref
* count after we wrote it, in which case we can't have it.
*
* If there's a write error and we're having problems finding
* something to allocate, avoid selecting this buffer again
* by making it the bucket's least-desirable buffer.
*/
if (ret != 0 || bhp->ref != 0) {
if (ret != 0 && aggressive)
__memp_bad_buffer(dbenv, hp);
goto next_hb;
}
/*
* If we need some empty buffer headers for freezing, turn the
* buffer we've found into frozen headers and put them on the
* free list. Only reset alloc_freeze if we've actually
* allocated some frozen buffer headers.
*
* Check to see if the buffer is the size we're looking for.
* If so, we can simply reuse it. Otherwise, free the buffer
* and its space and keep looking.
*/
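/*
 * The buffer we selected is itself a frozen header: thaw it
 * and rescan this bucket from the top.
 */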
if (F_ISSET(bhp, BH_FROZEN)) {
++bhp->ref;
if ((ret = __memp_bh_thaw(dbmp, infop, hp,
bhp, NULL)) != 0) {
MUTEX_UNLOCK(dbenv, hp->mtx_hash);
return (ret);
}
alloc_freeze = 0;
goto this_hb;
} else if (alloc_freeze) {
if ((ret = __memp_bhfree(dbmp, infop, hp, bhp, 0)) != 0)
return (ret);
MVCC_MPROTECT(bhp->buf, bh_mfp->stat.st_pagesize,
PROT_READ | PROT_WRITE | PROT_EXEC);
MPOOL_REGION_LOCK(dbenv, infop);
SH_TAILQ_INSERT_TAIL(&c_mp->alloc_frozen,
(BH_FROZEN_ALLOC *)bhp, links);
frozen_bhp = (BH_FROZEN_PAGE *)
((BH_FROZEN_ALLOC *)bhp + 1);
endp = (u_int8_t *)bhp->buf + bh_mfp->stat.st_pagesize;
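/*
 * Carve the buffer's memory into as many frozen page headers
 * as will fit and put each one on the free list.
 */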
while ((u_int8_t *)(frozen_bhp + 1) < endp) {
SH_TAILQ_INSERT_TAIL(&c_mp->free_frozen,
(BH *)frozen_bhp, hq);
frozen_bhp++;
}
alloc_freeze = 0;
continue;
} else if (mfp != NULL &&
mfp->stat.st_pagesize == bh_mfp->stat.st_pagesize) {
if ((ret = __memp_bhfree(dbmp, infop, hp, bhp, 0)) != 0)
return (ret);
p = bhp;
goto found;
} else {
freed_space += sizeof(*bhp) + bh_mfp->stat.st_pagesize;
if ((ret = __memp_bhfree(dbmp, infop,
hp, bhp, BH_FREE_FREEMEM)) != 0)
return (ret);
}
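/* We've freed a buffer; drop back from full aggressiveness. */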
if (aggressive > 1)
aggressive = 1;
/*
* Unlock this hash bucket and re-acquire the region lock. If
* we're reaching here as a result of calling __memp_bhfree, the
* hash bucket lock has already been discarded; the "if (0)"
* below ensures the unlock runs only when we jump to the
* next_hb label.
*/
if (0) {
next_hb: MUTEX_UNLOCK(dbenv, hp->mtx_hash);
}
MPOOL_REGION_LOCK(dbenv, infop);
/*
* Retry the allocation as soon as we've freed up sufficient
* space. We're likely to have to coalesce chunks of memory to
* satisfy the request, so don't retry until it's likely
* (possible?) that we'll succeed.
*/
if (freed_space >= 3 * len)
goto alloc;
}
/* NOTREACHED */
}

/*
* __memp_free --
* Free some space from a cache region.
*
* PUBLIC: void __memp_free __P((REGINFO *, MPOOLFILE *, void *));
*/
void
__memp_free(infop, mfp, buf)
REGINFO *infop;
MPOOLFILE *mfp;
void *buf;
{
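/*
 * MVCC_BHUNALIGN undoes any alignment padding applied by
 * MVCC_BHALIGN when the buffer was allocated; COMPQUIET keeps
 * mfp referenced in builds where the macro expands to nothing.
 */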
MVCC_BHUNALIGN(mfp, buf);
COMPQUIET(mfp, NULL);
__env_alloc_free(infop, buf);
}

/*
* __memp_bad_buffer --
* Make the first buffer in a hash bucket the least desirable buffer.
*/
static void
__memp_bad_buffer(dbenv, hp)
DB_ENV *dbenv;
DB_MPOOL_HASH *hp;
{
BH *bhp, *last_bhp;
u_int32_t priority;
/*
* Get the first buffer from the bucket. If it is also the last buffer
* (in other words, it is the only buffer in the bucket), we're done.
*/
bhp = SH_TAILQ_FIRST(&hp->hash_bucket, __bh);
last_bhp = SH_TAILQ_LASTP(&hp->hash_bucket, hq, __bh);
if (bhp == last_bhp)
return;
/* There are multiple buffers in the bucket, remove the first one. */
SH_TAILQ_REMOVE(&hp->hash_bucket, bhp, hq, __bh);
/*
* Find the highest priority buffer in the bucket. Buffers are
* sorted by priority, so it's the last one in the bucket.
*/
priority = BH_PRIORITY(last_bhp);
/*
* Append our buffer to the bucket and set its priority to be just as
* bad.
*/
SH_TAILQ_INSERT_TAIL(&hp->hash_bucket, bhp, hq);
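/* Give every version in the buffer's chain the new priority. */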
for (; bhp != NULL; bhp = SH_CHAIN_PREV(bhp, vc, __bh))
bhp->priority = priority;
/* Reset the hash bucket's priority. */
hp->hash_priority = BH_PRIORITY(
SH_TAILQ_FIRSTP(&hp->hash_bucket, __bh));
#ifdef DIAGNOSTIC
__memp_check_order(dbenv, hp);
#else
COMPQUIET(dbenv, NULL);
#endif
}

#ifdef DIAGNOSTIC
/*
* __memp_check_order --
* Verify the priority ordering of a hash bucket chain.
*
* PUBLIC: #ifdef DIAGNOSTIC
* PUBLIC: void __memp_check_order __P((DB_ENV *, DB_MPOOL_HASH *));
* PUBLIC: #endif
*/
void
__memp_check_order(dbenv, hp)
DB_ENV *dbenv;
DB_MPOOL_HASH *hp;
{
BH *bhp, *first_bhp, *tbhp;
u_int32_t dirty, priority, last_priority;
dirty = 0;
/*
* Assumes the hash bucket is locked.
*/
last_priority = hp->hash_priority;
for (bhp = first_bhp = SH_TAILQ_FIRST(&hp->hash_bucket, __bh);
bhp != NULL; bhp = SH_TAILQ_NEXT(bhp, hq, __bh)) {
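/* Each buffer on the hash chain must be the newest version in its chain. */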
DB_ASSERT(dbenv, !SH_CHAIN_HASNEXT(bhp, vc));
if (F_ISSET(bhp, BH_DIRTY))
dirty++;
priority = BH_PRIORITY(bhp);
DB_ASSERT(dbenv, (bhp == first_bhp) ?
priority == last_priority : priority >= last_priority);
last_priority = priority;
/* Chains have referential integrity. */
for (tbhp = SH_CHAIN_PREV(bhp, vc, __bh); tbhp != NULL;
tbhp = SH_CHAIN_PREV(tbhp, vc, __bh))
DB_ASSERT(dbenv, tbhp == SH_CHAIN_PREV(
SH_CHAIN_NEXT(tbhp, vc, __bh), vc, __bh));
/*
* No repeats.
* XXX This is O(N**2) where N is the number of buffers in the
* bucket, but we generally assume small buckets.
*/
for (tbhp = SH_TAILQ_NEXT(bhp, hq, __bh); tbhp != NULL;
tbhp = SH_TAILQ_NEXT(tbhp, hq, __bh))
DB_ASSERT(dbenv, bhp->pgno != tbhp->pgno ||
bhp->mf_offset != tbhp->mf_offset);
}
DB_ASSERT(dbenv, dirty == hp->hash_page_dirty);
}
#endif