OpenCloudOS-Kernel/fs/ocfs2/dlm/dlmmaster.c

3590 lines
98 KiB
C
Raw Normal View History

/* -*- mode: c; c-basic-offset: 8; -*-
* vim: noexpandtab sw=8 ts=8 sts=0:
*
* dlmmod.c
*
* standalone DLM module
*
* Copyright (C) 2004 Oracle. All rights reserved.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* You should have received a copy of the GNU General Public
* License along with this program; if not, write to the
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
* Boston, MA 021110-1307, USA.
*
*/
#include <linux/module.h>
#include <linux/fs.h>
#include <linux/types.h>
#include <linux/slab.h>
#include <linux/highmem.h>
#include <linux/init.h>
#include <linux/sysctl.h>
#include <linux/random.h>
#include <linux/blkdev.h>
#include <linux/socket.h>
#include <linux/inet.h>
#include <linux/spinlock.h>
#include <linux/delay.h>
#include "cluster/heartbeat.h"
#include "cluster/nodemanager.h"
#include "cluster/tcp.h"
#include "dlmapi.h"
#include "dlmcommon.h"
#include "dlmdomain.h"
#include "dlmdebug.h"
#define MLOG_MASK_PREFIX (ML_DLM|ML_DLM_MASTER)
#include "cluster/masklog.h"
static void dlm_mle_node_down(struct dlm_ctxt *dlm,
struct dlm_master_list_entry *mle,
struct o2nm_node *node,
int idx);
static void dlm_mle_node_up(struct dlm_ctxt *dlm,
struct dlm_master_list_entry *mle,
struct o2nm_node *node,
int idx);
static void dlm_assert_master_worker(struct dlm_work_item *item, void *data);
static int dlm_do_assert_master(struct dlm_ctxt *dlm,
struct dlm_lock_resource *res,
void *nodemap, u32 flags);
static void dlm_deref_lockres_worker(struct dlm_work_item *item, void *data);
static inline int dlm_mle_equal(struct dlm_ctxt *dlm,
struct dlm_master_list_entry *mle,
const char *name,
unsigned int namelen)
{
if (dlm != mle->dlm)
return 0;
if (namelen != mle->mnamelen ||
memcmp(name, mle->mname, namelen) != 0)
return 0;
return 1;
}
static struct kmem_cache *dlm_lockres_cache;
static struct kmem_cache *dlm_lockname_cache;
static struct kmem_cache *dlm_mle_cache;
static void dlm_mle_release(struct kref *kref);
static void dlm_init_mle(struct dlm_master_list_entry *mle,
enum dlm_mle_type type,
struct dlm_ctxt *dlm,
struct dlm_lock_resource *res,
const char *name,
unsigned int namelen);
static void dlm_put_mle(struct dlm_master_list_entry *mle);
static void __dlm_put_mle(struct dlm_master_list_entry *mle);
static int dlm_find_mle(struct dlm_ctxt *dlm,
struct dlm_master_list_entry **mle,
char *name, unsigned int namelen);
static int dlm_do_master_request(struct dlm_lock_resource *res,
struct dlm_master_list_entry *mle, int to);
static int dlm_wait_for_lock_mastery(struct dlm_ctxt *dlm,
struct dlm_lock_resource *res,
struct dlm_master_list_entry *mle,
int *blocked);
static int dlm_restart_lock_mastery(struct dlm_ctxt *dlm,
struct dlm_lock_resource *res,
struct dlm_master_list_entry *mle,
int blocked);
static int dlm_add_migration_mle(struct dlm_ctxt *dlm,
struct dlm_lock_resource *res,
struct dlm_master_list_entry *mle,
struct dlm_master_list_entry **oldmle,
const char *name, unsigned int namelen,
u8 new_master, u8 master);
static u8 dlm_pick_migration_target(struct dlm_ctxt *dlm,
struct dlm_lock_resource *res);
static void dlm_remove_nonlocal_locks(struct dlm_ctxt *dlm,
struct dlm_lock_resource *res);
static int dlm_mark_lockres_migrating(struct dlm_ctxt *dlm,
struct dlm_lock_resource *res,
u8 target);
static int dlm_pre_master_reco_lockres(struct dlm_ctxt *dlm,
struct dlm_lock_resource *res);
int dlm_is_host_down(int errno)
{
switch (errno) {
case -EBADF:
case -ECONNREFUSED:
case -ENOTCONN:
case -ECONNRESET:
case -EPIPE:
case -EHOSTDOWN:
case -EHOSTUNREACH:
case -ETIMEDOUT:
case -ECONNABORTED:
case -ENETDOWN:
case -ENETUNREACH:
case -ENETRESET:
case -ESHUTDOWN:
case -ENOPROTOOPT:
case -EINVAL: /* if returned from our tcp code,
this means there is no socket */
return 1;
}
return 0;
}
/*
* MASTER LIST FUNCTIONS
*/
/*
* regarding master list entries and heartbeat callbacks:
*
* in order to avoid sleeping and allocation that occurs in
* heartbeat, master list entries are simply attached to the
* dlm's established heartbeat callbacks. the mle is attached
* when it is created, and since the dlm->spinlock is held at
* that time, any heartbeat event will be properly discovered
* by the mle. the mle needs to be detached from the
* dlm->mle_hb_events list as soon as heartbeat events are no
* longer useful to the mle, and before the mle is freed.
*
* as a general rule, heartbeat events are no longer needed by
* the mle once an "answer" regarding the lock master has been
* received.
*/
static inline void __dlm_mle_attach_hb_events(struct dlm_ctxt *dlm,
struct dlm_master_list_entry *mle)
{
assert_spin_locked(&dlm->spinlock);
list_add_tail(&mle->hb_events, &dlm->mle_hb_events);
}
static inline void __dlm_mle_detach_hb_events(struct dlm_ctxt *dlm,
struct dlm_master_list_entry *mle)
{
if (!list_empty(&mle->hb_events))
list_del_init(&mle->hb_events);
}
static inline void dlm_mle_detach_hb_events(struct dlm_ctxt *dlm,
struct dlm_master_list_entry *mle)
{
spin_lock(&dlm->spinlock);
__dlm_mle_detach_hb_events(dlm, mle);
spin_unlock(&dlm->spinlock);
}
static void dlm_get_mle_inuse(struct dlm_master_list_entry *mle)
{
struct dlm_ctxt *dlm;
dlm = mle->dlm;
assert_spin_locked(&dlm->spinlock);
assert_spin_locked(&dlm->master_lock);
mle->inuse++;
kref_get(&mle->mle_refs);
}
static void dlm_put_mle_inuse(struct dlm_master_list_entry *mle)
{
struct dlm_ctxt *dlm;
dlm = mle->dlm;
spin_lock(&dlm->spinlock);
spin_lock(&dlm->master_lock);
mle->inuse--;
__dlm_put_mle(mle);
spin_unlock(&dlm->master_lock);
spin_unlock(&dlm->spinlock);
}
/* remove from list and free */
static void __dlm_put_mle(struct dlm_master_list_entry *mle)
{
struct dlm_ctxt *dlm;
dlm = mle->dlm;
assert_spin_locked(&dlm->spinlock);
assert_spin_locked(&dlm->master_lock);
if (!kref_read(&mle->mle_refs)) {
/* this may or may not crash, but who cares.
* it's a BUG. */
mlog(ML_ERROR, "bad mle: %p\n", mle);
dlm_print_one_mle(mle);
BUG();
} else
kref_put(&mle->mle_refs, dlm_mle_release);
}
/* must not have any spinlocks coming in */
static void dlm_put_mle(struct dlm_master_list_entry *mle)
{
struct dlm_ctxt *dlm;
dlm = mle->dlm;
spin_lock(&dlm->spinlock);
spin_lock(&dlm->master_lock);
__dlm_put_mle(mle);
spin_unlock(&dlm->master_lock);
spin_unlock(&dlm->spinlock);
}
static inline void dlm_get_mle(struct dlm_master_list_entry *mle)
{
kref_get(&mle->mle_refs);
}
static void dlm_init_mle(struct dlm_master_list_entry *mle,
enum dlm_mle_type type,
struct dlm_ctxt *dlm,
struct dlm_lock_resource *res,
const char *name,
unsigned int namelen)
{
assert_spin_locked(&dlm->spinlock);
mle->dlm = dlm;
mle->type = type;
INIT_HLIST_NODE(&mle->master_hash_node);
INIT_LIST_HEAD(&mle->hb_events);
memset(mle->maybe_map, 0, sizeof(mle->maybe_map));
spin_lock_init(&mle->spinlock);
init_waitqueue_head(&mle->wq);
atomic_set(&mle->woken, 0);
kref_init(&mle->mle_refs);
memset(mle->response_map, 0, sizeof(mle->response_map));
mle->master = O2NM_MAX_NODES;
mle->new_master = O2NM_MAX_NODES;
mle->inuse = 0;
BUG_ON(mle->type != DLM_MLE_BLOCK &&
mle->type != DLM_MLE_MASTER &&
mle->type != DLM_MLE_MIGRATION);
if (mle->type == DLM_MLE_MASTER) {
BUG_ON(!res);
mle->mleres = res;
memcpy(mle->mname, res->lockname.name, res->lockname.len);
mle->mnamelen = res->lockname.len;
mle->mnamehash = res->lockname.hash;
} else {
BUG_ON(!name);
mle->mleres = NULL;
memcpy(mle->mname, name, namelen);
mle->mnamelen = namelen;
mle->mnamehash = dlm_lockid_hash(name, namelen);
}
atomic_inc(&dlm->mle_tot_count[mle->type]);
atomic_inc(&dlm->mle_cur_count[mle->type]);
/* copy off the node_map and register hb callbacks on our copy */
memcpy(mle->node_map, dlm->domain_map, sizeof(mle->node_map));
memcpy(mle->vote_map, dlm->domain_map, sizeof(mle->vote_map));
clear_bit(dlm->node_num, mle->vote_map);
clear_bit(dlm->node_num, mle->node_map);
/* attach the mle to the domain node up/down events */
__dlm_mle_attach_hb_events(dlm, mle);
}
void __dlm_unlink_mle(struct dlm_ctxt *dlm, struct dlm_master_list_entry *mle)
{
assert_spin_locked(&dlm->spinlock);
assert_spin_locked(&dlm->master_lock);
if (!hlist_unhashed(&mle->master_hash_node))
hlist_del_init(&mle->master_hash_node);
}
void __dlm_insert_mle(struct dlm_ctxt *dlm, struct dlm_master_list_entry *mle)
{
struct hlist_head *bucket;
assert_spin_locked(&dlm->master_lock);
bucket = dlm_master_hash(dlm, mle->mnamehash);
hlist_add_head(&mle->master_hash_node, bucket);
}
/* returns 1 if found, 0 if not */
static int dlm_find_mle(struct dlm_ctxt *dlm,
struct dlm_master_list_entry **mle,
char *name, unsigned int namelen)
{
struct dlm_master_list_entry *tmpmle;
struct hlist_head *bucket;
unsigned int hash;
assert_spin_locked(&dlm->master_lock);
hash = dlm_lockid_hash(name, namelen);
bucket = dlm_master_hash(dlm, hash);
hlist_for_each_entry(tmpmle, bucket, master_hash_node) {
if (!dlm_mle_equal(dlm, tmpmle, name, namelen))
continue;
dlm_get_mle(tmpmle);
*mle = tmpmle;
return 1;
}
return 0;
}
void dlm_hb_event_notify_attached(struct dlm_ctxt *dlm, int idx, int node_up)
{
struct dlm_master_list_entry *mle;
assert_spin_locked(&dlm->spinlock);
list_for_each_entry(mle, &dlm->mle_hb_events, hb_events) {
if (node_up)
dlm_mle_node_up(dlm, mle, NULL, idx);
else
dlm_mle_node_down(dlm, mle, NULL, idx);
}
}
static void dlm_mle_node_down(struct dlm_ctxt *dlm,
struct dlm_master_list_entry *mle,
struct o2nm_node *node, int idx)
{
spin_lock(&mle->spinlock);
if (!test_bit(idx, mle->node_map))
mlog(0, "node %u already removed from nodemap!\n", idx);
else
clear_bit(idx, mle->node_map);
spin_unlock(&mle->spinlock);
}
static void dlm_mle_node_up(struct dlm_ctxt *dlm,
struct dlm_master_list_entry *mle,
struct o2nm_node *node, int idx)
{
spin_lock(&mle->spinlock);
if (test_bit(idx, mle->node_map))
mlog(0, "node %u already in node map!\n", idx);
else
set_bit(idx, mle->node_map);
spin_unlock(&mle->spinlock);
}
int dlm_init_mle_cache(void)
{
dlm_mle_cache = kmem_cache_create("o2dlm_mle",
sizeof(struct dlm_master_list_entry),
0, SLAB_HWCACHE_ALIGN,
NULL);
if (dlm_mle_cache == NULL)
return -ENOMEM;
return 0;
}
void dlm_destroy_mle_cache(void)
{
if (dlm_mle_cache)
kmem_cache_destroy(dlm_mle_cache);
}
static void dlm_mle_release(struct kref *kref)
{
struct dlm_master_list_entry *mle;
struct dlm_ctxt *dlm;
mle = container_of(kref, struct dlm_master_list_entry, mle_refs);
dlm = mle->dlm;
assert_spin_locked(&dlm->spinlock);
assert_spin_locked(&dlm->master_lock);
mlog(0, "Releasing mle for %.*s, type %d\n", mle->mnamelen, mle->mname,
mle->type);
/* remove from list if not already */
__dlm_unlink_mle(dlm, mle);
/* detach the mle from the domain node up/down events */
__dlm_mle_detach_hb_events(dlm, mle);
atomic_dec(&dlm->mle_cur_count[mle->type]);
/* NOTE: kfree under spinlock here.
* if this is bad, we can move this to a freelist. */
kmem_cache_free(dlm_mle_cache, mle);
}
/*
* LOCK RESOURCE FUNCTIONS
*/
int dlm_init_master_caches(void)
{
dlm_lockres_cache = kmem_cache_create("o2dlm_lockres",
sizeof(struct dlm_lock_resource),
0, SLAB_HWCACHE_ALIGN, NULL);
if (!dlm_lockres_cache)
goto bail;
dlm_lockname_cache = kmem_cache_create("o2dlm_lockname",
DLM_LOCKID_NAME_MAX, 0,
SLAB_HWCACHE_ALIGN, NULL);
if (!dlm_lockname_cache)
goto bail;
return 0;
bail:
dlm_destroy_master_caches();
return -ENOMEM;
}
void dlm_destroy_master_caches(void)
{
if (dlm_lockname_cache) {
kmem_cache_destroy(dlm_lockname_cache);
dlm_lockname_cache = NULL;
}
if (dlm_lockres_cache) {
kmem_cache_destroy(dlm_lockres_cache);
dlm_lockres_cache = NULL;
}
}
static void dlm_lockres_release(struct kref *kref)
{
struct dlm_lock_resource *res;
struct dlm_ctxt *dlm;
res = container_of(kref, struct dlm_lock_resource, refs);
dlm = res->dlm;
/* This should not happen -- all lockres' have a name
* associated with them at init time. */
BUG_ON(!res->lockname.name);
mlog(0, "destroying lockres %.*s\n", res->lockname.len,
res->lockname.name);
atomic_dec(&dlm->res_cur_count);
if (!hlist_unhashed(&res->hash_node) ||
!list_empty(&res->granted) ||
!list_empty(&res->converting) ||
!list_empty(&res->blocked) ||
!list_empty(&res->dirty) ||
!list_empty(&res->recovering) ||
!list_empty(&res->purge)) {
mlog(ML_ERROR,
"Going to BUG for resource %.*s."
" We're on a list! [%c%c%c%c%c%c%c]\n",
res->lockname.len, res->lockname.name,
!hlist_unhashed(&res->hash_node) ? 'H' : ' ',
!list_empty(&res->granted) ? 'G' : ' ',
!list_empty(&res->converting) ? 'C' : ' ',
!list_empty(&res->blocked) ? 'B' : ' ',
!list_empty(&res->dirty) ? 'D' : ' ',
!list_empty(&res->recovering) ? 'R' : ' ',
!list_empty(&res->purge) ? 'P' : ' ');
dlm_print_one_lock_resource(res);
}
/* By the time we're ready to blow this guy away, we shouldn't
* be on any lists. */
BUG_ON(!hlist_unhashed(&res->hash_node));
BUG_ON(!list_empty(&res->granted));
BUG_ON(!list_empty(&res->converting));
BUG_ON(!list_empty(&res->blocked));
BUG_ON(!list_empty(&res->dirty));
BUG_ON(!list_empty(&res->recovering));
BUG_ON(!list_empty(&res->purge));
kmem_cache_free(dlm_lockname_cache, (void *)res->lockname.name);
kmem_cache_free(dlm_lockres_cache, res);
}
void dlm_lockres_put(struct dlm_lock_resource *res)
{
kref_put(&res->refs, dlm_lockres_release);
}
static void dlm_init_lockres(struct dlm_ctxt *dlm,
struct dlm_lock_resource *res,
const char *name, unsigned int namelen)
{
char *qname;
/* If we memset here, we lose our reference to the kmalloc'd
* res->lockname.name, so be sure to init every field
* correctly! */
qname = (char *) res->lockname.name;
memcpy(qname, name, namelen);
res->lockname.len = namelen;
res->lockname.hash = dlm_lockid_hash(name, namelen);
init_waitqueue_head(&res->wq);
spin_lock_init(&res->spinlock);
INIT_HLIST_NODE(&res->hash_node);
INIT_LIST_HEAD(&res->granted);
INIT_LIST_HEAD(&res->converting);
INIT_LIST_HEAD(&res->blocked);
INIT_LIST_HEAD(&res->dirty);
INIT_LIST_HEAD(&res->recovering);
INIT_LIST_HEAD(&res->purge);
INIT_LIST_HEAD(&res->tracking);
atomic_set(&res->asts_reserved, 0);
res->migration_pending = 0;
res->inflight_locks = 0;
res->inflight_assert_workers = 0;
res->dlm = dlm;
kref_init(&res->refs);
atomic_inc(&dlm->res_tot_count);
atomic_inc(&dlm->res_cur_count);
/* just for consistency */
spin_lock(&res->spinlock);
dlm_set_lockres_owner(dlm, res, DLM_LOCK_RES_OWNER_UNKNOWN);
spin_unlock(&res->spinlock);
res->state = DLM_LOCK_RES_IN_PROGRESS;
res->last_used = 0;
spin_lock(&dlm->spinlock);
list_add_tail(&res->tracking, &dlm->tracking_list);
spin_unlock(&dlm->spinlock);
memset(res->lvb, 0, DLM_LVB_LEN);
memset(res->refmap, 0, sizeof(res->refmap));
}
struct dlm_lock_resource *dlm_new_lockres(struct dlm_ctxt *dlm,
const char *name,
unsigned int namelen)
{
struct dlm_lock_resource *res = NULL;
res = kmem_cache_zalloc(dlm_lockres_cache, GFP_NOFS);
if (!res)
goto error;
res->lockname.name = kmem_cache_zalloc(dlm_lockname_cache, GFP_NOFS);
if (!res->lockname.name)
goto error;
dlm_init_lockres(dlm, res, name, namelen);
return res;
error:
if (res)
kmem_cache_free(dlm_lockres_cache, res);
return NULL;
}
void dlm_lockres_set_refmap_bit(struct dlm_ctxt *dlm,
struct dlm_lock_resource *res, int bit)
{
assert_spin_locked(&res->spinlock);
mlog(0, "res %.*s, set node %u, %ps()\n", res->lockname.len,
res->lockname.name, bit, __builtin_return_address(0));
set_bit(bit, res->refmap);
}
void dlm_lockres_clear_refmap_bit(struct dlm_ctxt *dlm,
struct dlm_lock_resource *res, int bit)
{
assert_spin_locked(&res->spinlock);
mlog(0, "res %.*s, clr node %u, %ps()\n", res->lockname.len,
res->lockname.name, bit, __builtin_return_address(0));
clear_bit(bit, res->refmap);
}
static void __dlm_lockres_grab_inflight_ref(struct dlm_ctxt *dlm,
struct dlm_lock_resource *res)
{
res->inflight_locks++;
mlog(0, "%s: res %.*s, inflight++: now %u, %ps()\n", dlm->name,
res->lockname.len, res->lockname.name, res->inflight_locks,
__builtin_return_address(0));
}
void dlm_lockres_grab_inflight_ref(struct dlm_ctxt *dlm,
struct dlm_lock_resource *res)
{
assert_spin_locked(&res->spinlock);
__dlm_lockres_grab_inflight_ref(dlm, res);
}
void dlm_lockres_drop_inflight_ref(struct dlm_ctxt *dlm,
struct dlm_lock_resource *res)
{
assert_spin_locked(&res->spinlock);
BUG_ON(res->inflight_locks == 0);
res->inflight_locks--;
mlog(0, "%s: res %.*s, inflight--: now %u, %ps()\n", dlm->name,
res->lockname.len, res->lockname.name, res->inflight_locks,
__builtin_return_address(0));
wake_up(&res->wq);
}
void __dlm_lockres_grab_inflight_worker(struct dlm_ctxt *dlm,
struct dlm_lock_resource *res)
{
assert_spin_locked(&res->spinlock);
res->inflight_assert_workers++;
mlog(0, "%s:%.*s: inflight assert worker++: now %u\n",
dlm->name, res->lockname.len, res->lockname.name,
res->inflight_assert_workers);
}
static void __dlm_lockres_drop_inflight_worker(struct dlm_ctxt *dlm,
struct dlm_lock_resource *res)
{
assert_spin_locked(&res->spinlock);
BUG_ON(res->inflight_assert_workers == 0);
res->inflight_assert_workers--;
mlog(0, "%s:%.*s: inflight assert worker--: now %u\n",
dlm->name, res->lockname.len, res->lockname.name,
res->inflight_assert_workers);
}
static void dlm_lockres_drop_inflight_worker(struct dlm_ctxt *dlm,
struct dlm_lock_resource *res)
{
spin_lock(&res->spinlock);
__dlm_lockres_drop_inflight_worker(dlm, res);
spin_unlock(&res->spinlock);
}
/*
* lookup a lock resource by name.
* may already exist in the hashtable.
* lockid is null terminated
*
* if not, allocate enough for the lockres and for
* the temporary structure used in doing the mastering.
*
* also, do a lookup in the dlm->master_list to see
* if another node has begun mastering the same lock.
* if so, there should be a block entry in there
* for this name, and we should *not* attempt to master
* the lock here. need to wait around for that node
* to assert_master (or die).
*
*/
struct dlm_lock_resource * dlm_get_lock_resource(struct dlm_ctxt *dlm,
const char *lockid,
int namelen,
int flags)
{
struct dlm_lock_resource *tmpres=NULL, *res=NULL;
struct dlm_master_list_entry *mle = NULL;
struct dlm_master_list_entry *alloc_mle = NULL;
int blocked = 0;
int ret, nodenum;
struct dlm_node_iter iter;
unsigned int hash;
int tries = 0;
int bit, wait_on_recovery = 0;
BUG_ON(!lockid);
hash = dlm_lockid_hash(lockid, namelen);
mlog(0, "get lockres %s (len %d)\n", lockid, namelen);
lookup:
spin_lock(&dlm->spinlock);
tmpres = __dlm_lookup_lockres_full(dlm, lockid, namelen, hash);
if (tmpres) {
spin_unlock(&dlm->spinlock);
spin_lock(&tmpres->spinlock);
/*
* Right after dlm spinlock was released, dlm_thread could have
* purged the lockres. Check if lockres got unhashed. If so
* start over.
*/
if (hlist_unhashed(&tmpres->hash_node)) {
spin_unlock(&tmpres->spinlock);
dlm_lockres_put(tmpres);
tmpres = NULL;
goto lookup;
}
/* Wait on the thread that is mastering the resource */
if (tmpres->owner == DLM_LOCK_RES_OWNER_UNKNOWN) {
__dlm_wait_on_lockres(tmpres);
BUG_ON(tmpres->owner == DLM_LOCK_RES_OWNER_UNKNOWN);
spin_unlock(&tmpres->spinlock);
dlm_lockres_put(tmpres);
tmpres = NULL;
goto lookup;
}
/* Wait on the resource purge to complete before continuing */
if (tmpres->state & DLM_LOCK_RES_DROPPING_REF) {
BUG_ON(tmpres->owner == dlm->node_num);
__dlm_wait_on_lockres_flags(tmpres,
DLM_LOCK_RES_DROPPING_REF);
spin_unlock(&tmpres->spinlock);
dlm_lockres_put(tmpres);
tmpres = NULL;
goto lookup;
}
/* Grab inflight ref to pin the resource */
dlm_lockres_grab_inflight_ref(dlm, tmpres);
spin_unlock(&tmpres->spinlock);
if (res) {
spin_lock(&dlm->track_lock);
if (!list_empty(&res->tracking))
list_del_init(&res->tracking);
else
mlog(ML_ERROR, "Resource %.*s not "
"on the Tracking list\n",
res->lockname.len,
res->lockname.name);
spin_unlock(&dlm->track_lock);
dlm_lockres_put(res);
}
res = tmpres;
goto leave;
}
if (!res) {
spin_unlock(&dlm->spinlock);
mlog(0, "allocating a new resource\n");
/* nothing found and we need to allocate one. */
alloc_mle = kmem_cache_alloc(dlm_mle_cache, GFP_NOFS);
if (!alloc_mle)
goto leave;
res = dlm_new_lockres(dlm, lockid, namelen);
if (!res)
goto leave;
goto lookup;
}
mlog(0, "no lockres found, allocated our own: %p\n", res);
if (flags & LKM_LOCAL) {
/* caller knows it's safe to assume it's not mastered elsewhere
* DONE! return right away */
spin_lock(&res->spinlock);
dlm_change_lockres_owner(dlm, res, dlm->node_num);
__dlm_insert_lockres(dlm, res);
dlm_lockres_grab_inflight_ref(dlm, res);
spin_unlock(&res->spinlock);
spin_unlock(&dlm->spinlock);
/* lockres still marked IN_PROGRESS */
goto wake_waiters;
}
/* check master list to see if another node has started mastering it */
spin_lock(&dlm->master_lock);
/* if we found a block, wait for lock to be mastered by another node */
blocked = dlm_find_mle(dlm, &mle, (char *)lockid, namelen);
if (blocked) {
int mig;
if (mle->type == DLM_MLE_MASTER) {
mlog(ML_ERROR, "master entry for nonexistent lock!\n");
BUG();
}
mig = (mle->type == DLM_MLE_MIGRATION);
/* if there is a migration in progress, let the migration
* finish before continuing. we can wait for the absence
* of the MIGRATION mle: either the migrate finished or
* one of the nodes died and the mle was cleaned up.
* if there is a BLOCK here, but it already has a master
* set, we are too late. the master does not have a ref
* for us in the refmap. detach the mle and drop it.
* either way, go back to the top and start over. */
if (mig || mle->master != O2NM_MAX_NODES) {
BUG_ON(mig && mle->master == dlm->node_num);
/* we arrived too late. the master does not
* have a ref for us. retry. */
mlog(0, "%s:%.*s: late on %s\n",
dlm->name, namelen, lockid,
mig ? "MIGRATION" : "BLOCK");
spin_unlock(&dlm->master_lock);
spin_unlock(&dlm->spinlock);
/* master is known, detach */
if (!mig)
dlm_mle_detach_hb_events(dlm, mle);
dlm_put_mle(mle);
mle = NULL;
/* this is lame, but we can't wait on either
* the mle or lockres waitqueue here */
if (mig)
msleep(100);
goto lookup;
}
} else {
/* go ahead and try to master lock on this node */
mle = alloc_mle;
/* make sure this does not get freed below */
alloc_mle = NULL;
dlm_init_mle(mle, DLM_MLE_MASTER, dlm, res, NULL, 0);
set_bit(dlm->node_num, mle->maybe_map);
__dlm_insert_mle(dlm, mle);
/* still holding the dlm spinlock, check the recovery map
* to see if there are any nodes that still need to be
* considered. these will not appear in the mle nodemap
* but they might own this lockres. wait on them. */
bit = find_next_bit(dlm->recovery_map, O2NM_MAX_NODES, 0);
if (bit < O2NM_MAX_NODES) {
mlog(0, "%s: res %.*s, At least one node (%d) "
"to recover before lock mastery can begin\n",
dlm->name, namelen, (char *)lockid, bit);
wait_on_recovery = 1;
}
}
/* at this point there is either a DLM_MLE_BLOCK or a
* DLM_MLE_MASTER on the master list, so it's safe to add the
* lockres to the hashtable. anyone who finds the lock will
* still have to wait on the IN_PROGRESS. */
/* finally add the lockres to its hash bucket */
__dlm_insert_lockres(dlm, res);
/* since this lockres is new it doesn't not require the spinlock */
__dlm_lockres_grab_inflight_ref(dlm, res);
/* get an extra ref on the mle in case this is a BLOCK
* if so, the creator of the BLOCK may try to put the last
* ref at this time in the assert master handler, so we
* need an extra one to keep from a bad ptr deref. */
dlm_get_mle_inuse(mle);
spin_unlock(&dlm->master_lock);
spin_unlock(&dlm->spinlock);
redo_request:
while (wait_on_recovery) {
/* any cluster changes that occurred after dropping the
* dlm spinlock would be detectable be a change on the mle,
* so we only need to clear out the recovery map once. */
if (dlm_is_recovery_lock(lockid, namelen)) {
mlog(0, "%s: Recovery map is not empty, but must "
"master $RECOVERY lock now\n", dlm->name);
if (!dlm_pre_master_reco_lockres(dlm, res))
wait_on_recovery = 0;
else {
mlog(0, "%s: waiting 500ms for heartbeat state "
"change\n", dlm->name);
msleep(500);
}
continue;
}
dlm_kick_recovery_thread(dlm);
msleep(1000);
dlm_wait_for_recovery(dlm);
spin_lock(&dlm->spinlock);
bit = find_next_bit(dlm->recovery_map, O2NM_MAX_NODES, 0);
if (bit < O2NM_MAX_NODES) {
mlog(0, "%s: res %.*s, At least one node (%d) "
"to recover before lock mastery can begin\n",
dlm->name, namelen, (char *)lockid, bit);
wait_on_recovery = 1;
} else
wait_on_recovery = 0;
spin_unlock(&dlm->spinlock);
if (wait_on_recovery)
dlm_wait_for_node_recovery(dlm, bit, 10000);
}
/* must wait for lock to be mastered elsewhere */
if (blocked)
goto wait;
ret = -EINVAL;
dlm_node_iter_init(mle->vote_map, &iter);
while ((nodenum = dlm_node_iter_next(&iter)) >= 0) {
ret = dlm_do_master_request(res, mle, nodenum);
if (ret < 0)
mlog_errno(ret);
if (mle->master != O2NM_MAX_NODES) {
/* found a master ! */
if (mle->master <= nodenum)
break;
/* if our master request has not reached the master
* yet, keep going until it does. this is how the
* master will know that asserts are needed back to
* the lower nodes. */
mlog(0, "%s: res %.*s, Requests only up to %u but "
"master is %u, keep going\n", dlm->name, namelen,
lockid, nodenum, mle->master);
}
}
wait:
/* keep going until the response map includes all nodes */
ret = dlm_wait_for_lock_mastery(dlm, res, mle, &blocked);
if (ret < 0) {
wait_on_recovery = 1;
mlog(0, "%s: res %.*s, Node map changed, redo the master "
"request now, blocked=%d\n", dlm->name, res->lockname.len,
res->lockname.name, blocked);
if (++tries > 20) {
mlog(ML_ERROR, "%s: res %.*s, Spinning on "
"dlm_wait_for_lock_mastery, blocked = %d\n",
dlm->name, res->lockname.len,
res->lockname.name, blocked);
dlm_print_one_lock_resource(res);
dlm_print_one_mle(mle);
tries = 0;
}
goto redo_request;
}
mlog(0, "%s: res %.*s, Mastered by %u\n", dlm->name, res->lockname.len,
res->lockname.name, res->owner);
/* make sure we never continue without this */
BUG_ON(res->owner == O2NM_MAX_NODES);
/* master is known, detach if not already detached */
dlm_mle_detach_hb_events(dlm, mle);
dlm_put_mle(mle);
/* put the extra ref */
dlm_put_mle_inuse(mle);
wake_waiters:
spin_lock(&res->spinlock);
res->state &= ~DLM_LOCK_RES_IN_PROGRESS;
spin_unlock(&res->spinlock);
wake_up(&res->wq);
leave:
/* need to free the unused mle */
if (alloc_mle)
kmem_cache_free(dlm_mle_cache, alloc_mle);
return res;
}
#define DLM_MASTERY_TIMEOUT_MS 5000
static int dlm_wait_for_lock_mastery(struct dlm_ctxt *dlm,
struct dlm_lock_resource *res,
struct dlm_master_list_entry *mle,
int *blocked)
{
u8 m;
int ret, bit;
int map_changed, voting_done;
int assert, sleep;
recheck:
ret = 0;
assert = 0;
/* check if another node has already become the owner */
spin_lock(&res->spinlock);
if (res->owner != DLM_LOCK_RES_OWNER_UNKNOWN) {
mlog(0, "%s:%.*s: owner is suddenly %u\n", dlm->name,
res->lockname.len, res->lockname.name, res->owner);
spin_unlock(&res->spinlock);
/* this will cause the master to re-assert across
* the whole cluster, freeing up mles */
if (res->owner != dlm->node_num) {
ret = dlm_do_master_request(res, mle, res->owner);
if (ret < 0) {
/* give recovery a chance to run */
mlog(ML_ERROR, "link to %u went down?: %d\n", res->owner, ret);
msleep(500);
goto recheck;
}
}
ret = 0;
goto leave;
}
spin_unlock(&res->spinlock);
spin_lock(&mle->spinlock);
m = mle->master;
map_changed = (memcmp(mle->vote_map, mle->node_map,
sizeof(mle->vote_map)) != 0);
voting_done = (memcmp(mle->vote_map, mle->response_map,
sizeof(mle->vote_map)) == 0);
/* restart if we hit any errors */
if (map_changed) {
int b;
mlog(0, "%s: %.*s: node map changed, restarting\n",
dlm->name, res->lockname.len, res->lockname.name);
ret = dlm_restart_lock_mastery(dlm, res, mle, *blocked);
b = (mle->type == DLM_MLE_BLOCK);
if ((*blocked && !b) || (!*blocked && b)) {
mlog(0, "%s:%.*s: status change: old=%d new=%d\n",
dlm->name, res->lockname.len, res->lockname.name,
*blocked, b);
*blocked = b;
}
spin_unlock(&mle->spinlock);
if (ret < 0) {
mlog_errno(ret);
goto leave;
}
mlog(0, "%s:%.*s: restart lock mastery succeeded, "
"rechecking now\n", dlm->name, res->lockname.len,
res->lockname.name);
goto recheck;
} else {
if (!voting_done) {
mlog(0, "map not changed and voting not done "
"for %s:%.*s\n", dlm->name, res->lockname.len,
res->lockname.name);
}
}
if (m != O2NM_MAX_NODES) {
/* another node has done an assert!
* all done! */
sleep = 0;
} else {
sleep = 1;
/* have all nodes responded? */
if (voting_done && !*blocked) {
bit = find_next_bit(mle->maybe_map, O2NM_MAX_NODES, 0);
if (dlm->node_num <= bit) {
/* my node number is lowest.
* now tell other nodes that I am
* mastering this. */
mle->master = dlm->node_num;
/* ref was grabbed in get_lock_resource
* will be dropped in dlmlock_master */
assert = 1;
sleep = 0;
}
/* if voting is done, but we have not received
* an assert master yet, we must sleep */
}
}
spin_unlock(&mle->spinlock);
/* sleep if we haven't finished voting yet */
if (sleep) {
unsigned long timeo = msecs_to_jiffies(DLM_MASTERY_TIMEOUT_MS);
atomic_set(&mle->woken, 0);
(void)wait_event_timeout(mle->wq,
(atomic_read(&mle->woken) == 1),
timeo);
if (res->owner == O2NM_MAX_NODES) {
mlog(0, "%s:%.*s: waiting again\n", dlm->name,
res->lockname.len, res->lockname.name);
goto recheck;
}
mlog(0, "done waiting, master is %u\n", res->owner);
ret = 0;
goto leave;
}
ret = 0; /* done */
if (assert) {
m = dlm->node_num;
mlog(0, "about to master %.*s here, this=%u\n",
res->lockname.len, res->lockname.name, m);
ret = dlm_do_assert_master(dlm, res, mle->vote_map, 0);
if (ret) {
/* This is a failure in the network path,
* not in the response to the assert_master
* (any nonzero response is a BUG on this node).
* Most likely a socket just got disconnected
* due to node death. */
mlog_errno(ret);
}
/* no longer need to restart lock mastery.
* all living nodes have been contacted. */
ret = 0;
}
/* set the lockres owner */
spin_lock(&res->spinlock);
/* mastery reference obtained either during
* assert_master_handler or in get_lock_resource */
dlm_change_lockres_owner(dlm, res, m);
spin_unlock(&res->spinlock);
leave:
return ret;
}
struct dlm_bitmap_diff_iter
{
int curnode;
unsigned long *orig_bm;
unsigned long *cur_bm;
unsigned long diff_bm[BITS_TO_LONGS(O2NM_MAX_NODES)];
};
enum dlm_node_state_change
{
NODE_DOWN = -1,
NODE_NO_CHANGE = 0,
NODE_UP
};
static void dlm_bitmap_diff_iter_init(struct dlm_bitmap_diff_iter *iter,
unsigned long *orig_bm,
unsigned long *cur_bm)
{
unsigned long p1, p2;
int i;
iter->curnode = -1;
iter->orig_bm = orig_bm;
iter->cur_bm = cur_bm;
for (i = 0; i < BITS_TO_LONGS(O2NM_MAX_NODES); i++) {
p1 = *(iter->orig_bm + i);
p2 = *(iter->cur_bm + i);
iter->diff_bm[i] = (p1 & ~p2) | (p2 & ~p1);
}
}
static int dlm_bitmap_diff_iter_next(struct dlm_bitmap_diff_iter *iter,
enum dlm_node_state_change *state)
{
int bit;
if (iter->curnode >= O2NM_MAX_NODES)
return -ENOENT;
bit = find_next_bit(iter->diff_bm, O2NM_MAX_NODES,
iter->curnode+1);
if (bit >= O2NM_MAX_NODES) {
iter->curnode = O2NM_MAX_NODES;
return -ENOENT;
}
/* if it was there in the original then this node died */
if (test_bit(bit, iter->orig_bm))
*state = NODE_DOWN;
else
*state = NODE_UP;
iter->curnode = bit;
return bit;
}
static int dlm_restart_lock_mastery(struct dlm_ctxt *dlm,
struct dlm_lock_resource *res,
struct dlm_master_list_entry *mle,
int blocked)
{
struct dlm_bitmap_diff_iter bdi;
enum dlm_node_state_change sc;
int node;
int ret = 0;
mlog(0, "something happened such that the "
"master process may need to be restarted!\n");
assert_spin_locked(&mle->spinlock);
dlm_bitmap_diff_iter_init(&bdi, mle->vote_map, mle->node_map);
node = dlm_bitmap_diff_iter_next(&bdi, &sc);
while (node >= 0) {
if (sc == NODE_UP) {
/* a node came up. clear any old vote from
* the response map and set it in the vote map
* then restart the mastery. */
mlog(ML_NOTICE, "node %d up while restarting\n", node);
/* redo the master request, but only for the new node */
mlog(0, "sending request to new node\n");
clear_bit(node, mle->response_map);
set_bit(node, mle->vote_map);
} else {
mlog(ML_ERROR, "node down! %d\n", node);
if (blocked) {
int lowest = find_next_bit(mle->maybe_map,
O2NM_MAX_NODES, 0);
/* act like it was never there */
clear_bit(node, mle->maybe_map);
if (node == lowest) {
mlog(0, "expected master %u died"
" while this node was blocked "
"waiting on it!\n", node);
lowest = find_next_bit(mle->maybe_map,
O2NM_MAX_NODES,
lowest+1);
if (lowest < O2NM_MAX_NODES) {
mlog(0, "%s:%.*s:still "
"blocked. waiting on %u "
"now\n", dlm->name,
res->lockname.len,
res->lockname.name,
lowest);
} else {
/* mle is an MLE_BLOCK, but
* there is now nothing left to
* block on. we need to return
* all the way back out and try
* again with an MLE_MASTER.
* dlm_do_local_recovery_cleanup
* has already run, so the mle
* refcount is ok */
mlog(0, "%s:%.*s: no "
"longer blocking. try to "
"master this here\n",
dlm->name,
res->lockname.len,
res->lockname.name);
mle->type = DLM_MLE_MASTER;
mle->mleres = res;
}
}
}
/* now blank out everything, as if we had never
* contacted anyone */
memset(mle->maybe_map, 0, sizeof(mle->maybe_map));
memset(mle->response_map, 0, sizeof(mle->response_map));
/* reset the vote_map to the current node_map */
memcpy(mle->vote_map, mle->node_map,
sizeof(mle->node_map));
/* put myself into the maybe map */
if (mle->type != DLM_MLE_BLOCK)
set_bit(dlm->node_num, mle->maybe_map);
}
ret = -EAGAIN;
node = dlm_bitmap_diff_iter_next(&bdi, &sc);
}
return ret;
}
/*
* DLM_MASTER_REQUEST_MSG
*
* returns: 0 on success,
* -errno on a network error
*
* on error, the caller should assume the target node is "dead"
*
*/
static int dlm_do_master_request(struct dlm_lock_resource *res,
struct dlm_master_list_entry *mle, int to)
{
struct dlm_ctxt *dlm = mle->dlm;
struct dlm_master_request request;
int ret, response=0, resend;
memset(&request, 0, sizeof(request));
request.node_idx = dlm->node_num;
BUG_ON(mle->type == DLM_MLE_MIGRATION);
request.namelen = (u8)mle->mnamelen;
memcpy(request.name, mle->mname, request.namelen);
again:
ret = o2net_send_message(DLM_MASTER_REQUEST_MSG, dlm->key, &request,
sizeof(request), to, &response);
if (ret < 0) {
if (ret == -ESRCH) {
/* should never happen */
mlog(ML_ERROR, "TCP stack not ready!\n");
BUG();
} else if (ret == -EINVAL) {
mlog(ML_ERROR, "bad args passed to o2net!\n");
BUG();
} else if (ret == -ENOMEM) {
mlog(ML_ERROR, "out of memory while trying to send "
"network message! retrying\n");
/* this is totally crude */
msleep(50);
goto again;
} else if (!dlm_is_host_down(ret)) {
/* not a network error. bad. */
mlog_errno(ret);
mlog(ML_ERROR, "unhandled error!");
BUG();
}
/* all other errors should be network errors,
* and likely indicate node death */
mlog(ML_ERROR, "link to %d went down!\n", to);
goto out;
}
ret = 0;
resend = 0;
spin_lock(&mle->spinlock);
switch (response) {
case DLM_MASTER_RESP_YES:
set_bit(to, mle->response_map);
mlog(0, "node %u is the master, response=YES\n", to);
mlog(0, "%s:%.*s: master node %u now knows I have a "
"reference\n", dlm->name, res->lockname.len,
res->lockname.name, to);
mle->master = to;
break;
case DLM_MASTER_RESP_NO:
mlog(0, "node %u not master, response=NO\n", to);
set_bit(to, mle->response_map);
break;
case DLM_MASTER_RESP_MAYBE:
mlog(0, "node %u not master, response=MAYBE\n", to);
set_bit(to, mle->response_map);
set_bit(to, mle->maybe_map);
break;
case DLM_MASTER_RESP_ERROR:
mlog(0, "node %u hit an error, resending\n", to);
resend = 1;
response = 0;
break;
default:
mlog(ML_ERROR, "bad response! %u\n", response);
BUG();
}
spin_unlock(&mle->spinlock);
if (resend) {
/* this is also totally crude */
msleep(50);
goto again;
}
out:
return ret;
}
/*
* locks that can be taken here:
* dlm->spinlock
* res->spinlock
* mle->spinlock
* dlm->master_list
*
* if possible, TRIM THIS DOWN!!!
*/
int dlm_master_request_handler(struct o2net_msg *msg, u32 len, void *data,
void **ret_data)
{
u8 response = DLM_MASTER_RESP_MAYBE;
struct dlm_ctxt *dlm = data;
struct dlm_lock_resource *res = NULL;
struct dlm_master_request *request = (struct dlm_master_request *) msg->buf;
struct dlm_master_list_entry *mle = NULL, *tmpmle = NULL;
char *name;
unsigned int namelen, hash;
int found, ret;
int set_maybe;
int dispatch_assert = 0;
int dispatched = 0;
if (!dlm_grab(dlm))
return DLM_MASTER_RESP_NO;
if (!dlm_domain_fully_joined(dlm)) {
response = DLM_MASTER_RESP_NO;
goto send_response;
}
name = request->name;
namelen = request->namelen;
hash = dlm_lockid_hash(name, namelen);
if (namelen > DLM_LOCKID_NAME_MAX) {
response = DLM_IVBUFLEN;
goto send_response;
}
way_up_top:
spin_lock(&dlm->spinlock);
res = __dlm_lookup_lockres(dlm, name, namelen, hash);
if (res) {
spin_unlock(&dlm->spinlock);
/* take care of the easy cases up front */
spin_lock(&res->spinlock);
/*
* Right after dlm spinlock was released, dlm_thread could have
* purged the lockres. Check if lockres got unhashed. If so
* start over.
*/
if (hlist_unhashed(&res->hash_node)) {
spin_unlock(&res->spinlock);
dlm_lockres_put(res);
goto way_up_top;
}
if (res->state & (DLM_LOCK_RES_RECOVERING|
DLM_LOCK_RES_MIGRATING)) {
spin_unlock(&res->spinlock);
mlog(0, "returning DLM_MASTER_RESP_ERROR since res is "
"being recovered/migrated\n");
response = DLM_MASTER_RESP_ERROR;
if (mle)
kmem_cache_free(dlm_mle_cache, mle);
goto send_response;
}
if (res->owner == dlm->node_num) {
dlm_lockres_set_refmap_bit(dlm, res, request->node_idx);
spin_unlock(&res->spinlock);
response = DLM_MASTER_RESP_YES;
if (mle)
kmem_cache_free(dlm_mle_cache, mle);
/* this node is the owner.
* there is some extra work that needs to
* happen now. the requesting node has
* caused all nodes up to this one to
* create mles. this node now needs to
* go back and clean those up. */
dispatch_assert = 1;
goto send_response;
} else if (res->owner != DLM_LOCK_RES_OWNER_UNKNOWN) {
spin_unlock(&res->spinlock);
// mlog(0, "node %u is the master\n", res->owner);
response = DLM_MASTER_RESP_NO;
if (mle)
kmem_cache_free(dlm_mle_cache, mle);
goto send_response;
}
/* ok, there is no owner. either this node is
* being blocked, or it is actively trying to
* master this lock. */
if (!(res->state & DLM_LOCK_RES_IN_PROGRESS)) {
mlog(ML_ERROR, "lock with no owner should be "
"in-progress!\n");
BUG();
}
// mlog(0, "lockres is in progress...\n");
spin_lock(&dlm->master_lock);
found = dlm_find_mle(dlm, &tmpmle, name, namelen);
if (!found) {
mlog(ML_ERROR, "no mle found for this lock!\n");
BUG();
}
set_maybe = 1;
spin_lock(&tmpmle->spinlock);
if (tmpmle->type == DLM_MLE_BLOCK) {
// mlog(0, "this node is waiting for "
// "lockres to be mastered\n");
response = DLM_MASTER_RESP_NO;
} else if (tmpmle->type == DLM_MLE_MIGRATION) {
mlog(0, "node %u is master, but trying to migrate to "
"node %u.\n", tmpmle->master, tmpmle->new_master);
if (tmpmle->master == dlm->node_num) {
mlog(ML_ERROR, "no owner on lockres, but this "
"node is trying to migrate it to %u?!\n",
tmpmle->new_master);
BUG();
} else {
/* the real master can respond on its own */
response = DLM_MASTER_RESP_NO;
}
} else if (tmpmle->master != DLM_LOCK_RES_OWNER_UNKNOWN) {
set_maybe = 0;
if (tmpmle->master == dlm->node_num) {
response = DLM_MASTER_RESP_YES;
/* this node will be the owner.
* go back and clean the mles on any
* other nodes */
dispatch_assert = 1;
dlm_lockres_set_refmap_bit(dlm, res,
request->node_idx);
} else
response = DLM_MASTER_RESP_NO;
} else {
// mlog(0, "this node is attempting to "
// "master lockres\n");
response = DLM_MASTER_RESP_MAYBE;
}
if (set_maybe)
set_bit(request->node_idx, tmpmle->maybe_map);
spin_unlock(&tmpmle->spinlock);
spin_unlock(&dlm->master_lock);
spin_unlock(&res->spinlock);
/* keep the mle attached to heartbeat events */
dlm_put_mle(tmpmle);
if (mle)
kmem_cache_free(dlm_mle_cache, mle);
goto send_response;
}
/*
* lockres doesn't exist on this node
* if there is an MLE_BLOCK, return NO
* if there is an MLE_MASTER, return MAYBE
* otherwise, add an MLE_BLOCK, return NO
*/
spin_lock(&dlm->master_lock);
found = dlm_find_mle(dlm, &tmpmle, name, namelen);
if (!found) {
/* this lockid has never been seen on this node yet */
// mlog(0, "no mle found\n");
if (!mle) {
spin_unlock(&dlm->master_lock);
spin_unlock(&dlm->spinlock);
mle = kmem_cache_alloc(dlm_mle_cache, GFP_NOFS);
if (!mle) {
response = DLM_MASTER_RESP_ERROR;
mlog_errno(-ENOMEM);
goto send_response;
}
goto way_up_top;
}
// mlog(0, "this is second time thru, already allocated, "
// "add the block.\n");
dlm_init_mle(mle, DLM_MLE_BLOCK, dlm, NULL, name, namelen);
set_bit(request->node_idx, mle->maybe_map);
__dlm_insert_mle(dlm, mle);
response = DLM_MASTER_RESP_NO;
} else {
spin_lock(&tmpmle->spinlock);
if (tmpmle->master == dlm->node_num) {
mlog(ML_ERROR, "no lockres, but an mle with this node as master!\n");
BUG();
}
if (tmpmle->type == DLM_MLE_BLOCK)
response = DLM_MASTER_RESP_NO;
else if (tmpmle->type == DLM_MLE_MIGRATION) {
mlog(0, "migration mle was found (%u->%u)\n",
tmpmle->master, tmpmle->new_master);
/* real master can respond on its own */
response = DLM_MASTER_RESP_NO;
} else
response = DLM_MASTER_RESP_MAYBE;
set_bit(request->node_idx, tmpmle->maybe_map);
spin_unlock(&tmpmle->spinlock);
}
spin_unlock(&dlm->master_lock);
spin_unlock(&dlm->spinlock);
if (found) {
/* keep the mle attached to heartbeat events */
dlm_put_mle(tmpmle);
}
send_response:
/*
* __dlm_lookup_lockres() grabbed a reference to this lockres.
* The reference is released by dlm_assert_master_worker() under
* the call to dlm_dispatch_assert_master(). If
* dlm_assert_master_worker() isn't called, we drop it here.
*/
if (dispatch_assert) {
mlog(0, "%u is the owner of %.*s, cleaning everyone else\n",
dlm->node_num, res->lockname.len, res->lockname.name);
spin_lock(&res->spinlock);
ret = dlm_dispatch_assert_master(dlm, res, 0, request->node_idx,
DLM_ASSERT_MASTER_MLE_CLEANUP);
if (ret < 0) {
mlog(ML_ERROR, "failed to dispatch assert master work\n");
response = DLM_MASTER_RESP_ERROR;
spin_unlock(&res->spinlock);
dlm_lockres_put(res);
} else {
dispatched = 1;
__dlm_lockres_grab_inflight_worker(dlm, res);
spin_unlock(&res->spinlock);
}
} else {
if (res)
dlm_lockres_put(res);
}
if (!dispatched)
dlm_put(dlm);
return response;
}
/*
* DLM_ASSERT_MASTER_MSG
*/
/*
* NOTE: this can be used for debugging
* can periodically run all locks owned by this node
* and re-assert across the cluster...
*/
static int dlm_do_assert_master(struct dlm_ctxt *dlm,
struct dlm_lock_resource *res,
void *nodemap, u32 flags)
{
struct dlm_assert_master assert;
int to, tmpret;
struct dlm_node_iter iter;
int ret = 0;
int reassert;
const char *lockname = res->lockname.name;
unsigned int namelen = res->lockname.len;
BUG_ON(namelen > O2NM_MAX_NAME_LEN);
spin_lock(&res->spinlock);
res->state |= DLM_LOCK_RES_SETREF_INPROG;
spin_unlock(&res->spinlock);
again:
reassert = 0;
/* note that if this nodemap is empty, it returns 0 */
dlm_node_iter_init(nodemap, &iter);
while ((to = dlm_node_iter_next(&iter)) >= 0) {
int r = 0;
struct dlm_master_list_entry *mle = NULL;
mlog(0, "sending assert master to %d (%.*s)\n", to,
namelen, lockname);
memset(&assert, 0, sizeof(assert));
assert.node_idx = dlm->node_num;
assert.namelen = namelen;
memcpy(assert.name, lockname, namelen);
assert.flags = cpu_to_be32(flags);
tmpret = o2net_send_message(DLM_ASSERT_MASTER_MSG, dlm->key,
&assert, sizeof(assert), to, &r);
if (tmpret < 0) {
mlog(ML_ERROR, "Error %d when sending message %u (key "
"0x%x) to node %u\n", tmpret,
DLM_ASSERT_MASTER_MSG, dlm->key, to);
if (!dlm_is_host_down(tmpret)) {
mlog(ML_ERROR, "unhandled error=%d!\n", tmpret);
BUG();
}
/* a node died. finish out the rest of the nodes. */
mlog(0, "link to %d went down!\n", to);
/* any nonzero status return will do */
ret = tmpret;
r = 0;
} else if (r < 0) {
/* ok, something horribly messed. kill thyself. */
mlog(ML_ERROR,"during assert master of %.*s to %u, "
"got %d.\n", namelen, lockname, to, r);
spin_lock(&dlm->spinlock);
spin_lock(&dlm->master_lock);
if (dlm_find_mle(dlm, &mle, (char *)lockname,
namelen)) {
dlm_print_one_mle(mle);
__dlm_put_mle(mle);
}
spin_unlock(&dlm->master_lock);
spin_unlock(&dlm->spinlock);
BUG();
}
if (r & DLM_ASSERT_RESPONSE_REASSERT &&
!(r & DLM_ASSERT_RESPONSE_MASTERY_REF)) {
mlog(ML_ERROR, "%.*s: very strange, "
"master MLE but no lockres on %u\n",
namelen, lockname, to);
}
if (r & DLM_ASSERT_RESPONSE_REASSERT) {
mlog(0, "%.*s: node %u create mles on other "
"nodes and requests a re-assert\n",
namelen, lockname, to);
reassert = 1;
}
if (r & DLM_ASSERT_RESPONSE_MASTERY_REF) {
mlog(0, "%.*s: node %u has a reference to this "
"lockres, set the bit in the refmap\n",
namelen, lockname, to);
spin_lock(&res->spinlock);
dlm_lockres_set_refmap_bit(dlm, res, to);
spin_unlock(&res->spinlock);
}
}
if (reassert)
goto again;
spin_lock(&res->spinlock);
res->state &= ~DLM_LOCK_RES_SETREF_INPROG;
spin_unlock(&res->spinlock);
wake_up(&res->wq);
return ret;
}
/*
* locks that can be taken here:
* dlm->spinlock
* res->spinlock
* mle->spinlock
* dlm->master_list
*
* if possible, TRIM THIS DOWN!!!
*/
int dlm_assert_master_handler(struct o2net_msg *msg, u32 len, void *data,
void **ret_data)
{
struct dlm_ctxt *dlm = data;
struct dlm_master_list_entry *mle = NULL;
struct dlm_assert_master *assert = (struct dlm_assert_master *)msg->buf;
struct dlm_lock_resource *res = NULL;
char *name;
unsigned int namelen, hash;
u32 flags;
int master_request = 0, have_lockres_ref = 0;
int ret = 0;
if (!dlm_grab(dlm))
return 0;
name = assert->name;
namelen = assert->namelen;
hash = dlm_lockid_hash(name, namelen);
flags = be32_to_cpu(assert->flags);
if (namelen > DLM_LOCKID_NAME_MAX) {
mlog(ML_ERROR, "Invalid name length!");
goto done;
}
spin_lock(&dlm->spinlock);
if (flags)
mlog(0, "assert_master with flags: %u\n", flags);
/* find the MLE */
spin_lock(&dlm->master_lock);
if (!dlm_find_mle(dlm, &mle, name, namelen)) {
/* not an error, could be master just re-asserting */
mlog(0, "just got an assert_master from %u, but no "
"MLE for it! (%.*s)\n", assert->node_idx,
namelen, name);
} else {
int bit = find_next_bit (mle->maybe_map, O2NM_MAX_NODES, 0);
if (bit >= O2NM_MAX_NODES) {
/* not necessarily an error, though less likely.
* could be master just re-asserting. */
mlog(0, "no bits set in the maybe_map, but %u "
"is asserting! (%.*s)\n", assert->node_idx,
namelen, name);
} else if (bit != assert->node_idx) {
if (flags & DLM_ASSERT_MASTER_MLE_CLEANUP) {
mlog(0, "master %u was found, %u should "
"back off\n", assert->node_idx, bit);
} else {
/* with the fix for bug 569, a higher node
* number winning the mastery will respond
* YES to mastery requests, but this node
* had no way of knowing. let it pass. */
mlog(0, "%u is the lowest node, "
"%u is asserting. (%.*s) %u must "
"have begun after %u won.\n", bit,
assert->node_idx, namelen, name, bit,
assert->node_idx);
}
}
if (mle->type == DLM_MLE_MIGRATION) {
if (flags & DLM_ASSERT_MASTER_MLE_CLEANUP) {
mlog(0, "%s:%.*s: got cleanup assert"
" from %u for migration\n",
dlm->name, namelen, name,
assert->node_idx);
} else if (!(flags & DLM_ASSERT_MASTER_FINISH_MIGRATION)) {
mlog(0, "%s:%.*s: got unrelated assert"
" from %u for migration, ignoring\n",
dlm->name, namelen, name,
assert->node_idx);
__dlm_put_mle(mle);
spin_unlock(&dlm->master_lock);
spin_unlock(&dlm->spinlock);
goto done;
}
}
}
spin_unlock(&dlm->master_lock);
/* ok everything checks out with the MLE
* now check to see if there is a lockres */
res = __dlm_lookup_lockres(dlm, name, namelen, hash);
if (res) {
spin_lock(&res->spinlock);
if (res->state & DLM_LOCK_RES_RECOVERING) {
mlog(ML_ERROR, "%u asserting but %.*s is "
"RECOVERING!\n", assert->node_idx, namelen, name);
goto kill;
}
if (!mle) {
if (res->owner != DLM_LOCK_RES_OWNER_UNKNOWN &&
res->owner != assert->node_idx) {
mlog(ML_ERROR, "DIE! Mastery assert from %u, "
"but current owner is %u! (%.*s)\n",
assert->node_idx, res->owner, namelen,
name);
__dlm_print_one_lock_resource(res);
BUG();
}
} else if (mle->type != DLM_MLE_MIGRATION) {
if (res->owner != DLM_LOCK_RES_OWNER_UNKNOWN) {
/* owner is just re-asserting */
if (res->owner == assert->node_idx) {
mlog(0, "owner %u re-asserting on "
"lock %.*s\n", assert->node_idx,
namelen, name);
goto ok;
}
mlog(ML_ERROR, "got assert_master from "
"node %u, but %u is the owner! "
"(%.*s)\n", assert->node_idx,
res->owner, namelen, name);
goto kill;
}
if (!(res->state & DLM_LOCK_RES_IN_PROGRESS)) {
mlog(ML_ERROR, "got assert from %u, but lock "
"with no owner should be "
"in-progress! (%.*s)\n",
assert->node_idx,
namelen, name);
goto kill;
}
} else /* mle->type == DLM_MLE_MIGRATION */ {
/* should only be getting an assert from new master */
if (assert->node_idx != mle->new_master) {
mlog(ML_ERROR, "got assert from %u, but "
"new master is %u, and old master "
"was %u (%.*s)\n",
assert->node_idx, mle->new_master,
mle->master, namelen, name);
goto kill;
}
}
ok:
spin_unlock(&res->spinlock);
}
// mlog(0, "woo! got an assert_master from node %u!\n",
// assert->node_idx);
if (mle) {
int extra_ref = 0;
int nn = -1;
int rr, err = 0;
spin_lock(&mle->spinlock);
if (mle->type == DLM_MLE_BLOCK || mle->type == DLM_MLE_MIGRATION)
extra_ref = 1;
else {
/* MASTER mle: if any bits set in the response map
* then the calling node needs to re-assert to clear
* up nodes that this node contacted */
while ((nn = find_next_bit (mle->response_map, O2NM_MAX_NODES,
nn+1)) < O2NM_MAX_NODES) {
if (nn != dlm->node_num && nn != assert->node_idx) {
master_request = 1;
break;
}
}
}
mle->master = assert->node_idx;
atomic_set(&mle->woken, 1);
wake_up(&mle->wq);
spin_unlock(&mle->spinlock);
if (res) {
int wake = 0;
spin_lock(&res->spinlock);
if (mle->type == DLM_MLE_MIGRATION) {
mlog(0, "finishing off migration of lockres %.*s, "
"from %u to %u\n",
res->lockname.len, res->lockname.name,
dlm->node_num, mle->new_master);
res->state &= ~DLM_LOCK_RES_MIGRATING;
wake = 1;
dlm_change_lockres_owner(dlm, res, mle->new_master);
BUG_ON(res->state & DLM_LOCK_RES_DIRTY);
} else {
dlm_change_lockres_owner(dlm, res, mle->master);
}
spin_unlock(&res->spinlock);
have_lockres_ref = 1;
if (wake)
wake_up(&res->wq);
}
/* master is known, detach if not already detached.
* ensures that only one assert_master call will happen
* on this mle. */
spin_lock(&dlm->master_lock);
rr = kref_read(&mle->mle_refs);
if (mle->inuse > 0) {
if (extra_ref && rr < 3)
err = 1;
else if (!extra_ref && rr < 2)
err = 1;
} else {
if (extra_ref && rr < 2)
err = 1;
else if (!extra_ref && rr < 1)
err = 1;
}
if (err) {
mlog(ML_ERROR, "%s:%.*s: got assert master from %u "
"that will mess up this node, refs=%d, extra=%d, "
"inuse=%d\n", dlm->name, namelen, name,
assert->node_idx, rr, extra_ref, mle->inuse);
dlm_print_one_mle(mle);
}
__dlm_unlink_mle(dlm, mle);
__dlm_mle_detach_hb_events(dlm, mle);
__dlm_put_mle(mle);
if (extra_ref) {
/* the assert master message now balances the extra
* ref given by the master / migration request message.
* if this is the last put, it will be removed
* from the list. */
__dlm_put_mle(mle);
}
spin_unlock(&dlm->master_lock);
} else if (res) {
if (res->owner != assert->node_idx) {
mlog(0, "assert_master from %u, but current "
"owner is %u (%.*s), no mle\n", assert->node_idx,
res->owner, namelen, name);
}
}
spin_unlock(&dlm->spinlock);
done:
ret = 0;
if (res) {
spin_lock(&res->spinlock);
res->state |= DLM_LOCK_RES_SETREF_INPROG;
spin_unlock(&res->spinlock);
*ret_data = (void *)res;
}
dlm_put(dlm);
if (master_request) {
mlog(0, "need to tell master to reassert\n");
/* positive. negative would shoot down the node. */
ret |= DLM_ASSERT_RESPONSE_REASSERT;
if (!have_lockres_ref) {
mlog(ML_ERROR, "strange, got assert from %u, MASTER "
"mle present here for %s:%.*s, but no lockres!\n",
assert->node_idx, dlm->name, namelen, name);
}
}
if (have_lockres_ref) {
/* let the master know we have a reference to the lockres */
ret |= DLM_ASSERT_RESPONSE_MASTERY_REF;
mlog(0, "%s:%.*s: got assert from %u, need a ref\n",
dlm->name, namelen, name, assert->node_idx);
}
return ret;
kill:
/* kill the caller! */
mlog(ML_ERROR, "Bad message received from another node. Dumping state "
"and killing the other node now! This node is OK and can continue.\n");
__dlm_print_one_lock_resource(res);
spin_unlock(&res->spinlock);
spin_lock(&dlm->master_lock);
if (mle)
__dlm_put_mle(mle);
spin_unlock(&dlm->master_lock);
spin_unlock(&dlm->spinlock);
*ret_data = (void *)res;
dlm_put(dlm);
return -EINVAL;
}
void dlm_assert_master_post_handler(int status, void *data, void *ret_data)
{
struct dlm_lock_resource *res = (struct dlm_lock_resource *)ret_data;
if (ret_data) {
spin_lock(&res->spinlock);
res->state &= ~DLM_LOCK_RES_SETREF_INPROG;
spin_unlock(&res->spinlock);
wake_up(&res->wq);
dlm_lockres_put(res);
}
return;
}
int dlm_dispatch_assert_master(struct dlm_ctxt *dlm,
struct dlm_lock_resource *res,
int ignore_higher, u8 request_from, u32 flags)
{
struct dlm_work_item *item;
item = kzalloc(sizeof(*item), GFP_ATOMIC);
if (!item)
return -ENOMEM;
/* queue up work for dlm_assert_master_worker */
dlm_init_work_item(dlm, item, dlm_assert_master_worker, NULL);
item->u.am.lockres = res; /* already have a ref */
/* can optionally ignore node numbers higher than this node */
item->u.am.ignore_higher = ignore_higher;
item->u.am.request_from = request_from;
item->u.am.flags = flags;
if (ignore_higher)
mlog(0, "IGNORE HIGHER: %.*s\n", res->lockname.len,
res->lockname.name);
spin_lock(&dlm->work_lock);
list_add_tail(&item->list, &dlm->work_list);
spin_unlock(&dlm->work_lock);
queue_work(dlm->dlm_worker, &dlm->dispatched_work);
return 0;
}
static void dlm_assert_master_worker(struct dlm_work_item *item, void *data)
{
struct dlm_ctxt *dlm = data;
int ret = 0;
struct dlm_lock_resource *res;
unsigned long nodemap[BITS_TO_LONGS(O2NM_MAX_NODES)];
int ignore_higher;
int bit;
u8 request_from;
u32 flags;
dlm = item->dlm;
res = item->u.am.lockres;
ignore_higher = item->u.am.ignore_higher;
request_from = item->u.am.request_from;
flags = item->u.am.flags;
spin_lock(&dlm->spinlock);
memcpy(nodemap, dlm->domain_map, sizeof(nodemap));
spin_unlock(&dlm->spinlock);
clear_bit(dlm->node_num, nodemap);
if (ignore_higher) {
/* if is this just to clear up mles for nodes below
* this node, do not send the message to the original
* caller or any node number higher than this */
clear_bit(request_from, nodemap);
bit = dlm->node_num;
while (1) {
bit = find_next_bit(nodemap, O2NM_MAX_NODES,
bit+1);
if (bit >= O2NM_MAX_NODES)
break;
clear_bit(bit, nodemap);
}
}
/*
* If we're migrating this lock to someone else, we are no
* longer allowed to assert out own mastery. OTOH, we need to
* prevent migration from starting while we're still asserting
* our dominance. The reserved ast delays migration.
*/
spin_lock(&res->spinlock);
if (res->state & DLM_LOCK_RES_MIGRATING) {
mlog(0, "Someone asked us to assert mastery, but we're "
"in the middle of migration. Skipping assert, "
"the new master will handle that.\n");
spin_unlock(&res->spinlock);
goto put;
} else
__dlm_lockres_reserve_ast(res);
spin_unlock(&res->spinlock);
/* this call now finishes out the nodemap
* even if one or more nodes die */
mlog(0, "worker about to master %.*s here, this=%u\n",
res->lockname.len, res->lockname.name, dlm->node_num);
ret = dlm_do_assert_master(dlm, res, nodemap, flags);
if (ret < 0) {
/* no need to restart, we are done */
if (!dlm_is_host_down(ret))
mlog_errno(ret);
}
/* Ok, we've asserted ourselves. Let's let migration start. */
dlm_lockres_release_ast(dlm, res);
put:
dlm_lockres_drop_inflight_worker(dlm, res);
dlm_lockres_put(res);
mlog(0, "finished with dlm_assert_master_worker\n");
}
/* SPECIAL CASE for the $RECOVERY lock used by the recovery thread.
* We cannot wait for node recovery to complete to begin mastering this
* lockres because this lockres is used to kick off recovery! ;-)
* So, do a pre-check on all living nodes to see if any of those nodes
* think that $RECOVERY is currently mastered by a dead node. If so,
* we wait a short time to allow that node to get notified by its own
* heartbeat stack, then check again. All $RECOVERY lock resources
* mastered by dead nodes are purged when the hearbeat callback is
* fired, so we can know for sure that it is safe to continue once
* the node returns a live node or no node. */
static int dlm_pre_master_reco_lockres(struct dlm_ctxt *dlm,
struct dlm_lock_resource *res)
{
struct dlm_node_iter iter;
int nodenum;
int ret = 0;
u8 master = DLM_LOCK_RES_OWNER_UNKNOWN;
spin_lock(&dlm->spinlock);
dlm_node_iter_init(dlm->domain_map, &iter);
spin_unlock(&dlm->spinlock);
while ((nodenum = dlm_node_iter_next(&iter)) >= 0) {
/* do not send to self */
if (nodenum == dlm->node_num)
continue;
ret = dlm_do_master_requery(dlm, res, nodenum, &master);
if (ret < 0) {
mlog_errno(ret);
if (!dlm_is_host_down(ret))
BUG();
/* host is down, so answer for that node would be
* DLM_LOCK_RES_OWNER_UNKNOWN. continue. */
ret = 0;
}
if (master != DLM_LOCK_RES_OWNER_UNKNOWN) {
/* check to see if this master is in the recovery map */
spin_lock(&dlm->spinlock);
if (test_bit(master, dlm->recovery_map)) {
mlog(ML_NOTICE, "%s: node %u has not seen "
"node %u go down yet, and thinks the "
"dead node is mastering the recovery "
"lock. must wait.\n", dlm->name,
nodenum, master);
ret = -EAGAIN;
}
spin_unlock(&dlm->spinlock);
mlog(0, "%s: reco lock master is %u\n", dlm->name,
master);
break;
}
}
return ret;
}
/*
* DLM_DEREF_LOCKRES_MSG
*/
int dlm_drop_lockres_ref(struct dlm_ctxt *dlm, struct dlm_lock_resource *res)
{
struct dlm_deref_lockres deref;
int ret = 0, r;
const char *lockname;
unsigned int namelen;
lockname = res->lockname.name;
namelen = res->lockname.len;
BUG_ON(namelen > O2NM_MAX_NAME_LEN);
memset(&deref, 0, sizeof(deref));
deref.node_idx = dlm->node_num;
deref.namelen = namelen;
memcpy(deref.name, lockname, namelen);
ret = o2net_send_message(DLM_DEREF_LOCKRES_MSG, dlm->key,
&deref, sizeof(deref), res->owner, &r);
if (ret < 0)
mlog(ML_ERROR, "%s: res %.*s, error %d send DEREF to node %u\n",
dlm->name, namelen, lockname, ret, res->owner);
else if (r < 0) {
/* BAD. other node says I did not have a ref. */
mlog(ML_ERROR, "%s: res %.*s, DEREF to node %u got %d\n",
dlm->name, namelen, lockname, res->owner, r);
dlm_print_one_lock_resource(res);
if (r == -ENOMEM)
BUG();
} else
ret = r;
return ret;
}
int dlm_deref_lockres_handler(struct o2net_msg *msg, u32 len, void *data,
void **ret_data)
{
struct dlm_ctxt *dlm = data;
struct dlm_deref_lockres *deref = (struct dlm_deref_lockres *)msg->buf;
struct dlm_lock_resource *res = NULL;
char *name;
unsigned int namelen;
int ret = -EINVAL;
u8 node;
unsigned int hash;
struct dlm_work_item *item;
int cleared = 0;
int dispatch = 0;
if (!dlm_grab(dlm))
return 0;
name = deref->name;
namelen = deref->namelen;
node = deref->node_idx;
if (namelen > DLM_LOCKID_NAME_MAX) {
mlog(ML_ERROR, "Invalid name length!");
goto done;
}
if (deref->node_idx >= O2NM_MAX_NODES) {
mlog(ML_ERROR, "Invalid node number: %u\n", node);
goto done;
}
hash = dlm_lockid_hash(name, namelen);
spin_lock(&dlm->spinlock);
res = __dlm_lookup_lockres_full(dlm, name, namelen, hash);
if (!res) {
spin_unlock(&dlm->spinlock);
mlog(ML_ERROR, "%s:%.*s: bad lockres name\n",
dlm->name, namelen, name);
goto done;
}
spin_unlock(&dlm->spinlock);
spin_lock(&res->spinlock);
if (res->state & DLM_LOCK_RES_SETREF_INPROG)
dispatch = 1;
else {
BUG_ON(res->state & DLM_LOCK_RES_DROPPING_REF);
if (test_bit(node, res->refmap)) {
dlm_lockres_clear_refmap_bit(dlm, res, node);
cleared = 1;
}
}
spin_unlock(&res->spinlock);
if (!dispatch) {
if (cleared)
dlm_lockres_calc_usage(dlm, res);
else {
mlog(ML_ERROR, "%s:%.*s: node %u trying to drop ref "
"but it is already dropped!\n", dlm->name,
res->lockname.len, res->lockname.name, node);
dlm_print_one_lock_resource(res);
}
ret = DLM_DEREF_RESPONSE_DONE;
goto done;
}
item = kzalloc(sizeof(*item), GFP_NOFS);
if (!item) {
ret = -ENOMEM;
mlog_errno(ret);
goto done;
}
dlm_init_work_item(dlm, item, dlm_deref_lockres_worker, NULL);
item->u.dl.deref_res = res;
item->u.dl.deref_node = node;
spin_lock(&dlm->work_lock);
list_add_tail(&item->list, &dlm->work_list);
spin_unlock(&dlm->work_lock);
queue_work(dlm->dlm_worker, &dlm->dispatched_work);
return DLM_DEREF_RESPONSE_INPROG;
done:
if (res)
dlm_lockres_put(res);
dlm_put(dlm);
return ret;
}
int dlm_deref_lockres_done_handler(struct o2net_msg *msg, u32 len, void *data,
void **ret_data)
{
struct dlm_ctxt *dlm = data;
struct dlm_deref_lockres_done *deref
= (struct dlm_deref_lockres_done *)msg->buf;
struct dlm_lock_resource *res = NULL;
char *name;
unsigned int namelen;
int ret = -EINVAL;
u8 node;
unsigned int hash;
if (!dlm_grab(dlm))
return 0;
name = deref->name;
namelen = deref->namelen;
node = deref->node_idx;
if (namelen > DLM_LOCKID_NAME_MAX) {
mlog(ML_ERROR, "Invalid name length!");
goto done;
}
if (deref->node_idx >= O2NM_MAX_NODES) {
mlog(ML_ERROR, "Invalid node number: %u\n", node);
goto done;
}
hash = dlm_lockid_hash(name, namelen);
spin_lock(&dlm->spinlock);
res = __dlm_lookup_lockres_full(dlm, name, namelen, hash);
if (!res) {
spin_unlock(&dlm->spinlock);
mlog(ML_ERROR, "%s:%.*s: bad lockres name\n",
dlm->name, namelen, name);
goto done;
}
spin_lock(&res->spinlock);
ocfs2/dlm: disable BUG_ON when DLM_LOCK_RES_DROPPING_REF is cleared before dlm_deref_lockres_done_handler We found a BUG situation in which DLM_LOCK_RES_DROPPING_REF is cleared unexpected that described below. To solve the bug, we disable the BUG_ON and purge lockres in dlm_do_local_recovery_cleanup. Node 1 Node 2(master) dlm_purge_lockres dlm_deref_lockres_handler DLM_LOCK_RES_SETREF_INPROG is set response DLM_DEREF_RESPONSE_INPROG receive DLM_DEREF_RESPONSE_INPROG stop puring in dlm_purge_lockres and wait for DLM_DEREF_RESPONSE_DONE dispatch dlm_deref_lockres_worker response DLM_DEREF_RESPONSE_DONE receive DLM_DEREF_RESPONSE_DONE and prepare to purge lockres Node 2 goes down find Node2 down and do local clean up for Node2: dlm_do_local_recovery_cleanup -> clear DLM_LOCK_RES_DROPPING_REF when purging lockres, BUG_ON happens because DLM_LOCK_RES_DROPPING_REF is clear: dlm_deref_lockres_done_handler ->BUG_ON(!(res->state & DLM_LOCK_RES_DROPPING_REF)); [akpm@linux-foundation.org: fix duplicated write to `ret'] Fixes: 60d663cb5273 ("ocfs2/dlm: add DEREF_DONE message") Link: http://lkml.kernel.org/r/57845055.9080702@huawei.com Signed-off-by: Jun Piao <piaojun@huawei.com> Reviewed-by: Joseph Qi <joseph.qi@huawei.com> Reviewed-by: Jiufei Xue <xuejiufei@huawei.com> Reviewed-by: Mark Fasheh <mfasheh@suse.de> Cc: Joel Becker <jlbec@evilplan.org> Cc: Junxiao Bi <junxiao.bi@oracle.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-08-03 05:02:13 +08:00
if (!(res->state & DLM_LOCK_RES_DROPPING_REF)) {
spin_unlock(&res->spinlock);
spin_unlock(&dlm->spinlock);
mlog(ML_NOTICE, "%s:%.*s: node %u sends deref done "
"but it is already derefed!\n", dlm->name,
res->lockname.len, res->lockname.name, node);
ret = 0;
goto done;
}
__dlm_do_purge_lockres(dlm, res);
spin_unlock(&res->spinlock);
wake_up(&res->wq);
spin_unlock(&dlm->spinlock);
ret = 0;
done:
if (res)
dlm_lockres_put(res);
dlm_put(dlm);
return ret;
}
static void dlm_drop_lockres_ref_done(struct dlm_ctxt *dlm,
struct dlm_lock_resource *res, u8 node)
{
struct dlm_deref_lockres_done deref;
int ret = 0, r;
const char *lockname;
unsigned int namelen;
lockname = res->lockname.name;
namelen = res->lockname.len;
BUG_ON(namelen > O2NM_MAX_NAME_LEN);
memset(&deref, 0, sizeof(deref));
deref.node_idx = dlm->node_num;
deref.namelen = namelen;
memcpy(deref.name, lockname, namelen);
ret = o2net_send_message(DLM_DEREF_LOCKRES_DONE, dlm->key,
&deref, sizeof(deref), node, &r);
if (ret < 0) {
mlog(ML_ERROR, "%s: res %.*s, error %d send DEREF DONE "
" to node %u\n", dlm->name, namelen,
lockname, ret, node);
} else if (r < 0) {
/* ignore the error */
mlog(ML_ERROR, "%s: res %.*s, DEREF to node %u got %d\n",
dlm->name, namelen, lockname, node, r);
dlm_print_one_lock_resource(res);
}
}
static void dlm_deref_lockres_worker(struct dlm_work_item *item, void *data)
{
struct dlm_ctxt *dlm;
struct dlm_lock_resource *res;
u8 node;
u8 cleared = 0;
dlm = item->dlm;
res = item->u.dl.deref_res;
node = item->u.dl.deref_node;
spin_lock(&res->spinlock);
BUG_ON(res->state & DLM_LOCK_RES_DROPPING_REF);
__dlm_wait_on_lockres_flags(res, DLM_LOCK_RES_SETREF_INPROG);
if (test_bit(node, res->refmap)) {
dlm_lockres_clear_refmap_bit(dlm, res, node);
cleared = 1;
}
spin_unlock(&res->spinlock);
dlm_drop_lockres_ref_done(dlm, res, node);
if (cleared) {
mlog(0, "%s:%.*s node %u ref dropped in dispatch\n",
dlm->name, res->lockname.len, res->lockname.name, node);
dlm_lockres_calc_usage(dlm, res);
} else {
mlog(ML_ERROR, "%s:%.*s: node %u trying to drop ref "
"but it is already dropped!\n", dlm->name,
res->lockname.len, res->lockname.name, node);
dlm_print_one_lock_resource(res);
}
dlm_lockres_put(res);
}
/*
* A migrateable resource is one that is :
* 1. locally mastered, and,
* 2. zero local locks, and,
* 3. one or more non-local locks, or, one or more references
* Returns 1 if yes, 0 if not.
*/
static int dlm_is_lockres_migrateable(struct dlm_ctxt *dlm,
struct dlm_lock_resource *res)
{
enum dlm_lockres_list idx;
int nonlocal = 0, node_ref;
struct list_head *queue;
struct dlm_lock *lock;
u64 cookie;
assert_spin_locked(&res->spinlock);
/* delay migration when the lockres is in MIGRATING state */
if (res->state & DLM_LOCK_RES_MIGRATING)
return 0;
/* delay migration when the lockres is in RECOCERING state */
ocfs2: fix a tiny race that leads file system read-only when o2hb detect a node down, it first set the dead node to recovery map and create ocfs2rec which will replay journal for dead node. o2hb thread then call dlm_do_local_recovery_cleanup() to delete the lock for dead node. After the lock of dead node is gone, locks for other nodes can be granted and may modify the meta data without replaying journal of the dead node. The detail is described as follows. N1 N2 N3(master) modify the extent tree of inode, and commit dirty metadata to journal, then goes down. o2hb thread detects N1 goes down, set recovery map and delete the lock of N1. dlm_thread flush ast for the lock of N2. do not detect the death of N1, so recovery map is empty. read inode from disk without replaying the journal of N1 and modify the extent tree of the inode that N1 had modified. ocfs2rec recover the journal of N1. The modification of N2 is lost. The modification of N1 and N2 are not serial, and it will lead to read-only file system. We can set recovery_waiting flag to the lock resource after delete the lock for dead node to prevent other node from getting the lock before dlm recovery. After dlm recovery, the recovery map on N2 is not empty, ocfs2_inode_lock_full_nested() will wait for ocfs2 recovery. Signed-off-by: Jiufei Xue <xuejiufei@huawei.com> Reviewed-by: Joseph Qi <joseph.qi@huawei.com> Cc: Mark Fasheh <mfasheh@suse.de> Cc: Joel Becker <jlbec@evilplan.org> Cc: Junxiao Bi <junxiao.bi@oracle.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-03-16 05:53:20 +08:00
if (res->state & (DLM_LOCK_RES_RECOVERING|
DLM_LOCK_RES_RECOVERY_WAITING))
return 0;
if (res->owner != dlm->node_num)
return 0;
for (idx = DLM_GRANTED_LIST; idx <= DLM_BLOCKED_LIST; idx++) {
queue = dlm_list_idx_to_ptr(res, idx);
list_for_each_entry(lock, queue, list) {
if (lock->ml.node != dlm->node_num) {
nonlocal++;
continue;
}
cookie = be64_to_cpu(lock->ml.cookie);
mlog(0, "%s: Not migrateable res %.*s, lock %u:%llu on "
"%s list\n", dlm->name, res->lockname.len,
res->lockname.name,
dlm_get_lock_cookie_node(cookie),
dlm_get_lock_cookie_seq(cookie),
dlm_list_in_text(idx));
return 0;
}
}
if (!nonlocal) {
node_ref = find_next_bit(res->refmap, O2NM_MAX_NODES, 0);
if (node_ref >= O2NM_MAX_NODES)
return 0;
}
mlog(0, "%s: res %.*s, Migrateable\n", dlm->name, res->lockname.len,
res->lockname.name);
return 1;
}
/*
* DLM_MIGRATE_LOCKRES
*/
static int dlm_migrate_lockres(struct dlm_ctxt *dlm,
struct dlm_lock_resource *res, u8 target)
{
struct dlm_master_list_entry *mle = NULL;
struct dlm_master_list_entry *oldmle = NULL;
struct dlm_migratable_lockres *mres = NULL;
int ret = 0;
const char *name;
unsigned int namelen;
int mle_added = 0;
int wake = 0;
if (!dlm_grab(dlm))
return -EINVAL;
BUG_ON(target == O2NM_MAX_NODES);
name = res->lockname.name;
namelen = res->lockname.len;
mlog(0, "%s: Migrating %.*s to node %u\n", dlm->name, namelen, name,
target);
/* preallocate up front. if this fails, abort */
ret = -ENOMEM;
mres = (struct dlm_migratable_lockres *) __get_free_page(GFP_NOFS);
if (!mres) {
mlog_errno(ret);
goto leave;
}
mle = kmem_cache_alloc(dlm_mle_cache, GFP_NOFS);
if (!mle) {
mlog_errno(ret);
goto leave;
}
ret = 0;
/*
* clear any existing master requests and
* add the migration mle to the list
*/
spin_lock(&dlm->spinlock);
spin_lock(&dlm->master_lock);
ret = dlm_add_migration_mle(dlm, res, mle, &oldmle, name,
namelen, target, dlm->node_num);
/* get an extra reference on the mle.
* otherwise the assert_master from the new
* master will destroy this.
*/
if (ret != -EEXIST)
dlm_get_mle_inuse(mle);
spin_unlock(&dlm->master_lock);
spin_unlock(&dlm->spinlock);
if (ret == -EEXIST) {
mlog(0, "another process is already migrating it\n");
goto fail;
}
mle_added = 1;
/*
* set the MIGRATING flag and flush asts
* if we fail after this we need to re-dirty the lockres
*/
if (dlm_mark_lockres_migrating(dlm, res, target) < 0) {
mlog(ML_ERROR, "tried to migrate %.*s to %u, but "
"the target went down.\n", res->lockname.len,
res->lockname.name, target);
spin_lock(&res->spinlock);
res->state &= ~DLM_LOCK_RES_MIGRATING;
wake = 1;
spin_unlock(&res->spinlock);
ret = -EINVAL;
}
fail:
if (ret != -EEXIST && oldmle) {
/* master is known, detach if not already detached */
dlm_mle_detach_hb_events(dlm, oldmle);
dlm_put_mle(oldmle);
}
if (ret < 0) {
if (mle_added) {
dlm_mle_detach_hb_events(dlm, mle);
dlm_put_mle(mle);
dlm_put_mle_inuse(mle);
} else if (mle) {
kmem_cache_free(dlm_mle_cache, mle);
mle = NULL;
}
goto leave;
}
/*
* at this point, we have a migration target, an mle
* in the master list, and the MIGRATING flag set on
* the lockres
*/
/* now that remote nodes are spinning on the MIGRATING flag,
* ensure that all assert_master work is flushed. */
flush_workqueue(dlm->dlm_worker);
/* notify new node and send all lock state */
/* call send_one_lockres with migration flag.
* this serves as notice to the target node that a
* migration is starting. */
ret = dlm_send_one_lockres(dlm, res, mres, target,
DLM_MRES_MIGRATION);
if (ret < 0) {
mlog(0, "migration to node %u failed with %d\n",
target, ret);
/* migration failed, detach and clean up mle */
dlm_mle_detach_hb_events(dlm, mle);
dlm_put_mle(mle);
dlm_put_mle_inuse(mle);
spin_lock(&res->spinlock);
res->state &= ~DLM_LOCK_RES_MIGRATING;
wake = 1;
spin_unlock(&res->spinlock);
if (dlm_is_host_down(ret))
dlm_wait_for_node_death(dlm, target,
DLM_NODE_DEATH_WAIT_MAX);
goto leave;
}
/* at this point, the target sends a message to all nodes,
* (using dlm_do_migrate_request). this node is skipped since
* we had to put an mle in the list to begin the process. this
* node now waits for target to do an assert master. this node
* will be the last one notified, ensuring that the migration
* is complete everywhere. if the target dies while this is
* going on, some nodes could potentially see the target as the
* master, so it is important that my recovery finds the migration
* mle and sets the master to UNKNOWN. */
/* wait for new node to assert master */
while (1) {
ret = wait_event_interruptible_timeout(mle->wq,
(atomic_read(&mle->woken) == 1),
msecs_to_jiffies(5000));
if (ret >= 0) {
if (atomic_read(&mle->woken) == 1 ||
res->owner == target)
break;
mlog(0, "%s:%.*s: timed out during migration\n",
dlm->name, res->lockname.len, res->lockname.name);
/* avoid hang during shutdown when migrating lockres
* to a node which also goes down */
if (dlm_is_node_dead(dlm, target)) {
mlog(0, "%s:%.*s: expected migration "
"target %u is no longer up, restarting\n",
dlm->name, res->lockname.len,
res->lockname.name, target);
ret = -EINVAL;
/* migration failed, detach and clean up mle */
dlm_mle_detach_hb_events(dlm, mle);
dlm_put_mle(mle);
dlm_put_mle_inuse(mle);
spin_lock(&res->spinlock);
res->state &= ~DLM_LOCK_RES_MIGRATING;
wake = 1;
spin_unlock(&res->spinlock);
goto leave;
}
} else
mlog(0, "%s:%.*s: caught signal during migration\n",
dlm->name, res->lockname.len, res->lockname.name);
}
/* all done, set the owner, clear the flag */
spin_lock(&res->spinlock);
dlm_set_lockres_owner(dlm, res, target);
res->state &= ~DLM_LOCK_RES_MIGRATING;
dlm_remove_nonlocal_locks(dlm, res);
spin_unlock(&res->spinlock);
wake_up(&res->wq);
/* master is known, detach if not already detached */
dlm_mle_detach_hb_events(dlm, mle);
dlm_put_mle_inuse(mle);
ret = 0;
dlm_lockres_calc_usage(dlm, res);
leave:
/* re-dirty the lockres if we failed */
if (ret < 0)
dlm_kick_thread(dlm, res);
/* wake up waiters if the MIGRATING flag got set
* but migration failed */
if (wake)
wake_up(&res->wq);
if (mres)
free_page((unsigned long)mres);
dlm_put(dlm);
mlog(0, "%s: Migrating %.*s to %u, returns %d\n", dlm->name, namelen,
name, target, ret);
return ret;
}
#define DLM_MIGRATION_RETRY_MS 100
/*
* Should be called only after beginning the domain leave process.
* There should not be any remaining locks on nonlocal lock resources,
* and there should be no local locks left on locally mastered resources.
*
* Called with the dlm spinlock held, may drop it to do migration, but
* will re-acquire before exit.
*
* Returns: 1 if dlm->spinlock was dropped/retaken, 0 if never dropped
*/
int dlm_empty_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res)
{
int ret;
int lock_dropped = 0;
u8 target = O2NM_MAX_NODES;
assert_spin_locked(&dlm->spinlock);
spin_lock(&res->spinlock);
if (dlm_is_lockres_migrateable(dlm, res))
target = dlm_pick_migration_target(dlm, res);
spin_unlock(&res->spinlock);
if (target == O2NM_MAX_NODES)
goto leave;
/* Wheee! Migrate lockres here! Will sleep so drop spinlock. */
spin_unlock(&dlm->spinlock);
lock_dropped = 1;
ret = dlm_migrate_lockres(dlm, res, target);
if (ret)
mlog(0, "%s: res %.*s, Migrate to node %u failed with %d\n",
dlm->name, res->lockname.len, res->lockname.name,
target, ret);
spin_lock(&dlm->spinlock);
leave:
return lock_dropped;
}
int dlm_lock_basts_flushed(struct dlm_ctxt *dlm, struct dlm_lock *lock)
{
int ret;
spin_lock(&dlm->ast_lock);
spin_lock(&lock->spinlock);
ret = (list_empty(&lock->bast_list) && !lock->bast_pending);
spin_unlock(&lock->spinlock);
spin_unlock(&dlm->ast_lock);
return ret;
}
static int dlm_migration_can_proceed(struct dlm_ctxt *dlm,
struct dlm_lock_resource *res,
u8 mig_target)
{
int can_proceed;
spin_lock(&res->spinlock);
can_proceed = !!(res->state & DLM_LOCK_RES_MIGRATING);
spin_unlock(&res->spinlock);
/* target has died, so make the caller break out of the
* wait_event, but caller must recheck the domain_map */
spin_lock(&dlm->spinlock);
if (!test_bit(mig_target, dlm->domain_map))
can_proceed = 1;
spin_unlock(&dlm->spinlock);
return can_proceed;
}
static int dlm_lockres_is_dirty(struct dlm_ctxt *dlm,
struct dlm_lock_resource *res)
{
int ret;
spin_lock(&res->spinlock);
ret = !!(res->state & DLM_LOCK_RES_DIRTY);
spin_unlock(&res->spinlock);
return ret;
}
static int dlm_mark_lockres_migrating(struct dlm_ctxt *dlm,
struct dlm_lock_resource *res,
u8 target)
{
int ret = 0;
mlog(0, "dlm_mark_lockres_migrating: %.*s, from %u to %u\n",
res->lockname.len, res->lockname.name, dlm->node_num,
target);
/* need to set MIGRATING flag on lockres. this is done by
* ensuring that all asts have been flushed for this lockres. */
spin_lock(&res->spinlock);
BUG_ON(res->migration_pending);
res->migration_pending = 1;
/* strategy is to reserve an extra ast then release
* it below, letting the release do all of the work */
__dlm_lockres_reserve_ast(res);
spin_unlock(&res->spinlock);
/* now flush all the pending asts */
dlm_kick_thread(dlm, res);
/* before waiting on DIRTY, block processes which may
* try to dirty the lockres before MIGRATING is set */
spin_lock(&res->spinlock);
BUG_ON(res->state & DLM_LOCK_RES_BLOCK_DIRTY);
res->state |= DLM_LOCK_RES_BLOCK_DIRTY;
spin_unlock(&res->spinlock);
/* now wait on any pending asts and the DIRTY state */
wait_event(dlm->ast_wq, !dlm_lockres_is_dirty(dlm, res));
dlm_lockres_release_ast(dlm, res);
mlog(0, "about to wait on migration_wq, dirty=%s\n",
res->state & DLM_LOCK_RES_DIRTY ? "yes" : "no");
/* if the extra ref we just put was the final one, this
* will pass thru immediately. otherwise, we need to wait
* for the last ast to finish. */
again:
ret = wait_event_interruptible_timeout(dlm->migration_wq,
dlm_migration_can_proceed(dlm, res, target),
msecs_to_jiffies(1000));
if (ret < 0) {
mlog(0, "woken again: migrating? %s, dead? %s\n",
res->state & DLM_LOCK_RES_MIGRATING ? "yes":"no",
test_bit(target, dlm->domain_map) ? "no":"yes");
} else {
mlog(0, "all is well: migrating? %s, dead? %s\n",
res->state & DLM_LOCK_RES_MIGRATING ? "yes":"no",
test_bit(target, dlm->domain_map) ? "no":"yes");
}
if (!dlm_migration_can_proceed(dlm, res, target)) {
mlog(0, "trying again...\n");
goto again;
}
ret = 0;
/* did the target go down or die? */
spin_lock(&dlm->spinlock);
if (!test_bit(target, dlm->domain_map)) {
mlog(ML_ERROR, "aha. migration target %u just went down\n",
target);
ret = -EHOSTDOWN;
}
spin_unlock(&dlm->spinlock);
/*
* if target is down, we need to clear DLM_LOCK_RES_BLOCK_DIRTY for
* another try; otherwise, we are sure the MIGRATING state is there,
* drop the unneeded state which blocked threads trying to DIRTY
*/
spin_lock(&res->spinlock);
BUG_ON(!(res->state & DLM_LOCK_RES_BLOCK_DIRTY));
res->state &= ~DLM_LOCK_RES_BLOCK_DIRTY;
if (!ret)
BUG_ON(!(res->state & DLM_LOCK_RES_MIGRATING));
else
res->migration_pending = 0;
spin_unlock(&res->spinlock);
/*
* at this point:
*
* o the DLM_LOCK_RES_MIGRATING flag is set if target not down
* o there are no pending asts on this lockres
* o all processes trying to reserve an ast on this
* lockres must wait for the MIGRATING flag to clear
*/
return ret;
}
/* last step in the migration process.
* original master calls this to free all of the dlm_lock
* structures that used to be for other nodes. */
static void dlm_remove_nonlocal_locks(struct dlm_ctxt *dlm,
struct dlm_lock_resource *res)
{
struct list_head *queue = &res->granted;
int i, bit;
struct dlm_lock *lock, *next;
assert_spin_locked(&res->spinlock);
BUG_ON(res->owner == dlm->node_num);
for (i=0; i<3; i++) {
list_for_each_entry_safe(lock, next, queue, list) {
if (lock->ml.node != dlm->node_num) {
mlog(0, "putting lock for node %u\n",
lock->ml.node);
/* be extra careful */
BUG_ON(!list_empty(&lock->ast_list));
BUG_ON(!list_empty(&lock->bast_list));
BUG_ON(lock->ast_pending);
BUG_ON(lock->bast_pending);
dlm_lockres_clear_refmap_bit(dlm, res,
lock->ml.node);
list_del_init(&lock->list);
dlm_lock_put(lock);
/* In a normal unlock, we would have added a
* DLM_UNLOCK_FREE_LOCK action. Force it. */
dlm_lock_put(lock);
}
}
queue++;
}
bit = 0;
while (1) {
bit = find_next_bit(res->refmap, O2NM_MAX_NODES, bit);
if (bit >= O2NM_MAX_NODES)
break;
/* do not clear the local node reference, if there is a
* process holding this, let it drop the ref itself */
if (bit != dlm->node_num) {
mlog(0, "%s:%.*s: node %u had a ref to this "
"migrating lockres, clearing\n", dlm->name,
res->lockname.len, res->lockname.name, bit);
dlm_lockres_clear_refmap_bit(dlm, res, bit);
}
bit++;
}
}
/*
* Pick a node to migrate the lock resource to. This function selects a
* potential target based first on the locks and then on refmap. It skips
* nodes that are in the process of exiting the domain.
*/
static u8 dlm_pick_migration_target(struct dlm_ctxt *dlm,
struct dlm_lock_resource *res)
{
enum dlm_lockres_list idx;
struct list_head *queue = &res->granted;
struct dlm_lock *lock;
int noderef;
u8 nodenum = O2NM_MAX_NODES;
assert_spin_locked(&dlm->spinlock);
assert_spin_locked(&res->spinlock);
/* Go through all the locks */
for (idx = DLM_GRANTED_LIST; idx <= DLM_BLOCKED_LIST; idx++) {
queue = dlm_list_idx_to_ptr(res, idx);
list_for_each_entry(lock, queue, list) {
if (lock->ml.node == dlm->node_num)
continue;
if (test_bit(lock->ml.node, dlm->exit_domain_map))
continue;
nodenum = lock->ml.node;
goto bail;
}
}
/* Go thru the refmap */
noderef = -1;
while (1) {
noderef = find_next_bit(res->refmap, O2NM_MAX_NODES,
noderef + 1);
if (noderef >= O2NM_MAX_NODES)
break;
if (noderef == dlm->node_num)
continue;
if (test_bit(noderef, dlm->exit_domain_map))
continue;
nodenum = noderef;
goto bail;
}
bail:
return nodenum;
}
/* this is called by the new master once all lockres
* data has been received */
static int dlm_do_migrate_request(struct dlm_ctxt *dlm,
struct dlm_lock_resource *res,
u8 master, u8 new_master,
struct dlm_node_iter *iter)
{
struct dlm_migrate_request migrate;
int ret, skip, status = 0;
int nodenum;
memset(&migrate, 0, sizeof(migrate));
migrate.namelen = res->lockname.len;
memcpy(migrate.name, res->lockname.name, migrate.namelen);
migrate.new_master = new_master;
migrate.master = master;
ret = 0;
/* send message to all nodes, except the master and myself */
while ((nodenum = dlm_node_iter_next(iter)) >= 0) {
if (nodenum == master ||
nodenum == new_master)
continue;
/* We could race exit domain. If exited, skip. */
spin_lock(&dlm->spinlock);
skip = (!test_bit(nodenum, dlm->domain_map));
spin_unlock(&dlm->spinlock);
if (skip) {
clear_bit(nodenum, iter->node_map);
continue;
}
ret = o2net_send_message(DLM_MIGRATE_REQUEST_MSG, dlm->key,
&migrate, sizeof(migrate), nodenum,
&status);
if (ret < 0) {
mlog(ML_ERROR, "%s: res %.*s, Error %d send "
"MIGRATE_REQUEST to node %u\n", dlm->name,
migrate.namelen, migrate.name, ret, nodenum);
if (!dlm_is_host_down(ret)) {
mlog(ML_ERROR, "unhandled error=%d!\n", ret);
BUG();
}
clear_bit(nodenum, iter->node_map);
ret = 0;
} else if (status < 0) {
mlog(0, "migrate request (node %u) returned %d!\n",
nodenum, status);
ret = status;
} else if (status == DLM_MIGRATE_RESPONSE_MASTERY_REF) {
/* during the migration request we short-circuited
* the mastery of the lockres. make sure we have
* a mastery ref for nodenum */
mlog(0, "%s:%.*s: need ref for node %u\n",
dlm->name, res->lockname.len, res->lockname.name,
nodenum);
spin_lock(&res->spinlock);
dlm_lockres_set_refmap_bit(dlm, res, nodenum);
spin_unlock(&res->spinlock);
}
}
if (ret < 0)
mlog_errno(ret);
mlog(0, "returning ret=%d\n", ret);
return ret;
}
/* if there is an existing mle for this lockres, we now know who the master is.
* (the one who sent us *this* message) we can clear it up right away.
* since the process that put the mle on the list still has a reference to it,
* we can unhash it now, set the master and wake the process. as a result,
* we will have no mle in the list to start with. now we can add an mle for
* the migration and this should be the only one found for those scanning the
* list. */
int dlm_migrate_request_handler(struct o2net_msg *msg, u32 len, void *data,
void **ret_data)
{
struct dlm_ctxt *dlm = data;
struct dlm_lock_resource *res = NULL;
struct dlm_migrate_request *migrate = (struct dlm_migrate_request *) msg->buf;
struct dlm_master_list_entry *mle = NULL, *oldmle = NULL;
const char *name;
unsigned int namelen, hash;
int ret = 0;
if (!dlm_grab(dlm))
return 0;
name = migrate->name;
namelen = migrate->namelen;
hash = dlm_lockid_hash(name, namelen);
/* preallocate.. if this fails, abort */
mle = kmem_cache_alloc(dlm_mle_cache, GFP_NOFS);
if (!mle) {
ret = -ENOMEM;
goto leave;
}
/* check for pre-existing lock */
spin_lock(&dlm->spinlock);
res = __dlm_lookup_lockres(dlm, name, namelen, hash);
if (res) {
spin_lock(&res->spinlock);
if (res->state & DLM_LOCK_RES_RECOVERING) {
/* if all is working ok, this can only mean that we got
* a migrate request from a node that we now see as
* dead. what can we do here? drop it to the floor? */
spin_unlock(&res->spinlock);
mlog(ML_ERROR, "Got a migrate request, but the "
"lockres is marked as recovering!");
kmem_cache_free(dlm_mle_cache, mle);
ret = -EINVAL; /* need a better solution */
goto unlock;
}
res->state |= DLM_LOCK_RES_MIGRATING;
spin_unlock(&res->spinlock);
}
spin_lock(&dlm->master_lock);
/* ignore status. only nonzero status would BUG. */
ret = dlm_add_migration_mle(dlm, res, mle, &oldmle,
name, namelen,
migrate->new_master,
migrate->master);
if (ret < 0)
kmem_cache_free(dlm_mle_cache, mle);
spin_unlock(&dlm->master_lock);
unlock:
spin_unlock(&dlm->spinlock);
if (oldmle) {
/* master is known, detach if not already detached */
dlm_mle_detach_hb_events(dlm, oldmle);
dlm_put_mle(oldmle);
}
if (res)
dlm_lockres_put(res);
leave:
dlm_put(dlm);
return ret;
}
/* must be holding dlm->spinlock and dlm->master_lock
* when adding a migration mle, we can clear any other mles
* in the master list because we know with certainty that
* the master is "master". so we remove any old mle from
* the list after setting it's master field, and then add
* the new migration mle. this way we can hold with the rule
* of having only one mle for a given lock name at all times. */
static int dlm_add_migration_mle(struct dlm_ctxt *dlm,
struct dlm_lock_resource *res,
struct dlm_master_list_entry *mle,
struct dlm_master_list_entry **oldmle,
const char *name, unsigned int namelen,
u8 new_master, u8 master)
{
int found;
int ret = 0;
*oldmle = NULL;
assert_spin_locked(&dlm->spinlock);
assert_spin_locked(&dlm->master_lock);
/* caller is responsible for any ref taken here on oldmle */
found = dlm_find_mle(dlm, oldmle, (char *)name, namelen);
if (found) {
struct dlm_master_list_entry *tmp = *oldmle;
spin_lock(&tmp->spinlock);
if (tmp->type == DLM_MLE_MIGRATION) {
if (master == dlm->node_num) {
/* ah another process raced me to it */
mlog(0, "tried to migrate %.*s, but some "
"process beat me to it\n",
namelen, name);
spin_unlock(&tmp->spinlock);
return -EEXIST;
} else {
/* bad. 2 NODES are trying to migrate! */
mlog(ML_ERROR, "migration error mle: "
"master=%u new_master=%u // request: "
"master=%u new_master=%u // "
"lockres=%.*s\n",
tmp->master, tmp->new_master,
master, new_master,
namelen, name);
BUG();
}
} else {
/* this is essentially what assert_master does */
tmp->master = master;
atomic_set(&tmp->woken, 1);
wake_up(&tmp->wq);
/* remove it so that only one mle will be found */
__dlm_unlink_mle(dlm, tmp);
__dlm_mle_detach_hb_events(dlm, tmp);
if (tmp->type == DLM_MLE_MASTER) {
ret = DLM_MIGRATE_RESPONSE_MASTERY_REF;
mlog(0, "%s:%.*s: master=%u, newmaster=%u, "
"telling master to get ref "
"for cleared out mle during "
"migration\n", dlm->name,
namelen, name, master,
new_master);
}
}
spin_unlock(&tmp->spinlock);
}
/* now add a migration mle to the tail of the list */
dlm_init_mle(mle, DLM_MLE_MIGRATION, dlm, res, name, namelen);
mle->new_master = new_master;
/* the new master will be sending an assert master for this.
* at that point we will get the refmap reference */
mle->master = master;
/* do this for consistency with other mle types */
set_bit(new_master, mle->maybe_map);
__dlm_insert_mle(dlm, mle);
return ret;
}
/*
* Sets the owner of the lockres, associated to the mle, to UNKNOWN
*/
static struct dlm_lock_resource *dlm_reset_mleres_owner(struct dlm_ctxt *dlm,
struct dlm_master_list_entry *mle)
{
struct dlm_lock_resource *res;
/* Find the lockres associated to the mle and set its owner to UNK */
res = __dlm_lookup_lockres(dlm, mle->mname, mle->mnamelen,
mle->mnamehash);
if (res) {
spin_unlock(&dlm->master_lock);
/* move lockres onto recovery list */
spin_lock(&res->spinlock);
dlm_set_lockres_owner(dlm, res, DLM_LOCK_RES_OWNER_UNKNOWN);
dlm_move_lockres_to_recovery_list(dlm, res);
spin_unlock(&res->spinlock);
dlm_lockres_put(res);
/* about to get rid of mle, detach from heartbeat */
__dlm_mle_detach_hb_events(dlm, mle);
/* dump the mle */
spin_lock(&dlm->master_lock);
__dlm_put_mle(mle);
spin_unlock(&dlm->master_lock);
}
return res;
}
static void dlm_clean_migration_mle(struct dlm_ctxt *dlm,
struct dlm_master_list_entry *mle)
{
__dlm_mle_detach_hb_events(dlm, mle);
spin_lock(&mle->spinlock);
__dlm_unlink_mle(dlm, mle);
atomic_set(&mle->woken, 1);
spin_unlock(&mle->spinlock);
wake_up(&mle->wq);
}
static void dlm_clean_block_mle(struct dlm_ctxt *dlm,
struct dlm_master_list_entry *mle, u8 dead_node)
{
int bit;
BUG_ON(mle->type != DLM_MLE_BLOCK);
spin_lock(&mle->spinlock);
bit = find_next_bit(mle->maybe_map, O2NM_MAX_NODES, 0);
if (bit != dead_node) {
mlog(0, "mle found, but dead node %u would not have been "
"master\n", dead_node);
spin_unlock(&mle->spinlock);
} else {
/* Must drop the refcount by one since the assert_master will
* never arrive. This may result in the mle being unlinked and
* freed, but there may still be a process waiting in the
* dlmlock path which is fine. */
mlog(0, "node %u was expected master\n", dead_node);
atomic_set(&mle->woken, 1);
spin_unlock(&mle->spinlock);
wake_up(&mle->wq);
/* Do not need events any longer, so detach from heartbeat */
__dlm_mle_detach_hb_events(dlm, mle);
__dlm_put_mle(mle);
}
}
void dlm_clean_master_list(struct dlm_ctxt *dlm, u8 dead_node)
{
struct dlm_master_list_entry *mle;
struct dlm_lock_resource *res;
struct hlist_head *bucket;
struct hlist_node *tmp;
unsigned int i;
mlog(0, "dlm=%s, dead node=%u\n", dlm->name, dead_node);
top:
assert_spin_locked(&dlm->spinlock);
/* clean the master list */
spin_lock(&dlm->master_lock);
for (i = 0; i < DLM_HASH_BUCKETS; i++) {
bucket = dlm_master_hash(dlm, i);
hlist_for_each_entry_safe(mle, tmp, bucket, master_hash_node) {
BUG_ON(mle->type != DLM_MLE_BLOCK &&
mle->type != DLM_MLE_MASTER &&
mle->type != DLM_MLE_MIGRATION);
/* MASTER mles are initiated locally. The waiting
* process will notice the node map change shortly.
* Let that happen as normal. */
if (mle->type == DLM_MLE_MASTER)
continue;
/* BLOCK mles are initiated by other nodes. Need to
* clean up if the dead node would have been the
* master. */
if (mle->type == DLM_MLE_BLOCK) {
dlm_clean_block_mle(dlm, mle, dead_node);
continue;
}
/* Everything else is a MIGRATION mle */
/* The rule for MIGRATION mles is that the master
* becomes UNKNOWN if *either* the original or the new
* master dies. All UNKNOWN lockres' are sent to
* whichever node becomes the recovery master. The new
* master is responsible for determining if there is
* still a master for this lockres, or if he needs to
* take over mastery. Either way, this node should
* expect another message to resolve this. */
if (mle->master != dead_node &&
mle->new_master != dead_node)
continue;
if (mle->new_master == dead_node && mle->inuse) {
mlog(ML_NOTICE, "%s: target %u died during "
"migration from %u, the MLE is "
"still keep used, ignore it!\n",
dlm->name, dead_node,
mle->master);
continue;
}
/* If we have reached this point, this mle needs to be
* removed from the list and freed. */
dlm_clean_migration_mle(dlm, mle);
mlog(0, "%s: node %u died during migration from "
"%u to %u!\n", dlm->name, dead_node, mle->master,
mle->new_master);
/* If we find a lockres associated with the mle, we've
* hit this rare case that messes up our lock ordering.
* If so, we need to drop the master lock so that we can
* take the lockres lock, meaning that we will have to
* restart from the head of list. */
res = dlm_reset_mleres_owner(dlm, mle);
if (res)
/* restart */
goto top;
/* This may be the last reference */
__dlm_put_mle(mle);
}
}
spin_unlock(&dlm->master_lock);
}
int dlm_finish_migration(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
u8 old_master)
{
struct dlm_node_iter iter;
int ret = 0;
spin_lock(&dlm->spinlock);
dlm_node_iter_init(dlm->domain_map, &iter);
clear_bit(old_master, iter.node_map);
clear_bit(dlm->node_num, iter.node_map);
spin_unlock(&dlm->spinlock);
/* ownership of the lockres is changing. account for the
* mastery reference here since old_master will briefly have
* a reference after the migration completes */
spin_lock(&res->spinlock);
dlm_lockres_set_refmap_bit(dlm, res, old_master);
spin_unlock(&res->spinlock);
mlog(0, "now time to do a migrate request to other nodes\n");
ret = dlm_do_migrate_request(dlm, res, old_master,
dlm->node_num, &iter);
if (ret < 0) {
mlog_errno(ret);
goto leave;
}
mlog(0, "doing assert master of %.*s to all except the original node\n",
res->lockname.len, res->lockname.name);
/* this call now finishes out the nodemap
* even if one or more nodes die */
ret = dlm_do_assert_master(dlm, res, iter.node_map,
DLM_ASSERT_MASTER_FINISH_MIGRATION);
if (ret < 0) {
/* no longer need to retry. all living nodes contacted. */
mlog_errno(ret);
ret = 0;
}
memset(iter.node_map, 0, sizeof(iter.node_map));
set_bit(old_master, iter.node_map);
mlog(0, "doing assert master of %.*s back to %u\n",
res->lockname.len, res->lockname.name, old_master);
ret = dlm_do_assert_master(dlm, res, iter.node_map,
DLM_ASSERT_MASTER_FINISH_MIGRATION);
if (ret < 0) {
mlog(0, "assert master to original master failed "
"with %d.\n", ret);
/* the only nonzero status here would be because of
* a dead original node. we're done. */
ret = 0;
}
/* all done, set the owner, clear the flag */
spin_lock(&res->spinlock);
dlm_set_lockres_owner(dlm, res, dlm->node_num);
res->state &= ~DLM_LOCK_RES_MIGRATING;
spin_unlock(&res->spinlock);
/* re-dirty it on the new master */
dlm_kick_thread(dlm, res);
wake_up(&res->wq);
leave:
return ret;
}
/*
* LOCKRES AST REFCOUNT
* this is integral to migration
*/
/* for future intent to call an ast, reserve one ahead of time.
* this should be called only after waiting on the lockres
* with dlm_wait_on_lockres, and while still holding the
* spinlock after the call. */
void __dlm_lockres_reserve_ast(struct dlm_lock_resource *res)
{
assert_spin_locked(&res->spinlock);
if (res->state & DLM_LOCK_RES_MIGRATING) {
__dlm_print_one_lock_resource(res);
}
BUG_ON(res->state & DLM_LOCK_RES_MIGRATING);
atomic_inc(&res->asts_reserved);
}
/*
* used to drop the reserved ast, either because it went unused,
* or because the ast/bast was actually called.
*
* also, if there is a pending migration on this lockres,
* and this was the last pending ast on the lockres,
* atomically set the MIGRATING flag before we drop the lock.
* this is how we ensure that migration can proceed with no
* asts in progress. note that it is ok if the state of the
* queues is such that a lock should be granted in the future
* or that a bast should be fired, because the new master will
* shuffle the lists on this lockres as soon as it is migrated.
*/
void dlm_lockres_release_ast(struct dlm_ctxt *dlm,
struct dlm_lock_resource *res)
{
if (!atomic_dec_and_lock(&res->asts_reserved, &res->spinlock))
return;
if (!res->migration_pending) {
spin_unlock(&res->spinlock);
return;
}
BUG_ON(res->state & DLM_LOCK_RES_MIGRATING);
res->migration_pending = 0;
res->state |= DLM_LOCK_RES_MIGRATING;
spin_unlock(&res->spinlock);
wake_up(&res->wq);
wake_up(&dlm->migration_wq);
}
void dlm_force_free_mles(struct dlm_ctxt *dlm)
{
int i;
struct hlist_head *bucket;
struct dlm_master_list_entry *mle;
struct hlist_node *tmp;
/*
* We notified all other nodes that we are exiting the domain and
* marked the dlm state to DLM_CTXT_LEAVING. If any mles are still
* around we force free them and wake any processes that are waiting
* on the mles
*/
spin_lock(&dlm->spinlock);
spin_lock(&dlm->master_lock);
BUG_ON(dlm->dlm_state != DLM_CTXT_LEAVING);
BUG_ON((find_next_bit(dlm->domain_map, O2NM_MAX_NODES, 0) < O2NM_MAX_NODES));
for (i = 0; i < DLM_HASH_BUCKETS; i++) {
bucket = dlm_master_hash(dlm, i);
hlist_for_each_entry_safe(mle, tmp, bucket, master_hash_node) {
if (mle->type != DLM_MLE_BLOCK) {
mlog(ML_ERROR, "bad mle: %p\n", mle);
dlm_print_one_mle(mle);
}
atomic_set(&mle->woken, 1);
wake_up(&mle->wq);
__dlm_unlink_mle(dlm, mle);
__dlm_mle_detach_hb_events(dlm, mle);
__dlm_put_mle(mle);
}
}
spin_unlock(&dlm->master_lock);
spin_unlock(&dlm->spinlock);
}