staging: ramster: ramster-specific changes to zcache/tmem
RAMster implements peer-to-peer transcendent memory, allowing a "cluster" of kernels to dynamically pool their RAM. This patch incorporates the changes that transform zcache to work with a remote store.

In tmem.[ch], new "repatriate" (provoke an async get) and "localify" (handle incoming data resulting from an async get) routines combine with a handful of changes to the existing pamops interfaces to allow the generic tmem code to support asynchronous operations. Also, a new tmem_xhandle struct groups together key information that must be passed to remote tmem stores.

Zcache-main.c is augmented with a large amount of ramster-specific code to handle remote operations and "foreign" pages on both ends of the "remotify" protocol. New "foreign" pools are auto-created on demand. A "selfshrinker" thread periodically repatriates remote persistent pages when local memory conditions allow. For certain operations, a queue is necessary to guarantee strict ordering, as out-of-order puts/flushes can cause strange race conditions.

Pampd pointers now either point to local memory or describe a remote page; to allow the same 64 bits to describe either, the LSB is used to differentiate the two cases. Some acrobatics must be performed to ensure local memory is available to handle a remote persistent get, or to deal with the data directly anyway if the malloc failed. Lots of ramster-specific statistics are available via sysfs.

Note: some debug ifdefs are left in for now.

Signed-off-by: Dan Magenheimer <dan.magenheimer@oracle.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
commit c89126eabb
parent b95e141a64
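As context for the pampd encoding described in the message above, here is a minimal self-contained sketch of LSB pointer tagging. The helper names are hypothetical, not taken from the patch (the real logic lives in zcache-main.c, whose diff is suppressed at the end of this page); the only assumption is that local pampds point to allocations aligned to at least 2 bytes, leaving the LSB free to serve as a tag:

#include <stdbool.h>
#include <stdint.h>

/* LSB set => the 64 bits describe a remote page, not local memory */
static inline bool pampd_is_remote(void *pampd)
{
	return ((uintptr_t)pampd & 1) != 0;
}

/* Tag a remote-page descriptor (caller keeps the descriptor's LSB free) */
static inline void *pampd_mark_remote(uintptr_t remote_desc)
{
	return (void *)(remote_desc | 1);
}

/* Strip the tag to recover the remote-page descriptor */
static inline uintptr_t pampd_remote_desc(void *pampd)
{
	return (uintptr_t)pampd & ~(uintptr_t)1;
}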
--- a/Kconfig
+++ b/Kconfig
@@ -1,13 +1,13 @@
-config ZCACHE
-	tristate "Dynamic compression of swap pages and clean pagecache pages"
-	depends on CLEANCACHE || FRONTSWAP
-	select XVMALLOC
+config RAMSTER
+	bool "Cross-machine RAM capacity sharing, aka peer-to-peer tmem"
+	depends on (CLEANCACHE || FRONTSWAP) && CONFIGFS_FS && !ZCACHE && !XVMALLOC && !HIGHMEM
 	select LZO_COMPRESS
 	select LZO_DECOMPRESS
 	default n
 	help
-	  Zcache doubles RAM efficiency while providing a significant
-	  performance boosts on many workloads. Zcache uses lzo1x
-	  compression and an in-kernel implementation of transcendent
-	  memory to store clean page cache pages and swap in RAM,
-	  providing a noticeable reduction in disk I/O.
+	  RAMster allows RAM on other machines in a cluster to be utilized
+	  dynamically and symmetrically instead of swapping to a local swap
+	  disk, thus improving performance on memory-constrained workloads
+	  while minimizing total RAM across the cluster. RAMster, like
+	  zcache, compresses swap pages into local RAM, but then remotifies
+	  the compressed pages to another node in the RAMster cluster.
--- a/Makefile
+++ b/Makefile
@@ -1,3 +1 @@
-zcache-y := zcache-main.o tmem.o
-
-obj-$(CONFIG_ZCACHE) += zcache.o
+obj-$(CONFIG_RAMSTER) += zcache-main.o tmem.o r2net.o xvmalloc.o cluster/
--- a/tmem.c
+++ b/tmem.c
@@ -27,6 +27,7 @@
 #include <linux/list.h>
 #include <linux/spinlock.h>
 #include <linux/atomic.h>
+#include <linux/delay.h>
 
 #include "tmem.h"
 
@@ -316,7 +317,7 @@ static void *tmem_pampd_lookup_in_obj(struct tmem_obj *obj, uint32_t index)
 }
 
 static void *tmem_pampd_replace_in_obj(struct tmem_obj *obj, uint32_t index,
-				void *new_pampd)
+				void *new_pampd, bool no_free)
 {
 	struct tmem_objnode **slot;
 	void *ret = NULL;
@@ -325,7 +326,9 @@ static void *tmem_pampd_replace_in_obj(struct tmem_obj *obj, uint32_t index,
 	if ((slot != NULL) && (*slot != NULL)) {
 		void *old_pampd = *(void **)slot;
 		*(void **)slot = new_pampd;
-		(*tmem_pamops.free)(old_pampd, obj->pool, NULL, 0);
+		if (!no_free)
+			(*tmem_pamops.free)(old_pampd, obj->pool,
+						NULL, 0, false);
 		ret = new_pampd;
 	}
 	return ret;
@@ -481,7 +484,7 @@ static void tmem_objnode_node_destroy(struct tmem_obj *obj,
 		if (ht == 1) {
 			obj->pampd_count--;
 			(*tmem_pamops.free)(objnode->slots[i],
-						obj->pool, NULL, 0);
+						obj->pool, NULL, 0, true);
 			objnode->slots[i] = NULL;
 			continue;
 		}
@@ -498,7 +501,8 @@ static void tmem_pampd_destroy_all_in_obj(struct tmem_obj *obj)
 		return;
 	if (obj->objnode_tree_height == 0) {
 		obj->pampd_count--;
-		(*tmem_pamops.free)(obj->objnode_tree_root, obj->pool, NULL, 0);
+		(*tmem_pamops.free)(obj->objnode_tree_root,
+					obj->pool, NULL, 0, true);
 	} else {
 		tmem_objnode_node_destroy(obj, obj->objnode_tree_root,
 					obj->objnode_tree_height);
@@ -529,7 +533,7 @@ static void tmem_pampd_destroy_all_in_obj(struct tmem_obj *obj)
  * always flushes for simplicity.
  */
 int tmem_put(struct tmem_pool *pool, struct tmem_oid *oidp, uint32_t index,
-		char *data, size_t size, bool raw, bool ephemeral)
+		char *data, size_t size, bool raw, int ephemeral)
 {
 	struct tmem_obj *obj = NULL, *objfound = NULL, *objnew = NULL;
 	void *pampd = NULL, *pampd_del = NULL;
@@ -545,7 +549,7 @@ int tmem_put(struct tmem_pool *pool, struct tmem_oid *oidp, uint32_t index,
 		/* if found, is a dup put, flush the old one */
 		pampd_del = tmem_pampd_delete_from_obj(obj, index);
 		BUG_ON(pampd_del != pampd);
-		(*tmem_pamops.free)(pampd, pool, oidp, index);
+		(*tmem_pamops.free)(pampd, pool, oidp, index, true);
 		if (obj->pampd_count == 0) {
 			objnew = obj;
 			objfound = NULL;
@@ -576,7 +580,7 @@ delete_and_free:
 	(void)tmem_pampd_delete_from_obj(obj, index);
 free:
 	if (pampd)
-		(*tmem_pamops.free)(pampd, pool, NULL, 0);
+		(*tmem_pamops.free)(pampd, pool, NULL, 0, true);
 	if (objnew) {
 		tmem_obj_free(objnew, hb);
 		(*tmem_hostops.obj_free)(objnew, pool);
@@ -586,6 +590,65 @@ out:
 	return ret;
 }
 
+void *tmem_localify_get_pampd(struct tmem_pool *pool, struct tmem_oid *oidp,
+				uint32_t index, struct tmem_obj **ret_obj,
+				void **saved_hb)
+{
+	struct tmem_hashbucket *hb;
+	struct tmem_obj *obj = NULL;
+	void *pampd = NULL;
+
+	hb = &pool->hashbucket[tmem_oid_hash(oidp)];
+	spin_lock(&hb->lock);
+	obj = tmem_obj_find(hb, oidp);
+	if (likely(obj != NULL))
+		pampd = tmem_pampd_lookup_in_obj(obj, index);
+	*ret_obj = obj;
+	*saved_hb = (void *)hb;
+	/* note, hashbucket remains locked */
+	return pampd;
+}
+
+void tmem_localify_finish(struct tmem_obj *obj, uint32_t index,
+			  void *pampd, void *saved_hb, bool delete)
+{
+	struct tmem_hashbucket *hb = (struct tmem_hashbucket *)saved_hb;
+
+	BUG_ON(!spin_is_locked(&hb->lock));
+	if (pampd != NULL) {
+		BUG_ON(obj == NULL);
+		(void)tmem_pampd_replace_in_obj(obj, index, pampd, 1);
+	} else if (delete) {
+		BUG_ON(obj == NULL);
+		(void)tmem_pampd_delete_from_obj(obj, index);
+	}
+	spin_unlock(&hb->lock);
+}
+
+static int tmem_repatriate(void **ppampd, struct tmem_hashbucket *hb,
+				struct tmem_pool *pool, struct tmem_oid *oidp,
+				uint32_t index, bool free, char *data)
+{
+	void *old_pampd = *ppampd, *new_pampd = NULL;
+	bool intransit = false;
+	int ret = 0;
+
+
+	if (!is_ephemeral(pool))
+		new_pampd = (*tmem_pamops.repatriate_preload)(
+				old_pampd, pool, oidp, index, &intransit);
+	if (intransit)
+		ret = -EAGAIN;
+	else if (new_pampd != NULL)
+		*ppampd = new_pampd;
+	/* must release the hb->lock else repatriate can't sleep */
+	spin_unlock(&hb->lock);
+	if (!intransit)
+		ret = (*tmem_pamops.repatriate)(old_pampd, new_pampd, pool,
+						oidp, index, free, data);
+	return ret;
+}
+
 /*
  * "Get" a page, e.g. if one can be found, copy the tmem page with the
  * matching handle from PAM space to the kernel. By tmem definition,
@@ -607,14 +670,36 @@ int tmem_get(struct tmem_pool *pool, struct tmem_oid *oidp, uint32_t index,
 	int ret = -1;
 	struct tmem_hashbucket *hb;
 	bool free = (get_and_free == 1) || ((get_and_free == 0) && ephemeral);
-	bool lock_held = false;
+	bool lock_held = 0;
+	void **ppampd;
 
+again:
 	hb = &pool->hashbucket[tmem_oid_hash(oidp)];
 	spin_lock(&hb->lock);
-	lock_held = true;
+	lock_held = 1;
 	obj = tmem_obj_find(hb, oidp);
 	if (obj == NULL)
 		goto out;
+	ppampd = __tmem_pampd_lookup_in_obj(obj, index);
+	if (ppampd == NULL)
+		goto out;
+	if (tmem_pamops.is_remote(*ppampd)) {
+		ret = tmem_repatriate(ppampd, hb, pool, oidp,
+					index, free, data);
+		lock_held = 0; /* note hb->lock has been unlocked */
+		if (ret == -EAGAIN) {
+			/* rare I think, but should cond_resched()??? */
+			usleep_range(10, 1000);
+			goto again;
+		} else if (ret != 0) {
+			if (ret != -ENOENT)
+				pr_err("UNTESTED case in tmem_get, ret=%d\n",
+					ret);
+			ret = -1;
+			goto out;
+		}
+		goto out;
+	}
 	if (free)
 		pampd = tmem_pampd_delete_from_obj(obj, index);
 	else
@@ -628,10 +713,6 @@ int tmem_get(struct tmem_pool *pool, struct tmem_oid *oidp, uint32_t index,
 			obj = NULL;
 		}
 	}
-	if (tmem_pamops.is_remote(pampd)) {
-		lock_held = false;
-		spin_unlock(&hb->lock);
-	}
 	if (free)
 		ret = (*tmem_pamops.get_data_and_free)(
 				data, size, raw, pampd, pool, oidp, index);
@@ -668,7 +749,7 @@ int tmem_flush_page(struct tmem_pool *pool,
 	pampd = tmem_pampd_delete_from_obj(obj, index);
 	if (pampd == NULL)
 		goto out;
-	(*tmem_pamops.free)(pampd, pool, oidp, index);
+	(*tmem_pamops.free)(pampd, pool, oidp, index, true);
 	if (obj->pampd_count == 0) {
 		tmem_obj_free(obj, hb);
 		(*tmem_hostops.obj_free)(obj, pool);
@@ -682,8 +763,8 @@ out:
 
 /*
  * If a page in tmem matches the handle, replace the page so that any
- * subsequent "get" gets the new page.  Returns 0 if
- * there was a page to replace, else returns -1.
+ * subsequent "get" gets the new page.  Returns the new page if
+ * there was a page to replace, else returns NULL.
  */
 int tmem_replace(struct tmem_pool *pool, struct tmem_oid *oidp,
 		uint32_t index, void *new_pampd)
@@ -697,7 +778,7 @@ int tmem_replace(struct tmem_pool *pool, struct tmem_oid *oidp,
 	obj = tmem_obj_find(hb, oidp);
 	if (obj == NULL)
 		goto out;
-	new_pampd = tmem_pampd_replace_in_obj(obj, index, new_pampd);
+	new_pampd = tmem_pampd_replace_in_obj(obj, index, new_pampd, 0);
 	ret = (*tmem_pamops.replace_in_obj)(new_pampd, obj);
 out:
 	spin_unlock(&hb->lock);
--- a/tmem.h
+++ b/tmem.h
@@ -9,7 +9,6 @@
 #ifndef _TMEM_H_
 #define _TMEM_H_
 
-#include <linux/types.h>
 #include <linux/highmem.h>
 #include <linux/hash.h>
 #include <linux/atomic.h>
@@ -89,6 +88,31 @@ struct tmem_oid {
 	uint64_t oid[3];
 };
 
+struct tmem_xhandle {
+	uint8_t client_id;
+	uint8_t xh_data_cksum;
+	uint16_t xh_data_size;
+	uint16_t pool_id;
+	struct tmem_oid oid;
+	uint32_t index;
+	void *extra;
+};
+
+static inline struct tmem_xhandle tmem_xhandle_fill(uint16_t client_id,
+					struct tmem_pool *pool,
+					struct tmem_oid *oidp,
+					uint32_t index)
+{
+	struct tmem_xhandle xh;
+	xh.client_id = client_id;
+	xh.xh_data_cksum = (uint8_t)-1;
+	xh.xh_data_size = (uint16_t)-1;
+	xh.pool_id = pool->pool_id;
+	xh.oid = *oidp;
+	xh.index = index;
+	return xh;
+}
+
 static inline void tmem_oid_set_invalid(struct tmem_oid *oidp)
 {
 	oidp->oid[0] = oidp->oid[1] = oidp->oid[2] = -1UL;
@@ -147,7 +171,11 @@ struct tmem_obj {
 	unsigned int objnode_tree_height;
 	unsigned long objnode_count;
 	long pampd_count;
-	void *extra; /* for private use by pampd implementation */
+	/* for current design of ramster, all pages belonging to
+	 * an object reside on the same remotenode and extra is
+	 * used to record the number of the remotenode so a
+	 * flush-object operation can specify it */
+	void *extra; /* for use by pampd implementation */
 	DECL_SENTINEL
 };
 
@@ -174,9 +202,14 @@ struct tmem_pamops {
 	int (*get_data_and_free)(char *, size_t *, bool, void *,
 				struct tmem_pool *, struct tmem_oid *,
 				uint32_t);
-	void (*free)(void *, struct tmem_pool *, struct tmem_oid *, uint32_t);
+	void (*free)(void *, struct tmem_pool *,
+				struct tmem_oid *, uint32_t, bool);
 	void (*free_obj)(struct tmem_pool *, struct tmem_obj *);
+	bool (*is_remote)(void *);
+	void *(*repatriate_preload)(void *, struct tmem_pool *,
+				struct tmem_oid *, uint32_t, bool *);
+	int (*repatriate)(void *, void *, struct tmem_pool *,
+				struct tmem_oid *, uint32_t, bool, void *);
 	void (*new_obj)(struct tmem_obj *);
 	int (*replace_in_obj)(void *, struct tmem_obj *);
 };
@@ -193,11 +226,16 @@ extern void tmem_register_hostops(struct tmem_hostops *m);
 
 /* core tmem accessor functions */
 extern int tmem_put(struct tmem_pool *, struct tmem_oid *, uint32_t index,
-			char *, size_t, bool, bool);
+			char *, size_t, bool, int);
 extern int tmem_get(struct tmem_pool *, struct tmem_oid *, uint32_t index,
 			char *, size_t *, bool, int);
 extern int tmem_replace(struct tmem_pool *, struct tmem_oid *, uint32_t index,
 			void *);
+extern void *tmem_localify_get_pampd(struct tmem_pool *, struct tmem_oid *,
+				uint32_t index, struct tmem_obj **,
+				void **);
+extern void tmem_localify_finish(struct tmem_obj *, uint32_t index,
+				void *, void *, bool);
 extern int tmem_flush_page(struct tmem_pool *, struct tmem_oid *,
 			uint32_t index);
 extern int tmem_flush_object(struct tmem_pool *, struct tmem_oid *);
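To see how the three new pamops hooks fit together, here is a hedged sketch of host-side registration. Only the field names and signatures come from the tmem.h diff above; the handler names are hypothetical, and tmem_register_pamops() is assumed to be the pre-existing registration call in this header (it does not appear in the visible hunks):

/* Hypothetical host-side handlers matching the extended pamops */
static bool host_is_remote(void *pampd);	/* e.g. test the pampd LSB */
static void *host_repatriate_preload(void *pampd, struct tmem_pool *pool,
			struct tmem_oid *oidp, uint32_t index, bool *intransit);
static int host_repatriate(void *old_pampd, void *new_pampd,
			struct tmem_pool *pool, struct tmem_oid *oidp,
			uint32_t index, bool free, void *data);
static void host_free(void *pampd, struct tmem_pool *pool,
			struct tmem_oid *oidp, uint32_t index, bool acct);

static struct tmem_pamops host_pamops = {
	/* ...create/get_data/get_data_and_free/free_obj/etc. as before... */
	.free = host_free,		/* note the added bool argument */
	.is_remote = host_is_remote,
	.repatriate_preload = host_repatriate_preload,
	.repatriate = host_repatriate,
};

/* registered once at init, as zcache already does for its pamops */
static void host_init(void)
{
	tmem_register_pamops(&host_pamops);
}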
File diff suppressed because it is too large.
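Since that half of the patch is not shown, a hedged sketch of the receiving side of an asynchronous get may help: when data for an earlier repatriation arrives, the handler claims the placeholder pampd (with the hashbucket still locked), builds a local copy, and installs it. Everything here except the two tmem_localify_* calls is hypothetical:

/* Hypothetical handler invoked when remote data for an async get arrives */
static void host_localify(struct tmem_pool *pool, struct tmem_oid *oidp,
				uint32_t index, char *data, size_t size)
{
	struct tmem_obj *obj;
	void *saved_hb;
	void *remote_pampd, *local_pampd = NULL;

	/* find the placeholder left by the async get;
	 * the hashbucket lock is still HELD when this returns */
	remote_pampd = tmem_localify_get_pampd(pool, oidp, index,
						&obj, &saved_hb);
	if (remote_pampd != NULL)
		local_pampd = host_build_local_pampd(data, size); /* hypothetical */

	/* replace the placeholder with the local copy, or delete the
	 * entry if no local copy could be built; drops the lock */
	tmem_localify_finish(obj, index, local_pampd, saved_hb,
				remote_pampd && !local_pampd);
}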