libceph: add support for CEPH_OSD_OP_SETALLOCHINT osd op
This is primarily for rbd's benefit and is supposed to combat fragmentation:

"... knowing that rbd images have a 4m size, librbd can pass a hint that will let the osd do the xfs allocation size ioctl on new files so that they are allocated in 1m or 4m chunks. We've seen cases where users with rbd workloads have very high levels of fragmentation in xfs and this would mitigate that and probably have a pretty nice performance benefit."

SETALLOCHINT is considered advisory, so our backwards compatibility mechanism here is to set the FAILOK flag for all SETALLOCHINT ops.

Signed-off-by: Ilya Dryomov <ilya.dryomov@inktank.com>
Reviewed-by: Sage Weil <sage@inktank.com>
Reviewed-by: Alex Elder <elder@linaro.org>
This commit is contained in:
parent
7b25bf5f02
commit
c647b8a8c6
|
@ -103,6 +103,10 @@ struct ceph_osd_req_op {
|
||||||
u32 timeout;
|
u32 timeout;
|
||||||
__u8 flag;
|
__u8 flag;
|
||||||
} watch;
|
} watch;
|
||||||
|
struct {
|
||||||
|
u64 expected_object_size;
|
||||||
|
u64 expected_write_size;
|
||||||
|
} alloc_hint;
|
||||||
};
|
};
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -294,6 +298,10 @@ extern void osd_req_op_cls_init(struct ceph_osd_request *osd_req,
|
||||||
extern void osd_req_op_watch_init(struct ceph_osd_request *osd_req,
|
extern void osd_req_op_watch_init(struct ceph_osd_request *osd_req,
|
||||||
unsigned int which, u16 opcode,
|
unsigned int which, u16 opcode,
|
||||||
u64 cookie, u64 version, int flag);
|
u64 cookie, u64 version, int flag);
|
||||||
|
extern void osd_req_op_alloc_hint_init(struct ceph_osd_request *osd_req,
|
||||||
|
unsigned int which,
|
||||||
|
u64 expected_object_size,
|
||||||
|
u64 expected_write_size);
|
||||||
|
|
||||||
extern struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc,
|
extern struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc,
|
||||||
struct ceph_snap_context *snapc,
|
struct ceph_snap_context *snapc,
|
||||||
|
|
|
@ -227,6 +227,9 @@ enum {
|
||||||
CEPH_OSD_OP_OMAPRMKEYS = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 24,
|
CEPH_OSD_OP_OMAPRMKEYS = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 24,
|
||||||
CEPH_OSD_OP_OMAP_CMP = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 25,
|
CEPH_OSD_OP_OMAP_CMP = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 25,
|
||||||
|
|
||||||
|
/* hints */
|
||||||
|
CEPH_OSD_OP_SETALLOCHINT = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 35,
|
||||||
|
|
||||||
/** multi **/
|
/** multi **/
|
||||||
CEPH_OSD_OP_CLONERANGE = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_MULTI | 1,
|
CEPH_OSD_OP_CLONERANGE = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_MULTI | 1,
|
||||||
CEPH_OSD_OP_ASSERT_SRC_VERSION = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_MULTI | 2,
|
CEPH_OSD_OP_ASSERT_SRC_VERSION = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_MULTI | 2,
|
||||||
|
@ -416,6 +419,10 @@ struct ceph_osd_op {
|
||||||
__le64 offset, length;
|
__le64 offset, length;
|
||||||
__le64 src_offset;
|
__le64 src_offset;
|
||||||
} __attribute__ ((packed)) clonerange;
|
} __attribute__ ((packed)) clonerange;
|
||||||
|
struct {
|
||||||
|
__le64 expected_object_size;
|
||||||
|
__le64 expected_write_size;
|
||||||
|
} __attribute__ ((packed)) alloc_hint;
|
||||||
};
|
};
|
||||||
__le32 payload_len;
|
__le32 payload_len;
|
||||||
} __attribute__ ((packed));
|
} __attribute__ ((packed));
|
||||||
|
|
|
@ -436,6 +436,7 @@ static bool osd_req_opcode_valid(u16 opcode)
|
||||||
case CEPH_OSD_OP_OMAPCLEAR:
|
case CEPH_OSD_OP_OMAPCLEAR:
|
||||||
case CEPH_OSD_OP_OMAPRMKEYS:
|
case CEPH_OSD_OP_OMAPRMKEYS:
|
||||||
case CEPH_OSD_OP_OMAP_CMP:
|
case CEPH_OSD_OP_OMAP_CMP:
|
||||||
|
case CEPH_OSD_OP_SETALLOCHINT:
|
||||||
case CEPH_OSD_OP_CLONERANGE:
|
case CEPH_OSD_OP_CLONERANGE:
|
||||||
case CEPH_OSD_OP_ASSERT_SRC_VERSION:
|
case CEPH_OSD_OP_ASSERT_SRC_VERSION:
|
||||||
case CEPH_OSD_OP_SRC_CMPXATTR:
|
case CEPH_OSD_OP_SRC_CMPXATTR:
|
||||||
|
@ -591,6 +592,26 @@ void osd_req_op_watch_init(struct ceph_osd_request *osd_req,
|
||||||
}
|
}
|
||||||
EXPORT_SYMBOL(osd_req_op_watch_init);
|
EXPORT_SYMBOL(osd_req_op_watch_init);
|
||||||
|
|
||||||
|
/*
 * Set up op slot @which of @osd_req as a CEPH_OSD_OP_SETALLOCHINT op.
 *
 * The slot is initialized through _osd_req_op_init(), the two
 * caller-supplied sizes are stored unchanged in op->alloc_hint, and
 * CEPH_OSD_OP_FLAG_FAILOK is OR-ed into the op's flags.
 *
 * NOTE(review): the units of @expected_object_size and
 * @expected_write_size are not stated here (presumably bytes) --
 * confirm against the osd side.
 */
void osd_req_op_alloc_hint_init(struct ceph_osd_request *osd_req,
|
||||||
|
unsigned int which,
|
||||||
|
u64 expected_object_size,
|
||||||
|
u64 expected_write_size)
|
||||||
|
{
|
||||||
|
struct ceph_osd_req_op *op = _osd_req_op_init(osd_req, which,
|
||||||
|
CEPH_OSD_OP_SETALLOCHINT);
|
||||||
|
|
||||||
|
op->alloc_hint.expected_object_size = expected_object_size;
|
||||||
|
op->alloc_hint.expected_write_size = expected_write_size;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* CEPH_OSD_OP_SETALLOCHINT op is advisory and therefore deemed
|
||||||
|
* not worth a feature bit. Set FAILOK per-op flag to make
|
||||||
|
* sure older osds don't trip over an unsupported opcode.
|
||||||
|
*/
|
||||||
|
op->flags |= CEPH_OSD_OP_FLAG_FAILOK;
|
||||||
|
}
|
||||||
|
EXPORT_SYMBOL(osd_req_op_alloc_hint_init);
|
||||||
|
|
||||||
static void ceph_osdc_msg_data_add(struct ceph_msg *msg,
|
static void ceph_osdc_msg_data_add(struct ceph_msg *msg,
|
||||||
struct ceph_osd_data *osd_data)
|
struct ceph_osd_data *osd_data)
|
||||||
{
|
{
|
||||||
|
@ -681,6 +702,12 @@ static u64 osd_req_encode_op(struct ceph_osd_request *req,
|
||||||
dst->watch.ver = cpu_to_le64(src->watch.ver);
|
dst->watch.ver = cpu_to_le64(src->watch.ver);
|
||||||
dst->watch.flag = src->watch.flag;
|
dst->watch.flag = src->watch.flag;
|
||||||
break;
|
break;
|
||||||
|
case CEPH_OSD_OP_SETALLOCHINT:
|
||||||
|
dst->alloc_hint.expected_object_size =
|
||||||
|
cpu_to_le64(src->alloc_hint.expected_object_size);
|
||||||
|
dst->alloc_hint.expected_write_size =
|
||||||
|
cpu_to_le64(src->alloc_hint.expected_write_size);
|
||||||
|
break;
|
||||||
default:
|
default:
|
||||||
pr_err("unsupported osd opcode %s\n",
|
pr_err("unsupported osd opcode %s\n",
|
||||||
ceph_osd_op_name(src->op));
|
ceph_osd_op_name(src->op));
|
||||||
|
|
Loading…
Reference in New Issue