From c66c6e0c0b04ce4a152fe02c562dd0752665d580 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Thu, 1 Nov 2012 08:39:26 -0500 Subject: rbd: document rbd_spec structure I promised Josh I would document whether there were any restrictions needed for accessing fields of an rbd_spec structure. This adds a big block of comments that documents the structure and how it is used--including the fact that we don't attempt to synchronize access to it. Signed-off-by: Alex Elder Reviewed-by: David Zafman Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) (limited to 'drivers/block') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 89576a0b3f2e..128978c6a4e0 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -119,7 +119,26 @@ struct rbd_image_header { * An rbd image specification. * * The tuple (pool_id, image_id, snap_id) is sufficient to uniquely - * identify an image. + * identify an image. Each rbd_dev structure includes a pointer to + * an rbd_spec structure that encapsulates this identity. + * + * Each of the id's in an rbd_spec has an associated name. For a + * user-mapped image, the names are supplied and the id's associated + * with them are looked up. For a layered image, a parent image is + * defined by the tuple, and the names are looked up. + * + * An rbd_dev structure contains a parent_spec pointer which is + * non-null if the image it represents is a child in a layered + * image. This pointer will refer to the rbd_spec structure used + * by the parent rbd_dev for its own identity (i.e., the structure + * is shared between the parent and child). + * + * Since these structures are populated once, during the discovery + * phase of image construction, they are effectively immutable so + * we make no effort to synchronize access to them. + * + * Note that code herein does not assume the image name is known (it + * could be a null pointer). */ struct rbd_spec { u64 pool_id; -- cgit v1.2.3 From 69e7a02f633ba7de0aefa87f3f0e3b31e953b09a Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Thu, 1 Nov 2012 08:39:26 -0500 Subject: rbd: kill rbd_spec->image_name_len There may have been a benefit to hanging on to the length of an image name before, but there is really none now. The only time it's used is when probing for rbd images, so we can just compute the length then. Signed-off-by: Alex Elder Reviewed-by: David Zafman Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 128978c6a4e0..a002984891d7 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -147,7 +147,6 @@ struct rbd_spec { char *image_id; size_t image_id_len; char *image_name; - size_t image_name_len; u64 snap_id; char *snap_name; @@ -2563,15 +2562,15 @@ static char *rbd_dev_image_name(struct rbd_device *rbd_dev) rbd_assert(!rbd_dev->spec->image_name); - image_id_size = sizeof (__le32) + rbd_dev->spec->image_id_len; + len = strlen(rbd_dev->spec->image_id); + image_id_size = sizeof (__le32) + len; image_id = kmalloc(image_id_size, GFP_KERNEL); if (!image_id) return NULL; p = image_id; end = (char *) image_id + image_id_size; - ceph_encode_string(&p, end, rbd_dev->spec->image_id, - (u32) rbd_dev->spec->image_id_len); + ceph_encode_string(&p, end, rbd_dev->spec->image_id, (u32) len); size = sizeof (__le32) + RBD_IMAGE_NAME_LEN_MAX; reply_buf = kmalloc(size, GFP_KERNEL); @@ -2631,14 +2630,12 @@ static int rbd_dev_probe_update_spec(struct rbd_device *rbd_dev) /* Fetch the image name; tolerate failure here */ name = rbd_dev_image_name(rbd_dev); - if (name) { - rbd_dev->spec->image_name_len = strlen(name); + if (name) rbd_dev->spec->image_name = (char *) name; - } else { + else pr_warning(RBD_DRV_NAME "%d " "unable to get image name for image id %s\n", rbd_dev->major, rbd_dev->spec->image_id); - } /* Look up the snapshot name. */ @@ -3252,7 +3249,7 @@ static int rbd_add_parse_args(const char *buf, if (!*spec->pool_name) goto out_err; /* Missing pool name */ - spec->image_name = dup_token(&buf, &spec->image_name_len); + spec->image_name = dup_token(&buf, NULL); if (!spec->image_name) goto out_mem; if (!*spec->image_name) @@ -3342,7 +3339,7 @@ static int rbd_dev_image_id(struct rbd_device *rbd_dev) * First, see if the format 2 image id file exists, and if * so, get the image's persistent id from it. */ - size = sizeof (RBD_ID_PREFIX) + rbd_dev->spec->image_name_len; + size = sizeof (RBD_ID_PREFIX) + strlen(rbd_dev->spec->image_name); object_name = kmalloc(size, GFP_NOIO); if (!object_name) return -ENOMEM; @@ -3400,7 +3397,7 @@ static int rbd_dev_v1_probe(struct rbd_device *rbd_dev) /* Record the header object name for this rbd image. */ - size = rbd_dev->spec->image_name_len + sizeof (RBD_SUFFIX); + size = strlen(rbd_dev->spec->image_name) + sizeof (RBD_SUFFIX); rbd_dev->header_name = kmalloc(size, GFP_KERNEL); if (!rbd_dev->header_name) { ret = -ENOMEM; -- cgit v1.2.3 From 979ed480a2722ad8d9f87054635158f652a1241e Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Thu, 1 Nov 2012 08:39:26 -0500 Subject: rbd: kill rbd_spec->image_id_len There is no real benefit to keeping the length of an image id, so get rid of it. Signed-off-by: Alex Elder Reviewed-by: David Zafman Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index a002984891d7..e01dbb12ad03 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -145,7 +145,6 @@ struct rbd_spec { char *pool_name; char *image_id; - size_t image_id_len; char *image_name; u64 snap_id; @@ -2492,7 +2491,6 @@ static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev) void *end; char *image_id; u64 overlap; - size_t len = 0; int ret; parent_spec = rbd_spec_alloc(); @@ -2526,13 +2524,12 @@ static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev) if (parent_spec->pool_id == CEPH_NOPOOL) goto out; /* No parent? No problem. */ - image_id = ceph_extract_encoded_string(&p, end, &len, GFP_KERNEL); + image_id = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL); if (IS_ERR(image_id)) { ret = PTR_ERR(image_id); goto out_err; } parent_spec->image_id = image_id; - parent_spec->image_id_len = len; ceph_decode_64_safe(&p, end, parent_spec->snap_id, out_err); ceph_decode_64_safe(&p, end, overlap, out_err); @@ -3368,8 +3365,7 @@ static int rbd_dev_image_id(struct rbd_device *rbd_dev) p = response; rbd_dev->spec->image_id = ceph_extract_encoded_string(&p, p + RBD_IMAGE_ID_LEN_MAX, - &rbd_dev->spec->image_id_len, - GFP_NOIO); + NULL, GFP_NOIO); if (IS_ERR(rbd_dev->spec->image_id)) { ret = PTR_ERR(rbd_dev->spec->image_id); rbd_dev->spec->image_id = NULL; @@ -3393,7 +3389,6 @@ static int rbd_dev_v1_probe(struct rbd_device *rbd_dev) rbd_dev->spec->image_id = kstrdup("", GFP_KERNEL); if (!rbd_dev->spec->image_id) return -ENOMEM; - rbd_dev->spec->image_id_len = 0; /* Record the header object name for this rbd image. */ @@ -3443,7 +3438,7 @@ static int rbd_dev_v2_probe(struct rbd_device *rbd_dev) * Image id was filled in by the caller. Record the header * object name for this rbd image. */ - size = sizeof (RBD_HEADER_PREFIX) + rbd_dev->spec->image_id_len; + size = sizeof (RBD_HEADER_PREFIX) + strlen(rbd_dev->spec->image_id); rbd_dev->header_name = kmalloc(size, GFP_KERNEL); if (!rbd_dev->header_name) return -ENOMEM; -- cgit v1.2.3 From 4caf35f9ecdca1feef1d2e5e223b78e52ffbea87 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Thu, 1 Nov 2012 08:39:27 -0500 Subject: rbd: use kmemdup() This replaces two kmalloc()/memcpy() combinations with a single call to kmemdup(). Signed-off-by: Alex Elder Reviewed-by: David Zafman Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index e01dbb12ad03..d97611e2b4ee 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -3151,11 +3151,9 @@ static inline char *dup_token(const char **buf, size_t *lenp) size_t len; len = next_token(buf); - dup = kmalloc(len + 1, GFP_KERNEL); + dup = kmemdup(*buf, len + 1, GFP_KERNEL); if (!dup) return NULL; - - memcpy(dup, *buf, len); *(dup + len) = '\0'; *buf += len; @@ -3264,10 +3262,9 @@ static int rbd_add_parse_args(const char *buf, ret = -ENAMETOOLONG; goto out_err; } - spec->snap_name = kmalloc(len + 1, GFP_KERNEL); + spec->snap_name = kmemdup(buf, len + 1, GFP_KERNEL); if (!spec->snap_name) goto out_mem; - memcpy(spec->snap_name, buf, len); *(spec->snap_name + len) = '\0'; /* Initialize all rbd options to the defaults */ -- cgit v1.2.3 From 06ecc6cbf7a60cd5abd9fd2bda8ae69b395c2be3 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Thu, 1 Nov 2012 10:17:15 -0500 Subject: rbd: define and use rbd_warn() Define a new function rbd_warn() that produces a boilerplate warning message, identifying in the resulting message the affected rbd device in the best way available. Use it in a few places that now use pr_warning(). Signed-off-by: Alex Elder Reviewed-by: Dan Mick Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 43 +++++++++++++++++++++++++++++++++---------- 1 file changed, 33 insertions(+), 10 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index d97611e2b4ee..635b81d0ebdb 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -294,6 +294,33 @@ static struct device rbd_root_dev = { .release = rbd_root_dev_release, }; +static __printf(2, 3) +void rbd_warn(struct rbd_device *rbd_dev, const char *fmt, ...) +{ + struct va_format vaf; + va_list args; + + va_start(args, fmt); + vaf.fmt = fmt; + vaf.va = &args; + + if (!rbd_dev) + printk(KERN_WARNING "%s: %pV\n", RBD_DRV_NAME, &vaf); + else if (rbd_dev->disk) + printk(KERN_WARNING "%s: %s: %pV\n", + RBD_DRV_NAME, rbd_dev->disk->disk_name, &vaf); + else if (rbd_dev->spec && rbd_dev->spec->image_name) + printk(KERN_WARNING "%s: image %s: %pV\n", + RBD_DRV_NAME, rbd_dev->spec->image_name, &vaf); + else if (rbd_dev->spec && rbd_dev->spec->image_id) + printk(KERN_WARNING "%s: id %s: %pV\n", + RBD_DRV_NAME, rbd_dev->spec->image_id, &vaf); + else /* punt */ + printk(KERN_WARNING "%s: rbd_dev %p: %pV\n", + RBD_DRV_NAME, rbd_dev, &vaf); + va_end(args); +} + #ifdef RBD_DEBUG #define rbd_assert(expr) \ if (unlikely(!(expr))) { \ @@ -1403,8 +1430,8 @@ static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data) (unsigned int) opcode); rc = rbd_dev_refresh(rbd_dev, &hver); if (rc) - pr_warning(RBD_DRV_NAME "%d got notification but failed to " - " update snaps: %d\n", rbd_dev->major, rc); + rbd_warn(rbd_dev, "got notification but failed to " + " update snaps: %d\n", rc); rbd_req_sync_notify_ack(rbd_dev, hver, notify_id); } @@ -1767,15 +1794,13 @@ rbd_dev_v1_header_read(struct rbd_device *rbd_dev, u64 *version) goto out_err; if (WARN_ON((size_t) ret < size)) { ret = -ENXIO; - pr_warning("short header read for image %s" - " (want %zd got %d)\n", - rbd_dev->spec->image_name, size, ret); + rbd_warn(rbd_dev, "short header read (want %zd got %d)", + size, ret); goto out_err; } if (!rbd_dev_ondisk_valid(ondisk)) { ret = -ENXIO; - pr_warning("invalid header for image %s\n", - rbd_dev->spec->image_name); + rbd_warn(rbd_dev, "invalid header"); goto out_err; } @@ -2630,9 +2655,7 @@ static int rbd_dev_probe_update_spec(struct rbd_device *rbd_dev) if (name) rbd_dev->spec->image_name = (char *) name; else - pr_warning(RBD_DRV_NAME "%d " - "unable to get image name for image id %s\n", - rbd_dev->major, rbd_dev->spec->image_id); + rbd_warn(rbd_dev, "unable to get image name"); /* Look up the snapshot name. */ -- cgit v1.2.3 From 4fb5d671399e83d3875593db2f56d5b57fcb104f Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Thu, 1 Nov 2012 10:17:15 -0500 Subject: rbd: add warning messages for missing arguments Tell the user (via dmesg) what was wrong with the arguments provided via /sys/bus/rbd/add. Signed-off-by: Alex Elder Reviewed-by: Dan Mick --- drivers/block/rbd.c | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 635b81d0ebdb..31da8c538480 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -3244,8 +3244,10 @@ static int rbd_add_parse_args(const char *buf, /* The first four tokens are required */ len = next_token(&buf); - if (!len) - return -EINVAL; /* Missing monitor address(es) */ + if (!len) { + rbd_warn(NULL, "no monitor address(es) provided"); + return -EINVAL; + } mon_addrs = buf; mon_addrs_size = len + 1; buf += len; @@ -3254,8 +3256,10 @@ static int rbd_add_parse_args(const char *buf, options = dup_token(&buf, NULL); if (!options) return -ENOMEM; - if (!*options) - goto out_err; /* Missing options */ + if (!*options) { + rbd_warn(NULL, "no options provided"); + goto out_err; + } spec = rbd_spec_alloc(); if (!spec) @@ -3264,14 +3268,18 @@ static int rbd_add_parse_args(const char *buf, spec->pool_name = dup_token(&buf, NULL); if (!spec->pool_name) goto out_mem; - if (!*spec->pool_name) - goto out_err; /* Missing pool name */ + if (!*spec->pool_name) { + rbd_warn(NULL, "no pool name provided"); + goto out_err; + } spec->image_name = dup_token(&buf, NULL); if (!spec->image_name) goto out_mem; - if (!*spec->image_name) - goto out_err; /* Missing image name */ + if (!*spec->image_name) { + rbd_warn(NULL, "no image name provided"); + goto out_err; + } /* * Snapshot name is optional; default is to use "-" -- cgit v1.2.3 From f5400b7a0e78a53edce8960a079aa022640849a1 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Thu, 1 Nov 2012 10:17:15 -0500 Subject: rbd: add a warning in bio_chain_clone_range() Add a warning in bio_chain_clone_range() to help a user determine what exactly might have led to a failure. There is only one; please say something if you disagree with the following reasoning. There are three places this can return abnormally: - Initially, if there is nothing to clone. It turns out that right now this cannot happen anyway. The test is in place because the code below it doesn't work if those conditions don't hold. As such they could be assertions but since I can return a null to indicate an error I just do that instead. I have not added a warning here because it won't happen. - While processing bio's, if none remain but there are supposed to be more bytes to clone. Here I have added a warning. - If bio_clone_range() returns a null pointer. That function will have already produced a warning (at least the first time, via WARN_ON_ONCE()) to distinguish the cause of the error. The only exception is memory exhaustion, and I'd rather not pepper the code with warnings in all those spots. So no warning is added in that place. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'drivers/block') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 31da8c538480..ce6c0cbb3d7a 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -993,8 +993,10 @@ static struct bio *bio_chain_clone_range(struct bio **bio_src, unsigned int bi_size; struct bio *bio; - if (!bi) + if (!bi) { + rbd_warn(NULL, "bio_chain exhausted with %u left", len); goto out_err; /* EINVAL; ran out of bio's */ + } bi_size = min_t(unsigned int, bi->bi_size - off, len); bio = bio_clone_range(bi, off, bi_size, gfpmask); if (!bio) -- cgit v1.2.3 From 935dc89f3e29e2ef1d7c89778cdb9f37ff36e94b Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Thu, 1 Nov 2012 10:17:15 -0500 Subject: rbd: add warnings to rbd_dev_probe_update_spec() Josh suggested adding warnings to this function to help users diagnose problems. Other than memory allocatino errors, there are two places where errors can be returned. Both represent problems that should have been caught earlier, and as such might well have been handled with BUG_ON() calls. But if either ever did manage to happen, it will be reported. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index ce6c0cbb3d7a..530a1217a0fa 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -2644,8 +2644,11 @@ static int rbd_dev_probe_update_spec(struct rbd_device *rbd_dev) osdc = &rbd_dev->rbd_client->client->osdc; name = ceph_pg_pool_name_by_id(osdc->osdmap, rbd_dev->spec->pool_id); - if (!name) - return -EIO; /* pool id too large (>= 2^31) */ + if (!name) { + rbd_warn(rbd_dev, "there is no pool with id %llu", + rbd_dev->spec->pool_id); /* Really a BUG() */ + return -EIO; + } rbd_dev->spec->pool_name = kstrdup(name, GFP_KERNEL); if (!rbd_dev->spec->pool_name) @@ -2663,6 +2666,8 @@ static int rbd_dev_probe_update_spec(struct rbd_device *rbd_dev) name = rbd_snap_name(rbd_dev, rbd_dev->spec->snap_id); if (!name) { + rbd_warn(rbd_dev, "no snapshot with id %llu", + rbd_dev->spec->snap_id); /* Really a BUG() */ ret = -EIO; goto out_err; } -- cgit v1.2.3 From 725afc97c91cd5f71a015143da5095d20cd668b9 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Thu, 8 Nov 2012 08:01:39 -0600 Subject: rbd: standardize rbd_request variable names There are two names used for items of rbd_request structure type: "req" and "req_data". The former name is also used to represent items of pointers to struct ceph_osd_request. Change all variables that have these names so they are instead called "rbd_req" consistently. Signed-off-by: Alex Elder Reviewed-by: Dan Mick Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 50 ++++++++++++++++++++++++++------------------------ 1 file changed, 26 insertions(+), 24 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 530a1217a0fa..0091fa466f83 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -1088,10 +1088,12 @@ static void rbd_coll_end_req_index(struct request *rq, spin_unlock_irq(q->queue_lock); } -static void rbd_coll_end_req(struct rbd_request *req, +static void rbd_coll_end_req(struct rbd_request *rbd_req, int ret, u64 len) { - rbd_coll_end_req_index(req->rq, req->coll, req->coll_index, ret, len); + rbd_coll_end_req_index(rbd_req->rq, + rbd_req->coll, rbd_req->coll_index, + ret, len); } /* @@ -1119,12 +1121,12 @@ static int rbd_do_request(struct request *rq, int ret; u64 bno; struct timespec mtime = CURRENT_TIME; - struct rbd_request *req_data; + struct rbd_request *rbd_req; struct ceph_osd_request_head *reqhead; struct ceph_osd_client *osdc; - req_data = kzalloc(sizeof(*req_data), GFP_NOIO); - if (!req_data) { + rbd_req = kzalloc(sizeof(*rbd_req), GFP_NOIO); + if (!rbd_req) { if (coll) rbd_coll_end_req_index(rq, coll, coll_index, -ENOMEM, len); @@ -1132,8 +1134,8 @@ static int rbd_do_request(struct request *rq, } if (coll) { - req_data->coll = coll; - req_data->coll_index = coll_index; + rbd_req->coll = coll; + rbd_req->coll_index = coll_index; } dout("rbd_do_request object_name=%s ofs=%llu len=%llu coll=%p[%d]\n", @@ -1150,12 +1152,12 @@ static int rbd_do_request(struct request *rq, req->r_callback = rbd_cb; - req_data->rq = rq; - req_data->bio = bio; - req_data->pages = pages; - req_data->len = len; + rbd_req->rq = rq; + rbd_req->bio = bio; + rbd_req->pages = pages; + rbd_req->len = len; - req->r_priv = req_data; + req->r_priv = rbd_req; reqhead = req->r_request->front.iov_base; reqhead->snapid = cpu_to_le64(CEPH_NOSNAP); @@ -1200,11 +1202,11 @@ static int rbd_do_request(struct request *rq, return ret; done_err: - bio_chain_put(req_data->bio); + bio_chain_put(rbd_req->bio); ceph_osdc_put_request(req); done_pages: - rbd_coll_end_req(req_data, ret, len); - kfree(req_data); + rbd_coll_end_req(rbd_req, ret, len); + kfree(rbd_req); return ret; } @@ -1213,7 +1215,7 @@ done_pages: */ static void rbd_req_cb(struct ceph_osd_request *req, struct ceph_msg *msg) { - struct rbd_request *req_data = req->r_priv; + struct rbd_request *rbd_req = req->r_priv; struct ceph_osd_reply_head *replyhead; struct ceph_osd_op *op; __s32 rc; @@ -1232,20 +1234,20 @@ static void rbd_req_cb(struct ceph_osd_request *req, struct ceph_msg *msg) (unsigned long long) bytes, read_op, (int) rc); if (rc == -ENOENT && read_op) { - zero_bio_chain(req_data->bio, 0); + zero_bio_chain(rbd_req->bio, 0); rc = 0; - } else if (rc == 0 && read_op && bytes < req_data->len) { - zero_bio_chain(req_data->bio, bytes); - bytes = req_data->len; + } else if (rc == 0 && read_op && bytes < rbd_req->len) { + zero_bio_chain(rbd_req->bio, bytes); + bytes = rbd_req->len; } - rbd_coll_end_req(req_data, rc, bytes); + rbd_coll_end_req(rbd_req, rc, bytes); - if (req_data->bio) - bio_chain_put(req_data->bio); + if (rbd_req->bio) + bio_chain_put(rbd_req->bio); ceph_osdc_put_request(req); - kfree(req_data); + kfree(rbd_req); } static void rbd_simple_req_cb(struct ceph_osd_request *req, struct ceph_msg *msg) -- cgit v1.2.3 From 5f29ddd4f0954ad6c84e28b934773f128840f064 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Thu, 8 Nov 2012 08:01:39 -0600 Subject: rbd: standardize ceph_osd_request variable names There are spots where a ceph_osds_request pointer variable is given the name "req". Since we're dealing with (at least) three types of requests (block layer, rbd, and osd), I find this slightly distracting. Change such instances to use "osd_req" consistently to make the abstraction represented a little more obvious. Signed-off-by: Alex Elder Reviewed-by: Dan Mick Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 60 +++++++++++++++++++++++++++-------------------------- 1 file changed, 31 insertions(+), 29 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 0091fa466f83..85131de90012 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -1111,12 +1111,12 @@ static int rbd_do_request(struct request *rq, struct ceph_osd_req_op *ops, struct rbd_req_coll *coll, int coll_index, - void (*rbd_cb)(struct ceph_osd_request *req, - struct ceph_msg *msg), + void (*rbd_cb)(struct ceph_osd_request *, + struct ceph_msg *), struct ceph_osd_request **linger_req, u64 *ver) { - struct ceph_osd_request *req; + struct ceph_osd_request *osd_req; struct ceph_file_layout *layout; int ret; u64 bno; @@ -1143,67 +1143,68 @@ static int rbd_do_request(struct request *rq, (unsigned long long) len, coll, coll_index); osdc = &rbd_dev->rbd_client->client->osdc; - req = ceph_osdc_alloc_request(osdc, flags, snapc, ops, + osd_req = ceph_osdc_alloc_request(osdc, flags, snapc, ops, false, GFP_NOIO, pages, bio); - if (!req) { + if (!osd_req) { ret = -ENOMEM; goto done_pages; } - req->r_callback = rbd_cb; + osd_req->r_callback = rbd_cb; rbd_req->rq = rq; rbd_req->bio = bio; rbd_req->pages = pages; rbd_req->len = len; - req->r_priv = rbd_req; + osd_req->r_priv = rbd_req; - reqhead = req->r_request->front.iov_base; + reqhead = osd_req->r_request->front.iov_base; reqhead->snapid = cpu_to_le64(CEPH_NOSNAP); - strncpy(req->r_oid, object_name, sizeof(req->r_oid)); - req->r_oid_len = strlen(req->r_oid); + strncpy(osd_req->r_oid, object_name, sizeof(osd_req->r_oid)); + osd_req->r_oid_len = strlen(osd_req->r_oid); - layout = &req->r_file_layout; + layout = &osd_req->r_file_layout; memset(layout, 0, sizeof(*layout)); layout->fl_stripe_unit = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER); layout->fl_stripe_count = cpu_to_le32(1); layout->fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER); layout->fl_pg_pool = cpu_to_le32((int) rbd_dev->spec->pool_id); ret = ceph_calc_raw_layout(osdc, layout, snapid, ofs, &len, &bno, - req, ops); + osd_req, ops); rbd_assert(ret == 0); - ceph_osdc_build_request(req, ofs, &len, + ceph_osdc_build_request(osd_req, ofs, &len, ops, snapc, &mtime, - req->r_oid, req->r_oid_len); + osd_req->r_oid, osd_req->r_oid_len); if (linger_req) { - ceph_osdc_set_request_linger(osdc, req); - *linger_req = req; + ceph_osdc_set_request_linger(osdc, osd_req); + *linger_req = osd_req; } - ret = ceph_osdc_start_request(osdc, req, false); + ret = ceph_osdc_start_request(osdc, osd_req, false); if (ret < 0) goto done_err; if (!rbd_cb) { - ret = ceph_osdc_wait_request(osdc, req); + u64 version; + + ret = ceph_osdc_wait_request(osdc, osd_req); + version = le64_to_cpu(osd_req->r_reassert_version.version); if (ver) - *ver = le64_to_cpu(req->r_reassert_version.version); - dout("reassert_ver=%llu\n", - (unsigned long long) - le64_to_cpu(req->r_reassert_version.version)); - ceph_osdc_put_request(req); + *ver = version; + dout("reassert_ver=%llu\n", (unsigned long long) version); + ceph_osdc_put_request(osd_req); } return ret; done_err: bio_chain_put(rbd_req->bio); - ceph_osdc_put_request(req); + ceph_osdc_put_request(osd_req); done_pages: rbd_coll_end_req(rbd_req, ret, len); kfree(rbd_req); @@ -1213,9 +1214,9 @@ done_pages: /* * Ceph osd op callback */ -static void rbd_req_cb(struct ceph_osd_request *req, struct ceph_msg *msg) +static void rbd_req_cb(struct ceph_osd_request *osd_req, struct ceph_msg *msg) { - struct rbd_request *rbd_req = req->r_priv; + struct rbd_request *rbd_req = osd_req->r_priv; struct ceph_osd_reply_head *replyhead; struct ceph_osd_op *op; __s32 rc; @@ -1246,13 +1247,14 @@ static void rbd_req_cb(struct ceph_osd_request *req, struct ceph_msg *msg) if (rbd_req->bio) bio_chain_put(rbd_req->bio); - ceph_osdc_put_request(req); + ceph_osdc_put_request(osd_req); kfree(rbd_req); } -static void rbd_simple_req_cb(struct ceph_osd_request *req, struct ceph_msg *msg) +static void rbd_simple_req_cb(struct ceph_osd_request *osd_req, + struct ceph_msg *msg) { - ceph_osdc_put_request(req); + ceph_osdc_put_request(osd_req); } /* -- cgit v1.2.3 From 8986cb37b1cf1f54b35f062f0a12dc68dd89f311 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Thu, 8 Nov 2012 08:01:39 -0600 Subject: rbd: be picky about osd request status type The result field in a ceph osd reply header is a signed 32-bit type, but rbd code often casually uses int to represent it. The following changes the types of variables that handle this result value to be "s32" instead of "int" to be completely explicit about it. Only at the point we pass that result to __blk_end_request() does the type get converted to the plain old int defined for that interface. There is almost certainly no binary impact of this change, but I prefer to show the exact size and signedness of the value since we know it. Signed-off-by: Alex Elder Reviewed-by: Dan Mick --- drivers/block/rbd.c | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 85131de90012..8d93b6a649d8 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -171,7 +171,7 @@ struct rbd_client { */ struct rbd_req_status { int done; - int rc; + s32 rc; u64 bytes; }; @@ -1053,13 +1053,13 @@ static void rbd_destroy_ops(struct ceph_osd_req_op *ops) static void rbd_coll_end_req_index(struct request *rq, struct rbd_req_coll *coll, int index, - int ret, u64 len) + s32 ret, u64 len) { struct request_queue *q; int min, max, i; dout("rbd_coll_end_req_index %p index %d ret %d len %llu\n", - coll, index, ret, (unsigned long long) len); + coll, index, (int)ret, (unsigned long long)len); if (!rq) return; @@ -1080,7 +1080,7 @@ static void rbd_coll_end_req_index(struct request *rq, max++; for (i = min; istatus[i].rc, + __blk_end_request(rq, (int)coll->status[i].rc, coll->status[i].bytes); coll->num_done++; kref_put(&coll->kref, rbd_coll_release); @@ -1089,7 +1089,7 @@ static void rbd_coll_end_req_index(struct request *rq, } static void rbd_coll_end_req(struct rbd_request *rbd_req, - int ret, u64 len) + s32 ret, u64 len) { rbd_coll_end_req_index(rbd_req->rq, rbd_req->coll, rbd_req->coll_index, @@ -1129,7 +1129,7 @@ static int rbd_do_request(struct request *rq, if (!rbd_req) { if (coll) rbd_coll_end_req_index(rq, coll, coll_index, - -ENOMEM, len); + (s32)-ENOMEM, len); return -ENOMEM; } @@ -1206,7 +1206,7 @@ done_err: bio_chain_put(rbd_req->bio); ceph_osdc_put_request(osd_req); done_pages: - rbd_coll_end_req(rbd_req, ret, len); + rbd_coll_end_req(rbd_req, (s32)ret, len); kfree(rbd_req); return ret; } @@ -1219,7 +1219,7 @@ static void rbd_req_cb(struct ceph_osd_request *osd_req, struct ceph_msg *msg) struct rbd_request *rbd_req = osd_req->r_priv; struct ceph_osd_reply_head *replyhead; struct ceph_osd_op *op; - __s32 rc; + s32 rc; u64 bytes; int read_op; @@ -1227,14 +1227,14 @@ static void rbd_req_cb(struct ceph_osd_request *osd_req, struct ceph_msg *msg) replyhead = msg->front.iov_base; WARN_ON(le32_to_cpu(replyhead->num_ops) == 0); op = (void *)(replyhead + 1); - rc = le32_to_cpu(replyhead->result); + rc = (s32)le32_to_cpu(replyhead->result); bytes = le64_to_cpu(op->extent.length); read_op = (le16_to_cpu(op->op) == CEPH_OSD_OP_READ); dout("rbd_req_cb bytes=%llu readop=%d rc=%d\n", (unsigned long long) bytes, read_op, (int) rc); - if (rc == -ENOENT && read_op) { + if (rc == (s32)-ENOENT && read_op) { zero_bio_chain(rbd_req->bio, 0); rc = 0; } else if (rc == 0 && read_op && bytes < rbd_req->len) { @@ -1679,7 +1679,8 @@ static void rbd_rq_fn(struct request_queue *q) bio_chain, coll, cur_seg); else rbd_coll_end_req_index(rq, coll, cur_seg, - -ENOMEM, chain_size); + (s32)-ENOMEM, + chain_size); size -= chain_size; ofs += chain_size; -- cgit v1.2.3 From 8295cda7ceceb7b25f9a12cd21bbfb6206e28a6d Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Thu, 8 Nov 2012 08:01:39 -0600 Subject: rbd: encapsulate handling for a single request In rbd_rq_fn(), requests are fetched from the block layer and each request is processed, looping through the request's list of bio's until they've all been consumed. Separate the handling for a single request into its own function to make it a bit easier to see what's going on. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 119 +++++++++++++++++++++++++++------------------------- 1 file changed, 63 insertions(+), 56 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 8d93b6a649d8..738d1e4c0ab5 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -1583,6 +1583,64 @@ static struct rbd_req_coll *rbd_alloc_coll(int num_reqs) return coll; } +static int rbd_dev_do_request(struct request *rq, + struct rbd_device *rbd_dev, + struct ceph_snap_context *snapc, + u64 ofs, unsigned int size, + struct bio *bio_chain) +{ + int num_segs; + struct rbd_req_coll *coll; + unsigned int bio_offset; + int cur_seg = 0; + + dout("%s 0x%x bytes at 0x%llx\n", + rq_data_dir(rq) == WRITE ? "write" : "read", + size, (unsigned long long) blk_rq_pos(rq) * SECTOR_SIZE); + + num_segs = rbd_get_num_segments(&rbd_dev->header, ofs, size); + if (num_segs <= 0) + return num_segs; + + coll = rbd_alloc_coll(num_segs); + if (!coll) + return -ENOMEM; + + bio_offset = 0; + do { + u64 limit = rbd_segment_length(rbd_dev, ofs, size); + unsigned int clone_size; + struct bio *bio_clone; + + BUG_ON(limit > (u64)UINT_MAX); + clone_size = (unsigned int)limit; + dout("bio_chain->bi_vcnt=%hu\n", bio_chain->bi_vcnt); + + kref_get(&coll->kref); + + /* Pass a cloned bio chain via an osd request */ + + bio_clone = bio_chain_clone_range(&bio_chain, + &bio_offset, clone_size, + GFP_ATOMIC); + if (bio_clone) + (void)rbd_do_op(rq, rbd_dev, snapc, + ofs, clone_size, + bio_clone, coll, cur_seg); + else + rbd_coll_end_req_index(rq, coll, cur_seg, + (s32)-ENOMEM, + clone_size); + size -= clone_size; + ofs += clone_size; + + cur_seg++; + } while (size > 0); + kref_put(&coll->kref, rbd_coll_release); + + return 0; +} + /* * block device queue callback */ @@ -1596,10 +1654,8 @@ static void rbd_rq_fn(struct request_queue *q) bool do_write; unsigned int size; u64 ofs; - int num_segs, cur_seg = 0; - struct rbd_req_coll *coll; struct ceph_snap_context *snapc; - unsigned int bio_offset; + int result; dout("fetched request\n"); @@ -1637,60 +1693,11 @@ static void rbd_rq_fn(struct request_queue *q) ofs = blk_rq_pos(rq) * SECTOR_SIZE; bio = rq->bio; - dout("%s 0x%x bytes at 0x%llx\n", - do_write ? "write" : "read", - size, (unsigned long long) blk_rq_pos(rq) * SECTOR_SIZE); - - num_segs = rbd_get_num_segments(&rbd_dev->header, ofs, size); - if (num_segs <= 0) { - spin_lock_irq(q->queue_lock); - __blk_end_request_all(rq, num_segs); - ceph_put_snap_context(snapc); - continue; - } - coll = rbd_alloc_coll(num_segs); - if (!coll) { - spin_lock_irq(q->queue_lock); - __blk_end_request_all(rq, -ENOMEM); - ceph_put_snap_context(snapc); - continue; - } - - bio_offset = 0; - do { - u64 limit = rbd_segment_length(rbd_dev, ofs, size); - unsigned int chain_size; - struct bio *bio_chain; - - BUG_ON(limit > (u64) UINT_MAX); - chain_size = (unsigned int) limit; - dout("rq->bio->bi_vcnt=%hu\n", rq->bio->bi_vcnt); - - kref_get(&coll->kref); - - /* Pass a cloned bio chain via an osd request */ - - bio_chain = bio_chain_clone_range(&bio, - &bio_offset, chain_size, - GFP_ATOMIC); - if (bio_chain) - (void) rbd_do_op(rq, rbd_dev, snapc, - ofs, chain_size, - bio_chain, coll, cur_seg); - else - rbd_coll_end_req_index(rq, coll, cur_seg, - (s32)-ENOMEM, - chain_size); - size -= chain_size; - ofs += chain_size; - - cur_seg++; - } while (size > 0); - kref_put(&coll->kref, rbd_coll_release); - - spin_lock_irq(q->queue_lock); - + result = rbd_dev_do_request(rq, rbd_dev, snapc, ofs, size, bio); ceph_put_snap_context(snapc); + spin_lock_irq(q->queue_lock); + if (!size || result < 0) + __blk_end_request_all(rq, result); } } -- cgit v1.2.3 From cd323ac0eb433b14cbb270bfc5a82f308f2662de Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Thu, 8 Nov 2012 08:01:39 -0600 Subject: rbd: end request on error in rbd_do_request() caller Only one of the three callers of rbd_do_request() provide a collection structure to aggregate status. If an error occurs in rbd_do_request(), have the caller take care of calling rbd_coll_end_req() if necessary in that one spot. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 738d1e4c0ab5..21468d0b2792 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -1126,12 +1126,8 @@ static int rbd_do_request(struct request *rq, struct ceph_osd_client *osdc; rbd_req = kzalloc(sizeof(*rbd_req), GFP_NOIO); - if (!rbd_req) { - if (coll) - rbd_coll_end_req_index(rq, coll, coll_index, - (s32)-ENOMEM, len); + if (!rbd_req) return -ENOMEM; - } if (coll) { rbd_req->coll = coll; @@ -1206,7 +1202,6 @@ done_err: bio_chain_put(rbd_req->bio); ceph_osdc_put_request(osd_req); done_pages: - rbd_coll_end_req(rbd_req, (s32)ret, len); kfree(rbd_req); return ret; } @@ -1359,7 +1354,9 @@ static int rbd_do_op(struct request *rq, ops, coll, coll_index, rbd_req_cb, 0, NULL); - + if (ret < 0) + rbd_coll_end_req_index(rq, coll, coll_index, + (s32)ret, seg_len); rbd_destroy_ops(ops); done: kfree(seg_name); -- cgit v1.2.3 From b395e8b5b8f06399e3fe3ee016c9cf41ff665efc Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Thu, 8 Nov 2012 08:01:39 -0600 Subject: rbd: a little more cleanup of rbd_rq_fn() Now that a big hunk in the middle of rbd_rq_fn() has been moved into its own routine we can simplify it a little more. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 50 ++++++++++++++++++++++++-------------------------- 1 file changed, 24 insertions(+), 26 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 21468d0b2792..9d49e5b888d8 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -1644,53 +1644,51 @@ static int rbd_dev_do_request(struct request *rq, static void rbd_rq_fn(struct request_queue *q) { struct rbd_device *rbd_dev = q->queuedata; + bool read_only = rbd_dev->mapping.read_only; struct request *rq; while ((rq = blk_fetch_request(q))) { - struct bio *bio; - bool do_write; - unsigned int size; - u64 ofs; - struct ceph_snap_context *snapc; + struct ceph_snap_context *snapc = NULL; + unsigned int size = 0; int result; dout("fetched request\n"); - /* filter out block requests we don't understand */ + /* Filter out block requests we don't understand */ + if ((rq->cmd_type != REQ_TYPE_FS)) { __blk_end_request_all(rq, 0); continue; } + spin_unlock_irq(q->queue_lock); - /* deduce our operation (read, write) */ - do_write = (rq_data_dir(rq) == WRITE); - if (do_write && rbd_dev->mapping.read_only) { - __blk_end_request_all(rq, -EROFS); - continue; - } + /* Stop writes to a read-only device */ - spin_unlock_irq(q->queue_lock); + result = -EROFS; + if (read_only && rq_data_dir(rq) == WRITE) + goto out_end_request; + + /* Grab a reference to the snapshot context */ down_read(&rbd_dev->header_rwsem); + if (rbd_dev->exists) { + snapc = ceph_get_snap_context(rbd_dev->header.snapc); + rbd_assert(snapc != NULL); + } + up_read(&rbd_dev->header_rwsem); - if (!rbd_dev->exists) { + if (!snapc) { rbd_assert(rbd_dev->spec->snap_id != CEPH_NOSNAP); - up_read(&rbd_dev->header_rwsem); dout("request for non-existent snapshot"); - spin_lock_irq(q->queue_lock); - __blk_end_request_all(rq, -ENXIO); - continue; + result = -ENXIO; + goto out_end_request; } - snapc = ceph_get_snap_context(rbd_dev->header.snapc); - - up_read(&rbd_dev->header_rwsem); - size = blk_rq_bytes(rq); - ofs = blk_rq_pos(rq) * SECTOR_SIZE; - bio = rq->bio; - - result = rbd_dev_do_request(rq, rbd_dev, snapc, ofs, size, bio); + result = rbd_dev_do_request(rq, rbd_dev, snapc, + blk_rq_pos(rq) * SECTOR_SIZE, + size, rq->bio); +out_end_request: ceph_put_snap_context(snapc); spin_lock_irq(q->queue_lock); if (!size || result < 0) -- cgit v1.2.3 From d78b650a595e23e5a115d332e3c37e019baf7703 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Fri, 9 Nov 2012 08:43:15 -0600 Subject: rbd: make exists flag atomic The rbd_device->exists field can be updated asynchronously, changing from set to clear if a mapped snapshot disappears from the base image's snapshot context. Currently, value of the "exists" flag is only read and modified under protection of the header semaphore, but that will change with the next patch. Making it atomic ensures this won't be a problem because the a the non-existence of device will be immediately known. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 9d49e5b888d8..2846536d446e 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -229,7 +229,7 @@ struct rbd_device { spinlock_t lock; /* queue lock */ struct rbd_image_header header; - bool exists; + atomic_t exists; struct rbd_spec *spec; char *header_name; @@ -751,7 +751,7 @@ static int rbd_dev_set_mapping(struct rbd_device *rbd_dev) goto done; rbd_dev->mapping.read_only = true; } - rbd_dev->exists = true; + atomic_set(&rbd_dev->exists, 1); done: return ret; } @@ -1671,7 +1671,7 @@ static void rbd_rq_fn(struct request_queue *q) /* Grab a reference to the snapshot context */ down_read(&rbd_dev->header_rwsem); - if (rbd_dev->exists) { + if (atomic_read(&rbd_dev->exists)) { snapc = ceph_get_snap_context(rbd_dev->header.snapc); rbd_assert(snapc != NULL); } @@ -2294,6 +2294,7 @@ struct rbd_device *rbd_dev_create(struct rbd_client *rbdc, return NULL; spin_lock_init(&rbd_dev->lock); + atomic_set(&rbd_dev->exists, 0); INIT_LIST_HEAD(&rbd_dev->node); INIT_LIST_HEAD(&rbd_dev->snaps); init_rwsem(&rbd_dev->header_rwsem); @@ -2918,7 +2919,7 @@ static int rbd_dev_snaps_update(struct rbd_device *rbd_dev) /* Existing snapshot not in the new snap context */ if (rbd_dev->spec->snap_id == snap->id) - rbd_dev->exists = false; + atomic_set(&rbd_dev->exists, 0); rbd_remove_snap_dev(snap); dout("%ssnap id %llu has been removed\n", rbd_dev->spec->snap_id == snap->id ? -- cgit v1.2.3 From a7b4c65f4f15aa657b09d13da8f45ba0b72ec0df Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Fri, 9 Nov 2012 08:43:15 -0600 Subject: rbd: only get snap context for write requests Right now we get the snapshot context for an rbd image (under protection of the header semaphore) for every request processed. There's no need to get the snap context if we're doing a read, so avoid doing so in that case. Note that we no longer need to hold the header semaphore to check the rbd_dev's existence flag. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 36 ++++++++++++++++++++---------------- 1 file changed, 20 insertions(+), 16 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 2846536d446e..0fbe9add0446 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -1331,7 +1331,7 @@ static int rbd_do_op(struct request *rq, } else { opcode = CEPH_OSD_OP_READ; flags = CEPH_OSD_FLAG_READ; - snapc = NULL; + rbd_assert(!snapc); snapid = rbd_dev->spec->snap_id; payload_len = 0; } @@ -1662,22 +1662,25 @@ static void rbd_rq_fn(struct request_queue *q) } spin_unlock_irq(q->queue_lock); - /* Stop writes to a read-only device */ - - result = -EROFS; - if (read_only && rq_data_dir(rq) == WRITE) - goto out_end_request; - - /* Grab a reference to the snapshot context */ - - down_read(&rbd_dev->header_rwsem); - if (atomic_read(&rbd_dev->exists)) { + /* Write requests need a reference to the snapshot context */ + + if (rq_data_dir(rq) == WRITE) { + result = -EROFS; + if (read_only) /* Can't write to a read-only device */ + goto out_end_request; + + /* + * Note that each osd request will take its + * own reference to the snapshot context + * supplied. The reference we take here + * just guarantees the one we provide stays + * valid. + */ + down_read(&rbd_dev->header_rwsem); snapc = ceph_get_snap_context(rbd_dev->header.snapc); + up_read(&rbd_dev->header_rwsem); rbd_assert(snapc != NULL); - } - up_read(&rbd_dev->header_rwsem); - - if (!snapc) { + } else if (!atomic_read(&rbd_dev->exists)) { rbd_assert(rbd_dev->spec->snap_id != CEPH_NOSNAP); dout("request for non-existent snapshot"); result = -ENXIO; @@ -1689,7 +1692,8 @@ static void rbd_rq_fn(struct request_queue *q) blk_rq_pos(rq) * SECTOR_SIZE, size, rq->bio); out_end_request: - ceph_put_snap_context(snapc); + if (snapc) + ceph_put_snap_context(snapc); spin_lock_irq(q->queue_lock); if (!size || result < 0) __blk_end_request_all(rq, result); -- cgit v1.2.3 From 0ec8ce87f3bb5e4a561190f5320934e003405b6f Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Fri, 9 Nov 2012 08:43:15 -0600 Subject: rbd: separate layout init Pull a block of code that initializes the layout structure in an osd request into its own function so it can be reused. Signed-off-by: Alex Elder Reviewed-by: Dan Mick Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 0fbe9add0446..d4e93a28fb6a 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -54,6 +54,7 @@ /* It might be useful to have this defined elsewhere too */ +#define U32_MAX ((u32) (~0U)) #define U64_MAX ((u64) (~0ULL)) #define RBD_DRV_NAME "rbd" @@ -1096,6 +1097,16 @@ static void rbd_coll_end_req(struct rbd_request *rbd_req, ret, len); } +static void rbd_layout_init(struct ceph_file_layout *layout, u64 pool_id) +{ + memset(layout, 0, sizeof (*layout)); + layout->fl_stripe_unit = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER); + layout->fl_stripe_count = cpu_to_le32(1); + layout->fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER); + rbd_assert(pool_id <= (u64) U32_MAX); + layout->fl_pg_pool = cpu_to_le32((u32) pool_id); +} + /* * Send ceph osd request */ @@ -1117,7 +1128,6 @@ static int rbd_do_request(struct request *rq, u64 *ver) { struct ceph_osd_request *osd_req; - struct ceph_file_layout *layout; int ret; u64 bno; struct timespec mtime = CURRENT_TIME; @@ -1161,14 +1171,9 @@ static int rbd_do_request(struct request *rq, strncpy(osd_req->r_oid, object_name, sizeof(osd_req->r_oid)); osd_req->r_oid_len = strlen(osd_req->r_oid); - layout = &osd_req->r_file_layout; - memset(layout, 0, sizeof(*layout)); - layout->fl_stripe_unit = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER); - layout->fl_stripe_count = cpu_to_le32(1); - layout->fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER); - layout->fl_pg_pool = cpu_to_le32((int) rbd_dev->spec->pool_id); - ret = ceph_calc_raw_layout(osdc, layout, snapid, ofs, &len, &bno, - osd_req, ops); + rbd_layout_init(&osd_req->r_file_layout, rbd_dev->spec->pool_id); + ret = ceph_calc_raw_layout(osdc, &osd_req->r_file_layout, + snapid, ofs, &len, &bno, osd_req, ops); rbd_assert(ret == 0); ceph_osdc_build_request(osd_req, ofs, &len, -- cgit v1.2.3 From af77f26caa35a95af09d1dac5c513b3901de7e37 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Fri, 9 Nov 2012 08:43:15 -0600 Subject: rbd: drop oid parameters from ceph_osdc_build_request() The last two parameters to ceph_osd_build_request() describe the object id, but the values passed always come from the osd request structure whose address is also provided. Get rid of those last two parameters. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 6 +----- include/linux/ceph/osd_client.h | 4 +--- net/ceph/osd_client.c | 13 +++++-------- 3 files changed, 7 insertions(+), 16 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index d4e93a28fb6a..9a701effa0ef 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -1176,11 +1176,7 @@ static int rbd_do_request(struct request *rq, snapid, ofs, &len, &bno, osd_req, ops); rbd_assert(ret == 0); - ceph_osdc_build_request(osd_req, ofs, &len, - ops, - snapc, - &mtime, - osd_req->r_oid, osd_req->r_oid_len); + ceph_osdc_build_request(osd_req, ofs, &len, ops, snapc, &mtime); if (linger_req) { ceph_osdc_set_request_linger(osdc, osd_req); diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index d9b880e977e6..f2e5d2cdca06 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -227,9 +227,7 @@ extern void ceph_osdc_build_request(struct ceph_osd_request *req, u64 off, u64 *plen, struct ceph_osd_req_op *src_ops, struct ceph_snap_context *snapc, - struct timespec *mtime, - const char *oid, - int oid_len); + struct timespec *mtime); extern struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *, struct ceph_file_layout *layout, diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index eade41bb7102..7d38327a8e89 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -376,9 +376,7 @@ void ceph_osdc_build_request(struct ceph_osd_request *req, u64 off, u64 *plen, struct ceph_osd_req_op *src_ops, struct ceph_snap_context *snapc, - struct timespec *mtime, - const char *oid, - int oid_len) + struct timespec *mtime) { struct ceph_msg *msg = req->r_request; struct ceph_osd_request_head *head; @@ -405,9 +403,9 @@ void ceph_osdc_build_request(struct ceph_osd_request *req, /* fill in oid */ - head->object_len = cpu_to_le32(oid_len); - memcpy(p, oid, oid_len); - p += oid_len; + head->object_len = cpu_to_le32(req->r_oid_len); + memcpy(p, req->r_oid, req->r_oid_len); + p += req->r_oid_len; src_op = src_ops; while (src_op->op) { @@ -506,8 +504,7 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc, ceph_osdc_build_request(req, off, plen, ops, snapc, - mtime, - req->r_oid, req->r_oid_len); + mtime); return req; } -- cgit v1.2.3 From 4775618d9255c0c135580bbee8bee6815f8194cf Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Fri, 9 Nov 2012 08:43:15 -0600 Subject: rbd: drop snapid parameter from rbd_req_sync_read() There is only one caller of rbd_req_sync_read(), and it passes CEPH_NOSNAP as the snapshot id argument. Delete that parameter and just use CEPH_NOSNAP within the function. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 9a701effa0ef..07e064482cd8 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -1368,7 +1368,6 @@ done: * Request sync osd read */ static int rbd_req_sync_read(struct rbd_device *rbd_dev, - u64 snapid, const char *object_name, u64 ofs, u64 len, char *buf, @@ -1382,7 +1381,7 @@ static int rbd_req_sync_read(struct rbd_device *rbd_dev, return -ENOMEM; ret = rbd_req_sync_op(rbd_dev, NULL, - snapid, + CEPH_NOSNAP, CEPH_OSD_FLAG_READ, ops, object_name, ofs, len, buf, NULL, ver); rbd_destroy_ops(ops); @@ -1799,8 +1798,7 @@ rbd_dev_v1_header_read(struct rbd_device *rbd_dev, u64 *version) if (!ondisk) return ERR_PTR(-ENOMEM); - ret = rbd_req_sync_read(rbd_dev, CEPH_NOSNAP, - rbd_dev->header_name, + ret = rbd_req_sync_read(rbd_dev, rbd_dev->header_name, 0, size, (char *) ondisk, version); -- cgit v1.2.3 From 07b2391fbbcefdecbc2f16321f8e454802e0b926 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Fri, 9 Nov 2012 08:43:16 -0600 Subject: rbd: drop flags parameter from rbd_req_sync_exec() All callers of rbd_req_sync_exec() pass CEPH_OSD_FLAG_READ as their flags argument. Delete that parameter and use CEPH_OSD_FLAG_READ within the function. If we find a need to support write operations we can add it back again. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 27 +++++++++------------------ 1 file changed, 9 insertions(+), 18 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 07e064482cd8..5d48bcd1709b 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -1524,7 +1524,6 @@ static int rbd_req_sync_exec(struct rbd_device *rbd_dev, size_t outbound_size, char *inbound, size_t inbound_size, - int flags, u64 *ver) { struct ceph_osd_req_op *ops; @@ -1555,8 +1554,7 @@ static int rbd_req_sync_exec(struct rbd_device *rbd_dev, ops[0].cls.indata_len = outbound_size; ret = rbd_req_sync_op(rbd_dev, NULL, - CEPH_NOSNAP, - flags, ops, + CEPH_NOSNAP, CEPH_OSD_FLAG_READ, ops, object_name, 0, inbound_size, inbound, NULL, ver); @@ -2418,8 +2416,7 @@ static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id, ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name, "rbd", "get_size", (char *) &snapid, sizeof (snapid), - (char *) &size_buf, sizeof (size_buf), - CEPH_OSD_FLAG_READ, NULL); + (char *) &size_buf, sizeof (size_buf), NULL); dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret); if (ret < 0) return ret; @@ -2454,8 +2451,7 @@ static int rbd_dev_v2_object_prefix(struct rbd_device *rbd_dev) ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name, "rbd", "get_object_prefix", NULL, 0, - reply_buf, RBD_OBJ_PREFIX_LEN_MAX, - CEPH_OSD_FLAG_READ, NULL); + reply_buf, RBD_OBJ_PREFIX_LEN_MAX, NULL); dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret); if (ret < 0) goto out; @@ -2494,7 +2490,7 @@ static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id, "rbd", "get_features", (char *) &snapid, sizeof (snapid), (char *) &features_buf, sizeof (features_buf), - CEPH_OSD_FLAG_READ, NULL); + NULL); dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret); if (ret < 0) return ret; @@ -2549,8 +2545,7 @@ static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev) ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name, "rbd", "get_parent", (char *) &snapid, sizeof (snapid), - (char *) reply_buf, size, - CEPH_OSD_FLAG_READ, NULL); + (char *) reply_buf, size, NULL); dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret); if (ret < 0) goto out_err; @@ -2615,8 +2610,7 @@ static char *rbd_dev_image_name(struct rbd_device *rbd_dev) ret = rbd_req_sync_exec(rbd_dev, RBD_DIRECTORY, "rbd", "dir_get_name", image_id, image_id_size, - (char *) reply_buf, size, - CEPH_OSD_FLAG_READ, NULL); + (char *) reply_buf, size, NULL); if (ret < 0) goto out; p = reply_buf; @@ -2722,8 +2716,7 @@ static int rbd_dev_v2_snap_context(struct rbd_device *rbd_dev, u64 *ver) ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name, "rbd", "get_snapcontext", NULL, 0, - reply_buf, size, - CEPH_OSD_FLAG_READ, ver); + reply_buf, size, ver); dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret); if (ret < 0) goto out; @@ -2792,8 +2785,7 @@ static char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev, u32 which) ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name, "rbd", "get_snapshot_name", (char *) &snap_id, sizeof (snap_id), - reply_buf, size, - CEPH_OSD_FLAG_READ, NULL); + reply_buf, size, NULL); dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret); if (ret < 0) goto out; @@ -3401,8 +3393,7 @@ static int rbd_dev_image_id(struct rbd_device *rbd_dev) ret = rbd_req_sync_exec(rbd_dev, object_name, "rbd", "get_id", NULL, 0, - response, RBD_IMAGE_ID_LEN_MAX, - CEPH_OSD_FLAG_READ, NULL); + response, RBD_IMAGE_ID_LEN_MAX, NULL); dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret); if (ret < 0) goto out; -- cgit v1.2.3 From 25704ac9de30ac3e73c123e7b2734f7ca744c8d8 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Fri, 9 Nov 2012 08:43:16 -0600 Subject: rbd: kill rbd_req_sync_op() snapc and snapid parameters The snapc and snapid parameters to rbd_req_sync_op() always take the values NULL and CEPH_NOSNAP, respectively. So just get rid of them and use those values where needed. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 17 +++++------------ 1 file changed, 5 insertions(+), 12 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 5d48bcd1709b..85c5852378d2 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -1257,8 +1257,6 @@ static void rbd_simple_req_cb(struct ceph_osd_request *osd_req, * Do a synchronous ceph osd operation */ static int rbd_req_sync_op(struct rbd_device *rbd_dev, - struct ceph_snap_context *snapc, - u64 snapid, int flags, struct ceph_osd_req_op *ops, const char *object_name, @@ -1278,7 +1276,7 @@ static int rbd_req_sync_op(struct rbd_device *rbd_dev, if (IS_ERR(pages)) return PTR_ERR(pages); - ret = rbd_do_request(NULL, rbd_dev, snapc, snapid, + ret = rbd_do_request(NULL, rbd_dev, NULL, CEPH_NOSNAP, object_name, ofs, inbound_size, NULL, pages, num_pages, flags, @@ -1380,9 +1378,7 @@ static int rbd_req_sync_read(struct rbd_device *rbd_dev, if (!ops) return -ENOMEM; - ret = rbd_req_sync_op(rbd_dev, NULL, - CEPH_NOSNAP, - CEPH_OSD_FLAG_READ, + ret = rbd_req_sync_op(rbd_dev, CEPH_OSD_FLAG_READ, ops, object_name, ofs, len, buf, NULL, ver); rbd_destroy_ops(ops); @@ -1461,8 +1457,7 @@ static int rbd_req_sync_watch(struct rbd_device *rbd_dev) ops[0].watch.cookie = cpu_to_le64(rbd_dev->watch_event->cookie); ops[0].watch.flag = 1; - ret = rbd_req_sync_op(rbd_dev, NULL, - CEPH_NOSNAP, + ret = rbd_req_sync_op(rbd_dev, CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK, ops, rbd_dev->header_name, @@ -1499,8 +1494,7 @@ static int rbd_req_sync_unwatch(struct rbd_device *rbd_dev) ops[0].watch.cookie = cpu_to_le64(rbd_dev->watch_event->cookie); ops[0].watch.flag = 0; - ret = rbd_req_sync_op(rbd_dev, NULL, - CEPH_NOSNAP, + ret = rbd_req_sync_op(rbd_dev, CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK, ops, rbd_dev->header_name, @@ -1553,8 +1547,7 @@ static int rbd_req_sync_exec(struct rbd_device *rbd_dev, ops[0].cls.indata = outbound; ops[0].cls.indata_len = outbound_size; - ret = rbd_req_sync_op(rbd_dev, NULL, - CEPH_NOSNAP, CEPH_OSD_FLAG_READ, ops, + ret = rbd_req_sync_op(rbd_dev, CEPH_OSD_FLAG_READ, ops, object_name, 0, inbound_size, inbound, NULL, ver); -- cgit v1.2.3 From 7c3d22cf16f1bbcb37a73e88338c042bb49ff112 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Fri, 9 Nov 2012 12:50:10 -0600 Subject: rbd: don't bother setting snapid in rbd_do_request() For some reason, the snapid field of the osd request header is explicitly set to CEPH_NOSNAP in rbd_do_request(). Just a few lines later--with no code that would access this field in between--a call is made to ceph_calc_raw_layout() passing the snapid provided to rbd_do_request(), which encodes the snapid value it is provided into that field instead. In other words, there is no need to fill in CEPH_NOSNAP, and doing so suggests it might be necessary. Don't do that any more. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 85c5852378d2..54bd9fc3ef7c 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -1132,7 +1132,6 @@ static int rbd_do_request(struct request *rq, u64 bno; struct timespec mtime = CURRENT_TIME; struct rbd_request *rbd_req; - struct ceph_osd_request_head *reqhead; struct ceph_osd_client *osdc; rbd_req = kzalloc(sizeof(*rbd_req), GFP_NOIO); @@ -1165,9 +1164,6 @@ static int rbd_do_request(struct request *rq, osd_req->r_priv = rbd_req; - reqhead = osd_req->r_request->front.iov_base; - reqhead->snapid = cpu_to_le64(CEPH_NOSNAP); - strncpy(osd_req->r_oid, object_name, sizeof(osd_req->r_oid)); osd_req->r_oid_len = strlen(osd_req->r_oid); -- cgit v1.2.3 From 0120be3c60d46d6d55f4bf7a3d654cc705eb0c54 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Wed, 14 Nov 2012 09:38:19 -0600 Subject: libceph: pass length to ceph_osdc_build_request() The len argument to ceph_osdc_build_request() is set up to be passed by address, but that function never updates its value so there's no need to do this. Tighten up the interface by passing the length directly. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 2 +- include/linux/ceph/osd_client.h | 2 +- net/ceph/osd_client.c | 6 +++--- 3 files changed, 5 insertions(+), 5 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 54bd9fc3ef7c..c1b135b6cb97 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -1172,7 +1172,7 @@ static int rbd_do_request(struct request *rq, snapid, ofs, &len, &bno, osd_req, ops); rbd_assert(ret == 0); - ceph_osdc_build_request(osd_req, ofs, &len, ops, snapc, &mtime); + ceph_osdc_build_request(osd_req, ofs, len, ops, snapc, &mtime); if (linger_req) { ceph_osdc_set_request_linger(osdc, osd_req); diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index 61562c792855..4bfb4582439a 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -224,7 +224,7 @@ extern struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client * struct bio *bio); extern void ceph_osdc_build_request(struct ceph_osd_request *req, - u64 off, u64 *plen, + u64 off, u64 len, struct ceph_osd_req_op *src_ops, struct ceph_snap_context *snapc, struct timespec *mtime); diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 37d43d5b828c..e29a3ed92958 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -336,7 +336,7 @@ static void osd_req_encode_op(struct ceph_osd_request *req, * */ void ceph_osdc_build_request(struct ceph_osd_request *req, - u64 off, u64 *plen, + u64 off, u64 len, struct ceph_osd_req_op *src_ops, struct ceph_snap_context *snapc, struct timespec *mtime) @@ -390,7 +390,7 @@ void ceph_osdc_build_request(struct ceph_osd_request *req, if (flags & CEPH_OSD_FLAG_WRITE) { req->r_request->hdr.data_off = cpu_to_le16(off); - req->r_request->hdr.data_len = cpu_to_le32(*plen + data_len); + req->r_request->hdr.data_len = cpu_to_le32(len + data_len); } else if (data_len) { req->r_request->hdr.data_off = 0; req->r_request->hdr.data_len = cpu_to_le32(data_len); @@ -464,7 +464,7 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc, req->r_num_pages = calc_pages_for(page_align, *plen); req->r_page_alignment = page_align; - ceph_osdc_build_request(req, off, plen, ops, + ceph_osdc_build_request(req, off, *plen, ops, snapc, mtime); -- cgit v1.2.3 From 4d6b250bf18d44571d69a0f4afec4b6a1969729f Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Tue, 13 Nov 2012 21:11:15 -0600 Subject: libceph: drop snapid in ceph_calc_raw_layout() A snapshot id must be provided to ceph_calc_raw_layout() even though it is not needed at all for calculating the layout. Where the snapshot id *is* needed is when building the request message for an osd operation. Drop the snapid parameter from ceph_calc_raw_layout() and pass that value instead in ceph_osdc_build_request(). Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 4 ++-- include/linux/ceph/osd_client.h | 2 +- net/ceph/osd_client.c | 14 ++++---------- 3 files changed, 7 insertions(+), 13 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index c1b135b6cb97..fa371868e9b0 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -1169,10 +1169,10 @@ static int rbd_do_request(struct request *rq, rbd_layout_init(&osd_req->r_file_layout, rbd_dev->spec->pool_id); ret = ceph_calc_raw_layout(osdc, &osd_req->r_file_layout, - snapid, ofs, &len, &bno, osd_req, ops); + ofs, &len, &bno, osd_req, ops); rbd_assert(ret == 0); - ceph_osdc_build_request(osd_req, ofs, len, ops, snapc, &mtime); + ceph_osdc_build_request(osd_req, ofs, len, ops, snapc, snapid, &mtime); if (linger_req) { ceph_osdc_set_request_linger(osdc, osd_req); diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index 4bfb4582439a..0e82a0a967ef 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -209,7 +209,6 @@ extern void ceph_osdc_handle_map(struct ceph_osd_client *osdc, extern int ceph_calc_raw_layout(struct ceph_osd_client *osdc, struct ceph_file_layout *layout, - u64 snapid, u64 off, u64 *plen, u64 *bno, struct ceph_osd_request *req, struct ceph_osd_req_op *op); @@ -227,6 +226,7 @@ extern void ceph_osdc_build_request(struct ceph_osd_request *req, u64 off, u64 len, struct ceph_osd_req_op *src_ops, struct ceph_snap_context *snapc, + u64 snap_id, struct timespec *mtime); extern struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *, diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 47e5f5b1f94c..b5a4b2875e8a 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -40,18 +40,14 @@ static int op_has_extent(int op) int ceph_calc_raw_layout(struct ceph_osd_client *osdc, struct ceph_file_layout *layout, - u64 snapid, u64 off, u64 *plen, u64 *bno, struct ceph_osd_request *req, struct ceph_osd_req_op *op) { - struct ceph_osd_request_head *reqhead = req->r_request->front.iov_base; u64 orig_len = *plen; u64 objoff, objlen; /* extent in object */ int r; - reqhead->snapid = cpu_to_le64(snapid); - /* object extent? */ r = ceph_calc_file_object_mapping(layout, off, orig_len, bno, &objoff, &objlen); @@ -121,8 +117,7 @@ static int calc_layout(struct ceph_osd_client *osdc, u64 bno; int r; - r = ceph_calc_raw_layout(osdc, layout, vino.snap, off, - plen, &bno, req, op); + r = ceph_calc_raw_layout(osdc, layout, off, plen, &bno, req, op); if (r < 0) return r; @@ -340,7 +335,7 @@ static void osd_req_encode_op(struct ceph_osd_request *req, void ceph_osdc_build_request(struct ceph_osd_request *req, u64 off, u64 len, struct ceph_osd_req_op *src_ops, - struct ceph_snap_context *snapc, + struct ceph_snap_context *snapc, u64 snap_id, struct timespec *mtime) { struct ceph_msg *msg = req->r_request; @@ -355,6 +350,7 @@ void ceph_osdc_build_request(struct ceph_osd_request *req, int i; head = msg->front.iov_base; + head->snapid = cpu_to_le64(snap_id); op = (void *)(head + 1); p = (void *)(op + num_op); @@ -466,9 +462,7 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc, req->r_num_pages = calc_pages_for(page_align, *plen); req->r_page_alignment = page_align; - ceph_osdc_build_request(req, off, *plen, ops, - snapc, - mtime); + ceph_osdc_build_request(req, off, *plen, ops, snapc, vino.snap, mtime); return req; } -- cgit v1.2.3 From e75b45cf36565fd8ba206a9d80f670a86e61ba2f Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Tue, 13 Nov 2012 21:11:14 -0600 Subject: libceph: drop osdc from ceph_calc_raw_layout() The osdc parameter to ceph_calc_raw_layout() is not used, so get rid of it. Consequently, the corresponding parameter in calc_layout() becomes unused, so get rid of that as well. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 2 +- include/linux/ceph/osd_client.h | 3 +-- net/ceph/osd_client.c | 10 ++++------ 3 files changed, 6 insertions(+), 9 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index fa371868e9b0..ac8fd3856509 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -1168,7 +1168,7 @@ static int rbd_do_request(struct request *rq, osd_req->r_oid_len = strlen(osd_req->r_oid); rbd_layout_init(&osd_req->r_file_layout, rbd_dev->spec->pool_id); - ret = ceph_calc_raw_layout(osdc, &osd_req->r_file_layout, + ret = ceph_calc_raw_layout(&osd_req->r_file_layout, ofs, &len, &bno, osd_req, ops); rbd_assert(ret == 0); diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index 0e82a0a967ef..fe3a6e8db1f9 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -207,8 +207,7 @@ extern void ceph_osdc_handle_reply(struct ceph_osd_client *osdc, extern void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg); -extern int ceph_calc_raw_layout(struct ceph_osd_client *osdc, - struct ceph_file_layout *layout, +extern int ceph_calc_raw_layout(struct ceph_file_layout *layout, u64 off, u64 *plen, u64 *bno, struct ceph_osd_request *req, struct ceph_osd_req_op *op); diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index b5a4b2875e8a..cd9c28387de3 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -38,8 +38,7 @@ static int op_has_extent(int op) op == CEPH_OSD_OP_WRITE); } -int ceph_calc_raw_layout(struct ceph_osd_client *osdc, - struct ceph_file_layout *layout, +int ceph_calc_raw_layout(struct ceph_file_layout *layout, u64 off, u64 *plen, u64 *bno, struct ceph_osd_request *req, struct ceph_osd_req_op *op) @@ -107,8 +106,7 @@ EXPORT_SYMBOL(ceph_calc_raw_layout); * * fill osd op in request message. */ -static int calc_layout(struct ceph_osd_client *osdc, - struct ceph_vino vino, +static int calc_layout(struct ceph_vino vino, struct ceph_file_layout *layout, u64 off, u64 *plen, struct ceph_osd_request *req, @@ -117,7 +115,7 @@ static int calc_layout(struct ceph_osd_client *osdc, u64 bno; int r; - r = ceph_calc_raw_layout(osdc, layout, off, plen, &bno, req, op); + r = ceph_calc_raw_layout(layout, off, plen, &bno, req, op); if (r < 0) return r; @@ -452,7 +450,7 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc, return ERR_PTR(-ENOMEM); /* calculate max write size */ - r = calc_layout(osdc, vino, layout, off, plen, req, ops); + r = calc_layout(vino, layout, off, plen, req, ops); if (r < 0) return ERR_PTR(r); req->r_file_layout = *layout; /* keep a copy */ -- cgit v1.2.3 From d178a9e74006e80f568d87e29f2a68f14fc7cbb1 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Tue, 13 Nov 2012 21:11:15 -0600 Subject: libceph: don't set flags in ceph_osdc_alloc_request() The only thing ceph_osdc_alloc_request() really does with the flags value it is passed is assign it to the newly-created osd request structure. Do that in the caller instead. Both callers subsequently call ceph_osdc_build_request(), so have that function (instead of ceph_osdc_alloc_request()) issue a warning if a request comes through with neither the read nor write flags set. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 3 ++- include/linux/ceph/osd_client.h | 1 - net/ceph/osd_client.c | 11 ++++------- 3 files changed, 6 insertions(+), 9 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index ac8fd3856509..bdbaa4cfd9d3 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -1148,13 +1148,14 @@ static int rbd_do_request(struct request *rq, (unsigned long long) len, coll, coll_index); osdc = &rbd_dev->rbd_client->client->osdc; - osd_req = ceph_osdc_alloc_request(osdc, flags, snapc, ops, + osd_req = ceph_osdc_alloc_request(osdc, snapc, ops, false, GFP_NOIO, pages, bio); if (!osd_req) { ret = -ENOMEM; goto done_pages; } + osd_req->r_flags = flags; osd_req->r_callback = rbd_cb; rbd_req->rq = rq; diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index fe3a6e8db1f9..6ddda5bbd1a6 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -213,7 +213,6 @@ extern int ceph_calc_raw_layout(struct ceph_file_layout *layout, struct ceph_osd_req_op *op); extern struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc, - int flags, struct ceph_snap_context *snapc, struct ceph_osd_req_op *ops, bool use_mempool, diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index cd9c28387de3..77ce1edaa07d 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -171,7 +171,6 @@ static int get_num_ops(struct ceph_osd_req_op *ops) } struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc, - int flags, struct ceph_snap_context *snapc, struct ceph_osd_req_op *ops, bool use_mempool, @@ -208,10 +207,6 @@ struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc, INIT_LIST_HEAD(&req->r_req_lru_item); INIT_LIST_HEAD(&req->r_osd_item); - req->r_flags = flags; - - WARN_ON((flags & (CEPH_OSD_FLAG_READ|CEPH_OSD_FLAG_WRITE)) == 0); - /* create reply message */ if (use_mempool) msg = ceph_msgpool_get(&osdc->msgpool_op_reply, 0); @@ -347,6 +342,8 @@ void ceph_osdc_build_request(struct ceph_osd_request *req, u64 data_len = 0; int i; + WARN_ON((flags & (CEPH_OSD_FLAG_READ|CEPH_OSD_FLAG_WRITE)) == 0); + head = msg->front.iov_base; head->snapid = cpu_to_le64(snap_id); op = (void *)(head + 1); @@ -442,12 +439,12 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc, } else ops[1].op = 0; - req = ceph_osdc_alloc_request(osdc, flags, - snapc, ops, + req = ceph_osdc_alloc_request(osdc, snapc, ops, use_mempool, GFP_NOFS, NULL, NULL); if (!req) return ERR_PTR(-ENOMEM); + req->r_flags = flags; /* calculate max write size */ r = calc_layout(vino, layout, off, plen, req, ops); -- cgit v1.2.3 From 54a5400721da7fa5a16cea151aade5bdfee74111 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Tue, 13 Nov 2012 21:11:15 -0600 Subject: libceph: don't set pages or bio in ceph_osdc_alloc_request() Only one of the two callers of ceph_osdc_alloc_request() provides page or bio data for its payload. And essentially all that function was doing with those arguments was assigning them to fields in the osd request structure. Simplify ceph_osdc_alloc_request() by having the caller take care of making those assignments Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 8 ++++++-- include/linux/ceph/osd_client.h | 4 +--- net/ceph/osd_client.c | 15 ++------------- 3 files changed, 9 insertions(+), 18 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index bdbaa4cfd9d3..d1445df6398a 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -1148,14 +1148,18 @@ static int rbd_do_request(struct request *rq, (unsigned long long) len, coll, coll_index); osdc = &rbd_dev->rbd_client->client->osdc; - osd_req = ceph_osdc_alloc_request(osdc, snapc, ops, - false, GFP_NOIO, pages, bio); + osd_req = ceph_osdc_alloc_request(osdc, snapc, ops, false, GFP_NOIO); if (!osd_req) { ret = -ENOMEM; goto done_pages; } osd_req->r_flags = flags; + osd_req->r_pages = pages; + if (bio) { + osd_req->r_bio = bio; + bio_get(osd_req->r_bio); + } osd_req->r_callback = rbd_cb; rbd_req->rq = rq; diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index 6ddda5bbd1a6..75f56d372d44 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -216,9 +216,7 @@ extern struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client * struct ceph_snap_context *snapc, struct ceph_osd_req_op *ops, bool use_mempool, - gfp_t gfp_flags, - struct page **pages, - struct bio *bio); + gfp_t gfp_flags); extern void ceph_osdc_build_request(struct ceph_osd_request *req, u64 off, u64 len, diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 77ce1edaa07d..bdc3bb12bfd5 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -174,9 +174,7 @@ struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc, struct ceph_snap_context *snapc, struct ceph_osd_req_op *ops, bool use_mempool, - gfp_t gfp_flags, - struct page **pages, - struct bio *bio) + gfp_t gfp_flags) { struct ceph_osd_request *req; struct ceph_msg *msg; @@ -237,13 +235,6 @@ struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc, memset(msg->front.iov_base, 0, msg->front.iov_len); req->r_request = msg; - req->r_pages = pages; -#ifdef CONFIG_BLOCK - if (bio) { - req->r_bio = bio; - bio_get(req->r_bio); - } -#endif return req; } @@ -439,9 +430,7 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc, } else ops[1].op = 0; - req = ceph_osdc_alloc_request(osdc, snapc, ops, - use_mempool, - GFP_NOFS, NULL, NULL); + req = ceph_osdc_alloc_request(osdc, snapc, ops, use_mempool, GFP_NOFS); if (!req) return ERR_PTR(-ENOMEM); req->r_flags = flags; -- cgit v1.2.3 From d07c09589f533db9ab500ac38151bc9f3a4d0648 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Tue, 13 Nov 2012 21:11:15 -0600 Subject: rbd: pass num_op with ops array Add a num_op parameter to rbd_do_request() and rbd_req_sync_op() to indicate the number of entries in the array. The callers of these functions always know how many entries are in the array, so just pass that information down. This is in anticipation of eliminating the extra zero-filled entry in these ops arrays. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index d1445df6398a..b6872d3cb04c 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -1119,6 +1119,7 @@ static int rbd_do_request(struct request *rq, struct page **pages, int num_pages, int flags, + unsigned int num_op, struct ceph_osd_req_op *ops, struct rbd_req_coll *coll, int coll_index, @@ -1259,6 +1260,7 @@ static void rbd_simple_req_cb(struct ceph_osd_request *osd_req, */ static int rbd_req_sync_op(struct rbd_device *rbd_dev, int flags, + unsigned int num_op, struct ceph_osd_req_op *ops, const char *object_name, u64 ofs, u64 inbound_size, @@ -1281,7 +1283,7 @@ static int rbd_req_sync_op(struct rbd_device *rbd_dev, object_name, ofs, inbound_size, NULL, pages, num_pages, flags, - ops, + num_op, ops, NULL, 0, NULL, linger_req, ver); @@ -1351,7 +1353,7 @@ static int rbd_do_op(struct request *rq, bio, NULL, 0, flags, - ops, + 1, ops, coll, coll_index, rbd_req_cb, 0, NULL); if (ret < 0) @@ -1380,7 +1382,7 @@ static int rbd_req_sync_read(struct rbd_device *rbd_dev, return -ENOMEM; ret = rbd_req_sync_op(rbd_dev, CEPH_OSD_FLAG_READ, - ops, object_name, ofs, len, buf, NULL, ver); + 1, ops, object_name, ofs, len, buf, NULL, ver); rbd_destroy_ops(ops); return ret; @@ -1408,7 +1410,7 @@ static int rbd_req_sync_notify_ack(struct rbd_device *rbd_dev, rbd_dev->header_name, 0, 0, NULL, NULL, 0, CEPH_OSD_FLAG_READ, - ops, + 1, ops, NULL, 0, rbd_simple_req_cb, 0, NULL); @@ -1460,7 +1462,7 @@ static int rbd_req_sync_watch(struct rbd_device *rbd_dev) ret = rbd_req_sync_op(rbd_dev, CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK, - ops, + 1, ops, rbd_dev->header_name, 0, 0, NULL, &rbd_dev->watch_request, NULL); @@ -1497,7 +1499,7 @@ static int rbd_req_sync_unwatch(struct rbd_device *rbd_dev) ret = rbd_req_sync_op(rbd_dev, CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK, - ops, + 1, ops, rbd_dev->header_name, 0, 0, NULL, NULL, NULL); @@ -1548,7 +1550,7 @@ static int rbd_req_sync_exec(struct rbd_device *rbd_dev, ops[0].cls.indata = outbound; ops[0].cls.indata_len = outbound_size; - ret = rbd_req_sync_op(rbd_dev, CEPH_OSD_FLAG_READ, ops, + ret = rbd_req_sync_op(rbd_dev, CEPH_OSD_FLAG_READ, 1, ops, object_name, 0, inbound_size, inbound, NULL, ver); -- cgit v1.2.3 From ae7ca4a35b1f5df86e2c32b2cfc01a8d528c7b8c Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Tue, 13 Nov 2012 21:11:15 -0600 Subject: libceph: pass num_op with ops Both ceph_osdc_alloc_request() and ceph_osdc_build_request() are provided an array of ceph osd request operations. Rather than just passing the number of operations in the array, the caller is required append an additional zeroed operation structure to signal the end of the array. All callers know the number of operations at the time these functions are called, so drop the silly zero entry and supply that number directly. As a result, get_num_ops() is no longer needed. This also means that ceph_osdc_alloc_request() never uses its ops argument, so that can be dropped. Also rbd_create_rw_ops() no longer needs to add one to reserve room for the additional op. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 9 +++++---- include/linux/ceph/osd_client.h | 3 ++- net/ceph/osd_client.c | 43 ++++++++++++++--------------------------- 3 files changed, 22 insertions(+), 33 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index b6872d3cb04c..88de8ccb29bd 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -1026,12 +1026,12 @@ out_err: /* * helpers for osd request op vectors. */ -static struct ceph_osd_req_op *rbd_create_rw_ops(int num_ops, +static struct ceph_osd_req_op *rbd_create_rw_ops(int num_op, int opcode, u32 payload_len) { struct ceph_osd_req_op *ops; - ops = kzalloc(sizeof (*ops) * (num_ops + 1), GFP_NOIO); + ops = kzalloc(num_op * sizeof (*ops), GFP_NOIO); if (!ops) return NULL; @@ -1149,7 +1149,7 @@ static int rbd_do_request(struct request *rq, (unsigned long long) len, coll, coll_index); osdc = &rbd_dev->rbd_client->client->osdc; - osd_req = ceph_osdc_alloc_request(osdc, snapc, ops, false, GFP_NOIO); + osd_req = ceph_osdc_alloc_request(osdc, snapc, num_op, false, GFP_NOIO); if (!osd_req) { ret = -ENOMEM; goto done_pages; @@ -1178,7 +1178,8 @@ static int rbd_do_request(struct request *rq, ofs, &len, &bno, osd_req, ops); rbd_assert(ret == 0); - ceph_osdc_build_request(osd_req, ofs, len, ops, snapc, snapid, &mtime); + ceph_osdc_build_request(osd_req, ofs, len, num_op, ops, + snapc, snapid, &mtime); if (linger_req) { ceph_osdc_set_request_linger(osdc, osd_req); diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index 75f56d372d44..2b04d054e09d 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -214,12 +214,13 @@ extern int ceph_calc_raw_layout(struct ceph_file_layout *layout, extern struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc, struct ceph_snap_context *snapc, - struct ceph_osd_req_op *ops, + unsigned int num_op, bool use_mempool, gfp_t gfp_flags); extern void ceph_osdc_build_request(struct ceph_osd_request *req, u64 off, u64 len, + unsigned int num_op, struct ceph_osd_req_op *src_ops, struct ceph_snap_context *snapc, u64 snap_id, diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index bdc3bb12bfd5..500ae8b49321 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -160,25 +160,14 @@ void ceph_osdc_release_request(struct kref *kref) } EXPORT_SYMBOL(ceph_osdc_release_request); -static int get_num_ops(struct ceph_osd_req_op *ops) -{ - int i = 0; - - while (ops[i].op) - i++; - - return i; -} - struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc, struct ceph_snap_context *snapc, - struct ceph_osd_req_op *ops, + unsigned int num_op, bool use_mempool, gfp_t gfp_flags) { struct ceph_osd_request *req; struct ceph_msg *msg; - int num_op = get_num_ops(ops); size_t msg_size = sizeof(struct ceph_osd_request_head); msg_size += num_op*sizeof(struct ceph_osd_op); @@ -317,7 +306,7 @@ static void osd_req_encode_op(struct ceph_osd_request *req, * */ void ceph_osdc_build_request(struct ceph_osd_request *req, - u64 off, u64 len, + u64 off, u64 len, unsigned int num_op, struct ceph_osd_req_op *src_ops, struct ceph_snap_context *snapc, u64 snap_id, struct timespec *mtime) @@ -327,7 +316,6 @@ void ceph_osdc_build_request(struct ceph_osd_request *req, struct ceph_osd_req_op *src_op; struct ceph_osd_op *op; void *p; - int num_op = get_num_ops(src_ops); size_t msg_size = sizeof(*head) + num_op*sizeof(*op); int flags = req->r_flags; u64 data_len = 0; @@ -346,20 +334,17 @@ void ceph_osdc_build_request(struct ceph_osd_request *req, head->flags = cpu_to_le32(flags); if (flags & CEPH_OSD_FLAG_WRITE) ceph_encode_timespec(&head->mtime, mtime); + BUG_ON(num_op > (unsigned int) ((u16) -1)); head->num_ops = cpu_to_le16(num_op); - /* fill in oid */ head->object_len = cpu_to_le32(req->r_oid_len); memcpy(p, req->r_oid, req->r_oid_len); p += req->r_oid_len; src_op = src_ops; - while (src_op->op) { - osd_req_encode_op(req, op, src_op); - src_op++; - op++; - } + while (num_op--) + osd_req_encode_op(req, op++, src_op++); data_len += req->r_trail.length; @@ -414,23 +399,24 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc, bool use_mempool, int num_reply, int page_align) { - struct ceph_osd_req_op ops[3]; + struct ceph_osd_req_op ops[2]; struct ceph_osd_request *req; + unsigned int num_op = 1; int r; + memset(&ops, 0, sizeof ops); + ops[0].op = opcode; ops[0].extent.truncate_seq = truncate_seq; ops[0].extent.truncate_size = truncate_size; - ops[0].payload_len = 0; if (do_sync) { ops[1].op = CEPH_OSD_OP_STARTSYNC; - ops[1].payload_len = 0; - ops[2].op = 0; - } else - ops[1].op = 0; + num_op++; + } - req = ceph_osdc_alloc_request(osdc, snapc, ops, use_mempool, GFP_NOFS); + req = ceph_osdc_alloc_request(osdc, snapc, num_op, use_mempool, + GFP_NOFS); if (!req) return ERR_PTR(-ENOMEM); req->r_flags = flags; @@ -446,7 +432,8 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc, req->r_num_pages = calc_pages_for(page_align, *plen); req->r_page_alignment = page_align; - ceph_osdc_build_request(req, off, *plen, ops, snapc, vino.snap, mtime); + ceph_osdc_build_request(req, off, *plen, num_op, ops, + snapc, vino.snap, mtime); return req; } -- cgit v1.2.3 From 139b4318ad93ae4370d88882ff89b42dcbfaaab1 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Tue, 13 Nov 2012 21:11:15 -0600 Subject: rbd: there is really only one op Throughout the rbd code there are spots where it appears we can handle an osd request containing more than one osd request op. But that is only the way it appears. In fact, currently only one operation at a time can be supported, and supporting more than one will require much more than fleshing out the support that's there now. This patch changes names to make it perfectly clear that anywhere we're dealing with a block of ops, we're in fact dealing with exactly one of them. We'll be able to simplify some things as a result. When multiple op support is implemented, we can update things again accordingly. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 118 +++++++++++++++++++++++++--------------------------- 1 file changed, 56 insertions(+), 62 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 88de8ccb29bd..cc8924d8f263 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -1023,32 +1023,26 @@ out_err: return NULL; } -/* - * helpers for osd request op vectors. - */ -static struct ceph_osd_req_op *rbd_create_rw_ops(int num_op, - int opcode, u32 payload_len) +static struct ceph_osd_req_op *rbd_create_rw_op(int opcode, u32 payload_len) { - struct ceph_osd_req_op *ops; + struct ceph_osd_req_op *op; - ops = kzalloc(num_op * sizeof (*ops), GFP_NOIO); - if (!ops) + op = kzalloc(sizeof (*op), GFP_NOIO); + if (!op) return NULL; - - ops[0].op = opcode; - /* * op extent offset and length will be set later on * in calc_raw_layout() */ - ops[0].payload_len = payload_len; + op->op = opcode; + op->payload_len = payload_len; - return ops; + return op; } -static void rbd_destroy_ops(struct ceph_osd_req_op *ops) +static void rbd_destroy_op(struct ceph_osd_req_op *op) { - kfree(ops); + kfree(op); } static void rbd_coll_end_req_index(struct request *rq, @@ -1314,7 +1308,7 @@ static int rbd_do_op(struct request *rq, u64 seg_ofs; u64 seg_len; int ret; - struct ceph_osd_req_op *ops; + struct ceph_osd_req_op *op; u32 payload_len; int opcode; int flags; @@ -1340,8 +1334,8 @@ static int rbd_do_op(struct request *rq, } ret = -ENOMEM; - ops = rbd_create_rw_ops(1, opcode, payload_len); - if (!ops) + op = rbd_create_rw_op(opcode, payload_len); + if (!op) goto done; /* we've taken care of segment sizes earlier when we @@ -1354,13 +1348,13 @@ static int rbd_do_op(struct request *rq, bio, NULL, 0, flags, - 1, ops, + 1, op, coll, coll_index, rbd_req_cb, 0, NULL); if (ret < 0) rbd_coll_end_req_index(rq, coll, coll_index, (s32)ret, seg_len); - rbd_destroy_ops(ops); + rbd_destroy_op(op); done: kfree(seg_name); return ret; @@ -1375,16 +1369,16 @@ static int rbd_req_sync_read(struct rbd_device *rbd_dev, char *buf, u64 *ver) { - struct ceph_osd_req_op *ops; + struct ceph_osd_req_op *op; int ret; - ops = rbd_create_rw_ops(1, CEPH_OSD_OP_READ, 0); - if (!ops) + op = rbd_create_rw_op(CEPH_OSD_OP_READ, 0); + if (!op) return -ENOMEM; ret = rbd_req_sync_op(rbd_dev, CEPH_OSD_FLAG_READ, - 1, ops, object_name, ofs, len, buf, NULL, ver); - rbd_destroy_ops(ops); + 1, op, object_name, ofs, len, buf, NULL, ver); + rbd_destroy_op(op); return ret; } @@ -1396,26 +1390,26 @@ static int rbd_req_sync_notify_ack(struct rbd_device *rbd_dev, u64 ver, u64 notify_id) { - struct ceph_osd_req_op *ops; + struct ceph_osd_req_op *op; int ret; - ops = rbd_create_rw_ops(1, CEPH_OSD_OP_NOTIFY_ACK, 0); - if (!ops) + op = rbd_create_rw_op(CEPH_OSD_OP_NOTIFY_ACK, 0); + if (!op) return -ENOMEM; - ops[0].watch.ver = cpu_to_le64(ver); - ops[0].watch.cookie = notify_id; - ops[0].watch.flag = 0; + op->watch.ver = cpu_to_le64(ver); + op->watch.cookie = notify_id; + op->watch.flag = 0; ret = rbd_do_request(NULL, rbd_dev, NULL, CEPH_NOSNAP, rbd_dev->header_name, 0, 0, NULL, NULL, 0, CEPH_OSD_FLAG_READ, - 1, ops, + 1, op, NULL, 0, rbd_simple_req_cb, 0, NULL); - rbd_destroy_ops(ops); + rbd_destroy_op(op); return ret; } @@ -1444,12 +1438,12 @@ static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data) */ static int rbd_req_sync_watch(struct rbd_device *rbd_dev) { - struct ceph_osd_req_op *ops; + struct ceph_osd_req_op *op; struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; int ret; - ops = rbd_create_rw_ops(1, CEPH_OSD_OP_WATCH, 0); - if (!ops) + op = rbd_create_rw_op(CEPH_OSD_OP_WATCH, 0); + if (!op) return -ENOMEM; ret = ceph_osdc_create_event(osdc, rbd_watch_cb, 0, @@ -1457,13 +1451,13 @@ static int rbd_req_sync_watch(struct rbd_device *rbd_dev) if (ret < 0) goto fail; - ops[0].watch.ver = cpu_to_le64(rbd_dev->header.obj_version); - ops[0].watch.cookie = cpu_to_le64(rbd_dev->watch_event->cookie); - ops[0].watch.flag = 1; + op->watch.ver = cpu_to_le64(rbd_dev->header.obj_version); + op->watch.cookie = cpu_to_le64(rbd_dev->watch_event->cookie); + op->watch.flag = 1; ret = rbd_req_sync_op(rbd_dev, CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK, - 1, ops, + 1, op, rbd_dev->header_name, 0, 0, NULL, &rbd_dev->watch_request, NULL); @@ -1471,14 +1465,14 @@ static int rbd_req_sync_watch(struct rbd_device *rbd_dev) if (ret < 0) goto fail_event; - rbd_destroy_ops(ops); + rbd_destroy_op(op); return 0; fail_event: ceph_osdc_cancel_event(rbd_dev->watch_event); rbd_dev->watch_event = NULL; fail: - rbd_destroy_ops(ops); + rbd_destroy_op(op); return ret; } @@ -1487,25 +1481,25 @@ fail: */ static int rbd_req_sync_unwatch(struct rbd_device *rbd_dev) { - struct ceph_osd_req_op *ops; + struct ceph_osd_req_op *op; int ret; - ops = rbd_create_rw_ops(1, CEPH_OSD_OP_WATCH, 0); - if (!ops) + op = rbd_create_rw_op(CEPH_OSD_OP_WATCH, 0); + if (!op) return -ENOMEM; - ops[0].watch.ver = 0; - ops[0].watch.cookie = cpu_to_le64(rbd_dev->watch_event->cookie); - ops[0].watch.flag = 0; + op->watch.ver = 0; + op->watch.cookie = cpu_to_le64(rbd_dev->watch_event->cookie); + op->watch.flag = 0; ret = rbd_req_sync_op(rbd_dev, CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK, - 1, ops, + 1, op, rbd_dev->header_name, 0, 0, NULL, NULL, NULL); - rbd_destroy_ops(ops); + rbd_destroy_op(op); ceph_osdc_cancel_event(rbd_dev->watch_event); rbd_dev->watch_event = NULL; return ret; @@ -1524,7 +1518,7 @@ static int rbd_req_sync_exec(struct rbd_device *rbd_dev, size_t inbound_size, u64 *ver) { - struct ceph_osd_req_op *ops; + struct ceph_osd_req_op *op; int class_name_len = strlen(class_name); int method_name_len = strlen(method_name); int payload_size; @@ -1539,23 +1533,23 @@ static int rbd_req_sync_exec(struct rbd_device *rbd_dev, * operation. */ payload_size = class_name_len + method_name_len + outbound_size; - ops = rbd_create_rw_ops(1, CEPH_OSD_OP_CALL, payload_size); - if (!ops) + op = rbd_create_rw_op(CEPH_OSD_OP_CALL, payload_size); + if (!op) return -ENOMEM; - ops[0].cls.class_name = class_name; - ops[0].cls.class_len = (__u8) class_name_len; - ops[0].cls.method_name = method_name; - ops[0].cls.method_len = (__u8) method_name_len; - ops[0].cls.argc = 0; - ops[0].cls.indata = outbound; - ops[0].cls.indata_len = outbound_size; + op->cls.class_name = class_name; + op->cls.class_len = (__u8) class_name_len; + op->cls.method_name = method_name; + op->cls.method_len = (__u8) method_name_len; + op->cls.argc = 0; + op->cls.indata = outbound; + op->cls.indata_len = outbound_size; - ret = rbd_req_sync_op(rbd_dev, CEPH_OSD_FLAG_READ, 1, ops, + ret = rbd_req_sync_op(rbd_dev, CEPH_OSD_FLAG_READ, 1, op, object_name, 0, inbound_size, inbound, NULL, ver); - rbd_destroy_ops(ops); + rbd_destroy_op(op); dout("cls_exec returned %d\n", ret); return ret; -- cgit v1.2.3 From 30573d680355ca0de4db2113b9080cd078ac726f Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Tue, 13 Nov 2012 21:11:15 -0600 Subject: rbd: assume single op in a request We now know that every of rbd_req_sync_op() passes an array of exactly one operation, as evidenced by all callers passing 1 as its num_op argument. So get rid of that argument, assuming a single op. Similarly, we now know that all callers of rbd_do_request() pass 1 as the num_op value, so that parameter can be eliminated as well. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 28 +++++++++++++--------------- 1 file changed, 13 insertions(+), 15 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index cc8924d8f263..7ce178fd6fe5 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -1113,8 +1113,7 @@ static int rbd_do_request(struct request *rq, struct page **pages, int num_pages, int flags, - unsigned int num_op, - struct ceph_osd_req_op *ops, + struct ceph_osd_req_op *op, struct rbd_req_coll *coll, int coll_index, void (*rbd_cb)(struct ceph_osd_request *, @@ -1143,7 +1142,7 @@ static int rbd_do_request(struct request *rq, (unsigned long long) len, coll, coll_index); osdc = &rbd_dev->rbd_client->client->osdc; - osd_req = ceph_osdc_alloc_request(osdc, snapc, num_op, false, GFP_NOIO); + osd_req = ceph_osdc_alloc_request(osdc, snapc, 1, false, GFP_NOIO); if (!osd_req) { ret = -ENOMEM; goto done_pages; @@ -1169,10 +1168,10 @@ static int rbd_do_request(struct request *rq, rbd_layout_init(&osd_req->r_file_layout, rbd_dev->spec->pool_id); ret = ceph_calc_raw_layout(&osd_req->r_file_layout, - ofs, &len, &bno, osd_req, ops); + ofs, &len, &bno, osd_req, op); rbd_assert(ret == 0); - ceph_osdc_build_request(osd_req, ofs, len, num_op, ops, + ceph_osdc_build_request(osd_req, ofs, len, 1, op, snapc, snapid, &mtime); if (linger_req) { @@ -1255,8 +1254,7 @@ static void rbd_simple_req_cb(struct ceph_osd_request *osd_req, */ static int rbd_req_sync_op(struct rbd_device *rbd_dev, int flags, - unsigned int num_op, - struct ceph_osd_req_op *ops, + struct ceph_osd_req_op *op, const char *object_name, u64 ofs, u64 inbound_size, char *inbound, @@ -1267,7 +1265,7 @@ static int rbd_req_sync_op(struct rbd_device *rbd_dev, struct page **pages; int num_pages; - rbd_assert(ops != NULL); + rbd_assert(op != NULL); num_pages = calc_pages_for(ofs, inbound_size); pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL); @@ -1278,7 +1276,7 @@ static int rbd_req_sync_op(struct rbd_device *rbd_dev, object_name, ofs, inbound_size, NULL, pages, num_pages, flags, - num_op, ops, + op, NULL, 0, NULL, linger_req, ver); @@ -1348,7 +1346,7 @@ static int rbd_do_op(struct request *rq, bio, NULL, 0, flags, - 1, op, + op, coll, coll_index, rbd_req_cb, 0, NULL); if (ret < 0) @@ -1377,7 +1375,7 @@ static int rbd_req_sync_read(struct rbd_device *rbd_dev, return -ENOMEM; ret = rbd_req_sync_op(rbd_dev, CEPH_OSD_FLAG_READ, - 1, op, object_name, ofs, len, buf, NULL, ver); + op, object_name, ofs, len, buf, NULL, ver); rbd_destroy_op(op); return ret; @@ -1405,7 +1403,7 @@ static int rbd_req_sync_notify_ack(struct rbd_device *rbd_dev, rbd_dev->header_name, 0, 0, NULL, NULL, 0, CEPH_OSD_FLAG_READ, - 1, op, + op, NULL, 0, rbd_simple_req_cb, 0, NULL); @@ -1457,7 +1455,7 @@ static int rbd_req_sync_watch(struct rbd_device *rbd_dev) ret = rbd_req_sync_op(rbd_dev, CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK, - 1, op, + op, rbd_dev->header_name, 0, 0, NULL, &rbd_dev->watch_request, NULL); @@ -1494,7 +1492,7 @@ static int rbd_req_sync_unwatch(struct rbd_device *rbd_dev) ret = rbd_req_sync_op(rbd_dev, CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK, - 1, op, + op, rbd_dev->header_name, 0, 0, NULL, NULL, NULL); @@ -1545,7 +1543,7 @@ static int rbd_req_sync_exec(struct rbd_device *rbd_dev, op->cls.indata = outbound; op->cls.indata_len = outbound_size; - ret = rbd_req_sync_op(rbd_dev, CEPH_OSD_FLAG_READ, 1, op, + ret = rbd_req_sync_op(rbd_dev, CEPH_OSD_FLAG_READ, op, object_name, 0, inbound_size, inbound, NULL, ver); -- cgit v1.2.3 From 0829661863fb5c8031c1c5c119693ea157517783 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Wed, 14 Nov 2012 12:25:18 -0600 Subject: rbd: pull in ceph_calc_raw_layout() This is the first in a series of patches aimed at eliminating the use of ceph_calc_raw_layout() by rbd. It simply pulls in a copy of that function and renames it rbd_calc_raw_layout(). Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 36 +++++++++++++++++++++++++++++++++++- 1 file changed, 35 insertions(+), 1 deletion(-) (limited to 'drivers/block') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 7ce178fd6fe5..eee0d3649dcf 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -1101,6 +1101,40 @@ static void rbd_layout_init(struct ceph_file_layout *layout, u64 pool_id) layout->fl_pg_pool = cpu_to_le32((u32) pool_id); } +int rbd_calc_raw_layout(struct ceph_file_layout *layout, + u64 off, u64 *plen, u64 *bno, + struct ceph_osd_request *req, + struct ceph_osd_req_op *op) +{ + u64 orig_len = *plen; + u64 objoff, objlen; /* extent in object */ + int r; + + /* object extent? */ + r = ceph_calc_file_object_mapping(layout, off, orig_len, bno, + &objoff, &objlen); + if (r < 0) + return r; + if (objlen < orig_len) { + *plen = objlen; + dout(" skipping last %llu, final file extent %llu~%llu\n", + orig_len - *plen, off, *plen); + } + + if (op->op == CEPH_OSD_OP_READ || op->op == CEPH_OSD_OP_WRITE) { + op->extent.offset = objoff; + op->extent.length = objlen; + } + req->r_num_pages = calc_pages_for(off, *plen); + req->r_page_alignment = off & ~PAGE_MASK; + if (op->op == CEPH_OSD_OP_WRITE) + op->payload_len = *plen; + + dout("calc_layout bno=%llx %llu~%llu (%d pages)\n", + *bno, objoff, objlen, req->r_num_pages); + return 0; +} + /* * Send ceph osd request */ @@ -1167,7 +1201,7 @@ static int rbd_do_request(struct request *rq, osd_req->r_oid_len = strlen(osd_req->r_oid); rbd_layout_init(&osd_req->r_file_layout, rbd_dev->spec->pool_id); - ret = ceph_calc_raw_layout(&osd_req->r_file_layout, + ret = rbd_calc_raw_layout(&osd_req->r_file_layout, ofs, &len, &bno, osd_req, op); rbd_assert(ret == 0); -- cgit v1.2.3 From e01e79273b251dbb35ff2522a688229b09481923 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Wed, 14 Nov 2012 12:25:18 -0600 Subject: rbd: open code rbd_calc_raw_layout() This patch gets rid of rbd_calc_raw_layout() by simply open coding it in its one caller. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 55 ++++++++++++++++++----------------------------------- 1 file changed, 18 insertions(+), 37 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index eee0d3649dcf..f5f4e4a8fc87 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -1032,7 +1032,7 @@ static struct ceph_osd_req_op *rbd_create_rw_op(int opcode, u32 payload_len) return NULL; /* * op extent offset and length will be set later on - * in calc_raw_layout() + * after ceph_calc_file_object_mapping(). */ op->op = opcode; op->payload_len = payload_len; @@ -1101,40 +1101,6 @@ static void rbd_layout_init(struct ceph_file_layout *layout, u64 pool_id) layout->fl_pg_pool = cpu_to_le32((u32) pool_id); } -int rbd_calc_raw_layout(struct ceph_file_layout *layout, - u64 off, u64 *plen, u64 *bno, - struct ceph_osd_request *req, - struct ceph_osd_req_op *op) -{ - u64 orig_len = *plen; - u64 objoff, objlen; /* extent in object */ - int r; - - /* object extent? */ - r = ceph_calc_file_object_mapping(layout, off, orig_len, bno, - &objoff, &objlen); - if (r < 0) - return r; - if (objlen < orig_len) { - *plen = objlen; - dout(" skipping last %llu, final file extent %llu~%llu\n", - orig_len - *plen, off, *plen); - } - - if (op->op == CEPH_OSD_OP_READ || op->op == CEPH_OSD_OP_WRITE) { - op->extent.offset = objoff; - op->extent.length = objlen; - } - req->r_num_pages = calc_pages_for(off, *plen); - req->r_page_alignment = off & ~PAGE_MASK; - if (op->op == CEPH_OSD_OP_WRITE) - op->payload_len = *plen; - - dout("calc_layout bno=%llx %llu~%llu (%d pages)\n", - *bno, objoff, objlen, req->r_num_pages); - return 0; -} - /* * Send ceph osd request */ @@ -1158,6 +1124,8 @@ static int rbd_do_request(struct request *rq, struct ceph_osd_request *osd_req; int ret; u64 bno; + u64 obj_off = 0; + u64 obj_len = 0; struct timespec mtime = CURRENT_TIME; struct rbd_request *rbd_req; struct ceph_osd_client *osdc; @@ -1201,9 +1169,22 @@ static int rbd_do_request(struct request *rq, osd_req->r_oid_len = strlen(osd_req->r_oid); rbd_layout_init(&osd_req->r_file_layout, rbd_dev->spec->pool_id); - ret = rbd_calc_raw_layout(&osd_req->r_file_layout, - ofs, &len, &bno, osd_req, op); + ret = ceph_calc_file_object_mapping(&osd_req->r_file_layout, ofs, len, + &bno, &obj_off, &obj_len); rbd_assert(ret == 0); + if (obj_len < len) { + dout(" skipping last %llu, final file extent %llu~%llu\n", + len - obj_len, ofs, obj_len); + len = obj_len; + } + if (op->op == CEPH_OSD_OP_READ || op->op == CEPH_OSD_OP_WRITE) { + op->extent.offset = obj_off; + op->extent.length = obj_len; + if (op->op == CEPH_OSD_OP_WRITE) + op->payload_len = obj_len; + } + osd_req->r_num_pages = calc_pages_for(ofs, len); + osd_req->r_page_alignment = ofs & ~PAGE_MASK; ceph_osdc_build_request(osd_req, ofs, len, 1, op, snapc, snapid, &mtime); -- cgit v1.2.3 From 47dba7ba2623b088cbbe1ac0aaa1a034f3249b6d Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Wed, 14 Nov 2012 12:25:19 -0600 Subject: rbd: don't bother calculating file mapping When rbd_do_request() has a request to process it initializes a ceph file layout structure and uses it to compute offsets and limits for the range of the request using ceph_calc_file_object_mapping(). The layout used is fixed, and is based on RBD_MAX_OBJ_ORDER (30). It sets the layout's object size and stripe unit to be 1 GB (2^30), and sets the stripe count to be 1. The job of ceph_calc_file_object_mapping() is to determine which of a sequence of objects will contain data covered by range, and within that object, at what offset the range starts. It also truncates the length of the range at the end of the selected object if necessary. This is needed for ceph fs, but for rbd it really serves no purpose. It does its own blocking of images into objects, echo of which is (1 << obj_order) in size, and as a result it ignores the "bno" value returned by ceph_calc_file_object_mapping(). In addition, by the point a request has reached this function, it is already destined for a single rbd object, and its length will not exceed that object's extent. Because of this, and because the mapping will result in blocking up the range using an integer multiple of the image's object order, ceph_calc_file_object_mapping() will never change the offset or length values defined by the request. In other words, this call is a big no-op for rbd data requests. There is one exception. We read the header object using this function, and in that case we will not have already limited the request size. However, the header is a single object (not a file or rbd image), and should not be broken into pieces anyway. So in fact we should *not* be calling ceph_calc_file_object_mapping() when operating on the header object. So... Don't call ceph_calc_file_object_mapping() in rbd_do_request(), because useless for image data and incorrect to do sofor the image header. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 18 ++++-------------- 1 file changed, 4 insertions(+), 14 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index f5f4e4a8fc87..1c0192c3cf47 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -1123,9 +1123,6 @@ static int rbd_do_request(struct request *rq, { struct ceph_osd_request *osd_req; int ret; - u64 bno; - u64 obj_off = 0; - u64 obj_len = 0; struct timespec mtime = CURRENT_TIME; struct rbd_request *rbd_req; struct ceph_osd_client *osdc; @@ -1169,19 +1166,12 @@ static int rbd_do_request(struct request *rq, osd_req->r_oid_len = strlen(osd_req->r_oid); rbd_layout_init(&osd_req->r_file_layout, rbd_dev->spec->pool_id); - ret = ceph_calc_file_object_mapping(&osd_req->r_file_layout, ofs, len, - &bno, &obj_off, &obj_len); - rbd_assert(ret == 0); - if (obj_len < len) { - dout(" skipping last %llu, final file extent %llu~%llu\n", - len - obj_len, ofs, obj_len); - len = obj_len; - } + if (op->op == CEPH_OSD_OP_READ || op->op == CEPH_OSD_OP_WRITE) { - op->extent.offset = obj_off; - op->extent.length = obj_len; + op->extent.offset = ofs; + op->extent.length = len; if (op->op == CEPH_OSD_OP_WRITE) - op->payload_len = obj_len; + op->payload_len = len; } osd_req->r_num_pages = calc_pages_for(ofs, len); osd_req->r_page_alignment = ofs & ~PAGE_MASK; -- cgit v1.2.3 From 0903e875caa93e1fb231dd66c69b118dbdad25cb Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Wed, 14 Nov 2012 12:25:19 -0600 Subject: rbd: use a common layout for each device Each osd message includes a layout structure, and for rbd it is always the same (at least for osd's in a given pool). Initialize a layout structure when an rbd_dev gets created and just copy that into osd requests for the rbd image. Replace an assertion that was done when initializing the layout structures with code that catches and handles anything that would trigger the assertion as soon as it is identified. This precludes that (bad) condition from ever occurring. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 34 +++++++++++++++++++++++----------- 1 file changed, 23 insertions(+), 11 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 1c0192c3cf47..d2a6e9589fab 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -235,6 +235,8 @@ struct rbd_device { char *header_name; + struct ceph_file_layout layout; + struct ceph_osd_event *watch_event; struct ceph_osd_request *watch_request; @@ -1091,16 +1093,6 @@ static void rbd_coll_end_req(struct rbd_request *rbd_req, ret, len); } -static void rbd_layout_init(struct ceph_file_layout *layout, u64 pool_id) -{ - memset(layout, 0, sizeof (*layout)); - layout->fl_stripe_unit = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER); - layout->fl_stripe_count = cpu_to_le32(1); - layout->fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER); - rbd_assert(pool_id <= (u64) U32_MAX); - layout->fl_pg_pool = cpu_to_le32((u32) pool_id); -} - /* * Send ceph osd request */ @@ -1165,7 +1157,7 @@ static int rbd_do_request(struct request *rq, strncpy(osd_req->r_oid, object_name, sizeof(osd_req->r_oid)); osd_req->r_oid_len = strlen(osd_req->r_oid); - rbd_layout_init(&osd_req->r_file_layout, rbd_dev->spec->pool_id); + osd_req->r_file_layout = rbd_dev->layout; /* struct */ if (op->op == CEPH_OSD_OP_READ || op->op == CEPH_OSD_OP_WRITE) { op->extent.offset = ofs; @@ -2297,6 +2289,13 @@ struct rbd_device *rbd_dev_create(struct rbd_client *rbdc, rbd_dev->spec = spec; rbd_dev->rbd_client = rbdc; + /* Initialize the layout used for all rbd requests */ + + rbd_dev->layout.fl_stripe_unit = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER); + rbd_dev->layout.fl_stripe_count = cpu_to_le32(1); + rbd_dev->layout.fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER); + rbd_dev->layout.fl_pg_pool = cpu_to_le32((u32) spec->pool_id); + return rbd_dev; } @@ -2551,6 +2550,12 @@ static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev) if (parent_spec->pool_id == CEPH_NOPOOL) goto out; /* No parent? No problem. */ + /* The ceph file layout needs to fit pool id in 32 bits */ + + ret = -EIO; + if (WARN_ON(parent_spec->pool_id > (u64) U32_MAX)) + goto out; + image_id = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL); if (IS_ERR(image_id)) { ret = PTR_ERR(image_id); @@ -3680,6 +3685,13 @@ static ssize_t rbd_add(struct bus_type *bus, goto err_out_client; spec->pool_id = (u64) rc; + /* The ceph file layout needs to fit pool id in 32 bits */ + + if (WARN_ON(spec->pool_id > (u64) U32_MAX)) { + rc = -EIO; + goto err_out_client; + } + rbd_dev = rbd_dev_create(rbdc, spec); if (!rbd_dev) goto err_out_client; -- cgit v1.2.3 From 907703d050df92979b3848ee42f88d5c9c6c13fe Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Tue, 13 Nov 2012 21:11:15 -0600 Subject: rbd: combine rbd sync watch/unwatch functions The rbd_req_sync_watch() and rbd_req_sync_unwatch() functions are nearly identical. Combine them into a single function with a flag indicating whether a watch is to be initiated or torn down. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 81 ++++++++++++++++++----------------------------------- 1 file changed, 27 insertions(+), 54 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index d2a6e9589fab..9d21fcd7e188 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -1429,74 +1429,48 @@ static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data) } /* - * Request sync osd watch + * Request sync osd watch/unwatch. The value of "start" determines + * whether a watch request is being initiated or torn down. */ -static int rbd_req_sync_watch(struct rbd_device *rbd_dev) +static int rbd_req_sync_watch(struct rbd_device *rbd_dev, int start) { struct ceph_osd_req_op *op; - struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; + struct ceph_osd_request **linger_req = NULL; + __le64 version = 0; int ret; op = rbd_create_rw_op(CEPH_OSD_OP_WATCH, 0); if (!op) return -ENOMEM; - ret = ceph_osdc_create_event(osdc, rbd_watch_cb, 0, - (void *)rbd_dev, &rbd_dev->watch_event); - if (ret < 0) - goto fail; - - op->watch.ver = cpu_to_le64(rbd_dev->header.obj_version); - op->watch.cookie = cpu_to_le64(rbd_dev->watch_event->cookie); - op->watch.flag = 1; - - ret = rbd_req_sync_op(rbd_dev, - CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK, - op, - rbd_dev->header_name, - 0, 0, NULL, - &rbd_dev->watch_request, NULL); - - if (ret < 0) - goto fail_event; - - rbd_destroy_op(op); - return 0; - -fail_event: - ceph_osdc_cancel_event(rbd_dev->watch_event); - rbd_dev->watch_event = NULL; -fail: - rbd_destroy_op(op); - return ret; -} - -/* - * Request sync osd unwatch - */ -static int rbd_req_sync_unwatch(struct rbd_device *rbd_dev) -{ - struct ceph_osd_req_op *op; - int ret; + if (start) { + struct ceph_osd_client *osdc; - op = rbd_create_rw_op(CEPH_OSD_OP_WATCH, 0); - if (!op) - return -ENOMEM; + osdc = &rbd_dev->rbd_client->client->osdc; + ret = ceph_osdc_create_event(osdc, rbd_watch_cb, 0, rbd_dev, + &rbd_dev->watch_event); + if (ret < 0) + goto done; + version = cpu_to_le64(rbd_dev->header.obj_version); + linger_req = &rbd_dev->watch_request; + } - op->watch.ver = 0; + op->watch.ver = version; op->watch.cookie = cpu_to_le64(rbd_dev->watch_event->cookie); - op->watch.flag = 0; + op->watch.flag = (u8) start ? 1 : 0; ret = rbd_req_sync_op(rbd_dev, CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK, - op, - rbd_dev->header_name, - 0, 0, NULL, NULL, NULL); - + op, rbd_dev->header_name, + 0, 0, NULL, linger_req, NULL); + if (!start || ret < 0) { + ceph_osdc_cancel_event(rbd_dev->watch_event); + rbd_dev->watch_event = NULL; + } +done: rbd_destroy_op(op); - ceph_osdc_cancel_event(rbd_dev->watch_event); - rbd_dev->watch_event = NULL; + return ret; } @@ -3033,7 +3007,7 @@ static int rbd_init_watch_dev(struct rbd_device *rbd_dev) int ret, rc; do { - ret = rbd_req_sync_watch(rbd_dev); + ret = rbd_req_sync_watch(rbd_dev, 1); if (ret == -ERANGE) { rc = rbd_dev_refresh(rbd_dev, NULL); if (rc < 0) @@ -3752,8 +3726,7 @@ static void rbd_dev_release(struct device *dev) rbd_dev->watch_request); } if (rbd_dev->watch_event) - rbd_req_sync_unwatch(rbd_dev); - + rbd_req_sync_watch(rbd_dev, 0); /* clean up and free blkdev */ rbd_free_disk(rbd_dev); -- cgit v1.2.3 From 2e53c6c379b65372df21f4d6019f6eb63af81384 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Fri, 30 Nov 2012 09:59:47 -0600 Subject: rbd: don't leak rbd_req on synchronous requests When rbd_do_request() is called it allocates and populates an rbd_req structure to hold information about the osd request to be sent. This is done for the benefit of the callback function (in particular, rbd_req_cb()), which uses this in processing when the request completes. Synchronous requests provide no callback function, in which case rbd_do_request() waits for the request to complete before returning. This case is not handling the needed free of the rbd_req structure like it should, so it is getting leaked. Note however that the synchronous case has no need for the rbd_req structure at all. So rather than simply freeing this structure for synchronous requests, just don't allocate it to begin with. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 48 ++++++++++++++++++++++++------------------------ 1 file changed, 24 insertions(+), 24 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 9d21fcd7e188..28b62367ff93 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -1113,20 +1113,11 @@ static int rbd_do_request(struct request *rq, struct ceph_osd_request **linger_req, u64 *ver) { + struct ceph_osd_client *osdc; struct ceph_osd_request *osd_req; - int ret; + struct rbd_request *rbd_req = NULL; struct timespec mtime = CURRENT_TIME; - struct rbd_request *rbd_req; - struct ceph_osd_client *osdc; - - rbd_req = kzalloc(sizeof(*rbd_req), GFP_NOIO); - if (!rbd_req) - return -ENOMEM; - - if (coll) { - rbd_req->coll = coll; - rbd_req->coll_index = coll_index; - } + int ret; dout("rbd_do_request object_name=%s ofs=%llu len=%llu coll=%p[%d]\n", object_name, (unsigned long long) ofs, @@ -1134,10 +1125,8 @@ static int rbd_do_request(struct request *rq, osdc = &rbd_dev->rbd_client->client->osdc; osd_req = ceph_osdc_alloc_request(osdc, snapc, 1, false, GFP_NOIO); - if (!osd_req) { - ret = -ENOMEM; - goto done_pages; - } + if (!osd_req) + return -ENOMEM; osd_req->r_flags = flags; osd_req->r_pages = pages; @@ -1145,13 +1134,22 @@ static int rbd_do_request(struct request *rq, osd_req->r_bio = bio; bio_get(osd_req->r_bio); } - osd_req->r_callback = rbd_cb; - rbd_req->rq = rq; - rbd_req->bio = bio; - rbd_req->pages = pages; - rbd_req->len = len; + if (rbd_cb) { + ret = -ENOMEM; + rbd_req = kmalloc(sizeof(*rbd_req), GFP_NOIO); + if (!rbd_req) + goto done_osd_req; + + rbd_req->rq = rq; + rbd_req->bio = bio; + rbd_req->pages = pages; + rbd_req->len = len; + rbd_req->coll = coll; + rbd_req->coll_index = coll ? coll_index : 0; + } + osd_req->r_callback = rbd_cb; osd_req->r_priv = rbd_req; strncpy(osd_req->r_oid, object_name, sizeof(osd_req->r_oid)); @@ -1193,10 +1191,12 @@ static int rbd_do_request(struct request *rq, return ret; done_err: - bio_chain_put(rbd_req->bio); - ceph_osdc_put_request(osd_req); -done_pages: + if (bio) + bio_chain_put(osd_req->r_bio); kfree(rbd_req); +done_osd_req: + ceph_osdc_put_request(osd_req); + return ret; } -- cgit v1.2.3 From 1821665749a3d7e26ad470c63fc2c46990dc6041 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Fri, 30 Nov 2012 09:59:47 -0600 Subject: rbd: don't leak rbd_req for rbd_req_sync_notify_ack() When rbd_req_sync_notify_ack() calls rbd_do_request() it supplies rbd_simple_req_cb() as its callback function. Because the callback is supplied, an rbd_req structure gets allocated and populated so it can be used by the callback. However rbd_simple_req_cb() is not freeing (or even using) the rbd_req structure, so it's getting leaked. Since rbd_simple_req_cb() has no need for the rbd_req structure, just avoid allocating one for this case. Of the three calls to rbd_do_request(), only the one from rbd_do_op() needs the rbd_req structure, and that call can be distinguished from the other two because it supplies a non-null rbd_collection pointer. So fix this leak by only allocating the rbd_req structure if a non-null "coll" value is provided to rbd_do_request(). Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 28b62367ff93..619d680960b1 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -1135,7 +1135,7 @@ static int rbd_do_request(struct request *rq, bio_get(osd_req->r_bio); } - if (rbd_cb) { + if (coll) { ret = -ENOMEM; rbd_req = kmalloc(sizeof(*rbd_req), GFP_NOIO); if (!rbd_req) @@ -1146,7 +1146,7 @@ static int rbd_do_request(struct request *rq, rbd_req->pages = pages; rbd_req->len = len; rbd_req->coll = coll; - rbd_req->coll_index = coll ? coll_index : 0; + rbd_req->coll_index = coll_index; } osd_req->r_callback = rbd_cb; -- cgit v1.2.3 From c561191813e232aa52022532751855ff5c9fa319 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Mon, 19 Nov 2012 22:55:21 -0600 Subject: rbd: don't assign extent info in rbd_do_request() In rbd_do_request() there's a sort of last-minute assignment of the extent offset and length and payload length for read and write operations. Move those assignments into the caller (in those spots that might initiate read or write operations) Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 619d680960b1..c6917b11800b 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -1156,13 +1156,6 @@ static int rbd_do_request(struct request *rq, osd_req->r_oid_len = strlen(osd_req->r_oid); osd_req->r_file_layout = rbd_dev->layout; /* struct */ - - if (op->op == CEPH_OSD_OP_READ || op->op == CEPH_OSD_OP_WRITE) { - op->extent.offset = ofs; - op->extent.length = len; - if (op->op == CEPH_OSD_OP_WRITE) - op->payload_len = len; - } osd_req->r_num_pages = calc_pages_for(ofs, len); osd_req->r_page_alignment = ofs & ~PAGE_MASK; @@ -1269,6 +1262,13 @@ static int rbd_req_sync_op(struct rbd_device *rbd_dev, if (IS_ERR(pages)) return PTR_ERR(pages); + if (op->op == CEPH_OSD_OP_READ || op->op == CEPH_OSD_OP_WRITE) { + op->extent.offset = ofs; + op->extent.length = inbound_size; + if (op->op == CEPH_OSD_OP_WRITE) + op->payload_len = inbound_size; + } + ret = rbd_do_request(NULL, rbd_dev, NULL, CEPH_NOSNAP, object_name, ofs, inbound_size, NULL, pages, num_pages, @@ -1332,6 +1332,9 @@ static int rbd_do_op(struct request *rq, op = rbd_create_rw_op(opcode, payload_len); if (!op) goto done; + op->extent.offset = seg_ofs; + op->extent.length = seg_len; + op->payload_len = payload_len; /* we've taken care of segment sizes earlier when we cloned the bios. We should never have a segment -- cgit v1.2.3 From 8d23bf29095e5fab84535035e7a27c4920812c44 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Mon, 19 Nov 2012 22:55:21 -0600 Subject: rbd: don't assign extent info in rbd_req_sync_op() Move the assignment of the extent offset and length and payload length out of rbd_req_sync_op() and into its caller in the one spot where a read (and note--no write) operation might be initiated. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 72 ++++++++++++++++++++++++++++++++++------------------- 1 file changed, 46 insertions(+), 26 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index c6917b11800b..235cda083137 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -1025,19 +1025,15 @@ out_err: return NULL; } -static struct ceph_osd_req_op *rbd_create_rw_op(int opcode, u32 payload_len) +static struct ceph_osd_req_op *rbd_create_rw_op(int opcode, u64 ofs, u64 len) { struct ceph_osd_req_op *op; op = kzalloc(sizeof (*op), GFP_NOIO); if (!op) return NULL; - /* - * op extent offset and length will be set later on - * after ceph_calc_file_object_mapping(). - */ + op->op = opcode; - op->payload_len = payload_len; return op; } @@ -1047,6 +1043,42 @@ static void rbd_destroy_op(struct ceph_osd_req_op *op) kfree(op); } +struct ceph_osd_req_op *rbd_osd_req_op_create(u16 opcode, ...) +{ + struct ceph_osd_req_op *op; + va_list args; + + op = kzalloc(sizeof (*op), GFP_NOIO); + if (!op) + return NULL; + op->op = opcode; + va_start(args, opcode); + switch (opcode) { + case CEPH_OSD_OP_READ: + case CEPH_OSD_OP_WRITE: + /* rbd_osd_req_op_create(READ, offset, length) */ + /* rbd_osd_req_op_create(WRITE, offset, length) */ + op->extent.offset = va_arg(args, u64); + op->extent.length = va_arg(args, u64); + if (opcode == CEPH_OSD_OP_WRITE) + op->payload_len = op->extent.length; + break; + default: + rbd_warn(NULL, "unsupported opcode %hu\n", opcode); + kfree(op); + op = NULL; + break; + } + va_end(args); + + return op; +} + +static void rbd_osd_req_op_destroy(struct ceph_osd_req_op *op) +{ + kfree(op); +} + static void rbd_coll_end_req_index(struct request *rq, struct rbd_req_coll *coll, int index, @@ -1262,13 +1294,6 @@ static int rbd_req_sync_op(struct rbd_device *rbd_dev, if (IS_ERR(pages)) return PTR_ERR(pages); - if (op->op == CEPH_OSD_OP_READ || op->op == CEPH_OSD_OP_WRITE) { - op->extent.offset = ofs; - op->extent.length = inbound_size; - if (op->op == CEPH_OSD_OP_WRITE) - op->payload_len = inbound_size; - } - ret = rbd_do_request(NULL, rbd_dev, NULL, CEPH_NOSNAP, object_name, ofs, inbound_size, NULL, pages, num_pages, @@ -1304,7 +1329,6 @@ static int rbd_do_op(struct request *rq, u64 seg_len; int ret; struct ceph_osd_req_op *op; - u32 payload_len; int opcode; int flags; u64 snapid; @@ -1319,22 +1343,17 @@ static int rbd_do_op(struct request *rq, opcode = CEPH_OSD_OP_WRITE; flags = CEPH_OSD_FLAG_WRITE|CEPH_OSD_FLAG_ONDISK; snapid = CEPH_NOSNAP; - payload_len = seg_len; } else { opcode = CEPH_OSD_OP_READ; flags = CEPH_OSD_FLAG_READ; rbd_assert(!snapc); snapid = rbd_dev->spec->snap_id; - payload_len = 0; } ret = -ENOMEM; - op = rbd_create_rw_op(opcode, payload_len); + op = rbd_osd_req_op_create(opcode, seg_ofs, seg_len); if (!op) goto done; - op->extent.offset = seg_ofs; - op->extent.length = seg_len; - op->payload_len = payload_len; /* we've taken care of segment sizes earlier when we cloned the bios. We should never have a segment @@ -1352,7 +1371,7 @@ static int rbd_do_op(struct request *rq, if (ret < 0) rbd_coll_end_req_index(rq, coll, coll_index, (s32)ret, seg_len); - rbd_destroy_op(op); + rbd_osd_req_op_destroy(op); done: kfree(seg_name); return ret; @@ -1370,13 +1389,13 @@ static int rbd_req_sync_read(struct rbd_device *rbd_dev, struct ceph_osd_req_op *op; int ret; - op = rbd_create_rw_op(CEPH_OSD_OP_READ, 0); + op = rbd_osd_req_op_create(CEPH_OSD_OP_READ, ofs, len); if (!op) return -ENOMEM; ret = rbd_req_sync_op(rbd_dev, CEPH_OSD_FLAG_READ, op, object_name, ofs, len, buf, NULL, ver); - rbd_destroy_op(op); + rbd_osd_req_op_destroy(op); return ret; } @@ -1391,7 +1410,7 @@ static int rbd_req_sync_notify_ack(struct rbd_device *rbd_dev, struct ceph_osd_req_op *op; int ret; - op = rbd_create_rw_op(CEPH_OSD_OP_NOTIFY_ACK, 0); + op = rbd_create_rw_op(CEPH_OSD_OP_NOTIFY_ACK, 0, 0); if (!op) return -ENOMEM; @@ -1442,7 +1461,7 @@ static int rbd_req_sync_watch(struct rbd_device *rbd_dev, int start) __le64 version = 0; int ret; - op = rbd_create_rw_op(CEPH_OSD_OP_WATCH, 0); + op = rbd_create_rw_op(CEPH_OSD_OP_WATCH, 0, 0); if (!op) return -ENOMEM; @@ -1505,9 +1524,10 @@ static int rbd_req_sync_exec(struct rbd_device *rbd_dev, * operation. */ payload_size = class_name_len + method_name_len + outbound_size; - op = rbd_create_rw_op(CEPH_OSD_OP_CALL, payload_size); + op = rbd_create_rw_op(CEPH_OSD_OP_CALL, 0, 0); if (!op) return -ENOMEM; + op->payload_len = payload_size; op->cls.class_name = class_name; op->cls.class_len = (__u8) class_name_len; -- cgit v1.2.3 From 2647ba38100765298fc67ce1ec5d32e80d9fe046 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Mon, 19 Nov 2012 22:55:21 -0600 Subject: rbd: move call osd op setup into rbd_osd_req_op_create() Move the initialization of the CEPH_OSD_OP_CALL operation into rbd_osd_req_op_create(). Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 48 ++++++++++++++++++++++++++++++------------------ 1 file changed, 30 insertions(+), 18 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 235cda083137..760be82548d9 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -52,10 +52,12 @@ #define SECTOR_SHIFT 9 #define SECTOR_SIZE (1ULL << SECTOR_SHIFT) -/* It might be useful to have this defined elsewhere too */ +/* It might be useful to have these defined elsewhere */ -#define U32_MAX ((u32) (~0U)) -#define U64_MAX ((u64) (~0ULL)) +#define U8_MAX ((u8) (~0U)) +#define U16_MAX ((u16) (~0U)) +#define U32_MAX ((u32) (~0U)) +#define U64_MAX ((u64) (~0ULL)) #define RBD_DRV_NAME "rbd" #define RBD_DRV_NAME_LONG "rbd (rados block device)" @@ -1047,6 +1049,7 @@ struct ceph_osd_req_op *rbd_osd_req_op_create(u16 opcode, ...) { struct ceph_osd_req_op *op; va_list args; + size_t size; op = kzalloc(sizeof (*op), GFP_NOIO); if (!op) @@ -1063,6 +1066,27 @@ struct ceph_osd_req_op *rbd_osd_req_op_create(u16 opcode, ...) if (opcode == CEPH_OSD_OP_WRITE) op->payload_len = op->extent.length; break; + case CEPH_OSD_OP_CALL: + /* rbd_osd_req_op_create(CALL, class, method, data, datalen) */ + op->cls.class_name = va_arg(args, char *); + size = strlen(op->cls.class_name); + rbd_assert(size <= (size_t) U8_MAX); + op->cls.class_len = size; + op->payload_len = size; + + op->cls.method_name = va_arg(args, char *); + size = strlen(op->cls.method_name); + rbd_assert(size <= (size_t) U8_MAX); + op->cls.method_len = size; + op->payload_len += size; + + op->cls.argc = 0; + op->cls.indata = va_arg(args, void *); + size = va_arg(args, size_t); + rbd_assert(size <= (size_t) U32_MAX); + op->cls.indata_len = (u32) size; + op->payload_len += size; + break; default: rbd_warn(NULL, "unsupported opcode %hu\n", opcode); kfree(op); @@ -1510,9 +1534,6 @@ static int rbd_req_sync_exec(struct rbd_device *rbd_dev, u64 *ver) { struct ceph_osd_req_op *op; - int class_name_len = strlen(class_name); - int method_name_len = strlen(method_name); - int payload_size; int ret; /* @@ -1523,25 +1544,16 @@ static int rbd_req_sync_exec(struct rbd_device *rbd_dev, * the perspective of the server side) in the OSD request * operation. */ - payload_size = class_name_len + method_name_len + outbound_size; - op = rbd_create_rw_op(CEPH_OSD_OP_CALL, 0, 0); + op = rbd_osd_req_op_create(CEPH_OSD_OP_CALL, class_name, + method_name, outbound, outbound_size); if (!op) return -ENOMEM; - op->payload_len = payload_size; - - op->cls.class_name = class_name; - op->cls.class_len = (__u8) class_name_len; - op->cls.method_name = method_name; - op->cls.method_len = (__u8) method_name_len; - op->cls.argc = 0; - op->cls.indata = outbound; - op->cls.indata_len = outbound_size; ret = rbd_req_sync_op(rbd_dev, CEPH_OSD_FLAG_READ, op, object_name, 0, inbound_size, inbound, NULL, ver); - rbd_destroy_op(op); + rbd_osd_req_op_destroy(op); dout("cls_exec returned %d\n", ret); return ret; -- cgit v1.2.3 From 5efea49a98d1a3b3a7301d3a17f826ad4c31b290 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Mon, 19 Nov 2012 22:55:21 -0600 Subject: rbd: move remaining osd op setup into rbd_osd_req_op_create() The two remaining osd ops used by rbd are CEPH_OSD_OP_WATCH and CEPH_OSD_OP_NOTIFY_ACK. Move the setup of those operations into rbd_osd_req_op_create(), and get rid of rbd_create_rw_op() and rbd_destroy_op(). Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 68 +++++++++++++++++++++-------------------------------- 1 file changed, 27 insertions(+), 41 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 760be82548d9..3802a7857280 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -1027,24 +1027,6 @@ out_err: return NULL; } -static struct ceph_osd_req_op *rbd_create_rw_op(int opcode, u64 ofs, u64 len) -{ - struct ceph_osd_req_op *op; - - op = kzalloc(sizeof (*op), GFP_NOIO); - if (!op) - return NULL; - - op->op = opcode; - - return op; -} - -static void rbd_destroy_op(struct ceph_osd_req_op *op) -{ - kfree(op); -} - struct ceph_osd_req_op *rbd_osd_req_op_create(u16 opcode, ...) { struct ceph_osd_req_op *op; @@ -1087,6 +1069,16 @@ struct ceph_osd_req_op *rbd_osd_req_op_create(u16 opcode, ...) op->cls.indata_len = (u32) size; op->payload_len += size; break; + case CEPH_OSD_OP_NOTIFY_ACK: + case CEPH_OSD_OP_WATCH: + /* rbd_osd_req_op_create(NOTIFY_ACK, cookie, version) */ + /* rbd_osd_req_op_create(WATCH, cookie, version, flag) */ + op->watch.cookie = va_arg(args, u64); + op->watch.ver = va_arg(args, u64); + op->watch.ver = cpu_to_le64(op->watch.ver); + if (opcode == CEPH_OSD_OP_WATCH && va_arg(args, int)) + op->watch.flag = (u8) 1; + break; default: rbd_warn(NULL, "unsupported opcode %hu\n", opcode); kfree(op); @@ -1434,14 +1426,10 @@ static int rbd_req_sync_notify_ack(struct rbd_device *rbd_dev, struct ceph_osd_req_op *op; int ret; - op = rbd_create_rw_op(CEPH_OSD_OP_NOTIFY_ACK, 0, 0); + op = rbd_osd_req_op_create(CEPH_OSD_OP_NOTIFY_ACK, notify_id, ver); if (!op) return -ENOMEM; - op->watch.ver = cpu_to_le64(ver); - op->watch.cookie = notify_id; - op->watch.flag = 0; - ret = rbd_do_request(NULL, rbd_dev, NULL, CEPH_NOSNAP, rbd_dev->header_name, 0, 0, NULL, NULL, 0, @@ -1450,7 +1438,8 @@ static int rbd_req_sync_notify_ack(struct rbd_device *rbd_dev, NULL, 0, rbd_simple_req_cb, 0, NULL); - rbd_destroy_op(op); + rbd_osd_req_op_destroy(op); + return ret; } @@ -1480,14 +1469,9 @@ static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data) */ static int rbd_req_sync_watch(struct rbd_device *rbd_dev, int start) { - struct ceph_osd_req_op *op; struct ceph_osd_request **linger_req = NULL; - __le64 version = 0; - int ret; - - op = rbd_create_rw_op(CEPH_OSD_OP_WATCH, 0, 0); - if (!op) - return -ENOMEM; + struct ceph_osd_req_op *op; + int ret = 0; if (start) { struct ceph_osd_client *osdc; @@ -1496,26 +1480,28 @@ static int rbd_req_sync_watch(struct rbd_device *rbd_dev, int start) ret = ceph_osdc_create_event(osdc, rbd_watch_cb, 0, rbd_dev, &rbd_dev->watch_event); if (ret < 0) - goto done; - version = cpu_to_le64(rbd_dev->header.obj_version); + return ret; linger_req = &rbd_dev->watch_request; + } else { + rbd_assert(rbd_dev->watch_request != NULL); } - op->watch.ver = version; - op->watch.cookie = cpu_to_le64(rbd_dev->watch_event->cookie); - op->watch.flag = (u8) start ? 1 : 0; - - ret = rbd_req_sync_op(rbd_dev, + op = rbd_osd_req_op_create(CEPH_OSD_OP_WATCH, + rbd_dev->watch_event->cookie, + rbd_dev->header.obj_version, start); + if (op) + ret = rbd_req_sync_op(rbd_dev, CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK, op, rbd_dev->header_name, 0, 0, NULL, linger_req, NULL); - if (!start || ret < 0) { + /* Cancel the event if we're tearing down, or on error */ + + if (!start || !op || ret < 0) { ceph_osdc_cancel_event(rbd_dev->watch_event); rbd_dev->watch_event = NULL; } -done: - rbd_destroy_op(op); + rbd_osd_req_op_destroy(op); return ret; } -- cgit v1.2.3 From 8b84de7940b69fd7326946ba244621aa5fc412e0 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Tue, 20 Nov 2012 14:17:17 -0600 Subject: rbd: assign watch request more directly Both rbd_req_sync_op() and rbd_do_request() have a "linger" parameter, which is the address of a pointer that should refer to the osd request structure used to issue a request to an osd. Only one case ever supplies a non-null "linger" argument: an CEPH_OSD_OP_WATCH start. And in that one case it is assigned &rbd_dev->watch_request. Within rbd_do_request() (where the assignment ultimately gets made) we know the rbd_dev and therefore its watch_request field. We also know whether the op being sent is CEPH_OSD_OP_WATCH start. Stop opaquely passing down the "linger" pointer, and instead just assign the value directly inside rbd_do_request() when it's needed. This makes it unnecessary for rbd_req_sync_watch() to make arrangements to hold a value that's not available until a bit later. This more clearly separates setting up a watch request from submitting it. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 20 ++++++++------------ 1 file changed, 8 insertions(+), 12 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 3802a7857280..a3a11c3c1cac 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -1158,7 +1158,6 @@ static int rbd_do_request(struct request *rq, int coll_index, void (*rbd_cb)(struct ceph_osd_request *, struct ceph_msg *), - struct ceph_osd_request **linger_req, u64 *ver) { struct ceph_osd_client *osdc; @@ -1210,9 +1209,9 @@ static int rbd_do_request(struct request *rq, ceph_osdc_build_request(osd_req, ofs, len, 1, op, snapc, snapid, &mtime); - if (linger_req) { + if (op->op == CEPH_OSD_OP_WATCH && op->watch.flag) { ceph_osdc_set_request_linger(osdc, osd_req); - *linger_req = osd_req; + rbd_dev->watch_request = osd_req; } ret = ceph_osdc_start_request(osdc, osd_req, false); @@ -1296,7 +1295,6 @@ static int rbd_req_sync_op(struct rbd_device *rbd_dev, const char *object_name, u64 ofs, u64 inbound_size, char *inbound, - struct ceph_osd_request **linger_req, u64 *ver) { int ret; @@ -1317,7 +1315,7 @@ static int rbd_req_sync_op(struct rbd_device *rbd_dev, op, NULL, 0, NULL, - linger_req, ver); + ver); if (ret < 0) goto done; @@ -1383,7 +1381,7 @@ static int rbd_do_op(struct request *rq, flags, op, coll, coll_index, - rbd_req_cb, 0, NULL); + rbd_req_cb, NULL); if (ret < 0) rbd_coll_end_req_index(rq, coll, coll_index, (s32)ret, seg_len); @@ -1410,7 +1408,7 @@ static int rbd_req_sync_read(struct rbd_device *rbd_dev, return -ENOMEM; ret = rbd_req_sync_op(rbd_dev, CEPH_OSD_FLAG_READ, - op, object_name, ofs, len, buf, NULL, ver); + op, object_name, ofs, len, buf, ver); rbd_osd_req_op_destroy(op); return ret; @@ -1436,7 +1434,7 @@ static int rbd_req_sync_notify_ack(struct rbd_device *rbd_dev, CEPH_OSD_FLAG_READ, op, NULL, 0, - rbd_simple_req_cb, 0, NULL); + rbd_simple_req_cb, NULL); rbd_osd_req_op_destroy(op); @@ -1469,7 +1467,6 @@ static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data) */ static int rbd_req_sync_watch(struct rbd_device *rbd_dev, int start) { - struct ceph_osd_request **linger_req = NULL; struct ceph_osd_req_op *op; int ret = 0; @@ -1481,7 +1478,6 @@ static int rbd_req_sync_watch(struct rbd_device *rbd_dev, int start) &rbd_dev->watch_event); if (ret < 0) return ret; - linger_req = &rbd_dev->watch_request; } else { rbd_assert(rbd_dev->watch_request != NULL); } @@ -1493,7 +1489,7 @@ static int rbd_req_sync_watch(struct rbd_device *rbd_dev, int start) ret = rbd_req_sync_op(rbd_dev, CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK, op, rbd_dev->header_name, - 0, 0, NULL, linger_req, NULL); + 0, 0, NULL, NULL); /* Cancel the event if we're tearing down, or on error */ @@ -1537,7 +1533,7 @@ static int rbd_req_sync_exec(struct rbd_device *rbd_dev, ret = rbd_req_sync_op(rbd_dev, CEPH_OSD_FLAG_READ, op, object_name, 0, inbound_size, inbound, - NULL, ver); + ver); rbd_osd_req_op_destroy(op); -- cgit v1.2.3 From e0b49868d3629708eda593b6739cb78f33ab238a Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Wed, 9 Jan 2013 14:44:18 -0600 Subject: rbd: fix type of snap_id in rbd_dev_v2_snap_info() The type of the snap_id local variable is defined with the wrong byte order. Fix that. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/block') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index a3a11c3c1cac..007b726ea0eb 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -2802,7 +2802,7 @@ out: static char *rbd_dev_v2_snap_info(struct rbd_device *rbd_dev, u32 which, u64 *snap_size, u64 *snap_features) { - __le64 snap_id; + u64 snap_id; u8 order; int ret; -- cgit v1.2.3 From 98571b5aa776d4a69eadd7d4e5c9d4e69365ab9a Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Sun, 20 Jan 2013 14:44:42 -0600 Subject: rbd: small changes A few very minor changes to the rbd code: - RBD_MAX_OPT_LEN is unused, so get rid of it - Consolidate rbd options definitions - Make rbd_segment_name() return pointer to const char Signed-off-by: Alex Elder Reviewed-by: Dan Mick Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 007b726ea0eb..4ed074138f8d 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -69,7 +69,6 @@ (NAME_MAX - (sizeof (RBD_SNAP_DEV_NAME_PREFIX) - 1)) #define RBD_MAX_SNAP_COUNT 510 /* allows max snapc to fit in 4KB */ -#define RBD_MAX_OPT_LEN 1024 #define RBD_SNAP_HEAD_NAME "-" @@ -96,8 +95,6 @@ #define DEV_NAME_LEN 32 #define MAX_INT_FORMAT_WIDTH ((5 * sizeof (int)) / 2 + 1) -#define RBD_READ_ONLY_DEFAULT false - /* * block device image metadata (in-memory version) */ @@ -156,10 +153,6 @@ struct rbd_spec { struct kref kref; }; -struct rbd_options { - bool read_only; -}; - /* * an instance of the client. multiple devices may share an rbd client. */ @@ -475,6 +468,12 @@ static match_table_t rbd_opts_tokens = { {-1, NULL} }; +struct rbd_options { + bool read_only; +}; + +#define RBD_READ_ONLY_DEFAULT false + static int parse_rbd_opts_token(char *c, void *private) { struct rbd_options *rbd_opts = private; @@ -773,7 +772,7 @@ static void rbd_header_free(struct rbd_image_header *header) header->snapc = NULL; } -static char *rbd_segment_name(struct rbd_device *rbd_dev, u64 offset) +static const char *rbd_segment_name(struct rbd_device *rbd_dev, u64 offset) { char *name; u64 segment; @@ -1338,7 +1337,7 @@ static int rbd_do_op(struct request *rq, struct rbd_req_coll *coll, int coll_index) { - char *seg_name; + const char *seg_name; u64 seg_ofs; u64 seg_len; int ret; -- cgit v1.2.3 From 38901e0f240b634467fb31743365af80a006be33 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Thu, 10 Jan 2013 12:56:58 -0600 Subject: rbd: check for overflow in rbd_get_num_segments() The return type of rbd_get_num_segments() is int, but the values it operates on are u64. Although it's not likely, there's no guarantee the result won't exceed what can be respresented in an int. The function is already designed to return -ERANGE on error, so just add this possible overflow as another reason to return that. Signed-off-by: Alex Elder Reviewed-by: Dan Mick Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'drivers/block') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 4ed074138f8d..58d01e3a0fce 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -820,6 +820,7 @@ static int rbd_get_num_segments(struct rbd_image_header *header, { u64 start_seg; u64 end_seg; + u64 result; if (!len) return 0; @@ -829,7 +830,11 @@ static int rbd_get_num_segments(struct rbd_image_header *header, start_seg = ofs >> header->obj_order; end_seg = (ofs + len - 1) >> header->obj_order; - return end_seg - start_seg + 1; + result = end_seg - start_seg + 1; + if (result > (u64) INT_MAX) + return -ERANGE; + + return (int) result; } /* -- cgit v1.2.3 From c04306471ad93f1daf60771a0373316d4c3494ae Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Fri, 18 Jan 2013 12:31:09 -0600 Subject: rbd: don't retry setting up header watch When an rbd image is initially mapped a watch event is registered so we can do something if the header object changes. The code that does this currently loops if initiating the watch request results in an ERANGE error. The osds will never return ERANGE, so there's no reason to do this loop, so get rid of it. This resolves: http://tracker.newdream.net/issues/3860 Note that the problem this loop was intended to solve is a race between collecting image header information and setting up the watch on the header object. The real fix for that problem is described here: http://tracker.newdream.net/issues/3871 Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 23 ++++------------------- 1 file changed, 4 insertions(+), 19 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 58d01e3a0fce..668936381ab0 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -1474,6 +1474,9 @@ static int rbd_req_sync_watch(struct rbd_device *rbd_dev, int start) struct ceph_osd_req_op *op; int ret = 0; + rbd_assert(start ^ !!rbd_dev->watch_event); + rbd_assert(start ^ !!rbd_dev->watch_request); + if (start) { struct ceph_osd_client *osdc; @@ -1482,8 +1485,6 @@ static int rbd_req_sync_watch(struct rbd_device *rbd_dev, int start) &rbd_dev->watch_event); if (ret < 0) return ret; - } else { - rbd_assert(rbd_dev->watch_request != NULL); } op = rbd_osd_req_op_create(CEPH_OSD_OP_WATCH, @@ -3023,22 +3024,6 @@ static void rbd_bus_del_dev(struct rbd_device *rbd_dev) device_unregister(&rbd_dev->dev); } -static int rbd_init_watch_dev(struct rbd_device *rbd_dev) -{ - int ret, rc; - - do { - ret = rbd_req_sync_watch(rbd_dev, 1); - if (ret == -ERANGE) { - rc = rbd_dev_refresh(rbd_dev, NULL); - if (rc < 0) - return rc; - } - } while (ret == -ERANGE); - - return ret; -} - static atomic64_t rbd_dev_id_max = ATOMIC64_INIT(0); /* @@ -3584,7 +3569,7 @@ static int rbd_dev_probe_finish(struct rbd_device *rbd_dev) if (ret) goto err_out_bus; - ret = rbd_init_watch_dev(rbd_dev); + ret = rbd_req_sync_watch(rbd_dev, 1); if (ret) goto err_out_bus; -- cgit v1.2.3 From bf0d5f503dc11d6314c0503591d258d60ee9c944 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Thu, 22 Nov 2012 00:00:08 -0600 Subject: rbd: new request tracking code This patch fully implements the new request tracking code for rbd I/O requests. Each I/O request to an rbd image will get an rbd_image_request structure allocated to track it. This provides access to all information about the original request, as well as access to the set of one or more object requests that are initiated as a result of the image request. An rbd_obj_request structure defines a request sent to a single osd object (possibly) as part of an rbd image request. An rbd object request refers to a ceph_osd_request structure built up to represent the request; for now it will contain a single osd operation. It also provides space to hold the result status and the version of the object when the osd request completes. An rbd_obj_request structure can also stand on its own. This will be used for reading the version 1 header object, for issuing acknowledgements to event notifications, and for making object method calls. All rbd object requests now complete asynchronously with respect to the osd client--they supply a common callback routine. This resolves: http://tracker.newdream.net/issues/3741 Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 621 +++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 619 insertions(+), 2 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 668936381ab0..daa0f18f7089 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -181,6 +181,67 @@ struct rbd_req_coll { struct rbd_req_status status[0]; }; +struct rbd_img_request; +typedef void (*rbd_img_callback_t)(struct rbd_img_request *); + +#define BAD_WHICH U32_MAX /* Good which or bad which, which? */ + +struct rbd_obj_request; +typedef void (*rbd_obj_callback_t)(struct rbd_obj_request *); + +enum obj_request_type { OBJ_REQUEST_BIO }; /* More types to come */ + +struct rbd_obj_request { + const char *object_name; + u64 offset; /* object start byte */ + u64 length; /* bytes from offset */ + + struct rbd_img_request *img_request; + struct list_head links; /* img_request->obj_requests */ + u32 which; /* posn image request list */ + + enum obj_request_type type; + struct bio *bio_list; + + struct ceph_osd_request *osd_req; + + u64 xferred; /* bytes transferred */ + u64 version; + s32 result; + atomic_t done; + + rbd_obj_callback_t callback; + + struct kref kref; +}; + +struct rbd_img_request { + struct request *rq; + struct rbd_device *rbd_dev; + u64 offset; /* starting image byte offset */ + u64 length; /* byte count from offset */ + bool write_request; /* false for read */ + union { + struct ceph_snap_context *snapc; /* for writes */ + u64 snap_id; /* for reads */ + }; + spinlock_t completion_lock;/* protects next_completion */ + u32 next_completion; + rbd_img_callback_t callback; + + u32 obj_request_count; + struct list_head obj_requests; /* rbd_obj_request structs */ + + struct kref kref; +}; + +#define for_each_obj_request(ireq, oreq) \ + list_for_each_entry(oreq, &ireq->obj_requests, links) +#define for_each_obj_request_from(ireq, oreq) \ + list_for_each_entry_from(oreq, &ireq->obj_requests, links) +#define for_each_obj_request_safe(ireq, oreq, n) \ + list_for_each_entry_safe_reverse(oreq, n, &ireq->obj_requests, links) + /* * a single io request */ @@ -1031,6 +1092,62 @@ out_err: return NULL; } +static void rbd_obj_request_get(struct rbd_obj_request *obj_request) +{ + kref_get(&obj_request->kref); +} + +static void rbd_obj_request_destroy(struct kref *kref); +static void rbd_obj_request_put(struct rbd_obj_request *obj_request) +{ + rbd_assert(obj_request != NULL); + kref_put(&obj_request->kref, rbd_obj_request_destroy); +} + +static void rbd_img_request_get(struct rbd_img_request *img_request) +{ + kref_get(&img_request->kref); +} + +static void rbd_img_request_destroy(struct kref *kref); +static void rbd_img_request_put(struct rbd_img_request *img_request) +{ + rbd_assert(img_request != NULL); + kref_put(&img_request->kref, rbd_img_request_destroy); +} + +static inline void rbd_img_obj_request_add(struct rbd_img_request *img_request, + struct rbd_obj_request *obj_request) +{ + rbd_obj_request_get(obj_request); + obj_request->img_request = img_request; + list_add_tail(&obj_request->links, &img_request->obj_requests); + obj_request->which = img_request->obj_request_count++; + rbd_assert(obj_request->which != BAD_WHICH); +} + +static inline void rbd_img_obj_request_del(struct rbd_img_request *img_request, + struct rbd_obj_request *obj_request) +{ + rbd_assert(obj_request->which != BAD_WHICH); + obj_request->which = BAD_WHICH; + list_del(&obj_request->links); + rbd_assert(obj_request->img_request == img_request); + obj_request->callback = NULL; + obj_request->img_request = NULL; + rbd_obj_request_put(obj_request); +} + +static bool obj_request_type_valid(enum obj_request_type type) +{ + switch (type) { + case OBJ_REQUEST_BIO: + return true; + default: + return false; + } +} + struct ceph_osd_req_op *rbd_osd_req_op_create(u16 opcode, ...) { struct ceph_osd_req_op *op; @@ -1395,6 +1512,26 @@ done: return ret; } +static int rbd_obj_request_submit(struct ceph_osd_client *osdc, + struct rbd_obj_request *obj_request) +{ + return ceph_osdc_start_request(osdc, obj_request->osd_req, false); +} + +static void rbd_img_request_complete(struct rbd_img_request *img_request) +{ + if (img_request->callback) + img_request->callback(img_request); + else + rbd_img_request_put(img_request); +} + +static void rbd_obj_request_complete(struct rbd_obj_request *obj_request) +{ + if (obj_request->callback) + obj_request->callback(obj_request); +} + /* * Request sync osd read */ @@ -1618,6 +1755,486 @@ static int rbd_dev_do_request(struct request *rq, return 0; } +static void rbd_osd_read_callback(struct rbd_obj_request *obj_request, + struct ceph_osd_op *op) +{ + u64 xferred; + + /* + * We support a 64-bit length, but ultimately it has to be + * passed to blk_end_request(), which takes an unsigned int. + */ + xferred = le64_to_cpu(op->extent.length); + rbd_assert(xferred < (u64) UINT_MAX); + if (obj_request->result == (s32) -ENOENT) { + zero_bio_chain(obj_request->bio_list, 0); + obj_request->result = 0; + } else if (xferred < obj_request->length && !obj_request->result) { + zero_bio_chain(obj_request->bio_list, xferred); + xferred = obj_request->length; + } + obj_request->xferred = xferred; + atomic_set(&obj_request->done, 1); +} + +static void rbd_osd_write_callback(struct rbd_obj_request *obj_request, + struct ceph_osd_op *op) +{ + obj_request->xferred = le64_to_cpu(op->extent.length); + atomic_set(&obj_request->done, 1); +} + +static void rbd_osd_req_callback(struct ceph_osd_request *osd_req, + struct ceph_msg *msg) +{ + struct rbd_obj_request *obj_request = osd_req->r_priv; + struct ceph_osd_reply_head *reply_head; + struct ceph_osd_op *op; + u32 num_ops; + u16 opcode; + + rbd_assert(osd_req == obj_request->osd_req); + rbd_assert(!!obj_request->img_request ^ + (obj_request->which == BAD_WHICH)); + + obj_request->xferred = le32_to_cpu(msg->hdr.data_len); + reply_head = msg->front.iov_base; + obj_request->result = (s32) le32_to_cpu(reply_head->result); + obj_request->version = le64_to_cpu(osd_req->r_reassert_version.version); + + num_ops = le32_to_cpu(reply_head->num_ops); + WARN_ON(num_ops != 1); /* For now */ + + op = &reply_head->ops[0]; + opcode = le16_to_cpu(op->op); + switch (opcode) { + case CEPH_OSD_OP_READ: + rbd_osd_read_callback(obj_request, op); + break; + case CEPH_OSD_OP_WRITE: + rbd_osd_write_callback(obj_request, op); + break; + default: + rbd_warn(NULL, "%s: unsupported op %hu\n", + obj_request->object_name, (unsigned short) opcode); + break; + } + + if (atomic_read(&obj_request->done)) + rbd_obj_request_complete(obj_request); +} + +static struct ceph_osd_request *rbd_osd_req_create( + struct rbd_device *rbd_dev, + bool write_request, + struct rbd_obj_request *obj_request, + struct ceph_osd_req_op *op) +{ + struct rbd_img_request *img_request = obj_request->img_request; + struct ceph_snap_context *snapc = NULL; + struct ceph_osd_client *osdc; + struct ceph_osd_request *osd_req; + struct timespec now; + struct timespec *mtime; + u64 snap_id = CEPH_NOSNAP; + u64 offset = obj_request->offset; + u64 length = obj_request->length; + + if (img_request) { + rbd_assert(img_request->write_request == write_request); + if (img_request->write_request) + snapc = img_request->snapc; + else + snap_id = img_request->snap_id; + } + + /* Allocate and initialize the request, for the single op */ + + osdc = &rbd_dev->rbd_client->client->osdc; + osd_req = ceph_osdc_alloc_request(osdc, snapc, 1, false, GFP_ATOMIC); + if (!osd_req) + return NULL; /* ENOMEM */ + + rbd_assert(obj_request_type_valid(obj_request->type)); + switch (obj_request->type) { + case OBJ_REQUEST_BIO: + rbd_assert(obj_request->bio_list != NULL); + osd_req->r_bio = obj_request->bio_list; + bio_get(osd_req->r_bio); + /* osd client requires "num pages" even for bio */ + osd_req->r_num_pages = calc_pages_for(offset, length); + break; + } + + if (write_request) { + osd_req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK; + now = CURRENT_TIME; + mtime = &now; + } else { + osd_req->r_flags = CEPH_OSD_FLAG_READ; + mtime = NULL; /* not needed for reads */ + offset = 0; /* These are not used... */ + length = 0; /* ...for osd read requests */ + } + + osd_req->r_callback = rbd_osd_req_callback; + osd_req->r_priv = obj_request; + + osd_req->r_oid_len = strlen(obj_request->object_name); + rbd_assert(osd_req->r_oid_len < sizeof (osd_req->r_oid)); + memcpy(osd_req->r_oid, obj_request->object_name, osd_req->r_oid_len); + + osd_req->r_file_layout = rbd_dev->layout; /* struct */ + + /* osd_req will get its own reference to snapc (if non-null) */ + + ceph_osdc_build_request(osd_req, offset, length, 1, op, + snapc, snap_id, mtime); + + return osd_req; +} + +static void rbd_osd_req_destroy(struct ceph_osd_request *osd_req) +{ + ceph_osdc_put_request(osd_req); +} + +/* object_name is assumed to be a non-null pointer and NUL-terminated */ + +static struct rbd_obj_request *rbd_obj_request_create(const char *object_name, + u64 offset, u64 length, + enum obj_request_type type) +{ + struct rbd_obj_request *obj_request; + size_t size; + char *name; + + rbd_assert(obj_request_type_valid(type)); + + size = strlen(object_name) + 1; + obj_request = kzalloc(sizeof (*obj_request) + size, GFP_KERNEL); + if (!obj_request) + return NULL; + + name = (char *)(obj_request + 1); + obj_request->object_name = memcpy(name, object_name, size); + obj_request->offset = offset; + obj_request->length = length; + obj_request->which = BAD_WHICH; + obj_request->type = type; + INIT_LIST_HEAD(&obj_request->links); + atomic_set(&obj_request->done, 0); + kref_init(&obj_request->kref); + + return obj_request; +} + +static void rbd_obj_request_destroy(struct kref *kref) +{ + struct rbd_obj_request *obj_request; + + obj_request = container_of(kref, struct rbd_obj_request, kref); + + rbd_assert(obj_request->img_request == NULL); + rbd_assert(obj_request->which == BAD_WHICH); + + if (obj_request->osd_req) + rbd_osd_req_destroy(obj_request->osd_req); + + rbd_assert(obj_request_type_valid(obj_request->type)); + switch (obj_request->type) { + case OBJ_REQUEST_BIO: + if (obj_request->bio_list) + bio_chain_put(obj_request->bio_list); + break; + } + + kfree(obj_request); +} + +/* + * Caller is responsible for filling in the list of object requests + * that comprises the image request, and the Linux request pointer + * (if there is one). + */ +struct rbd_img_request *rbd_img_request_create(struct rbd_device *rbd_dev, + u64 offset, u64 length, + bool write_request) +{ + struct rbd_img_request *img_request; + struct ceph_snap_context *snapc = NULL; + + img_request = kmalloc(sizeof (*img_request), GFP_ATOMIC); + if (!img_request) + return NULL; + + if (write_request) { + down_read(&rbd_dev->header_rwsem); + snapc = ceph_get_snap_context(rbd_dev->header.snapc); + up_read(&rbd_dev->header_rwsem); + if (WARN_ON(!snapc)) { + kfree(img_request); + return NULL; /* Shouldn't happen */ + } + } + + img_request->rq = NULL; + img_request->rbd_dev = rbd_dev; + img_request->offset = offset; + img_request->length = length; + img_request->write_request = write_request; + if (write_request) + img_request->snapc = snapc; + else + img_request->snap_id = rbd_dev->spec->snap_id; + spin_lock_init(&img_request->completion_lock); + img_request->next_completion = 0; + img_request->callback = NULL; + img_request->obj_request_count = 0; + INIT_LIST_HEAD(&img_request->obj_requests); + kref_init(&img_request->kref); + + rbd_img_request_get(img_request); /* Avoid a warning */ + rbd_img_request_put(img_request); /* TEMPORARY */ + + return img_request; +} + +static void rbd_img_request_destroy(struct kref *kref) +{ + struct rbd_img_request *img_request; + struct rbd_obj_request *obj_request; + struct rbd_obj_request *next_obj_request; + + img_request = container_of(kref, struct rbd_img_request, kref); + + for_each_obj_request_safe(img_request, obj_request, next_obj_request) + rbd_img_obj_request_del(img_request, obj_request); + + if (img_request->write_request) + ceph_put_snap_context(img_request->snapc); + + kfree(img_request); +} + +static int rbd_img_request_fill_bio(struct rbd_img_request *img_request, + struct bio *bio_list) +{ + struct rbd_device *rbd_dev = img_request->rbd_dev; + struct rbd_obj_request *obj_request = NULL; + struct rbd_obj_request *next_obj_request; + unsigned int bio_offset; + u64 image_offset; + u64 resid; + u16 opcode; + + opcode = img_request->write_request ? CEPH_OSD_OP_WRITE + : CEPH_OSD_OP_READ; + bio_offset = 0; + image_offset = img_request->offset; + rbd_assert(image_offset == bio_list->bi_sector << SECTOR_SHIFT); + resid = img_request->length; + while (resid) { + const char *object_name; + unsigned int clone_size; + struct ceph_osd_req_op *op; + u64 offset; + u64 length; + + object_name = rbd_segment_name(rbd_dev, image_offset); + if (!object_name) + goto out_unwind; + offset = rbd_segment_offset(rbd_dev, image_offset); + length = rbd_segment_length(rbd_dev, image_offset, resid); + obj_request = rbd_obj_request_create(object_name, + offset, length, + OBJ_REQUEST_BIO); + kfree(object_name); /* object request has its own copy */ + if (!obj_request) + goto out_unwind; + + rbd_assert(length <= (u64) UINT_MAX); + clone_size = (unsigned int) length; + obj_request->bio_list = bio_chain_clone_range(&bio_list, + &bio_offset, clone_size, + GFP_ATOMIC); + if (!obj_request->bio_list) + goto out_partial; + + /* + * Build up the op to use in building the osd + * request. Note that the contents of the op are + * copied by rbd_osd_req_create(). + */ + op = rbd_osd_req_op_create(opcode, offset, length); + if (!op) + goto out_partial; + obj_request->osd_req = rbd_osd_req_create(rbd_dev, + img_request->write_request, + obj_request, op); + rbd_osd_req_op_destroy(op); + if (!obj_request->osd_req) + goto out_partial; + /* status and version are initially zero-filled */ + + rbd_img_obj_request_add(img_request, obj_request); + + image_offset += length; + resid -= length; + } + + return 0; + +out_partial: + rbd_obj_request_put(obj_request); +out_unwind: + for_each_obj_request_safe(img_request, obj_request, next_obj_request) + rbd_obj_request_put(obj_request); + + return -ENOMEM; +} + +static void rbd_img_obj_callback(struct rbd_obj_request *obj_request) +{ + struct rbd_img_request *img_request; + u32 which = obj_request->which; + bool more = true; + + img_request = obj_request->img_request; + rbd_assert(img_request != NULL); + rbd_assert(img_request->rq != NULL); + rbd_assert(which != BAD_WHICH); + rbd_assert(which < img_request->obj_request_count); + rbd_assert(which >= img_request->next_completion); + + spin_lock_irq(&img_request->completion_lock); + if (which != img_request->next_completion) + goto out; + + for_each_obj_request_from(img_request, obj_request) { + unsigned int xferred; + int result; + + rbd_assert(more); + rbd_assert(which < img_request->obj_request_count); + + if (!atomic_read(&obj_request->done)) + break; + + rbd_assert(obj_request->xferred <= (u64) UINT_MAX); + xferred = (unsigned int) obj_request->xferred; + result = (int) obj_request->result; + if (result) + rbd_warn(NULL, "obj_request %s result %d xferred %u\n", + img_request->write_request ? "write" : "read", + result, xferred); + + more = blk_end_request(img_request->rq, result, xferred); + which++; + } + rbd_assert(more ^ (which == img_request->obj_request_count)); + img_request->next_completion = which; +out: + spin_unlock_irq(&img_request->completion_lock); + + if (!more) + rbd_img_request_complete(img_request); +} + +static int rbd_img_request_submit(struct rbd_img_request *img_request) +{ + struct rbd_device *rbd_dev = img_request->rbd_dev; + struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; + struct rbd_obj_request *obj_request; + + for_each_obj_request(img_request, obj_request) { + int ret; + + obj_request->callback = rbd_img_obj_callback; + ret = rbd_obj_request_submit(osdc, obj_request); + if (ret) + return ret; + /* + * The image request has its own reference to each + * of its object requests, so we can safely drop the + * initial one here. + */ + rbd_obj_request_put(obj_request); + } + + return 0; +} + +static void rbd_request_fn(struct request_queue *q) +{ + struct rbd_device *rbd_dev = q->queuedata; + bool read_only = rbd_dev->mapping.read_only; + struct request *rq; + int result; + + while ((rq = blk_fetch_request(q))) { + bool write_request = rq_data_dir(rq) == WRITE; + struct rbd_img_request *img_request; + u64 offset; + u64 length; + + /* Ignore any non-FS requests that filter through. */ + + if (rq->cmd_type != REQ_TYPE_FS) { + __blk_end_request_all(rq, 0); + continue; + } + + spin_unlock_irq(q->queue_lock); + + /* Disallow writes to a read-only device */ + + if (write_request) { + result = -EROFS; + if (read_only) + goto end_request; + rbd_assert(rbd_dev->spec->snap_id == CEPH_NOSNAP); + } + + /* Quit early if the snapshot has disappeared */ + + if (!atomic_read(&rbd_dev->exists)) { + dout("request for non-existent snapshot"); + rbd_assert(rbd_dev->spec->snap_id != CEPH_NOSNAP); + result = -ENXIO; + goto end_request; + } + + offset = (u64) blk_rq_pos(rq) << SECTOR_SHIFT; + length = (u64) blk_rq_bytes(rq); + + result = -EINVAL; + if (WARN_ON(offset && length > U64_MAX - offset + 1)) + goto end_request; /* Shouldn't happen */ + + result = -ENOMEM; + img_request = rbd_img_request_create(rbd_dev, offset, length, + write_request); + if (!img_request) + goto end_request; + + img_request->rq = rq; + + result = rbd_img_request_fill_bio(img_request, rq->bio); + if (!result) + result = rbd_img_request_submit(img_request); + if (result) + rbd_img_request_put(img_request); +end_request: + spin_lock_irq(q->queue_lock); + if (result < 0) { + rbd_warn(rbd_dev, "obj_request %s result %d\n", + write_request ? "write" : "read", result); + __blk_end_request_all(rq, result); + } + } +} + /* * block device queue callback */ @@ -1929,8 +2546,8 @@ static int rbd_init_disk(struct rbd_device *rbd_dev) disk->fops = &rbd_bd_ops; disk->private_data = rbd_dev; - /* init rq */ - q = blk_init_queue(rbd_rq_fn, &rbd_dev->lock); + (void) rbd_rq_fn; /* avoid a warning */ + q = blk_init_queue(rbd_request_fn, &rbd_dev->lock); if (!q) goto out_disk; -- cgit v1.2.3 From 2250a71b591728092db9adcc51629401deb2f9f8 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Fri, 30 Nov 2012 17:53:04 -0600 Subject: rbd: kill rbd_rq_fn() and all other related code Now that the request function has been replaced by one using the new request management data structures the old one can go away. Deleting it makes rbd_dev_do_request() no longer needed, and deleting that makes other functions unneeded, and so on. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 319 ---------------------------------------------------- 1 file changed, 319 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index daa0f18f7089..4c175a7d3f7e 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -621,18 +621,6 @@ static void rbd_put_client(struct rbd_client *rbdc) kref_put(&rbdc->kref, rbd_client_release); } -/* - * Destroy requests collection - */ -static void rbd_coll_release(struct kref *kref) -{ - struct rbd_req_coll *coll = - container_of(kref, struct rbd_req_coll, kref); - - dout("rbd_coll_release %p\n", coll); - kfree(coll); -} - static bool rbd_image_format_valid(u32 image_format) { return image_format == 1 || image_format == 2; @@ -876,28 +864,6 @@ static u64 rbd_segment_length(struct rbd_device *rbd_dev, return length; } -static int rbd_get_num_segments(struct rbd_image_header *header, - u64 ofs, u64 len) -{ - u64 start_seg; - u64 end_seg; - u64 result; - - if (!len) - return 0; - if (len - 1 > U64_MAX - ofs) - return -ERANGE; - - start_seg = ofs >> header->obj_order; - end_seg = (ofs + len - 1) >> header->obj_order; - - result = end_seg - start_seg + 1; - if (result > (u64) INT_MAX) - return -ERANGE; - - return (int) result; -} - /* * returns the size of an object in the image */ @@ -1216,52 +1182,6 @@ static void rbd_osd_req_op_destroy(struct ceph_osd_req_op *op) kfree(op); } -static void rbd_coll_end_req_index(struct request *rq, - struct rbd_req_coll *coll, - int index, - s32 ret, u64 len) -{ - struct request_queue *q; - int min, max, i; - - dout("rbd_coll_end_req_index %p index %d ret %d len %llu\n", - coll, index, (int)ret, (unsigned long long)len); - - if (!rq) - return; - - if (!coll) { - blk_end_request(rq, ret, len); - return; - } - - q = rq->q; - - spin_lock_irq(q->queue_lock); - coll->status[index].done = 1; - coll->status[index].rc = ret; - coll->status[index].bytes = len; - max = min = coll->num_done; - while (max < coll->total && coll->status[max].done) - max++; - - for (i = min; istatus[i].rc, - coll->status[i].bytes); - coll->num_done++; - kref_put(&coll->kref, rbd_coll_release); - } - spin_unlock_irq(q->queue_lock); -} - -static void rbd_coll_end_req(struct rbd_request *rbd_req, - s32 ret, u64 len) -{ - rbd_coll_end_req_index(rbd_req->rq, - rbd_req->coll, rbd_req->coll_index, - ret, len); -} - /* * Send ceph osd request */ @@ -1361,46 +1281,6 @@ done_osd_req: return ret; } -/* - * Ceph osd op callback - */ -static void rbd_req_cb(struct ceph_osd_request *osd_req, struct ceph_msg *msg) -{ - struct rbd_request *rbd_req = osd_req->r_priv; - struct ceph_osd_reply_head *replyhead; - struct ceph_osd_op *op; - s32 rc; - u64 bytes; - int read_op; - - /* parse reply */ - replyhead = msg->front.iov_base; - WARN_ON(le32_to_cpu(replyhead->num_ops) == 0); - op = (void *)(replyhead + 1); - rc = (s32)le32_to_cpu(replyhead->result); - bytes = le64_to_cpu(op->extent.length); - read_op = (le16_to_cpu(op->op) == CEPH_OSD_OP_READ); - - dout("rbd_req_cb bytes=%llu readop=%d rc=%d\n", - (unsigned long long) bytes, read_op, (int) rc); - - if (rc == (s32)-ENOENT && read_op) { - zero_bio_chain(rbd_req->bio, 0); - rc = 0; - } else if (rc == 0 && read_op && bytes < rbd_req->len) { - zero_bio_chain(rbd_req->bio, bytes); - bytes = rbd_req->len; - } - - rbd_coll_end_req(rbd_req, rc, bytes); - - if (rbd_req->bio) - bio_chain_put(rbd_req->bio); - - ceph_osdc_put_request(osd_req); - kfree(rbd_req); -} - static void rbd_simple_req_cb(struct ceph_osd_request *osd_req, struct ceph_msg *msg) { @@ -1448,70 +1328,6 @@ done: return ret; } -/* - * Do an asynchronous ceph osd operation - */ -static int rbd_do_op(struct request *rq, - struct rbd_device *rbd_dev, - struct ceph_snap_context *snapc, - u64 ofs, u64 len, - struct bio *bio, - struct rbd_req_coll *coll, - int coll_index) -{ - const char *seg_name; - u64 seg_ofs; - u64 seg_len; - int ret; - struct ceph_osd_req_op *op; - int opcode; - int flags; - u64 snapid; - - seg_name = rbd_segment_name(rbd_dev, ofs); - if (!seg_name) - return -ENOMEM; - seg_len = rbd_segment_length(rbd_dev, ofs, len); - seg_ofs = rbd_segment_offset(rbd_dev, ofs); - - if (rq_data_dir(rq) == WRITE) { - opcode = CEPH_OSD_OP_WRITE; - flags = CEPH_OSD_FLAG_WRITE|CEPH_OSD_FLAG_ONDISK; - snapid = CEPH_NOSNAP; - } else { - opcode = CEPH_OSD_OP_READ; - flags = CEPH_OSD_FLAG_READ; - rbd_assert(!snapc); - snapid = rbd_dev->spec->snap_id; - } - - ret = -ENOMEM; - op = rbd_osd_req_op_create(opcode, seg_ofs, seg_len); - if (!op) - goto done; - - /* we've taken care of segment sizes earlier when we - cloned the bios. We should never have a segment - truncated at this point */ - rbd_assert(seg_len == len); - - ret = rbd_do_request(rq, rbd_dev, snapc, snapid, - seg_name, seg_ofs, seg_len, - bio, - NULL, 0, - flags, - op, - coll, coll_index, - rbd_req_cb, NULL); - if (ret < 0) - rbd_coll_end_req_index(rq, coll, coll_index, - (s32)ret, seg_len); - rbd_osd_req_op_destroy(op); -done: - kfree(seg_name); - return ret; -} - static int rbd_obj_request_submit(struct ceph_osd_client *osdc, struct rbd_obj_request *obj_request) { @@ -1683,78 +1499,6 @@ static int rbd_req_sync_exec(struct rbd_device *rbd_dev, return ret; } -static struct rbd_req_coll *rbd_alloc_coll(int num_reqs) -{ - struct rbd_req_coll *coll = - kzalloc(sizeof(struct rbd_req_coll) + - sizeof(struct rbd_req_status) * num_reqs, - GFP_ATOMIC); - - if (!coll) - return NULL; - coll->total = num_reqs; - kref_init(&coll->kref); - return coll; -} - -static int rbd_dev_do_request(struct request *rq, - struct rbd_device *rbd_dev, - struct ceph_snap_context *snapc, - u64 ofs, unsigned int size, - struct bio *bio_chain) -{ - int num_segs; - struct rbd_req_coll *coll; - unsigned int bio_offset; - int cur_seg = 0; - - dout("%s 0x%x bytes at 0x%llx\n", - rq_data_dir(rq) == WRITE ? "write" : "read", - size, (unsigned long long) blk_rq_pos(rq) * SECTOR_SIZE); - - num_segs = rbd_get_num_segments(&rbd_dev->header, ofs, size); - if (num_segs <= 0) - return num_segs; - - coll = rbd_alloc_coll(num_segs); - if (!coll) - return -ENOMEM; - - bio_offset = 0; - do { - u64 limit = rbd_segment_length(rbd_dev, ofs, size); - unsigned int clone_size; - struct bio *bio_clone; - - BUG_ON(limit > (u64)UINT_MAX); - clone_size = (unsigned int)limit; - dout("bio_chain->bi_vcnt=%hu\n", bio_chain->bi_vcnt); - - kref_get(&coll->kref); - - /* Pass a cloned bio chain via an osd request */ - - bio_clone = bio_chain_clone_range(&bio_chain, - &bio_offset, clone_size, - GFP_ATOMIC); - if (bio_clone) - (void)rbd_do_op(rq, rbd_dev, snapc, - ofs, clone_size, - bio_clone, coll, cur_seg); - else - rbd_coll_end_req_index(rq, coll, cur_seg, - (s32)-ENOMEM, - clone_size); - size -= clone_size; - ofs += clone_size; - - cur_seg++; - } while (size > 0); - kref_put(&coll->kref, rbd_coll_release); - - return 0; -} - static void rbd_osd_read_callback(struct rbd_obj_request *obj_request, struct ceph_osd_op *op) { @@ -2235,68 +1979,6 @@ end_request: } } -/* - * block device queue callback - */ -static void rbd_rq_fn(struct request_queue *q) -{ - struct rbd_device *rbd_dev = q->queuedata; - bool read_only = rbd_dev->mapping.read_only; - struct request *rq; - - while ((rq = blk_fetch_request(q))) { - struct ceph_snap_context *snapc = NULL; - unsigned int size = 0; - int result; - - dout("fetched request\n"); - - /* Filter out block requests we don't understand */ - - if ((rq->cmd_type != REQ_TYPE_FS)) { - __blk_end_request_all(rq, 0); - continue; - } - spin_unlock_irq(q->queue_lock); - - /* Write requests need a reference to the snapshot context */ - - if (rq_data_dir(rq) == WRITE) { - result = -EROFS; - if (read_only) /* Can't write to a read-only device */ - goto out_end_request; - - /* - * Note that each osd request will take its - * own reference to the snapshot context - * supplied. The reference we take here - * just guarantees the one we provide stays - * valid. - */ - down_read(&rbd_dev->header_rwsem); - snapc = ceph_get_snap_context(rbd_dev->header.snapc); - up_read(&rbd_dev->header_rwsem); - rbd_assert(snapc != NULL); - } else if (!atomic_read(&rbd_dev->exists)) { - rbd_assert(rbd_dev->spec->snap_id != CEPH_NOSNAP); - dout("request for non-existent snapshot"); - result = -ENXIO; - goto out_end_request; - } - - size = blk_rq_bytes(rq); - result = rbd_dev_do_request(rq, rbd_dev, snapc, - blk_rq_pos(rq) * SECTOR_SIZE, - size, rq->bio); -out_end_request: - if (snapc) - ceph_put_snap_context(snapc); - spin_lock_irq(q->queue_lock); - if (!size || result < 0) - __blk_end_request_all(rq, result); - } -} - /* * a queue callback. Makes sure that we don't create a bio that spans across * multiple osd objects. One exception would be with a single page bios, @@ -2546,7 +2228,6 @@ static int rbd_init_disk(struct rbd_device *rbd_dev) disk->fops = &rbd_bd_ops; disk->private_data = rbd_dev; - (void) rbd_rq_fn; /* avoid a warning */ q = blk_init_queue(rbd_request_fn, &rbd_dev->lock); if (!q) goto out_disk; -- cgit v1.2.3 From 7d250b949a33c8a658a2ad4ab390d8394b842224 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Fri, 30 Nov 2012 17:53:04 -0600 Subject: rbd: kill rbd_req_coll and rbd_request The two remaining callers of rbd_do_request() always pass a null collection pointer, so the "coll" and "coll_index" parameters are not needed. There is no other use of that data structure, so it can be eliminated. Deleting them means there is no need to allocate a rbd_request structure for the callback function. And since that's the only use of *that* structure, it too can be eliminated. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 58 +++-------------------------------------------------- 1 file changed, 3 insertions(+), 55 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 4c175a7d3f7e..c1bb649b4ad1 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -162,25 +162,6 @@ struct rbd_client { struct list_head node; }; -/* - * a request completion status - */ -struct rbd_req_status { - int done; - s32 rc; - u64 bytes; -}; - -/* - * a collection of requests - */ -struct rbd_req_coll { - int total; - int num_done; - struct kref kref; - struct rbd_req_status status[0]; -}; - struct rbd_img_request; typedef void (*rbd_img_callback_t)(struct rbd_img_request *); @@ -242,18 +223,6 @@ struct rbd_img_request { #define for_each_obj_request_safe(ireq, oreq, n) \ list_for_each_entry_safe_reverse(oreq, n, &ireq->obj_requests, links) -/* - * a single io request - */ -struct rbd_request { - struct request *rq; /* blk layer request */ - struct bio *bio; /* cloned bio */ - struct page **pages; /* list of used pages */ - u64 len; - int coll_index; - struct rbd_req_coll *coll; -}; - struct rbd_snap { struct device dev; const char *name; @@ -1195,21 +1164,18 @@ static int rbd_do_request(struct request *rq, int num_pages, int flags, struct ceph_osd_req_op *op, - struct rbd_req_coll *coll, - int coll_index, void (*rbd_cb)(struct ceph_osd_request *, struct ceph_msg *), u64 *ver) { struct ceph_osd_client *osdc; struct ceph_osd_request *osd_req; - struct rbd_request *rbd_req = NULL; struct timespec mtime = CURRENT_TIME; int ret; - dout("rbd_do_request object_name=%s ofs=%llu len=%llu coll=%p[%d]\n", + dout("rbd_do_request object_name=%s ofs=%llu len=%llu\n", object_name, (unsigned long long) ofs, - (unsigned long long) len, coll, coll_index); + (unsigned long long) len); osdc = &rbd_dev->rbd_client->client->osdc; osd_req = ceph_osdc_alloc_request(osdc, snapc, 1, false, GFP_NOIO); @@ -1223,22 +1189,8 @@ static int rbd_do_request(struct request *rq, bio_get(osd_req->r_bio); } - if (coll) { - ret = -ENOMEM; - rbd_req = kmalloc(sizeof(*rbd_req), GFP_NOIO); - if (!rbd_req) - goto done_osd_req; - - rbd_req->rq = rq; - rbd_req->bio = bio; - rbd_req->pages = pages; - rbd_req->len = len; - rbd_req->coll = coll; - rbd_req->coll_index = coll_index; - } - osd_req->r_callback = rbd_cb; - osd_req->r_priv = rbd_req; + osd_req->r_priv = NULL; strncpy(osd_req->r_oid, object_name, sizeof(osd_req->r_oid)); osd_req->r_oid_len = strlen(osd_req->r_oid); @@ -1274,8 +1226,6 @@ static int rbd_do_request(struct request *rq, done_err: if (bio) bio_chain_put(osd_req->r_bio); - kfree(rbd_req); -done_osd_req: ceph_osdc_put_request(osd_req); return ret; @@ -1314,7 +1264,6 @@ static int rbd_req_sync_op(struct rbd_device *rbd_dev, pages, num_pages, flags, op, - NULL, 0, NULL, ver); if (ret < 0) @@ -1390,7 +1339,6 @@ static int rbd_req_sync_notify_ack(struct rbd_device *rbd_dev, NULL, 0, CEPH_OSD_FLAG_READ, op, - NULL, 0, rbd_simple_req_cb, NULL); rbd_osd_req_op_destroy(op); -- cgit v1.2.3 From 788e2df3b92e30f1fff74139bb53e68ec13fe2a5 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Thu, 17 Jan 2013 12:25:27 -0600 Subject: rbd: implement sync object read with new code Reimplement the synchronous read operation used for reading a version 1 header using the new request tracking code. Name the resulting function rbd_obj_read_sync() to better reflect that it's a full object operation, not an object request. To do this, implement a new OBJ_REQUEST_PAGES object request type. This implements a new mechanism to allow the caller to wait for completion for an rbd_obj_request by calling rbd_obj_request_wait(). This partially resolves: http://tracker.newdream.net/issues/3755 Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 96 ++++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 92 insertions(+), 4 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index c1bb649b4ad1..3f5eaea444a0 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -170,7 +170,7 @@ typedef void (*rbd_img_callback_t)(struct rbd_img_request *); struct rbd_obj_request; typedef void (*rbd_obj_callback_t)(struct rbd_obj_request *); -enum obj_request_type { OBJ_REQUEST_BIO }; /* More types to come */ +enum obj_request_type { OBJ_REQUEST_BIO, OBJ_REQUEST_PAGES }; struct rbd_obj_request { const char *object_name; @@ -182,7 +182,13 @@ struct rbd_obj_request { u32 which; /* posn image request list */ enum obj_request_type type; - struct bio *bio_list; + union { + struct bio *bio_list; + struct { + struct page **pages; + u32 page_count; + }; + }; struct ceph_osd_request *osd_req; @@ -192,6 +198,7 @@ struct rbd_obj_request { atomic_t done; rbd_obj_callback_t callback; + struct completion completion; struct kref kref; }; @@ -1077,6 +1084,7 @@ static bool obj_request_type_valid(enum obj_request_type type) { switch (type) { case OBJ_REQUEST_BIO: + case OBJ_REQUEST_PAGES: return true; default: return false; @@ -1291,14 +1299,23 @@ static void rbd_img_request_complete(struct rbd_img_request *img_request) rbd_img_request_put(img_request); } +/* Caller is responsible for rbd_obj_request_destroy(obj_request) */ + +static int rbd_obj_request_wait(struct rbd_obj_request *obj_request) +{ + return wait_for_completion_interruptible(&obj_request->completion); +} + static void rbd_obj_request_complete(struct rbd_obj_request *obj_request) { if (obj_request->callback) obj_request->callback(obj_request); + else + complete_all(&obj_request->completion); } /* - * Request sync osd read + * Synchronously read a range from an object into a provided buffer */ static int rbd_req_sync_read(struct rbd_device *rbd_dev, const char *object_name, @@ -1556,6 +1573,11 @@ static struct ceph_osd_request *rbd_osd_req_create( /* osd client requires "num pages" even for bio */ osd_req->r_num_pages = calc_pages_for(offset, length); break; + case OBJ_REQUEST_PAGES: + osd_req->r_pages = obj_request->pages; + osd_req->r_num_pages = obj_request->page_count; + osd_req->r_page_alignment = offset & ~PAGE_MASK; + break; } if (write_request) { @@ -1616,6 +1638,7 @@ static struct rbd_obj_request *rbd_obj_request_create(const char *object_name, obj_request->type = type; INIT_LIST_HEAD(&obj_request->links); atomic_set(&obj_request->done, 0); + init_completion(&obj_request->completion); kref_init(&obj_request->kref); return obj_request; @@ -1639,6 +1662,11 @@ static void rbd_obj_request_destroy(struct kref *kref) if (obj_request->bio_list) bio_chain_put(obj_request->bio_list); break; + case OBJ_REQUEST_PAGES: + if (obj_request->pages) + ceph_release_page_vector(obj_request->pages, + obj_request->page_count); + break; } kfree(obj_request); @@ -1987,6 +2015,65 @@ static void rbd_free_disk(struct rbd_device *rbd_dev) put_disk(disk); } +static int rbd_obj_read_sync(struct rbd_device *rbd_dev, + const char *object_name, + u64 offset, u64 length, + char *buf, u64 *version) + +{ + struct ceph_osd_req_op *op; + struct rbd_obj_request *obj_request; + struct ceph_osd_client *osdc; + struct page **pages = NULL; + u32 page_count; + int ret; + + page_count = (u32) calc_pages_for(offset, length); + pages = ceph_alloc_page_vector(page_count, GFP_KERNEL); + if (IS_ERR(pages)) + ret = PTR_ERR(pages); + + ret = -ENOMEM; + obj_request = rbd_obj_request_create(object_name, offset, length, + OBJ_REQUEST_PAGES); + if (!obj_request) + goto out; + + obj_request->pages = pages; + obj_request->page_count = page_count; + + op = rbd_osd_req_op_create(CEPH_OSD_OP_READ, offset, length); + if (!op) + goto out; + obj_request->osd_req = rbd_osd_req_create(rbd_dev, false, + obj_request, op); + rbd_osd_req_op_destroy(op); + if (!obj_request->osd_req) + goto out; + + osdc = &rbd_dev->rbd_client->client->osdc; + ret = rbd_obj_request_submit(osdc, obj_request); + if (ret) + goto out; + ret = rbd_obj_request_wait(obj_request); + if (ret) + goto out; + + ret = obj_request->result; + if (ret < 0) + goto out; + ret = ceph_copy_from_page_vector(pages, buf, 0, obj_request->xferred); + if (version) + *version = obj_request->version; +out: + if (obj_request) + rbd_obj_request_put(obj_request); + else + ceph_release_page_vector(pages, page_count); + + return ret; +} + /* * Read the complete header for the given rbd device. * @@ -2025,7 +2112,8 @@ rbd_dev_v1_header_read(struct rbd_device *rbd_dev, u64 *version) if (!ondisk) return ERR_PTR(-ENOMEM); - ret = rbd_req_sync_read(rbd_dev, rbd_dev->header_name, + (void) rbd_req_sync_read; /* avoid a warning */ + ret = rbd_obj_read_sync(rbd_dev, rbd_dev->header_name, 0, size, (char *) ondisk, version); -- cgit v1.2.3 From 86ea43bfcbeae61858b0fee4971e5b1e894d7174 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Sun, 20 Jan 2013 14:44:42 -0600 Subject: rbd: get rid of rbd_req_sync_read() Delete rbd_req_sync_read() is no longer used, so get rid of it. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 24 ------------------------ 1 file changed, 24 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 3f5eaea444a0..5e7110a50513 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -1314,29 +1314,6 @@ static void rbd_obj_request_complete(struct rbd_obj_request *obj_request) complete_all(&obj_request->completion); } -/* - * Synchronously read a range from an object into a provided buffer - */ -static int rbd_req_sync_read(struct rbd_device *rbd_dev, - const char *object_name, - u64 ofs, u64 len, - char *buf, - u64 *ver) -{ - struct ceph_osd_req_op *op; - int ret; - - op = rbd_osd_req_op_create(CEPH_OSD_OP_READ, ofs, len); - if (!op) - return -ENOMEM; - - ret = rbd_req_sync_op(rbd_dev, CEPH_OSD_FLAG_READ, - op, object_name, ofs, len, buf, ver); - rbd_osd_req_op_destroy(op); - - return ret; -} - /* * Request sync osd watch */ @@ -2112,7 +2089,6 @@ rbd_dev_v1_header_read(struct rbd_device *rbd_dev, u64 *version) if (!ondisk) return ERR_PTR(-ENOMEM); - (void) rbd_req_sync_read; /* avoid a warning */ ret = rbd_obj_read_sync(rbd_dev, rbd_dev->header_name, 0, size, (char *) ondisk, version); -- cgit v1.2.3 From 9969ebc5af93028c21f2614621737f0d6ff6fc06 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Fri, 18 Jan 2013 12:31:10 -0600 Subject: rbd: implement watch/unwatch with new code Implement a new function to set up or tear down a watch event for an mapped rbd image header using the new request code. Create a new object request type "nodata" to handle this. And define rbd_osd_trivial_callback() which simply marks a request done. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 89 +++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 86 insertions(+), 3 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 5e7110a50513..60b68512fa93 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -170,7 +170,9 @@ typedef void (*rbd_img_callback_t)(struct rbd_img_request *); struct rbd_obj_request; typedef void (*rbd_obj_callback_t)(struct rbd_obj_request *); -enum obj_request_type { OBJ_REQUEST_BIO, OBJ_REQUEST_PAGES }; +enum obj_request_type { + OBJ_REQUEST_NODATA, OBJ_REQUEST_BIO, OBJ_REQUEST_PAGES +}; struct rbd_obj_request { const char *object_name; @@ -1083,6 +1085,7 @@ static inline void rbd_img_obj_request_del(struct rbd_img_request *img_request, static bool obj_request_type_valid(enum obj_request_type type) { switch (type) { + case OBJ_REQUEST_NODATA: case OBJ_REQUEST_BIO: case OBJ_REQUEST_PAGES: return true; @@ -1306,6 +1309,12 @@ static int rbd_obj_request_wait(struct rbd_obj_request *obj_request) return wait_for_completion_interruptible(&obj_request->completion); } +static void rbd_osd_trivial_callback(struct rbd_obj_request *obj_request, + struct ceph_osd_op *op) +{ + atomic_set(&obj_request->done, 1); +} + static void rbd_obj_request_complete(struct rbd_obj_request *obj_request) { if (obj_request->callback) @@ -1500,6 +1509,9 @@ static void rbd_osd_req_callback(struct ceph_osd_request *osd_req, case CEPH_OSD_OP_WRITE: rbd_osd_write_callback(obj_request, op); break; + case CEPH_OSD_OP_WATCH: + rbd_osd_trivial_callback(obj_request, op); + break; default: rbd_warn(NULL, "%s: unsupported op %hu\n", obj_request->object_name, (unsigned short) opcode); @@ -1543,6 +1555,8 @@ static struct ceph_osd_request *rbd_osd_req_create( rbd_assert(obj_request_type_valid(obj_request->type)); switch (obj_request->type) { + case OBJ_REQUEST_NODATA: + break; /* Nothing to do */ case OBJ_REQUEST_BIO: rbd_assert(obj_request->bio_list != NULL); osd_req->r_bio = obj_request->bio_list; @@ -1635,6 +1649,8 @@ static void rbd_obj_request_destroy(struct kref *kref) rbd_assert(obj_request_type_valid(obj_request->type)); switch (obj_request->type) { + case OBJ_REQUEST_NODATA: + break; /* Nothing to do */ case OBJ_REQUEST_BIO: if (obj_request->bio_list) bio_chain_put(obj_request->bio_list); @@ -1862,6 +1878,72 @@ static int rbd_img_request_submit(struct rbd_img_request *img_request) return 0; } +/* + * Request sync osd watch/unwatch. The value of "start" determines + * whether a watch request is being initiated or torn down. + */ +static int rbd_dev_header_watch_sync(struct rbd_device *rbd_dev, int start) +{ + struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; + struct rbd_obj_request *obj_request; + struct ceph_osd_req_op *op; + int ret; + + rbd_assert(start ^ !!rbd_dev->watch_event); + rbd_assert(start ^ !!rbd_dev->watch_request); + + if (start) { + ret = ceph_osdc_create_event(osdc, rbd_watch_cb, 0, rbd_dev, + &rbd_dev->watch_event); + if (ret < 0) + return ret; + } + + ret = -ENOMEM; + obj_request = rbd_obj_request_create(rbd_dev->header_name, 0, 0, + OBJ_REQUEST_NODATA); + if (!obj_request) + goto out_cancel; + + op = rbd_osd_req_op_create(CEPH_OSD_OP_WATCH, + rbd_dev->watch_event->cookie, + rbd_dev->header.obj_version, start); + if (!op) + goto out_cancel; + obj_request->osd_req = rbd_osd_req_create(rbd_dev, true, + obj_request, op); + rbd_osd_req_op_destroy(op); + if (!obj_request->osd_req) + goto out_cancel; + + if (start) { + rbd_dev->watch_request = obj_request->osd_req; + ceph_osdc_set_request_linger(osdc, rbd_dev->watch_request); + } + ret = rbd_obj_request_submit(osdc, obj_request); + if (ret) + goto out_cancel; + ret = rbd_obj_request_wait(obj_request); + if (ret) + goto out_cancel; + + ret = obj_request->result; + if (ret) + goto out_cancel; + + if (start) + goto done; /* Done if setting up the watch request */ +out_cancel: + /* Cancel the event if we're tearing down, or on error */ + ceph_osdc_cancel_event(rbd_dev->watch_event); + rbd_dev->watch_event = NULL; +done: + if (obj_request) + rbd_obj_request_put(obj_request); + + return ret; +} + static void rbd_request_fn(struct request_queue *q) { struct rbd_device *rbd_dev = q->queuedata; @@ -3879,7 +3961,8 @@ static int rbd_dev_probe_finish(struct rbd_device *rbd_dev) if (ret) goto err_out_bus; - ret = rbd_req_sync_watch(rbd_dev, 1); + (void) rbd_req_sync_watch; /* avoid a warning */ + ret = rbd_dev_header_watch_sync(rbd_dev, 1); if (ret) goto err_out_bus; @@ -4042,7 +4125,7 @@ static void rbd_dev_release(struct device *dev) rbd_dev->watch_request); } if (rbd_dev->watch_event) - rbd_req_sync_watch(rbd_dev, 0); + rbd_dev_header_watch_sync(rbd_dev, 0); /* clean up and free blkdev */ rbd_free_disk(rbd_dev); -- cgit v1.2.3 From ecf7a0318b1b026fb147623c36324fa8c73447a9 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Sun, 20 Jan 2013 14:44:42 -0600 Subject: rbd: get rid of rbd_req_sync_watch() Get rid of rbd_req_sync_watch(), because it is no longer used. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 42 ------------------------------------------ 1 file changed, 42 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 60b68512fa93..a6a85271380a 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -1369,47 +1369,6 @@ static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data) rbd_req_sync_notify_ack(rbd_dev, hver, notify_id); } -/* - * Request sync osd watch/unwatch. The value of "start" determines - * whether a watch request is being initiated or torn down. - */ -static int rbd_req_sync_watch(struct rbd_device *rbd_dev, int start) -{ - struct ceph_osd_req_op *op; - int ret = 0; - - rbd_assert(start ^ !!rbd_dev->watch_event); - rbd_assert(start ^ !!rbd_dev->watch_request); - - if (start) { - struct ceph_osd_client *osdc; - - osdc = &rbd_dev->rbd_client->client->osdc; - ret = ceph_osdc_create_event(osdc, rbd_watch_cb, 0, rbd_dev, - &rbd_dev->watch_event); - if (ret < 0) - return ret; - } - - op = rbd_osd_req_op_create(CEPH_OSD_OP_WATCH, - rbd_dev->watch_event->cookie, - rbd_dev->header.obj_version, start); - if (op) - ret = rbd_req_sync_op(rbd_dev, - CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK, - op, rbd_dev->header_name, - 0, 0, NULL, NULL); - - /* Cancel the event if we're tearing down, or on error */ - - if (!start || !op || ret < 0) { - ceph_osdc_cancel_event(rbd_dev->watch_event); - rbd_dev->watch_event = NULL; - } - rbd_osd_req_op_destroy(op); - - return ret; -} /* * Synchronous osd object method call @@ -3961,7 +3920,6 @@ static int rbd_dev_probe_finish(struct rbd_device *rbd_dev) if (ret) goto err_out_bus; - (void) rbd_req_sync_watch; /* avoid a warning */ ret = rbd_dev_header_watch_sync(rbd_dev, 1); if (ret) goto err_out_bus; -- cgit v1.2.3 From b8d70035b35dc12135d5835b659b229bcd6d4f94 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Fri, 30 Nov 2012 17:53:04 -0600 Subject: rbd: use new code for notify ack Use the new object request tracking mechanism for handling a notify_ack request. Move the callback function below the definition of this so we don't have to do a pre-declaration. This resolves: http://tracker.newdream.net/issues/3754 Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 76 ++++++++++++++++++++++++++++++++++++++--------------- 1 file changed, 55 insertions(+), 21 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index a6a85271380a..1428795571c9 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -1349,27 +1349,6 @@ static int rbd_req_sync_notify_ack(struct rbd_device *rbd_dev, return ret; } -static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data) -{ - struct rbd_device *rbd_dev = (struct rbd_device *)data; - u64 hver; - int rc; - - if (!rbd_dev) - return; - - dout("rbd_watch_cb %s notify_id=%llu opcode=%u\n", - rbd_dev->header_name, (unsigned long long) notify_id, - (unsigned int) opcode); - rc = rbd_dev_refresh(rbd_dev, &hver); - if (rc) - rbd_warn(rbd_dev, "got notification but failed to " - " update snaps: %d\n", rc); - - rbd_req_sync_notify_ack(rbd_dev, hver, notify_id); -} - - /* * Synchronous osd object method call */ @@ -1468,6 +1447,7 @@ static void rbd_osd_req_callback(struct ceph_osd_request *osd_req, case CEPH_OSD_OP_WRITE: rbd_osd_write_callback(obj_request, op); break; + case CEPH_OSD_OP_NOTIFY_ACK: case CEPH_OSD_OP_WATCH: rbd_osd_trivial_callback(obj_request, op); break; @@ -1837,6 +1817,60 @@ static int rbd_img_request_submit(struct rbd_img_request *img_request) return 0; } +static int rbd_obj_notify_ack_sync(struct rbd_device *rbd_dev, + u64 ver, u64 notify_id) +{ + struct rbd_obj_request *obj_request; + struct ceph_osd_req_op *op; + struct ceph_osd_client *osdc; + int ret; + + obj_request = rbd_obj_request_create(rbd_dev->header_name, 0, 0, + OBJ_REQUEST_NODATA); + if (!obj_request) + return -ENOMEM; + + ret = -ENOMEM; + op = rbd_osd_req_op_create(CEPH_OSD_OP_NOTIFY_ACK, notify_id, ver); + if (!op) + goto out; + obj_request->osd_req = rbd_osd_req_create(rbd_dev, false, + obj_request, op); + rbd_osd_req_op_destroy(op); + if (!obj_request->osd_req) + goto out; + + osdc = &rbd_dev->rbd_client->client->osdc; + ret = rbd_obj_request_submit(osdc, obj_request); + if (!ret) + ret = rbd_obj_request_wait(obj_request); +out: + rbd_obj_request_put(obj_request); + + return ret; +} + +static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data) +{ + struct rbd_device *rbd_dev = (struct rbd_device *)data; + u64 hver; + int rc; + + if (!rbd_dev) + return; + + dout("rbd_watch_cb %s notify_id=%llu opcode=%u\n", + rbd_dev->header_name, (unsigned long long) notify_id, + (unsigned int) opcode); + rc = rbd_dev_refresh(rbd_dev, &hver); + if (rc) + rbd_warn(rbd_dev, "got notification but failed to " + " update snaps: %d\n", rc); + + (void) rbd_req_sync_notify_ack; /* avoid a warning */ + rbd_obj_notify_ack_sync(rbd_dev, hver, notify_id); +} + /* * Request sync osd watch/unwatch. The value of "start" determines * whether a watch request is being initiated or torn down. -- cgit v1.2.3 From 5ae9db81b45c2d95554c665043afffd5e9a7d5ac Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Sun, 20 Jan 2013 14:44:42 -0600 Subject: rbd: get rid of rbd_req_sync_notify_ack() Get rid rbd_req_sync_notify_ack() because it is no longer used. As a result rbd_simple_req_cb() becomes unreferenced, so get rid of that too. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 33 --------------------------------- 1 file changed, 33 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 1428795571c9..7a6694d08874 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -1242,12 +1242,6 @@ done_err: return ret; } -static void rbd_simple_req_cb(struct ceph_osd_request *osd_req, - struct ceph_msg *msg) -{ - ceph_osdc_put_request(osd_req); -} - /* * Do a synchronous ceph osd operation */ @@ -1323,32 +1317,6 @@ static void rbd_obj_request_complete(struct rbd_obj_request *obj_request) complete_all(&obj_request->completion); } -/* - * Request sync osd watch - */ -static int rbd_req_sync_notify_ack(struct rbd_device *rbd_dev, - u64 ver, - u64 notify_id) -{ - struct ceph_osd_req_op *op; - int ret; - - op = rbd_osd_req_op_create(CEPH_OSD_OP_NOTIFY_ACK, notify_id, ver); - if (!op) - return -ENOMEM; - - ret = rbd_do_request(NULL, rbd_dev, NULL, CEPH_NOSNAP, - rbd_dev->header_name, 0, 0, NULL, - NULL, 0, - CEPH_OSD_FLAG_READ, - op, - rbd_simple_req_cb, NULL); - - rbd_osd_req_op_destroy(op); - - return ret; -} - /* * Synchronous osd object method call */ @@ -1867,7 +1835,6 @@ static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data) rbd_warn(rbd_dev, "got notification but failed to " " update snaps: %d\n", rc); - (void) rbd_req_sync_notify_ack; /* avoid a warning */ rbd_obj_notify_ack_sync(rbd_dev, hver, notify_id); } -- cgit v1.2.3 From cf81b60e4bbd4a1281fe2640f9c0c40fe3a85fdf Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Thu, 17 Jan 2013 12:18:46 -0600 Subject: rbd: send notify ack asynchronously When we receive notification of a change to an rbd image's header object we need to refresh our information about the image (its size and snapshot context). Once we have refreshed our rbd image we need to acknowledge the notification. This acknowledgement was previously done synchronously, but there's really no need to wait for it to complete. Change it so the caller doesn't wait for the notify acknowledgement request to complete. And change the name to reflect it's no longer synchronous. This resolves: http://tracker.newdream.net/issues/3877 Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 7a6694d08874..76917cc3e5a1 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -1785,7 +1785,7 @@ static int rbd_img_request_submit(struct rbd_img_request *img_request) return 0; } -static int rbd_obj_notify_ack_sync(struct rbd_device *rbd_dev, +static int rbd_obj_notify_ack(struct rbd_device *rbd_dev, u64 ver, u64 notify_id) { struct rbd_obj_request *obj_request; @@ -1809,11 +1809,11 @@ static int rbd_obj_notify_ack_sync(struct rbd_device *rbd_dev, goto out; osdc = &rbd_dev->rbd_client->client->osdc; + obj_request->callback = rbd_obj_request_put; ret = rbd_obj_request_submit(osdc, obj_request); - if (!ret) - ret = rbd_obj_request_wait(obj_request); out: - rbd_obj_request_put(obj_request); + if (ret) + rbd_obj_request_put(obj_request); return ret; } @@ -1835,7 +1835,7 @@ static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data) rbd_warn(rbd_dev, "got notification but failed to " " update snaps: %d\n", rc); - rbd_obj_notify_ack_sync(rbd_dev, hver, notify_id); + rbd_obj_notify_ack(rbd_dev, hver, notify_id); } /* -- cgit v1.2.3 From 36be9a761844e186f629f463b665945df4f67766 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Sat, 19 Jan 2013 00:30:28 -0600 Subject: rbd: implement sync method with new code Reimplement synchronous object method calls using the new request tracking code. Use the name rbd_obj_method_sync() Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 114 +++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 96 insertions(+), 18 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 76917cc3e5a1..d10649f2e346 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -1415,6 +1415,7 @@ static void rbd_osd_req_callback(struct ceph_osd_request *osd_req, case CEPH_OSD_OP_WRITE: rbd_osd_write_callback(obj_request, op); break; + case CEPH_OSD_OP_CALL: case CEPH_OSD_OP_NOTIFY_ACK: case CEPH_OSD_OP_WATCH: rbd_osd_trivial_callback(obj_request, op); @@ -1904,6 +1905,82 @@ done: return ret; } +/* + * Synchronous osd object method call + */ +static int rbd_obj_method_sync(struct rbd_device *rbd_dev, + const char *object_name, + const char *class_name, + const char *method_name, + const char *outbound, + size_t outbound_size, + char *inbound, + size_t inbound_size, + u64 *version) +{ + struct rbd_obj_request *obj_request; + struct ceph_osd_client *osdc; + struct ceph_osd_req_op *op; + struct page **pages; + u32 page_count; + int ret; + + /* + * Method calls are ultimately read operations but they + * don't involve object data (so no offset or length). + * The result should placed into the inbound buffer + * provided. They also supply outbound data--parameters for + * the object method. Currently if this is present it will + * be a snapshot id. + */ + page_count = (u32) calc_pages_for(0, inbound_size); + pages = ceph_alloc_page_vector(page_count, GFP_KERNEL); + if (IS_ERR(pages)) + return PTR_ERR(pages); + + ret = -ENOMEM; + obj_request = rbd_obj_request_create(object_name, 0, 0, + OBJ_REQUEST_PAGES); + if (!obj_request) + goto out; + + obj_request->pages = pages; + obj_request->page_count = page_count; + + op = rbd_osd_req_op_create(CEPH_OSD_OP_CALL, class_name, + method_name, outbound, outbound_size); + if (!op) + goto out; + obj_request->osd_req = rbd_osd_req_create(rbd_dev, false, + obj_request, op); + rbd_osd_req_op_destroy(op); + if (!obj_request->osd_req) + goto out; + + osdc = &rbd_dev->rbd_client->client->osdc; + ret = rbd_obj_request_submit(osdc, obj_request); + if (ret) + goto out; + ret = rbd_obj_request_wait(obj_request); + if (ret) + goto out; + + ret = obj_request->result; + if (ret < 0) + goto out; + ret = ceph_copy_from_page_vector(pages, inbound, 0, + obj_request->xferred); + if (version) + *version = obj_request->version; +out: + if (obj_request) + rbd_obj_request_put(obj_request); + else + ceph_release_page_vector(pages, page_count); + + return ret; +} + static void rbd_request_fn(struct request_queue *q) { struct rbd_device *rbd_dev = q->queuedata; @@ -2054,7 +2131,7 @@ static int rbd_obj_read_sync(struct rbd_device *rbd_dev, ret = -ENOMEM; obj_request = rbd_obj_request_create(object_name, offset, length, - OBJ_REQUEST_PAGES); + OBJ_REQUEST_PAGES); if (!obj_request) goto out; @@ -2754,11 +2831,12 @@ static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id, __le64 size; } __attribute__ ((packed)) size_buf = { 0 }; - ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name, + (void) rbd_req_sync_exec; /* Avoid a warning */ + ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name, "rbd", "get_size", (char *) &snapid, sizeof (snapid), (char *) &size_buf, sizeof (size_buf), NULL); - dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret); + dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); if (ret < 0) return ret; @@ -2789,14 +2867,14 @@ static int rbd_dev_v2_object_prefix(struct rbd_device *rbd_dev) if (!reply_buf) return -ENOMEM; - ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name, + ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name, "rbd", "get_object_prefix", NULL, 0, reply_buf, RBD_OBJ_PREFIX_LEN_MAX, NULL); - dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret); + dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); if (ret < 0) goto out; - ret = 0; /* rbd_req_sync_exec() can return positive */ + ret = 0; /* rbd_obj_method_sync() can return positive */ p = reply_buf; rbd_dev->header.object_prefix = ceph_extract_encoded_string(&p, @@ -2827,12 +2905,12 @@ static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id, u64 incompat; int ret; - ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name, + ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name, "rbd", "get_features", (char *) &snapid, sizeof (snapid), (char *) &features_buf, sizeof (features_buf), NULL); - dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret); + dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); if (ret < 0) return ret; @@ -2883,11 +2961,11 @@ static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev) } snapid = cpu_to_le64(CEPH_NOSNAP); - ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name, + ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name, "rbd", "get_parent", (char *) &snapid, sizeof (snapid), (char *) reply_buf, size, NULL); - dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret); + dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); if (ret < 0) goto out_err; @@ -2954,7 +3032,7 @@ static char *rbd_dev_image_name(struct rbd_device *rbd_dev) if (!reply_buf) goto out; - ret = rbd_req_sync_exec(rbd_dev, RBD_DIRECTORY, + ret = rbd_obj_method_sync(rbd_dev, RBD_DIRECTORY, "rbd", "dir_get_name", image_id, image_id_size, (char *) reply_buf, size, NULL); @@ -3060,11 +3138,11 @@ static int rbd_dev_v2_snap_context(struct rbd_device *rbd_dev, u64 *ver) if (!reply_buf) return -ENOMEM; - ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name, + ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name, "rbd", "get_snapcontext", NULL, 0, reply_buf, size, ver); - dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret); + dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); if (ret < 0) goto out; @@ -3129,11 +3207,11 @@ static char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev, u32 which) return ERR_PTR(-ENOMEM); snap_id = cpu_to_le64(rbd_dev->header.snapc->snaps[which]); - ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name, + ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name, "rbd", "get_snapshot_name", (char *) &snap_id, sizeof (snap_id), reply_buf, size, NULL); - dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret); + dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); if (ret < 0) goto out; @@ -3721,14 +3799,14 @@ static int rbd_dev_image_id(struct rbd_device *rbd_dev) goto out; } - ret = rbd_req_sync_exec(rbd_dev, object_name, + ret = rbd_obj_method_sync(rbd_dev, object_name, "rbd", "get_id", NULL, 0, response, RBD_IMAGE_ID_LEN_MAX, NULL); - dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret); + dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); if (ret < 0) goto out; - ret = 0; /* rbd_req_sync_exec() can return positive */ + ret = 0; /* rbd_obj_method_sync() can return positive */ p = response; rbd_dev->spec->image_id = ceph_extract_encoded_string(&p, -- cgit v1.2.3 From 9f20e02a53b944a54a35b9f0db1243cd64872f7d Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Sun, 20 Jan 2013 14:44:42 -0600 Subject: rbd: get rid of rbd_req_sync_exec() Get rid rbd_req_sync_exec() because it is no longer used. That eliminates the last use of rbd_req_sync_op(), so get rid of that too. And finally, that leaves rbd_do_request() unreferenced, so get rid of that. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 160 ---------------------------------------------------- 1 file changed, 160 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index d10649f2e346..4ea89917bb92 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -1162,126 +1162,6 @@ static void rbd_osd_req_op_destroy(struct ceph_osd_req_op *op) kfree(op); } -/* - * Send ceph osd request - */ -static int rbd_do_request(struct request *rq, - struct rbd_device *rbd_dev, - struct ceph_snap_context *snapc, - u64 snapid, - const char *object_name, u64 ofs, u64 len, - struct bio *bio, - struct page **pages, - int num_pages, - int flags, - struct ceph_osd_req_op *op, - void (*rbd_cb)(struct ceph_osd_request *, - struct ceph_msg *), - u64 *ver) -{ - struct ceph_osd_client *osdc; - struct ceph_osd_request *osd_req; - struct timespec mtime = CURRENT_TIME; - int ret; - - dout("rbd_do_request object_name=%s ofs=%llu len=%llu\n", - object_name, (unsigned long long) ofs, - (unsigned long long) len); - - osdc = &rbd_dev->rbd_client->client->osdc; - osd_req = ceph_osdc_alloc_request(osdc, snapc, 1, false, GFP_NOIO); - if (!osd_req) - return -ENOMEM; - - osd_req->r_flags = flags; - osd_req->r_pages = pages; - if (bio) { - osd_req->r_bio = bio; - bio_get(osd_req->r_bio); - } - - osd_req->r_callback = rbd_cb; - osd_req->r_priv = NULL; - - strncpy(osd_req->r_oid, object_name, sizeof(osd_req->r_oid)); - osd_req->r_oid_len = strlen(osd_req->r_oid); - - osd_req->r_file_layout = rbd_dev->layout; /* struct */ - osd_req->r_num_pages = calc_pages_for(ofs, len); - osd_req->r_page_alignment = ofs & ~PAGE_MASK; - - ceph_osdc_build_request(osd_req, ofs, len, 1, op, - snapc, snapid, &mtime); - - if (op->op == CEPH_OSD_OP_WATCH && op->watch.flag) { - ceph_osdc_set_request_linger(osdc, osd_req); - rbd_dev->watch_request = osd_req; - } - - ret = ceph_osdc_start_request(osdc, osd_req, false); - if (ret < 0) - goto done_err; - - if (!rbd_cb) { - u64 version; - - ret = ceph_osdc_wait_request(osdc, osd_req); - version = le64_to_cpu(osd_req->r_reassert_version.version); - if (ver) - *ver = version; - dout("reassert_ver=%llu\n", (unsigned long long) version); - ceph_osdc_put_request(osd_req); - } - return ret; - -done_err: - if (bio) - bio_chain_put(osd_req->r_bio); - ceph_osdc_put_request(osd_req); - - return ret; -} - -/* - * Do a synchronous ceph osd operation - */ -static int rbd_req_sync_op(struct rbd_device *rbd_dev, - int flags, - struct ceph_osd_req_op *op, - const char *object_name, - u64 ofs, u64 inbound_size, - char *inbound, - u64 *ver) -{ - int ret; - struct page **pages; - int num_pages; - - rbd_assert(op != NULL); - - num_pages = calc_pages_for(ofs, inbound_size); - pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL); - if (IS_ERR(pages)) - return PTR_ERR(pages); - - ret = rbd_do_request(NULL, rbd_dev, NULL, CEPH_NOSNAP, - object_name, ofs, inbound_size, NULL, - pages, num_pages, - flags, - op, - NULL, - ver); - if (ret < 0) - goto done; - - if ((flags & CEPH_OSD_FLAG_READ) && inbound) - ret = ceph_copy_from_page_vector(pages, inbound, ofs, ret); - -done: - ceph_release_page_vector(pages, num_pages); - return ret; -} - static int rbd_obj_request_submit(struct ceph_osd_client *osdc, struct rbd_obj_request *obj_request) { @@ -1317,45 +1197,6 @@ static void rbd_obj_request_complete(struct rbd_obj_request *obj_request) complete_all(&obj_request->completion); } -/* - * Synchronous osd object method call - */ -static int rbd_req_sync_exec(struct rbd_device *rbd_dev, - const char *object_name, - const char *class_name, - const char *method_name, - const char *outbound, - size_t outbound_size, - char *inbound, - size_t inbound_size, - u64 *ver) -{ - struct ceph_osd_req_op *op; - int ret; - - /* - * Any input parameters required by the method we're calling - * will be sent along with the class and method names as - * part of the message payload. That data and its size are - * supplied via the indata and indata_len fields (named from - * the perspective of the server side) in the OSD request - * operation. - */ - op = rbd_osd_req_op_create(CEPH_OSD_OP_CALL, class_name, - method_name, outbound, outbound_size); - if (!op) - return -ENOMEM; - - ret = rbd_req_sync_op(rbd_dev, CEPH_OSD_FLAG_READ, op, - object_name, 0, inbound_size, inbound, - ver); - - rbd_osd_req_op_destroy(op); - - dout("cls_exec returned %d\n", ret); - return ret; -} - static void rbd_osd_read_callback(struct rbd_obj_request *obj_request, struct ceph_osd_op *op) { @@ -2831,7 +2672,6 @@ static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id, __le64 size; } __attribute__ ((packed)) size_buf = { 0 }; - (void) rbd_req_sync_exec; /* Avoid a warning */ ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name, "rbd", "get_size", (char *) &snapid, sizeof (snapid), -- cgit v1.2.3 From 6977c3f983b0d2b481a65b1fa3e85683fd1318af Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Fri, 25 Jan 2013 17:08:55 -0600 Subject: rbd: unregister linger in watch sync routine Move the code that unregisters an rbd device's lingering header object watch request into rbd_dev_header_watch_sync(), so it occurs in the same function that originally sets up that request. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 4ea89917bb92..5593def33ce5 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -1721,6 +1721,10 @@ static int rbd_dev_header_watch_sync(struct rbd_device *rbd_dev, int start) if (start) { rbd_dev->watch_request = obj_request->osd_req; ceph_osdc_set_request_linger(osdc, rbd_dev->watch_request); + } else { + ceph_osdc_unregister_linger_request(osdc, + rbd_dev->watch_request); + rbd_dev->watch_request = NULL; } ret = rbd_obj_request_submit(osdc, obj_request); if (ret) @@ -3995,12 +3999,6 @@ static void rbd_dev_release(struct device *dev) { struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); - if (rbd_dev->watch_request) { - struct ceph_client *client = rbd_dev->rbd_client->client; - - ceph_osdc_unregister_linger_request(&client->osdc, - rbd_dev->watch_request); - } if (rbd_dev->watch_event) rbd_dev_header_watch_sync(rbd_dev, 0); -- cgit v1.2.3 From 975241afcbba82ab1ddc6ebf8a02246d1e9314fd Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Fri, 25 Jan 2013 17:08:55 -0600 Subject: rbd: track object rather than osd request for watch Switch to keeping track of the object request pointer rather than the osd request used to watch the rbd image header object. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 5593def33ce5..fc1a045cee4d 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -272,7 +272,7 @@ struct rbd_device { struct ceph_file_layout layout; struct ceph_osd_event *watch_event; - struct ceph_osd_request *watch_request; + struct rbd_obj_request *watch_request; struct rbd_spec *parent_spec; u64 parent_overlap; @@ -1719,11 +1719,11 @@ static int rbd_dev_header_watch_sync(struct rbd_device *rbd_dev, int start) goto out_cancel; if (start) { - rbd_dev->watch_request = obj_request->osd_req; - ceph_osdc_set_request_linger(osdc, rbd_dev->watch_request); + ceph_osdc_set_request_linger(osdc, obj_request->osd_req); + rbd_dev->watch_request = obj_request; } else { ceph_osdc_unregister_linger_request(osdc, - rbd_dev->watch_request); + rbd_dev->watch_request->osd_req); rbd_dev->watch_request = NULL; } ret = rbd_obj_request_submit(osdc, obj_request); -- cgit v1.2.3 From 25dcf954c3230946b5f3e18db9f91d7640eff76e Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Fri, 25 Jan 2013 17:08:55 -0600 Subject: rbd: decrement obj request count when deleting Decrement the obj_request_count value when deleting an object request from its image request's list. Rearrange a few lines in the surrounding code. This resolves: http://tracker.ceph.com/issues/3940 Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index fc1a045cee4d..d3d15d06abc0 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -1063,22 +1063,29 @@ static void rbd_img_request_put(struct rbd_img_request *img_request) static inline void rbd_img_obj_request_add(struct rbd_img_request *img_request, struct rbd_obj_request *obj_request) { + rbd_assert(obj_request->img_request == NULL); + rbd_obj_request_get(obj_request); obj_request->img_request = img_request; - list_add_tail(&obj_request->links, &img_request->obj_requests); - obj_request->which = img_request->obj_request_count++; + obj_request->which = img_request->obj_request_count; rbd_assert(obj_request->which != BAD_WHICH); + img_request->obj_request_count++; + list_add_tail(&obj_request->links, &img_request->obj_requests); } static inline void rbd_img_obj_request_del(struct rbd_img_request *img_request, struct rbd_obj_request *obj_request) { rbd_assert(obj_request->which != BAD_WHICH); - obj_request->which = BAD_WHICH; + list_del(&obj_request->links); + rbd_assert(img_request->obj_request_count > 0); + img_request->obj_request_count--; + rbd_assert(obj_request->which == img_request->obj_request_count); + obj_request->which = BAD_WHICH; rbd_assert(obj_request->img_request == img_request); - obj_request->callback = NULL; obj_request->img_request = NULL; + obj_request->callback = NULL; rbd_obj_request_put(obj_request); } @@ -1472,6 +1479,7 @@ static void rbd_img_request_destroy(struct kref *kref) for_each_obj_request_safe(img_request, obj_request, next_obj_request) rbd_img_obj_request_del(img_request, obj_request); + rbd_assert(img_request->obj_request_count == 0); if (img_request->write_request) ceph_put_snap_context(img_request->snapc); -- cgit v1.2.3 From 8eb87565306cf40a32f5d0883d008675cd2dd510 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Fri, 25 Jan 2013 17:08:55 -0600 Subject: rbd: don't drop watch requests on completion When we register an osd request to linger, it means that request will stay around (under control of the osd client) until we've unregistered it. We do that for an rbd image's header object, and we keep a pointer to the object request associated with it. Keep a reference to the watch object request for as long as it is registered to linger. Drop it again after we've removed the linger registration. This resolves: http://tracker.ceph.com/issues/3937 (Note: this originally came about because the osd client was issuing a callback more than once. But that behavior will be changing soon, documented in tracker issue 3967.) Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 30 +++++++++++++++++++++--------- 1 file changed, 21 insertions(+), 9 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index d3d15d06abc0..fd9656b5fdb9 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -1707,6 +1707,7 @@ static int rbd_dev_header_watch_sync(struct rbd_device *rbd_dev, int start) &rbd_dev->watch_event); if (ret < 0) return ret; + rbd_assert(rbd_dev->watch_event != NULL); } ret = -ENOMEM; @@ -1726,32 +1727,43 @@ static int rbd_dev_header_watch_sync(struct rbd_device *rbd_dev, int start) if (!obj_request->osd_req) goto out_cancel; - if (start) { + if (start) ceph_osdc_set_request_linger(osdc, obj_request->osd_req); - rbd_dev->watch_request = obj_request; - } else { + else ceph_osdc_unregister_linger_request(osdc, rbd_dev->watch_request->osd_req); - rbd_dev->watch_request = NULL; - } ret = rbd_obj_request_submit(osdc, obj_request); if (ret) goto out_cancel; ret = rbd_obj_request_wait(obj_request); if (ret) goto out_cancel; - ret = obj_request->result; if (ret) goto out_cancel; - if (start) - goto done; /* Done if setting up the watch request */ + /* + * A watch request is set to linger, so the underlying osd + * request won't go away until we unregister it. We retain + * a pointer to the object request during that time (in + * rbd_dev->watch_request), so we'll keep a reference to + * it. We'll drop that reference (below) after we've + * unregistered it. + */ + if (start) { + rbd_dev->watch_request = obj_request; + + return 0; + } + + /* We have successfully torn down the watch request */ + + rbd_obj_request_put(rbd_dev->watch_request); + rbd_dev->watch_request = NULL; out_cancel: /* Cancel the event if we're tearing down, or on error */ ceph_osdc_cancel_event(rbd_dev->watch_event); rbd_dev->watch_event = NULL; -done: if (obj_request) rbd_obj_request_put(obj_request); -- cgit v1.2.3 From 6d292906f80170f4647079dd503df18b737750af Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Mon, 14 Jan 2013 12:43:31 -0600 Subject: rbd: define flags field, use it for exists flag Define a new rbd device flags field, manipulated using bit operations. Replace the use of the current "exists" flag with a bit in this new "flags" field. Add a little commentary about the "exists" flag, which does not need to be manipulated atomically. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 37 ++++++++++++++++++++++++++++--------- 1 file changed, 28 insertions(+), 9 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index fd9656b5fdb9..8c90a39c2a91 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -264,7 +264,7 @@ struct rbd_device { spinlock_t lock; /* queue lock */ struct rbd_image_header header; - atomic_t exists; + unsigned long flags; struct rbd_spec *spec; char *header_name; @@ -292,6 +292,12 @@ struct rbd_device { unsigned long open_count; }; +/* Flag bits for rbd_dev->flags */ + +enum rbd_dev_flags { + RBD_DEV_FLAG_EXISTS, /* mapped snapshot has not been deleted */ +}; + static DEFINE_MUTEX(ctl_mutex); /* Serialize open/close/setup/teardown */ static LIST_HEAD(rbd_dev_list); /* devices */ @@ -782,7 +788,8 @@ static int rbd_dev_set_mapping(struct rbd_device *rbd_dev) goto done; rbd_dev->mapping.read_only = true; } - atomic_set(&rbd_dev->exists, 1); + set_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags); + done: return ret; } @@ -1877,9 +1884,14 @@ static void rbd_request_fn(struct request_queue *q) rbd_assert(rbd_dev->spec->snap_id == CEPH_NOSNAP); } - /* Quit early if the snapshot has disappeared */ - - if (!atomic_read(&rbd_dev->exists)) { + /* + * Quit early if the mapped snapshot no longer + * exists. It's still possible the snapshot will + * have disappeared by the time our request arrives + * at the osd, but there's no sense in sending it if + * we already know. + */ + if (!test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags)) { dout("request for non-existent snapshot"); rbd_assert(rbd_dev->spec->snap_id != CEPH_NOSNAP); result = -ENXIO; @@ -2571,7 +2583,7 @@ struct rbd_device *rbd_dev_create(struct rbd_client *rbdc, return NULL; spin_lock_init(&rbd_dev->lock); - atomic_set(&rbd_dev->exists, 0); + rbd_dev->flags = 0; INIT_LIST_HEAD(&rbd_dev->node); INIT_LIST_HEAD(&rbd_dev->snaps); init_rwsem(&rbd_dev->header_rwsem); @@ -3200,10 +3212,17 @@ static int rbd_dev_snaps_update(struct rbd_device *rbd_dev) if (snap_id == CEPH_NOSNAP || (snap && snap->id > snap_id)) { struct list_head *next = links->next; - /* Existing snapshot not in the new snap context */ - + /* + * A previously-existing snapshot is not in + * the new snap context. + * + * If the now missing snapshot is the one the + * image is mapped to, clear its exists flag + * so we can avoid sending any more requests + * to it. + */ if (rbd_dev->spec->snap_id == snap->id) - atomic_set(&rbd_dev->exists, 0); + clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags); rbd_remove_snap_dev(snap); dout("%ssnap id %llu has been removed\n", rbd_dev->spec->snap_id == snap->id ? -- cgit v1.2.3 From b82d167be64b3e88d9434d8a98ce83c83a07aa48 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Mon, 14 Jan 2013 12:43:31 -0600 Subject: rbd: prevent open for image being removed An open request for a mapped rbd image can arrive while removal of that mapping is underway. We need to prevent such an open request from succeeding. (It appears that Maciej Galkiewicz ran into this problem.) Define and use a "removing" flag to indicate a mapping is getting removed. Set it in the remove path after verifying nothing holds the device open. And check it in the open path before allowing the open to proceed. Acquire the rbd device's lock around each of these spots to avoid any races accessing the flags and open_count fields. This addresses: http://tracker.newdream.net/issues/3427 Reported-by: Maciej Galkiewicz Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 43 +++++++++++++++++++++++++++++++++---------- 1 file changed, 33 insertions(+), 10 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 8c90a39c2a91..ed0c91d81063 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -261,10 +261,10 @@ struct rbd_device { char name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */ - spinlock_t lock; /* queue lock */ + spinlock_t lock; /* queue, flags, open_count */ struct rbd_image_header header; - unsigned long flags; + unsigned long flags; /* possibly lock protected */ struct rbd_spec *spec; char *header_name; @@ -289,13 +289,19 @@ struct rbd_device { /* sysfs related */ struct device dev; - unsigned long open_count; + unsigned long open_count; /* protected by lock */ }; -/* Flag bits for rbd_dev->flags */ - +/* + * Flag bits for rbd_dev->flags. If atomicity is required, + * rbd_dev->lock is used to protect access. + * + * Currently, only the "removing" flag (which is coupled with the + * "open_count" field) requires atomic access. + */ enum rbd_dev_flags { RBD_DEV_FLAG_EXISTS, /* mapped snapshot has not been deleted */ + RBD_DEV_FLAG_REMOVING, /* this mapping is being removed */ }; static DEFINE_MUTEX(ctl_mutex); /* Serialize open/close/setup/teardown */ @@ -383,14 +389,23 @@ static int rbd_dev_v2_refresh(struct rbd_device *rbd_dev, u64 *hver); static int rbd_open(struct block_device *bdev, fmode_t mode) { struct rbd_device *rbd_dev = bdev->bd_disk->private_data; + bool removing = false; if ((mode & FMODE_WRITE) && rbd_dev->mapping.read_only) return -EROFS; + spin_lock(&rbd_dev->lock); + if (test_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags)) + removing = true; + else + rbd_dev->open_count++; + spin_unlock(&rbd_dev->lock); + if (removing) + return -ENOENT; + mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING); (void) get_device(&rbd_dev->dev); set_device_ro(bdev, rbd_dev->mapping.read_only); - rbd_dev->open_count++; mutex_unlock(&ctl_mutex); return 0; @@ -399,10 +414,14 @@ static int rbd_open(struct block_device *bdev, fmode_t mode) static int rbd_release(struct gendisk *disk, fmode_t mode) { struct rbd_device *rbd_dev = disk->private_data; + unsigned long open_count_before; + + spin_lock(&rbd_dev->lock); + open_count_before = rbd_dev->open_count--; + spin_unlock(&rbd_dev->lock); + rbd_assert(open_count_before > 0); mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING); - rbd_assert(rbd_dev->open_count > 0); - rbd_dev->open_count--; put_device(&rbd_dev->dev); mutex_unlock(&ctl_mutex); @@ -4083,10 +4102,14 @@ static ssize_t rbd_remove(struct bus_type *bus, goto done; } - if (rbd_dev->open_count) { + spin_lock(&rbd_dev->lock); + if (rbd_dev->open_count) ret = -EBUSY; + else + set_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags); + spin_unlock(&rbd_dev->lock); + if (ret < 0) goto done; - } rbd_remove_all_snaps(rbd_dev); rbd_bus_del_dev(rbd_dev); -- cgit v1.2.3 From 1e32d34cfa6759df58b5f4002664241f2a0fef6a Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Wed, 30 Jan 2013 11:13:33 -0600 Subject: rbd: don't take extra bio reference for osd client Currently, if the OSD client finds an osd request has had a bio list attached to it, it drops a reference to it (or rather, to the first entry on that list) when the request is released. The code that added that reference (i.e., the rbd client) is therefore required to take an extra reference to that first bio structure. The osd client doesn't really do anything with the bio pointer other than transfer it from the osd request structure to outgoing (for writes) and ingoing (for reads) messages. So it really isn't the right place to be taking or dropping references. Furthermore, the rbd client already holds references to all bio structures it passes to the osd client, and holds them until the request is completed. So there's no need for this extra reference whatsoever. So remove the bio_put() call in ceph_osdc_release_request(), as well as its matching bio_get() call in rbd_osd_req_create(). This change could lead to a crash if old libceph.ko was used with new rbd.ko. Add a compatibility check at rbd initialization time to avoid this possibilty. This resolves: http://tracker.ceph.com/issues/3798 and http://tracker.ceph.com/issues/3799 Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 6 +++++- net/ceph/ceph_common.c | 2 +- net/ceph/osd_client.c | 4 ---- 3 files changed, 6 insertions(+), 6 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index ed0c91d81063..3ba4836f024c 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -1342,7 +1342,6 @@ static struct ceph_osd_request *rbd_osd_req_create( case OBJ_REQUEST_BIO: rbd_assert(obj_request->bio_list != NULL); osd_req->r_bio = obj_request->bio_list; - bio_get(osd_req->r_bio); /* osd client requires "num pages" even for bio */ osd_req->r_num_pages = calc_pages_for(offset, length); break; @@ -4149,6 +4148,11 @@ int __init rbd_init(void) { int rc; + if (!libceph_compatible(NULL)) { + rbd_warn(NULL, "libceph incompatibility (quitting)"); + + return -EINVAL; + } rc = rbd_sysfs_init(); if (rc) return rc; diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c index a98c03ff853f..c236c235c4a2 100644 --- a/net/ceph/ceph_common.c +++ b/net/ceph/ceph_common.c @@ -39,7 +39,7 @@ */ bool libceph_compatible(void *data) { - return false; + return true; } EXPORT_SYMBOL(libceph_compatible); diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 500ae8b49321..ba03648533c0 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -147,10 +147,6 @@ void ceph_osdc_release_request(struct kref *kref) if (req->r_own_pages) ceph_release_page_vector(req->r_pages, req->r_num_pages); -#ifdef CONFIG_BLOCK - if (req->r_bio) - bio_put(req->r_bio); -#endif ceph_put_snap_context(req->r_snapc); ceph_pagelist_release(&req->r_trail); if (req->r_mempool) -- cgit v1.2.3 From 9cbb1d7268afa997a7f96d779470cc57d28e1a13 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Thu, 31 Jan 2013 16:02:00 -0600 Subject: libceph: don't require r_num_pages for bio requests There is a check in the completion path for osd requests that ensures the number of pages allocated is enough to hold the amount of incoming data expected. For bio requests coming from rbd the "number of pages" is not really meaningful (although total length would be). So stop requiring that nr_pages be supplied for bio requests. This is done by checking whether the pages pointer is null before checking the value of nr_pages. Note that this value is passed on to the messenger, but there it's only used for debugging--it's never used for validation. While here, change another spot that used r_pages in a debug message inappropriately, and also invalidate the r_con_filling_msg pointer after dropping a reference to it. This resolves: http://tracker.ceph.com/issues/3875 Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 2 -- net/ceph/osd_client.c | 7 ++++--- 2 files changed, 4 insertions(+), 5 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 3ba4836f024c..14a6967291d3 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -1342,8 +1342,6 @@ static struct ceph_osd_request *rbd_osd_req_create( case OBJ_REQUEST_BIO: rbd_assert(obj_request->bio_list != NULL); osd_req->r_bio = obj_request->bio_list; - /* osd client requires "num pages" even for bio */ - osd_req->r_num_pages = calc_pages_for(offset, length); break; case OBJ_REQUEST_PAGES: osd_req->r_pages = obj_request->pages; diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index ba03648533c0..d9d58bbe9f9a 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -137,10 +137,11 @@ void ceph_osdc_release_request(struct kref *kref) if (req->r_request) ceph_msg_put(req->r_request); if (req->r_con_filling_msg) { - dout("%s revoking pages %p from con %p\n", __func__, - req->r_pages, req->r_con_filling_msg); + dout("%s revoking msg %p from con %p\n", __func__, + req->r_reply, req->r_con_filling_msg); ceph_msg_revoke_incoming(req->r_reply); req->r_con_filling_msg->ops->put(req->r_con_filling_msg); + req->r_con_filling_msg = NULL; } if (req->r_reply) ceph_msg_put(req->r_reply); @@ -1981,7 +1982,7 @@ static struct ceph_msg *get_reply(struct ceph_connection *con, if (data_len > 0) { int want = calc_pages_for(req->r_page_alignment, data_len); - if (unlikely(req->r_num_pages < want)) { + if (req->r_pages && unlikely(req->r_num_pages < want)) { pr_warning("tid %lld reply has %d bytes %d pages, we" " had only %d pages ready\n", tid, data_len, want, req->r_num_pages); -- cgit v1.2.3 From a14ea269dd6b5e48a2941ba73b202cd7cd5d716d Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Tue, 5 Feb 2013 13:23:12 -0600 Subject: rbd: turn off interrupts for open/remove locking This commit: bc7a62ee5 rbd: prevent open for image being removed added checking for removing rbd before allowing an open, and used the same request spinlock for protecting that and updating the open count as is used for the request queue. However it used the non-irq protected version of the spinlocks. Fix that. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 14a6967291d3..91983a60487b 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -394,12 +394,12 @@ static int rbd_open(struct block_device *bdev, fmode_t mode) if ((mode & FMODE_WRITE) && rbd_dev->mapping.read_only) return -EROFS; - spin_lock(&rbd_dev->lock); + spin_lock_irq(&rbd_dev->lock); if (test_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags)) removing = true; else rbd_dev->open_count++; - spin_unlock(&rbd_dev->lock); + spin_unlock_irq(&rbd_dev->lock); if (removing) return -ENOENT; @@ -416,9 +416,9 @@ static int rbd_release(struct gendisk *disk, fmode_t mode) struct rbd_device *rbd_dev = disk->private_data; unsigned long open_count_before; - spin_lock(&rbd_dev->lock); + spin_lock_irq(&rbd_dev->lock); open_count_before = rbd_dev->open_count--; - spin_unlock(&rbd_dev->lock); + spin_unlock_irq(&rbd_dev->lock); rbd_assert(open_count_before > 0); mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING); @@ -4099,12 +4099,12 @@ static ssize_t rbd_remove(struct bus_type *bus, goto done; } - spin_lock(&rbd_dev->lock); + spin_lock_irq(&rbd_dev->lock); if (rbd_dev->open_count) ret = -EBUSY; else set_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags); - spin_unlock(&rbd_dev->lock); + spin_unlock_irq(&rbd_dev->lock); if (ret < 0) goto done; -- cgit v1.2.3 From 077413082f9ade9ca4d9774dbdc81ee7256d8089 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Tue, 5 Feb 2013 23:41:50 -0600 Subject: rbd: add barriers near done flag operations Somehow, I missed this little item in Documentation/atomic_ops.txt: *** WARNING: atomic_read() and atomic_set() DO NOT IMPLY BARRIERS! *** Create and use some helper functions that include the proper memory barriers for manipulating the done field. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 30 ++++++++++++++++++++++++------ 1 file changed, 24 insertions(+), 6 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 91983a60487b..982963ec2607 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -1216,10 +1216,28 @@ static int rbd_obj_request_wait(struct rbd_obj_request *obj_request) return wait_for_completion_interruptible(&obj_request->completion); } +static void obj_request_done_init(struct rbd_obj_request *obj_request) +{ + atomic_set(&obj_request->done, 0); + smp_wmb(); +} + +static void obj_request_done_set(struct rbd_obj_request *obj_request) +{ + atomic_set(&obj_request->done, 1); + smp_wmb(); +} + +static bool obj_request_done_test(struct rbd_obj_request *obj_request) +{ + smp_rmb(); + return atomic_read(&obj_request->done) != 0; +} + static void rbd_osd_trivial_callback(struct rbd_obj_request *obj_request, struct ceph_osd_op *op) { - atomic_set(&obj_request->done, 1); + obj_request_done_set(obj_request); } static void rbd_obj_request_complete(struct rbd_obj_request *obj_request) @@ -1249,14 +1267,14 @@ static void rbd_osd_read_callback(struct rbd_obj_request *obj_request, xferred = obj_request->length; } obj_request->xferred = xferred; - atomic_set(&obj_request->done, 1); + obj_request_done_set(obj_request); } static void rbd_osd_write_callback(struct rbd_obj_request *obj_request, struct ceph_osd_op *op) { obj_request->xferred = le64_to_cpu(op->extent.length); - atomic_set(&obj_request->done, 1); + obj_request_done_set(obj_request); } static void rbd_osd_req_callback(struct ceph_osd_request *osd_req, @@ -1300,7 +1318,7 @@ static void rbd_osd_req_callback(struct ceph_osd_request *osd_req, break; } - if (atomic_read(&obj_request->done)) + if (obj_request_done_test(obj_request)) rbd_obj_request_complete(obj_request); } @@ -1407,7 +1425,7 @@ static struct rbd_obj_request *rbd_obj_request_create(const char *object_name, obj_request->which = BAD_WHICH; obj_request->type = type; INIT_LIST_HEAD(&obj_request->links); - atomic_set(&obj_request->done, 0); + obj_request_done_init(obj_request); init_completion(&obj_request->completion); kref_init(&obj_request->kref); @@ -1611,7 +1629,7 @@ static void rbd_img_obj_callback(struct rbd_obj_request *obj_request) rbd_assert(more); rbd_assert(which < img_request->obj_request_count); - if (!atomic_read(&obj_request->done)) + if (!obj_request_done_test(obj_request)) break; rbd_assert(obj_request->xferred <= (u64) UINT_MAX); -- cgit v1.2.3 From 3c663bbdcdf9296e0fe3362acb9e81f49d7b72c6 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Fri, 15 Feb 2013 11:42:30 -0600 Subject: libceph: kill ceph_osdc_create_event() "one_shot" parameter There is only one caller of ceph_osdc_create_event(), and it provides 0 as its "one_shot" argument. Get rid of that argument and just use 0 in its place. Replace the code in handle_watch_notify() that executes if one_shot is nonzero in the event with a BUG_ON() call. While modifying "osd_client.c", give handle_watch_notify() static scope. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 2 +- include/linux/ceph/osd_client.h | 3 +-- net/ceph/osd_client.c | 11 +++++------ 3 files changed, 7 insertions(+), 9 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 982963ec2607..13ee789f2328 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -1744,7 +1744,7 @@ static int rbd_dev_header_watch_sync(struct rbd_device *rbd_dev, int start) rbd_assert(start ^ !!rbd_dev->watch_request); if (start) { - ret = ceph_osdc_create_event(osdc, rbd_watch_cb, 0, rbd_dev, + ret = ceph_osdc_create_event(osdc, rbd_watch_cb, rbd_dev, &rbd_dev->watch_event); if (ret < 0) return ret; diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index c39e7ed4b203..39c55d61e159 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -273,8 +273,7 @@ extern int ceph_osdc_writepages(struct ceph_osd_client *osdc, /* watch/notify events */ extern int ceph_osdc_create_event(struct ceph_osd_client *osdc, void (*event_cb)(u64, u64, u8, void *), - int one_shot, void *data, - struct ceph_osd_event **pevent); + void *data, struct ceph_osd_event **pevent); extern void ceph_osdc_cancel_event(struct ceph_osd_event *event); extern int ceph_osdc_wait_event(struct ceph_osd_event *event, unsigned long timeout); diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index cd3a489b7438..4322faa7c811 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -1477,8 +1477,7 @@ static void __remove_event(struct ceph_osd_event *event) int ceph_osdc_create_event(struct ceph_osd_client *osdc, void (*event_cb)(u64, u64, u8, void *), - int one_shot, void *data, - struct ceph_osd_event **pevent) + void *data, struct ceph_osd_event **pevent) { struct ceph_osd_event *event; @@ -1488,7 +1487,7 @@ int ceph_osdc_create_event(struct ceph_osd_client *osdc, dout("create_event %p\n", event); event->cb = event_cb; - event->one_shot = one_shot; + event->one_shot = 0; event->data = data; event->osdc = osdc; INIT_LIST_HEAD(&event->osd_node); @@ -1541,7 +1540,8 @@ static void do_event_work(struct work_struct *work) /* * Process osd watch notifications */ -void handle_watch_notify(struct ceph_osd_client *osdc, struct ceph_msg *msg) +static void handle_watch_notify(struct ceph_osd_client *osdc, + struct ceph_msg *msg) { void *p, *end; u8 proto_ver; @@ -1562,9 +1562,8 @@ void handle_watch_notify(struct ceph_osd_client *osdc, struct ceph_msg *msg) spin_lock(&osdc->event_lock); event = __find_event(osdc, cookie); if (event) { + BUG_ON(event->one_shot); get_event(event); - if (event->one_shot) - __remove_event(event); } spin_unlock(&osdc->event_lock); dout("handle_watch_notify cookie %lld ver %lld event %p\n", -- cgit v1.2.3 From ef06f4d32ae5b656f17b53ee3f3c43471a11cc73 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Fri, 8 Feb 2013 09:55:48 -0600 Subject: rbd: add parentheses to object request iterator macros The for_each_obj_request*() macros should parenthesize their uses of the ireq parameter. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 13ee789f2328..ff9e9255745c 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -226,11 +226,11 @@ struct rbd_img_request { }; #define for_each_obj_request(ireq, oreq) \ - list_for_each_entry(oreq, &ireq->obj_requests, links) + list_for_each_entry(oreq, &(ireq)->obj_requests, links) #define for_each_obj_request_from(ireq, oreq) \ - list_for_each_entry_from(oreq, &ireq->obj_requests, links) + list_for_each_entry_from(oreq, &(ireq)->obj_requests, links) #define for_each_obj_request_safe(ireq, oreq, n) \ - list_for_each_entry_safe_reverse(oreq, n, &ireq->obj_requests, links) + list_for_each_entry_safe_reverse(oreq, n, &(ireq)->obj_requests, links) struct rbd_snap { struct device dev; -- cgit v1.2.3 From fbfab53966b279f9cdb36b96ffa1e22f042c96ff Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Fri, 8 Feb 2013 09:55:48 -0600 Subject: libceph: allow STAT osd operations Add support for CEPH_OSD_OP_STAT operations in the osd client and in rbd. This operation sends no data to the osd; everything required is encoded in identity of the target object. The result will be ENOENT if the object doesn't exist. If it does exist and no other error occurs the server returns the size and last modification time of the target object as output data (in little endian format). The size is a 64 bit unsigned and the time is ceph_timespec structure (two unsigned 32-bit integers, representing a seconds and nanoseconds value). This resolves: http://tracker.ceph.com/issues/4007 Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 15 +++++++++++++++ net/ceph/osd_client.c | 3 ++- 2 files changed, 17 insertions(+), 1 deletion(-) (limited to 'drivers/block') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index ff9e9255745c..09514d9d8a97 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -1148,6 +1148,8 @@ struct ceph_osd_req_op *rbd_osd_req_op_create(u16 opcode, ...) if (opcode == CEPH_OSD_OP_WRITE) op->payload_len = op->extent.length; break; + case CEPH_OSD_OP_STAT: + break; case CEPH_OSD_OP_CALL: /* rbd_osd_req_op_create(CALL, class, method, data, datalen) */ op->cls.class_name = va_arg(args, char *); @@ -1277,6 +1279,16 @@ static void rbd_osd_write_callback(struct rbd_obj_request *obj_request, obj_request_done_set(obj_request); } +/* + * For a simple stat call there's nothing to do. We'll do more if + * this is part of a write sequence for a layered image. + */ +static void rbd_osd_stat_callback(struct rbd_obj_request *obj_request, + struct ceph_osd_op *op) +{ + obj_request_done_set(obj_request); +} + static void rbd_osd_req_callback(struct ceph_osd_request *osd_req, struct ceph_msg *msg) { @@ -1307,6 +1319,9 @@ static void rbd_osd_req_callback(struct ceph_osd_request *osd_req, case CEPH_OSD_OP_WRITE: rbd_osd_write_callback(obj_request, op); break; + case CEPH_OSD_OP_STAT: + rbd_osd_stat_callback(obj_request, op); + break; case CEPH_OSD_OP_CALL: case CEPH_OSD_OP_NOTIFY_ACK: case CEPH_OSD_OP_WATCH: diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index b58748ec405d..39629b66f3b1 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -220,6 +220,8 @@ static void osd_req_encode_op(struct ceph_osd_request *req, dst->op = cpu_to_le16(src->op); switch (src->op) { + case CEPH_OSD_OP_STAT: + break; case CEPH_OSD_OP_READ: case CEPH_OSD_OP_WRITE: dst->extent.offset = @@ -255,7 +257,6 @@ static void osd_req_encode_op(struct ceph_osd_request *req, pr_err("unrecognized osd opcode %d\n", dst->op); WARN_ON(1); break; - case CEPH_OSD_OP_STAT: case CEPH_OSD_OP_MAPEXT: case CEPH_OSD_OP_MASKTRUNC: case CEPH_OSD_OP_SPARSE_READ: -- cgit v1.2.3 From 1ceae7ef0fd00c965a2257c6e9eb497ca91f01c7 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Wed, 6 Feb 2013 13:11:38 -0600 Subject: rbd: prevent bytes transferred overflow In rbd_obj_read_sync(), verify the number of bytes transferred won't exceed what can be represented by a size_t before using it to indicate the number of bytes to copy to the result buffer. (The real motivation for this is to prepare for the next patch.) Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'drivers/block') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 09514d9d8a97..93369a1a08e1 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -2048,6 +2048,7 @@ static int rbd_obj_read_sync(struct rbd_device *rbd_dev, struct ceph_osd_client *osdc; struct page **pages = NULL; u32 page_count; + size_t size; int ret; page_count = (u32) calc_pages_for(offset, length); @@ -2084,7 +2085,10 @@ static int rbd_obj_read_sync(struct rbd_device *rbd_dev, ret = obj_request->result; if (ret < 0) goto out; - ret = ceph_copy_from_page_vector(pages, buf, 0, obj_request->xferred); + + rbd_assert(obj_request->xferred <= (u64) SIZE_MAX); + size = (size_t) obj_request->xferred; + ret = ceph_copy_from_page_vector(pages, buf, 0, size); if (version) *version = obj_request->version; out: -- cgit v1.2.3 From 23ed6e13b320b33decb516cbe66e71b132df488d Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Wed, 6 Feb 2013 13:11:38 -0600 Subject: rbd: ignore result of ceph_copy_from_page_vector() The result of ceph_copy_from_page_vector() is simply the length argument it is provided. This is called by rbd_obj_method_sync(), which returns the result if it's non-negative. But we always either ignore or overwrite that return value. So explicitly ignore what's returned by the copy function, and have rbd_obj_method_sync() always return either a negative errno or 0. We also return the result of ceph_copy_from_page_vector() in rbd_obj_read_sync(). There we still want to return the number of bytes transferred, but we can use the value we already have in hand rather than what ceph_copy_from_page_vector() provides. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 93369a1a08e1..c259b4089e95 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -1889,7 +1889,8 @@ static int rbd_obj_method_sync(struct rbd_device *rbd_dev, ret = obj_request->result; if (ret < 0) goto out; - ret = ceph_copy_from_page_vector(pages, inbound, 0, + ret = 0; + (void) ceph_copy_from_page_vector(pages, inbound, 0, obj_request->xferred); if (version) *version = obj_request->version; @@ -2088,7 +2089,9 @@ static int rbd_obj_read_sync(struct rbd_device *rbd_dev, rbd_assert(obj_request->xferred <= (u64) SIZE_MAX); size = (size_t) obj_request->xferred; - ret = ceph_copy_from_page_vector(pages, buf, 0, size); + (void) ceph_copy_from_page_vector(pages, buf, 0, size); + rbd_assert(size <= (size_t) INT_MAX); + ret = (int) size; if (version) *version = obj_request->version; out: @@ -2141,7 +2144,6 @@ rbd_dev_v1_header_read(struct rbd_device *rbd_dev, u64 *version) ret = rbd_obj_read_sync(rbd_dev, rbd_dev->header_name, 0, size, (char *) ondisk, version); - if (ret < 0) goto out_err; if (WARN_ON((size_t) ret < size)) { @@ -2803,7 +2805,6 @@ static int rbd_dev_v2_object_prefix(struct rbd_device *rbd_dev) dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); if (ret < 0) goto out; - ret = 0; /* rbd_obj_method_sync() can return positive */ p = reply_buf; rbd_dev->header.object_prefix = ceph_extract_encoded_string(&p, @@ -3742,7 +3743,6 @@ static int rbd_dev_image_id(struct rbd_device *rbd_dev) dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); if (ret < 0) goto out; - ret = 0; /* rbd_obj_method_sync() can return positive */ p = response; rbd_dev->spec->image_id = ceph_extract_encoded_string(&p, -- cgit v1.2.3 From 903bb32e890237ca43ab847e561e5377cfe0fdb3 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Wed, 6 Feb 2013 13:11:38 -0600 Subject: libceph: drop return value from page vector copy routines The return values provided for ceph_copy_to_page_vector() and ceph_copy_from_page_vector() serve no purpose, so get rid of them. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 5 ++--- include/linux/ceph/libceph.h | 4 ++-- net/ceph/pagevec.c | 14 ++++++-------- 3 files changed, 10 insertions(+), 13 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index c259b4089e95..b0eea3eaee93 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -1890,8 +1890,7 @@ static int rbd_obj_method_sync(struct rbd_device *rbd_dev, if (ret < 0) goto out; ret = 0; - (void) ceph_copy_from_page_vector(pages, inbound, 0, - obj_request->xferred); + ceph_copy_from_page_vector(pages, inbound, 0, obj_request->xferred); if (version) *version = obj_request->version; out: @@ -2089,7 +2088,7 @@ static int rbd_obj_read_sync(struct rbd_device *rbd_dev, rbd_assert(obj_request->xferred <= (u64) SIZE_MAX); size = (size_t) obj_request->xferred; - (void) ceph_copy_from_page_vector(pages, buf, 0, size); + ceph_copy_from_page_vector(pages, buf, 0, size); rbd_assert(size <= (size_t) INT_MAX); ret = (int) size; if (version) diff --git a/include/linux/ceph/libceph.h b/include/linux/ceph/libceph.h index 2250f8bb2490..29818fc3fa49 100644 --- a/include/linux/ceph/libceph.h +++ b/include/linux/ceph/libceph.h @@ -232,10 +232,10 @@ extern struct page **ceph_alloc_page_vector(int num_pages, gfp_t flags); extern int ceph_copy_user_to_page_vector(struct page **pages, const void __user *data, loff_t off, size_t len); -extern int ceph_copy_to_page_vector(struct page **pages, +extern void ceph_copy_to_page_vector(struct page **pages, const void *data, loff_t off, size_t len); -extern int ceph_copy_from_page_vector(struct page **pages, +extern void ceph_copy_from_page_vector(struct page **pages, void *data, loff_t off, size_t len); extern int ceph_copy_page_vector_to_user(struct page **pages, void __user *data, diff --git a/net/ceph/pagevec.c b/net/ceph/pagevec.c index 5b20be979c19..815a2249cfa9 100644 --- a/net/ceph/pagevec.c +++ b/net/ceph/pagevec.c @@ -118,17 +118,17 @@ int ceph_copy_user_to_page_vector(struct page **pages, } EXPORT_SYMBOL(ceph_copy_user_to_page_vector); -int ceph_copy_to_page_vector(struct page **pages, +void ceph_copy_to_page_vector(struct page **pages, const void *data, loff_t off, size_t len) { int i = 0; size_t po = off & ~PAGE_CACHE_MASK; size_t left = len; - size_t l; while (left > 0) { - l = min_t(size_t, PAGE_CACHE_SIZE-po, left); + size_t l = min_t(size_t, PAGE_CACHE_SIZE-po, left); + memcpy(page_address(pages[i]) + po, data, l); data += l; left -= l; @@ -138,21 +138,20 @@ int ceph_copy_to_page_vector(struct page **pages, i++; } } - return len; } EXPORT_SYMBOL(ceph_copy_to_page_vector); -int ceph_copy_from_page_vector(struct page **pages, +void ceph_copy_from_page_vector(struct page **pages, void *data, loff_t off, size_t len) { int i = 0; size_t po = off & ~PAGE_CACHE_MASK; size_t left = len; - size_t l; while (left > 0) { - l = min_t(size_t, PAGE_CACHE_SIZE-po, left); + size_t l = min_t(size_t, PAGE_CACHE_SIZE-po, left); + memcpy(data, page_address(pages[i]) + po, l); data += l; left -= l; @@ -162,7 +161,6 @@ int ceph_copy_from_page_vector(struct page **pages, i++; } } - return len; } EXPORT_SYMBOL(ceph_copy_from_page_vector); -- cgit v1.2.3 From 4dda41d3d76747414586a4bad5615b550e0986b1 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Wed, 20 Feb 2013 21:59:33 -0600 Subject: rbd: ignore zero-length requests The old request code simply ignored zero-length requests. We should still operate that same way to avoid any changes in behavior. We can implement handling for special zero-length requests separately (see http://tracker.ceph.com/issues/4236). Add some assertions based on this new constraint. This resolves: http://tracker.ceph.com/issues/4237 Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index b0eea3eaee93..3cc003b26610 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -1560,6 +1560,7 @@ static int rbd_img_request_fill_bio(struct rbd_img_request *img_request, image_offset = img_request->offset; rbd_assert(image_offset == bio_list->bi_sector << SECTOR_SHIFT); resid = img_request->length; + rbd_assert(resid > 0); while (resid) { const char *object_name; unsigned int clone_size; @@ -1627,8 +1628,10 @@ static void rbd_img_obj_callback(struct rbd_obj_request *obj_request) bool more = true; img_request = obj_request->img_request; + rbd_assert(img_request != NULL); rbd_assert(img_request->rq != NULL); + rbd_assert(img_request->obj_request_count > 0); rbd_assert(which != BAD_WHICH); rbd_assert(which < img_request->obj_request_count); rbd_assert(which >= img_request->next_completion); @@ -1918,6 +1921,19 @@ static void rbd_request_fn(struct request_queue *q) /* Ignore any non-FS requests that filter through. */ if (rq->cmd_type != REQ_TYPE_FS) { + dout("%s: non-fs request type %d\n", __func__, + (int) rq->cmd_type); + __blk_end_request_all(rq, 0); + continue; + } + + /* Ignore/skip any zero-length requests */ + + offset = (u64) blk_rq_pos(rq) << SECTOR_SHIFT; + length = (u64) blk_rq_bytes(rq); + + if (!length) { + dout("%s: zero-length request\n", __func__); __blk_end_request_all(rq, 0); continue; } @@ -1947,9 +1963,6 @@ static void rbd_request_fn(struct request_queue *q) goto end_request; } - offset = (u64) blk_rq_pos(rq) << SECTOR_SHIFT; - length = (u64) blk_rq_bytes(rq); - result = -EINVAL; if (WARN_ON(offset && length > U64_MAX - offset + 1)) goto end_request; /* Shouldn't happen */ -- cgit v1.2.3 From 632b88cadece050ca925d74bda250c4a320c5cc7 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Thu, 21 Feb 2013 10:10:06 -0600 Subject: rbd: barriers are hard Let's go shopping! I'm afraid this may not have gotten it right: 07741308 rbd: add barriers near done flag operations The smp_wmb() should have been done *before* setting the done flag, to ensure all other data was valid before marking the object request done. Switch to use atomic_inc_return() here to set the done flag, which allows us to verify we don't mark something done more than once. Doing this also implies general barriers before and after the call. And although a read memory barrier might have been sufficient before reading the done flag, convert this to a full memory barrier just to put this issue to bed. This resolves: http://tracker.ceph.com/issues/4238 Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 3cc003b26610..bd6078bf99d3 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -1226,13 +1226,22 @@ static void obj_request_done_init(struct rbd_obj_request *obj_request) static void obj_request_done_set(struct rbd_obj_request *obj_request) { - atomic_set(&obj_request->done, 1); - smp_wmb(); + int done; + + done = atomic_inc_return(&obj_request->done); + if (done > 1) { + struct rbd_img_request *img_request = obj_request->img_request; + struct rbd_device *rbd_dev; + + rbd_dev = img_request ? img_request->rbd_dev : NULL; + rbd_warn(rbd_dev, "obj_request %p was already done\n", + obj_request); + } } static bool obj_request_done_test(struct rbd_obj_request *obj_request) { - smp_rmb(); + smp_mb(); return atomic_read(&obj_request->done) != 0; } -- cgit v1.2.3 From 37206ee5bede14d59306fea3af4c0105d4712342 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Wed, 20 Feb 2013 17:32:08 -0600 Subject: rbd: normalize dout() calls Add dout() calls to facilitate tracing of image and object requests. Change a few existing calls so they use __func__ rather than the hard-coded function name. Have calls always add ":" after the name of the function, and prefix pointer values with a consistent tag indicating what it represents. (Note that there remain some older dout() calls that are left untouched by this patch.) Issue a warning if rbd_osd_write_callback() ever gets a short write. This resolves: http://tracker.ceph.com/issues/4235 Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 66 +++++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 61 insertions(+), 5 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index bd6078bf99d3..a9c86ca5889f 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -443,7 +443,7 @@ static struct rbd_client *rbd_client_create(struct ceph_options *ceph_opts) struct rbd_client *rbdc; int ret = -ENOMEM; - dout("rbd_client_create\n"); + dout("%s:\n", __func__); rbdc = kmalloc(sizeof(struct rbd_client), GFP_KERNEL); if (!rbdc) goto out_opt; @@ -467,8 +467,8 @@ static struct rbd_client *rbd_client_create(struct ceph_options *ceph_opts) spin_unlock(&rbd_client_list_lock); mutex_unlock(&ctl_mutex); + dout("%s: rbdc %p\n", __func__, rbdc); - dout("rbd_client_create created %p\n", rbdc); return rbdc; out_err: @@ -479,6 +479,8 @@ out_mutex: out_opt: if (ceph_opts) ceph_destroy_options(ceph_opts); + dout("%s: error %d\n", __func__, ret); + return ERR_PTR(ret); } @@ -605,7 +607,7 @@ static void rbd_client_release(struct kref *kref) { struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref); - dout("rbd_release_client %p\n", rbdc); + dout("%s: rbdc %p\n", __func__, rbdc); spin_lock(&rbd_client_list_lock); list_del(&rbdc->node); spin_unlock(&rbd_client_list_lock); @@ -1064,6 +1066,8 @@ out_err: static void rbd_obj_request_get(struct rbd_obj_request *obj_request) { + dout("%s: obj %p (was %d)\n", __func__, obj_request, + atomic_read(&obj_request->kref.refcount)); kref_get(&obj_request->kref); } @@ -1071,11 +1075,15 @@ static void rbd_obj_request_destroy(struct kref *kref); static void rbd_obj_request_put(struct rbd_obj_request *obj_request) { rbd_assert(obj_request != NULL); + dout("%s: obj %p (was %d)\n", __func__, obj_request, + atomic_read(&obj_request->kref.refcount)); kref_put(&obj_request->kref, rbd_obj_request_destroy); } static void rbd_img_request_get(struct rbd_img_request *img_request) { + dout("%s: img %p (was %d)\n", __func__, img_request, + atomic_read(&img_request->kref.refcount)); kref_get(&img_request->kref); } @@ -1083,6 +1091,8 @@ static void rbd_img_request_destroy(struct kref *kref); static void rbd_img_request_put(struct rbd_img_request *img_request) { rbd_assert(img_request != NULL); + dout("%s: img %p (was %d)\n", __func__, img_request, + atomic_read(&img_request->kref.refcount)); kref_put(&img_request->kref, rbd_img_request_destroy); } @@ -1097,6 +1107,8 @@ static inline void rbd_img_obj_request_add(struct rbd_img_request *img_request, rbd_assert(obj_request->which != BAD_WHICH); img_request->obj_request_count++; list_add_tail(&obj_request->links, &img_request->obj_requests); + dout("%s: img %p obj %p w=%u\n", __func__, img_request, obj_request, + obj_request->which); } static inline void rbd_img_obj_request_del(struct rbd_img_request *img_request, @@ -1104,6 +1116,8 @@ static inline void rbd_img_obj_request_del(struct rbd_img_request *img_request, { rbd_assert(obj_request->which != BAD_WHICH); + dout("%s: img %p obj %p w=%u\n", __func__, img_request, obj_request, + obj_request->which); list_del(&obj_request->links); rbd_assert(img_request->obj_request_count > 0); img_request->obj_request_count--; @@ -1200,11 +1214,14 @@ static void rbd_osd_req_op_destroy(struct ceph_osd_req_op *op) static int rbd_obj_request_submit(struct ceph_osd_client *osdc, struct rbd_obj_request *obj_request) { + dout("%s: osdc %p obj %p\n", __func__, osdc, obj_request); + return ceph_osdc_start_request(osdc, obj_request->osd_req, false); } static void rbd_img_request_complete(struct rbd_img_request *img_request) { + dout("%s: img %p\n", __func__, img_request); if (img_request->callback) img_request->callback(img_request); else @@ -1215,6 +1232,8 @@ static void rbd_img_request_complete(struct rbd_img_request *img_request) static int rbd_obj_request_wait(struct rbd_obj_request *obj_request) { + dout("%s: obj %p\n", __func__, obj_request); + return wait_for_completion_interruptible(&obj_request->completion); } @@ -1248,11 +1267,14 @@ static bool obj_request_done_test(struct rbd_obj_request *obj_request) static void rbd_osd_trivial_callback(struct rbd_obj_request *obj_request, struct ceph_osd_op *op) { + dout("%s: obj %p\n", __func__, obj_request); obj_request_done_set(obj_request); } static void rbd_obj_request_complete(struct rbd_obj_request *obj_request) { + dout("%s: obj %p cb %p\n", __func__, obj_request, + obj_request->callback); if (obj_request->callback) obj_request->callback(obj_request); else @@ -1270,6 +1292,8 @@ static void rbd_osd_read_callback(struct rbd_obj_request *obj_request, */ xferred = le64_to_cpu(op->extent.length); rbd_assert(xferred < (u64) UINT_MAX); + dout("%s: obj %p result %d %llu/%llu\n", __func__, obj_request, + obj_request->result, xferred, obj_request->length); if (obj_request->result == (s32) -ENOENT) { zero_bio_chain(obj_request->bio_list, 0); obj_request->result = 0; @@ -1284,7 +1308,22 @@ static void rbd_osd_read_callback(struct rbd_obj_request *obj_request, static void rbd_osd_write_callback(struct rbd_obj_request *obj_request, struct ceph_osd_op *op) { + obj_request->xferred = le64_to_cpu(op->extent.length); + dout("%s: obj %p result %d %llu/%llu\n", __func__, obj_request, + obj_request->result, obj_request->xferred, obj_request->length); + + /* A short write really shouldn't occur. Warn if we see one */ + + if (obj_request->xferred != obj_request->length) { + struct rbd_img_request *img_request = obj_request->img_request; + struct rbd_device *rbd_dev; + + rbd_dev = img_request ? img_request->rbd_dev : NULL; + rbd_warn(rbd_dev, "wrote %llu want %llu\n", + obj_request->xferred, obj_request->length); + } + obj_request_done_set(obj_request); } @@ -1295,6 +1334,7 @@ static void rbd_osd_write_callback(struct rbd_obj_request *obj_request, static void rbd_osd_stat_callback(struct rbd_obj_request *obj_request, struct ceph_osd_op *op) { + dout("%s: obj %p\n", __func__, obj_request); obj_request_done_set(obj_request); } @@ -1307,6 +1347,7 @@ static void rbd_osd_req_callback(struct ceph_osd_request *osd_req, u32 num_ops; u16 opcode; + dout("%s: osd_req %p msg %p\n", __func__, osd_req, msg); rbd_assert(osd_req == obj_request->osd_req); rbd_assert(!!obj_request->img_request ^ (obj_request->which == BAD_WHICH)); @@ -1453,6 +1494,9 @@ static struct rbd_obj_request *rbd_obj_request_create(const char *object_name, init_completion(&obj_request->completion); kref_init(&obj_request->kref); + dout("%s: \"%s\" %llu/%llu %d -> obj %p\n", __func__, object_name, + offset, length, (int)type, obj_request); + return obj_request; } @@ -1462,6 +1506,8 @@ static void rbd_obj_request_destroy(struct kref *kref) obj_request = container_of(kref, struct rbd_obj_request, kref); + dout("%s: obj %p\n", __func__, obj_request); + rbd_assert(obj_request->img_request == NULL); rbd_assert(obj_request->which == BAD_WHICH); @@ -1531,6 +1577,10 @@ struct rbd_img_request *rbd_img_request_create(struct rbd_device *rbd_dev, rbd_img_request_get(img_request); /* Avoid a warning */ rbd_img_request_put(img_request); /* TEMPORARY */ + dout("%s: rbd_dev %p %s %llu/%llu -> img %p\n", __func__, rbd_dev, + write_request ? "write" : "read", offset, length, + img_request); + return img_request; } @@ -1542,6 +1592,8 @@ static void rbd_img_request_destroy(struct kref *kref) img_request = container_of(kref, struct rbd_img_request, kref); + dout("%s: img %p\n", __func__, img_request); + for_each_obj_request_safe(img_request, obj_request, next_obj_request) rbd_img_obj_request_del(img_request, obj_request); rbd_assert(img_request->obj_request_count == 0); @@ -1563,6 +1615,8 @@ static int rbd_img_request_fill_bio(struct rbd_img_request *img_request, u64 resid; u16 opcode; + dout("%s: img %p bio %p\n", __func__, img_request, bio_list); + opcode = img_request->write_request ? CEPH_OSD_OP_WRITE : CEPH_OSD_OP_READ; bio_offset = 0; @@ -1638,6 +1692,7 @@ static void rbd_img_obj_callback(struct rbd_obj_request *obj_request) img_request = obj_request->img_request; + dout("%s: img %p obj %p\n", __func__, img_request, obj_request); rbd_assert(img_request != NULL); rbd_assert(img_request->rq != NULL); rbd_assert(img_request->obj_request_count > 0); @@ -1685,6 +1740,7 @@ static int rbd_img_request_submit(struct rbd_img_request *img_request) struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; struct rbd_obj_request *obj_request; + dout("%s: img %p\n", __func__, img_request); for_each_obj_request(img_request, obj_request) { int ret; @@ -1745,7 +1801,7 @@ static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data) if (!rbd_dev) return; - dout("rbd_watch_cb %s notify_id=%llu opcode=%u\n", + dout("%s: \"%s\" notify_id %llu opcode %u\n", __func__, rbd_dev->header_name, (unsigned long long) notify_id, (unsigned int) opcode); rc = rbd_dev_refresh(rbd_dev, &hver); @@ -3371,7 +3427,7 @@ static int rbd_dev_snaps_register(struct rbd_device *rbd_dev) struct rbd_snap *snap; int ret = 0; - dout("%s called\n", __func__); + dout("%s:\n", __func__); if (WARN_ON(!device_is_registered(&rbd_dev->dev))) return -EIO; -- cgit v1.2.3 From cc344fa1b541b116190291d366583585f03d0fe6 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Tue, 19 Feb 2013 12:25:56 -0600 Subject: rbd: eliminate sparse warnings Fengguang Wu reminded me that there were outstanding sparse reports in the ceph and rbd code. This patch fixes these problems in rbd that lead to those reports: - Convert functions that are never referenced externally to have static scope. - Add a lockdep annotation to rbd_request_fn(), because it releases a lock before acquiring it again. This partially resolves: http://tracker.ceph.com/issues/4184 Reported-by: Fengguang Wu Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index a9c86ca5889f..c6b15d4b3d73 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -1141,7 +1141,7 @@ static bool obj_request_type_valid(enum obj_request_type type) } } -struct ceph_osd_req_op *rbd_osd_req_op_create(u16 opcode, ...) +static struct ceph_osd_req_op *rbd_osd_req_op_create(u16 opcode, ...) { struct ceph_osd_req_op *op; va_list args; @@ -1537,7 +1537,8 @@ static void rbd_obj_request_destroy(struct kref *kref) * that comprises the image request, and the Linux request pointer * (if there is one). */ -struct rbd_img_request *rbd_img_request_create(struct rbd_device *rbd_dev, +static struct rbd_img_request *rbd_img_request_create( + struct rbd_device *rbd_dev, u64 offset, u64 length, bool write_request) { @@ -1971,6 +1972,7 @@ out: } static void rbd_request_fn(struct request_queue *q) + __releases(q->queue_lock) __acquires(q->queue_lock) { struct rbd_device *rbd_dev = q->queuedata; bool read_only = rbd_dev->mapping.read_only; @@ -2705,7 +2707,7 @@ static void rbd_spec_free(struct kref *kref) kfree(spec); } -struct rbd_device *rbd_dev_create(struct rbd_client *rbdc, +static struct rbd_device *rbd_dev_create(struct rbd_client *rbdc, struct rbd_spec *spec) { struct rbd_device *rbd_dev; @@ -4256,7 +4258,7 @@ static void rbd_sysfs_cleanup(void) device_unregister(&rbd_root_dev); } -int __init rbd_init(void) +static int __init rbd_init(void) { int rc; @@ -4272,7 +4274,7 @@ int __init rbd_init(void) return 0; } -void __exit rbd_exit(void) +static void __exit rbd_exit(void) { rbd_sysfs_cleanup(); } -- cgit v1.2.3 From 39bf2c5d096729939cab657fe641044eceaa84a2 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Tue, 26 Feb 2013 14:23:07 -0600 Subject: rbd: move rbd_osd_trivial_callback() This function is slightly out of place, probably the result of an errant automatic merge or something. Signed-off-by: Alex Elder Reviewed-by: Sage Weil --- drivers/block/rbd.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index c6b15d4b3d73..4f5a647dbfd2 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -1264,13 +1264,6 @@ static bool obj_request_done_test(struct rbd_obj_request *obj_request) return atomic_read(&obj_request->done) != 0; } -static void rbd_osd_trivial_callback(struct rbd_obj_request *obj_request, - struct ceph_osd_op *op) -{ - dout("%s: obj %p\n", __func__, obj_request); - obj_request_done_set(obj_request); -} - static void rbd_obj_request_complete(struct rbd_obj_request *obj_request) { dout("%s: obj %p cb %p\n", __func__, obj_request, @@ -1281,6 +1274,13 @@ static void rbd_obj_request_complete(struct rbd_obj_request *obj_request) complete_all(&obj_request->completion); } +static void rbd_osd_trivial_callback(struct rbd_obj_request *obj_request, + struct ceph_osd_op *op) +{ + dout("%s: obj %p\n", __func__, obj_request); + obj_request_done_set(obj_request); +} + static void rbd_osd_read_callback(struct rbd_obj_request *obj_request, struct ceph_osd_op *op) { -- cgit v1.2.3 From c47f9371545abe2510ac3b66c3fc180921816f65 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Tue, 26 Feb 2013 14:23:07 -0600 Subject: rbd: pass length, not op for osd completions The only thing type-specific osd completion functions do with their osd op parameter is (in some cases) extract the number of bytes transferred from it. In the other cases, the xferred bytes field is not used, and total message data transfer byte count (which may well be zero) is used. Just set the object request transfer count in the main osd request callback function and provide that to the other routines. There is then no longer any need to pass the op pointer to the type-specific completion routines, so drop those parameters. Stop doing anything with the total message data length. Signed-off-by: Alex Elder Reviewed-by: Sage Weil --- drivers/block/rbd.c | 47 ++++++++++++++++++++--------------------------- 1 file changed, 20 insertions(+), 27 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 4f5a647dbfd2..22085e86a409 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -1274,42 +1274,30 @@ static void rbd_obj_request_complete(struct rbd_obj_request *obj_request) complete_all(&obj_request->completion); } -static void rbd_osd_trivial_callback(struct rbd_obj_request *obj_request, - struct ceph_osd_op *op) +static void rbd_osd_trivial_callback(struct rbd_obj_request *obj_request) { dout("%s: obj %p\n", __func__, obj_request); obj_request_done_set(obj_request); } -static void rbd_osd_read_callback(struct rbd_obj_request *obj_request, - struct ceph_osd_op *op) +static void rbd_osd_read_callback(struct rbd_obj_request *obj_request) { - u64 xferred; - /* - * We support a 64-bit length, but ultimately it has to be - * passed to blk_end_request(), which takes an unsigned int. - */ - xferred = le64_to_cpu(op->extent.length); - rbd_assert(xferred < (u64) UINT_MAX); dout("%s: obj %p result %d %llu/%llu\n", __func__, obj_request, - obj_request->result, xferred, obj_request->length); + obj_request->result, obj_request->xferred, obj_request->length); if (obj_request->result == (s32) -ENOENT) { zero_bio_chain(obj_request->bio_list, 0); obj_request->result = 0; - } else if (xferred < obj_request->length && !obj_request->result) { - zero_bio_chain(obj_request->bio_list, xferred); - xferred = obj_request->length; + } else if (obj_request->xferred < obj_request->length && + !obj_request->result) { + zero_bio_chain(obj_request->bio_list, obj_request->xferred); + obj_request->xferred = obj_request->length; } - obj_request->xferred = xferred; obj_request_done_set(obj_request); } -static void rbd_osd_write_callback(struct rbd_obj_request *obj_request, - struct ceph_osd_op *op) +static void rbd_osd_write_callback(struct rbd_obj_request *obj_request) { - - obj_request->xferred = le64_to_cpu(op->extent.length); dout("%s: obj %p result %d %llu/%llu\n", __func__, obj_request, obj_request->result, obj_request->xferred, obj_request->length); @@ -1331,8 +1319,7 @@ static void rbd_osd_write_callback(struct rbd_obj_request *obj_request, * For a simple stat call there's nothing to do. We'll do more if * this is part of a write sequence for a layered image. */ -static void rbd_osd_stat_callback(struct rbd_obj_request *obj_request, - struct ceph_osd_op *op) +static void rbd_osd_stat_callback(struct rbd_obj_request *obj_request) { dout("%s: obj %p\n", __func__, obj_request); obj_request_done_set(obj_request); @@ -1352,7 +1339,6 @@ static void rbd_osd_req_callback(struct ceph_osd_request *osd_req, rbd_assert(!!obj_request->img_request ^ (obj_request->which == BAD_WHICH)); - obj_request->xferred = le32_to_cpu(msg->hdr.data_len); reply_head = msg->front.iov_base; obj_request->result = (s32) le32_to_cpu(reply_head->result); obj_request->version = le64_to_cpu(osd_req->r_reassert_version.version); @@ -1360,22 +1346,29 @@ static void rbd_osd_req_callback(struct ceph_osd_request *osd_req, num_ops = le32_to_cpu(reply_head->num_ops); WARN_ON(num_ops != 1); /* For now */ + /* + * We support a 64-bit length, but ultimately it has to be + * passed to blk_end_request(), which takes an unsigned int. + */ op = &reply_head->ops[0]; + obj_request->xferred = le64_to_cpu(op->extent.length); + rbd_assert(obj_request->xferred < (u64) UINT_MAX); + opcode = le16_to_cpu(op->op); switch (opcode) { case CEPH_OSD_OP_READ: - rbd_osd_read_callback(obj_request, op); + rbd_osd_read_callback(obj_request); break; case CEPH_OSD_OP_WRITE: - rbd_osd_write_callback(obj_request, op); + rbd_osd_write_callback(obj_request); break; case CEPH_OSD_OP_STAT: - rbd_osd_stat_callback(obj_request, op); + rbd_osd_stat_callback(obj_request); break; case CEPH_OSD_OP_CALL: case CEPH_OSD_OP_NOTIFY_ACK: case CEPH_OSD_OP_WATCH: - rbd_osd_trivial_callback(obj_request, op); + rbd_osd_trivial_callback(obj_request); break; default: rbd_warn(NULL, "%s: unsupported op %hu\n", -- cgit v1.2.3 From 1b83bef24c6746a146d39915a18fb5425f2facb0 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Mon, 25 Feb 2013 16:11:12 -0800 Subject: libceph: update osd request/reply encoding Use the new version of the encoding for osd requests and replies. In the process, update the way we are tracking request ops and reply lengths and results in the struct ceph_osd_request. Update the rbd and fs/ceph users appropriately. The main changes are: - we keep pointers into the request memory for fields we need to update each time the request is sent out over the wire - we keep information about the result in an array in the request struct where the users can easily get at it. Signed-off-by: Sage Weil Reviewed-by: Alex Elder --- drivers/block/rbd.c | 52 +++++---- fs/ceph/addr.c | 31 ++---- include/linux/ceph/osd_client.h | 19 +++- include/linux/ceph/rados.h | 38 ------- net/ceph/debugfs.c | 18 +--- net/ceph/osd_client.c | 233 +++++++++++++++++++++++++++++----------- 6 files changed, 222 insertions(+), 169 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 22085e86a409..6c81a4c040b9 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -196,7 +196,7 @@ struct rbd_obj_request { u64 xferred; /* bytes transferred */ u64 version; - s32 result; + int result; atomic_t done; rbd_obj_callback_t callback; @@ -1282,12 +1282,19 @@ static void rbd_osd_trivial_callback(struct rbd_obj_request *obj_request) static void rbd_osd_read_callback(struct rbd_obj_request *obj_request) { - dout("%s: obj %p result %d %llu/%llu\n", __func__, obj_request, obj_request->result, obj_request->xferred, obj_request->length); - if (obj_request->result == (s32) -ENOENT) { + /* + * ENOENT means a hole in the object. We zero-fill the + * entire length of the request. A short read also implies + * zero-fill to the end of the request. Either way we + * update the xferred count to indicate the whole request + * was satisfied. + */ + if (obj_request->result == -ENOENT) { zero_bio_chain(obj_request->bio_list, 0); obj_request->result = 0; + obj_request->xferred = obj_request->length; } else if (obj_request->xferred < obj_request->length && !obj_request->result) { zero_bio_chain(obj_request->bio_list, obj_request->xferred); @@ -1298,20 +1305,14 @@ static void rbd_osd_read_callback(struct rbd_obj_request *obj_request) static void rbd_osd_write_callback(struct rbd_obj_request *obj_request) { - dout("%s: obj %p result %d %llu/%llu\n", __func__, obj_request, - obj_request->result, obj_request->xferred, obj_request->length); - - /* A short write really shouldn't occur. Warn if we see one */ - - if (obj_request->xferred != obj_request->length) { - struct rbd_img_request *img_request = obj_request->img_request; - struct rbd_device *rbd_dev; - - rbd_dev = img_request ? img_request->rbd_dev : NULL; - rbd_warn(rbd_dev, "wrote %llu want %llu\n", - obj_request->xferred, obj_request->length); - } - + dout("%s: obj %p result %d %llu\n", __func__, obj_request, + obj_request->result, obj_request->length); + /* + * There is no such thing as a successful short write. + * Our xferred value is the number of bytes transferred + * back. Set it to our originally-requested length. + */ + obj_request->xferred = obj_request->length; obj_request_done_set(obj_request); } @@ -1329,9 +1330,6 @@ static void rbd_osd_req_callback(struct ceph_osd_request *osd_req, struct ceph_msg *msg) { struct rbd_obj_request *obj_request = osd_req->r_priv; - struct ceph_osd_reply_head *reply_head; - struct ceph_osd_op *op; - u32 num_ops; u16 opcode; dout("%s: osd_req %p msg %p\n", __func__, osd_req, msg); @@ -1339,22 +1337,19 @@ static void rbd_osd_req_callback(struct ceph_osd_request *osd_req, rbd_assert(!!obj_request->img_request ^ (obj_request->which == BAD_WHICH)); - reply_head = msg->front.iov_base; - obj_request->result = (s32) le32_to_cpu(reply_head->result); + if (osd_req->r_result < 0) + obj_request->result = osd_req->r_result; obj_request->version = le64_to_cpu(osd_req->r_reassert_version.version); - num_ops = le32_to_cpu(reply_head->num_ops); - WARN_ON(num_ops != 1); /* For now */ + WARN_ON(osd_req->r_num_ops != 1); /* For now */ /* * We support a 64-bit length, but ultimately it has to be * passed to blk_end_request(), which takes an unsigned int. */ - op = &reply_head->ops[0]; - obj_request->xferred = le64_to_cpu(op->extent.length); + obj_request->xferred = osd_req->r_reply_op_len[0]; rbd_assert(obj_request->xferred < (u64) UINT_MAX); - - opcode = le16_to_cpu(op->op); + opcode = osd_req->r_request_ops[0].op; switch (opcode) { case CEPH_OSD_OP_READ: rbd_osd_read_callback(obj_request); @@ -1719,6 +1714,7 @@ static void rbd_img_obj_callback(struct rbd_obj_request *obj_request) more = blk_end_request(img_request->rq, result, xferred); which++; } + rbd_assert(more ^ (which == img_request->obj_request_count)); img_request->next_completion = which; out: diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index fc613715af46..cfef3e01a9b3 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -236,16 +236,10 @@ static int ceph_readpage(struct file *filp, struct page *page) static void finish_read(struct ceph_osd_request *req, struct ceph_msg *msg) { struct inode *inode = req->r_inode; - struct ceph_osd_reply_head *replyhead; - int rc, bytes; + int rc = req->r_result; + int bytes = le32_to_cpu(msg->hdr.data_len); int i; - /* parse reply */ - replyhead = msg->front.iov_base; - WARN_ON(le32_to_cpu(replyhead->num_ops) == 0); - rc = le32_to_cpu(replyhead->result); - bytes = le32_to_cpu(msg->hdr.data_len); - dout("finish_read %p req %p rc %d bytes %d\n", inode, req, rc, bytes); /* unlock all pages, zeroing any data we didn't read */ @@ -553,27 +547,18 @@ static void writepages_finish(struct ceph_osd_request *req, struct ceph_msg *msg) { struct inode *inode = req->r_inode; - struct ceph_osd_reply_head *replyhead; - struct ceph_osd_op *op; struct ceph_inode_info *ci = ceph_inode(inode); unsigned wrote; struct page *page; int i; struct ceph_snap_context *snapc = req->r_snapc; struct address_space *mapping = inode->i_mapping; - __s32 rc = -EIO; - u64 bytes = 0; + int rc = req->r_result; + u64 bytes = le64_to_cpu(req->r_request_ops[0].extent.length); struct ceph_fs_client *fsc = ceph_inode_to_client(inode); long writeback_stat; unsigned issued = ceph_caps_issued(ci); - /* parse reply */ - replyhead = msg->front.iov_base; - WARN_ON(le32_to_cpu(replyhead->num_ops) == 0); - op = (void *)(replyhead + 1); - rc = le32_to_cpu(replyhead->result); - bytes = le64_to_cpu(op->extent.length); - if (rc >= 0) { /* * Assume we wrote the pages we originally sent. The @@ -740,8 +725,6 @@ retry: struct page *page; int want; u64 offset, len; - struct ceph_osd_request_head *reqhead; - struct ceph_osd_op *op; long writeback_stat; next = 0; @@ -905,10 +888,8 @@ get_more_pages: /* revise final length, page count */ req->r_num_pages = locked_pages; - reqhead = req->r_request->front.iov_base; - op = (void *)(reqhead + 1); - op->extent.length = cpu_to_le64(len); - op->payload_len = cpu_to_le32(len); + req->r_request_ops[0].extent.length = cpu_to_le64(len); + req->r_request_ops[0].payload_len = cpu_to_le32(len); req->r_request->hdr.data_len = cpu_to_le32(len); rc = ceph_osdc_start_request(&fsc->client->osdc, req, true); diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index ad8899fc3157..1dd5d466b6f9 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -47,6 +47,9 @@ struct ceph_osd { struct list_head o_keepalive_item; }; + +#define CEPH_OSD_MAX_OP 10 + /* an in-flight request */ struct ceph_osd_request { u64 r_tid; /* unique for this client */ @@ -63,9 +66,23 @@ struct ceph_osd_request { struct ceph_connection *r_con_filling_msg; struct ceph_msg *r_request, *r_reply; - int r_result; int r_flags; /* any additional flags for the osd */ u32 r_sent; /* >0 if r_request is sending/sent */ + int r_num_ops; + + /* encoded message content */ + struct ceph_osd_op *r_request_ops; + /* these are updated on each send */ + __le32 *r_request_osdmap_epoch; + __le32 *r_request_flags; + __le64 *r_request_pool; + void *r_request_pgid; + __le32 *r_request_attempts; + struct ceph_eversion *r_request_reassert_version; + + int r_result; + int r_reply_op_len[CEPH_OSD_MAX_OP]; + s32 r_reply_op_result[CEPH_OSD_MAX_OP]; int r_got_reply; int r_linger; diff --git a/include/linux/ceph/rados.h b/include/linux/ceph/rados.h index d784c8dfb09a..68c96a508ac2 100644 --- a/include/linux/ceph/rados.h +++ b/include/linux/ceph/rados.h @@ -416,43 +416,5 @@ struct ceph_osd_op { __le32 payload_len; } __attribute__ ((packed)); -/* - * osd request message header. each request may include multiple - * ceph_osd_op object operations. - */ -struct ceph_osd_request_head { - __le32 client_inc; /* client incarnation */ - struct ceph_object_layout layout; /* pgid */ - __le32 osdmap_epoch; /* client's osdmap epoch */ - - __le32 flags; - - struct ceph_timespec mtime; /* for mutations only */ - struct ceph_eversion reassert_version; /* if we are replaying op */ - - __le32 object_len; /* length of object name */ - - __le64 snapid; /* snapid to read */ - __le64 snap_seq; /* writer's snap context */ - __le32 num_snaps; - - __le16 num_ops; - struct ceph_osd_op ops[]; /* followed by ops[], obj, ticket, snaps */ -} __attribute__ ((packed)); - -struct ceph_osd_reply_head { - __le32 client_inc; /* client incarnation */ - __le32 flags; - struct ceph_object_layout layout; - __le32 osdmap_epoch; - struct ceph_eversion reassert_version; /* for replaying uncommitted */ - - __le32 result; /* result code */ - - __le32 object_len; /* length of object name */ - __le32 num_ops; - struct ceph_osd_op ops[0]; /* ops[], object */ -} __attribute__ ((packed)); - #endif diff --git a/net/ceph/debugfs.c b/net/ceph/debugfs.c index f4d4b27d6026..00d051f4894e 100644 --- a/net/ceph/debugfs.c +++ b/net/ceph/debugfs.c @@ -123,10 +123,7 @@ static int osdc_show(struct seq_file *s, void *pp) mutex_lock(&osdc->request_mutex); for (p = rb_first(&osdc->requests); p; p = rb_next(p)) { struct ceph_osd_request *req; - struct ceph_osd_request_head *head; - struct ceph_osd_op *op; - int num_ops; - int opcode, olen; + int opcode; int i; req = rb_entry(p, struct ceph_osd_request, r_node); @@ -135,13 +132,7 @@ static int osdc_show(struct seq_file *s, void *pp) req->r_osd ? req->r_osd->o_osd : -1, req->r_pgid.pool, req->r_pgid.seed); - head = req->r_request->front.iov_base; - op = (void *)(head + 1); - - num_ops = le16_to_cpu(head->num_ops); - olen = le32_to_cpu(head->object_len); - seq_printf(s, "%.*s", olen, - (const char *)(head->ops + num_ops)); + seq_printf(s, "%.*s", req->r_oid_len, req->r_oid); if (req->r_reassert_version.epoch) seq_printf(s, "\t%u'%llu", @@ -150,10 +141,9 @@ static int osdc_show(struct seq_file *s, void *pp) else seq_printf(s, "\t"); - for (i = 0; i < num_ops; i++) { - opcode = le16_to_cpu(op->op); + for (i = 0; i < req->r_num_ops; i++) { + opcode = le16_to_cpu(req->r_request_ops[i].op); seq_printf(s, "\t%s", ceph_osd_op_name(opcode)); - op++; } seq_printf(s, "\n"); diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 5584f0a08e28..d730dd4d8eb2 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -146,15 +146,23 @@ EXPORT_SYMBOL(ceph_osdc_release_request); struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc, struct ceph_snap_context *snapc, - unsigned int num_op, + unsigned int num_ops, bool use_mempool, gfp_t gfp_flags) { struct ceph_osd_request *req; struct ceph_msg *msg; - size_t msg_size = sizeof(struct ceph_osd_request_head); - - msg_size += num_op*sizeof(struct ceph_osd_op); + size_t msg_size; + + msg_size = 4 + 4 + 8 + 8 + 4+8; + msg_size += 2 + 4 + 8 + 4 + 4; /* oloc */ + msg_size += 1 + 8 + 4 + 4; /* pg_t */ + msg_size += 4 + MAX_OBJ_NAME_SIZE; + msg_size += 2 + num_ops*sizeof(struct ceph_osd_op); + msg_size += 8; /* snapid */ + msg_size += 8; /* snap_seq */ + msg_size += 8 * (snapc ? snapc->num_snaps : 0); /* snaps */ + msg_size += 4; if (use_mempool) { req = mempool_alloc(osdc->req_mempool, gfp_flags); @@ -193,9 +201,6 @@ struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc, ceph_pagelist_init(&req->r_trail); /* create request message; allow space for oid */ - msg_size += MAX_OBJ_NAME_SIZE; - if (snapc) - msg_size += sizeof(u64) * snapc->num_snaps; if (use_mempool) msg = ceph_msgpool_get(&osdc->msgpool_op, 0); else @@ -324,55 +329,80 @@ static void osd_req_encode_op(struct ceph_osd_request *req, * */ void ceph_osdc_build_request(struct ceph_osd_request *req, - u64 off, u64 len, unsigned int num_op, + u64 off, u64 len, unsigned int num_ops, struct ceph_osd_req_op *src_ops, struct ceph_snap_context *snapc, u64 snap_id, struct timespec *mtime) { struct ceph_msg *msg = req->r_request; - struct ceph_osd_request_head *head; struct ceph_osd_req_op *src_op; - struct ceph_osd_op *op; void *p; - size_t msg_size = sizeof(*head) + num_op*sizeof(*op); + size_t msg_size; int flags = req->r_flags; u64 data_len; int i; - WARN_ON((flags & (CEPH_OSD_FLAG_READ|CEPH_OSD_FLAG_WRITE)) == 0); - - head = msg->front.iov_base; - head->snapid = cpu_to_le64(snap_id); - op = (void *)(head + 1); - p = (void *)(op + num_op); - + req->r_num_ops = num_ops; + req->r_snapid = snap_id; req->r_snapc = ceph_get_snap_context(snapc); - head->client_inc = cpu_to_le32(1); /* always, for now. */ - head->flags = cpu_to_le32(flags); - if (flags & CEPH_OSD_FLAG_WRITE) - ceph_encode_timespec(&head->mtime, mtime); - BUG_ON(num_op > (unsigned int) ((u16) -1)); - head->num_ops = cpu_to_le16(num_op); + /* encode request */ + msg->hdr.version = cpu_to_le16(4); - /* fill in oid */ - head->object_len = cpu_to_le32(req->r_oid_len); + p = msg->front.iov_base; + ceph_encode_32(&p, 1); /* client_inc is always 1 */ + req->r_request_osdmap_epoch = p; + p += 4; + req->r_request_flags = p; + p += 4; + if (req->r_flags & CEPH_OSD_FLAG_WRITE) + ceph_encode_timespec(p, mtime); + p += sizeof(struct ceph_timespec); + req->r_request_reassert_version = p; + p += sizeof(struct ceph_eversion); /* will get filled in */ + + /* oloc */ + ceph_encode_8(&p, 4); + ceph_encode_8(&p, 4); + ceph_encode_32(&p, 8 + 4 + 4); + req->r_request_pool = p; + p += 8; + ceph_encode_32(&p, -1); /* preferred */ + ceph_encode_32(&p, 0); /* key len */ + + ceph_encode_8(&p, 1); + req->r_request_pgid = p; + p += 8 + 4; + ceph_encode_32(&p, -1); /* preferred */ + + /* oid */ + ceph_encode_32(&p, req->r_oid_len); memcpy(p, req->r_oid, req->r_oid_len); + dout("oid '%.*s' len %d\n", req->r_oid_len, req->r_oid, req->r_oid_len); p += req->r_oid_len; + /* ops */ + ceph_encode_16(&p, num_ops); src_op = src_ops; - while (num_op--) - osd_req_encode_op(req, op++, src_op++); + req->r_request_ops = p; + for (i = 0; i < num_ops; i++, src_op++) { + osd_req_encode_op(req, p, src_op); + p += sizeof(struct ceph_osd_op); + } - if (snapc) { - head->snap_seq = cpu_to_le64(snapc->seq); - head->num_snaps = cpu_to_le32(snapc->num_snaps); + /* snaps */ + ceph_encode_64(&p, req->r_snapid); + ceph_encode_64(&p, req->r_snapc ? req->r_snapc->seq : 0); + ceph_encode_32(&p, req->r_snapc ? req->r_snapc->num_snaps : 0); + if (req->r_snapc) { for (i = 0; i < snapc->num_snaps; i++) { - put_unaligned_le64(snapc->snaps[i], p); - p += sizeof(u64); + ceph_encode_64(&p, req->r_snapc->snaps[i]); } } + req->r_request_attempts = p; + p += 4; + data_len = req->r_trail.length; if (flags & CEPH_OSD_FLAG_WRITE) { req->r_request->hdr.data_off = cpu_to_le16(off); @@ -385,6 +415,9 @@ void ceph_osdc_build_request(struct ceph_osd_request *req, msg_size = p - msg->front.iov_base; msg->front.iov_len = msg_size; msg->hdr.front_len = cpu_to_le32(msg_size); + + dout("build_request msg_size was %d num_ops %d\n", (int)msg_size, + num_ops); return; } EXPORT_SYMBOL(ceph_osdc_build_request); @@ -991,21 +1024,22 @@ out: static void __send_request(struct ceph_osd_client *osdc, struct ceph_osd_request *req) { - struct ceph_osd_request_head *reqhead; - - dout("send_request %p tid %llu to osd%d flags %d\n", - req, req->r_tid, req->r_osd->o_osd, req->r_flags); - - reqhead = req->r_request->front.iov_base; - reqhead->snapid = cpu_to_le64(req->r_snapid); - reqhead->osdmap_epoch = cpu_to_le32(osdc->osdmap->epoch); - reqhead->flags |= cpu_to_le32(req->r_flags); /* e.g., RETRY */ - reqhead->reassert_version = req->r_reassert_version; + void *p; - reqhead->layout.ol_pgid.ps = cpu_to_le16(req->r_pgid.seed); - reqhead->layout.ol_pgid.pool = cpu_to_le32(req->r_pgid.pool); - reqhead->layout.ol_pgid.preferred = cpu_to_le16(-1); - reqhead->layout.ol_stripe_unit = 0; + dout("send_request %p tid %llu to osd%d flags %d pg %lld.%x\n", + req, req->r_tid, req->r_osd->o_osd, req->r_flags, + (unsigned long long)req->r_pgid.pool, req->r_pgid.seed); + + /* fill in message content that changes each time we send it */ + put_unaligned_le32(osdc->osdmap->epoch, req->r_request_osdmap_epoch); + put_unaligned_le32(req->r_flags, req->r_request_flags); + put_unaligned_le64(req->r_pgid.pool, req->r_request_pool); + p = req->r_request_pgid; + ceph_encode_64(&p, req->r_pgid.pool); + ceph_encode_32(&p, req->r_pgid.seed); + put_unaligned_le64(1, req->r_request_attempts); /* FIXME */ + memcpy(req->r_request_reassert_version, &req->r_reassert_version, + sizeof(req->r_reassert_version)); req->r_stamp = jiffies; list_move_tail(&req->r_req_lru_item, &osdc->req_lru); @@ -1105,6 +1139,26 @@ static void complete_request(struct ceph_osd_request *req) complete_all(&req->r_safe_completion); /* fsync waiter */ } +static int __decode_pgid(void **p, void *end, struct ceph_pg *pgid) +{ + __u8 v; + + ceph_decode_need(p, end, 1 + 8 + 4 + 4, bad); + v = ceph_decode_8(p); + if (v > 1) { + pr_warning("do not understand pg encoding %d > 1", v); + return -EINVAL; + } + pgid->pool = ceph_decode_64(p); + pgid->seed = ceph_decode_32(p); + *p += 4; + return 0; + +bad: + pr_warning("incomplete pg encoding"); + return -EINVAL; +} + /* * handle osd op reply. either call the callback if it is specified, * or do the completion to wake up the waiting thread. @@ -1112,22 +1166,42 @@ static void complete_request(struct ceph_osd_request *req) static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg, struct ceph_connection *con) { - struct ceph_osd_reply_head *rhead = msg->front.iov_base; + void *p, *end; struct ceph_osd_request *req; u64 tid; - int numops, object_len, flags; + int object_len; + int numops, payload_len, flags; s32 result; + s32 retry_attempt; + struct ceph_pg pg; + int err; + u32 reassert_epoch; + u64 reassert_version; + u32 osdmap_epoch; + int i; tid = le64_to_cpu(msg->hdr.tid); - if (msg->front.iov_len < sizeof(*rhead)) - goto bad; - numops = le32_to_cpu(rhead->num_ops); - object_len = le32_to_cpu(rhead->object_len); - result = le32_to_cpu(rhead->result); - if (msg->front.iov_len != sizeof(*rhead) + object_len + - numops * sizeof(struct ceph_osd_op)) + dout("handle_reply %p tid %llu\n", msg, tid); + + p = msg->front.iov_base; + end = p + msg->front.iov_len; + + ceph_decode_need(&p, end, 4, bad); + object_len = ceph_decode_32(&p); + ceph_decode_need(&p, end, object_len, bad); + p += object_len; + + err = __decode_pgid(&p, end, &pg); + if (err) goto bad; - dout("handle_reply %p tid %llu result %d\n", msg, tid, (int)result); + + ceph_decode_need(&p, end, 8 + 4 + 4 + 8 + 4, bad); + flags = ceph_decode_64(&p); + result = ceph_decode_32(&p); + reassert_epoch = ceph_decode_32(&p); + reassert_version = ceph_decode_64(&p); + osdmap_epoch = ceph_decode_32(&p); + /* lookup */ mutex_lock(&osdc->request_mutex); req = __lookup_request(osdc, tid); @@ -1137,7 +1211,38 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg, return; } ceph_osdc_get_request(req); - flags = le32_to_cpu(rhead->flags); + + dout("handle_reply %p tid %llu req %p result %d\n", msg, tid, + req, result); + + ceph_decode_need(&p, end, 4, bad); + numops = ceph_decode_32(&p); + if (numops > CEPH_OSD_MAX_OP) + goto bad_put; + if (numops != req->r_num_ops) + goto bad_put; + payload_len = 0; + ceph_decode_need(&p, end, numops * sizeof(struct ceph_osd_op), bad); + for (i = 0; i < numops; i++) { + struct ceph_osd_op *op = p; + int len; + + len = le32_to_cpu(op->payload_len); + req->r_reply_op_len[i] = len; + dout(" op %d has %d bytes\n", i, len); + payload_len += len; + p += sizeof(*op); + } + if (payload_len != le32_to_cpu(msg->hdr.data_len)) { + pr_warning("sum of op payload lens %d != data_len %d", + payload_len, le32_to_cpu(msg->hdr.data_len)); + goto bad_put; + } + + ceph_decode_need(&p, end, 4 + numops * 4, bad); + retry_attempt = ceph_decode_32(&p); + for (i = 0; i < numops; i++) + req->r_reply_op_result[i] = ceph_decode_32(&p); /* * if this connection filled our message, drop our reference now, to @@ -1152,7 +1257,7 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg, if (!req->r_got_reply) { unsigned int bytes; - req->r_result = le32_to_cpu(rhead->result); + req->r_result = result; bytes = le32_to_cpu(msg->hdr.data_len); dout("handle_reply result %d bytes %d\n", req->r_result, bytes); @@ -1160,7 +1265,8 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg, req->r_result = bytes; /* in case this is a write and we need to replay, */ - req->r_reassert_version = rhead->reassert_version; + req->r_reassert_version.epoch = cpu_to_le32(reassert_epoch); + req->r_reassert_version.version = cpu_to_le64(reassert_version); req->r_got_reply = 1; } else if ((flags & CEPH_OSD_FLAG_ONDISK) == 0) { @@ -1195,10 +1301,11 @@ done: ceph_osdc_put_request(req); return; +bad_put: + ceph_osdc_put_request(req); bad: - pr_err("corrupt osd_op_reply got %d %d expected %d\n", - (int)msg->front.iov_len, le32_to_cpu(msg->hdr.front_len), - (int)sizeof(*rhead)); + pr_err("corrupt osd_op_reply got %d %d\n", + (int)msg->front.iov_len, le32_to_cpu(msg->hdr.front_len)); ceph_msg_dump(msg); } -- cgit v1.2.3