diff options
author | Ilya Dryomov <idryomov@gmail.com> | 2020-05-23 11:45:48 +0200 |
---|---|---|
committer | Ilya Dryomov <idryomov@gmail.com> | 2020-06-01 13:22:53 +0200 |
commit | 117d96a04f007ce8fc2e292369056c3bd09f6f63 (patch) | |
tree | 8fb67a4f204be96d285341aff9bffce670ce2930 /net/ceph/osd_client.c | |
parent | libceph: crush_location infrastructure (diff) | |
download | linux-117d96a04f007ce8fc2e292369056c3bd09f6f63.tar.xz linux-117d96a04f007ce8fc2e292369056c3bd09f6f63.zip |
libceph: support for balanced and localized reads
OSD-side issues with reads from replica have been resolved in
Octopus. Reading from replica should be safe wrt. unstable or
uncommitted state now, so add support for balanced and localized
reads.
There are two cases when a read from replica can't be served:
- OSD may silently drop the request, expecting the client to
notice that the acting set has changed and resend via the usual
means (handled with t->used_replica)
- OSD may return EAGAIN, expecting the client to resend to the
primary, ignoring replica read flags (see handle_reply())
Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Diffstat (limited to 'net/ceph/osd_client.c')
-rw-r--r-- | net/ceph/osd_client.c | 87 |
1 files changed, 83 insertions, 4 deletions
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index ece124a5138e..4ce6cdc744e4 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -1497,6 +1497,45 @@ static bool target_should_be_paused(struct ceph_osd_client *osdc, (osdc->osdmap->epoch < osdc->epoch_barrier); } +static int pick_random_replica(const struct ceph_osds *acting) +{ + int i = prandom_u32() % acting->size; + + dout("%s picked osd%d, primary osd%d\n", __func__, + acting->osds[i], acting->primary); + return i; +} + +/* + * Picks the closest replica based on client's location given by + * crush_location option. Prefers the primary if the locality is + * the same. + */ +static int pick_closest_replica(struct ceph_osd_client *osdc, + const struct ceph_osds *acting) +{ + struct ceph_options *opt = osdc->client->options; + int best_i, best_locality; + int i = 0, locality; + + do { + locality = ceph_get_crush_locality(osdc->osdmap, + acting->osds[i], + &opt->crush_locs); + if (i == 0 || + (locality >= 0 && best_locality < 0) || + (locality >= 0 && best_locality >= 0 && + locality < best_locality)) { + best_i = i; + best_locality = locality; + } + } while (++i < acting->size); + + dout("%s picked osd%d with locality %d, primary osd%d\n", __func__, + acting->osds[best_i], best_locality, acting->primary); + return best_i; +} + enum calc_target_result { CALC_TARGET_NO_ACTION = 0, CALC_TARGET_NEED_RESEND, @@ -1510,6 +1549,8 @@ static enum calc_target_result calc_target(struct ceph_osd_client *osdc, struct ceph_pg_pool_info *pi; struct ceph_pg pgid, last_pgid; struct ceph_osds up, acting; + bool is_read = t->flags & CEPH_OSD_FLAG_READ; + bool is_write = t->flags & CEPH_OSD_FLAG_WRITE; bool force_resend = false; bool unpaused = false; bool legacy_change = false; @@ -1540,9 +1581,9 @@ static enum calc_target_result calc_target(struct ceph_osd_client *osdc, ceph_oid_copy(&t->target_oid, &t->base_oid); ceph_oloc_copy(&t->target_oloc, &t->base_oloc); if ((t->flags & CEPH_OSD_FLAG_IGNORE_OVERLAY) == 0) { - if (t->flags & CEPH_OSD_FLAG_READ && pi->read_tier >= 0) + if (is_read && pi->read_tier >= 0) t->target_oloc.pool = pi->read_tier; - if (t->flags & CEPH_OSD_FLAG_WRITE && pi->write_tier >= 0) + if (is_write && pi->write_tier >= 0) t->target_oloc.pool = pi->write_tier; pi = ceph_pg_pool_by_id(osdc->osdmap, t->target_oloc.pool); @@ -1581,7 +1622,8 @@ static enum calc_target_result calc_target(struct ceph_osd_client *osdc, unpaused = true; } legacy_change = ceph_pg_compare(&t->pgid, &pgid) || - ceph_osds_changed(&t->acting, &acting, any_change); + ceph_osds_changed(&t->acting, &acting, + t->used_replica || any_change); if (t->pg_num) split = ceph_pg_is_split(&last_pgid, t->pg_num, pi->pg_num); @@ -1597,7 +1639,24 @@ static enum calc_target_result calc_target(struct ceph_osd_client *osdc, t->sort_bitwise = sort_bitwise; t->recovery_deletes = recovery_deletes; - t->osd = acting.primary; + if ((t->flags & (CEPH_OSD_FLAG_BALANCE_READS | + CEPH_OSD_FLAG_LOCALIZE_READS)) && + !is_write && pi->type == CEPH_POOL_TYPE_REP && + acting.size > 1) { + int pos; + + WARN_ON(!is_read || acting.osds[0] != acting.primary); + if (t->flags & CEPH_OSD_FLAG_BALANCE_READS) { + pos = pick_random_replica(&acting); + } else { + pos = pick_closest_replica(osdc, &acting); + } + t->osd = acting.osds[pos]; + t->used_replica = pos > 0; + } else { + t->osd = acting.primary; + t->used_replica = false; + } } if (unpaused || legacy_change || force_resend || split) @@ -3660,6 +3719,26 @@ static void handle_reply(struct ceph_osd *osd, struct ceph_msg *msg) goto out_unlock_osdc; } + if (m.result == -EAGAIN) { + dout("req %p tid %llu EAGAIN\n", req, req->r_tid); + unlink_request(osd, req); + mutex_unlock(&osd->lock); + + /* + * The object is missing on the replica or not (yet) + * readable. Clear pgid to force a resend to the primary + * via legacy_change. + */ + req->r_t.pgid.pool = 0; + req->r_t.pgid.seed = 0; + WARN_ON(!req->r_t.used_replica); + req->r_flags &= ~(CEPH_OSD_FLAG_BALANCE_READS | + CEPH_OSD_FLAG_LOCALIZE_READS); + req->r_tid = 0; + __submit_request(req, false); + goto out_unlock_osdc; + } + if (m.num_ops != req->r_num_ops) { pr_err("num_ops %d != %d for tid %llu\n", m.num_ops, req->r_num_ops, req->r_tid); |