summaryrefslogtreecommitdiffstats
path: root/include
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2017-07-11 21:12:28 +0200
committerLinus Torvalds <torvalds@linux-foundation.org>2017-07-11 21:12:28 +0200
commit3bf7878f0f7d60c394f6d6631bb179e86f09f73c (patch)
treef998ef959865db1657baa410c4ed281ad9003183 /include
parentMerge git://www.linux-watchdog.org/linux-watchdog (diff)
parentlibceph: advertise support for NEW_OSDOP_ENCODING and SERVER_LUMINOUS (diff)
downloadlinux-3bf7878f0f7d60c394f6d6631bb179e86f09f73c.tar.xz
linux-3bf7878f0f7d60c394f6d6631bb179e86f09f73c.zip
Merge tag 'ceph-for-4.13-rc1' of git://github.com/ceph/ceph-client
Pull ceph updates from Ilya Dryomov: "The main item here is support for v12.y.z ("Luminous") clusters: RESEND_ON_SPLIT, RADOS_BACKOFF, OSDMAP_PG_UPMAP and CRUSH_CHOOSE_ARGS feature bits, and various other changes in the RADOS client protocol. On top of that we have a new fsc mount option to allow supplying fscache uniquifier (similar to NFS) and the usual pile of filesystem fixes from Zheng" * tag 'ceph-for-4.13-rc1' of git://github.com/ceph/ceph-client: (44 commits) libceph: advertise support for NEW_OSDOP_ENCODING and SERVER_LUMINOUS libceph: osd_state is 32 bits wide in luminous crush: remove an obsolete comment crush: crush_init_workspace starts with struct crush_work libceph, crush: per-pool crush_choose_arg_map for crush_do_rule() crush: implement weight and id overrides for straw2 libceph: apply_upmap() libceph: compute actual pgid in ceph_pg_to_up_acting_osds() libceph: pg_upmap[_items] infrastructure libceph: ceph_decode_skip_* helpers libceph: kill __{insert,lookup,remove}_pg_mapping() libceph: introduce and switch to decode_pg_mapping() libceph: don't pass pgid by value libceph: respect RADOS_BACKOFF backoffs libceph: make DEFINE_RB_* helpers more general libceph: avoid unnecessary pi lookups in calc_target() libceph: use target pi for calc_target() calculations libceph: always populate t->target_{oid,oloc} in calc_target() libceph: make sure need_resend targets reflect latest map libceph: delete from need_resend_linger before check_linger_pool_dne() ...
Diffstat (limited to 'include')
-rw-r--r--include/linux/ceph/ceph_features.h264
-rw-r--r--include/linux/ceph/ceph_fs.h1
-rw-r--r--include/linux/ceph/decode.h60
-rw-r--r--include/linux/ceph/libceph.h49
-rw-r--r--include/linux/ceph/messenger.h2
-rw-r--r--include/linux/ceph/osd_client.h70
-rw-r--r--include/linux/ceph/osdmap.h41
-rw-r--r--include/linux/ceph/rados.h6
-rw-r--r--include/linux/crush/crush.h66
-rw-r--r--include/linux/crush/mapper.h9
10 files changed, 450 insertions, 118 deletions
diff --git a/include/linux/ceph/ceph_features.h b/include/linux/ceph/ceph_features.h
index fd8b2953c78f..f0f6c537b64c 100644
--- a/include/linux/ceph/ceph_features.h
+++ b/include/linux/ceph/ceph_features.h
@@ -2,103 +2,174 @@
#define __CEPH_FEATURES
/*
- * feature bits
+ * Each time we reclaim bits for reuse we need to specify another bit
+ * that, if present, indicates we have the new incarnation of that
+ * feature. Base case is 1 (first use).
*/
-#define CEPH_FEATURE_UID (1ULL<<0)
-#define CEPH_FEATURE_NOSRCADDR (1ULL<<1)
-#define CEPH_FEATURE_MONCLOCKCHECK (1ULL<<2)
-#define CEPH_FEATURE_FLOCK (1ULL<<3)
-#define CEPH_FEATURE_SUBSCRIBE2 (1ULL<<4)
-#define CEPH_FEATURE_MONNAMES (1ULL<<5)
-#define CEPH_FEATURE_RECONNECT_SEQ (1ULL<<6)
-#define CEPH_FEATURE_DIRLAYOUTHASH (1ULL<<7)
-#define CEPH_FEATURE_OBJECTLOCATOR (1ULL<<8)
-#define CEPH_FEATURE_PGID64 (1ULL<<9)
-#define CEPH_FEATURE_INCSUBOSDMAP (1ULL<<10)
-#define CEPH_FEATURE_PGPOOL3 (1ULL<<11)
-#define CEPH_FEATURE_OSDREPLYMUX (1ULL<<12)
-#define CEPH_FEATURE_OSDENC (1ULL<<13)
-#define CEPH_FEATURE_OMAP (1ULL<<14)
-#define CEPH_FEATURE_MONENC (1ULL<<15)
-#define CEPH_FEATURE_QUERY_T (1ULL<<16)
-#define CEPH_FEATURE_INDEP_PG_MAP (1ULL<<17)
-#define CEPH_FEATURE_CRUSH_TUNABLES (1ULL<<18)
-#define CEPH_FEATURE_CHUNKY_SCRUB (1ULL<<19)
-#define CEPH_FEATURE_MON_NULLROUTE (1ULL<<20)
-#define CEPH_FEATURE_MON_GV (1ULL<<21)
-#define CEPH_FEATURE_BACKFILL_RESERVATION (1ULL<<22)
-#define CEPH_FEATURE_MSG_AUTH (1ULL<<23)
-#define CEPH_FEATURE_RECOVERY_RESERVATION (1ULL<<24)
-#define CEPH_FEATURE_CRUSH_TUNABLES2 (1ULL<<25)
-#define CEPH_FEATURE_CREATEPOOLID (1ULL<<26)
-#define CEPH_FEATURE_REPLY_CREATE_INODE (1ULL<<27)
-#define CEPH_FEATURE_OSD_HBMSGS (1ULL<<28)
-#define CEPH_FEATURE_MDSENC (1ULL<<29)
-#define CEPH_FEATURE_OSDHASHPSPOOL (1ULL<<30)
-#define CEPH_FEATURE_MON_SINGLE_PAXOS (1ULL<<31)
-#define CEPH_FEATURE_OSD_SNAPMAPPER (1ULL<<32)
-#define CEPH_FEATURE_MON_SCRUB (1ULL<<33)
-#define CEPH_FEATURE_OSD_PACKED_RECOVERY (1ULL<<34)
-#define CEPH_FEATURE_OSD_CACHEPOOL (1ULL<<35)
-#define CEPH_FEATURE_CRUSH_V2 (1ULL<<36) /* new indep; SET_* steps */
-#define CEPH_FEATURE_EXPORT_PEER (1ULL<<37)
-#define CEPH_FEATURE_OSD_ERASURE_CODES (1ULL<<38)
-#define CEPH_FEATURE_OSD_TMAP2OMAP (1ULL<<38) /* overlap with EC */
-/* The process supports new-style OSDMap encoding. Monitors also use
- this bit to determine if peers support NAK messages. */
-#define CEPH_FEATURE_OSDMAP_ENC (1ULL<<39)
-#define CEPH_FEATURE_MDS_INLINE_DATA (1ULL<<40)
-#define CEPH_FEATURE_CRUSH_TUNABLES3 (1ULL<<41)
-#define CEPH_FEATURE_OSD_PRIMARY_AFFINITY (1ULL<<41) /* overlap w/ tunables3 */
-#define CEPH_FEATURE_MSGR_KEEPALIVE2 (1ULL<<42)
-#define CEPH_FEATURE_OSD_POOLRESEND (1ULL<<43)
-#define CEPH_FEATURE_ERASURE_CODE_PLUGINS_V2 (1ULL<<44)
-#define CEPH_FEATURE_OSD_SET_ALLOC_HINT (1ULL<<45)
-#define CEPH_FEATURE_OSD_FADVISE_FLAGS (1ULL<<46)
-#define CEPH_FEATURE_OSD_REPOP (1ULL<<46) /* overlap with fadvise */
-#define CEPH_FEATURE_OSD_OBJECT_DIGEST (1ULL<<46) /* overlap with fadvise */
-#define CEPH_FEATURE_OSD_TRANSACTION_MAY_LAYOUT (1ULL<<46) /* overlap w/ fadvise */
-#define CEPH_FEATURE_MDS_QUOTA (1ULL<<47)
-#define CEPH_FEATURE_CRUSH_V4 (1ULL<<48) /* straw2 buckets */
-#define CEPH_FEATURE_OSD_MIN_SIZE_RECOVERY (1ULL<<49)
-// duplicated since it was introduced at the same time as MIN_SIZE_RECOVERY
-#define CEPH_FEATURE_OSD_PROXY_FEATURES (1ULL<<49) /* overlap w/ above */
-#define CEPH_FEATURE_MON_METADATA (1ULL<<50)
-#define CEPH_FEATURE_OSD_BITWISE_HOBJ_SORT (1ULL<<51) /* can sort objs bitwise */
-#define CEPH_FEATURE_OSD_PROXY_WRITE_FEATURES (1ULL<<52)
-#define CEPH_FEATURE_ERASURE_CODE_PLUGINS_V3 (1ULL<<53)
-#define CEPH_FEATURE_OSD_HITSET_GMT (1ULL<<54)
-#define CEPH_FEATURE_HAMMER_0_94_4 (1ULL<<55)
-#define CEPH_FEATURE_NEW_OSDOP_ENCODING (1ULL<<56) /* New, v7 encoding */
-#define CEPH_FEATURE_MON_STATEFUL_SUB (1ULL<<57) /* stateful mon subscription */
-#define CEPH_FEATURE_MON_ROUTE_OSDMAP (1ULL<<57) /* peon sends osdmaps */
-#define CEPH_FEATURE_CRUSH_TUNABLES5 (1ULL<<58) /* chooseleaf stable mode */
-// duplicated since it was introduced at the same time as CEPH_FEATURE_CRUSH_TUNABLES5
-#define CEPH_FEATURE_NEW_OSDOPREPLY_ENCODING (1ULL<<58) /* New, v7 encoding */
-#define CEPH_FEATURE_FS_FILE_LAYOUT_V2 (1ULL<<58) /* file_layout_t */
+#define CEPH_FEATURE_INCARNATION_1 (0ull)
+#define CEPH_FEATURE_INCARNATION_2 (1ull<<57) // CEPH_FEATURE_SERVER_JEWEL
+
+#define DEFINE_CEPH_FEATURE(bit, incarnation, name) \
+ const static uint64_t CEPH_FEATURE_##name = (1ULL<<bit); \
+ const static uint64_t CEPH_FEATUREMASK_##name = \
+ (1ULL<<bit | CEPH_FEATURE_INCARNATION_##incarnation);
+
+/* this bit is ignored but still advertised by release *when* */
+#define DEFINE_CEPH_FEATURE_DEPRECATED(bit, incarnation, name, when) \
+ const static uint64_t DEPRECATED_CEPH_FEATURE_##name = (1ULL<<bit); \
+ const static uint64_t DEPRECATED_CEPH_FEATUREMASK_##name = \
+ (1ULL<<bit | CEPH_FEATURE_INCARNATION_##incarnation);
/*
- * The introduction of CEPH_FEATURE_OSD_SNAPMAPPER caused the feature
- * vector to evaluate to 64 bit ~0. To cope, we designate 1ULL << 63
- * to mean 33 bit ~0, and introduce a helper below to do the
- * translation.
+ * this bit is ignored by release *unused* and not advertised by
+ * release *unadvertised*
+ */
+#define DEFINE_CEPH_FEATURE_RETIRED(bit, inc, name, unused, unadvertised)
+
+
+/*
+ * test for a feature. this test is safer than a typical mask against
+ * the bit because it ensures that we have the bit AND the marker for the
+ * bit's incarnation. this must be used in any case where the features
+ * bits may include an old meaning of the bit.
+ */
+#define CEPH_HAVE_FEATURE(x, name) \
+ (((x) & (CEPH_FEATUREMASK_##name)) == (CEPH_FEATUREMASK_##name))
+
+
+/*
+ * Notes on deprecation:
+ *
+ * A *major* release is a release through which all upgrades must pass
+ * (e.g., jewel). For example, no pre-jewel server will ever talk to
+ * a post-jewel server (mon, osd, etc).
+ *
+ * For feature bits used *only* on the server-side:
+ *
+ * - In the first phase we indicate that a feature is DEPRECATED as of
+ * a particular release. This is the first major release X (say,
+ * jewel) that does not depend on its peers advertising the feature.
+ * That is, it safely assumes its peers all have the feature. We
+ * indicate this with the DEPRECATED macro. For example,
+ *
+ * DEFINE_CEPH_FEATURE_DEPRECATED( 2, 1, MONCLOCKCHECK, JEWEL)
+ *
+ * because 10.2.z (jewel) did not care if its peers advertised this
+ * feature bit.
+ *
+ * - In the second phase we stop advertising the the bit and call it
+ * RETIRED. This can normally be done in the *next* major release
+ * following the one in which we marked the feature DEPRECATED. In
+ * the above example, for 12.0.z (luminous) we can say:
+ *
+ * DEFINE_CEPH_FEATURE_RETIRED( 2, 1, MONCLOCKCHECK, JEWEL, LUMINOUS)
*
- * This was introduced by ceph.git commit
- * 9ea02b84104045c2ffd7e7f4e7af512953855ecd v0.58-657-g9ea02b8
- * and fixed by ceph.git commit
- * 4255b5c2fb54ae40c53284b3ab700fdfc7e61748 v0.65-263-g4255b5c
+ * - The bit can be reused in the first post-luminous release, 13.0.z
+ * (m).
+ *
+ * This ensures that no two versions who have different meanings for
+ * the bit ever speak to each other.
*/
-#define CEPH_FEATURE_RESERVED (1ULL<<63)
-
-static inline u64 ceph_sanitize_features(u64 features)
-{
- if (features & CEPH_FEATURE_RESERVED) {
- /* everything through OSD_SNAPMAPPER */
- return 0x1ffffffffull;
- } else {
- return features;
- }
-}
+
+DEFINE_CEPH_FEATURE( 0, 1, UID)
+DEFINE_CEPH_FEATURE( 1, 1, NOSRCADDR)
+DEFINE_CEPH_FEATURE_RETIRED( 2, 1, MONCLOCKCHECK, JEWEL, LUMINOUS)
+
+DEFINE_CEPH_FEATURE( 3, 1, FLOCK)
+DEFINE_CEPH_FEATURE( 4, 1, SUBSCRIBE2)
+DEFINE_CEPH_FEATURE( 5, 1, MONNAMES)
+DEFINE_CEPH_FEATURE( 6, 1, RECONNECT_SEQ)
+DEFINE_CEPH_FEATURE( 7, 1, DIRLAYOUTHASH)
+DEFINE_CEPH_FEATURE( 8, 1, OBJECTLOCATOR)
+DEFINE_CEPH_FEATURE( 9, 1, PGID64)
+DEFINE_CEPH_FEATURE(10, 1, INCSUBOSDMAP)
+DEFINE_CEPH_FEATURE(11, 1, PGPOOL3)
+DEFINE_CEPH_FEATURE(12, 1, OSDREPLYMUX)
+DEFINE_CEPH_FEATURE(13, 1, OSDENC)
+DEFINE_CEPH_FEATURE_RETIRED(14, 1, OMAP, HAMMER, JEWEL)
+DEFINE_CEPH_FEATURE(14, 2, SERVER_KRAKEN)
+DEFINE_CEPH_FEATURE(15, 1, MONENC)
+DEFINE_CEPH_FEATURE_RETIRED(16, 1, QUERY_T, JEWEL, LUMINOUS)
+
+DEFINE_CEPH_FEATURE_RETIRED(17, 1, INDEP_PG_MAP, JEWEL, LUMINOUS)
+
+DEFINE_CEPH_FEATURE(18, 1, CRUSH_TUNABLES)
+DEFINE_CEPH_FEATURE_RETIRED(19, 1, CHUNKY_SCRUB, JEWEL, LUMINOUS)
+
+DEFINE_CEPH_FEATURE_RETIRED(20, 1, MON_NULLROUTE, JEWEL, LUMINOUS)
+
+DEFINE_CEPH_FEATURE_RETIRED(21, 1, MON_GV, HAMMER, JEWEL)
+DEFINE_CEPH_FEATURE(21, 2, SERVER_LUMINOUS)
+DEFINE_CEPH_FEATURE(21, 2, RESEND_ON_SPLIT) // overlap
+DEFINE_CEPH_FEATURE(21, 2, RADOS_BACKOFF) // overlap
+DEFINE_CEPH_FEATURE(21, 2, OSDMAP_PG_UPMAP) // overlap
+DEFINE_CEPH_FEATURE(21, 2, CRUSH_CHOOSE_ARGS) // overlap
+DEFINE_CEPH_FEATURE_RETIRED(22, 1, BACKFILL_RESERVATION, JEWEL, LUMINOUS)
+
+DEFINE_CEPH_FEATURE(23, 1, MSG_AUTH)
+DEFINE_CEPH_FEATURE_RETIRED(24, 1, RECOVERY_RESERVATION, JEWEL, LUNINOUS)
+
+DEFINE_CEPH_FEATURE(25, 1, CRUSH_TUNABLES2)
+DEFINE_CEPH_FEATURE(26, 1, CREATEPOOLID)
+DEFINE_CEPH_FEATURE(27, 1, REPLY_CREATE_INODE)
+DEFINE_CEPH_FEATURE_RETIRED(28, 1, OSD_HBMSGS, HAMMER, JEWEL)
+DEFINE_CEPH_FEATURE(28, 2, SERVER_M)
+DEFINE_CEPH_FEATURE(29, 1, MDSENC)
+DEFINE_CEPH_FEATURE(30, 1, OSDHASHPSPOOL)
+DEFINE_CEPH_FEATURE(31, 1, MON_SINGLE_PAXOS) // deprecate me
+DEFINE_CEPH_FEATURE_RETIRED(32, 1, OSD_SNAPMAPPER, JEWEL, LUMINOUS)
+
+DEFINE_CEPH_FEATURE_RETIRED(33, 1, MON_SCRUB, JEWEL, LUMINOUS)
+
+DEFINE_CEPH_FEATURE_RETIRED(34, 1, OSD_PACKED_RECOVERY, JEWEL, LUMINOUS)
+
+DEFINE_CEPH_FEATURE(35, 1, OSD_CACHEPOOL)
+DEFINE_CEPH_FEATURE(36, 1, CRUSH_V2)
+DEFINE_CEPH_FEATURE(37, 1, EXPORT_PEER)
+DEFINE_CEPH_FEATURE(38, 1, OSD_ERASURE_CODES)
+DEFINE_CEPH_FEATURE(38, 1, OSD_OSD_TMAP2OMAP) // overlap
+DEFINE_CEPH_FEATURE(39, 1, OSDMAP_ENC)
+DEFINE_CEPH_FEATURE(40, 1, MDS_INLINE_DATA)
+DEFINE_CEPH_FEATURE(41, 1, CRUSH_TUNABLES3)
+DEFINE_CEPH_FEATURE(41, 1, OSD_PRIMARY_AFFINITY) // overlap
+DEFINE_CEPH_FEATURE(42, 1, MSGR_KEEPALIVE2)
+DEFINE_CEPH_FEATURE(43, 1, OSD_POOLRESEND)
+DEFINE_CEPH_FEATURE(44, 1, ERASURE_CODE_PLUGINS_V2)
+DEFINE_CEPH_FEATURE_RETIRED(45, 1, OSD_SET_ALLOC_HINT, JEWEL, LUMINOUS)
+
+DEFINE_CEPH_FEATURE(46, 1, OSD_FADVISE_FLAGS)
+DEFINE_CEPH_FEATURE_RETIRED(46, 1, OSD_REPOP, JEWEL, LUMINOUS) // overlap
+DEFINE_CEPH_FEATURE_RETIRED(46, 1, OSD_OBJECT_DIGEST, JEWEL, LUMINOUS) // overlap
+DEFINE_CEPH_FEATURE_RETIRED(46, 1, OSD_TRANSACTION_MAY_LAYOUT, JEWEL, LUMINOUS) // overlap
+
+DEFINE_CEPH_FEATURE(47, 1, MDS_QUOTA)
+DEFINE_CEPH_FEATURE(48, 1, CRUSH_V4)
+DEFINE_CEPH_FEATURE_RETIRED(49, 1, OSD_MIN_SIZE_RECOVERY, JEWEL, LUMINOUS)
+DEFINE_CEPH_FEATURE_RETIRED(49, 1, OSD_PROXY_FEATURES, JEWEL, LUMINOUS) // overlap
+
+DEFINE_CEPH_FEATURE(50, 1, MON_METADATA)
+DEFINE_CEPH_FEATURE(51, 1, OSD_BITWISE_HOBJ_SORT)
+DEFINE_CEPH_FEATURE(52, 1, OSD_PROXY_WRITE_FEATURES)
+DEFINE_CEPH_FEATURE(53, 1, ERASURE_CODE_PLUGINS_V3)
+DEFINE_CEPH_FEATURE(54, 1, OSD_HITSET_GMT)
+DEFINE_CEPH_FEATURE(55, 1, HAMMER_0_94_4)
+DEFINE_CEPH_FEATURE(56, 1, NEW_OSDOP_ENCODING)
+DEFINE_CEPH_FEATURE(57, 1, MON_STATEFUL_SUB)
+DEFINE_CEPH_FEATURE(57, 1, MON_ROUTE_OSDMAP) // overlap
+DEFINE_CEPH_FEATURE(57, 1, OSDSUBOP_NO_SNAPCONTEXT) // overlap
+DEFINE_CEPH_FEATURE(57, 1, SERVER_JEWEL) // overlap
+DEFINE_CEPH_FEATURE(58, 1, CRUSH_TUNABLES5)
+DEFINE_CEPH_FEATURE(58, 1, NEW_OSDOPREPLY_ENCODING) // overlap
+DEFINE_CEPH_FEATURE(58, 1, FS_FILE_LAYOUT_V2) // overlap
+DEFINE_CEPH_FEATURE(59, 1, FS_BTIME)
+DEFINE_CEPH_FEATURE(59, 1, FS_CHANGE_ATTR) // overlap
+DEFINE_CEPH_FEATURE(59, 1, MSG_ADDR2) // overlap
+DEFINE_CEPH_FEATURE(60, 1, BLKIN_TRACING) // *do not share this bit*
+
+DEFINE_CEPH_FEATURE(61, 1, RESERVED2) // unused, but slow down!
+DEFINE_CEPH_FEATURE(62, 1, RESERVED) // do not use; used as a sentinal
+DEFINE_CEPH_FEATURE_DEPRECATED(63, 1, RESERVED_BROKEN, LUMINOUS) // client-facing
+
/*
* Features supported.
@@ -113,6 +184,11 @@ static inline u64 ceph_sanitize_features(u64 features)
CEPH_FEATURE_PGPOOL3 | \
CEPH_FEATURE_OSDENC | \
CEPH_FEATURE_CRUSH_TUNABLES | \
+ CEPH_FEATURE_SERVER_LUMINOUS | \
+ CEPH_FEATURE_RESEND_ON_SPLIT | \
+ CEPH_FEATURE_RADOS_BACKOFF | \
+ CEPH_FEATURE_OSDMAP_PG_UPMAP | \
+ CEPH_FEATURE_CRUSH_CHOOSE_ARGS | \
CEPH_FEATURE_MSG_AUTH | \
CEPH_FEATURE_CRUSH_TUNABLES2 | \
CEPH_FEATURE_REPLY_CREATE_INODE | \
@@ -126,7 +202,11 @@ static inline u64 ceph_sanitize_features(u64 features)
CEPH_FEATURE_CRUSH_TUNABLES3 | \
CEPH_FEATURE_OSD_PRIMARY_AFFINITY | \
CEPH_FEATURE_MSGR_KEEPALIVE2 | \
+ CEPH_FEATURE_OSD_POOLRESEND | \
CEPH_FEATURE_CRUSH_V4 | \
+ CEPH_FEATURE_NEW_OSDOP_ENCODING | \
+ CEPH_FEATURE_SERVER_JEWEL | \
+ CEPH_FEATURE_MON_STATEFUL_SUB | \
CEPH_FEATURE_CRUSH_TUNABLES5 | \
CEPH_FEATURE_NEW_OSDOPREPLY_ENCODING)
diff --git a/include/linux/ceph/ceph_fs.h b/include/linux/ceph/ceph_fs.h
index ad078ebe25d6..edf5b04b918a 100644
--- a/include/linux/ceph/ceph_fs.h
+++ b/include/linux/ceph/ceph_fs.h
@@ -147,6 +147,7 @@ struct ceph_dir_layout {
#define CEPH_MSG_OSD_OP 42
#define CEPH_MSG_OSD_OPREPLY 43
#define CEPH_MSG_WATCH_NOTIFY 44
+#define CEPH_MSG_OSD_BACKOFF 61
/* watch-notify operations */
diff --git a/include/linux/ceph/decode.h b/include/linux/ceph/decode.h
index f990f2cc907a..14af9b70d301 100644
--- a/include/linux/ceph/decode.h
+++ b/include/linux/ceph/decode.h
@@ -133,6 +133,66 @@ bad:
}
/*
+ * skip helpers
+ */
+#define ceph_decode_skip_n(p, end, n, bad) \
+ do { \
+ ceph_decode_need(p, end, n, bad); \
+ *p += n; \
+ } while (0)
+
+#define ceph_decode_skip_64(p, end, bad) \
+ceph_decode_skip_n(p, end, sizeof(u64), bad)
+
+#define ceph_decode_skip_32(p, end, bad) \
+ceph_decode_skip_n(p, end, sizeof(u32), bad)
+
+#define ceph_decode_skip_16(p, end, bad) \
+ceph_decode_skip_n(p, end, sizeof(u16), bad)
+
+#define ceph_decode_skip_8(p, end, bad) \
+ceph_decode_skip_n(p, end, sizeof(u8), bad)
+
+#define ceph_decode_skip_string(p, end, bad) \
+ do { \
+ u32 len; \
+ \
+ ceph_decode_32_safe(p, end, len, bad); \
+ ceph_decode_skip_n(p, end, len, bad); \
+ } while (0)
+
+#define ceph_decode_skip_set(p, end, type, bad) \
+ do { \
+ u32 len; \
+ \
+ ceph_decode_32_safe(p, end, len, bad); \
+ while (len--) \
+ ceph_decode_skip_##type(p, end, bad); \
+ } while (0)
+
+#define ceph_decode_skip_map(p, end, ktype, vtype, bad) \
+ do { \
+ u32 len; \
+ \
+ ceph_decode_32_safe(p, end, len, bad); \
+ while (len--) { \
+ ceph_decode_skip_##ktype(p, end, bad); \
+ ceph_decode_skip_##vtype(p, end, bad); \
+ } \
+ } while (0)
+
+#define ceph_decode_skip_map_of_map(p, end, ktype1, ktype2, vtype2, bad) \
+ do { \
+ u32 len; \
+ \
+ ceph_decode_32_safe(p, end, len, bad); \
+ while (len--) { \
+ ceph_decode_skip_##ktype1(p, end, bad); \
+ ceph_decode_skip_map(p, end, ktype2, vtype2, bad); \
+ } \
+ } while (0)
+
+/*
* struct ceph_timespec <-> struct timespec
*/
static inline void ceph_decode_timespec(struct timespec *ts,
diff --git a/include/linux/ceph/libceph.h b/include/linux/ceph/libceph.h
index 3229ae6c7846..8a79587e1317 100644
--- a/include/linux/ceph/libceph.h
+++ b/include/linux/ceph/libceph.h
@@ -184,10 +184,11 @@ static inline int calc_pages_for(u64 off, u64 len)
(off >> PAGE_SHIFT);
}
-/*
- * These are not meant to be generic - an integer key is assumed.
- */
-#define DEFINE_RB_INSDEL_FUNCS(name, type, keyfld, nodefld) \
+#define RB_BYVAL(a) (a)
+#define RB_BYPTR(a) (&(a))
+#define RB_CMP3WAY(a, b) ((a) < (b) ? -1 : (a) > (b))
+
+#define DEFINE_RB_INSDEL_FUNCS2(name, type, keyfld, cmpexp, keyexp, nodefld) \
static void insert_##name(struct rb_root *root, type *t) \
{ \
struct rb_node **n = &root->rb_node; \
@@ -197,11 +198,13 @@ static void insert_##name(struct rb_root *root, type *t) \
\
while (*n) { \
type *cur = rb_entry(*n, type, nodefld); \
+ int cmp; \
\
parent = *n; \
- if (t->keyfld < cur->keyfld) \
+ cmp = cmpexp(keyexp(t->keyfld), keyexp(cur->keyfld)); \
+ if (cmp < 0) \
n = &(*n)->rb_left; \
- else if (t->keyfld > cur->keyfld) \
+ else if (cmp > 0) \
n = &(*n)->rb_right; \
else \
BUG(); \
@@ -217,19 +220,24 @@ static void erase_##name(struct rb_root *root, type *t) \
RB_CLEAR_NODE(&t->nodefld); \
}
-#define DEFINE_RB_LOOKUP_FUNC(name, type, keyfld, nodefld) \
-extern type __lookup_##name##_key; \
-static type *lookup_##name(struct rb_root *root, \
- typeof(__lookup_##name##_key.keyfld) key) \
+/*
+ * @lookup_param_type is a parameter and not constructed from (@type,
+ * @keyfld) with typeof() because adding const is too unwieldy.
+ */
+#define DEFINE_RB_LOOKUP_FUNC2(name, type, keyfld, cmpexp, keyexp, \
+ lookup_param_type, nodefld) \
+static type *lookup_##name(struct rb_root *root, lookup_param_type key) \
{ \
struct rb_node *n = root->rb_node; \
\
while (n) { \
type *cur = rb_entry(n, type, nodefld); \
+ int cmp; \
\
- if (key < cur->keyfld) \
+ cmp = cmpexp(key, keyexp(cur->keyfld)); \
+ if (cmp < 0) \
n = n->rb_left; \
- else if (key > cur->keyfld) \
+ else if (cmp > 0) \
n = n->rb_right; \
else \
return cur; \
@@ -238,6 +246,23 @@ static type *lookup_##name(struct rb_root *root, \
return NULL; \
}
+#define DEFINE_RB_FUNCS2(name, type, keyfld, cmpexp, keyexp, \
+ lookup_param_type, nodefld) \
+DEFINE_RB_INSDEL_FUNCS2(name, type, keyfld, cmpexp, keyexp, nodefld) \
+DEFINE_RB_LOOKUP_FUNC2(name, type, keyfld, cmpexp, keyexp, \
+ lookup_param_type, nodefld)
+
+/*
+ * Shorthands for integer keys.
+ */
+#define DEFINE_RB_INSDEL_FUNCS(name, type, keyfld, nodefld) \
+DEFINE_RB_INSDEL_FUNCS2(name, type, keyfld, RB_CMP3WAY, RB_BYVAL, nodefld)
+
+#define DEFINE_RB_LOOKUP_FUNC(name, type, keyfld, nodefld) \
+extern type __lookup_##name##_key; \
+DEFINE_RB_LOOKUP_FUNC2(name, type, keyfld, RB_CMP3WAY, RB_BYVAL, \
+ typeof(__lookup_##name##_key.keyfld), nodefld)
+
#define DEFINE_RB_FUNCS(name, type, keyfld, nodefld) \
DEFINE_RB_INSDEL_FUNCS(name, type, keyfld, nodefld) \
DEFINE_RB_LOOKUP_FUNC(name, type, keyfld, nodefld)
diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h
index c5c4c713e00f..fbd94d9fa5dd 100644
--- a/include/linux/ceph/messenger.h
+++ b/include/linux/ceph/messenger.h
@@ -44,6 +44,8 @@ struct ceph_connection_operations {
struct ceph_msg_header *hdr,
int *skip);
+ void (*reencode_message) (struct ceph_msg *msg);
+
int (*sign_message) (struct ceph_msg *msg);
int (*check_message_signature) (struct ceph_msg *msg);
};
diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h
index 85650b415e73..c6d96a5f46fd 100644
--- a/include/linux/ceph/osd_client.h
+++ b/include/linux/ceph/osd_client.h
@@ -1,6 +1,7 @@
#ifndef _FS_CEPH_OSD_CLIENT_H
#define _FS_CEPH_OSD_CLIENT_H
+#include <linux/bitrev.h>
#include <linux/completion.h>
#include <linux/kref.h>
#include <linux/mempool.h>
@@ -36,6 +37,8 @@ struct ceph_osd {
struct ceph_connection o_con;
struct rb_root o_requests;
struct rb_root o_linger_requests;
+ struct rb_root o_backoff_mappings;
+ struct rb_root o_backoffs_by_id;
struct list_head o_osd_lru;
struct ceph_auth_handshake o_auth;
unsigned long lru_ttl;
@@ -136,7 +139,8 @@ struct ceph_osd_request_target {
struct ceph_object_id target_oid;
struct ceph_object_locator target_oloc;
- struct ceph_pg pgid;
+ struct ceph_pg pgid; /* last raw pg we mapped to */
+ struct ceph_spg spgid; /* last actual spg we mapped to */
u32 pg_num;
u32 pg_num_mask;
struct ceph_osds acting;
@@ -148,6 +152,9 @@ struct ceph_osd_request_target {
unsigned int flags; /* CEPH_OSD_FLAG_* */
bool paused;
+ u32 epoch;
+ u32 last_force_resend;
+
int osd;
};
@@ -193,7 +200,6 @@ struct ceph_osd_request {
unsigned long r_stamp; /* jiffies, send or check time */
unsigned long r_start_stamp; /* jiffies */
int r_attempts;
- u32 r_last_force_resend;
u32 r_map_dne_bound;
struct ceph_osd_req_op r_ops[];
@@ -203,6 +209,23 @@ struct ceph_request_redirect {
struct ceph_object_locator oloc;
};
+/*
+ * osd request identifier
+ *
+ * caller name + incarnation# + tid to unique identify this request
+ */
+struct ceph_osd_reqid {
+ struct ceph_entity_name name;
+ __le64 tid;
+ __le32 inc;
+} __packed;
+
+struct ceph_blkin_trace_info {
+ __le64 trace_id;
+ __le64 span_id;
+ __le64 parent_span_id;
+} __packed;
+
typedef void (*rados_watchcb2_t)(void *arg, u64 notify_id, u64 cookie,
u64 notifier_id, void *data, size_t data_len);
typedef void (*rados_watcherrcb_t)(void *arg, u64 cookie, int err);
@@ -221,7 +244,6 @@ struct ceph_osd_linger_request {
struct list_head pending_lworks;
struct ceph_osd_request_target t;
- u32 last_force_resend;
u32 map_dne_bound;
struct timespec mtime;
@@ -256,6 +278,48 @@ struct ceph_watch_item {
struct ceph_entity_addr addr;
};
+struct ceph_spg_mapping {
+ struct rb_node node;
+ struct ceph_spg spgid;
+
+ struct rb_root backoffs;
+};
+
+struct ceph_hobject_id {
+ void *key;
+ size_t key_len;
+ void *oid;
+ size_t oid_len;
+ u64 snapid;
+ u32 hash;
+ u8 is_max;
+ void *nspace;
+ size_t nspace_len;
+ s64 pool;
+
+ /* cache */
+ u32 hash_reverse_bits;
+};
+
+static inline void ceph_hoid_build_hash_cache(struct ceph_hobject_id *hoid)
+{
+ hoid->hash_reverse_bits = bitrev32(hoid->hash);
+}
+
+/*
+ * PG-wide backoff: [begin, end)
+ * per-object backoff: begin == end
+ */
+struct ceph_osd_backoff {
+ struct rb_node spg_node;
+ struct rb_node id_node;
+
+ struct ceph_spg spgid;
+ u64 id;
+ struct ceph_hobject_id *begin;
+ struct ceph_hobject_id *end;
+};
+
#define CEPH_LINGER_ID_START 0xffff000000000000ULL
struct ceph_osd_client {
diff --git a/include/linux/ceph/osdmap.h b/include/linux/ceph/osdmap.h
index 938656f70807..a0996cb9faed 100644
--- a/include/linux/ceph/osdmap.h
+++ b/include/linux/ceph/osdmap.h
@@ -24,7 +24,15 @@ struct ceph_pg {
uint32_t seed;
};
+#define CEPH_SPG_NOSHARD -1
+
+struct ceph_spg {
+ struct ceph_pg pgid;
+ s8 shard;
+};
+
int ceph_pg_compare(const struct ceph_pg *lhs, const struct ceph_pg *rhs);
+int ceph_spg_compare(const struct ceph_spg *lhs, const struct ceph_spg *rhs);
#define CEPH_POOL_FLAG_HASHPSPOOL (1ULL << 0) /* hash pg seed and pool id
together */
@@ -135,10 +143,14 @@ struct ceph_pg_mapping {
struct {
int len;
int osds[];
- } pg_temp;
+ } pg_temp, pg_upmap;
struct {
int osd;
} primary_temp;
+ struct {
+ int len;
+ int from_to[][2];
+ } pg_upmap_items;
};
};
@@ -150,13 +162,17 @@ struct ceph_osdmap {
u32 flags; /* CEPH_OSDMAP_* */
u32 max_osd; /* size of osd_state, _offload, _addr arrays */
- u8 *osd_state; /* CEPH_OSD_* */
+ u32 *osd_state; /* CEPH_OSD_* */
u32 *osd_weight; /* 0 = failed, 0x10000 = 100% normal */
struct ceph_entity_addr *osd_addr;
struct rb_root pg_temp;
struct rb_root primary_temp;
+ /* remap (post-CRUSH, pre-up) */
+ struct rb_root pg_upmap; /* PG := raw set */
+ struct rb_root pg_upmap_items; /* from -> to within raw set */
+
u32 *osd_primary_affinity;
struct rb_root pg_pools;
@@ -187,7 +203,7 @@ static inline bool ceph_osd_is_down(struct ceph_osdmap *map, int osd)
return !ceph_osd_is_up(map, osd);
}
-extern char *ceph_osdmap_state_str(char *str, int len, int state);
+char *ceph_osdmap_state_str(char *str, int len, u32 state);
extern u32 ceph_get_primary_affinity(struct ceph_osdmap *map, int osd);
static inline struct ceph_entity_addr *ceph_osd_addr(struct ceph_osdmap *map,
@@ -198,11 +214,13 @@ static inline struct ceph_entity_addr *ceph_osd_addr(struct ceph_osdmap *map,
return &map->osd_addr[osd];
}
+#define CEPH_PGID_ENCODING_LEN (1 + 8 + 4 + 4)
+
static inline int ceph_decode_pgid(void **p, void *end, struct ceph_pg *pgid)
{
__u8 version;
- if (!ceph_has_room(p, end, 1 + 8 + 4 + 4)) {
+ if (!ceph_has_room(p, end, CEPH_PGID_ENCODING_LEN)) {
pr_warn("incomplete pg encoding\n");
return -EINVAL;
}
@@ -240,6 +258,8 @@ static inline void ceph_osds_init(struct ceph_osds *set)
void ceph_osds_copy(struct ceph_osds *dest, const struct ceph_osds *src);
+bool ceph_pg_is_split(const struct ceph_pg *pgid, u32 old_pg_num,
+ u32 new_pg_num);
bool ceph_is_new_interval(const struct ceph_osds *old_acting,
const struct ceph_osds *new_acting,
const struct ceph_osds *old_up,
@@ -262,15 +282,24 @@ extern int ceph_calc_file_object_mapping(struct ceph_file_layout *layout,
u64 off, u64 len,
u64 *bno, u64 *oxoff, u64 *oxlen);
+int __ceph_object_locator_to_pg(struct ceph_pg_pool_info *pi,
+ const struct ceph_object_id *oid,
+ const struct ceph_object_locator *oloc,
+ struct ceph_pg *raw_pgid);
int ceph_object_locator_to_pg(struct ceph_osdmap *osdmap,
- struct ceph_object_id *oid,
- struct ceph_object_locator *oloc,
+ const struct ceph_object_id *oid,
+ const struct ceph_object_locator *oloc,
struct ceph_pg *raw_pgid);
void ceph_pg_to_up_acting_osds(struct ceph_osdmap *osdmap,
+ struct ceph_pg_pool_info *pi,
const struct ceph_pg *raw_pgid,
struct ceph_osds *up,
struct ceph_osds *acting);
+bool ceph_pg_to_primary_shard(struct ceph_osdmap *osdmap,
+ struct ceph_pg_pool_info *pi,
+ const struct ceph_pg *raw_pgid,
+ struct ceph_spg *spgid);
int ceph_pg_to_acting_primary(struct ceph_osdmap *osdmap,
const struct ceph_pg *raw_pgid);
diff --git a/include/linux/ceph/rados.h b/include/linux/ceph/rados.h
index 5d0018782d50..385db08bb8b2 100644
--- a/include/linux/ceph/rados.h
+++ b/include/linux/ceph/rados.h
@@ -439,6 +439,12 @@ enum {
const char *ceph_osd_watch_op_name(int o);
+enum {
+ CEPH_OSD_BACKOFF_OP_BLOCK = 1,
+ CEPH_OSD_BACKOFF_OP_ACK_BLOCK = 2,
+ CEPH_OSD_BACKOFF_OP_UNBLOCK = 3,
+};
+
/*
* an individual object operation. each may be accompanied by some data
* payload
diff --git a/include/linux/crush/crush.h b/include/linux/crush/crush.h
index fbecbd089d75..92e165d417a6 100644
--- a/include/linux/crush/crush.h
+++ b/include/linux/crush/crush.h
@@ -2,6 +2,7 @@
#define CEPH_CRUSH_CRUSH_H
#ifdef __KERNEL__
+# include <linux/rbtree.h>
# include <linux/types.h>
#else
# include "crush_compat.h"
@@ -137,6 +138,68 @@ struct crush_bucket {
};
+/** @ingroup API
+ *
+ * Replacement weights for each item in a bucket. The size of the
+ * array must be exactly the size of the straw2 bucket, just as the
+ * item_weights array.
+ *
+ */
+struct crush_weight_set {
+ __u32 *weights; /*!< 16.16 fixed point weights
+ in the same order as items */
+ __u32 size; /*!< size of the __weights__ array */
+};
+
+/** @ingroup API
+ *
+ * Replacement weights and ids for a given straw2 bucket, for
+ * placement purposes.
+ *
+ * When crush_do_rule() chooses the Nth item from a straw2 bucket, the
+ * replacement weights found at __weight_set[N]__ are used instead of
+ * the weights from __item_weights__. If __N__ is greater than
+ * __weight_set_size__, the weights found at __weight_set_size-1__ are
+ * used instead. For instance if __weight_set__ is:
+ *
+ * [ [ 0x10000, 0x20000 ], // position 0
+ * [ 0x20000, 0x40000 ] ] // position 1
+ *
+ * choosing the 0th item will use position 0 weights [ 0x10000, 0x20000 ]
+ * choosing the 1th item will use position 1 weights [ 0x20000, 0x40000 ]
+ * choosing the 2th item will use position 1 weights [ 0x20000, 0x40000 ]
+ * etc.
+ *
+ */
+struct crush_choose_arg {
+ __s32 *ids; /*!< values to use instead of items */
+ __u32 ids_size; /*!< size of the __ids__ array */
+ struct crush_weight_set *weight_set; /*!< weight replacements for
+ a given position */
+ __u32 weight_set_size; /*!< size of the __weight_set__ array */
+};
+
+/** @ingroup API
+ *
+ * Replacement weights and ids for each bucket in the crushmap. The
+ * __size__ of the __args__ array must be exactly the same as the
+ * __map->max_buckets__.
+ *
+ * The __crush_choose_arg__ at index N will be used when choosing
+ * an item from the bucket __map->buckets[N]__ bucket, provided it
+ * is a straw2 bucket.
+ *
+ */
+struct crush_choose_arg_map {
+#ifdef __KERNEL__
+ struct rb_node node;
+ u64 choose_args_index;
+#endif
+ struct crush_choose_arg *args; /*!< replacement for each bucket
+ in the crushmap */
+ __u32 size; /*!< size of the __args__ array */
+};
+
struct crush_bucket_uniform {
struct crush_bucket h;
__u32 item_weight; /* 16-bit fixed point; all items equally weighted */
@@ -236,6 +299,9 @@ struct crush_map {
__u32 allowed_bucket_algs;
__u32 *choose_tries;
+#else
+ /* CrushWrapper::choose_args */
+ struct rb_root choose_args;
#endif
};
diff --git a/include/linux/crush/mapper.h b/include/linux/crush/mapper.h
index c95e19e1ff11..141edabb947e 100644
--- a/include/linux/crush/mapper.h
+++ b/include/linux/crush/mapper.h
@@ -11,11 +11,10 @@
#include "crush.h"
extern int crush_find_rule(const struct crush_map *map, int ruleset, int type, int size);
-extern int crush_do_rule(const struct crush_map *map,
- int ruleno,
- int x, int *result, int result_max,
- const __u32 *weights, int weight_max,
- void *cwin);
+int crush_do_rule(const struct crush_map *map,
+ int ruleno, int x, int *result, int result_max,
+ const __u32 *weight, int weight_max,
+ void *cwin, const struct crush_choose_arg *choose_args);
/*
* Returns the exact amount of workspace that will need to be used