Diffstat (limited to 'net')
-rw-r--r--  net/bridge/br_device.c               |  51
-rw-r--r--  net/can/af_can.c                     |   3
-rw-r--r--  net/can/bcm.c                        |   4
-rw-r--r--  net/can/raw.c                        |   1
-rw-r--r--  net/ceph/buffer.c                    |  22
-rw-r--r--  net/ceph/ceph_common.c               |  24
-rw-r--r--  net/ceph/crush/crush.c               |   7
-rw-r--r--  net/ceph/crush/mapper.c              | 336
-rw-r--r--  net/ceph/debugfs.c                   |   3
-rw-r--r--  net/ceph/messenger.c                 |  75
-rw-r--r--  net/ceph/mon_client.c                |   8
-rw-r--r--  net/ceph/osd_client.c                | 283
-rw-r--r--  net/ceph/osdmap.c                    |  78
-rw-r--r--  net/core/netpoll.c                   |   4
-rw-r--r--  net/core/rtnetlink.c                 |   2
-rw-r--r--  net/core/skbuff.c                    |   2
-rw-r--r--  net/core/sock.c                      |   6
-rw-r--r--  net/ipv4/devinet.c                   |   3
-rw-r--r--  net/ipv4/ip_tunnel.c                 |  29
-rw-r--r--  net/ipv4/tcp_input.c                 |  18
-rw-r--r--  net/ipv4/tcp_output.c                |   2
-rw-r--r--  net/ipv4/udp_offload.c               |  17
-rw-r--r--  net/llc/llc_output.c                 |   2
-rw-r--r--  net/openvswitch/datapath.c           |  23
-rw-r--r--  net/openvswitch/flow_table.c         |  88
-rw-r--r--  net/openvswitch/flow_table.h         |   2
-rw-r--r--  net/rxrpc/ar-connection.c            |   2
-rw-r--r--  net/rxrpc/ar-recvmsg.c               |   7
-rw-r--r--  net/sctp/ipv6.c                      |   2
-rw-r--r--  net/sunrpc/auth_gss/auth_gss.c       |  17
-rw-r--r--  net/sunrpc/auth_gss/gss_krb5_keys.c  |  17
-rw-r--r--  net/sunrpc/auth_gss/gss_rpc_upcall.c |   2
-rw-r--r--  net/sunrpc/auth_gss/svcauth_gss.c    |  82
-rw-r--r--  net/sunrpc/cache.c                   |   4
-rw-r--r--  net/sunrpc/clnt.c                    |  15
-rw-r--r--  net/sunrpc/netns.h                   |   4
-rw-r--r--  net/sunrpc/rpc_pipe.c                | 169
-rw-r--r--  net/sunrpc/sunrpc_syms.c             |   8
-rw-r--r--  net/sunrpc/svc.c                     |  25
-rw-r--r--  net/sunrpc/xprt.c                    |   5
-rw-r--r--  net/sunrpc/xprtsock.c                |  49
41 files changed, 1077 insertions(+), 424 deletions(-)
diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c
index e4401a531afb..d9a9b0fc1795 100644
--- a/net/bridge/br_device.c
+++ b/net/bridge/br_device.c
@@ -226,6 +226,33 @@ static void br_netpoll_cleanup(struct net_device *dev)
br_netpoll_disable(p);
}
+static int __br_netpoll_enable(struct net_bridge_port *p, gfp_t gfp)
+{
+ struct netpoll *np;
+ int err;
+
+ np = kzalloc(sizeof(*p->np), gfp);
+ if (!np)
+ return -ENOMEM;
+
+ err = __netpoll_setup(np, p->dev, gfp);
+ if (err) {
+ kfree(np);
+ return err;
+ }
+
+ p->np = np;
+ return err;
+}
+
+int br_netpoll_enable(struct net_bridge_port *p, gfp_t gfp)
+{
+ if (!p->br->dev->npinfo)
+ return 0;
+
+ return __br_netpoll_enable(p, gfp);
+}
+
static int br_netpoll_setup(struct net_device *dev, struct netpoll_info *ni,
gfp_t gfp)
{
@@ -236,7 +263,7 @@ static int br_netpoll_setup(struct net_device *dev, struct netpoll_info *ni,
list_for_each_entry(p, &br->port_list, list) {
if (!p->dev)
continue;
- err = br_netpoll_enable(p, gfp);
+ err = __br_netpoll_enable(p, gfp);
if (err)
goto fail;
}
@@ -249,28 +276,6 @@ fail:
goto out;
}
-int br_netpoll_enable(struct net_bridge_port *p, gfp_t gfp)
-{
- struct netpoll *np;
- int err;
-
- if (!p->br->dev->npinfo)
- return 0;
-
- np = kzalloc(sizeof(*p->np), gfp);
- if (!np)
- return -ENOMEM;
-
- err = __netpoll_setup(np, p->dev, gfp);
- if (err) {
- kfree(np);
- return err;
- }
-
- p->np = np;
- return err;
-}
-
void br_netpoll_disable(struct net_bridge_port *p)
{
struct netpoll *np = p->np;
diff --git a/net/can/af_can.c b/net/can/af_can.c
index d249874a366d..a27f8aad9e99 100644
--- a/net/can/af_can.c
+++ b/net/can/af_can.c
@@ -57,6 +57,7 @@
#include <linux/skbuff.h>
#include <linux/can.h>
#include <linux/can/core.h>
+#include <linux/can/skb.h>
#include <linux/ratelimit.h>
#include <net/net_namespace.h>
#include <net/sock.h>
@@ -290,7 +291,7 @@ int can_send(struct sk_buff *skb, int loop)
return -ENOMEM;
}
- newskb->sk = skb->sk;
+ can_skb_set_owner(newskb, skb->sk);
newskb->ip_summed = CHECKSUM_UNNECESSARY;
newskb->pkt_type = PACKET_BROADCAST;
}
diff --git a/net/can/bcm.c b/net/can/bcm.c
index 3fc737b214c7..dcb75c0e66c1 100644
--- a/net/can/bcm.c
+++ b/net/can/bcm.c
@@ -268,7 +268,7 @@ static void bcm_can_tx(struct bcm_op *op)
/* send with loopback */
skb->dev = dev;
- skb->sk = op->sk;
+ can_skb_set_owner(skb, op->sk);
can_send(skb, 1);
/* update statistics */
@@ -1223,7 +1223,7 @@ static int bcm_tx_send(struct msghdr *msg, int ifindex, struct sock *sk)
can_skb_prv(skb)->ifindex = dev->ifindex;
skb->dev = dev;
- skb->sk = sk;
+ can_skb_set_owner(skb, sk);
err = can_send(skb, 1); /* send with loopback */
dev_put(dev);
diff --git a/net/can/raw.c b/net/can/raw.c
index 07d72d852324..8be757cca2ec 100644
--- a/net/can/raw.c
+++ b/net/can/raw.c
@@ -715,6 +715,7 @@ static int raw_sendmsg(struct kiocb *iocb, struct socket *sock,
skb->dev = dev;
skb->sk = sk;
+ skb->priority = sk->sk_priority;
err = can_send(skb, ro->loopback);
diff --git a/net/ceph/buffer.c b/net/ceph/buffer.c
index bf3e6a13c215..621b5f65407f 100644
--- a/net/ceph/buffer.c
+++ b/net/ceph/buffer.c
@@ -6,6 +6,7 @@
#include <linux/ceph/buffer.h>
#include <linux/ceph/decode.h>
+#include <linux/ceph/libceph.h> /* for ceph_kv{malloc,free} */
struct ceph_buffer *ceph_buffer_new(size_t len, gfp_t gfp)
{
@@ -15,16 +16,10 @@ struct ceph_buffer *ceph_buffer_new(size_t len, gfp_t gfp)
if (!b)
return NULL;
- b->vec.iov_base = kmalloc(len, gfp | __GFP_NOWARN);
- if (b->vec.iov_base) {
- b->is_vmalloc = false;
- } else {
- b->vec.iov_base = __vmalloc(len, gfp | __GFP_HIGHMEM, PAGE_KERNEL);
- if (!b->vec.iov_base) {
- kfree(b);
- return NULL;
- }
- b->is_vmalloc = true;
+ b->vec.iov_base = ceph_kvmalloc(len, gfp);
+ if (!b->vec.iov_base) {
+ kfree(b);
+ return NULL;
}
kref_init(&b->kref);
@@ -40,12 +35,7 @@ void ceph_buffer_release(struct kref *kref)
struct ceph_buffer *b = container_of(kref, struct ceph_buffer, kref);
dout("buffer_release %p\n", b);
- if (b->vec.iov_base) {
- if (b->is_vmalloc)
- vfree(b->vec.iov_base);
- else
- kfree(b->vec.iov_base);
- }
+ ceph_kvfree(b->vec.iov_base);
kfree(b);
}
EXPORT_SYMBOL(ceph_buffer_release);
diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c
index 34b11ee8124e..67d7721d237e 100644
--- a/net/ceph/ceph_common.c
+++ b/net/ceph/ceph_common.c
@@ -15,6 +15,7 @@
#include <linux/slab.h>
#include <linux/statfs.h>
#include <linux/string.h>
+#include <linux/vmalloc.h>
#include <linux/nsproxy.h>
#include <net/net_namespace.h>
@@ -170,6 +171,25 @@ int ceph_compare_options(struct ceph_options *new_opt,
}
EXPORT_SYMBOL(ceph_compare_options);
+void *ceph_kvmalloc(size_t size, gfp_t flags)
+{
+ if (size <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) {
+ void *ptr = kmalloc(size, flags | __GFP_NOWARN);
+ if (ptr)
+ return ptr;
+ }
+
+ return __vmalloc(size, flags | __GFP_HIGHMEM, PAGE_KERNEL);
+}
+
+void ceph_kvfree(const void *ptr)
+{
+ if (is_vmalloc_addr(ptr))
+ vfree(ptr);
+ else
+ kfree(ptr);
+}
+
static int parse_fsid(const char *str, struct ceph_fsid *fsid)
{
@@ -461,8 +481,8 @@ EXPORT_SYMBOL(ceph_client_id);
* create a fresh client instance
*/
struct ceph_client *ceph_create_client(struct ceph_options *opt, void *private,
- unsigned int supported_features,
- unsigned int required_features)
+ u64 supported_features,
+ u64 required_features)
{
struct ceph_client *client;
struct ceph_entity_addr *myaddr = NULL;
diff --git a/net/ceph/crush/crush.c b/net/ceph/crush/crush.c
index 089613234f03..16bc199d9a62 100644
--- a/net/ceph/crush/crush.c
+++ b/net/ceph/crush/crush.c
@@ -116,11 +116,14 @@ void crush_destroy(struct crush_map *map)
if (map->rules) {
__u32 b;
for (b = 0; b < map->max_rules; b++)
- kfree(map->rules[b]);
+ crush_destroy_rule(map->rules[b]);
kfree(map->rules);
}
kfree(map);
}
-
+void crush_destroy_rule(struct crush_rule *rule)
+{
+ kfree(rule);
+}
diff --git a/net/ceph/crush/mapper.c b/net/ceph/crush/mapper.c
index cbd06a91941c..b703790b4e44 100644
--- a/net/ceph/crush/mapper.c
+++ b/net/ceph/crush/mapper.c
@@ -189,7 +189,7 @@ static int terminal(int x)
static int bucket_tree_choose(struct crush_bucket_tree *bucket,
int x, int r)
{
- int n, l;
+ int n;
__u32 w;
__u64 t;
@@ -197,6 +197,7 @@ static int bucket_tree_choose(struct crush_bucket_tree *bucket,
n = bucket->num_nodes >> 1;
while (!terminal(n)) {
+ int l;
/* pick point in [0, w) */
w = bucket->node_weights[n];
t = (__u64)crush_hash32_4(bucket->h.hash, x, n, r,
@@ -264,8 +265,12 @@ static int crush_bucket_choose(struct crush_bucket *in, int x, int r)
* true if device is marked "out" (failed, fully offloaded)
* of the cluster
*/
-static int is_out(const struct crush_map *map, const __u32 *weight, int item, int x)
+static int is_out(const struct crush_map *map,
+ const __u32 *weight, int weight_max,
+ int item, int x)
{
+ if (item >= weight_max)
+ return 1;
if (weight[item] >= 0x10000)
return 0;
if (weight[item] == 0)
@@ -277,7 +282,7 @@ static int is_out(const struct crush_map *map, const __u32 *weight, int item, in
}
/**
- * crush_choose - choose numrep distinct items of given type
+ * crush_choose_firstn - choose numrep distinct items of given type
* @map: the crush_map
* @bucket: the bucket we are choose an item from
* @x: crush input value
@@ -285,18 +290,24 @@ static int is_out(const struct crush_map *map, const __u32 *weight, int item, in
* @type: the type of item to choose
* @out: pointer to output vector
* @outpos: our position in that vector
- * @firstn: true if choosing "first n" items, false if choosing "indep"
- * @recurse_to_leaf: true if we want one device under each item of given type
- * @descend_once: true if we should only try one descent before giving up
+ * @tries: number of attempts to make
+ * @recurse_tries: number of attempts to have recursive chooseleaf make
+ * @local_tries: localized retries
+ * @local_fallback_tries: localized fallback retries
+ * @recurse_to_leaf: true if we want one device under each item of given type (chooseleaf instead of choose)
* @out2: second output vector for leaf items (if @recurse_to_leaf)
*/
-static int crush_choose(const struct crush_map *map,
- struct crush_bucket *bucket,
- const __u32 *weight,
- int x, int numrep, int type,
- int *out, int outpos,
- int firstn, int recurse_to_leaf,
- int descend_once, int *out2)
+static int crush_choose_firstn(const struct crush_map *map,
+ struct crush_bucket *bucket,
+ const __u32 *weight, int weight_max,
+ int x, int numrep, int type,
+ int *out, int outpos,
+ unsigned int tries,
+ unsigned int recurse_tries,
+ unsigned int local_tries,
+ unsigned int local_fallback_tries,
+ int recurse_to_leaf,
+ int *out2)
{
int rep;
unsigned int ftotal, flocal;
@@ -325,35 +336,17 @@ static int crush_choose(const struct crush_map *map,
collide = 0;
retry_bucket = 0;
r = rep;
- if (in->alg == CRUSH_BUCKET_UNIFORM) {
- /* be careful */
- if (firstn || (__u32)numrep >= in->size)
- /* r' = r + f_total */
- r += ftotal;
- else if (in->size % numrep == 0)
- /* r'=r+(n+1)*f_local */
- r += (numrep+1) *
- (flocal+ftotal);
- else
- /* r' = r + n*f_local */
- r += numrep * (flocal+ftotal);
- } else {
- if (firstn)
- /* r' = r + f_total */
- r += ftotal;
- else
- /* r' = r + n*f_local */
- r += numrep * (flocal+ftotal);
- }
+ /* r' = r + f_total */
+ r += ftotal;
/* bucket choose */
if (in->size == 0) {
reject = 1;
goto reject;
}
- if (map->choose_local_fallback_tries > 0 &&
+ if (local_fallback_tries > 0 &&
flocal >= (in->size>>1) &&
- flocal > map->choose_local_fallback_tries)
+ flocal > local_fallback_tries)
item = bucket_perm_choose(in, x, r);
else
item = crush_bucket_choose(in, x, r);
@@ -394,13 +387,15 @@ static int crush_choose(const struct crush_map *map,
reject = 0;
if (!collide && recurse_to_leaf) {
if (item < 0) {
- if (crush_choose(map,
+ if (crush_choose_firstn(map,
map->buckets[-1-item],
- weight,
+ weight, weight_max,
x, outpos+1, 0,
out2, outpos,
- firstn, 0,
- map->chooseleaf_descend_once,
+ recurse_tries, 0,
+ local_tries,
+ local_fallback_tries,
+ 0,
NULL) <= outpos)
/* didn't get leaf */
reject = 1;
@@ -414,6 +409,7 @@ static int crush_choose(const struct crush_map *map,
/* out? */
if (itemtype == 0)
reject = is_out(map, weight,
+ weight_max,
item, x);
else
reject = 0;
@@ -424,17 +420,14 @@ reject:
ftotal++;
flocal++;
- if (reject && descend_once)
- /* let outer call try again */
- skip_rep = 1;
- else if (collide && flocal <= map->choose_local_tries)
+ if (collide && flocal <= local_tries)
/* retry locally a few times */
retry_bucket = 1;
- else if (map->choose_local_fallback_tries > 0 &&
- flocal <= in->size + map->choose_local_fallback_tries)
+ else if (local_fallback_tries > 0 &&
+ flocal <= in->size + local_fallback_tries)
/* exhaustive bucket search */
retry_bucket = 1;
- else if (ftotal <= map->choose_total_tries)
+ else if (ftotal <= tries)
/* then retry descent */
retry_descent = 1;
else
@@ -464,21 +457,179 @@ reject:
/**
+ * crush_choose_indep: alternative breadth-first positionally stable mapping
+ *
+ */
+static void crush_choose_indep(const struct crush_map *map,
+ struct crush_bucket *bucket,
+ const __u32 *weight, int weight_max,
+ int x, int left, int numrep, int type,
+ int *out, int outpos,
+ unsigned int tries,
+ unsigned int recurse_tries,
+ int recurse_to_leaf,
+ int *out2,
+ int parent_r)
+{
+ struct crush_bucket *in = bucket;
+ int endpos = outpos + left;
+ int rep;
+ unsigned int ftotal;
+ int r;
+ int i;
+ int item = 0;
+ int itemtype;
+ int collide;
+
+ dprintk("CHOOSE%s INDEP bucket %d x %d outpos %d numrep %d\n", recurse_to_leaf ? "_LEAF" : "",
+ bucket->id, x, outpos, numrep);
+
+ /* initially my result is undefined */
+ for (rep = outpos; rep < endpos; rep++) {
+ out[rep] = CRUSH_ITEM_UNDEF;
+ if (out2)
+ out2[rep] = CRUSH_ITEM_UNDEF;
+ }
+
+ for (ftotal = 0; left > 0 && ftotal < tries; ftotal++) {
+ for (rep = outpos; rep < endpos; rep++) {
+ if (out[rep] != CRUSH_ITEM_UNDEF)
+ continue;
+
+ in = bucket; /* initial bucket */
+
+ /* choose through intervening buckets */
+ for (;;) {
+ /* note: we base the choice on the position
+ * even in the nested call. that means that
+ * if the first layer chooses the same bucket
+ * in a different position, we will tend to
+ * choose a different item in that bucket.
+ * this will involve more devices in data
+ * movement and tend to distribute the load.
+ */
+ r = rep + parent_r;
+
+ /* be careful */
+ if (in->alg == CRUSH_BUCKET_UNIFORM &&
+ in->size % numrep == 0)
+ /* r'=r+(n+1)*f_total */
+ r += (numrep+1) * ftotal;
+ else
+ /* r' = r + n*f_total */
+ r += numrep * ftotal;
+
+ /* bucket choose */
+ if (in->size == 0) {
+ dprintk(" empty bucket\n");
+ break;
+ }
+
+ item = crush_bucket_choose(in, x, r);
+ if (item >= map->max_devices) {
+ dprintk(" bad item %d\n", item);
+ out[rep] = CRUSH_ITEM_NONE;
+ if (out2)
+ out2[rep] = CRUSH_ITEM_NONE;
+ left--;
+ break;
+ }
+
+ /* desired type? */
+ if (item < 0)
+ itemtype = map->buckets[-1-item]->type;
+ else
+ itemtype = 0;
+ dprintk(" item %d type %d\n", item, itemtype);
+
+ /* keep going? */
+ if (itemtype != type) {
+ if (item >= 0 ||
+ (-1-item) >= map->max_buckets) {
+ dprintk(" bad item type %d\n", type);
+ out[rep] = CRUSH_ITEM_NONE;
+ if (out2)
+ out2[rep] =
+ CRUSH_ITEM_NONE;
+ left--;
+ break;
+ }
+ in = map->buckets[-1-item];
+ continue;
+ }
+
+ /* collision? */
+ collide = 0;
+ for (i = outpos; i < endpos; i++) {
+ if (out[i] == item) {
+ collide = 1;
+ break;
+ }
+ }
+ if (collide)
+ break;
+
+ if (recurse_to_leaf) {
+ if (item < 0) {
+ crush_choose_indep(map,
+ map->buckets[-1-item],
+ weight, weight_max,
+ x, 1, numrep, 0,
+ out2, rep,
+ recurse_tries, 0,
+ 0, NULL, r);
+ if (out2[rep] == CRUSH_ITEM_NONE) {
+ /* placed nothing; no leaf */
+ break;
+ }
+ } else {
+ /* we already have a leaf! */
+ out2[rep] = item;
+ }
+ }
+
+ /* out? */
+ if (itemtype == 0 &&
+ is_out(map, weight, weight_max, item, x))
+ break;
+
+ /* yay! */
+ out[rep] = item;
+ left--;
+ break;
+ }
+ }
+ }
+ for (rep = outpos; rep < endpos; rep++) {
+ if (out[rep] == CRUSH_ITEM_UNDEF) {
+ out[rep] = CRUSH_ITEM_NONE;
+ }
+ if (out2 && out2[rep] == CRUSH_ITEM_UNDEF) {
+ out2[rep] = CRUSH_ITEM_NONE;
+ }
+ }
+}
+
+/**
* crush_do_rule - calculate a mapping with the given input and rule
* @map: the crush_map
* @ruleno: the rule id
* @x: hash input
* @result: pointer to result vector
* @result_max: maximum result size
+ * @weight: weight vector (for map leaves)
+ * @weight_max: size of weight vector
+ * @scratch: scratch vector for private use; must be >= 3 * result_max
*/
int crush_do_rule(const struct crush_map *map,
int ruleno, int x, int *result, int result_max,
- const __u32 *weight)
+ const __u32 *weight, int weight_max,
+ int *scratch)
{
int result_len;
- int a[CRUSH_MAX_SET];
- int b[CRUSH_MAX_SET];
- int c[CRUSH_MAX_SET];
+ int *a = scratch;
+ int *b = scratch + result_max;
+ int *c = scratch + result_max*2;
int recurse_to_leaf;
int *w;
int wsize = 0;
@@ -489,8 +640,10 @@ int crush_do_rule(const struct crush_map *map,
__u32 step;
int i, j;
int numrep;
- int firstn;
- const int descend_once = 0;
+ int choose_tries = map->choose_total_tries;
+ int choose_local_tries = map->choose_local_tries;
+ int choose_local_fallback_tries = map->choose_local_fallback_tries;
+ int choose_leaf_tries = 0;
if ((__u32)ruleno >= map->max_rules) {
dprintk(" bad ruleno %d\n", ruleno);
@@ -503,29 +656,49 @@ int crush_do_rule(const struct crush_map *map,
o = b;
for (step = 0; step < rule->len; step++) {
+ int firstn = 0;
struct crush_rule_step *curstep = &rule->steps[step];
- firstn = 0;
switch (curstep->op) {
case CRUSH_RULE_TAKE:
w[0] = curstep->arg1;
wsize = 1;
break;
- case CRUSH_RULE_CHOOSE_LEAF_FIRSTN:
+ case CRUSH_RULE_SET_CHOOSE_TRIES:
+ if (curstep->arg1 > 0)
+ choose_tries = curstep->arg1;
+ break;
+
+ case CRUSH_RULE_SET_CHOOSELEAF_TRIES:
+ if (curstep->arg1 > 0)
+ choose_leaf_tries = curstep->arg1;
+ break;
+
+ case CRUSH_RULE_SET_CHOOSE_LOCAL_TRIES:
+ if (curstep->arg1 > 0)
+ choose_local_tries = curstep->arg1;
+ break;
+
+ case CRUSH_RULE_SET_CHOOSE_LOCAL_FALLBACK_TRIES:
+ if (curstep->arg1 > 0)
+ choose_local_fallback_tries = curstep->arg1;
+ break;
+
+ case CRUSH_RULE_CHOOSELEAF_FIRSTN:
case CRUSH_RULE_CHOOSE_FIRSTN:
firstn = 1;
/* fall through */
- case CRUSH_RULE_CHOOSE_LEAF_INDEP:
+ case CRUSH_RULE_CHOOSELEAF_INDEP:
case CRUSH_RULE_CHOOSE_INDEP:
if (wsize == 0)
break;
recurse_to_leaf =
curstep->op ==
- CRUSH_RULE_CHOOSE_LEAF_FIRSTN ||
+ CRUSH_RULE_CHOOSELEAF_FIRSTN ||
curstep->op ==
- CRUSH_RULE_CHOOSE_LEAF_INDEP;
+ CRUSH_RULE_CHOOSELEAF_INDEP;
/* reset output */
osize = 0;
@@ -543,22 +716,51 @@ int crush_do_rule(const struct crush_map *map,
continue;
}
j = 0;
- osize += crush_choose(map,
- map->buckets[-1-w[i]],
- weight,
- x, numrep,
- curstep->arg2,
- o+osize, j,
- firstn,
- recurse_to_leaf,
- descend_once, c+osize);
+ if (firstn) {
+ int recurse_tries;
+ if (choose_leaf_tries)
+ recurse_tries =
+ choose_leaf_tries;
+ else if (map->chooseleaf_descend_once)
+ recurse_tries = 1;
+ else
+ recurse_tries = choose_tries;
+ osize += crush_choose_firstn(
+ map,
+ map->buckets[-1-w[i]],
+ weight, weight_max,
+ x, numrep,
+ curstep->arg2,
+ o+osize, j,
+ choose_tries,
+ recurse_tries,
+ choose_local_tries,
+ choose_local_fallback_tries,
+ recurse_to_leaf,
+ c+osize);
+ } else {
+ crush_choose_indep(
+ map,
+ map->buckets[-1-w[i]],
+ weight, weight_max,
+ x, numrep, numrep,
+ curstep->arg2,
+ o+osize, j,
+ choose_tries,
+ choose_leaf_tries ?
+ choose_leaf_tries : 1,
+ recurse_to_leaf,
+ c+osize,
+ 0);
+ osize += numrep;
+ }
}
if (recurse_to_leaf)
/* copy final _leaf_ values to output set */
memcpy(o, c, osize*sizeof(*o));
- /* swap t and w arrays */
+ /* swap o and w arrays */
tmp = o;
o = w;
w = tmp;
diff --git a/net/ceph/debugfs.c b/net/ceph/debugfs.c
index 83661cdc0766..258a382e75ed 100644
--- a/net/ceph/debugfs.c
+++ b/net/ceph/debugfs.c
@@ -132,7 +132,8 @@ static int osdc_show(struct seq_file *s, void *pp)
req->r_osd ? req->r_osd->o_osd : -1,
req->r_pgid.pool, req->r_pgid.seed);
- seq_printf(s, "%.*s", req->r_oid_len, req->r_oid);
+ seq_printf(s, "%.*s", req->r_base_oid.name_len,
+ req->r_base_oid.name);
if (req->r_reassert_version.epoch)
seq_printf(s, "\t%u'%llu",
diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
index 4a5df7b1cc9f..0e478a0f4204 100644
--- a/net/ceph/messenger.c
+++ b/net/ceph/messenger.c
@@ -15,6 +15,7 @@
#include <linux/dns_resolver.h>
#include <net/tcp.h>
+#include <linux/ceph/ceph_features.h>
#include <linux/ceph/libceph.h>
#include <linux/ceph/messenger.h>
#include <linux/ceph/decode.h>
@@ -777,13 +778,12 @@ static void ceph_msg_data_bio_cursor_init(struct ceph_msg_data_cursor *cursor,
bio = data->bio;
BUG_ON(!bio);
- BUG_ON(!bio->bi_vcnt);
cursor->resid = min(length, data->bio_length);
cursor->bio = bio;
- cursor->vector_index = 0;
- cursor->vector_offset = 0;
- cursor->last_piece = length <= bio->bi_io_vec[0].bv_len;
+ cursor->bvec_iter = bio->bi_iter;
+ cursor->last_piece =
+ cursor->resid <= bio_iter_len(bio, cursor->bvec_iter);
}
static struct page *ceph_msg_data_bio_next(struct ceph_msg_data_cursor *cursor,
@@ -792,71 +792,63 @@ static struct page *ceph_msg_data_bio_next(struct ceph_msg_data_cursor *cursor,
{
struct ceph_msg_data *data = cursor->data;
struct bio *bio;
- struct bio_vec *bio_vec;
- unsigned int index;
+ struct bio_vec bio_vec;
BUG_ON(data->type != CEPH_MSG_DATA_BIO);
bio = cursor->bio;
BUG_ON(!bio);
- index = cursor->vector_index;
- BUG_ON(index >= (unsigned int) bio->bi_vcnt);
+ bio_vec = bio_iter_iovec(bio, cursor->bvec_iter);
- bio_vec = &bio->bi_io_vec[index];
- BUG_ON(cursor->vector_offset >= bio_vec->bv_len);
- *page_offset = (size_t) (bio_vec->bv_offset + cursor->vector_offset);
+ *page_offset = (size_t) bio_vec.bv_offset;
BUG_ON(*page_offset >= PAGE_SIZE);
if (cursor->last_piece) /* pagelist offset is always 0 */
*length = cursor->resid;
else
- *length = (size_t) (bio_vec->bv_len - cursor->vector_offset);
+ *length = (size_t) bio_vec.bv_len;
BUG_ON(*length > cursor->resid);
BUG_ON(*page_offset + *length > PAGE_SIZE);
- return bio_vec->bv_page;
+ return bio_vec.bv_page;
}
static bool ceph_msg_data_bio_advance(struct ceph_msg_data_cursor *cursor,
size_t bytes)
{
struct bio *bio;
- struct bio_vec *bio_vec;
- unsigned int index;
+ struct bio_vec bio_vec;
BUG_ON(cursor->data->type != CEPH_MSG_DATA_BIO);
bio = cursor->bio;
BUG_ON(!bio);
- index = cursor->vector_index;
- BUG_ON(index >= (unsigned int) bio->bi_vcnt);
- bio_vec = &bio->bi_io_vec[index];
+ bio_vec = bio_iter_iovec(bio, cursor->bvec_iter);
/* Advance the cursor offset */
BUG_ON(cursor->resid < bytes);
cursor->resid -= bytes;
- cursor->vector_offset += bytes;
- if (cursor->vector_offset < bio_vec->bv_len)
+
+ bio_advance_iter(bio, &cursor->bvec_iter, bytes);
+
+ if (bytes < bio_vec.bv_len)
return false; /* more bytes to process in this segment */
- BUG_ON(cursor->vector_offset != bio_vec->bv_len);
/* Move on to the next segment, and possibly the next bio */
- if (++index == (unsigned int) bio->bi_vcnt) {
+ if (!cursor->bvec_iter.bi_size) {
bio = bio->bi_next;
- index = 0;
+ cursor->bvec_iter = bio->bi_iter;
}
cursor->bio = bio;
- cursor->vector_index = index;
- cursor->vector_offset = 0;
if (!cursor->last_piece) {
BUG_ON(!cursor->resid);
BUG_ON(!bio);
/* A short read is OK, so use <= rather than == */
- if (cursor->resid <= bio->bi_io_vec[index].bv_len)
+ if (cursor->resid <= bio_iter_len(bio, cursor->bvec_iter))
cursor->last_piece = true;
}
@@ -1865,7 +1857,9 @@ int ceph_parse_ips(const char *c, const char *end,
port = (port * 10) + (*p - '0');
p++;
}
- if (port > 65535 || port == 0)
+ if (port == 0)
+ port = CEPH_MON_PORT;
+ else if (port > 65535)
goto bad;
} else {
port = CEPH_MON_PORT;
@@ -1945,7 +1939,8 @@ static int process_connect(struct ceph_connection *con)
{
u64 sup_feat = con->msgr->supported_features;
u64 req_feat = con->msgr->required_features;
- u64 server_feat = le64_to_cpu(con->in_reply.features);
+ u64 server_feat = ceph_sanitize_features(
+ le64_to_cpu(con->in_reply.features));
int ret;
dout("process_connect on %p tag %d\n", con, (int)con->in_tag);
@@ -2853,8 +2848,8 @@ static void con_fault(struct ceph_connection *con)
*/
void ceph_messenger_init(struct ceph_messenger *msgr,
struct ceph_entity_addr *myaddr,
- u32 supported_features,
- u32 required_features,
+ u64 supported_features,
+ u64 required_features,
bool nocrc)
{
msgr->supported_features = supported_features;
@@ -3126,15 +3121,8 @@ struct ceph_msg *ceph_msg_new(int type, int front_len, gfp_t flags,
INIT_LIST_HEAD(&m->data);
/* front */
- m->front_max = front_len;
if (front_len) {
- if (front_len > PAGE_CACHE_SIZE) {
- m->front.iov_base = __vmalloc(front_len, flags,
- PAGE_KERNEL);
- m->front_is_vmalloc = true;
- } else {
- m->front.iov_base = kmalloc(front_len, flags);
- }
+ m->front.iov_base = ceph_kvmalloc(front_len, flags);
if (m->front.iov_base == NULL) {
dout("ceph_msg_new can't allocate %d bytes\n",
front_len);
@@ -3143,7 +3131,7 @@ struct ceph_msg *ceph_msg_new(int type, int front_len, gfp_t flags,
} else {
m->front.iov_base = NULL;
}
- m->front.iov_len = front_len;
+ m->front_alloc_len = m->front.iov_len = front_len;
dout("ceph_msg_new %p front %d\n", m, front_len);
return m;
@@ -3256,10 +3244,7 @@ static int ceph_con_in_msg_alloc(struct ceph_connection *con, int *skip)
void ceph_msg_kfree(struct ceph_msg *m)
{
dout("msg_kfree %p\n", m);
- if (m->front_is_vmalloc)
- vfree(m->front.iov_base);
- else
- kfree(m->front.iov_base);
+ ceph_kvfree(m->front.iov_base);
kmem_cache_free(ceph_msg_cache, m);
}
@@ -3301,8 +3286,8 @@ EXPORT_SYMBOL(ceph_msg_last_put);
void ceph_msg_dump(struct ceph_msg *msg)
{
- pr_debug("msg_dump %p (front_max %d length %zd)\n", msg,
- msg->front_max, msg->data_length);
+ pr_debug("msg_dump %p (front_alloc_len %d length %zd)\n", msg,
+ msg->front_alloc_len, msg->data_length);
print_hex_dump(KERN_DEBUG, "header: ",
DUMP_PREFIX_OFFSET, 16, 1,
&msg->hdr, sizeof(msg->hdr), true);
diff --git a/net/ceph/mon_client.c b/net/ceph/mon_client.c
index 1fe25cd29d0e..2ac9ef35110b 100644
--- a/net/ceph/mon_client.c
+++ b/net/ceph/mon_client.c
@@ -152,7 +152,7 @@ static int __open_session(struct ceph_mon_client *monc)
/* initiatiate authentication handshake */
ret = ceph_auth_build_hello(monc->auth,
monc->m_auth->front.iov_base,
- monc->m_auth->front_max);
+ monc->m_auth->front_alloc_len);
__send_prepared_auth_request(monc, ret);
} else {
dout("open_session mon%d already open\n", monc->cur_mon);
@@ -196,7 +196,7 @@ static void __send_subscribe(struct ceph_mon_client *monc)
int num;
p = msg->front.iov_base;
- end = p + msg->front_max;
+ end = p + msg->front_alloc_len;
num = 1 + !!monc->want_next_osdmap + !!monc->want_mdsmap;
ceph_encode_32(&p, num);
@@ -897,7 +897,7 @@ static void handle_auth_reply(struct ceph_mon_client *monc,
ret = ceph_handle_auth_reply(monc->auth, msg->front.iov_base,
msg->front.iov_len,
monc->m_auth->front.iov_base,
- monc->m_auth->front_max);
+ monc->m_auth->front_alloc_len);
if (ret < 0) {
monc->client->auth_err = ret;
wake_up_all(&monc->client->auth_wq);
@@ -939,7 +939,7 @@ static int __validate_auth(struct ceph_mon_client *monc)
return 0;
ret = ceph_build_auth(monc->auth, monc->m_auth->front.iov_base,
- monc->m_auth->front_max);
+ monc->m_auth->front_alloc_len);
if (ret <= 0)
return ret; /* either an error, or no need to authenticate */
__send_prepared_auth_request(monc, ret);
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index 2b4b32aaa893..010ff3bd58ad 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -338,7 +338,7 @@ struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc,
msg_size = 4 + 4 + 8 + 8 + 4+8;
msg_size += 2 + 4 + 8 + 4 + 4; /* oloc */
msg_size += 1 + 8 + 4 + 4; /* pg_t */
- msg_size += 4 + MAX_OBJ_NAME_SIZE;
+ msg_size += 4 + CEPH_MAX_OID_NAME_LEN; /* oid */
msg_size += 2 + num_ops*sizeof(struct ceph_osd_op);
msg_size += 8; /* snapid */
msg_size += 8; /* snap_seq */
@@ -368,6 +368,9 @@ struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc,
INIT_LIST_HEAD(&req->r_req_lru_item);
INIT_LIST_HEAD(&req->r_osd_item);
+ req->r_base_oloc.pool = -1;
+ req->r_target_oloc.pool = -1;
+
/* create reply message */
if (use_mempool)
msg = ceph_msgpool_get(&osdc->msgpool_op_reply, 0);
@@ -761,11 +764,11 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc,
if (num_ops > 1)
osd_req_op_init(req, 1, CEPH_OSD_OP_STARTSYNC);
- req->r_file_layout = *layout; /* keep a copy */
+ req->r_base_oloc.pool = ceph_file_layout_pg_pool(*layout);
- snprintf(req->r_oid, sizeof(req->r_oid), "%llx.%08llx",
- vino.ino, objnum);
- req->r_oid_len = strlen(req->r_oid);
+ snprintf(req->r_base_oid.name, sizeof(req->r_base_oid.name),
+ "%llx.%08llx", vino.ino, objnum);
+ req->r_base_oid.name_len = strlen(req->r_base_oid.name);
return req;
}
@@ -1044,8 +1047,8 @@ static int __reset_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd)
!ceph_con_opened(&osd->o_con)) {
struct ceph_osd_request *req;
- dout(" osd addr hasn't changed and connection never opened,"
- " letting msgr retry");
+ dout("osd addr hasn't changed and connection never opened, "
+ "letting msgr retry\n");
/* touch each r_stamp for handle_timeout()'s benfit */
list_for_each_entry(req, &osd->o_requests, r_osd_item)
req->r_stamp = jiffies;
@@ -1232,6 +1235,61 @@ void ceph_osdc_set_request_linger(struct ceph_osd_client *osdc,
EXPORT_SYMBOL(ceph_osdc_set_request_linger);
/*
+ * Returns whether a request should be blocked from being sent
+ * based on the current osdmap and osd_client settings.
+ *
+ * Caller should hold map_sem for read.
+ */
+static bool __req_should_be_paused(struct ceph_osd_client *osdc,
+ struct ceph_osd_request *req)
+{
+ bool pauserd = ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSERD);
+ bool pausewr = ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSEWR) ||
+ ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL);
+ return (req->r_flags & CEPH_OSD_FLAG_READ && pauserd) ||
+ (req->r_flags & CEPH_OSD_FLAG_WRITE && pausewr);
+}
+
+/*
+ * Calculate mapping of a request to a PG. Takes tiering into account.
+ */
+static int __calc_request_pg(struct ceph_osdmap *osdmap,
+ struct ceph_osd_request *req,
+ struct ceph_pg *pg_out)
+{
+ bool need_check_tiering;
+
+ need_check_tiering = false;
+ if (req->r_target_oloc.pool == -1) {
+ req->r_target_oloc = req->r_base_oloc; /* struct */
+ need_check_tiering = true;
+ }
+ if (req->r_target_oid.name_len == 0) {
+ ceph_oid_copy(&req->r_target_oid, &req->r_base_oid);
+ need_check_tiering = true;
+ }
+
+ if (need_check_tiering &&
+ (req->r_flags & CEPH_OSD_FLAG_IGNORE_OVERLAY) == 0) {
+ struct ceph_pg_pool_info *pi;
+
+ pi = ceph_pg_pool_by_id(osdmap, req->r_target_oloc.pool);
+ if (pi) {
+ if ((req->r_flags & CEPH_OSD_FLAG_READ) &&
+ pi->read_tier >= 0)
+ req->r_target_oloc.pool = pi->read_tier;
+ if ((req->r_flags & CEPH_OSD_FLAG_WRITE) &&
+ pi->write_tier >= 0)
+ req->r_target_oloc.pool = pi->write_tier;
+ }
+ /* !pi is caught in ceph_oloc_oid_to_pg() */
+ }
+
+ return ceph_oloc_oid_to_pg(osdmap, &req->r_target_oloc,
+ &req->r_target_oid, pg_out);
+}
+
+/*
* Pick an osd (the first 'up' osd in the pg), allocate the osd struct
* (as needed), and set the request r_osd appropriately. If there is
* no up osd, set r_osd to NULL. Move the request to the appropriate list
@@ -1248,10 +1306,11 @@ static int __map_request(struct ceph_osd_client *osdc,
int acting[CEPH_PG_MAX_SIZE];
int o = -1, num = 0;
int err;
+ bool was_paused;
dout("map_request %p tid %lld\n", req, req->r_tid);
- err = ceph_calc_ceph_pg(&pgid, req->r_oid, osdc->osdmap,
- ceph_file_layout_pg_pool(req->r_file_layout));
+
+ err = __calc_request_pg(osdc->osdmap, req, &pgid);
if (err) {
list_move(&req->r_req_lru_item, &osdc->req_notarget);
return err;
@@ -1264,12 +1323,18 @@ static int __map_request(struct ceph_osd_client *osdc,
num = err;
}
+ was_paused = req->r_paused;
+ req->r_paused = __req_should_be_paused(osdc, req);
+ if (was_paused && !req->r_paused)
+ force_resend = 1;
+
if ((!force_resend &&
req->r_osd && req->r_osd->o_osd == o &&
req->r_sent >= req->r_osd->o_incarnation &&
req->r_num_pg_osds == num &&
memcmp(req->r_pg_osds, acting, sizeof(acting[0])*num) == 0) ||
- (req->r_osd == NULL && o == -1))
+ (req->r_osd == NULL && o == -1) ||
+ req->r_paused)
return 0; /* no change */
dout("map_request tid %llu pgid %lld.%x osd%d (was osd%d)\n",
@@ -1331,7 +1396,7 @@ static void __send_request(struct ceph_osd_client *osdc,
/* fill in message content that changes each time we send it */
put_unaligned_le32(osdc->osdmap->epoch, req->r_request_osdmap_epoch);
put_unaligned_le32(req->r_flags, req->r_request_flags);
- put_unaligned_le64(req->r_pgid.pool, req->r_request_pool);
+ put_unaligned_le64(req->r_target_oloc.pool, req->r_request_pool);
p = req->r_request_pgid;
ceph_encode_64(&p, req->r_pgid.pool);
ceph_encode_32(&p, req->r_pgid.seed);
@@ -1432,6 +1497,109 @@ static void handle_osds_timeout(struct work_struct *work)
round_jiffies_relative(delay));
}
+static int ceph_oloc_decode(void **p, void *end,
+ struct ceph_object_locator *oloc)
+{
+ u8 struct_v, struct_cv;
+ u32 len;
+ void *struct_end;
+ int ret = 0;
+
+ ceph_decode_need(p, end, 1 + 1 + 4, e_inval);
+ struct_v = ceph_decode_8(p);
+ struct_cv = ceph_decode_8(p);
+ if (struct_v < 3) {
+ pr_warn("got v %d < 3 cv %d of ceph_object_locator\n",
+ struct_v, struct_cv);
+ goto e_inval;
+ }
+ if (struct_cv > 6) {
+ pr_warn("got v %d cv %d > 6 of ceph_object_locator\n",
+ struct_v, struct_cv);
+ goto e_inval;
+ }
+ len = ceph_decode_32(p);
+ ceph_decode_need(p, end, len, e_inval);
+ struct_end = *p + len;
+
+ oloc->pool = ceph_decode_64(p);
+ *p += 4; /* skip preferred */
+
+ len = ceph_decode_32(p);
+ if (len > 0) {
+ pr_warn("ceph_object_locator::key is set\n");
+ goto e_inval;
+ }
+
+ if (struct_v >= 5) {
+ len = ceph_decode_32(p);
+ if (len > 0) {
+ pr_warn("ceph_object_locator::nspace is set\n");
+ goto e_inval;
+ }
+ }
+
+ if (struct_v >= 6) {
+ s64 hash = ceph_decode_64(p);
+ if (hash != -1) {
+ pr_warn("ceph_object_locator::hash is set\n");
+ goto e_inval;
+ }
+ }
+
+ /* skip the rest */
+ *p = struct_end;
+out:
+ return ret;
+
+e_inval:
+ ret = -EINVAL;
+ goto out;
+}
+
+static int ceph_redirect_decode(void **p, void *end,
+ struct ceph_request_redirect *redir)
+{
+ u8 struct_v, struct_cv;
+ u32 len;
+ void *struct_end;
+ int ret;
+
+ ceph_decode_need(p, end, 1 + 1 + 4, e_inval);
+ struct_v = ceph_decode_8(p);
+ struct_cv = ceph_decode_8(p);
+ if (struct_cv > 1) {
+ pr_warn("got v %d cv %d > 1 of ceph_request_redirect\n",
+ struct_v, struct_cv);
+ goto e_inval;
+ }
+ len = ceph_decode_32(p);
+ ceph_decode_need(p, end, len, e_inval);
+ struct_end = *p + len;
+
+ ret = ceph_oloc_decode(p, end, &redir->oloc);
+ if (ret)
+ goto out;
+
+ len = ceph_decode_32(p);
+ if (len > 0) {
+ pr_warn("ceph_request_redirect::object_name is set\n");
+ goto e_inval;
+ }
+
+ len = ceph_decode_32(p);
+ *p += len; /* skip osd_instructions */
+
+ /* skip the rest */
+ *p = struct_end;
+out:
+ return ret;
+
+e_inval:
+ ret = -EINVAL;
+ goto out;
+}
+
static void complete_request(struct ceph_osd_request *req)
{
complete_all(&req->r_safe_completion); /* fsync waiter */
@@ -1446,6 +1614,7 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg,
{
void *p, *end;
struct ceph_osd_request *req;
+ struct ceph_request_redirect redir;
u64 tid;
int object_len;
unsigned int numops;
@@ -1525,10 +1694,41 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg,
for (i = 0; i < numops; i++)
req->r_reply_op_result[i] = ceph_decode_32(&p);
- already_completed = req->r_got_reply;
+ if (le16_to_cpu(msg->hdr.version) >= 6) {
+ p += 8 + 4; /* skip replay_version */
+ p += 8; /* skip user_version */
- if (!req->r_got_reply) {
+ err = ceph_redirect_decode(&p, end, &redir);
+ if (err)
+ goto bad_put;
+ } else {
+ redir.oloc.pool = -1;
+ }
+
+ if (redir.oloc.pool != -1) {
+ dout("redirect pool %lld\n", redir.oloc.pool);
+
+ __unregister_request(osdc, req);
+ mutex_unlock(&osdc->request_mutex);
+
+ req->r_target_oloc = redir.oloc; /* struct */
+
+ /*
+ * Start redirect requests with nofail=true. If
+ * mapping fails, request will end up on the notarget
+ * list, waiting for the new osdmap (which can take
+ * a while), even though the original request mapped
+ * successfully. In the future we might want to follow
+ * original request's nofail setting here.
+ */
+ err = ceph_osdc_start_request(osdc, req, true);
+ BUG_ON(err);
+ goto done;
+ }
+
+ already_completed = req->r_got_reply;
+ if (!req->r_got_reply) {
req->r_result = result;
dout("handle_reply result %d bytes %d\n", req->r_result,
bytes);
@@ -1581,6 +1781,13 @@ done:
return;
bad_put:
+ req->r_result = -EIO;
+ __unregister_request(osdc, req);
+ if (req->r_callback)
+ req->r_callback(req, msg);
+ else
+ complete_all(&req->r_completion);
+ complete_request(req);
ceph_osdc_put_request(req);
bad_mutex:
mutex_unlock(&osdc->request_mutex);
@@ -1613,14 +1820,17 @@ static void reset_changed_osds(struct ceph_osd_client *osdc)
*
* Caller should hold map_sem for read.
*/
-static void kick_requests(struct ceph_osd_client *osdc, int force_resend)
+static void kick_requests(struct ceph_osd_client *osdc, bool force_resend,
+ bool force_resend_writes)
{
struct ceph_osd_request *req, *nreq;
struct rb_node *p;
int needmap = 0;
int err;
+ bool force_resend_req;
- dout("kick_requests %s\n", force_resend ? " (force resend)" : "");
+ dout("kick_requests %s %s\n", force_resend ? " (force resend)" : "",
+ force_resend_writes ? " (force resend writes)" : "");
mutex_lock(&osdc->request_mutex);
for (p = rb_first(&osdc->requests); p; ) {
req = rb_entry(p, struct ceph_osd_request, r_node);
@@ -1645,7 +1855,10 @@ static void kick_requests(struct ceph_osd_client *osdc, int force_resend)
continue;
}
- err = __map_request(osdc, req, force_resend);
+ force_resend_req = force_resend ||
+ (force_resend_writes &&
+ req->r_flags & CEPH_OSD_FLAG_WRITE);
+ err = __map_request(osdc, req, force_resend_req);
if (err < 0)
continue; /* error */
if (req->r_osd == NULL) {
@@ -1665,7 +1878,8 @@ static void kick_requests(struct ceph_osd_client *osdc, int force_resend)
r_linger_item) {
dout("linger req=%p req->r_osd=%p\n", req, req->r_osd);
- err = __map_request(osdc, req, force_resend);
+ err = __map_request(osdc, req,
+ force_resend || force_resend_writes);
dout("__map_request returned %d\n", err);
if (err == 0)
continue; /* no change and no osd was specified */
@@ -1707,6 +1921,7 @@ void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg)
struct ceph_osdmap *newmap = NULL, *oldmap;
int err;
struct ceph_fsid fsid;
+ bool was_full;
dout("handle_map have %u\n", osdc->osdmap ? osdc->osdmap->epoch : 0);
p = msg->front.iov_base;
@@ -1720,6 +1935,8 @@ void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg)
down_write(&osdc->map_sem);
+ was_full = ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL);
+
/* incremental maps */
ceph_decode_32_safe(&p, end, nr_maps, bad);
dout(" %d inc maps\n", nr_maps);
@@ -1744,7 +1961,10 @@ void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg)
ceph_osdmap_destroy(osdc->osdmap);
osdc->osdmap = newmap;
}
- kick_requests(osdc, 0);
+ was_full = was_full ||
+ ceph_osdmap_flag(osdc->osdmap,
+ CEPH_OSDMAP_FULL);
+ kick_requests(osdc, 0, was_full);
} else {
dout("ignoring incremental map %u len %d\n",
epoch, maplen);
@@ -1787,7 +2007,10 @@ void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg)
skipped_map = 1;
ceph_osdmap_destroy(oldmap);
}
- kick_requests(osdc, skipped_map);
+ was_full = was_full ||
+ ceph_osdmap_flag(osdc->osdmap,
+ CEPH_OSDMAP_FULL);
+ kick_requests(osdc, skipped_map, was_full);
}
p += maplen;
nr_maps--;
@@ -1804,7 +2027,9 @@ done:
* we find out when we are no longer full and stop returning
* ENOSPC.
*/
- if (ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL))
+ if (ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL) ||
+ ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSERD) ||
+ ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSEWR))
ceph_monc_request_next_osdmap(&osdc->client->monc);
mutex_lock(&osdc->request_mutex);
@@ -2068,10 +2293,11 @@ void ceph_osdc_build_request(struct ceph_osd_request *req, u64 off,
ceph_encode_32(&p, -1); /* preferred */
/* oid */
- ceph_encode_32(&p, req->r_oid_len);
- memcpy(p, req->r_oid, req->r_oid_len);
- dout("oid '%.*s' len %d\n", req->r_oid_len, req->r_oid, req->r_oid_len);
- p += req->r_oid_len;
+ ceph_encode_32(&p, req->r_base_oid.name_len);
+ memcpy(p, req->r_base_oid.name, req->r_base_oid.name_len);
+ dout("oid '%.*s' len %d\n", req->r_base_oid.name_len,
+ req->r_base_oid.name, req->r_base_oid.name_len);
+ p += req->r_base_oid.name_len;
/* ops--can imply data */
ceph_encode_16(&p, (u16)req->r_num_ops);
@@ -2454,7 +2680,7 @@ static struct ceph_msg *get_reply(struct ceph_connection *con,
struct ceph_osd_client *osdc = osd->o_osdc;
struct ceph_msg *m;
struct ceph_osd_request *req;
- int front = le32_to_cpu(hdr->front_len);
+ int front_len = le32_to_cpu(hdr->front_len);
int data_len = le32_to_cpu(hdr->data_len);
u64 tid;
@@ -2474,12 +2700,13 @@ static struct ceph_msg *get_reply(struct ceph_connection *con,
req->r_reply, req->r_reply->con);
ceph_msg_revoke_incoming(req->r_reply);
- if (front > req->r_reply->front.iov_len) {
+ if (front_len > req->r_reply->front_alloc_len) {
pr_warning("get_reply front %d > preallocated %d (%u#%llu)\n",
- front, (int)req->r_reply->front.iov_len,
+ front_len, req->r_reply->front_alloc_len,
(unsigned int)con->peer_name.type,
le64_to_cpu(con->peer_name.num));
- m = ceph_msg_new(CEPH_MSG_OSD_OPREPLY, front, GFP_NOFS, false);
+ m = ceph_msg_new(CEPH_MSG_OSD_OPREPLY, front_len, GFP_NOFS,
+ false);
if (!m)
goto out;
ceph_msg_put(req->r_reply);
diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c
index dbd9a4792427..aade4a5c1c07 100644
--- a/net/ceph/osdmap.c
+++ b/net/ceph/osdmap.c
@@ -464,6 +464,11 @@ static struct ceph_pg_pool_info *__lookup_pg_pool(struct rb_root *root, u64 id)
return NULL;
}
+struct ceph_pg_pool_info *ceph_pg_pool_by_id(struct ceph_osdmap *map, u64 id)
+{
+ return __lookup_pg_pool(&map->pg_pools, id);
+}
+
const char *ceph_pg_pool_name_by_id(struct ceph_osdmap *map, u64 id)
{
struct ceph_pg_pool_info *pi;
@@ -514,8 +519,8 @@ static int __decode_pool(void **p, void *end, struct ceph_pg_pool_info *pi)
pr_warning("got v %d < 5 cv %d of ceph_pg_pool\n", ev, cv);
return -EINVAL;
}
- if (cv > 7) {
- pr_warning("got v %d cv %d > 7 of ceph_pg_pool\n", ev, cv);
+ if (cv > 9) {
+ pr_warning("got v %d cv %d > 9 of ceph_pg_pool\n", ev, cv);
return -EINVAL;
}
len = ceph_decode_32(p);
@@ -543,12 +548,34 @@ static int __decode_pool(void **p, void *end, struct ceph_pg_pool_info *pi)
*p += len;
}
- /* skip removed snaps */
+ /* skip removed_snaps */
num = ceph_decode_32(p);
*p += num * (8 + 8);
*p += 8; /* skip auid */
pi->flags = ceph_decode_64(p);
+ *p += 4; /* skip crash_replay_interval */
+
+ if (ev >= 7)
+ *p += 1; /* skip min_size */
+
+ if (ev >= 8)
+ *p += 8 + 8; /* skip quota_max_* */
+
+ if (ev >= 9) {
+ /* skip tiers */
+ num = ceph_decode_32(p);
+ *p += num * 8;
+
+ *p += 8; /* skip tier_of */
+ *p += 1; /* skip cache_mode */
+
+ pi->read_tier = ceph_decode_64(p);
+ pi->write_tier = ceph_decode_64(p);
+ } else {
+ pi->read_tier = -1;
+ pi->write_tier = -1;
+ }
/* ignore the rest */
@@ -1090,25 +1117,40 @@ invalid:
EXPORT_SYMBOL(ceph_calc_file_object_mapping);
/*
- * calculate an object layout (i.e. pgid) from an oid,
- * file_layout, and osdmap
+ * Calculate mapping of a (oloc, oid) pair to a PG. Should only be
+ * called with target's (oloc, oid), since tiering isn't taken into
+ * account.
*/
-int ceph_calc_ceph_pg(struct ceph_pg *pg, const char *oid,
- struct ceph_osdmap *osdmap, uint64_t pool)
+int ceph_oloc_oid_to_pg(struct ceph_osdmap *osdmap,
+ struct ceph_object_locator *oloc,
+ struct ceph_object_id *oid,
+ struct ceph_pg *pg_out)
{
- struct ceph_pg_pool_info *pool_info;
+ struct ceph_pg_pool_info *pi;
- BUG_ON(!osdmap);
- pool_info = __lookup_pg_pool(&osdmap->pg_pools, pool);
- if (!pool_info)
+ pi = __lookup_pg_pool(&osdmap->pg_pools, oloc->pool);
+ if (!pi)
return -EIO;
- pg->pool = pool;
- pg->seed = ceph_str_hash(pool_info->object_hash, oid, strlen(oid));
- dout("%s '%s' pgid %lld.%x\n", __func__, oid, pg->pool, pg->seed);
+ pg_out->pool = oloc->pool;
+ pg_out->seed = ceph_str_hash(pi->object_hash, oid->name,
+ oid->name_len);
+
+ dout("%s '%.*s' pgid %llu.%x\n", __func__, oid->name_len, oid->name,
+ pg_out->pool, pg_out->seed);
return 0;
}
-EXPORT_SYMBOL(ceph_calc_ceph_pg);
+EXPORT_SYMBOL(ceph_oloc_oid_to_pg);
+
+static int crush_do_rule_ary(const struct crush_map *map, int ruleno, int x,
+ int *result, int result_max,
+ const __u32 *weight, int weight_max)
+{
+ int scratch[result_max * 3];
+
+ return crush_do_rule(map, ruleno, x, result, result_max,
+ weight, weight_max, scratch);
+}
/*
* Calculate raw osd vector for the given pgid. Return pointer to osd
@@ -1163,9 +1205,9 @@ static int *calc_pg_raw(struct ceph_osdmap *osdmap, struct ceph_pg pgid,
pool->pgp_num_mask) +
(unsigned)pgid.pool;
}
- r = crush_do_rule(osdmap->crush, ruleno, pps, osds,
- min_t(int, pool->size, *num),
- osdmap->osd_weight);
+ r = crush_do_rule_ary(osdmap->crush, ruleno, pps,
+ osds, min_t(int, pool->size, *num),
+ osdmap->osd_weight, osdmap->max_osd);
if (r < 0) {
pr_err("error %d from crush rule: pool %lld ruleset %d type %d"
" size %d\n", r, pgid.pool, pool->crush_ruleset,
diff --git a/net/core/netpoll.c b/net/core/netpoll.c
index c03f3dec4763..a664f7829a6d 100644
--- a/net/core/netpoll.c
+++ b/net/core/netpoll.c
@@ -948,6 +948,7 @@ int netpoll_parse_options(struct netpoll *np, char *opt)
{
char *cur=opt, *delim;
int ipv6;
+ bool ipversion_set = false;
if (*cur != '@') {
if ((delim = strchr(cur, '@')) == NULL)
@@ -960,6 +961,7 @@ int netpoll_parse_options(struct netpoll *np, char *opt)
cur++;
if (*cur != '/') {
+ ipversion_set = true;
if ((delim = strchr(cur, '/')) == NULL)
goto parse_failed;
*delim = 0;
@@ -1002,7 +1004,7 @@ int netpoll_parse_options(struct netpoll *np, char *opt)
ipv6 = netpoll_parse_ip_addr(cur, &np->remote_ip);
if (ipv6 < 0)
goto parse_failed;
- else if (np->ipv6 != (bool)ipv6)
+ else if (ipversion_set && np->ipv6 != (bool)ipv6)
goto parse_failed;
else
np->ipv6 = (bool)ipv6;
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 393b1bc9a618..048dc8d183aa 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -374,7 +374,7 @@ static size_t rtnl_link_get_slave_info_data_size(const struct net_device *dev)
if (!master_dev)
return 0;
ops = master_dev->rtnl_link_ops;
- if (!ops->get_slave_size)
+ if (!ops || !ops->get_slave_size)
return 0;
/* IFLA_INFO_SLAVE_DATA + nested data */
return nla_total_size(sizeof(struct nlattr)) +
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 9ae6d11374d1..5976ef0846bd 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -2121,7 +2121,7 @@ EXPORT_SYMBOL_GPL(skb_zerocopy_headlen);
/**
* skb_zerocopy - Zero copy skb to skb
* @to: destination buffer
- * @source: source buffer
+ * @from: source buffer
* @len: number of bytes to copy from source buffer
* @hlen: size of linear headroom in destination buffer
*
diff --git a/net/core/sock.c b/net/core/sock.c
index 0c127dcdf6a8..5b6a9431b017 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -1775,7 +1775,9 @@ struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
while (order) {
if (npages >= 1 << order) {
page = alloc_pages(sk->sk_allocation |
- __GFP_COMP | __GFP_NOWARN,
+ __GFP_COMP |
+ __GFP_NOWARN |
+ __GFP_NORETRY,
order);
if (page)
goto fill_page;
@@ -1845,7 +1847,7 @@ bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t prio)
gfp_t gfp = prio;
if (order)
- gfp |= __GFP_COMP | __GFP_NOWARN;
+ gfp |= __GFP_COMP | __GFP_NOWARN | __GFP_NORETRY;
pfrag->page = alloc_pages(gfp, order);
if (likely(pfrag->page)) {
pfrag->offset = 0;
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index ac2dff3c2c1c..bdbf68bb2e2d 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -1443,7 +1443,8 @@ static size_t inet_nlmsg_size(void)
+ nla_total_size(4) /* IFA_LOCAL */
+ nla_total_size(4) /* IFA_BROADCAST */
+ nla_total_size(IFNAMSIZ) /* IFA_LABEL */
- + nla_total_size(4); /* IFA_FLAGS */
+ + nla_total_size(4) /* IFA_FLAGS */
+ + nla_total_size(sizeof(struct ifa_cacheinfo)); /* IFA_CACHEINFO */
}
static inline u32 cstamp_delta(unsigned long cstamp)
diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c
index bd28f386bd02..50228be5c17b 100644
--- a/net/ipv4/ip_tunnel.c
+++ b/net/ipv4/ip_tunnel.c
@@ -101,28 +101,22 @@ static void tunnel_dst_reset_all(struct ip_tunnel *t)
__tunnel_dst_set(per_cpu_ptr(t->dst_cache, i), NULL);
}
-static struct dst_entry *tunnel_dst_get(struct ip_tunnel *t)
+static struct rtable *tunnel_rtable_get(struct ip_tunnel *t, u32 cookie)
{
struct dst_entry *dst;
rcu_read_lock();
dst = rcu_dereference(this_cpu_ptr(t->dst_cache)->dst);
- if (dst)
+ if (dst) {
+ if (dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
+ rcu_read_unlock();
+ tunnel_dst_reset(t);
+ return NULL;
+ }
dst_hold(dst);
- rcu_read_unlock();
- return dst;
-}
-
-static struct dst_entry *tunnel_dst_check(struct ip_tunnel *t, u32 cookie)
-{
- struct dst_entry *dst = tunnel_dst_get(t);
-
- if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
- tunnel_dst_reset(t);
- return NULL;
}
-
- return dst;
+ rcu_read_unlock();
+ return (struct rtable *)dst;
}
/* Often modified stats are per cpu, other are shared (netdev->stats) */
@@ -584,7 +578,7 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
struct flowi4 fl4;
u8 tos, ttl;
__be16 df;
- struct rtable *rt = NULL; /* Route to the other host */
+ struct rtable *rt; /* Route to the other host */
unsigned int max_headroom; /* The extra header space needed */
__be32 dst;
int err;
@@ -657,8 +651,7 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
init_tunnel_flow(&fl4, protocol, dst, tnl_params->saddr,
tunnel->parms.o_key, RT_TOS(tos), tunnel->parms.link);
- if (connected)
- rt = (struct rtable *)tunnel_dst_check(tunnel, 0);
+ rt = connected ? tunnel_rtable_get(tunnel, 0) : NULL;
if (!rt) {
rt = ip_route_output_key(tunnel->net, &fl4);
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 65cf90e063d5..227cba79fa6b 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -671,6 +671,7 @@ static void tcp_rtt_estimator(struct sock *sk, const __u32 mrtt)
{
struct tcp_sock *tp = tcp_sk(sk);
long m = mrtt; /* RTT */
+ u32 srtt = tp->srtt;
/* The following amusing code comes from Jacobson's
* article in SIGCOMM '88. Note that rtt and mdev
@@ -688,11 +689,9 @@ static void tcp_rtt_estimator(struct sock *sk, const __u32 mrtt)
* does not matter how to _calculate_ it. Seems, it was trap
* that VJ failed to avoid. 8)
*/
- if (m == 0)
- m = 1;
- if (tp->srtt != 0) {
- m -= (tp->srtt >> 3); /* m is now error in rtt est */
- tp->srtt += m; /* rtt = 7/8 rtt + 1/8 new */
+ if (srtt != 0) {
+ m -= (srtt >> 3); /* m is now error in rtt est */
+ srtt += m; /* rtt = 7/8 rtt + 1/8 new */
if (m < 0) {
m = -m; /* m is now abs(error) */
m -= (tp->mdev >> 2); /* similar update on mdev */
@@ -723,11 +722,12 @@ static void tcp_rtt_estimator(struct sock *sk, const __u32 mrtt)
}
} else {
/* no previous measure. */
- tp->srtt = m << 3; /* take the measured time to be rtt */
+ srtt = m << 3; /* take the measured time to be rtt */
tp->mdev = m << 1; /* make sure rto = 3*rtt */
tp->mdev_max = tp->rttvar = max(tp->mdev, tcp_rto_min(sk));
tp->rtt_seq = tp->snd_nxt;
}
+ tp->srtt = max(1U, srtt);
}
/* Set the sk_pacing_rate to allow proper sizing of TSO packets.
@@ -746,8 +746,10 @@ static void tcp_update_pacing_rate(struct sock *sk)
rate *= max(tp->snd_cwnd, tp->packets_out);
- /* Correction for small srtt : minimum srtt being 8 (1 jiffy << 3),
- * be conservative and assume srtt = 1 (125 us instead of 1.25 ms)
+ /* Correction for small srtt and scheduling constraints.
+ * For small rtt, consider noise is too high, and use
+ * the minimal value (srtt = 1 -> 125 us for HZ=1000)
+ *
* We probably need usec resolution in the future.
* Note: This also takes care of possible srtt=0 case,
* when tcp_rtt_estimator() was not yet called.
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 03d26b85eab8..10435b3b9d0f 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -1977,7 +1977,7 @@ bool tcp_schedule_loss_probe(struct sock *sk)
/* Schedule a loss probe in 2*RTT for SACK capable connections
* in Open state, that are either limited by cwnd or application.
*/
- if (sysctl_tcp_early_retrans < 3 || !rtt || !tp->packets_out ||
+ if (sysctl_tcp_early_retrans < 3 || !tp->srtt || !tp->packets_out ||
!tcp_is_sack(tp) || inet_csk(sk)->icsk_ca_state != TCP_CA_Open)
return false;
diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c
index 25f5cee3a08a..88b4023ecfcf 100644
--- a/net/ipv4/udp_offload.c
+++ b/net/ipv4/udp_offload.c
@@ -17,6 +17,8 @@
static DEFINE_SPINLOCK(udp_offload_lock);
static struct udp_offload_priv __rcu *udp_offload_base __read_mostly;
+#define udp_deref_protected(X) rcu_dereference_protected(X, lockdep_is_held(&udp_offload_lock))
+
struct udp_offload_priv {
struct udp_offload *offload;
struct rcu_head rcu;
@@ -100,8 +102,7 @@ out:
int udp_add_offload(struct udp_offload *uo)
{
- struct udp_offload_priv __rcu **head = &udp_offload_base;
- struct udp_offload_priv *new_offload = kzalloc(sizeof(*new_offload), GFP_KERNEL);
+ struct udp_offload_priv *new_offload = kzalloc(sizeof(*new_offload), GFP_ATOMIC);
if (!new_offload)
return -ENOMEM;
@@ -109,8 +110,8 @@ int udp_add_offload(struct udp_offload *uo)
new_offload->offload = uo;
spin_lock(&udp_offload_lock);
- rcu_assign_pointer(new_offload->next, rcu_dereference(*head));
- rcu_assign_pointer(*head, new_offload);
+ new_offload->next = udp_offload_base;
+ rcu_assign_pointer(udp_offload_base, new_offload);
spin_unlock(&udp_offload_lock);
return 0;
@@ -130,12 +131,12 @@ void udp_del_offload(struct udp_offload *uo)
spin_lock(&udp_offload_lock);
- uo_priv = rcu_dereference(*head);
+ uo_priv = udp_deref_protected(*head);
for (; uo_priv != NULL;
- uo_priv = rcu_dereference(*head)) {
-
+ uo_priv = udp_deref_protected(*head)) {
if (uo_priv->offload == uo) {
- rcu_assign_pointer(*head, rcu_dereference(uo_priv->next));
+ rcu_assign_pointer(*head,
+ udp_deref_protected(uo_priv->next));
goto unlock;
}
head = &uo_priv->next;
diff --git a/net/llc/llc_output.c b/net/llc/llc_output.c
index 2dae8a5df23f..94425e421213 100644
--- a/net/llc/llc_output.c
+++ b/net/llc/llc_output.c
@@ -43,7 +43,7 @@ int llc_mac_hdr_init(struct sk_buff *skb,
rc = 0;
break;
default:
- WARN(1, "device type not supported: %d\n", skb->dev->type);
+ break;
}
return rc;
}
diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c
index df4692826ead..e9a48baf8551 100644
--- a/net/openvswitch/datapath.c
+++ b/net/openvswitch/datapath.c
@@ -55,6 +55,7 @@
#include "datapath.h"
#include "flow.h"
+#include "flow_table.h"
#include "flow_netlink.h"
#include "vport-internal_dev.h"
#include "vport-netdev.h"
@@ -160,7 +161,6 @@ static void destroy_dp_rcu(struct rcu_head *rcu)
{
struct datapath *dp = container_of(rcu, struct datapath, rcu);
- ovs_flow_tbl_destroy(&dp->table);
free_percpu(dp->stats_percpu);
release_net(ovs_dp_get_net(dp));
kfree(dp->ports);
@@ -466,6 +466,14 @@ static int queue_userspace_packet(struct datapath *dp, struct sk_buff *skb,
skb_zerocopy(user_skb, skb, skb->len, hlen);
+ /* Pad OVS_PACKET_ATTR_PACKET if linear copy was performed */
+ if (!(dp->user_features & OVS_DP_F_UNALIGNED)) {
+ size_t plen = NLA_ALIGN(user_skb->len) - user_skb->len;
+
+ if (plen > 0)
+ memset(skb_put(user_skb, plen), 0, plen);
+ }
+
((struct nlmsghdr *) user_skb->data)->nlmsg_len = user_skb->len;
err = genlmsg_unicast(ovs_dp_get_net(dp), user_skb, upcall_info->portid);
@@ -852,11 +860,8 @@ static int ovs_flow_cmd_new_or_set(struct sk_buff *skb, struct genl_info *info)
goto err_unlock_ovs;
/* The unmasked key has to be the same for flow updates. */
- error = -EINVAL;
- if (!ovs_flow_cmp_unmasked_key(flow, &match)) {
- OVS_NLERR("Flow modification message rejected, unmasked key does not match.\n");
+ if (!ovs_flow_cmp_unmasked_key(flow, &match))
goto err_unlock_ovs;
- }
/* Update actions. */
old_acts = ovsl_dereference(flow->sf_acts);
@@ -1079,6 +1084,7 @@ static size_t ovs_dp_cmd_msg_size(void)
msgsize += nla_total_size(IFNAMSIZ);
msgsize += nla_total_size(sizeof(struct ovs_dp_stats));
msgsize += nla_total_size(sizeof(struct ovs_dp_megaflow_stats));
+ msgsize += nla_total_size(sizeof(u32)); /* OVS_DP_ATTR_USER_FEATURES */
return msgsize;
}
@@ -1279,7 +1285,7 @@ err_destroy_ports_array:
err_destroy_percpu:
free_percpu(dp->stats_percpu);
err_destroy_table:
- ovs_flow_tbl_destroy(&dp->table);
+ ovs_flow_tbl_destroy(&dp->table, false);
err_free_dp:
release_net(ovs_dp_get_net(dp));
kfree(dp);
@@ -1306,10 +1312,13 @@ static void __dp_destroy(struct datapath *dp)
list_del_rcu(&dp->list_node);
/* OVSP_LOCAL is datapath internal port. We need to make sure that
- * all port in datapath are destroyed first before freeing datapath.
+ * all ports in datapath are destroyed first before freeing datapath.
*/
ovs_dp_detach_port(ovs_vport_ovsl(dp, OVSP_LOCAL));
+ /* RCU destroy the flow table */
+ ovs_flow_tbl_destroy(&dp->table, true);
+
call_rcu(&dp->rcu, destroy_dp_rcu);
}
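
The OVS_PACKET_ATTR_PACKET padding hunk in the datapath.c diff above pads the copied packet out to the 4-byte netlink attribute alignment. The arithmetic it relies on, NLA_ALIGN(len) - len, can be checked with a small userspace program (illustrative only, not the kernel code):

/* Print the zero padding a netlink attribute payload of each length needs. */
#include <stdio.h>
#include <linux/netlink.h>	/* NLA_ALIGNTO, NLA_ALIGN */

int main(void)
{
	unsigned int lens[] = { 60, 61, 62, 63, 64 };

	for (unsigned int i = 0; i < sizeof(lens) / sizeof(lens[0]); i++) {
		unsigned int len = lens[i];
		unsigned int plen = NLA_ALIGN(len) - len;

		printf("len=%u aligned=%u pad=%u\n", len, NLA_ALIGN(len), plen);
	}
	return 0;
}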
diff --git a/net/openvswitch/flow_table.c b/net/openvswitch/flow_table.c
index c58a0fe3c889..3c268b3d71c3 100644
--- a/net/openvswitch/flow_table.c
+++ b/net/openvswitch/flow_table.c
@@ -153,29 +153,29 @@ static void rcu_free_flow_callback(struct rcu_head *rcu)
flow_free(flow);
}
-static void flow_mask_del_ref(struct sw_flow_mask *mask, bool deferred)
-{
- if (!mask)
- return;
-
- BUG_ON(!mask->ref_count);
- mask->ref_count--;
-
- if (!mask->ref_count) {
- list_del_rcu(&mask->list);
- if (deferred)
- kfree_rcu(mask, rcu);
- else
- kfree(mask);
- }
-}
-
void ovs_flow_free(struct sw_flow *flow, bool deferred)
{
if (!flow)
return;
- flow_mask_del_ref(flow->mask, deferred);
+ if (flow->mask) {
+ struct sw_flow_mask *mask = flow->mask;
+
+ /* ovs-lock is required to protect mask-refcount and
+ * mask list.
+ */
+ ASSERT_OVSL();
+ BUG_ON(!mask->ref_count);
+ mask->ref_count--;
+
+ if (!mask->ref_count) {
+ list_del_rcu(&mask->list);
+ if (deferred)
+ kfree_rcu(mask, rcu);
+ else
+ kfree(mask);
+ }
+ }
if (deferred)
call_rcu(&flow->rcu, rcu_free_flow_callback);
@@ -188,26 +188,9 @@ static void free_buckets(struct flex_array *buckets)
flex_array_free(buckets);
}
+
static void __table_instance_destroy(struct table_instance *ti)
{
- int i;
-
- if (ti->keep_flows)
- goto skip_flows;
-
- for (i = 0; i < ti->n_buckets; i++) {
- struct sw_flow *flow;
- struct hlist_head *head = flex_array_get(ti->buckets, i);
- struct hlist_node *n;
- int ver = ti->node_ver;
-
- hlist_for_each_entry_safe(flow, n, head, hash_node[ver]) {
- hlist_del(&flow->hash_node[ver]);
- ovs_flow_free(flow, false);
- }
- }
-
-skip_flows:
free_buckets(ti->buckets);
kfree(ti);
}
@@ -258,20 +241,38 @@ static void flow_tbl_destroy_rcu_cb(struct rcu_head *rcu)
static void table_instance_destroy(struct table_instance *ti, bool deferred)
{
+ int i;
+
if (!ti)
return;
+ if (ti->keep_flows)
+ goto skip_flows;
+
+ for (i = 0; i < ti->n_buckets; i++) {
+ struct sw_flow *flow;
+ struct hlist_head *head = flex_array_get(ti->buckets, i);
+ struct hlist_node *n;
+ int ver = ti->node_ver;
+
+ hlist_for_each_entry_safe(flow, n, head, hash_node[ver]) {
+ hlist_del_rcu(&flow->hash_node[ver]);
+ ovs_flow_free(flow, deferred);
+ }
+ }
+
+skip_flows:
if (deferred)
call_rcu(&ti->rcu, flow_tbl_destroy_rcu_cb);
else
__table_instance_destroy(ti);
}
-void ovs_flow_tbl_destroy(struct flow_table *table)
+void ovs_flow_tbl_destroy(struct flow_table *table, bool deferred)
{
struct table_instance *ti = ovsl_dereference(table->ti);
- table_instance_destroy(ti, false);
+ table_instance_destroy(ti, deferred);
}
struct sw_flow *ovs_flow_tbl_dump_next(struct table_instance *ti,
@@ -504,16 +505,11 @@ static struct sw_flow_mask *mask_alloc(void)
mask = kmalloc(sizeof(*mask), GFP_KERNEL);
if (mask)
- mask->ref_count = 0;
+ mask->ref_count = 1;
return mask;
}
-static void mask_add_ref(struct sw_flow_mask *mask)
-{
- mask->ref_count++;
-}
-
static bool mask_equal(const struct sw_flow_mask *a,
const struct sw_flow_mask *b)
{
@@ -554,9 +550,11 @@ static int flow_mask_insert(struct flow_table *tbl, struct sw_flow *flow,
mask->key = new->key;
mask->range = new->range;
list_add_rcu(&mask->list, &tbl->mask_list);
+ } else {
+ BUG_ON(!mask->ref_count);
+ mask->ref_count++;
}
- mask_add_ref(mask);
flow->mask = mask;
return 0;
}
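
The flow_table.c diff folds the mask refcount drop into ovs_flow_free(): the count now starts at 1 in mask_alloc(), and the last put either frees the mask immediately or defers it past an RCU grace period. A hypothetical, stripped-down version of that drop path (names invented, caller assumed to hold the writer-side lock):

#include <linux/rculist.h>
#include <linux/slab.h>
#include <linux/bug.h>

struct demo_mask {
	struct list_head list;
	int ref_count;
	struct rcu_head rcu;
};

static void demo_mask_put(struct demo_mask *mask, bool deferred)
{
	BUG_ON(!mask->ref_count);
	if (--mask->ref_count)
		return;

	/* Last reference: unlink from the mask list, then free. */
	list_del_rcu(&mask->list);
	if (deferred)
		kfree_rcu(mask, rcu);	/* readers may still hold it */
	else
		kfree(mask);		/* no concurrent readers possible */
}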
diff --git a/net/openvswitch/flow_table.h b/net/openvswitch/flow_table.h
index 1996e34c0fd8..baaeb101924d 100644
--- a/net/openvswitch/flow_table.h
+++ b/net/openvswitch/flow_table.h
@@ -60,7 +60,7 @@ void ovs_flow_free(struct sw_flow *, bool deferred);
int ovs_flow_tbl_init(struct flow_table *);
int ovs_flow_tbl_count(struct flow_table *table);
-void ovs_flow_tbl_destroy(struct flow_table *table);
+void ovs_flow_tbl_destroy(struct flow_table *table, bool deferred);
int ovs_flow_tbl_flush(struct flow_table *flow_table);
int ovs_flow_tbl_insert(struct flow_table *table, struct sw_flow *flow,
diff --git a/net/rxrpc/ar-connection.c b/net/rxrpc/ar-connection.c
index 4106ca95ec86..7bf5b5b9e8b9 100644
--- a/net/rxrpc/ar-connection.c
+++ b/net/rxrpc/ar-connection.c
@@ -381,6 +381,8 @@ static int rxrpc_connect_exclusive(struct rxrpc_sock *rx,
rxrpc_assign_connection_id(conn);
rx->conn = conn;
+ } else {
+ spin_lock(&trans->client_lock);
}
/* we've got a connection with a free channel and we can now attach the
diff --git a/net/rxrpc/ar-recvmsg.c b/net/rxrpc/ar-recvmsg.c
index 898492a8d61b..34b5490dde65 100644
--- a/net/rxrpc/ar-recvmsg.c
+++ b/net/rxrpc/ar-recvmsg.c
@@ -180,7 +180,8 @@ int rxrpc_recvmsg(struct kiocb *iocb, struct socket *sock,
if (copy > len - copied)
copy = len - copied;
- if (skb->ip_summed == CHECKSUM_UNNECESSARY) {
+ if (skb->ip_summed == CHECKSUM_UNNECESSARY ||
+ skb->ip_summed == CHECKSUM_PARTIAL) {
ret = skb_copy_datagram_iovec(skb, offset,
msg->msg_iov, copy);
} else {
@@ -353,6 +354,10 @@ csum_copy_error:
if (continue_call)
rxrpc_put_call(continue_call);
rxrpc_kill_skb(skb);
+ if (!(flags & MSG_PEEK)) {
+ if (skb_dequeue(&rx->sk.sk_receive_queue) != skb)
+ BUG();
+ }
skb_kill_datagram(&rx->sk, skb, flags);
rxrpc_put_call(call);
return -EAGAIN;
diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c
index 0f6259a6a932..2b1738ef9394 100644
--- a/net/sctp/ipv6.c
+++ b/net/sctp/ipv6.c
@@ -662,6 +662,8 @@ static struct sock *sctp_v6_create_accept_sk(struct sock *sk,
*/
sctp_v6_to_sk_daddr(&asoc->peer.primary_addr, newsk);
+ newsk->sk_v6_rcv_saddr = sk->sk_v6_rcv_saddr;
+
sk_refcnt_debug_inc(newsk);
if (newsk->sk_prot->init(newsk)) {
diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c
index 42fdfc634e56..0a2aee060f9f 100644
--- a/net/sunrpc/auth_gss/auth_gss.c
+++ b/net/sunrpc/auth_gss/auth_gss.c
@@ -536,8 +536,7 @@ static void warn_gssd(void)
unsigned long now = jiffies;
if (time_after(now, ratelimit)) {
- printk(KERN_WARNING "RPC: AUTH_GSS upcall timed out.\n"
- "Please check user daemon is running.\n");
+ pr_warn("RPC: AUTH_GSS upcall failed. Please check user daemon is running.\n");
ratelimit = now + 15*HZ;
}
}
@@ -600,7 +599,6 @@ gss_create_upcall(struct gss_auth *gss_auth, struct gss_cred *gss_cred)
struct rpc_pipe *pipe;
struct rpc_cred *cred = &gss_cred->gc_base;
struct gss_upcall_msg *gss_msg;
- unsigned long timeout;
DEFINE_WAIT(wait);
int err;
@@ -608,17 +606,16 @@ gss_create_upcall(struct gss_auth *gss_auth, struct gss_cred *gss_cred)
__func__, from_kuid(&init_user_ns, cred->cr_uid));
retry:
err = 0;
- /* Default timeout is 15s unless we know that gssd is not running */
- timeout = 15 * HZ;
- if (!sn->gssd_running)
- timeout = HZ >> 2;
+ /* if gssd is down, just skip upcalling altogether */
+ if (!gssd_running(net)) {
+ warn_gssd();
+ return -EACCES;
+ }
gss_msg = gss_setup_upcall(gss_auth, cred);
if (PTR_ERR(gss_msg) == -EAGAIN) {
err = wait_event_interruptible_timeout(pipe_version_waitqueue,
- sn->pipe_version >= 0, timeout);
+ sn->pipe_version >= 0, 15 * HZ);
if (sn->pipe_version < 0) {
- if (err == 0)
- sn->gssd_running = 0;
warn_gssd();
err = -EACCES;
}
diff --git a/net/sunrpc/auth_gss/gss_krb5_keys.c b/net/sunrpc/auth_gss/gss_krb5_keys.c
index 76e42e6be755..24589bd2a4b6 100644
--- a/net/sunrpc/auth_gss/gss_krb5_keys.c
+++ b/net/sunrpc/auth_gss/gss_krb5_keys.c
@@ -59,6 +59,7 @@
#include <linux/crypto.h>
#include <linux/sunrpc/gss_krb5.h>
#include <linux/sunrpc/xdr.h>
+#include <linux/lcm.h>
#ifdef RPC_DEBUG
# define RPCDBG_FACILITY RPCDBG_AUTH
@@ -72,7 +73,7 @@
static void krb5_nfold(u32 inbits, const u8 *in,
u32 outbits, u8 *out)
{
- int a, b, c, lcm;
+ unsigned long ulcm;
int byte, i, msbit;
/* the code below is more readable if I make these bytes
@@ -82,17 +83,7 @@ static void krb5_nfold(u32 inbits, const u8 *in,
outbits >>= 3;
/* first compute lcm(n,k) */
-
- a = outbits;
- b = inbits;
-
- while (b != 0) {
- c = b;
- b = a%b;
- a = c;
- }
-
- lcm = outbits*inbits/a;
+ ulcm = lcm(inbits, outbits);
/* now do the real work */
@@ -101,7 +92,7 @@ static void krb5_nfold(u32 inbits, const u8 *in,
/* this will end up cycling through k lcm(k,n)/k times, which
is correct */
- for (i = lcm-1; i >= 0; i--) {
+ for (i = ulcm-1; i >= 0; i--) {
/* compute the msbit in k which gets added into this byte */
msbit = (
/* first, start with the msbit in the first,
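
The krb5_nfold() change above replaces an open-coded Euclid gcd loop with the kernel's lcm() helper. The identity it relies on, lcm(a, b) = a / gcd(a, b) * b, in a standalone userspace sketch:

/* Compute gcd and lcm the same way the replaced code effectively did. */
#include <stdio.h>

static unsigned long gcd_ul(unsigned long a, unsigned long b)
{
	while (b) {
		unsigned long t = a % b;
		a = b;
		b = t;
	}
	return a;
}

static unsigned long lcm_ul(unsigned long a, unsigned long b)
{
	if (!a || !b)
		return 0;
	return a / gcd_ul(a, b) * b;
}

int main(void)
{
	printf("lcm(21, 16) = %lu\n", lcm_ul(21, 16));	/* 336 */
	printf("lcm(16, 64) = %lu\n", lcm_ul(16, 64));	/* 64  */
	return 0;
}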
diff --git a/net/sunrpc/auth_gss/gss_rpc_upcall.c b/net/sunrpc/auth_gss/gss_rpc_upcall.c
index 458f85e9b0ba..abbb7dcd1689 100644
--- a/net/sunrpc/auth_gss/gss_rpc_upcall.c
+++ b/net/sunrpc/auth_gss/gss_rpc_upcall.c
@@ -137,7 +137,6 @@ void init_gssp_clnt(struct sunrpc_net *sn)
{
mutex_init(&sn->gssp_lock);
sn->gssp_clnt = NULL;
- init_waitqueue_head(&sn->gssp_wq);
}
int set_gssp_clnt(struct net *net)
@@ -154,7 +153,6 @@ int set_gssp_clnt(struct net *net)
sn->gssp_clnt = clnt;
}
mutex_unlock(&sn->gssp_lock);
- wake_up(&sn->gssp_wq);
return ret;
}
diff --git a/net/sunrpc/auth_gss/svcauth_gss.c b/net/sunrpc/auth_gss/svcauth_gss.c
index 008cdade5aae..0f73f4507746 100644
--- a/net/sunrpc/auth_gss/svcauth_gss.c
+++ b/net/sunrpc/auth_gss/svcauth_gss.c
@@ -1263,65 +1263,34 @@ out:
return ret;
}
-DEFINE_SPINLOCK(use_gssp_lock);
-
-static bool use_gss_proxy(struct net *net)
-{
- struct sunrpc_net *sn = net_generic(net, sunrpc_net_id);
-
- if (sn->use_gss_proxy != -1)
- return sn->use_gss_proxy;
- spin_lock(&use_gssp_lock);
- /*
- * If you wanted gss-proxy, you should have said so before
- * starting to accept requests:
- */
- sn->use_gss_proxy = 0;
- spin_unlock(&use_gssp_lock);
- return 0;
-}
-
-#ifdef CONFIG_PROC_FS
-
+/*
+ * Try to set the sn->use_gss_proxy variable to a new value. We only allow
+ * it to be changed if it's currently undefined (-1). If it's any other value
+ * then return -EBUSY unless the type wouldn't have changed anyway.
+ */
static int set_gss_proxy(struct net *net, int type)
{
struct sunrpc_net *sn = net_generic(net, sunrpc_net_id);
- int ret = 0;
+ int ret;
WARN_ON_ONCE(type != 0 && type != 1);
- spin_lock(&use_gssp_lock);
- if (sn->use_gss_proxy == -1 || sn->use_gss_proxy == type)
- sn->use_gss_proxy = type;
- else
- ret = -EBUSY;
- spin_unlock(&use_gssp_lock);
- wake_up(&sn->gssp_wq);
- return ret;
-}
-
-static inline bool gssp_ready(struct sunrpc_net *sn)
-{
- switch (sn->use_gss_proxy) {
- case -1:
- return false;
- case 0:
- return true;
- case 1:
- return sn->gssp_clnt;
- }
- WARN_ON_ONCE(1);
- return false;
+ ret = cmpxchg(&sn->use_gss_proxy, -1, type);
+ if (ret != -1 && ret != type)
+ return -EBUSY;
+ return 0;
}
-static int wait_for_gss_proxy(struct net *net, struct file *file)
+static bool use_gss_proxy(struct net *net)
{
struct sunrpc_net *sn = net_generic(net, sunrpc_net_id);
- if (file->f_flags & O_NONBLOCK && !gssp_ready(sn))
- return -EAGAIN;
- return wait_event_interruptible(sn->gssp_wq, gssp_ready(sn));
+ /* If use_gss_proxy is still undefined, then try to disable it */
+ if (sn->use_gss_proxy == -1)
+ set_gss_proxy(net, 0);
+ return sn->use_gss_proxy;
}
+#ifdef CONFIG_PROC_FS
static ssize_t write_gssp(struct file *file, const char __user *buf,
size_t count, loff_t *ppos)
@@ -1342,10 +1311,10 @@ static ssize_t write_gssp(struct file *file, const char __user *buf,
return res;
if (i != 1)
return -EINVAL;
- res = set_gss_proxy(net, 1);
+ res = set_gssp_clnt(net);
if (res)
return res;
- res = set_gssp_clnt(net);
+ res = set_gss_proxy(net, 1);
if (res)
return res;
return count;
@@ -1355,16 +1324,12 @@ static ssize_t read_gssp(struct file *file, char __user *buf,
size_t count, loff_t *ppos)
{
struct net *net = PDE_DATA(file_inode(file));
+ struct sunrpc_net *sn = net_generic(net, sunrpc_net_id);
unsigned long p = *ppos;
char tbuf[10];
size_t len;
- int ret;
- ret = wait_for_gss_proxy(net, file);
- if (ret)
- return ret;
-
- snprintf(tbuf, sizeof(tbuf), "%d\n", use_gss_proxy(net));
+ snprintf(tbuf, sizeof(tbuf), "%d\n", sn->use_gss_proxy);
len = strlen(tbuf);
if (p >= len)
return 0;
@@ -1626,8 +1591,7 @@ svcauth_gss_wrap_resp_integ(struct svc_rqst *rqstp)
BUG_ON(integ_len % 4);
*p++ = htonl(integ_len);
*p++ = htonl(gc->gc_seq);
- if (xdr_buf_subsegment(resbuf, &integ_buf, integ_offset,
- integ_len))
+ if (xdr_buf_subsegment(resbuf, &integ_buf, integ_offset, integ_len))
BUG();
if (resbuf->tail[0].iov_base == NULL) {
if (resbuf->head[0].iov_len + RPC_MAX_AUTH_SIZE > PAGE_SIZE)
@@ -1635,10 +1599,8 @@ svcauth_gss_wrap_resp_integ(struct svc_rqst *rqstp)
resbuf->tail[0].iov_base = resbuf->head[0].iov_base
+ resbuf->head[0].iov_len;
resbuf->tail[0].iov_len = 0;
- resv = &resbuf->tail[0];
- } else {
- resv = &resbuf->tail[0];
}
+ resv = &resbuf->tail[0];
mic.data = (u8 *)resv->iov_base + resv->iov_len + 4;
if (gss_get_mic(gsd->rsci->mechctx, &integ_buf, &mic))
goto out_err;
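
set_gss_proxy() above now latches sn->use_gss_proxy with cmpxchg(): the first caller to move it off -1 wins, and later callers only succeed if they ask for the value already latched. A rough userspace analogue of that latch using C11 atomics (demo names, not the sunrpc code):

#include <stdatomic.h>
#include <stdio.h>
#include <errno.h>

static _Atomic int use_gss_proxy = -1;	/* -1 means "not yet decided" */

static int set_gss_proxy_demo(int type)
{
	int expected = -1;

	/* Analogue of: old = cmpxchg(&sn->use_gss_proxy, -1, type) */
	if (atomic_compare_exchange_strong(&use_gss_proxy, &expected, type))
		return 0;		/* we latched it */
	if (expected == type)
		return 0;		/* already latched to the same value */
	return -EBUSY;			/* latched to the other value */
}

int main(void)
{
	printf("first  set(1) -> %d\n", set_gss_proxy_demo(1));
	printf("second set(1) -> %d\n", set_gss_proxy_demo(1));
	printf("then   set(0) -> %d\n", set_gss_proxy_demo(0));
	return 0;
}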
diff --git a/net/sunrpc/cache.c b/net/sunrpc/cache.c
index e521d20e1970..ae333c1845bb 100644
--- a/net/sunrpc/cache.c
+++ b/net/sunrpc/cache.c
@@ -1111,9 +1111,7 @@ void qword_addhex(char **bpp, int *lp, char *buf, int blen)
*bp++ = 'x';
len -= 2;
while (blen && len >= 2) {
- unsigned char c = *buf++;
- *bp++ = '0' + ((c&0xf0)>>4) + (c>=0xa0)*('a'-'9'-1);
- *bp++ = '0' + (c&0x0f) + ((c&0x0f)>=0x0a)*('a'-'9'-1);
+ bp = hex_byte_pack(bp, *buf++);
len -= 2;
blen--;
}
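
qword_addhex() now uses hex_byte_pack() instead of open-coded ASCII arithmetic. A standalone equivalent of that helper, which emits the high nibble first as lowercase hex and does not NUL-terminate, might look like this (userspace sketch):

#include <stdio.h>

static const char hex_asc[] = "0123456789abcdef";

static char *hex_byte_pack_demo(char *buf, unsigned char byte)
{
	*buf++ = hex_asc[byte >> 4];	/* high nibble first */
	*buf++ = hex_asc[byte & 0x0f];	/* then low nibble */
	return buf;
}

int main(void)
{
	char out[7] = { 0 };
	char *p = out;
	unsigned char bytes[] = { 0x0a, 0xff, 0x30 };

	for (unsigned int i = 0; i < sizeof(bytes); i++)
		p = hex_byte_pack_demo(p, bytes[i]);
	printf("%s\n", out);	/* prints "0aff30" */
	return 0;
}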
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index f09b7db2c492..0edada973434 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -1529,9 +1529,13 @@ call_refreshresult(struct rpc_task *task)
task->tk_action = call_refresh;
switch (status) {
case 0:
- if (rpcauth_uptodatecred(task))
+ if (rpcauth_uptodatecred(task)) {
task->tk_action = call_allocate;
- return;
+ return;
+ }
+ /* Use rate-limiting and a max number of retries if refresh
+ * had status 0 but failed to update the cred.
+ */
case -ETIMEDOUT:
rpc_delay(task, 3*HZ);
case -EAGAIN:
@@ -1729,6 +1733,7 @@ call_bind_status(struct rpc_task *task)
return;
case -ECONNREFUSED: /* connection problems */
case -ECONNRESET:
+ case -ECONNABORTED:
case -ENOTCONN:
case -EHOSTDOWN:
case -EHOSTUNREACH:
@@ -1799,7 +1804,9 @@ call_connect_status(struct rpc_task *task)
return;
case -ECONNREFUSED:
case -ECONNRESET:
+ case -ECONNABORTED:
case -ENETUNREACH:
+ case -EHOSTUNREACH:
/* retry with existing socket, after a delay */
rpc_delay(task, 3*HZ);
if (RPC_IS_SOFTCONN(task))
@@ -1902,6 +1909,7 @@ call_transmit_status(struct rpc_task *task)
break;
}
case -ECONNRESET:
+ case -ECONNABORTED:
case -ENOTCONN:
case -EPIPE:
rpc_task_force_reencode(task);
@@ -2011,8 +2019,9 @@ call_status(struct rpc_task *task)
xprt_conditional_disconnect(req->rq_xprt,
req->rq_connect_cookie);
break;
- case -ECONNRESET:
case -ECONNREFUSED:
+ case -ECONNRESET:
+ case -ECONNABORTED:
rpc_force_rebind(clnt);
rpc_delay(task, 3*HZ);
case -EPIPE:
diff --git a/net/sunrpc/netns.h b/net/sunrpc/netns.h
index 779742cfc1ff..df5826876535 100644
--- a/net/sunrpc/netns.h
+++ b/net/sunrpc/netns.h
@@ -14,6 +14,7 @@ struct sunrpc_net {
struct cache_detail *rsi_cache;
struct super_block *pipefs_sb;
+ struct rpc_pipe *gssd_dummy;
struct mutex pipefs_sb_lock;
struct list_head all_clients;
@@ -26,14 +27,11 @@ struct sunrpc_net {
unsigned int rpcb_is_af_local : 1;
struct mutex gssp_lock;
- wait_queue_head_t gssp_wq;
struct rpc_clnt *gssp_clnt;
int use_gss_proxy;
int pipe_version;
atomic_t pipe_users;
struct proc_dir_entry *use_gssp_proc;
-
- unsigned int gssd_running;
};
extern int sunrpc_net_id;
diff --git a/net/sunrpc/rpc_pipe.c b/net/sunrpc/rpc_pipe.c
index bf04b30a788a..b18554898562 100644
--- a/net/sunrpc/rpc_pipe.c
+++ b/net/sunrpc/rpc_pipe.c
@@ -17,6 +17,7 @@
#include <linux/fsnotify.h>
#include <linux/kernel.h>
#include <linux/rcupdate.h>
+#include <linux/utsname.h>
#include <asm/ioctls.h>
#include <linux/poll.h>
@@ -38,7 +39,7 @@
#define NET_NAME(net) ((net == &init_net) ? " (init_net)" : "")
static struct file_system_type rpc_pipe_fs_type;
-
+static const struct rpc_pipe_ops gssd_dummy_pipe_ops;
static struct kmem_cache *rpc_inode_cachep __read_mostly;
@@ -216,14 +217,11 @@ rpc_destroy_inode(struct inode *inode)
static int
rpc_pipe_open(struct inode *inode, struct file *filp)
{
- struct net *net = inode->i_sb->s_fs_info;
- struct sunrpc_net *sn = net_generic(net, sunrpc_net_id);
struct rpc_pipe *pipe;
int first_open;
int res = -ENXIO;
mutex_lock(&inode->i_mutex);
- sn->gssd_running = 1;
pipe = RPC_I(inode)->pipe;
if (pipe == NULL)
goto out;
@@ -1159,6 +1157,7 @@ enum {
RPCAUTH_nfsd4_cb,
RPCAUTH_cache,
RPCAUTH_nfsd,
+ RPCAUTH_gssd,
RPCAUTH_RootEOF
};
@@ -1195,6 +1194,10 @@ static const struct rpc_filelist files[] = {
.name = "nfsd",
.mode = S_IFDIR | S_IRUGO | S_IXUGO,
},
+ [RPCAUTH_gssd] = {
+ .name = "gssd",
+ .mode = S_IFDIR | S_IRUGO | S_IXUGO,
+ },
};
/*
@@ -1208,13 +1211,24 @@ struct dentry *rpc_d_lookup_sb(const struct super_block *sb,
}
EXPORT_SYMBOL_GPL(rpc_d_lookup_sb);
-void rpc_pipefs_init_net(struct net *net)
+int rpc_pipefs_init_net(struct net *net)
{
struct sunrpc_net *sn = net_generic(net, sunrpc_net_id);
+ sn->gssd_dummy = rpc_mkpipe_data(&gssd_dummy_pipe_ops, 0);
+ if (IS_ERR(sn->gssd_dummy))
+ return PTR_ERR(sn->gssd_dummy);
+
mutex_init(&sn->pipefs_sb_lock);
- sn->gssd_running = 1;
sn->pipe_version = -1;
+ return 0;
+}
+
+void rpc_pipefs_exit_net(struct net *net)
+{
+ struct sunrpc_net *sn = net_generic(net, sunrpc_net_id);
+
+ rpc_destroy_pipe_data(sn->gssd_dummy);
}
/*
@@ -1244,11 +1258,134 @@ void rpc_put_sb_net(const struct net *net)
}
EXPORT_SYMBOL_GPL(rpc_put_sb_net);
+static const struct rpc_filelist gssd_dummy_clnt_dir[] = {
+ [0] = {
+ .name = "clntXX",
+ .mode = S_IFDIR | S_IRUGO | S_IXUGO,
+ },
+};
+
+static ssize_t
+dummy_downcall(struct file *filp, const char __user *src, size_t len)
+{
+ return -EINVAL;
+}
+
+static const struct rpc_pipe_ops gssd_dummy_pipe_ops = {
+ .upcall = rpc_pipe_generic_upcall,
+ .downcall = dummy_downcall,
+};
+
+/*
+ * Here we present a bogus "info" file to keep rpc.gssd happy. We don't expect
+ * that it will ever use this info to handle an upcall, but rpc.gssd expects
+ * that this file will be there and have a certain format.
+ */
+static int
+rpc_show_dummy_info(struct seq_file *m, void *v)
+{
+ seq_printf(m, "RPC server: %s\n", utsname()->nodename);
+ seq_printf(m, "service: foo (1) version 0\n");
+ seq_printf(m, "address: 127.0.0.1\n");
+ seq_printf(m, "protocol: tcp\n");
+ seq_printf(m, "port: 0\n");
+ return 0;
+}
+
+static int
+rpc_dummy_info_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, rpc_show_dummy_info, NULL);
+}
+
+static const struct file_operations rpc_dummy_info_operations = {
+ .owner = THIS_MODULE,
+ .open = rpc_dummy_info_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+static const struct rpc_filelist gssd_dummy_info_file[] = {
+ [0] = {
+ .name = "info",
+ .i_fop = &rpc_dummy_info_operations,
+ .mode = S_IFREG | S_IRUSR,
+ },
+};
+
+/**
+ * rpc_gssd_dummy_populate - create a dummy gssd pipe
+ * @root: root of the rpc_pipefs filesystem
+ * @pipe_data: pipe data created when netns is initialized
+ *
+ * Create a dummy set of directories and a pipe that gssd can hold open to
+ * indicate that it is up and running.
+ */
+static struct dentry *
+rpc_gssd_dummy_populate(struct dentry *root, struct rpc_pipe *pipe_data)
+{
+ int ret = 0;
+ struct dentry *gssd_dentry;
+ struct dentry *clnt_dentry = NULL;
+ struct dentry *pipe_dentry = NULL;
+ struct qstr q = QSTR_INIT(files[RPCAUTH_gssd].name,
+ strlen(files[RPCAUTH_gssd].name));
+
+ /* We should never get this far if "gssd" doesn't exist */
+ gssd_dentry = d_hash_and_lookup(root, &q);
+ if (!gssd_dentry)
+ return ERR_PTR(-ENOENT);
+
+ ret = rpc_populate(gssd_dentry, gssd_dummy_clnt_dir, 0, 1, NULL);
+ if (ret) {
+ pipe_dentry = ERR_PTR(ret);
+ goto out;
+ }
+
+ q.name = gssd_dummy_clnt_dir[0].name;
+ q.len = strlen(gssd_dummy_clnt_dir[0].name);
+ clnt_dentry = d_hash_and_lookup(gssd_dentry, &q);
+ if (!clnt_dentry) {
+ pipe_dentry = ERR_PTR(-ENOENT);
+ goto out;
+ }
+
+ ret = rpc_populate(clnt_dentry, gssd_dummy_info_file, 0, 1, NULL);
+ if (ret) {
+ __rpc_depopulate(gssd_dentry, gssd_dummy_clnt_dir, 0, 1);
+ pipe_dentry = ERR_PTR(ret);
+ goto out;
+ }
+
+ pipe_dentry = rpc_mkpipe_dentry(clnt_dentry, "gssd", NULL, pipe_data);
+ if (IS_ERR(pipe_dentry)) {
+ __rpc_depopulate(clnt_dentry, gssd_dummy_info_file, 0, 1);
+ __rpc_depopulate(gssd_dentry, gssd_dummy_clnt_dir, 0, 1);
+ }
+out:
+ dput(clnt_dentry);
+ dput(gssd_dentry);
+ return pipe_dentry;
+}
+
+static void
+rpc_gssd_dummy_depopulate(struct dentry *pipe_dentry)
+{
+ struct dentry *clnt_dir = pipe_dentry->d_parent;
+ struct dentry *gssd_dir = clnt_dir->d_parent;
+
+ __rpc_rmpipe(clnt_dir->d_inode, pipe_dentry);
+ __rpc_depopulate(clnt_dir, gssd_dummy_info_file, 0, 1);
+ __rpc_depopulate(gssd_dir, gssd_dummy_clnt_dir, 0, 1);
+ dput(pipe_dentry);
+}
+
static int
rpc_fill_super(struct super_block *sb, void *data, int silent)
{
struct inode *inode;
- struct dentry *root;
+ struct dentry *root, *gssd_dentry;
struct net *net = data;
struct sunrpc_net *sn = net_generic(net, sunrpc_net_id);
int err;
@@ -1266,6 +1403,13 @@ rpc_fill_super(struct super_block *sb, void *data, int silent)
return -ENOMEM;
if (rpc_populate(root, files, RPCAUTH_lockd, RPCAUTH_RootEOF, NULL))
return -ENOMEM;
+
+ gssd_dentry = rpc_gssd_dummy_populate(root, sn->gssd_dummy);
+ if (IS_ERR(gssd_dentry)) {
+ __rpc_depopulate(root, files, RPCAUTH_lockd, RPCAUTH_RootEOF);
+ return PTR_ERR(gssd_dentry);
+ }
+
dprintk("RPC: sending pipefs MOUNT notification for net %p%s\n",
net, NET_NAME(net));
mutex_lock(&sn->pipefs_sb_lock);
@@ -1280,6 +1424,7 @@ rpc_fill_super(struct super_block *sb, void *data, int silent)
return 0;
err_depopulate:
+ rpc_gssd_dummy_depopulate(gssd_dentry);
blocking_notifier_call_chain(&rpc_pipefs_notifier_list,
RPC_PIPEFS_UMOUNT,
sb);
@@ -1289,6 +1434,16 @@ err_depopulate:
return err;
}
+bool
+gssd_running(struct net *net)
+{
+ struct sunrpc_net *sn = net_generic(net, sunrpc_net_id);
+ struct rpc_pipe *pipe = sn->gssd_dummy;
+
+ return pipe->nreaders || pipe->nwriters;
+}
+EXPORT_SYMBOL_GPL(gssd_running);
+
static struct dentry *
rpc_mount(struct file_system_type *fs_type,
int flags, const char *dev_name, void *data)
diff --git a/net/sunrpc/sunrpc_syms.c b/net/sunrpc/sunrpc_syms.c
index 3d6498af9adc..cd30120de9e4 100644
--- a/net/sunrpc/sunrpc_syms.c
+++ b/net/sunrpc/sunrpc_syms.c
@@ -44,12 +44,17 @@ static __net_init int sunrpc_init_net(struct net *net)
if (err)
goto err_unixgid;
- rpc_pipefs_init_net(net);
+ err = rpc_pipefs_init_net(net);
+ if (err)
+ goto err_pipefs;
+
INIT_LIST_HEAD(&sn->all_clients);
spin_lock_init(&sn->rpc_client_lock);
spin_lock_init(&sn->rpcb_clnt_lock);
return 0;
+err_pipefs:
+ unix_gid_cache_destroy(net);
err_unixgid:
ip_map_cache_destroy(net);
err_ipmap:
@@ -60,6 +65,7 @@ err_proc:
static __net_exit void sunrpc_exit_net(struct net *net)
{
+ rpc_pipefs_exit_net(net);
unix_gid_cache_destroy(net);
ip_map_cache_destroy(net);
rpc_proc_exit(net);
diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c
index e7fbe368b4a3..5de6801cd924 100644
--- a/net/sunrpc/svc.c
+++ b/net/sunrpc/svc.c
@@ -916,9 +916,6 @@ static int __svc_register(struct net *net, const char *progname,
#endif
}
- if (error < 0)
- printk(KERN_WARNING "svc: failed to register %sv%u RPC "
- "service (errno %d).\n", progname, version, -error);
return error;
}
@@ -937,6 +934,7 @@ int svc_register(const struct svc_serv *serv, struct net *net,
const unsigned short port)
{
struct svc_program *progp;
+ struct svc_version *vers;
unsigned int i;
int error = 0;
@@ -946,7 +944,8 @@ int svc_register(const struct svc_serv *serv, struct net *net,
for (progp = serv->sv_program; progp; progp = progp->pg_next) {
for (i = 0; i < progp->pg_nvers; i++) {
- if (progp->pg_vers[i] == NULL)
+ vers = progp->pg_vers[i];
+ if (vers == NULL)
continue;
dprintk("svc: svc_register(%sv%d, %s, %u, %u)%s\n",
@@ -955,16 +954,26 @@ int svc_register(const struct svc_serv *serv, struct net *net,
proto == IPPROTO_UDP? "udp" : "tcp",
port,
family,
- progp->pg_vers[i]->vs_hidden?
- " (but not telling portmap)" : "");
+ vers->vs_hidden ?
+ " (but not telling portmap)" : "");
- if (progp->pg_vers[i]->vs_hidden)
+ if (vers->vs_hidden)
continue;
error = __svc_register(net, progp->pg_name, progp->pg_prog,
i, family, proto, port);
- if (error < 0)
+
+ if (vers->vs_rpcb_optnl) {
+ error = 0;
+ continue;
+ }
+
+ if (error < 0) {
+ printk(KERN_WARNING "svc: failed to register "
+ "%sv%u RPC service (errno %d).\n",
+ progp->pg_name, i, -error);
break;
+ }
}
}
diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c
index 1750048130a7..7d4df99f761f 100644
--- a/net/sunrpc/xprt.c
+++ b/net/sunrpc/xprt.c
@@ -749,6 +749,11 @@ static void xprt_connect_status(struct rpc_task *task)
}
switch (task->tk_status) {
+ case -ECONNREFUSED:
+ case -ECONNRESET:
+ case -ECONNABORTED:
+ case -ENETUNREACH:
+ case -EHOSTUNREACH:
case -EAGAIN:
dprintk("RPC: %5u xprt_connect_status: retrying\n", task->tk_pid);
break;
diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index 75b045e1cd50..817a1e523969 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -257,6 +257,7 @@ struct sock_xprt {
void (*old_data_ready)(struct sock *, int);
void (*old_state_change)(struct sock *);
void (*old_write_space)(struct sock *);
+ void (*old_error_report)(struct sock *);
};
/*
@@ -274,6 +275,11 @@ struct sock_xprt {
*/
#define TCP_RPC_REPLY (1UL << 6)
+static inline struct rpc_xprt *xprt_from_sock(struct sock *sk)
+{
+ return (struct rpc_xprt *) sk->sk_user_data;
+}
+
static inline struct sockaddr *xs_addr(struct rpc_xprt *xprt)
{
return (struct sockaddr *) &xprt->addr;
@@ -799,6 +805,7 @@ static void xs_save_old_callbacks(struct sock_xprt *transport, struct sock *sk)
transport->old_data_ready = sk->sk_data_ready;
transport->old_state_change = sk->sk_state_change;
transport->old_write_space = sk->sk_write_space;
+ transport->old_error_report = sk->sk_error_report;
}
static void xs_restore_old_callbacks(struct sock_xprt *transport, struct sock *sk)
@@ -806,6 +813,34 @@ static void xs_restore_old_callbacks(struct sock_xprt *transport, struct sock *s
sk->sk_data_ready = transport->old_data_ready;
sk->sk_state_change = transport->old_state_change;
sk->sk_write_space = transport->old_write_space;
+ sk->sk_error_report = transport->old_error_report;
+}
+
+/**
+ * xs_error_report - callback to handle TCP socket state errors
+ * @sk: socket
+ *
+ * Note: we don't call sock_error() since there may be a rpc_task
+ * using the socket, and so we don't want to clear sk->sk_err.
+ */
+static void xs_error_report(struct sock *sk)
+{
+ struct rpc_xprt *xprt;
+ int err;
+
+ read_lock_bh(&sk->sk_callback_lock);
+ if (!(xprt = xprt_from_sock(sk)))
+ goto out;
+
+ err = -sk->sk_err;
+ if (err == 0)
+ goto out;
+ dprintk("RPC: xs_error_report client %p, error=%d...\n",
+ xprt, -err);
+ trace_rpc_socket_error(xprt, sk->sk_socket, err);
+ xprt_wake_pending_tasks(xprt, err);
+ out:
+ read_unlock_bh(&sk->sk_callback_lock);
}
static void xs_reset_transport(struct sock_xprt *transport)
@@ -885,11 +920,6 @@ static void xs_destroy(struct rpc_xprt *xprt)
module_put(THIS_MODULE);
}
-static inline struct rpc_xprt *xprt_from_sock(struct sock *sk)
-{
- return (struct rpc_xprt *) sk->sk_user_data;
-}
-
static int xs_local_copy_to_xdr(struct xdr_buf *xdr, struct sk_buff *skb)
{
struct xdr_skb_reader desc = {
@@ -1869,6 +1899,7 @@ static int xs_local_finish_connecting(struct rpc_xprt *xprt,
sk->sk_user_data = xprt;
sk->sk_data_ready = xs_local_data_ready;
sk->sk_write_space = xs_udp_write_space;
+ sk->sk_error_report = xs_error_report;
sk->sk_allocation = GFP_ATOMIC;
xprt_clear_connected(xprt);
@@ -2146,6 +2177,7 @@ static int xs_tcp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock)
sk->sk_data_ready = xs_tcp_data_ready;
sk->sk_state_change = xs_tcp_state_change;
sk->sk_write_space = xs_tcp_write_space;
+ sk->sk_error_report = xs_error_report;
sk->sk_allocation = GFP_ATOMIC;
/* socket options */
@@ -2932,10 +2964,9 @@ static struct rpc_xprt *xs_setup_bc_tcp(struct xprt_create *args)
/*
* Once we've associated a backchannel xprt with a connection,
- * we want to keep it around as long as long as the connection
- * lasts, in case we need to start using it for a backchannel
- * again; this reference won't be dropped until bc_xprt is
- * destroyed.
+ * we want to keep it around as long as the connection lasts,
+ * in case we need to start using it for a backchannel again;
+ * this reference won't be dropped until bc_xprt is destroyed.
*/
xprt_get(xprt);
args->bc_xprt->xpt_bc_xprt = xprt;