From cd3bafc73d11eb51cb2d3691629718431e1768ce Mon Sep 17 00:00:00 2001 From: Hajime Tazaki Date: Wed, 4 Feb 2015 23:31:10 +0900 Subject: xfrm6: Fix a offset value for network header in _decode_session6 When a network-layer header has multiple IPv6 extension headers, then offset for mobility header goes wrong. This regression breaks an xfrm policy lookup for a particular receive packet. Binding update packets of Mobile IPv6 are all discarded without this fix. Fixes: de3b7a06dfe1 ("xfrm6: Fix transport header offset in _decode_session6.") Signed-off-by: Hajime Tazaki Signed-off-by: Steffen Klassert --- net/ipv6/xfrm6_policy.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/ipv6/xfrm6_policy.c b/net/ipv6/xfrm6_policy.c index 48bf5a06847b..8d2d01b4800a 100644 --- a/net/ipv6/xfrm6_policy.c +++ b/net/ipv6/xfrm6_policy.c @@ -200,6 +200,7 @@ _decode_session6(struct sk_buff *skb, struct flowi *fl, int reverse) #if IS_ENABLED(CONFIG_IPV6_MIP6) case IPPROTO_MH: + offset += ipv6_optlen(exthdr); if (!onlyproto && pskb_may_pull(skb, nh + offset + 3 - skb->data)) { struct ip6_mh *mh; -- cgit v1.2.3 From 044a832a7779c0638bea2d0fea901c055b995f4a Mon Sep 17 00:00:00 2001 From: Steffen Klassert Date: Mon, 12 Jan 2015 13:38:49 +0100 Subject: xfrm: Fix local error reporting crash with interfamily tunnels We set the outer mode protocol too early. As a result, the local error handler might dispatch to the wrong address family and report the error to a wrong socket type. We fix this by setting the outer protocol to the skb after we accessed the inner mode for the last time, right before we do the atcual encapsulation where we switch finally to the outer mode. Reported-by: Chris Ruehl Tested-by: Chris Ruehl Signed-off-by: Steffen Klassert --- net/ipv4/xfrm4_output.c | 2 +- net/ipv6/xfrm6_output.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv4/xfrm4_output.c b/net/ipv4/xfrm4_output.c index d5f6bd9a210a..dab73813cb92 100644 --- a/net/ipv4/xfrm4_output.c +++ b/net/ipv4/xfrm4_output.c @@ -63,6 +63,7 @@ int xfrm4_prepare_output(struct xfrm_state *x, struct sk_buff *skb) return err; IPCB(skb)->flags |= IPSKB_XFRM_TUNNEL_SIZE; + skb->protocol = htons(ETH_P_IP); return x->outer_mode->output2(x, skb); } @@ -71,7 +72,6 @@ EXPORT_SYMBOL(xfrm4_prepare_output); int xfrm4_output_finish(struct sk_buff *skb) { memset(IPCB(skb), 0, sizeof(*IPCB(skb))); - skb->protocol = htons(ETH_P_IP); #ifdef CONFIG_NETFILTER IPCB(skb)->flags |= IPSKB_XFRM_TRANSFORMED; diff --git a/net/ipv6/xfrm6_output.c b/net/ipv6/xfrm6_output.c index ca3f29b98ae5..010f8bd2d577 100644 --- a/net/ipv6/xfrm6_output.c +++ b/net/ipv6/xfrm6_output.c @@ -114,6 +114,7 @@ int xfrm6_prepare_output(struct xfrm_state *x, struct sk_buff *skb) return err; skb->ignore_df = 1; + skb->protocol = htons(ETH_P_IPV6); return x->outer_mode->output2(x, skb); } @@ -122,7 +123,6 @@ EXPORT_SYMBOL(xfrm6_prepare_output); int xfrm6_output_finish(struct sk_buff *skb) { memset(IP6CB(skb), 0, sizeof(*IP6CB(skb))); - skb->protocol = htons(ETH_P_IPV6); #ifdef CONFIG_NETFILTER IP6CB(skb)->flags |= IP6SKB_XFRM_TRANSFORMED; -- cgit v1.2.3 From ac37e2515c1a89c477459a2020b6bfdedabdb91b Mon Sep 17 00:00:00 2001 From: huaibin Wang Date: Wed, 11 Feb 2015 18:10:36 +0100 Subject: xfrm: release dst_orig in case of error in xfrm_lookup() dst_orig should be released on error. Function like __xfrm_route_forward() expects that behavior. Since a recent commit, xfrm_lookup() may also be called by xfrm_lookup_route(), which expects the opposite. Let's introduce a new flag (XFRM_LOOKUP_KEEP_DST_REF) to tell what should be done in case of error. Fixes: f92ee61982d("xfrm: Generate blackhole routes only from route lookup functions") Signed-off-by: huaibin Wang Signed-off-by: Nicolas Dichtel Signed-off-by: Steffen Klassert --- include/net/dst.h | 1 + net/xfrm/xfrm_policy.c | 12 ++++++------ 2 files changed, 7 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/include/net/dst.h b/include/net/dst.h index a8ae4e760778..0fb99a26e973 100644 --- a/include/net/dst.h +++ b/include/net/dst.h @@ -481,6 +481,7 @@ void dst_init(void); enum { XFRM_LOOKUP_ICMP = 1 << 0, XFRM_LOOKUP_QUEUE = 1 << 1, + XFRM_LOOKUP_KEEP_DST_REF = 1 << 2, }; struct flowi; diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index cee479bc655c..638af0655aaf 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -2269,11 +2269,9 @@ struct dst_entry *xfrm_lookup(struct net *net, struct dst_entry *dst_orig, * have the xfrm_state's. We need to wait for KM to * negotiate new SA's or bail out with error.*/ if (net->xfrm.sysctl_larval_drop) { - dst_release(dst); - xfrm_pols_put(pols, drop_pols); XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTNOSTATES); - - return ERR_PTR(-EREMOTE); + err = -EREMOTE; + goto error; } err = -EAGAIN; @@ -2324,7 +2322,8 @@ nopol: error: dst_release(dst); dropdst: - dst_release(dst_orig); + if (!(flags & XFRM_LOOKUP_KEEP_DST_REF)) + dst_release(dst_orig); xfrm_pols_put(pols, drop_pols); return ERR_PTR(err); } @@ -2338,7 +2337,8 @@ struct dst_entry *xfrm_lookup_route(struct net *net, struct dst_entry *dst_orig, struct sock *sk, int flags) { struct dst_entry *dst = xfrm_lookup(net, dst_orig, fl, sk, - flags | XFRM_LOOKUP_QUEUE); + flags | XFRM_LOOKUP_QUEUE | + XFRM_LOOKUP_KEEP_DST_REF); if (IS_ERR(dst) && PTR_ERR(dst) == -EREMOTE) return make_blackhole(net, dst_orig->ops->family, dst_orig); -- cgit v1.2.3 From 2156d321b879cdadb95a633d046169cfebdbf784 Mon Sep 17 00:00:00 2001 From: Arturo Borrero Date: Sat, 21 Feb 2015 19:30:55 +0100 Subject: netfilter: nft_compat: don't truncate ethernet protocol type to u8 Use u16 for protocol and then cast it to __be16 >> net/netfilter/nft_compat.c:140:37: sparse: incorrect type in assignment (different base types) net/netfilter/nft_compat.c:140:37: expected restricted __be16 [usertype] ethproto net/netfilter/nft_compat.c:140:37: got unsigned char [unsigned] [usertype] proto >> net/netfilter/nft_compat.c:351:37: sparse: incorrect type in assignment (different base types) net/netfilter/nft_compat.c:351:37: expected restricted __be16 [usertype] ethproto net/netfilter/nft_compat.c:351:37: got unsigned char [unsigned] [usertype] proto Reported-by: kbuild test robot Signed-off-by: Arturo Borrero Gonzalez Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_compat.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/netfilter/nft_compat.c b/net/netfilter/nft_compat.c index 1279cd85663e..213584cf04b3 100644 --- a/net/netfilter/nft_compat.c +++ b/net/netfilter/nft_compat.c @@ -123,7 +123,7 @@ static void nft_target_set_tgchk_param(struct xt_tgchk_param *par, const struct nft_ctx *ctx, struct xt_target *target, void *info, - union nft_entry *entry, u8 proto, bool inv) + union nft_entry *entry, u16 proto, bool inv) { par->net = ctx->net; par->table = ctx->table->name; @@ -137,7 +137,7 @@ nft_target_set_tgchk_param(struct xt_tgchk_param *par, entry->e6.ipv6.invflags = inv ? IP6T_INV_PROTO : 0; break; case NFPROTO_BRIDGE: - entry->ebt.ethproto = proto; + entry->ebt.ethproto = (__force __be16)proto; entry->ebt.invflags = inv ? EBT_IPROTO : 0; break; } @@ -171,7 +171,7 @@ static const struct nla_policy nft_rule_compat_policy[NFTA_RULE_COMPAT_MAX + 1] [NFTA_RULE_COMPAT_FLAGS] = { .type = NLA_U32 }, }; -static int nft_parse_compat(const struct nlattr *attr, u8 *proto, bool *inv) +static int nft_parse_compat(const struct nlattr *attr, u16 *proto, bool *inv) { struct nlattr *tb[NFTA_RULE_COMPAT_MAX+1]; u32 flags; @@ -203,7 +203,7 @@ nft_target_init(const struct nft_ctx *ctx, const struct nft_expr *expr, struct xt_target *target = expr->ops->data; struct xt_tgchk_param par; size_t size = XT_ALIGN(nla_len(tb[NFTA_TARGET_INFO])); - u8 proto = 0; + u16 proto = 0; bool inv = false; union nft_entry e = {}; int ret; @@ -334,7 +334,7 @@ static const struct nla_policy nft_match_policy[NFTA_MATCH_MAX + 1] = { static void nft_match_set_mtchk_param(struct xt_mtchk_param *par, const struct nft_ctx *ctx, struct xt_match *match, void *info, - union nft_entry *entry, u8 proto, bool inv) + union nft_entry *entry, u16 proto, bool inv) { par->net = ctx->net; par->table = ctx->table->name; @@ -348,7 +348,7 @@ nft_match_set_mtchk_param(struct xt_mtchk_param *par, const struct nft_ctx *ctx, entry->e6.ipv6.invflags = inv ? IP6T_INV_PROTO : 0; break; case NFPROTO_BRIDGE: - entry->ebt.ethproto = proto; + entry->ebt.ethproto = (__force __be16)proto; entry->ebt.invflags = inv ? EBT_IPROTO : 0; break; } @@ -385,7 +385,7 @@ nft_match_init(const struct nft_ctx *ctx, const struct nft_expr *expr, struct xt_match *match = expr->ops->data; struct xt_mtchk_param par; size_t size = XT_ALIGN(nla_len(tb[NFTA_MATCH_INFO])); - u8 proto = 0; + u16 proto = 0; bool inv = false; union nft_entry e = {}; int ret; -- cgit v1.2.3 From 02263db00b6cb98701332aa257c07ca549c2324b Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Fri, 20 Feb 2015 17:11:10 +0100 Subject: netfilter: nf_tables: fix addition/deletion of elements from commit/abort We have several problems in this path: 1) There is a use-after-free when removing individual elements from the commit path. 2) We have to uninit() the data part of the element from the abort path to avoid a chain refcount leak. 3) We have to check for set->flags to see if there's a mapping, instead of the element flags. 4) We have to check for !(flags & NFT_SET_ELEM_INTERVAL_END) to skip elements that are part of the interval that have no data part, so they don't need to be uninit(). Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 199fd0f27b0e..a8c94620f20e 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -3612,12 +3612,11 @@ static int nf_tables_commit(struct sk_buff *skb) &te->elem, NFT_MSG_DELSETELEM, 0); te->set->ops->get(te->set, &te->elem); - te->set->ops->remove(te->set, &te->elem); nft_data_uninit(&te->elem.key, NFT_DATA_VALUE); - if (te->elem.flags & NFT_SET_MAP) { - nft_data_uninit(&te->elem.data, - te->set->dtype); - } + if (te->set->flags & NFT_SET_MAP && + !(te->elem.flags & NFT_SET_ELEM_INTERVAL_END)) + nft_data_uninit(&te->elem.data, te->set->dtype); + te->set->ops->remove(te->set, &te->elem); nft_trans_destroy(trans); break; } @@ -3658,7 +3657,7 @@ static int nf_tables_abort(struct sk_buff *skb) { struct net *net = sock_net(skb->sk); struct nft_trans *trans, *next; - struct nft_set *set; + struct nft_trans_elem *te; list_for_each_entry_safe(trans, next, &net->nft.commit_list, list) { switch (trans->msg_type) { @@ -3719,9 +3718,13 @@ static int nf_tables_abort(struct sk_buff *skb) break; case NFT_MSG_NEWSETELEM: nft_trans_elem_set(trans)->nelems--; - set = nft_trans_elem_set(trans); - set->ops->get(set, &nft_trans_elem(trans)); - set->ops->remove(set, &nft_trans_elem(trans)); + te = (struct nft_trans_elem *)trans->data; + te->set->ops->get(te->set, &te->elem); + nft_data_uninit(&te->elem.key, NFT_DATA_VALUE); + if (te->set->flags & NFT_SET_MAP && + !(te->elem.flags & NFT_SET_ELEM_INTERVAL_END)) + nft_data_uninit(&te->elem.data, te->set->dtype); + te->set->ops->remove(te->set, &te->elem); nft_trans_destroy(trans); break; case NFT_MSG_DELSETELEM: -- cgit v1.2.3 From 528c943f3bb919aef75ab2fff4f00176f09a4019 Mon Sep 17 00:00:00 2001 From: Julian Anastasov Date: Sat, 21 Feb 2015 21:03:10 +0200 Subject: ipvs: add missing ip_vs_pe_put in sync code ip_vs_conn_fill_param_sync() gets in param.pe a module reference for persistence engine from __ip_vs_pe_getbyname() but forgets to put it. Problem occurs in backup for sync protocol v1 (2.6.39). Also, pe_data usually comes in sync messages for connection templates and ip_vs_conn_new() copies the pointer only in this case. Make sure pe_data is not leaked if it comes unexpectedly for normal connections. Leak can happen only if bogus messages are sent to backup server. Fixes: fe5e7a1efb66 ("IPVS: Backup, Adding Version 1 receive capability") Signed-off-by: Julian Anastasov Signed-off-by: Simon Horman --- net/netfilter/ipvs/ip_vs_sync.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net') diff --git a/net/netfilter/ipvs/ip_vs_sync.c b/net/netfilter/ipvs/ip_vs_sync.c index c47ffd7a0a70..d93ceeb3ef04 100644 --- a/net/netfilter/ipvs/ip_vs_sync.c +++ b/net/netfilter/ipvs/ip_vs_sync.c @@ -896,6 +896,8 @@ static void ip_vs_proc_conn(struct net *net, struct ip_vs_conn_param *param, IP_VS_DBG(2, "BACKUP, add new conn. failed\n"); return; } + if (!(flags & IP_VS_CONN_F_TEMPLATE)) + kfree(param->pe_data); } if (opt) @@ -1169,6 +1171,7 @@ static inline int ip_vs_proc_sync_conn(struct net *net, __u8 *p, __u8 *msg_end) (opt_flags & IPVS_OPT_F_SEQ_DATA ? &opt : NULL) ); #endif + ip_vs_pe_put(param.pe); return 0; /* Error exit */ out: -- cgit v1.2.3 From d0c22119f574b851e63360c6b8660fe9593bbc3c Mon Sep 17 00:00:00 2001 From: Bob Copeland Date: Mon, 2 Mar 2015 14:28:52 -0500 Subject: mac80211: drop unencrypted frames in mesh fwding The mesh forwarding path was not checking that data frames were protected when running an encrypted network; add the necessary check. Cc: stable@vger.kernel.org Reported-by: Johannes Berg Signed-off-by: Bob Copeland Signed-off-by: Johannes Berg --- net/mac80211/rx.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net') diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index 1101563357ea..944bdc04e913 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -2214,6 +2214,9 @@ ieee80211_rx_h_mesh_fwding(struct ieee80211_rx_data *rx) hdr = (struct ieee80211_hdr *) skb->data; mesh_hdr = (struct ieee80211s_hdr *) (skb->data + hdrlen); + if (ieee80211_drop_unencrypted(rx, hdr->frame_control)) + return RX_DROP_MONITOR; + /* frame is in RMC, don't forward */ if (ieee80211_is_data(hdr->frame_control) && is_multicast_ether_addr(hdr->addr1) && -- cgit v1.2.3 From aa75ebc275b2a91b193654a177daf900ad6703f0 Mon Sep 17 00:00:00 2001 From: Michal Kazior Date: Tue, 10 Feb 2015 12:48:44 +0100 Subject: mac80211: disable u-APSD queues by default Some APs experience problems when working with U-APSD. Decreasing the probability of that happening by using legacy mode for all ACs but VO isn't enough. Cisco 4410N originally forced us to enable VO by default only because it treated non-VO ACs as legacy. However some APs (notably Netgear R7000) silently reclassify packets to different ACs. Since u-APSD ACs require trigger frames for frame retrieval clients would never see some frames (e.g. ARP responses) or would fetch them accidentally after a long time. It makes little sense to enable u-APSD queues by default because it needs userspace applications to be aware of it to actually take advantage of the possible additional powersavings. Implicitly depending on driver autotrigger frame support doesn't make much sense. Cc: stable@vger.kernel.org Signed-off-by: Michal Kazior Signed-off-by: Johannes Berg --- net/mac80211/ieee80211_i.h | 23 +++++++++++++++++------ 1 file changed, 17 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index 3afe36824703..c0e089c194f1 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -58,13 +58,24 @@ struct ieee80211_local; #define IEEE80211_UNSET_POWER_LEVEL INT_MIN /* - * Some APs experience problems when working with U-APSD. Decrease the - * probability of that happening by using legacy mode for all ACs but VO. - * The AP that caused us trouble was a Cisco 4410N. It ignores our - * setting, and always treats non-VO ACs as legacy. + * Some APs experience problems when working with U-APSD. Decreasing the + * probability of that happening by using legacy mode for all ACs but VO isn't + * enough. + * + * Cisco 4410N originally forced us to enable VO by default only because it + * treated non-VO ACs as legacy. + * + * However some APs (notably Netgear R7000) silently reclassify packets to + * different ACs. Since u-APSD ACs require trigger frames for frame retrieval + * clients would never see some frames (e.g. ARP responses) or would fetch them + * accidentally after a long time. + * + * It makes little sense to enable u-APSD queues by default because it needs + * userspace applications to be aware of it to actually take advantage of the + * possible additional powersavings. Implicitly depending on driver autotrigger + * frame support doesn't make much sense. */ -#define IEEE80211_DEFAULT_UAPSD_QUEUES \ - IEEE80211_WMM_IE_STA_QOSINFO_AC_VO +#define IEEE80211_DEFAULT_UAPSD_QUEUES 0 #define IEEE80211_DEFAULT_MAX_SP_LEN \ IEEE80211_WMM_IE_STA_QOSINFO_SP_ALL -- cgit v1.2.3 From 8670c3a55e91cb27a4b4d4d4c4fa35b0149e1abf Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Tue, 3 Mar 2015 20:04:18 +0000 Subject: netfilter: nf_tables: fix transaction race condition A race condition exists in the rule transaction code for rules that get added and removed within the same transaction. The new rule starts out as inactive in the current and active in the next generation and is inserted into the ruleset. When it is deleted, it is additionally set to inactive in the next generation as well. On commit the next generation is begun, then the actions are finalized. For the new rule this would mean clearing out the inactive bit for the previously current, now next generation. However nft_rule_clear() clears out the bits for *both* generations, activating the rule in the current generation, where it should be deactivated due to being deleted. The rule will thus be active until the deletion is finalized, removing the rule from the ruleset. Similarly, when aborting a transaction for the same case, the undo of insertion will remove it from the RCU protected rule list, the deletion will clear out all bits. However until the next RCU synchronization after all operations have been undone, the rule is active on CPUs which can still see the rule on the list. Generally, there may never be any modifications of the current generations' inactive bit since this defeats the entire purpose of atomicity. Change nft_rule_clear() to only touch the next generations bit to fix this. Signed-off-by: Patrick McHardy Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index a8c94620f20e..6fb532bf0fdb 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -227,7 +227,7 @@ nft_rule_deactivate_next(struct net *net, struct nft_rule *rule) static inline void nft_rule_clear(struct net *net, struct nft_rule *rule) { - rule->genmask = 0; + rule->genmask &= ~(1 << gencursor_next(net)); } static int -- cgit v1.2.3 From 9889840f5988ecfd43b00c9abb83c1804e21406b Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Tue, 3 Mar 2015 20:04:19 +0000 Subject: netfilter: nf_tables: check for overflow of rule dlen field Check that the space required for the expressions doesn't exceed the size of the dlen field, which would lead to the iterators crashing. Signed-off-by: Patrick McHardy Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'net') diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 6fb532bf0fdb..7baafd5ab520 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -1968,6 +1968,10 @@ static int nf_tables_newrule(struct sock *nlsk, struct sk_buff *skb, n++; } } + /* Check for overflow of dlen field */ + err = -EFBIG; + if (size >= 1 << 12) + goto err1; if (nla[NFTA_RULE_USERDATA]) ulen = nla_len(nla[NFTA_RULE_USERDATA]); -- cgit v1.2.3 From 86f1ec32318159a24de349f0a38e79b9d2b3131a Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Tue, 3 Mar 2015 20:04:20 +0000 Subject: netfilter: nf_tables: fix userdata length overflow The NFT_USERDATA_MAXLEN is defined to 256, however we only have a u8 to store its size. Introduce a struct nft_userdata which contains a length field and indicate its presence using a single bit in the rule. The length field of struct nft_userdata is also a u8, however we don't store zero sized data, so the actual length is udata->len + 1. Signed-off-by: Patrick McHardy Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 22 +++++++++++++++++++--- net/netfilter/nf_tables_api.c | 28 +++++++++++++++++++--------- 2 files changed, 38 insertions(+), 12 deletions(-) (limited to 'net') diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 9eaaa7884586..decb9a095ae7 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -119,6 +119,22 @@ int nft_validate_data_load(const struct nft_ctx *ctx, enum nft_registers reg, const struct nft_data *data, enum nft_data_types type); + +/** + * struct nft_userdata - user defined data associated with an object + * + * @len: length of the data + * @data: content + * + * The presence of user data is indicated in an object specific fashion, + * so a length of zero can't occur and the value "len" indicates data + * of length len + 1. + */ +struct nft_userdata { + u8 len; + unsigned char data[0]; +}; + /** * struct nft_set_elem - generic representation of set elements * @@ -380,7 +396,7 @@ static inline void *nft_expr_priv(const struct nft_expr *expr) * @handle: rule handle * @genmask: generation mask * @dlen: length of expression data - * @ulen: length of user data (used for comments) + * @udata: user data is appended to the rule * @data: expression data */ struct nft_rule { @@ -388,7 +404,7 @@ struct nft_rule { u64 handle:42, genmask:2, dlen:12, - ulen:8; + udata:1; unsigned char data[] __attribute__((aligned(__alignof__(struct nft_expr)))); }; @@ -476,7 +492,7 @@ static inline struct nft_expr *nft_expr_last(const struct nft_rule *rule) return (struct nft_expr *)&rule->data[rule->dlen]; } -static inline void *nft_userdata(const struct nft_rule *rule) +static inline struct nft_userdata *nft_userdata(const struct nft_rule *rule) { return (void *)&rule->data[rule->dlen]; } diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 7baafd5ab520..74e4b876c96e 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -1711,9 +1711,12 @@ static int nf_tables_fill_rule_info(struct sk_buff *skb, struct net *net, } nla_nest_end(skb, list); - if (rule->ulen && - nla_put(skb, NFTA_RULE_USERDATA, rule->ulen, nft_userdata(rule))) - goto nla_put_failure; + if (rule->udata) { + struct nft_userdata *udata = nft_userdata(rule); + if (nla_put(skb, NFTA_RULE_USERDATA, udata->len + 1, + udata->data) < 0) + goto nla_put_failure; + } nlmsg_end(skb, nlh); return 0; @@ -1896,11 +1899,12 @@ static int nf_tables_newrule(struct sock *nlsk, struct sk_buff *skb, struct nft_table *table; struct nft_chain *chain; struct nft_rule *rule, *old_rule = NULL; + struct nft_userdata *udata; struct nft_trans *trans = NULL; struct nft_expr *expr; struct nft_ctx ctx; struct nlattr *tmp; - unsigned int size, i, n, ulen = 0; + unsigned int size, i, n, ulen = 0, usize = 0; int err, rem; bool create; u64 handle, pos_handle; @@ -1973,11 +1977,14 @@ static int nf_tables_newrule(struct sock *nlsk, struct sk_buff *skb, if (size >= 1 << 12) goto err1; - if (nla[NFTA_RULE_USERDATA]) + if (nla[NFTA_RULE_USERDATA]) { ulen = nla_len(nla[NFTA_RULE_USERDATA]); + if (ulen > 0) + usize = sizeof(struct nft_userdata) + ulen; + } err = -ENOMEM; - rule = kzalloc(sizeof(*rule) + size + ulen, GFP_KERNEL); + rule = kzalloc(sizeof(*rule) + size + usize, GFP_KERNEL); if (rule == NULL) goto err1; @@ -1985,10 +1992,13 @@ static int nf_tables_newrule(struct sock *nlsk, struct sk_buff *skb, rule->handle = handle; rule->dlen = size; - rule->ulen = ulen; + rule->udata = ulen ? 1 : 0; - if (ulen) - nla_memcpy(nft_userdata(rule), nla[NFTA_RULE_USERDATA], ulen); + if (ulen) { + udata = nft_userdata(rule); + udata->len = ulen - 1; + nla_memcpy(udata->data, nla[NFTA_RULE_USERDATA], ulen); + } expr = nft_expr_first(rule); for (i = 0; i < n; i++) { -- cgit v1.2.3 From 59900e0a019e7c2bdb7809a03ed5742d311b15b3 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Wed, 4 Mar 2015 17:55:27 +0100 Subject: netfilter: nf_tables: fix error handling of rule replacement In general, if a transaction object is added to the list successfully, we can rely on the abort path to undo what we've done. This allows us to simplify the error handling of the rule replacement path in nf_tables_newrule(). This implicitly fixes an unnecessary removal of the old rule, which needs to be left in place if we fail to replace. Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 6 ------ 1 file changed, 6 deletions(-) (limited to 'net') diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 74e4b876c96e..6ab777912237 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -2045,12 +2045,6 @@ static int nf_tables_newrule(struct sock *nlsk, struct sk_buff *skb, err3: list_del_rcu(&rule->list); - if (trans) { - list_del_rcu(&nft_trans_rule(trans)->list); - nft_rule_clear(net, nft_trans_rule(trans)); - nft_trans_destroy(trans); - chain->use++; - } err2: nf_tables_rule_destroy(&ctx, rule); err1: -- cgit v1.2.3 From 9145736d4862145684009d6a72a6e61324a9439e Mon Sep 17 00:00:00 2001 From: Lorenzo Colitti Date: Tue, 3 Mar 2015 23:16:16 +0900 Subject: net: ping: Return EAFNOSUPPORT when appropriate. 1. For an IPv4 ping socket, ping_check_bind_addr does not check the family of the socket address that's passed in. Instead, make it behave like inet_bind, which enforces either that the address family is AF_INET, or that the family is AF_UNSPEC and the address is 0.0.0.0. 2. For an IPv6 ping socket, ping_check_bind_addr returns EINVAL if the socket family is not AF_INET6. Return EAFNOSUPPORT instead, for consistency with inet6_bind. 3. Make ping_v4_sendmsg and ping_v6_sendmsg return EAFNOSUPPORT instead of EINVAL if an incorrect socket address structure is passed in. 4. Make IPv6 ping sockets be IPv6-only. The code does not support IPv4, and it cannot easily be made to support IPv4 because the protocol numbers for ICMP and ICMPv6 are different. This makes connect(::ffff:192.0.2.1) fail with EAFNOSUPPORT instead of making the socket unusable. Among other things, this fixes an oops that can be triggered by: int s = socket(AF_INET, SOCK_DGRAM, IPPROTO_ICMP); struct sockaddr_in6 sin6 = { .sin6_family = AF_INET6, .sin6_addr = in6addr_any, }; bind(s, (struct sockaddr *) &sin6, sizeof(sin6)); Change-Id: If06ca86d9f1e4593c0d6df174caca3487c57a241 Signed-off-by: Lorenzo Colitti Signed-off-by: David S. Miller --- net/ipv4/ping.c | 12 ++++++++++-- net/ipv6/ping.c | 5 +++-- 2 files changed, 13 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c index e9f66e1cda50..208d5439e59b 100644 --- a/net/ipv4/ping.c +++ b/net/ipv4/ping.c @@ -259,6 +259,9 @@ int ping_init_sock(struct sock *sk) kgid_t low, high; int ret = 0; + if (sk->sk_family == AF_INET6) + sk->sk_ipv6only = 1; + inet_get_ping_group_range_net(net, &low, &high); if (gid_lte(low, group) && gid_lte(group, high)) return 0; @@ -305,6 +308,11 @@ static int ping_check_bind_addr(struct sock *sk, struct inet_sock *isk, if (addr_len < sizeof(*addr)) return -EINVAL; + if (addr->sin_family != AF_INET && + !(addr->sin_family == AF_UNSPEC && + addr->sin_addr.s_addr == htonl(INADDR_ANY))) + return -EAFNOSUPPORT; + pr_debug("ping_check_bind_addr(sk=%p,addr=%pI4,port=%d)\n", sk, &addr->sin_addr.s_addr, ntohs(addr->sin_port)); @@ -330,7 +338,7 @@ static int ping_check_bind_addr(struct sock *sk, struct inet_sock *isk, return -EINVAL; if (addr->sin6_family != AF_INET6) - return -EINVAL; + return -EAFNOSUPPORT; pr_debug("ping_check_bind_addr(sk=%p,addr=%pI6c,port=%d)\n", sk, addr->sin6_addr.s6_addr, ntohs(addr->sin6_port)); @@ -716,7 +724,7 @@ static int ping_v4_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *m if (msg->msg_namelen < sizeof(*usin)) return -EINVAL; if (usin->sin_family != AF_INET) - return -EINVAL; + return -EAFNOSUPPORT; daddr = usin->sin_addr.s_addr; /* no remote port */ } else { diff --git a/net/ipv6/ping.c b/net/ipv6/ping.c index bd46f736f61d..a2dfff6ff227 100644 --- a/net/ipv6/ping.c +++ b/net/ipv6/ping.c @@ -102,9 +102,10 @@ int ping_v6_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, if (msg->msg_name) { DECLARE_SOCKADDR(struct sockaddr_in6 *, u, msg->msg_name); - if (msg->msg_namelen < sizeof(struct sockaddr_in6) || - u->sin6_family != AF_INET6) { + if (msg->msg_namelen < sizeof(*u)) return -EINVAL; + if (u->sin6_family != AF_INET6) { + return -EAFNOSUPPORT; } if (sk->sk_bound_dev_if && sk->sk_bound_dev_if != u->sin6_scope_id) { -- cgit v1.2.3 From 3e32e733d1bbb3f227259dc782ef01d5706bdae0 Mon Sep 17 00:00:00 2001 From: Alexander Drozdov Date: Thu, 5 Mar 2015 10:29:39 +0300 Subject: ipv4: ip_check_defrag should not assume that skb_network_offset is zero ip_check_defrag() may be used by af_packet to defragment outgoing packets. skb_network_offset() of af_packet's outgoing packets is not zero. Signed-off-by: Alexander Drozdov Signed-off-by: David S. Miller --- net/ipv4/ip_fragment.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index 2c8d98e728c0..145a50c4d566 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -659,27 +659,30 @@ EXPORT_SYMBOL(ip_defrag); struct sk_buff *ip_check_defrag(struct sk_buff *skb, u32 user) { struct iphdr iph; + int netoff; u32 len; if (skb->protocol != htons(ETH_P_IP)) return skb; - if (skb_copy_bits(skb, 0, &iph, sizeof(iph)) < 0) + netoff = skb_network_offset(skb); + + if (skb_copy_bits(skb, netoff, &iph, sizeof(iph)) < 0) return skb; if (iph.ihl < 5 || iph.version != 4) return skb; len = ntohs(iph.tot_len); - if (skb->len < len || len < (iph.ihl * 4)) + if (skb->len < netoff + len || len < (iph.ihl * 4)) return skb; if (ip_is_fragment(&iph)) { skb = skb_share_check(skb, GFP_ATOMIC); if (skb) { - if (!pskb_may_pull(skb, iph.ihl*4)) + if (!pskb_may_pull(skb, netoff + iph.ihl * 4)) return skb; - if (pskb_trim_rcsum(skb, len)) + if (pskb_trim_rcsum(skb, netoff + len)) return skb; memset(IPCB(skb), 0, sizeof(struct inet_skb_parm)); if (ip_defrag(skb, user)) -- cgit v1.2.3 From 6c09fa09d468d730eecd7122122175da772d3b09 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 5 Mar 2015 08:03:06 -0800 Subject: tcp: align tcp_xmit_size_goal() on tcp_tso_autosize() With some mss values, it is possible tcp_xmit_size_goal() puts one segment more in TSO packet than tcp_tso_autosize(). We send then one TSO packet followed by one single MSS. It is not a serious bug, but we can do slightly better, especially for drivers using netif_set_gso_max_size() to lower gso_max_size. Using same formula avoids these corner cases and makes tcp_xmit_size_goal() a bit faster. Signed-off-by: Eric Dumazet Fixes: 605ad7f184b6 ("tcp: refine TSO autosizing") Acked-by: Neal Cardwell Signed-off-by: David S. Miller --- net/ipv4/tcp.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 9d72a0fcd928..995a2259bcfc 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -835,17 +835,13 @@ static unsigned int tcp_xmit_size_goal(struct sock *sk, u32 mss_now, int large_allowed) { struct tcp_sock *tp = tcp_sk(sk); - u32 new_size_goal, size_goal, hlen; + u32 new_size_goal, size_goal; if (!large_allowed || !sk_can_gso(sk)) return mss_now; - /* Maybe we should/could use sk->sk_prot->max_header here ? */ - hlen = inet_csk(sk)->icsk_af_ops->net_header_len + - inet_csk(sk)->icsk_ext_hdr_len + - tp->tcp_header_len; - - new_size_goal = sk->sk_gso_max_size - 1 - hlen; + /* Note : tcp_tso_autosize() will eventually split this later */ + new_size_goal = sk->sk_gso_max_size - 1 - MAX_TCP_HEADER; new_size_goal = tcp_bound_to_half_wnd(tp, new_size_goal); /* We try hard to avoid divides here */ -- cgit v1.2.3 From c247f0534cc5a5a547a343903f42295a471844e2 Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Sat, 7 Mar 2015 20:33:22 -0500 Subject: ip: fix error queue empty skb handling When reading from the error queue, msg_name and msg_control are only populated for some errors. A new exception for empty timestamp skbs added a false positive on icmp errors without payload. `traceroute -M udpconn` only displayed gateways that return payload with the icmp error: the embedded network headers are pulled before sock_queue_err_skb, leaving an skb with skb->len == 0 otherwise. Fix this regression by refining when msg_name and msg_control branches are taken. The solutions for the two fields are independent. msg_name only makes sense for errors that configure serr->port and serr->addr_offset. Test the first instead of skb->len. This also fixes another issue. saddr could hold the wrong data, as serr->addr_offset is not initialized in some code paths, pointing to the start of the network header. It is only valid when serr->port is set (non-zero). msg_control support differs between IPv4 and IPv6. IPv4 only honors requests for ICMP and timestamps with SOF_TIMESTAMPING_OPT_CMSG. The skb->len test can simply be removed, because skb->dev is also tested and never true for empty skbs. IPv6 honors requests for all errors aside from local errors and timestamps on empty skbs. In both cases, make the policy more explicit by moving this logic to a new function that decides whether to process msg_control and that optionally prepares the necessary fields in skb->cb[]. After this change, the IPv4 and IPv6 paths are more similar. The last case is rxrpc. Here, simply refine to only match timestamps. Fixes: 49ca0d8bfaf3 ("net-timestamp: no-payload option") Reported-by: Jan Niehusmann Signed-off-by: Willem de Bruijn ---- Changes v1->v2 - fix local origin test inversion in ip6_datagram_support_cmsg - make v4 and v6 code paths more similar by introducing analogous ipv4_datagram_support_cmsg - fix compile bug in rxrpc Signed-off-by: David S. Miller --- net/ipv4/ip_sockglue.c | 33 +++++++++++++++++++++++---------- net/ipv6/datagram.c | 39 ++++++++++++++++++++++++++++----------- net/rxrpc/ar-error.c | 4 ++-- 3 files changed, 53 insertions(+), 23 deletions(-) (limited to 'net') diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index 31d8c71986b4..5cd99271d3a6 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -432,17 +432,32 @@ void ip_local_error(struct sock *sk, int err, __be32 daddr, __be16 port, u32 inf kfree_skb(skb); } -static bool ipv4_pktinfo_prepare_errqueue(const struct sock *sk, - const struct sk_buff *skb, - int ee_origin) +/* IPv4 supports cmsg on all imcp errors and some timestamps + * + * Timestamp code paths do not initialize the fields expected by cmsg: + * the PKTINFO fields in skb->cb[]. Fill those in here. + */ +static bool ipv4_datagram_support_cmsg(const struct sock *sk, + struct sk_buff *skb, + int ee_origin) { - struct in_pktinfo *info = PKTINFO_SKB_CB(skb); + struct in_pktinfo *info; + + if (ee_origin == SO_EE_ORIGIN_ICMP) + return true; - if ((ee_origin != SO_EE_ORIGIN_TIMESTAMPING) || - (!(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_CMSG)) || + if (ee_origin == SO_EE_ORIGIN_LOCAL) + return false; + + /* Support IP_PKTINFO on tstamp packets if requested, to correlate + * timestamp with egress dev. Not possible for packets without dev + * or without payload (SOF_TIMESTAMPING_OPT_TSONLY). + */ + if ((!(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_CMSG)) || (!skb->dev)) return false; + info = PKTINFO_SKB_CB(skb); info->ipi_spec_dst.s_addr = ip_hdr(skb)->saddr; info->ipi_ifindex = skb->dev->ifindex; return true; @@ -483,7 +498,7 @@ int ip_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len) serr = SKB_EXT_ERR(skb); - if (sin && skb->len) { + if (sin && serr->port) { sin->sin_family = AF_INET; sin->sin_addr.s_addr = *(__be32 *)(skb_network_header(skb) + serr->addr_offset); @@ -496,9 +511,7 @@ int ip_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len) sin = &errhdr.offender; memset(sin, 0, sizeof(*sin)); - if (skb->len && - (serr->ee.ee_origin == SO_EE_ORIGIN_ICMP || - ipv4_pktinfo_prepare_errqueue(sk, skb, serr->ee.ee_origin))) { + if (ipv4_datagram_support_cmsg(sk, skb, serr->ee.ee_origin)) { sin->sin_family = AF_INET; sin->sin_addr.s_addr = ip_hdr(skb)->saddr; if (inet_sk(sk)->cmsg_flags) diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c index c215be70cac0..ace8daca5c83 100644 --- a/net/ipv6/datagram.c +++ b/net/ipv6/datagram.c @@ -325,14 +325,34 @@ void ipv6_local_rxpmtu(struct sock *sk, struct flowi6 *fl6, u32 mtu) kfree_skb(skb); } -static void ip6_datagram_prepare_pktinfo_errqueue(struct sk_buff *skb) +/* IPv6 supports cmsg on all origins aside from SO_EE_ORIGIN_LOCAL. + * + * At one point, excluding local errors was a quick test to identify icmp/icmp6 + * errors. This is no longer true, but the test remained, so the v6 stack, + * unlike v4, also honors cmsg requests on all wifi and timestamp errors. + * + * Timestamp code paths do not initialize the fields expected by cmsg: + * the PKTINFO fields in skb->cb[]. Fill those in here. + */ +static bool ip6_datagram_support_cmsg(struct sk_buff *skb, + struct sock_exterr_skb *serr) { - int ifindex = skb->dev ? skb->dev->ifindex : -1; + if (serr->ee.ee_origin == SO_EE_ORIGIN_ICMP || + serr->ee.ee_origin == SO_EE_ORIGIN_ICMP6) + return true; + + if (serr->ee.ee_origin == SO_EE_ORIGIN_LOCAL) + return false; + + if (!skb->dev) + return false; if (skb->protocol == htons(ETH_P_IPV6)) - IP6CB(skb)->iif = ifindex; + IP6CB(skb)->iif = skb->dev->ifindex; else - PKTINFO_SKB_CB(skb)->ipi_ifindex = ifindex; + PKTINFO_SKB_CB(skb)->ipi_ifindex = skb->dev->ifindex; + + return true; } /* @@ -369,7 +389,7 @@ int ipv6_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len) serr = SKB_EXT_ERR(skb); - if (sin && skb->len) { + if (sin && serr->port) { const unsigned char *nh = skb_network_header(skb); sin->sin6_family = AF_INET6; sin->sin6_flowinfo = 0; @@ -394,14 +414,11 @@ int ipv6_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len) memcpy(&errhdr.ee, &serr->ee, sizeof(struct sock_extended_err)); sin = &errhdr.offender; memset(sin, 0, sizeof(*sin)); - if (serr->ee.ee_origin != SO_EE_ORIGIN_LOCAL && skb->len) { + + if (ip6_datagram_support_cmsg(skb, serr)) { sin->sin6_family = AF_INET6; - if (np->rxopt.all) { - if (serr->ee.ee_origin != SO_EE_ORIGIN_ICMP && - serr->ee.ee_origin != SO_EE_ORIGIN_ICMP6) - ip6_datagram_prepare_pktinfo_errqueue(skb); + if (np->rxopt.all) ip6_datagram_recv_common_ctl(sk, msg, skb); - } if (skb->protocol == htons(ETH_P_IPV6)) { sin->sin6_addr = ipv6_hdr(skb)->saddr; if (np->rxopt.all) diff --git a/net/rxrpc/ar-error.c b/net/rxrpc/ar-error.c index 5394b6be46ec..0610efa83d72 100644 --- a/net/rxrpc/ar-error.c +++ b/net/rxrpc/ar-error.c @@ -42,7 +42,8 @@ void rxrpc_UDP_error_report(struct sock *sk) _leave("UDP socket errqueue empty"); return; } - if (!skb->len) { + serr = SKB_EXT_ERR(skb); + if (!skb->len && serr->ee.ee_origin == SO_EE_ORIGIN_TIMESTAMPING) { _leave("UDP empty message"); kfree_skb(skb); return; @@ -50,7 +51,6 @@ void rxrpc_UDP_error_report(struct sock *sk) rxrpc_new_skb(skb); - serr = SKB_EXT_ERR(skb); addr = *(__be32 *)(skb_network_header(skb) + serr->addr_offset); port = serr->port; -- cgit v1.2.3 From 969439016d2cf61fef53a973d7e6d2061c3793b1 Mon Sep 17 00:00:00 2001 From: Oliver Hartkopp Date: Mon, 23 Feb 2015 20:37:54 +0100 Subject: can: add missing initialisations in CAN related skbuffs When accessing CAN network interfaces with AF_PACKET sockets e.g. by dhclient this can lead to a skb_under_panic due to missing skb initialisations. Add the missing initialisations at the CAN skbuff creation times on driver level (rx path) and in the network layer (tx path). Reported-by: Austin Schuh Reported-by: Daniel Steer Signed-off-by: Oliver Hartkopp Cc: linux-stable Signed-off-by: Marc Kleine-Budde --- drivers/net/can/dev.c | 8 ++++++++ net/can/af_can.c | 3 +++ 2 files changed, 11 insertions(+) (limited to 'net') diff --git a/drivers/net/can/dev.c b/drivers/net/can/dev.c index 3c82e02e3dae..b0f69248cb71 100644 --- a/drivers/net/can/dev.c +++ b/drivers/net/can/dev.c @@ -579,6 +579,10 @@ struct sk_buff *alloc_can_skb(struct net_device *dev, struct can_frame **cf) skb->pkt_type = PACKET_BROADCAST; skb->ip_summed = CHECKSUM_UNNECESSARY; + skb_reset_mac_header(skb); + skb_reset_network_header(skb); + skb_reset_transport_header(skb); + can_skb_reserve(skb); can_skb_prv(skb)->ifindex = dev->ifindex; @@ -603,6 +607,10 @@ struct sk_buff *alloc_canfd_skb(struct net_device *dev, skb->pkt_type = PACKET_BROADCAST; skb->ip_summed = CHECKSUM_UNNECESSARY; + skb_reset_mac_header(skb); + skb_reset_network_header(skb); + skb_reset_transport_header(skb); + can_skb_reserve(skb); can_skb_prv(skb)->ifindex = dev->ifindex; diff --git a/net/can/af_can.c b/net/can/af_can.c index 66e08040ced7..32d710eaf1fc 100644 --- a/net/can/af_can.c +++ b/net/can/af_can.c @@ -259,6 +259,9 @@ int can_send(struct sk_buff *skb, int loop) goto inval_skb; } + skb->ip_summed = CHECKSUM_UNNECESSARY; + + skb_reset_mac_header(skb); skb_reset_network_header(skb); skb_reset_transport_header(skb); -- cgit v1.2.3 From 82f17091e68254d1612b42cf23291cad63cfaf04 Mon Sep 17 00:00:00 2001 From: Francesco Ruggeri Date: Mon, 9 Mar 2015 11:51:04 -0700 Subject: net: delete stale packet_mclist entries When an interface is deleted from a net namespace the ifindex in the corresponding entries in PF_PACKET sockets' mclists becomes stale. This can create inconsistencies if later an interface with the same ifindex is moved from a different namespace (not that unlikely since ifindexes are per-namespace). In particular we saw problems with dev->promiscuity, resulting in "promiscuity touches roof, set promiscuity failed. promiscuity feature of device might be broken" warnings and EOVERFLOW failures of setsockopt(PACKET_ADD_MEMBERSHIP). This patch deletes the mclist entries for interfaces that are deleted. Since this now causes setsockopt(PACKET_DROP_MEMBERSHIP) to fail with EADDRNOTAVAIL if called after the interface is deleted, also make packet_mc_drop not fail. Signed-off-by: Francesco Ruggeri Signed-off-by: David S. Miller --- net/packet/af_packet.c | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 5bf1e968a728..f8db7064d81c 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -3123,11 +3123,18 @@ static int packet_dev_mc(struct net_device *dev, struct packet_mclist *i, return 0; } -static void packet_dev_mclist(struct net_device *dev, struct packet_mclist *i, int what) +static void packet_dev_mclist_delete(struct net_device *dev, + struct packet_mclist **mlp) { - for ( ; i; i = i->next) { - if (i->ifindex == dev->ifindex) - packet_dev_mc(dev, i, what); + struct packet_mclist *ml; + + while ((ml = *mlp) != NULL) { + if (ml->ifindex == dev->ifindex) { + packet_dev_mc(dev, ml, -1); + *mlp = ml->next; + kfree(ml); + } else + mlp = &ml->next; } } @@ -3204,12 +3211,11 @@ static int packet_mc_drop(struct sock *sk, struct packet_mreq_max *mreq) packet_dev_mc(dev, ml, -1); kfree(ml); } - rtnl_unlock(); - return 0; + break; } } rtnl_unlock(); - return -EADDRNOTAVAIL; + return 0; } static void packet_flush_mclist(struct sock *sk) @@ -3559,7 +3565,7 @@ static int packet_notifier(struct notifier_block *this, switch (msg) { case NETDEV_UNREGISTER: if (po->mclist) - packet_dev_mclist(dev, po->mclist, -1); + packet_dev_mclist_delete(dev, &po->mclist); /* fallthrough */ case NETDEV_DOWN: -- cgit v1.2.3 From e6441bae326271090755e1707196ad05aa1dc703 Mon Sep 17 00:00:00 2001 From: Jon Paul Maloy Date: Mon, 9 Mar 2015 16:16:22 -0400 Subject: tipc: fix bug in link failover handling In commit c637c1035534867b85b78b453c38c495b58e2c5a ("tipc: resolve race problem at unicast message reception") we introduced a new mechanism for delivering buffers upwards from link to socket layer. That code contains a bug in how we handle the new link input queue during failover. When a link is reset, some of its users may be blocked because of congestion, and in order to resolve this, we add any pending wakeup pseudo messages to the link's input queue, and deliver them to the socket. This misses the case where the other, remaining link also may have congested users. Currently, the owner node's reference to the remaining link's input queue is unconditionally overwritten by the reset link's input queue. This has the effect that wakeup events from the remaining link may be unduely delayed (but not lost) for a potentially long period. We fix this by adding the pending events from the reset link to the input queue that is currently referenced by the node, whichever one it is. This commit should be applied to both net and net-next. Signed-off-by: Jon Maloy Signed-off-by: David S. Miller --- net/tipc/link.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/tipc/link.c b/net/tipc/link.c index a4cf364316de..14f09b3cb87c 100644 --- a/net/tipc/link.c +++ b/net/tipc/link.c @@ -464,10 +464,11 @@ void tipc_link_reset(struct tipc_link *l_ptr) /* Clean up all queues, except inputq: */ __skb_queue_purge(&l_ptr->outqueue); __skb_queue_purge(&l_ptr->deferred_queue); - skb_queue_splice_init(&l_ptr->wakeupq, &l_ptr->inputq); - if (!skb_queue_empty(&l_ptr->inputq)) + if (!owner->inputq) + owner->inputq = &l_ptr->inputq; + skb_queue_splice_init(&l_ptr->wakeupq, owner->inputq); + if (!skb_queue_empty(owner->inputq)) owner->action_flags |= TIPC_MSG_EVT; - owner->inputq = &l_ptr->inputq; l_ptr->next_out = NULL; l_ptr->unacked_window = 0; l_ptr->checkpoint = 1; -- cgit v1.2.3 From 5778d39d070b4ac5f889928175b7f2d53ae7504e Mon Sep 17 00:00:00 2001 From: WANG Cong Date: Mon, 9 Mar 2015 17:03:40 -0700 Subject: net_sched: fix struct tc_u_hnode layout in u32 We dynamically allocate divisor+1 entries for ->ht[] in tc_u_hnode: ht = kzalloc(sizeof(*ht) + divisor*sizeof(void *), GFP_KERNEL); So ->ht is supposed to be the last field of this struct, however this is broken, since an rcu head is appended after it. Fixes: 1ce87720d456 ("net: sched: make cls_u32 lockless") Cc: Jamal Hadi Salim Cc: John Fastabend Signed-off-by: Cong Wang Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/sched/cls_u32.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c index 09487afbfd51..95fdf4e40051 100644 --- a/net/sched/cls_u32.c +++ b/net/sched/cls_u32.c @@ -78,8 +78,11 @@ struct tc_u_hnode { struct tc_u_common *tp_c; int refcnt; unsigned int divisor; - struct tc_u_knode __rcu *ht[1]; struct rcu_head rcu; + /* The 'ht' field MUST be the last field in structure to allow for + * more entries allocated at end of structure. + */ + struct tc_u_knode __rcu *ht[1]; }; struct tc_u_common { -- cgit v1.2.3 From 7768eed8bf1d2e5eefa38c573f15f737a9824052 Mon Sep 17 00:00:00 2001 From: Oliver Hartkopp Date: Tue, 10 Mar 2015 19:03:46 +0100 Subject: net: add comment for sock_efree() usage Signed-off-by: Oliver Hartkopp Acked-by: Alexander Duyck Signed-off-by: David S. Miller --- net/core/sock.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'net') diff --git a/net/core/sock.c b/net/core/sock.c index 93c8b20c91e4..78e89eb7eb70 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1655,6 +1655,10 @@ void sock_rfree(struct sk_buff *skb) } EXPORT_SYMBOL(sock_rfree); +/* + * Buffer destructor for skbs that are not used directly in read or write + * path, e.g. for error handler skbs. Automatically called from kfree_skb. + */ void sock_efree(struct sk_buff *skb) { sock_put(skb->sk); -- cgit v1.2.3 From 4363890079674db7b00cf1bb0e6fa430e846e86b Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Tue, 10 Mar 2015 21:58:32 -0400 Subject: net: Handle unregister properly when netdev namespace change fails. If rtnl_newlink() fails on it's call to dev_change_net_namespace(), we have to make use of the ->dellink() method, if present, just like we do when rtnl_configure_link() fails. Fixes: 317f4810e45e ("rtnl: allow to create device with IFLA_LINK_NETNSID set") Signed-off-by: David S. Miller --- net/core/rtnetlink.c | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) (limited to 'net') diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 25b4b5d23485..ee0608bb3bc0 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -2166,28 +2166,28 @@ replay: } } err = rtnl_configure_link(dev, ifm); - if (err < 0) { - if (ops->newlink) { - LIST_HEAD(list_kill); - - ops->dellink(dev, &list_kill); - unregister_netdevice_many(&list_kill); - } else { - unregister_netdevice(dev); - } - goto out; - } - + if (err < 0) + goto out_unregister; if (link_net) { err = dev_change_net_namespace(dev, dest_net, ifname); if (err < 0) - unregister_netdevice(dev); + goto out_unregister; } out: if (link_net) put_net(link_net); put_net(dest_net); return err; +out_unregister: + if (ops->newlink) { + LIST_HEAD(list_kill); + + ops->dellink(dev, &list_kill); + unregister_netdevice_many(&list_kill); + } else { + unregister_netdevice(dev); + } + goto out; } } -- cgit v1.2.3 From 9949afa42be0b76f5832db112ce51bb6b35b2abb Mon Sep 17 00:00:00 2001 From: Neal Cardwell Date: Tue, 10 Mar 2015 17:17:03 -0400 Subject: tcp: fix tcp_cong_avoid_ai() credit accumulation bug with decreases in w The recent change to tcp_cong_avoid_ai() to handle stretch ACKs introduced a bug where snd_cwnd_cnt could accumulate a very large value while w was large, and then if w was reduced snd_cwnd could be incremented by a large delta, leading to a large burst and high packet loss. This was tickled when CUBIC's bictcp_update() sets "ca->cnt = 100 * cwnd". This bug crept in while preparing the upstream version of 814d488c6126. Testing: This patch has been tested in datacenter netperf transfers and live youtube.com and google.com servers. Fixes: 814d488c6126 ("tcp: fix the timid additive increase on stretch ACKs") Signed-off-by: Neal Cardwell Signed-off-by: Yuchung Cheng Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_cong.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'net') diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c index d694088214cd..62856e185a93 100644 --- a/net/ipv4/tcp_cong.c +++ b/net/ipv4/tcp_cong.c @@ -378,6 +378,12 @@ EXPORT_SYMBOL_GPL(tcp_slow_start); */ void tcp_cong_avoid_ai(struct tcp_sock *tp, u32 w, u32 acked) { + /* If credits accumulated at a higher w, apply them gently now. */ + if (tp->snd_cwnd_cnt >= w) { + tp->snd_cwnd_cnt = 0; + tp->snd_cwnd++; + } + tp->snd_cwnd_cnt += acked; if (tp->snd_cwnd_cnt >= w) { u32 delta = tp->snd_cwnd_cnt / w; -- cgit v1.2.3 From d578e18ce93f5d33a7120fd57c453e22a4c0fc37 Mon Sep 17 00:00:00 2001 From: Neal Cardwell Date: Tue, 10 Mar 2015 17:17:04 -0400 Subject: tcp: restore 1.5x per RTT limit to CUBIC cwnd growth in congestion avoidance Commit 814d488c6126 ("tcp: fix the timid additive increase on stretch ACKs") fixed a bug where tcp_cong_avoid_ai() would either credit a connection with an increase of snd_cwnd_cnt, or increase snd_cwnd, but not both, resulting in cwnd increasing by 1 packet on at most every alternate invocation of tcp_cong_avoid_ai(). Although the commit correctly implemented the CUBIC algorithm, which can increase cwnd by as much as 1 packet per 1 packet ACKed (2x per RTT), in practice that could be too aggressive: in tests on network paths with small buffers, YouTube server retransmission rates nearly doubled. This commit restores CUBIC to a maximum cwnd growth rate of 1 packet per 2 packets ACKed (1.5x per RTT). In YouTube tests this restored retransmit rates to low levels. Testing: This patch has been tested in datacenter netperf transfers and live youtube.com and google.com servers. Fixes: 9cd981dcf174 ("tcp: fix stretch ACK bugs in CUBIC") Signed-off-by: Neal Cardwell Signed-off-by: Yuchung Cheng Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_cubic.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_cubic.c b/net/ipv4/tcp_cubic.c index 4b276d1ed980..06d3d665a9fd 100644 --- a/net/ipv4/tcp_cubic.c +++ b/net/ipv4/tcp_cubic.c @@ -306,8 +306,10 @@ tcp_friendliness: } } - if (ca->cnt == 0) /* cannot be zero */ - ca->cnt = 1; + /* The maximum rate of cwnd increase CUBIC allows is 1 packet per + * 2 packets ACKed, meaning cwnd grows at 1.5x per RTT. + */ + ca->cnt = max(ca->cnt, 2U); } static void bictcp_cong_avoid(struct sock *sk, u32 ack, u32 acked) -- cgit v1.2.3 From b1cb59cf2efe7971d3d72a7b963d09a512d994c9 Mon Sep 17 00:00:00 2001 From: Alexey Kodanev Date: Wed, 11 Mar 2015 14:29:17 +0300 Subject: net: sysctl_net_core: check SNDBUF and RCVBUF for min length sysctl has sysctl.net.core.rmem_*/wmem_* parameters which can be set to incorrect values. Given that 'struct sk_buff' allocates from rcvbuf, incorrectly set buffer length could result to memory allocation failures. For example, set them as follows: # sysctl net.core.rmem_default=64 net.core.wmem_default = 64 # sysctl net.core.wmem_default=64 net.core.wmem_default = 64 # ping localhost -s 1024 -i 0 > /dev/null This could result to the following failure: skbuff: skb_over_panic: text:ffffffff81628db4 len:-32 put:-32 head:ffff88003a1cc200 data:ffff88003a1cc200 tail:0xffffffe0 end:0xc0 dev: kernel BUG at net/core/skbuff.c:102! invalid opcode: 0000 [#1] SMP ... task: ffff88003b7f5550 ti: ffff88003ae88000 task.ti: ffff88003ae88000 RIP: 0010:[] [] skb_put+0xa1/0xb0 RSP: 0018:ffff88003ae8bc68 EFLAGS: 00010296 RAX: 000000000000008d RBX: 00000000ffffffe0 RCX: 0000000000000000 RDX: ffff88003fdcf598 RSI: ffff88003fdcd9c8 RDI: ffff88003fdcd9c8 RBP: ffff88003ae8bc88 R08: 0000000000000001 R09: 0000000000000000 R10: 0000000000000001 R11: 00000000000002b2 R12: 0000000000000000 R13: 0000000000000000 R14: ffff88003d3f7300 R15: ffff88000012a900 FS: 00007fa0e2b4a840(0000) GS:ffff88003fc00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000d0f7e0 CR3: 000000003b8fb000 CR4: 00000000000006f0 Stack: ffff88003a1cc200 00000000ffffffe0 00000000000000c0 ffffffff818cab1d ffff88003ae8bd68 ffffffff81628db4 ffff88003ae8bd48 ffff88003b7f5550 ffff880031a09408 ffff88003b7f5550 ffff88000012aa48 ffff88000012ab00 Call Trace: [] unix_stream_sendmsg+0x2c4/0x470 [] sock_write_iter+0x146/0x160 [] new_sync_write+0x92/0xd0 [] vfs_write+0xd6/0x180 [] SyS_write+0x59/0xd0 [] system_call_fastpath+0x12/0x17 Code: 00 00 48 89 44 24 10 8b 87 c8 00 00 00 48 89 44 24 08 48 8b 87 d8 00 00 00 48 c7 c7 30 db 91 81 48 89 04 24 31 c0 e8 4f a8 0e 00 <0f> 0b eb fe 66 66 2e 0f 1f 84 00 00 00 00 00 55 48 89 e5 48 83 RIP [] skb_put+0xa1/0xb0 RSP Kernel panic - not syncing: Fatal exception Moreover, the possible minimum is 1, so we can get another kernel panic: ... BUG: unable to handle kernel paging request at ffff88013caee5c0 IP: [] __alloc_skb+0x12f/0x1f0 ... Signed-off-by: Alexey Kodanev Signed-off-by: David S. Miller --- net/core/sysctl_net_core.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index 433424804284..8ce351ffceb1 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -25,6 +25,8 @@ static int zero = 0; static int one = 1; static int ushort_max = USHRT_MAX; +static int min_sndbuf = SOCK_MIN_SNDBUF; +static int min_rcvbuf = SOCK_MIN_RCVBUF; static int net_msg_warn; /* Unused, but still a sysctl */ @@ -237,7 +239,7 @@ static struct ctl_table net_core_table[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &one, + .extra1 = &min_sndbuf, }, { .procname = "rmem_max", @@ -245,7 +247,7 @@ static struct ctl_table net_core_table[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &one, + .extra1 = &min_rcvbuf, }, { .procname = "wmem_default", @@ -253,7 +255,7 @@ static struct ctl_table net_core_table[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &one, + .extra1 = &min_sndbuf, }, { .procname = "rmem_default", @@ -261,7 +263,7 @@ static struct ctl_table net_core_table[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &one, + .extra1 = &min_rcvbuf, }, { .procname = "dev_weight", -- cgit v1.2.3 From c29390c6dfeee0944ac6b5610ebbe403944378fc Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 11 Mar 2015 18:42:02 -0700 Subject: xps: must clear sender_cpu before forwarding John reported that my previous commit added a regression on his router. This is because sender_cpu & napi_id share a common location, so get_xps_queue() can see garbage and perform an out of bound access. We need to make sure sender_cpu is cleared before doing the transmit, otherwise any NIC busy poll enabled (skb_mark_napi_id()) can trigger this bug. Signed-off-by: Eric Dumazet Reported-by: John Bisected-by: John Fixes: 2bd82484bb4c ("xps: fix xps for stacked devices") Signed-off-by: David S. Miller --- include/linux/skbuff.h | 7 +++++++ net/core/skbuff.c | 2 +- net/ipv4/ip_forward.c | 1 + net/ipv6/ip6_output.c | 1 + 4 files changed, 10 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 30007afe70b3..f54d6659713a 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -948,6 +948,13 @@ static inline void skb_copy_hash(struct sk_buff *to, const struct sk_buff *from) to->l4_hash = from->l4_hash; }; +static inline void skb_sender_cpu_clear(struct sk_buff *skb) +{ +#ifdef CONFIG_XPS + skb->sender_cpu = 0; +#endif +} + #ifdef NET_SKBUFF_DATA_USES_OFFSET static inline unsigned char *skb_end_pointer(const struct sk_buff *skb) { diff --git a/net/core/skbuff.c b/net/core/skbuff.c index f80507823531..434e78e5254d 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -4173,7 +4173,7 @@ void skb_scrub_packet(struct sk_buff *skb, bool xnet) skb->ignore_df = 0; skb_dst_drop(skb); skb->mark = 0; - skb->sender_cpu = 0; + skb_sender_cpu_clear(skb); skb_init_secmark(skb); secpath_reset(skb); nf_reset(skb); diff --git a/net/ipv4/ip_forward.c b/net/ipv4/ip_forward.c index 787b3c294ce6..d9bc28ac5d1b 100644 --- a/net/ipv4/ip_forward.c +++ b/net/ipv4/ip_forward.c @@ -67,6 +67,7 @@ static int ip_forward_finish(struct sk_buff *skb) if (unlikely(opt->optlen)) ip_forward_options(skb); + skb_sender_cpu_clear(skb); return dst_output(skb); } diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 0a04a37305d5..7e80b61b51ff 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -318,6 +318,7 @@ static int ip6_forward_proxy_check(struct sk_buff *skb) static inline int ip6_forward_finish(struct sk_buff *skb) { + skb_sender_cpu_clear(skb); return dst_output(skb); } -- cgit v1.2.3 From 3a8dd9711e0792f64394edafadd66c2d1f1904df Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Wed, 11 Mar 2015 15:43:55 -0400 Subject: sock: fix possible NULL sk dereference in __skb_tstamp_tx Test that sk != NULL before reading sk->sk_tsflags. Fixes: 49ca0d8bfaf3 ("net-timestamp: no-payload option") Reported-by: One Thousand Gnomes Signed-off-by: Willem de Bruijn Signed-off-by: David S. Miller --- net/core/skbuff.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 434e78e5254d..8e4ac97c8477 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -3733,9 +3733,13 @@ void __skb_tstamp_tx(struct sk_buff *orig_skb, struct sock *sk, int tstype) { struct sk_buff *skb; - bool tsonly = sk->sk_tsflags & SOF_TIMESTAMPING_OPT_TSONLY; + bool tsonly; - if (!sk || !skb_may_tx_timestamp(sk, tsonly)) + if (!sk) + return; + + tsonly = sk->sk_tsflags & SOF_TIMESTAMPING_OPT_TSONLY; + if (!skb_may_tx_timestamp(sk, tsonly)) return; if (tsonly) -- cgit v1.2.3 From f862e07cf95d5b62a5fc5e981dd7d0dbaf33a501 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 11 Mar 2015 22:46:59 +0100 Subject: rds: avoid potential stack overflow The rds_iw_update_cm_id function stores a large 'struct rds_sock' object on the stack in order to pass a pair of addresses. This happens to just fit withint the 1024 byte stack size warning limit on x86, but just exceed that limit on ARM, which gives us this warning: net/rds/iw_rdma.c:200:1: warning: the frame size of 1056 bytes is larger than 1024 bytes [-Wframe-larger-than=] As the use of this large variable is basically bogus, we can rearrange the code to not do that. Instead of passing an rds socket into rds_iw_get_device, we now just pass the two addresses that we have available in rds_iw_update_cm_id, and we change rds_iw_get_mr accordingly, to create two address structures on the stack there. Signed-off-by: Arnd Bergmann Acked-by: Sowmini Varadhan Signed-off-by: David S. Miller --- net/rds/iw_rdma.c | 40 ++++++++++++++++++++++------------------ 1 file changed, 22 insertions(+), 18 deletions(-) (limited to 'net') diff --git a/net/rds/iw_rdma.c b/net/rds/iw_rdma.c index a817705ce2d0..dba8d0864f18 100644 --- a/net/rds/iw_rdma.c +++ b/net/rds/iw_rdma.c @@ -88,7 +88,9 @@ static unsigned int rds_iw_unmap_fastreg_list(struct rds_iw_mr_pool *pool, int *unpinned); static void rds_iw_destroy_fastreg(struct rds_iw_mr_pool *pool, struct rds_iw_mr *ibmr); -static int rds_iw_get_device(struct rds_sock *rs, struct rds_iw_device **rds_iwdev, struct rdma_cm_id **cm_id) +static int rds_iw_get_device(struct sockaddr_in *src, struct sockaddr_in *dst, + struct rds_iw_device **rds_iwdev, + struct rdma_cm_id **cm_id) { struct rds_iw_device *iwdev; struct rds_iw_cm_id *i_cm_id; @@ -112,15 +114,15 @@ static int rds_iw_get_device(struct rds_sock *rs, struct rds_iw_device **rds_iwd src_addr->sin_port, dst_addr->sin_addr.s_addr, dst_addr->sin_port, - rs->rs_bound_addr, - rs->rs_bound_port, - rs->rs_conn_addr, - rs->rs_conn_port); + src->sin_addr.s_addr, + src->sin_port, + dst->sin_addr.s_addr, + dst->sin_port); #ifdef WORKING_TUPLE_DETECTION - if (src_addr->sin_addr.s_addr == rs->rs_bound_addr && - src_addr->sin_port == rs->rs_bound_port && - dst_addr->sin_addr.s_addr == rs->rs_conn_addr && - dst_addr->sin_port == rs->rs_conn_port) { + if (src_addr->sin_addr.s_addr == src->sin_addr.s_addr && + src_addr->sin_port == src->sin_port && + dst_addr->sin_addr.s_addr == dst->sin_addr.s_addr && + dst_addr->sin_port == dst->sin_port) { #else /* FIXME - needs to compare the local and remote * ipaddr/port tuple, but the ipaddr is the only @@ -128,7 +130,7 @@ static int rds_iw_get_device(struct rds_sock *rs, struct rds_iw_device **rds_iwd * zero'ed. It doesn't appear to be properly populated * during connection setup... */ - if (src_addr->sin_addr.s_addr == rs->rs_bound_addr) { + if (src_addr->sin_addr.s_addr == src->sin_addr.s_addr) { #endif spin_unlock_irq(&iwdev->spinlock); *rds_iwdev = iwdev; @@ -180,19 +182,13 @@ int rds_iw_update_cm_id(struct rds_iw_device *rds_iwdev, struct rdma_cm_id *cm_i { struct sockaddr_in *src_addr, *dst_addr; struct rds_iw_device *rds_iwdev_old; - struct rds_sock rs; struct rdma_cm_id *pcm_id; int rc; src_addr = (struct sockaddr_in *)&cm_id->route.addr.src_addr; dst_addr = (struct sockaddr_in *)&cm_id->route.addr.dst_addr; - rs.rs_bound_addr = src_addr->sin_addr.s_addr; - rs.rs_bound_port = src_addr->sin_port; - rs.rs_conn_addr = dst_addr->sin_addr.s_addr; - rs.rs_conn_port = dst_addr->sin_port; - - rc = rds_iw_get_device(&rs, &rds_iwdev_old, &pcm_id); + rc = rds_iw_get_device(src_addr, dst_addr, &rds_iwdev_old, &pcm_id); if (rc) rds_iw_remove_cm_id(rds_iwdev, cm_id); @@ -598,9 +594,17 @@ void *rds_iw_get_mr(struct scatterlist *sg, unsigned long nents, struct rds_iw_device *rds_iwdev; struct rds_iw_mr *ibmr = NULL; struct rdma_cm_id *cm_id; + struct sockaddr_in src = { + .sin_addr.s_addr = rs->rs_bound_addr, + .sin_port = rs->rs_bound_port, + }; + struct sockaddr_in dst = { + .sin_addr.s_addr = rs->rs_conn_addr, + .sin_port = rs->rs_conn_port, + }; int ret; - ret = rds_iw_get_device(rs, &rds_iwdev, &cm_id); + ret = rds_iw_get_device(&src, &dst, &rds_iwdev, &cm_id); if (ret || !cm_id) { ret = -ENODEV; goto out; -- cgit v1.2.3 From 8051a2a518fcf3827a143470083ad6008697ff17 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Thu, 12 Mar 2015 11:53:41 +1030 Subject: 9p/trans_virtio: fix hot-unplug On device hot-unplug, 9p/virtio currently will kfree channel while it might still be in use. Of course, it might stay used forever, so it's an extremely ugly hack, but it seems better than use-after-free that we have now. [ Unused variable removed, whitespace cleanup, msg single-lined --RR ] Signed-off-by: Michael S. Tsirkin Signed-off-by: Rusty Russell --- net/9p/trans_virtio.c | 24 ++++++++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/9p/trans_virtio.c b/net/9p/trans_virtio.c index d8e376a5f0f1..36a1a739ad68 100644 --- a/net/9p/trans_virtio.c +++ b/net/9p/trans_virtio.c @@ -658,14 +658,30 @@ p9_virtio_create(struct p9_client *client, const char *devname, char *args) static void p9_virtio_remove(struct virtio_device *vdev) { struct virtio_chan *chan = vdev->priv; - - if (chan->inuse) - p9_virtio_close(chan->client); - vdev->config->del_vqs(vdev); + unsigned long warning_time; mutex_lock(&virtio_9p_lock); + + /* Remove self from list so we don't get new users. */ list_del(&chan->chan_list); + warning_time = jiffies; + + /* Wait for existing users to close. */ + while (chan->inuse) { + mutex_unlock(&virtio_9p_lock); + msleep(250); + if (time_after(jiffies, warning_time + 10 * HZ)) { + dev_emerg(&vdev->dev, + "p9_virtio_remove: waiting for device in use.\n"); + warning_time = jiffies; + } + mutex_lock(&virtio_9p_lock); + } + mutex_unlock(&virtio_9p_lock); + + vdev->config->del_vqs(vdev); + sysfs_remove_file(&(vdev->dev.kobj), &dev_attr_mount_tag.attr); kobject_uevent(&(vdev->dev.kobj), KOBJ_CHANGE); kfree(chan->tag); -- cgit v1.2.3 From c8e2c80d7ec00d020320f905822bf49c5ad85250 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 13 Mar 2015 09:49:59 -0700 Subject: inet_diag: fix possible overflow in inet_diag_dump_one_icsk() inet_diag_dump_one_icsk() allocates too small skb. Add inet_sk_attr_size() helper right before inet_sk_diag_fill() so that it can be updated if/when new attributes are added. iproute2/ss currently does not use this dump_one() interface, this might explain nobody noticed this problem yet. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/inet_diag.c | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index 81751f12645f..592aff37366b 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -71,6 +71,20 @@ static inline void inet_diag_unlock_handler( mutex_unlock(&inet_diag_table_mutex); } +static size_t inet_sk_attr_size(void) +{ + return nla_total_size(sizeof(struct tcp_info)) + + nla_total_size(1) /* INET_DIAG_SHUTDOWN */ + + nla_total_size(1) /* INET_DIAG_TOS */ + + nla_total_size(1) /* INET_DIAG_TCLASS */ + + nla_total_size(sizeof(struct inet_diag_meminfo)) + + nla_total_size(sizeof(struct inet_diag_msg)) + + nla_total_size(SK_MEMINFO_VARS * sizeof(u32)) + + nla_total_size(TCP_CA_NAME_MAX) + + nla_total_size(sizeof(struct tcpvegas_info)) + + 64; +} + int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk, struct sk_buff *skb, struct inet_diag_req_v2 *req, struct user_namespace *user_ns, @@ -326,9 +340,7 @@ int inet_diag_dump_one_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *in_s if (err) goto out; - rep = nlmsg_new(sizeof(struct inet_diag_msg) + - sizeof(struct inet_diag_meminfo) + - sizeof(struct tcp_info) + 64, GFP_KERNEL); + rep = nlmsg_new(inet_sk_attr_size(), GFP_KERNEL); if (!rep) { err = -ENOMEM; goto out; -- cgit v1.2.3 From 4c906c279886550d2aaac6facf71d709158e4e3c Mon Sep 17 00:00:00 2001 From: Venkat Venkatsubra Date: Fri, 13 Mar 2015 07:08:22 -0700 Subject: bridge: reset bridge mtu after deleting an interface On adding an interface br_add_if() sets the MTU to the min of all the interfaces. Do the same thing on removing an interface too in br_del_if. Signed-off-by: Venkat Venkatsubra Acked-by: Roopa Prabhu Signed-off-by: David S. Miller --- net/bridge/br_if.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c index b087d278c679..1849d96b3c91 100644 --- a/net/bridge/br_if.c +++ b/net/bridge/br_if.c @@ -563,6 +563,8 @@ int br_del_if(struct net_bridge *br, struct net_device *dev) */ del_nbp(p); + dev_set_mtu(br->dev, br_min_mtu(br)); + spin_lock_bh(&br->lock); changed_addr = br_stp_recalculate_bridge_id(br); spin_unlock_bh(&br->lock); -- cgit v1.2.3 From 3eeff778e00c956875c70b145c52638c313dfb23 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 14 Mar 2015 05:22:21 +0000 Subject: caif: fix MSG_OOB test in caif_seqpkt_recvmsg() It should be checking flags, not msg->msg_flags. It's ->sendmsg() instances that need to look for that in ->msg_flags, ->recvmsg() ones (including the other ->recvmsg() instance in that file, as well as unix_dgram_recvmsg() this one claims to be imitating) check in flags. Braino had been introduced in commit dcda13 ("caif: Bugfix - use MSG_TRUNC in receive") back in 2010, so it goes quite a while back. Signed-off-by: Al Viro Signed-off-by: David S. Miller --- net/caif/caif_socket.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/caif/caif_socket.c b/net/caif/caif_socket.c index 769b185fefbd..a6e2da0bc718 100644 --- a/net/caif/caif_socket.c +++ b/net/caif/caif_socket.c @@ -281,7 +281,7 @@ static int caif_seqpkt_recvmsg(struct kiocb *iocb, struct socket *sock, int copylen; ret = -EOPNOTSUPP; - if (m->msg_flags&MSG_OOB) + if (flags & MSG_OOB) goto read_error; skb = skb_recv_datagram(sk, flags, 0 , &ret); -- cgit v1.2.3 From 7d985ed1dca5c90535d67ce92ef6ca520302340a Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 14 Mar 2015 05:34:56 +0000 Subject: rxrpc: bogus MSG_PEEK test in rxrpc_recvmsg() [I would really like an ACK on that one from dhowells; it appears to be quite straightforward, but...] MSG_PEEK isn't passed to ->recvmsg() via msg->msg_flags; as the matter of fact, neither the kernel users of rxrpc, nor the syscalls ever set that bit in there. It gets passed via flags; in fact, another such check in the same function is done correctly - as flags & MSG_PEEK. It had been that way (effectively disabled) for 8 years, though, so the patch needs beating up - that case had never been tested. If it is correct, it's -stable fodder. Signed-off-by: Al Viro Signed-off-by: David S. Miller --- net/rxrpc/ar-recvmsg.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/rxrpc/ar-recvmsg.c b/net/rxrpc/ar-recvmsg.c index 4575485ad1b4..19a560626dc4 100644 --- a/net/rxrpc/ar-recvmsg.c +++ b/net/rxrpc/ar-recvmsg.c @@ -87,7 +87,7 @@ int rxrpc_recvmsg(struct kiocb *iocb, struct socket *sock, if (!skb) { /* nothing remains on the queue */ if (copied && - (msg->msg_flags & MSG_PEEK || timeo == 0)) + (flags & MSG_PEEK || timeo == 0)) goto out; /* wait for a message to turn up */ -- cgit v1.2.3 From 0f611d28fc2e13cfec64e1c544c16a086886805a Mon Sep 17 00:00:00 2001 From: Andrei Otcheretianski Date: Thu, 12 Mar 2015 08:53:30 +0200 Subject: mac80211: count interfaces correctly for combination checks Since moving the interface combination checks to mac80211, it's broken because it now only considers interfaces with an assigned channel context, so for example any interface that isn't active can still be up, which is clearly an issue; also, in particular P2P-Device wdevs are an issue since they never have a chanctx. Fix this by counting running interfaces instead the ones with a channel context assigned. Cc: stable@vger.kernel.org [3.16+] Fixes: 73de86a38962b ("cfg80211/mac80211: move interface counting for combination check to mac80211") Signed-off-by: Andrei Otcheretianski Signed-off-by: Emmanuel Grumbach [rewrite commit message, dig out the commit it fixes] Signed-off-by: Johannes Berg --- net/mac80211/util.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/mac80211/util.c b/net/mac80211/util.c index 8428f4a95479..747bdcf72e92 100644 --- a/net/mac80211/util.c +++ b/net/mac80211/util.c @@ -3178,7 +3178,7 @@ int ieee80211_check_combinations(struct ieee80211_sub_if_data *sdata, wdev_iter = &sdata_iter->wdev; if (sdata_iter == sdata || - rcu_access_pointer(sdata_iter->vif.chanctx_conf) == NULL || + !ieee80211_sdata_running(sdata_iter) || local->hw.wiphy->software_iftypes & BIT(wdev_iter->iftype)) continue; -- cgit v1.2.3 From 70a3fd6c61c46c07c63cab935dca9a17d8de1709 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Thu, 12 Mar 2015 08:53:29 +0200 Subject: mac80211: ask for ECSA IE to be considered for beacon parse CRC When a beacon from the AP contains only the ECSA IE, and not a CSA IE as well, this ECSA IE is not considered for calculating the CRC and the beacon might be dropped as not being interesting. This is clearly wrong, it should be handled and the channel switch should be executed. Fix this by including the ECSA IE ID in the bitmap of interesting IEs. Reported-by: Gil Tribush Reviewed-by: Luciano Coelho Signed-off-by: Johannes Berg --- net/mac80211/mlme.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index 10ac6324c1d0..cde8cd3d6595 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -3204,7 +3204,8 @@ static const u64 care_about_ies = (1ULL << WLAN_EID_CHANNEL_SWITCH) | (1ULL << WLAN_EID_PWR_CONSTRAINT) | (1ULL << WLAN_EID_HT_CAPABILITY) | - (1ULL << WLAN_EID_HT_OPERATION); + (1ULL << WLAN_EID_HT_OPERATION) | + (1ULL << WLAN_EID_EXT_CHANSWITCH_ANN); static void ieee80211_rx_mgmt_beacon(struct ieee80211_sub_if_data *sdata, struct ieee80211_mgmt *mgmt, size_t len, -- cgit v1.2.3 From 496fcc294daab18799e190c0264863d653588d1f Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Thu, 12 Mar 2015 08:53:27 +0200 Subject: nl80211: ignore HT/VHT capabilities without QoS/WMM As HT/VHT depend heavily on QoS/WMM, it's not a good idea to let userspace add clients that have HT/VHT but not QoS/WMM. Since it does so in certain cases we've observed (client is using HT IEs but not QoS/WMM) just ignore the HT/VHT info at this point and don't pass it down to the drivers which might unconditionally use it. Cc: stable@vger.kernel.org Signed-off-by: Johannes Berg --- net/wireless/nl80211.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'net') diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index be2501538011..b6f84f6a2a09 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -4400,6 +4400,16 @@ static int nl80211_new_station(struct sk_buff *skb, struct genl_info *info) if (parse_station_flags(info, dev->ieee80211_ptr->iftype, ¶ms)) return -EINVAL; + /* HT/VHT requires QoS, but if we don't have that just ignore HT/VHT + * as userspace might just pass through the capabilities from the IEs + * directly, rather than enforcing this restriction and returning an + * error in this case. + */ + if (!(params.sta_flags_set & BIT(NL80211_STA_FLAG_WME))) { + params.ht_capa = NULL; + params.vht_capa = NULL; + } + /* When you run into this, adjust the code below for the new flag */ BUILD_BUG_ON(NL80211_STA_FLAG_MAX != 7); -- cgit v1.2.3 From f84eaa1068315409ffbef57e6fea312180787db3 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Thu, 12 Mar 2015 08:53:26 +0200 Subject: mac80211: ignore CSA to same channel If the AP is confused and starts doing a CSA to the same channel, just ignore that request instead of trying to act it out since it was likely sent in error anyway. In the case of the bug I was investigating the GO was misbehaving and sending out a beacon with CSA IEs still included after having actually done the channel switch. Signed-off-by: Johannes Berg --- net/mac80211/ieee80211_i.h | 1 + net/mac80211/mlme.c | 13 +++++++++++++ 2 files changed, 14 insertions(+) (limited to 'net') diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index c0e089c194f1..8d53d65bd2ab 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -464,6 +464,7 @@ struct ieee80211_if_managed { unsigned int flags; bool csa_waiting_bcn; + bool csa_ignored_same_chan; bool beacon_crc_valid; u32 beacon_crc; diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index cde8cd3d6595..142f66aece18 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -1150,6 +1150,17 @@ ieee80211_sta_process_chanswitch(struct ieee80211_sub_if_data *sdata, return; } + if (cfg80211_chandef_identical(&csa_ie.chandef, + &sdata->vif.bss_conf.chandef)) { + if (ifmgd->csa_ignored_same_chan) + return; + sdata_info(sdata, + "AP %pM tries to chanswitch to same channel, ignore\n", + ifmgd->associated->bssid); + ifmgd->csa_ignored_same_chan = true; + return; + } + mutex_lock(&local->mtx); mutex_lock(&local->chanctx_mtx); conf = rcu_dereference_protected(sdata->vif.chanctx_conf, @@ -1210,6 +1221,7 @@ ieee80211_sta_process_chanswitch(struct ieee80211_sub_if_data *sdata, sdata->vif.csa_active = true; sdata->csa_chandef = csa_ie.chandef; sdata->csa_block_tx = csa_ie.mode; + ifmgd->csa_ignored_same_chan = false; if (sdata->csa_block_tx) ieee80211_stop_vif_queues(local, sdata, @@ -2090,6 +2102,7 @@ static void ieee80211_set_disassoc(struct ieee80211_sub_if_data *sdata, sdata->vif.csa_active = false; ifmgd->csa_waiting_bcn = false; + ifmgd->csa_ignored_same_chan = false; if (sdata->csa_block_tx) { ieee80211_wake_vif_queues(local, sdata, IEEE80211_QUEUE_STOP_REASON_CSA); -- cgit v1.2.3 From 37355565ba57fd45f78f0934305be2761b641f8f Mon Sep 17 00:00:00 2001 From: Nicolas Dichtel Date: Mon, 16 Mar 2015 15:56:05 +0100 Subject: ip6_tunnel: fix error code when tunnel exists After commit 2b0bb01b6edb, the kernel returns -ENOBUFS when user tries to add an existing tunnel with ioctl API: $ ip -6 tunnel add ip6tnl1 mode ip6ip6 dev eth1 add tunnel "ip6tnl0" failed: No buffer space available It's confusing, the right error is EEXIST. This patch also change a bit the code returned: - ENOBUFS -> ENOMEM - ENOENT -> ENODEV Fixes: 2b0bb01b6edb ("ip6_tunnel: Return an error when adding an existing tunnel.") CC: Steffen Klassert Reported-by: Pierre Cheynier Signed-off-by: Nicolas Dichtel Signed-off-by: David S. Miller --- net/ipv6/ip6_tunnel.c | 33 +++++++++++++++++---------------- 1 file changed, 17 insertions(+), 16 deletions(-) (limited to 'net') diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index 266a264ec212..ddd94eca19b3 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -314,7 +314,7 @@ out: * Create tunnel matching given parameters. * * Return: - * created tunnel or NULL + * created tunnel or error pointer **/ static struct ip6_tnl *ip6_tnl_create(struct net *net, struct __ip6_tnl_parm *p) @@ -322,7 +322,7 @@ static struct ip6_tnl *ip6_tnl_create(struct net *net, struct __ip6_tnl_parm *p) struct net_device *dev; struct ip6_tnl *t; char name[IFNAMSIZ]; - int err; + int err = -ENOMEM; if (p->name[0]) strlcpy(name, p->name, IFNAMSIZ); @@ -348,7 +348,7 @@ static struct ip6_tnl *ip6_tnl_create(struct net *net, struct __ip6_tnl_parm *p) failed_free: ip6_dev_free(dev); failed: - return NULL; + return ERR_PTR(err); } /** @@ -362,7 +362,7 @@ failed: * tunnel device is created and registered for use. * * Return: - * matching tunnel or NULL + * matching tunnel or error pointer **/ static struct ip6_tnl *ip6_tnl_locate(struct net *net, @@ -380,13 +380,13 @@ static struct ip6_tnl *ip6_tnl_locate(struct net *net, if (ipv6_addr_equal(local, &t->parms.laddr) && ipv6_addr_equal(remote, &t->parms.raddr)) { if (create) - return NULL; + return ERR_PTR(-EEXIST); return t; } } if (!create) - return NULL; + return ERR_PTR(-ENODEV); return ip6_tnl_create(net, p); } @@ -1420,7 +1420,7 @@ ip6_tnl_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) } ip6_tnl_parm_from_user(&p1, &p); t = ip6_tnl_locate(net, &p1, 0); - if (t == NULL) + if (IS_ERR(t)) t = netdev_priv(dev); } else { memset(&p, 0, sizeof(p)); @@ -1445,7 +1445,7 @@ ip6_tnl_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) ip6_tnl_parm_from_user(&p1, &p); t = ip6_tnl_locate(net, &p1, cmd == SIOCADDTUNNEL); if (cmd == SIOCCHGTUNNEL) { - if (t != NULL) { + if (!IS_ERR(t)) { if (t->dev != dev) { err = -EEXIST; break; @@ -1457,14 +1457,15 @@ ip6_tnl_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) else err = ip6_tnl_update(t, &p1); } - if (t) { + if (!IS_ERR(t)) { err = 0; ip6_tnl_parm_to_user(&p, &t->parms); if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p))) err = -EFAULT; - } else - err = (cmd == SIOCADDTUNNEL ? -ENOBUFS : -ENOENT); + } else { + err = PTR_ERR(t); + } break; case SIOCDELTUNNEL: err = -EPERM; @@ -1478,7 +1479,7 @@ ip6_tnl_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) err = -ENOENT; ip6_tnl_parm_from_user(&p1, &p); t = ip6_tnl_locate(net, &p1, 0); - if (t == NULL) + if (IS_ERR(t)) break; err = -EPERM; if (t->dev == ip6n->fb_tnl_dev) @@ -1672,12 +1673,13 @@ static int ip6_tnl_newlink(struct net *src_net, struct net_device *dev, struct nlattr *tb[], struct nlattr *data[]) { struct net *net = dev_net(dev); - struct ip6_tnl *nt; + struct ip6_tnl *nt, *t; nt = netdev_priv(dev); ip6_tnl_netlink_parms(data, &nt->parms); - if (ip6_tnl_locate(net, &nt->parms, 0)) + t = ip6_tnl_locate(net, &nt->parms, 0); + if (!IS_ERR(t)) return -EEXIST; return ip6_tnl_create2(dev); @@ -1697,8 +1699,7 @@ static int ip6_tnl_changelink(struct net_device *dev, struct nlattr *tb[], ip6_tnl_netlink_parms(data, &p); t = ip6_tnl_locate(net, &p, 0); - - if (t) { + if (!IS_ERR(t)) { if (t->dev != dev) return -EEXIST; } else -- cgit v1.2.3 From cb7cf8a33ff73cf638481d1edf883d8968f934f8 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 16 Mar 2015 12:19:24 -0700 Subject: inet: Clean up inet_csk_wait_for_connect() vs. might_sleep() I got the following trace with current net-next kernel : [14723.885290] WARNING: CPU: 26 PID: 22658 at kernel/sched/core.c:7285 __might_sleep+0x89/0xa0() [14723.885325] do not call blocking ops when !TASK_RUNNING; state=1 set at [] prepare_to_wait_exclusive+0x34/0xa0 [14723.885355] CPU: 26 PID: 22658 Comm: netserver Not tainted 4.0.0-dbg-DEV #1379 [14723.885359] ffffffff81a223a8 ffff881fae9e7ca8 ffffffff81650b5d 0000000000000001 [14723.885364] ffff881fae9e7cf8 ffff881fae9e7ce8 ffffffff810a72e7 0000000000000000 [14723.885367] ffffffff81a57620 000000000000093a 0000000000000000 ffff881fae9e7e64 [14723.885371] Call Trace: [14723.885377] [] dump_stack+0x4c/0x65 [14723.885382] [] warn_slowpath_common+0x97/0xe0 [14723.885386] [] warn_slowpath_fmt+0x46/0x50 [14723.885390] [] ? trace_hardirqs_on_caller+0x10d/0x1d0 [14723.885393] [] ? prepare_to_wait_exclusive+0x34/0xa0 [14723.885396] [] ? prepare_to_wait_exclusive+0x34/0xa0 [14723.885399] [] __might_sleep+0x89/0xa0 [14723.885403] [] lock_sock_nested+0x36/0xb0 [14723.885406] [] ? release_sock+0x173/0x1c0 [14723.885411] [] inet_csk_accept+0x157/0x2a0 [14723.885415] [] ? abort_exclusive_wait+0xc0/0xc0 [14723.885419] [] inet_accept+0x2d/0x150 [14723.885424] [] SYSC_accept4+0xff/0x210 [14723.885428] [] ? retint_swapgs+0xe/0x44 [14723.885431] [] ? trace_hardirqs_on_caller+0x10d/0x1d0 [14723.885437] [] ? trace_hardirqs_on_thunk+0x3a/0x3f [14723.885441] [] SyS_accept+0x10/0x20 [14723.885444] [] system_call_fastpath+0x12/0x17 [14723.885447] ---[ end trace ff74cd83355b1873 ]--- In commit 26cabd31259ba43f68026ce3f62b78094124333f Peter added a sched_annotate_sleep() in sk_wait_event() Is the following patch needed as well ? Alternative would be to use sk_wait_event() from inet_csk_wait_for_connect() Signed-off-by: Eric Dumazet Acked-by: Peter Zijlstra (Intel) Signed-off-by: David S. Miller --- net/ipv4/inet_connection_sock.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 14d02ea905b6..3e44b9b0b78e 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -268,6 +268,7 @@ static int inet_csk_wait_for_connect(struct sock *sk, long timeo) release_sock(sk); if (reqsk_queue_empty(&icsk->icsk_accept_queue)) timeo = schedule_timeout(timeo); + sched_annotate_sleep(); lock_sock(sk); err = 0; if (!reqsk_queue_empty(&icsk->icsk_accept_queue)) -- cgit v1.2.3 From ced585c83b27deca427c606a34dd3eaa6b96d82b Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 17 Mar 2015 20:25:57 +0100 Subject: act_bpf: allow non-default TC_ACT opcodes as BPF exec outcome Revisiting commit d23b8ad8ab23 ("tc: add BPF based action") with regards to eBPF support, I was thinking that it might be better to improve return semantics from a BPF program invoked through BPF_PROG_RUN(). Currently, in case filter_res is 0, we overwrite the default action opcode with TC_ACT_SHOT. A default action opcode configured through tc's m_bpf can be: TC_ACT_RECLASSIFY, TC_ACT_PIPE, TC_ACT_SHOT, TC_ACT_UNSPEC, TC_ACT_OK. In cls_bpf, we have the possibility to overwrite the default class associated with the classifier in case filter_res is _not_ 0xffffffff (-1). That allows us to fold multiple [e]BPF programs into a single one, where they would otherwise need to be defined as a separate classifier with its own classid, needlessly redoing parsing work, etc. Similarly, we could do better in act_bpf: Since above TC_ACT* opcodes are exported to UAPI anyway, we reuse them for return-code-to-tc-opcode mapping, where we would allow above possibilities. Thus, like in cls_bpf, a filter_res of 0xffffffff (-1) means that the configured _default_ action is used. Any unkown return code from the BPF program would fail in tcf_bpf() with TC_ACT_UNSPEC. Should we one day want to make use of TC_ACT_STOLEN or TC_ACT_QUEUED, which both have the same semantics, we have the option to either use that as a default action (filter_res of 0xffffffff) or non-default BPF return code. All that will allow us to transparently use tcf_bpf() for both BPF flavours. Signed-off-by: Daniel Borkmann Cc: Jiri Pirko Cc: Alexei Starovoitov Cc: Jamal Hadi Salim Acked-by: Jiri Pirko Signed-off-by: David S. Miller --- net/sched/act_bpf.c | 36 ++++++++++++++++++++++++++++-------- 1 file changed, 28 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/sched/act_bpf.c b/net/sched/act_bpf.c index 82c5d7fc1988..5f6288fa3f12 100644 --- a/net/sched/act_bpf.c +++ b/net/sched/act_bpf.c @@ -25,21 +25,41 @@ static int tcf_bpf(struct sk_buff *skb, const struct tc_action *a, struct tcf_result *res) { struct tcf_bpf *b = a->priv; - int action; - int filter_res; + int action, filter_res; spin_lock(&b->tcf_lock); + b->tcf_tm.lastuse = jiffies; bstats_update(&b->tcf_bstats, skb); - action = b->tcf_action; filter_res = BPF_PROG_RUN(b->filter, skb); - if (filter_res == 0) { - /* Return code 0 from the BPF program - * is being interpreted as a drop here. - */ - action = TC_ACT_SHOT; + + /* A BPF program may overwrite the default action opcode. + * Similarly as in cls_bpf, if filter_res == -1 we use the + * default action specified from tc. + * + * In case a different well-known TC_ACT opcode has been + * returned, it will overwrite the default one. + * + * For everything else that is unkown, TC_ACT_UNSPEC is + * returned. + */ + switch (filter_res) { + case TC_ACT_PIPE: + case TC_ACT_RECLASSIFY: + case TC_ACT_OK: + action = filter_res; + break; + case TC_ACT_SHOT: + action = filter_res; b->tcf_qstats.drops++; + break; + case TC_ACT_UNSPEC: + action = b->tcf_action; + break; + default: + action = TC_ACT_UNSPEC; + break; } spin_unlock(&b->tcf_lock); -- cgit v1.2.3