From efcdbf24fd5daa88060869e51ed49f68b7ac8708 Mon Sep 17 00:00:00 2001 From: Arun Sharma Date: Mon, 30 Jan 2012 14:16:06 -0800 Subject: net: Disambiguate kernel message Some of our machines were reporting: TCP: too many of orphaned sockets even when the number of orphaned sockets was well below the limit. We print a different message depending on whether we're out of TCP memory or there are too many orphaned sockets. Also move the check out of line and cleanup the messages that were printed. Signed-off-by: Arun Sharma Suggested-by: Mohan Srinivasan Cc: netdev@vger.kernel.org Cc: linux-kernel@vger.kernel.org Cc: David Miller Cc: Glauber Costa Cc: Ingo Molnar Cc: Joe Perches Signed-off-by: David S. Miller --- net/ipv4/tcp.c | 19 +++++++++++++++---- net/ipv4/tcp_timer.c | 5 +---- 2 files changed, 16 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 06373b4a449a..a34f5cfdd44c 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1876,6 +1876,20 @@ void tcp_shutdown(struct sock *sk, int how) } EXPORT_SYMBOL(tcp_shutdown); +bool tcp_check_oom(struct sock *sk, int shift) +{ + bool too_many_orphans, out_of_socket_memory; + + too_many_orphans = tcp_too_many_orphans(sk, shift); + out_of_socket_memory = tcp_out_of_memory(sk); + + if (too_many_orphans && net_ratelimit()) + pr_info("TCP: too many orphaned sockets\n"); + if (out_of_socket_memory && net_ratelimit()) + pr_info("TCP: out of memory -- consider tuning tcp_mem\n"); + return too_many_orphans || out_of_socket_memory; +} + void tcp_close(struct sock *sk, long timeout) { struct sk_buff *skb; @@ -2015,10 +2029,7 @@ adjudge_to_death: } if (sk->sk_state != TCP_CLOSE) { sk_mem_reclaim(sk); - if (tcp_too_many_orphans(sk, 0)) { - if (net_ratelimit()) - printk(KERN_INFO "TCP: too many of orphaned " - "sockets\n"); + if (tcp_check_oom(sk, 0)) { tcp_set_state(sk, TCP_CLOSE); tcp_send_active_reset(sk, GFP_ATOMIC); NET_INC_STATS_BH(sock_net(sk), diff --git a/net/ipv4/tcp_timer.c 
b/net/ipv4/tcp_timer.c index a516d1e399df..cd2e0723266d 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -77,10 +77,7 @@ static int tcp_out_of_resources(struct sock *sk, int do_reset) if (sk->sk_err_soft) shift++; - if (tcp_too_many_orphans(sk, shift)) { - if (net_ratelimit()) - printk(KERN_INFO "Out of socket memory\n"); - + if (tcp_check_oom(sk, shift)) { /* Catch exceptional cases, when connection requires reset. * 1. Last segment was sent recently. */ if ((s32)(tcp_time_stamp - tp->lsndtime) <= TCP_TIMEWAIT_LEN || -- cgit v1.2.3 From 786f528119722f564a22ad953411374e06116333 Mon Sep 17 00:00:00 2001 From: Ben Hutchings Date: Wed, 1 Feb 2012 09:32:25 +0000 Subject: ethtool: Null-terminate filename passed to ethtool_ops::flash_device The parameters for ETHTOOL_FLASHDEV include a filename, which ought to be null-terminated. Currently the only driver that implements ethtool_ops::flash_device attempts to add a null terminator if necessary, but does it wrongly. Do it in the ethtool core instead. Signed-off-by: Ben Hutchings Signed-off-by: David S. 
Miller --- drivers/net/ethernet/emulex/benet/be_ethtool.c | 6 +----- net/core/ethtool.c | 2 ++ 2 files changed, 3 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/drivers/net/ethernet/emulex/benet/be_ethtool.c b/drivers/net/ethernet/emulex/benet/be_ethtool.c index 6db6b6ae5e9b..802e5ddef8a8 100644 --- a/drivers/net/ethernet/emulex/benet/be_ethtool.c +++ b/drivers/net/ethernet/emulex/benet/be_ethtool.c @@ -716,12 +716,8 @@ static int be_do_flash(struct net_device *netdev, struct ethtool_flash *efl) { struct be_adapter *adapter = netdev_priv(netdev); - char file_name[ETHTOOL_FLASH_MAX_FILENAME]; - file_name[ETHTOOL_FLASH_MAX_FILENAME - 1] = 0; - strcpy(file_name, efl->data); - - return be_load_fw(adapter, file_name); + return be_load_fw(adapter, efl->data); } static int diff --git a/net/core/ethtool.c b/net/core/ethtool.c index 369b41894527..3f79db1b612a 100644 --- a/net/core/ethtool.c +++ b/net/core/ethtool.c @@ -1190,6 +1190,8 @@ static noinline_for_stack int ethtool_flash_device(struct net_device *dev, if (!dev->ethtool_ops->flash_device) return -EOPNOTSUPP; + efl.data[ETHTOOL_FLASH_MAX_FILENAME - 1] = 0; + return dev->ethtool_ops->flash_device(dev, &efl); } -- cgit v1.2.3 From 07ae2dfcf4f7143ce191c6436da1c33f179af0d6 Mon Sep 17 00:00:00 2001 From: Eliad Peller Date: Wed, 1 Feb 2012 18:48:09 +0200 Subject: mac80211: timeout a single frame in the rx reorder buffer The current code checks for stored_mpdu_num > 1, causing the reorder_timer to be triggered indefinitely, but the frame is never timed-out (until the next packet is received) Signed-off-by: Eliad Peller Cc: Acked-by: Johannes Berg Signed-off-by: John W. 
Linville --- net/mac80211/rx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index 751409120769..5a5e504a8ffb 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -611,7 +611,7 @@ static void ieee80211_sta_reorder_release(struct ieee80211_hw *hw, index = seq_sub(tid_agg_rx->head_seq_num, tid_agg_rx->ssn) % tid_agg_rx->buf_size; if (!tid_agg_rx->reorder_buf[index] && - tid_agg_rx->stored_mpdu_num > 1) { + tid_agg_rx->stored_mpdu_num) { /* * No buffers ready to be released, but check whether any * frames in the reorder buffer have timed out. -- cgit v1.2.3 From c43b874d5d714f271b80d4c3f49e05d0cbf51ed2 Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Thu, 2 Feb 2012 00:07:00 +0000 Subject: tcp: properly initialize tcp memory limits Commit 4acb4190 tries to fix the using uninitialized value introduced by commit 3dc43e3, but it would make the per-socket memory limits too small. This patch fixes this and also remove the redundant codes introduced in 4acb4190. Signed-off-by: Jason Wang Acked-by: Glauber Costa Signed-off-by: David S. 
Miller --- net/ipv4/sysctl_net_ipv4.c | 6 ------ net/ipv4/tcp.c | 4 ++-- 2 files changed, 2 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 4cb9cd2f2c39..7a7724da9bff 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -778,7 +778,6 @@ EXPORT_SYMBOL_GPL(net_ipv4_ctl_path); static __net_init int ipv4_sysctl_init_net(struct net *net) { struct ctl_table *table; - unsigned long limit; table = ipv4_net_table; if (!net_eq(net, &init_net)) { @@ -815,11 +814,6 @@ static __net_init int ipv4_sysctl_init_net(struct net *net) net->ipv4.sysctl_rt_cache_rebuild_count = 4; tcp_init_mem(net); - limit = nr_free_buffer_pages() / 8; - limit = max(limit, 128UL); - net->ipv4.sysctl_tcp_mem[0] = limit / 4 * 3; - net->ipv4.sysctl_tcp_mem[1] = limit; - net->ipv4.sysctl_tcp_mem[2] = net->ipv4.sysctl_tcp_mem[0] * 2; net->ipv4.ipv4_hdr = register_net_sysctl_table(net, net_ipv4_ctl_path, table); diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index a34f5cfdd44c..37755ccc0e96 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -3229,7 +3229,6 @@ __setup("thash_entries=", set_thash_entries); void tcp_init_mem(struct net *net) { - /* Set per-socket limits to no more than 1/128 the pressure threshold */ unsigned long limit = nr_free_buffer_pages() / 8; limit = max(limit, 128UL); net->ipv4.sysctl_tcp_mem[0] = limit / 4 * 3; @@ -3298,7 +3297,8 @@ void __init tcp_init(void) sysctl_max_syn_backlog = max(128, cnt / 256); tcp_init_mem(&init_net); - limit = nr_free_buffer_pages() / 8; + /* Set per-socket limits to no more than 1/128 the pressure threshold */ + limit = nr_free_buffer_pages() << (PAGE_SHIFT - 10); limit = max(limit, 128UL); max_share = min(4UL*1024*1024, limit); -- cgit v1.2.3 From b01377a4200d0dfc7b04a8daabb4739727353703 Mon Sep 17 00:00:00 2001 From: "sjur.brandeland@stericsson.com" Date: Thu, 2 Feb 2012 01:21:02 +0000 Subject: caif: Bugfix list_del_rcu race in cfmuxl_ctrlcmd. 
MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Always use cfmuxl_remove_uplayer when removing an up-layer. cfmuxl_ctrlcmd() can be called independently and in parallel with cfmuxl_remove_uplayer(). The race between them could cause list_del_rcu to be called on a node which has been already taken out from the list. That led to a (rare) crash on accessing poisoned node->prev inside list_del_rcu. This fix ensures that deletions are done holding the same lock. Reported-by: Dmitry Tarnyagin Signed-off-by: Sjur Brændeland Signed-off-by: David S. Miller --- net/caif/cfmuxl.c | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/caif/cfmuxl.c b/net/caif/cfmuxl.c index b36f24a4c8e7..94b08612a4d8 100644 --- a/net/caif/cfmuxl.c +++ b/net/caif/cfmuxl.c @@ -248,7 +248,6 @@ static void cfmuxl_ctrlcmd(struct cflayer *layr, enum caif_ctrlcmd ctrl, { struct cfmuxl *muxl = container_obj(layr); struct cflayer *layer; - int idx; rcu_read_lock(); list_for_each_entry_rcu(layer, &muxl->srvl_list, node) { @@ -257,14 +256,9 @@ static void cfmuxl_ctrlcmd(struct cflayer *layr, enum caif_ctrlcmd ctrl, if ((ctrl == _CAIF_CTRLCMD_PHYIF_DOWN_IND || ctrl == CAIF_CTRLCMD_REMOTE_SHUTDOWN_IND) && - layer->id != 0) { - - idx = layer->id % UP_CACHE_SIZE; - spin_lock_bh(&muxl->receive_lock); - RCU_INIT_POINTER(muxl->up_cache[idx], NULL); - list_del_rcu(&layer->node); - spin_unlock_bh(&muxl->receive_lock); - } + layer->id != 0) + cfmuxl_remove_uplayer(layr, layer->id); + /* NOTE: ctrlcmd is not allowed to block */ layer->ctrlcmd(layer, ctrl, phyid); } -- cgit v1.2.3 From ba7605745d5c99f0e71b3ec6c7cb5ed6afe540ad Mon Sep 17 00:00:00 2001 From: Dmitry Tarnyagin Date: Thu, 2 Feb 2012 01:21:03 +0000 Subject: caif: Bugfix double kfree_skb upon xmit failure MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit SKB is freed twice upon send error. 
The Network stack consumes SKB even when it returns an error code. Signed-off-by: Sjur Brændeland Signed-off-by: David S. Miller --- net/caif/caif_socket.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/caif/caif_socket.c b/net/caif/caif_socket.c index a98628086452..a97d97a3a512 100644 --- a/net/caif/caif_socket.c +++ b/net/caif/caif_socket.c @@ -539,8 +539,10 @@ static int transmit_skb(struct sk_buff *skb, struct caifsock *cf_sk, pkt = cfpkt_fromnative(CAIF_DIR_OUT, skb); memset(skb->cb, 0, sizeof(struct caif_payload_info)); - if (cf_sk->layer.dn == NULL) + if (cf_sk->layer.dn == NULL) { + kfree_skb(skb); return -EINVAL; + } return cf_sk->layer.dn->transmit(cf_sk->layer.dn, pkt); } @@ -683,10 +685,10 @@ static int caif_stream_sendmsg(struct kiocb *kiocb, struct socket *sock, } err = transmit_skb(skb, cf_sk, msg->msg_flags&MSG_DONTWAIT, timeo); - if (err < 0) { - kfree_skb(skb); + if (err < 0) + /* skb is already freed */ goto pipe_err; - } + sent += size; } -- cgit v1.2.3 From 5962b35c1de3254a2f03b95efd3b7854b874d7b7 Mon Sep 17 00:00:00 2001 From: Neil Horman Date: Fri, 3 Feb 2012 05:18:43 +0000 Subject: netprio_cgroup: Fix obo in get_prioidx It was recently pointed out to me that the get_prioidx function sets a bit in the prioidx map prior to checking to see if the index being set is out of bounds. This patch corrects that, avoiding the possibility of us writing beyond the end of the array. Signed-off-by: Neil Horman Reported-by: Stanislaw Gruszka CC: Stanislaw Gruszka CC: "David S. Miller" Signed-off-by: David S. 
Miller --- net/core/netprio_cgroup.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/core/netprio_cgroup.c b/net/core/netprio_cgroup.c index 3a9fd4826b75..9ae183a9a381 100644 --- a/net/core/netprio_cgroup.c +++ b/net/core/netprio_cgroup.c @@ -58,11 +58,12 @@ static int get_prioidx(u32 *prio) spin_lock_irqsave(&prioidx_map_lock, flags); prioidx = find_first_zero_bit(prioidx_map, sizeof(unsigned long) * PRIOIDX_SZ); + if (prioidx == sizeof(unsigned long) * PRIOIDX_SZ) { + spin_unlock_irqrestore(&prioidx_map_lock, flags); + return -ENOSPC; + } set_bit(prioidx, prioidx_map); spin_unlock_irqrestore(&prioidx_map_lock, flags); - if (prioidx == sizeof(unsigned long) * PRIOIDX_SZ) - return -ENOSPC; - atomic_set(&max_prioidx, prioidx); *prio = prioidx; return 0; -- cgit v1.2.3 From e2446eaab5585555a38ea0df4e01ff313dbb4ac9 Mon Sep 17 00:00:00 2001 From: Shawn Lu Date: Sat, 4 Feb 2012 12:38:09 +0000 Subject: tcp_v4_send_reset: binding oif to iif in no sock case Binding RST packet outgoing interface to incoming interface for tcp v4 when there is no socket associate with it. when sk is not NULL, using sk->sk_bound_dev_if instead. (suggested by Eric Dumazet). This has few benefits: 1. tcp_v6_send_reset already did that. 2. This helps tcp connect with SO_BINDTODEVICE set. When connection is lost, we still able to sending out RST using same interface. 3. we are sending reply, it is most likely to be succeed if iif is used Signed-off-by: Shawn Lu Acked-by: Eric Dumazet Signed-off-by: David S. 
Miller --- net/ipv4/tcp_ipv4.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'net') diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 337ba4cca052..94d683a61cba 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -651,6 +651,11 @@ static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb) arg.iov[0].iov_len, IPPROTO_TCP, 0); arg.csumoffset = offsetof(struct tcphdr, check) / 2; arg.flags = (sk && inet_sk(sk)->transparent) ? IP_REPLY_ARG_NOSRCCHECK : 0; + /* When socket is gone, all binding information is lost. + * routing might fail in this case. using iif for oif to + * make sure we can deliver it + */ + arg.bound_dev_if = sk ? sk->sk_bound_dev_if : inet_iif(skb); net = dev_net(skb_dst(skb)->dev); arg.tos = ip_hdr(skb)->tos; -- cgit v1.2.3 From 6d25886ee2fbc05a7bf4dae5f5ae345cb73df2fd Mon Sep 17 00:00:00 2001 From: Anisse Astier Date: Tue, 7 Feb 2012 07:39:11 +0000 Subject: net: Fix build regression when INET_UDP_DIAG=y and IPV6=m Tested-by: Anisse Astier Signed-off-by: David S. Miller --- net/ipv4/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig index aa2a2c79776f..d183262943d9 100644 --- a/net/ipv4/Kconfig +++ b/net/ipv4/Kconfig @@ -409,7 +409,7 @@ config INET_TCP_DIAG config INET_UDP_DIAG tristate "UDP: socket monitoring interface" - depends on INET_DIAG + depends on INET_DIAG && (IPV6 || IPV6=n) default n ---help--- Support for UDP socket monitoring interface used by the ss tool. -- cgit v1.2.3 From 5ca3b72c5da47d95b83857b768def6172fbc080a Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 8 Feb 2012 08:51:50 +0000 Subject: gro: more generic L2 header check Shlomo Pongratz reported GRO L2 header check was suited for Ethernet only, and failed on IB/ipoib traffic. He provided a patch faking a zeroed header to let GRO aggregates frames. 
Roland Dreier, Herbert Xu, and others suggested we change GRO L2 header check to be more generic, ie not assuming L2 header is 14 bytes, but taking into account hard_header_len. __napi_gro_receive() has special handling for the common case (Ethernet) to avoid a memcmp() call and use an inline optimized function instead. Signed-off-by: Eric Dumazet Reported-by: Shlomo Pongratz Cc: Roland Dreier Cc: Or Gerlitz Cc: Herbert Xu Tested-by: Sean Hefty Signed-off-by: David S. Miller --- net/core/dev.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/core/dev.c b/net/core/dev.c index 115dee1d985d..6ca32f6b3105 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3500,14 +3500,20 @@ static inline gro_result_t __napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb) { struct sk_buff *p; + unsigned int maclen = skb->dev->hard_header_len; for (p = napi->gro_list; p; p = p->next) { unsigned long diffs; diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev; diffs |= p->vlan_tci ^ skb->vlan_tci; - diffs |= compare_ether_header(skb_mac_header(p), - skb_gro_mac_header(skb)); + if (maclen == ETH_HLEN) + diffs |= compare_ether_header(skb_mac_header(p), + skb_gro_mac_header(skb)); + else if (!diffs) + diffs = memcmp(skb_mac_header(p), + skb_gro_mac_header(skb), + maclen); NAPI_GRO_CB(p)->same_flow = !diffs; NAPI_GRO_CB(p)->flush = 0; } -- cgit v1.2.3 From 16bda13d90c8d5da243e2cfa1677e62ecce26860 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Mon, 6 Feb 2012 15:14:37 -0500 Subject: net: Make qdisc_skb_cb upper size bound explicit. Just like skb->cb[], so that qdisc_skb_cb can be encapsulated inside of other data structures. This is intended to be used by IPoIB so that it can remember addressing information stored at hard_header_ops->create() time that it can fetch when the packet gets to the transmit routine. Signed-off-by: David S. 
Miller --- include/net/sch_generic.h | 9 ++++++++- net/sched/sch_choke.c | 3 +-- net/sched/sch_netem.c | 3 +-- net/sched/sch_sfb.c | 3 +-- net/sched/sch_sfq.c | 5 ++--- 5 files changed, 13 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index f6bb08b73ca4..55ce96b53b09 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -220,9 +220,16 @@ struct tcf_proto { struct qdisc_skb_cb { unsigned int pkt_len; - long data[]; + unsigned char data[24]; }; +static inline void qdisc_cb_private_validate(const struct sk_buff *skb, int sz) +{ + struct qdisc_skb_cb *qcb; + BUILD_BUG_ON(sizeof(skb->cb) < sizeof(unsigned int) + sz); + BUILD_BUG_ON(sizeof(qcb->data) < sz); +} + static inline int qdisc_qlen(const struct Qdisc *q) { return q->q.qlen; diff --git a/net/sched/sch_choke.c b/net/sched/sch_choke.c index e465064d39a3..7e267d7b9c75 100644 --- a/net/sched/sch_choke.c +++ b/net/sched/sch_choke.c @@ -148,8 +148,7 @@ struct choke_skb_cb { static inline struct choke_skb_cb *choke_skb_cb(const struct sk_buff *skb) { - BUILD_BUG_ON(sizeof(skb->cb) < - sizeof(struct qdisc_skb_cb) + sizeof(struct choke_skb_cb)); + qdisc_cb_private_validate(skb, sizeof(struct choke_skb_cb)); return (struct choke_skb_cb *)qdisc_skb_cb(skb)->data; } diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c index 2776012132ea..e83d61ca78ca 100644 --- a/net/sched/sch_netem.c +++ b/net/sched/sch_netem.c @@ -130,8 +130,7 @@ struct netem_skb_cb { static inline struct netem_skb_cb *netem_skb_cb(struct sk_buff *skb) { - BUILD_BUG_ON(sizeof(skb->cb) < - sizeof(struct qdisc_skb_cb) + sizeof(struct netem_skb_cb)); + qdisc_cb_private_validate(skb, sizeof(struct netem_skb_cb)); return (struct netem_skb_cb *)qdisc_skb_cb(skb)->data; } diff --git a/net/sched/sch_sfb.c b/net/sched/sch_sfb.c index 96e42cae4c7a..d7eea99333e9 100644 --- a/net/sched/sch_sfb.c +++ b/net/sched/sch_sfb.c @@ -94,8 +94,7 @@ struct sfb_skb_cb { static inline 
struct sfb_skb_cb *sfb_skb_cb(const struct sk_buff *skb) { - BUILD_BUG_ON(sizeof(skb->cb) < - sizeof(struct qdisc_skb_cb) + sizeof(struct sfb_skb_cb)); + qdisc_cb_private_validate(skb, sizeof(struct sfb_skb_cb)); return (struct sfb_skb_cb *)qdisc_skb_cb(skb)->data; } diff --git a/net/sched/sch_sfq.c b/net/sched/sch_sfq.c index 67494aef9acf..60d47180f043 100644 --- a/net/sched/sch_sfq.c +++ b/net/sched/sch_sfq.c @@ -166,9 +166,8 @@ struct sfq_skb_cb { static inline struct sfq_skb_cb *sfq_skb_cb(const struct sk_buff *skb) { - BUILD_BUG_ON(sizeof(skb->cb) < - sizeof(struct qdisc_skb_cb) + sizeof(struct sfq_skb_cb)); - return (struct sfq_skb_cb *)qdisc_skb_cb(skb)->data; + qdisc_cb_private_validate(skb, sizeof(struct sfq_skb_cb)); + return (struct sfq_skb_cb *)qdisc_skb_cb(skb)->data; } static unsigned int sfq_hash(const struct sfq_sched_data *q, -- cgit v1.2.3 From a87dfe14a78501c931a4d5481efff6a809aa907d Mon Sep 17 00:00:00 2001 From: Neil Horman Date: Fri, 10 Feb 2012 05:43:36 +0000 Subject: netprio_cgroup: fix an off-by-one bug # mount -t cgroup xxx /mnt # mkdir /mnt/tmp # cat /mnt/tmp/net_prio.ifpriomap lo 0 eth0 0 virbr0 0 # echo 'lo 999' > /mnt/tmp/net_prio.ifpriomap # cat /mnt/tmp/net_prio.ifpriomap lo 999 eth0 0 virbr0 4101267344 We got weird output, because we exceeded the boundary of the array. We may even crash the kernel.. Origionally-authored-by: Li Zefan Signed-off-by: Li Zefan Signed-off-by: Neil Horman CC: "David S. Miller" Signed-off-by: David S. 
Miller --- net/core/netprio_cgroup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/core/netprio_cgroup.c b/net/core/netprio_cgroup.c index 9ae183a9a381..72c638780805 100644 --- a/net/core/netprio_cgroup.c +++ b/net/core/netprio_cgroup.c @@ -108,7 +108,7 @@ static void extend_netdev_table(struct net_device *dev, u32 new_len) static void update_netdev_tables(void) { struct net_device *dev; - u32 max_len = atomic_read(&max_prioidx); + u32 max_len = atomic_read(&max_prioidx) + 1; struct netprio_map *map; rtnl_lock(); -- cgit v1.2.3 From f5c38208d32412d72b97a4f0d44af0eb39feb20b Mon Sep 17 00:00:00 2001 From: Neil Horman Date: Fri, 10 Feb 2012 05:43:37 +0000 Subject: netprio_cgroup: don't allocate prio table when a device is registered So we delay the allocation till the priority is set through cgroup, and this makes skb_update_priority() faster when it's not set. This also eliminates an off-by-one bug similar with the one fixed in the previous patch. Origionally-authored-by: Li Zefan Signed-off-by: Li Zefan Signed-off-by: Neil Horman CC: "David S. Miller" Signed-off-by: David S. 
Miller --- net/core/netprio_cgroup.c | 6 ------ 1 file changed, 6 deletions(-) (limited to 'net') diff --git a/net/core/netprio_cgroup.c b/net/core/netprio_cgroup.c index 72c638780805..4dacc44637ef 100644 --- a/net/core/netprio_cgroup.c +++ b/net/core/netprio_cgroup.c @@ -271,7 +271,6 @@ static int netprio_device_event(struct notifier_block *unused, { struct net_device *dev = ptr; struct netprio_map *old; - u32 max_len = atomic_read(&max_prioidx); /* * Note this is called with rtnl_lock held so we have update side @@ -279,11 +278,6 @@ static int netprio_device_event(struct notifier_block *unused, */ switch (event) { - - case NETDEV_REGISTER: - if (max_len) - extend_netdev_table(dev, max_len); - break; case NETDEV_UNREGISTER: old = rtnl_dereference(dev->priomap); RCU_INIT_POINTER(dev->priomap, NULL); -- cgit v1.2.3 From 2b73bc65e2771372c818db7955709c8caedbf8b9 Mon Sep 17 00:00:00 2001 From: Neil Horman Date: Fri, 10 Feb 2012 05:43:38 +0000 Subject: netprio_cgroup: fix wrong memory access when NETPRIO_CGROUP=m When the netprio_cgroup module is not loaded, net_prio_subsys_id is -1, and so sock_update_prioidx() accesses cgroup_subsys array with negative index subsys[-1]. Make the code resembles cls_cgroup code, which is bug free. Origionally-authored-by: Li Zefan Signed-off-by: Li Zefan Signed-off-by: Neil Horman CC: "David S. Miller" Signed-off-by: David S. 
Miller --- include/net/netprio_cgroup.h | 48 ++++++++++++++++++++++++++++++++++++-------- net/core/sock.c | 7 ++----- 2 files changed, 42 insertions(+), 13 deletions(-) (limited to 'net') diff --git a/include/net/netprio_cgroup.h b/include/net/netprio_cgroup.h index 7b2d43139c8e..d58fdec47597 100644 --- a/include/net/netprio_cgroup.h +++ b/include/net/netprio_cgroup.h @@ -37,19 +37,51 @@ extern int net_prio_subsys_id; extern void sock_update_netprioidx(struct sock *sk); -static inline struct cgroup_netprio_state - *task_netprio_state(struct task_struct *p) +#if IS_BUILTIN(CONFIG_NETPRIO_CGROUP) + +static inline u32 task_netprioidx(struct task_struct *p) { -#if IS_ENABLED(CONFIG_NETPRIO_CGROUP) - return container_of(task_subsys_state(p, net_prio_subsys_id), - struct cgroup_netprio_state, css); -#else - return NULL; -#endif + struct cgroup_netprio_state *state; + u32 idx; + + rcu_read_lock(); + state = container_of(task_subsys_state(p, net_prio_subsys_id), + struct cgroup_netprio_state, css); + idx = state->prioidx; + rcu_read_unlock(); + return idx; +} + +#elif IS_MODULE(CONFIG_NETPRIO_CGROUP) + +static inline u32 task_netprioidx(struct task_struct *p) +{ + struct cgroup_netprio_state *state; + int subsys_id; + u32 idx = 0; + + rcu_read_lock(); + subsys_id = rcu_dereference_index_check(net_prio_subsys_id, + rcu_read_lock_held()); + if (subsys_id >= 0) { + state = container_of(task_subsys_state(p, subsys_id), + struct cgroup_netprio_state, css); + idx = state->prioidx; + } + rcu_read_unlock(); + return idx; } #else +static inline u32 task_netprioidx(struct task_struct *p) +{ + return 0; +} + +#endif /* CONFIG_NETPRIO_CGROUP */ + +#else #define sock_update_netprioidx(sk) #endif diff --git a/net/core/sock.c b/net/core/sock.c index 3e81fd2e3c75..02f8dfe320b7 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1171,13 +1171,10 @@ EXPORT_SYMBOL(sock_update_classid); void sock_update_netprioidx(struct sock *sk) { - struct cgroup_netprio_state *state; if (in_interrupt()) 
return; - rcu_read_lock(); - state = task_netprio_state(current); - sk->sk_cgrp_prioidx = state ? state->prioidx : 0; - rcu_read_unlock(); + + sk->sk_cgrp_prioidx = task_netprioidx(current); } EXPORT_SYMBOL_GPL(sock_update_netprioidx); #endif -- cgit v1.2.3 From 5dc7883f2a7c25f8df40d7479687153558cd531b Mon Sep 17 00:00:00 2001 From: Li Wei Date: Thu, 9 Feb 2012 21:15:25 +0000 Subject: ipv4: Fix wrong order of ip_rt_get_source() and update iph->daddr. This patch fix a bug which introduced by commit ac8a4810 (ipv4: Save nexthop address of LSRR/SSRR option to IPCB.).In that patch, we saved the nexthop of SRR in ip_option->nexthop and update iph->daddr until we get to ip_forward_options(), but we need to update it before ip_rt_get_source(), otherwise we may get a wrong src. Signed-off-by: Li Wei Signed-off-by: David S. Miller --- net/ipv4/ip_options.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/ip_options.c b/net/ipv4/ip_options.c index 1e60f7679075..42dd1a90edea 100644 --- a/net/ipv4/ip_options.c +++ b/net/ipv4/ip_options.c @@ -573,8 +573,8 @@ void ip_forward_options(struct sk_buff *skb) } if (srrptr + 3 <= srrspace) { opt->is_changed = 1; - ip_rt_get_source(&optptr[srrptr-1], skb, rt); ip_hdr(skb)->daddr = opt->nexthop; + ip_rt_get_source(&optptr[srrptr-1], skb, rt); optptr[2] = srrptr+4; } else if (net_ratelimit()) printk(KERN_CRIT "ip_forward(): Argh! Destination lost!\n"); -- cgit v1.2.3 From 70620c46ac2b45c24b0f22002fdf5ddd1f7daf81 Mon Sep 17 00:00:00 2001 From: Thomas Graf Date: Fri, 10 Feb 2012 04:07:11 +0000 Subject: net: Don't proxy arp respond if iif == rt->dst.dev if private VLAN is disabled Commit 653241 (net: RFC3069, private VLAN proxy arp support) changed the behavior of arp proxy to send arp replies back out on the interface the request came in even if the private VLAN feature is disabled. 
Previously we checked rt->dst.dev != skb->dev in both scenarios: when proxy arp is enabled for the netdevice, and also when individual proxy neighbour entries have been added. This patch adds the check back for the pneigh_lookup() scenario. Signed-off-by: Thomas Graf Acked-by: Jesper Dangaard Brouer Signed-off-by: David S. Miller --- net/ipv4/arp.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c index 59402be133f0..63e49890ad31 100644 --- a/net/ipv4/arp.c +++ b/net/ipv4/arp.c @@ -863,7 +863,8 @@ static int arp_process(struct sk_buff *skb) if (addr_type == RTN_UNICAST && (arp_fwd_proxy(in_dev, dev, rt) || arp_fwd_pvlan(in_dev, dev, rt, sip, tip) || - pneigh_lookup(&arp_tbl, net, &tip, dev, 0))) { + (rt->dst.dev != dev && + pneigh_lookup(&arp_tbl, net, &tip, dev, 0)))) { n = neigh_event_ns(&arp_tbl, sha, &sip, dev); if (n) neigh_release(n); -- cgit v1.2.3