summaryrefslogtreecommitdiffstats
path: root/net/core
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2014-08-06 18:38:14 +0200
committerLinus Torvalds <torvalds@linux-foundation.org>2014-08-06 18:38:14 +0200
commitae045e2455429c418a418a3376301a9e5753a0a8 (patch)
treeb445bdeecd3f38aa0d0a29c9585cee49e4ccb0f1 /net/core
parentMerge tag 'random_for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git... (diff)
parentMerge git://git.kernel.org/pub/scm/linux/kernel/git/davem/net (diff)
downloadlinux-ae045e2455429c418a418a3376301a9e5753a0a8.tar.xz
linux-ae045e2455429c418a418a3376301a9e5753a0a8.zip
Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next
Pull networking updates from David Miller: "Highlights: 1) Steady transitioning of the BPF infrastructure to a generic spot so all kernel subsystems can make use of it, from Alexei Starovoitov. 2) SFC driver supports busy polling, from Alexandre Rames. 3) Take advantage of hash table in UDP multicast delivery, from David Held. 4) Lighten locking, in particular by getting rid of the LRU lists, in inet frag handling. From Florian Westphal. 5) Add support for various RFC6458 control messages in SCTP, from Geir Ola Vaagland. 6) Allow to filter bridge forwarding database dumps by device, from Jamal Hadi Salim. 7) virtio-net also now supports busy polling, from Jason Wang. 8) Some low level optimization tweaks in pktgen from Jesper Dangaard Brouer. 9) Add support for ipv6 address generation modes, so that userland can have some input into the process. From Jiri Pirko. 10) Consolidate common TCP connection request code in ipv4 and ipv6, from Octavian Purdila. 11) New ARP packet logger in netfilter, from Pablo Neira Ayuso. 12) Generic resizable RCU hash table, with initial users in netlink and nftables. From Thomas Graf. 13) Maintain a name assignment type so that userspace can see where a network device name came from (enumerated by kernel, assigned explicitly by userspace, etc.) From Tom Gundersen. 14) Automatic flow label generation on transmit in ipv6, from Tom Herbert. 15) New packet timestamping facilities from Willem de Bruijn, meant to assist in measuring latencies going into/out-of the packet scheduler, latency from TCP data transmission to ACK, etc" * git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next: (1536 commits) cxgb4 : Disable recursive mailbox commands when enabling vi net: reduce USB network driver config options. 
tg3: Modify tg3_tso_bug() to handle multiple TX rings amd-xgbe: Perform phy connect/disconnect at dev open/stop amd-xgbe: Use dma_set_mask_and_coherent to set DMA mask net: sun4i-emac: fix memory leak on bad packet sctp: fix possible seqlock seadlock in sctp_packet_transmit() Revert "net: phy: Set the driver when registering an MDIO bus device" cxgb4vf: Turn off SGE RX/TX Callback Timers and interrupts in PCI shutdown routine team: Simplify return path of team_newlink bridge: Update outdated comment on promiscuous mode net-timestamp: ACK timestamp for bytestreams net-timestamp: TCP timestamping net-timestamp: SCHED timestamp on entering packet scheduler net-timestamp: add key to disambiguate concurrent datagrams net-timestamp: move timestamp flags out of sk_flags net-timestamp: extend SCM_TIMESTAMPING ancillary data struct cxgb4i : Move stray CPL definitions to cxgb4 driver tcp: reduce spurious retransmits due to transient SACK reneging qlcnic: Initialize dcbnl_ops before register_netdev ...
Diffstat (limited to 'net/core')
-rw-r--r--net/core/dev.c53
-rw-r--r--net/core/drop_monitor.c2
-rw-r--r--net/core/filter.c728
-rw-r--r--net/core/flow_dissector.c85
-rw-r--r--net/core/net-sysfs.c154
-rw-r--r--net/core/netpoll.c3
-rw-r--r--net/core/pktgen.c111
-rw-r--r--net/core/ptp_classifier.c70
-rw-r--r--net/core/request_sock.c43
-rw-r--r--net/core/rtnetlink.c107
-rw-r--r--net/core/skbuff.c22
-rw-r--r--net/core/sock.c52
-rw-r--r--net/core/sock_diag.c4
-rw-r--r--net/core/timestamping.c57
14 files changed, 549 insertions, 942 deletions
diff --git a/net/core/dev.c b/net/core/dev.c
index 367a586d0c8a..1c15b189c52b 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -132,6 +132,7 @@
#include <linux/hashtable.h>
#include <linux/vmalloc.h>
#include <linux/if_macvlan.h>
+#include <linux/errqueue.h>
#include "net-sysfs.h"
@@ -1085,6 +1086,7 @@ static int dev_get_valid_name(struct net *net,
*/
int dev_change_name(struct net_device *dev, const char *newname)
{
+ unsigned char old_assign_type;
char oldname[IFNAMSIZ];
int err = 0;
int ret;
@@ -1112,10 +1114,17 @@ int dev_change_name(struct net_device *dev, const char *newname)
return err;
}
+ if (oldname[0] && !strchr(oldname, '%'))
+ netdev_info(dev, "renamed from %s\n", oldname);
+
+ old_assign_type = dev->name_assign_type;
+ dev->name_assign_type = NET_NAME_RENAMED;
+
rollback:
ret = device_rename(&dev->dev, dev->name);
if (ret) {
memcpy(dev->name, oldname, IFNAMSIZ);
+ dev->name_assign_type = old_assign_type;
write_seqcount_end(&devnet_rename_seq);
return ret;
}
@@ -1144,6 +1153,8 @@ rollback:
write_seqcount_begin(&devnet_rename_seq);
memcpy(dev->name, oldname, IFNAMSIZ);
memcpy(oldname, newname, IFNAMSIZ);
+ dev->name_assign_type = old_assign_type;
+ old_assign_type = NET_NAME_RENAMED;
goto rollback;
} else {
pr_err("%s: name change rollback failed: %d\n",
@@ -2316,7 +2327,7 @@ __be16 skb_network_protocol(struct sk_buff *skb, int *depth)
*/
if (type == htons(ETH_P_8021Q) || type == htons(ETH_P_8021AD)) {
if (vlan_depth) {
- if (unlikely(WARN_ON(vlan_depth < VLAN_HLEN)))
+ if (WARN_ON(vlan_depth < VLAN_HLEN))
return 0;
vlan_depth -= VLAN_HLEN;
} else {
@@ -2414,8 +2425,8 @@ struct sk_buff *__skb_gso_segment(struct sk_buff *skb,
skb_warn_bad_offload(skb);
- if (skb_header_cloned(skb) &&
- (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))
+ err = skb_cow_head(skb, 0);
+ if (err < 0)
return ERR_PTR(err);
}
@@ -2745,8 +2756,8 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
/*
* Heuristic to force contended enqueues to serialize on a
* separate lock before trying to get qdisc main lock.
- * This permits __QDISC_STATE_RUNNING owner to get the lock more often
- * and dequeue packets faster.
+ * This permits __QDISC___STATE_RUNNING owner to get the lock more
+ * often and dequeue packets faster.
*/
contended = qdisc_is_running(q);
if (unlikely(contended))
@@ -2866,6 +2877,9 @@ static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv)
skb_reset_mac_header(skb);
+ if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_SCHED_TSTAMP))
+ __skb_tstamp_tx(skb, NULL, skb->sk, SCM_TSTAMP_SCHED);
+
/* Disable soft irqs for various locks below. Also
* stops preemption for RCU.
*/
@@ -5440,13 +5454,9 @@ int __dev_change_flags(struct net_device *dev, unsigned int flags)
*/
ret = 0;
- if ((old_flags ^ flags) & IFF_UP) { /* Bit is different ? */
+ if ((old_flags ^ flags) & IFF_UP)
ret = ((old_flags & IFF_UP) ? __dev_close : __dev_open)(dev);
- if (!ret)
- dev_set_rx_mode(dev);
- }
-
if ((flags ^ dev->gflags) & IFF_PROMISC) {
int inc = (flags & IFF_PROMISC) ? 1 : -1;
unsigned int old_flags = dev->flags;
@@ -6446,17 +6456,19 @@ void netdev_freemem(struct net_device *dev)
/**
* alloc_netdev_mqs - allocate network device
- * @sizeof_priv: size of private data to allocate space for
- * @name: device name format string
- * @setup: callback to initialize device
- * @txqs: the number of TX subqueues to allocate
- * @rxqs: the number of RX subqueues to allocate
+ * @sizeof_priv: size of private data to allocate space for
+ * @name: device name format string
+ * @name_assign_type: origin of device name
+ * @setup: callback to initialize device
+ * @txqs: the number of TX subqueues to allocate
+ * @rxqs: the number of RX subqueues to allocate
*
* Allocates a struct net_device with private data area for driver use
* and performs basic initialization. Also allocates subqueue structs
* for each queue on the device.
*/
struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
+ unsigned char name_assign_type,
void (*setup)(struct net_device *),
unsigned int txqs, unsigned int rxqs)
{
@@ -6535,6 +6547,7 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
#endif
strcpy(dev->name, name);
+ dev->name_assign_type = name_assign_type;
dev->group = INIT_NETDEV_GROUP;
if (!dev->ethtool_ops)
dev->ethtool_ops = &default_ethtool_ops;
@@ -6946,12 +6959,14 @@ static int __netdev_printk(const char *level, const struct net_device *dev,
if (dev && dev->dev.parent) {
r = dev_printk_emit(level[1] - '0',
dev->dev.parent,
- "%s %s %s: %pV",
+ "%s %s %s%s: %pV",
dev_driver_string(dev->dev.parent),
dev_name(dev->dev.parent),
- netdev_name(dev), vaf);
+ netdev_name(dev), netdev_reg_state(dev),
+ vaf);
} else if (dev) {
- r = printk("%s%s: %pV", level, netdev_name(dev), vaf);
+ r = printk("%s%s%s: %pV", level, netdev_name(dev),
+ netdev_reg_state(dev), vaf);
} else {
r = printk("%s(NULL net_device): %pV", level, vaf);
}
@@ -7103,7 +7118,7 @@ static void __net_exit default_device_exit_batch(struct list_head *net_list)
rtnl_lock_unregistering(net_list);
list_for_each_entry(net, net_list, exit_list) {
for_each_netdev_reverse(net, dev) {
- if (dev->rtnl_link_ops)
+ if (dev->rtnl_link_ops && dev->rtnl_link_ops->dellink)
dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
else
unregister_netdevice_queue(dev, &dev_kill_list);
diff --git a/net/core/drop_monitor.c b/net/core/drop_monitor.c
index e70301eb7a4a..50f9a9db5792 100644
--- a/net/core/drop_monitor.c
+++ b/net/core/drop_monitor.c
@@ -289,10 +289,8 @@ static int net_dm_cmd_trace(struct sk_buff *skb,
switch (info->genlhdr->cmd) {
case NET_DM_CMD_START:
return set_all_monitor_traces(TRACE_ON);
- break;
case NET_DM_CMD_STOP:
return set_all_monitor_traces(TRACE_OFF);
- break;
}
return -ENOTSUPP;
diff --git a/net/core/filter.c b/net/core/filter.c
index 1dbf6462f766..d814b8a89d0f 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -18,7 +18,7 @@
* 2 of the License, or (at your option) any later version.
*
* Andi Kleen - Fix a few bad bugs and races.
- * Kris Katterjohn - Added many additional checks in sk_chk_filter()
+ * Kris Katterjohn - Added many additional checks in bpf_check_classic()
*/
#include <linux/module.h>
@@ -45,54 +45,6 @@
#include <linux/seccomp.h>
#include <linux/if_vlan.h>
-/* Registers */
-#define BPF_R0 regs[BPF_REG_0]
-#define BPF_R1 regs[BPF_REG_1]
-#define BPF_R2 regs[BPF_REG_2]
-#define BPF_R3 regs[BPF_REG_3]
-#define BPF_R4 regs[BPF_REG_4]
-#define BPF_R5 regs[BPF_REG_5]
-#define BPF_R6 regs[BPF_REG_6]
-#define BPF_R7 regs[BPF_REG_7]
-#define BPF_R8 regs[BPF_REG_8]
-#define BPF_R9 regs[BPF_REG_9]
-#define BPF_R10 regs[BPF_REG_10]
-
-/* Named registers */
-#define DST regs[insn->dst_reg]
-#define SRC regs[insn->src_reg]
-#define FP regs[BPF_REG_FP]
-#define ARG1 regs[BPF_REG_ARG1]
-#define CTX regs[BPF_REG_CTX]
-#define IMM insn->imm
-
-/* No hurry in this branch
- *
- * Exported for the bpf jit load helper.
- */
-void *bpf_internal_load_pointer_neg_helper(const struct sk_buff *skb, int k, unsigned int size)
-{
- u8 *ptr = NULL;
-
- if (k >= SKF_NET_OFF)
- ptr = skb_network_header(skb) + k - SKF_NET_OFF;
- else if (k >= SKF_LL_OFF)
- ptr = skb_mac_header(skb) + k - SKF_LL_OFF;
- if (ptr >= skb->head && ptr + size <= skb_tail_pointer(skb))
- return ptr;
-
- return NULL;
-}
-
-static inline void *load_pointer(const struct sk_buff *skb, int k,
- unsigned int size, void *buffer)
-{
- if (k >= 0)
- return skb_header_pointer(skb, k, size, buffer);
-
- return bpf_internal_load_pointer_neg_helper(skb, k, size);
-}
-
/**
* sk_filter - run a packet through a socket filter
* @sk: sock associated with &sk_buff
@@ -135,451 +87,6 @@ int sk_filter(struct sock *sk, struct sk_buff *skb)
}
EXPORT_SYMBOL(sk_filter);
-/* Base function for offset calculation. Needs to go into .text section,
- * therefore keeping it non-static as well; will also be used by JITs
- * anyway later on, so do not let the compiler omit it.
- */
-noinline u64 __bpf_call_base(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
-{
- return 0;
-}
-
-/**
- * __sk_run_filter - run a filter on a given context
- * @ctx: buffer to run the filter on
- * @insn: filter to apply
- *
- * Decode and apply filter instructions to the skb->data. Return length to
- * keep, 0 for none. @ctx is the data we are operating on, @insn is the
- * array of filter instructions.
- */
-static unsigned int __sk_run_filter(void *ctx, const struct sock_filter_int *insn)
-{
- u64 stack[MAX_BPF_STACK / sizeof(u64)];
- u64 regs[MAX_BPF_REG], tmp;
- static const void *jumptable[256] = {
- [0 ... 255] = &&default_label,
- /* Now overwrite non-defaults ... */
- /* 32 bit ALU operations */
- [BPF_ALU | BPF_ADD | BPF_X] = &&ALU_ADD_X,
- [BPF_ALU | BPF_ADD | BPF_K] = &&ALU_ADD_K,
- [BPF_ALU | BPF_SUB | BPF_X] = &&ALU_SUB_X,
- [BPF_ALU | BPF_SUB | BPF_K] = &&ALU_SUB_K,
- [BPF_ALU | BPF_AND | BPF_X] = &&ALU_AND_X,
- [BPF_ALU | BPF_AND | BPF_K] = &&ALU_AND_K,
- [BPF_ALU | BPF_OR | BPF_X] = &&ALU_OR_X,
- [BPF_ALU | BPF_OR | BPF_K] = &&ALU_OR_K,
- [BPF_ALU | BPF_LSH | BPF_X] = &&ALU_LSH_X,
- [BPF_ALU | BPF_LSH | BPF_K] = &&ALU_LSH_K,
- [BPF_ALU | BPF_RSH | BPF_X] = &&ALU_RSH_X,
- [BPF_ALU | BPF_RSH | BPF_K] = &&ALU_RSH_K,
- [BPF_ALU | BPF_XOR | BPF_X] = &&ALU_XOR_X,
- [BPF_ALU | BPF_XOR | BPF_K] = &&ALU_XOR_K,
- [BPF_ALU | BPF_MUL | BPF_X] = &&ALU_MUL_X,
- [BPF_ALU | BPF_MUL | BPF_K] = &&ALU_MUL_K,
- [BPF_ALU | BPF_MOV | BPF_X] = &&ALU_MOV_X,
- [BPF_ALU | BPF_MOV | BPF_K] = &&ALU_MOV_K,
- [BPF_ALU | BPF_DIV | BPF_X] = &&ALU_DIV_X,
- [BPF_ALU | BPF_DIV | BPF_K] = &&ALU_DIV_K,
- [BPF_ALU | BPF_MOD | BPF_X] = &&ALU_MOD_X,
- [BPF_ALU | BPF_MOD | BPF_K] = &&ALU_MOD_K,
- [BPF_ALU | BPF_NEG] = &&ALU_NEG,
- [BPF_ALU | BPF_END | BPF_TO_BE] = &&ALU_END_TO_BE,
- [BPF_ALU | BPF_END | BPF_TO_LE] = &&ALU_END_TO_LE,
- /* 64 bit ALU operations */
- [BPF_ALU64 | BPF_ADD | BPF_X] = &&ALU64_ADD_X,
- [BPF_ALU64 | BPF_ADD | BPF_K] = &&ALU64_ADD_K,
- [BPF_ALU64 | BPF_SUB | BPF_X] = &&ALU64_SUB_X,
- [BPF_ALU64 | BPF_SUB | BPF_K] = &&ALU64_SUB_K,
- [BPF_ALU64 | BPF_AND | BPF_X] = &&ALU64_AND_X,
- [BPF_ALU64 | BPF_AND | BPF_K] = &&ALU64_AND_K,
- [BPF_ALU64 | BPF_OR | BPF_X] = &&ALU64_OR_X,
- [BPF_ALU64 | BPF_OR | BPF_K] = &&ALU64_OR_K,
- [BPF_ALU64 | BPF_LSH | BPF_X] = &&ALU64_LSH_X,
- [BPF_ALU64 | BPF_LSH | BPF_K] = &&ALU64_LSH_K,
- [BPF_ALU64 | BPF_RSH | BPF_X] = &&ALU64_RSH_X,
- [BPF_ALU64 | BPF_RSH | BPF_K] = &&ALU64_RSH_K,
- [BPF_ALU64 | BPF_XOR | BPF_X] = &&ALU64_XOR_X,
- [BPF_ALU64 | BPF_XOR | BPF_K] = &&ALU64_XOR_K,
- [BPF_ALU64 | BPF_MUL | BPF_X] = &&ALU64_MUL_X,
- [BPF_ALU64 | BPF_MUL | BPF_K] = &&ALU64_MUL_K,
- [BPF_ALU64 | BPF_MOV | BPF_X] = &&ALU64_MOV_X,
- [BPF_ALU64 | BPF_MOV | BPF_K] = &&ALU64_MOV_K,
- [BPF_ALU64 | BPF_ARSH | BPF_X] = &&ALU64_ARSH_X,
- [BPF_ALU64 | BPF_ARSH | BPF_K] = &&ALU64_ARSH_K,
- [BPF_ALU64 | BPF_DIV | BPF_X] = &&ALU64_DIV_X,
- [BPF_ALU64 | BPF_DIV | BPF_K] = &&ALU64_DIV_K,
- [BPF_ALU64 | BPF_MOD | BPF_X] = &&ALU64_MOD_X,
- [BPF_ALU64 | BPF_MOD | BPF_K] = &&ALU64_MOD_K,
- [BPF_ALU64 | BPF_NEG] = &&ALU64_NEG,
- /* Call instruction */
- [BPF_JMP | BPF_CALL] = &&JMP_CALL,
- /* Jumps */
- [BPF_JMP | BPF_JA] = &&JMP_JA,
- [BPF_JMP | BPF_JEQ | BPF_X] = &&JMP_JEQ_X,
- [BPF_JMP | BPF_JEQ | BPF_K] = &&JMP_JEQ_K,
- [BPF_JMP | BPF_JNE | BPF_X] = &&JMP_JNE_X,
- [BPF_JMP | BPF_JNE | BPF_K] = &&JMP_JNE_K,
- [BPF_JMP | BPF_JGT | BPF_X] = &&JMP_JGT_X,
- [BPF_JMP | BPF_JGT | BPF_K] = &&JMP_JGT_K,
- [BPF_JMP | BPF_JGE | BPF_X] = &&JMP_JGE_X,
- [BPF_JMP | BPF_JGE | BPF_K] = &&JMP_JGE_K,
- [BPF_JMP | BPF_JSGT | BPF_X] = &&JMP_JSGT_X,
- [BPF_JMP | BPF_JSGT | BPF_K] = &&JMP_JSGT_K,
- [BPF_JMP | BPF_JSGE | BPF_X] = &&JMP_JSGE_X,
- [BPF_JMP | BPF_JSGE | BPF_K] = &&JMP_JSGE_K,
- [BPF_JMP | BPF_JSET | BPF_X] = &&JMP_JSET_X,
- [BPF_JMP | BPF_JSET | BPF_K] = &&JMP_JSET_K,
- /* Program return */
- [BPF_JMP | BPF_EXIT] = &&JMP_EXIT,
- /* Store instructions */
- [BPF_STX | BPF_MEM | BPF_B] = &&STX_MEM_B,
- [BPF_STX | BPF_MEM | BPF_H] = &&STX_MEM_H,
- [BPF_STX | BPF_MEM | BPF_W] = &&STX_MEM_W,
- [BPF_STX | BPF_MEM | BPF_DW] = &&STX_MEM_DW,
- [BPF_STX | BPF_XADD | BPF_W] = &&STX_XADD_W,
- [BPF_STX | BPF_XADD | BPF_DW] = &&STX_XADD_DW,
- [BPF_ST | BPF_MEM | BPF_B] = &&ST_MEM_B,
- [BPF_ST | BPF_MEM | BPF_H] = &&ST_MEM_H,
- [BPF_ST | BPF_MEM | BPF_W] = &&ST_MEM_W,
- [BPF_ST | BPF_MEM | BPF_DW] = &&ST_MEM_DW,
- /* Load instructions */
- [BPF_LDX | BPF_MEM | BPF_B] = &&LDX_MEM_B,
- [BPF_LDX | BPF_MEM | BPF_H] = &&LDX_MEM_H,
- [BPF_LDX | BPF_MEM | BPF_W] = &&LDX_MEM_W,
- [BPF_LDX | BPF_MEM | BPF_DW] = &&LDX_MEM_DW,
- [BPF_LD | BPF_ABS | BPF_W] = &&LD_ABS_W,
- [BPF_LD | BPF_ABS | BPF_H] = &&LD_ABS_H,
- [BPF_LD | BPF_ABS | BPF_B] = &&LD_ABS_B,
- [BPF_LD | BPF_IND | BPF_W] = &&LD_IND_W,
- [BPF_LD | BPF_IND | BPF_H] = &&LD_IND_H,
- [BPF_LD | BPF_IND | BPF_B] = &&LD_IND_B,
- };
- void *ptr;
- int off;
-
-#define CONT ({ insn++; goto select_insn; })
-#define CONT_JMP ({ insn++; goto select_insn; })
-
- FP = (u64) (unsigned long) &stack[ARRAY_SIZE(stack)];
- ARG1 = (u64) (unsigned long) ctx;
-
- /* Registers used in classic BPF programs need to be reset first. */
- regs[BPF_REG_A] = 0;
- regs[BPF_REG_X] = 0;
-
-select_insn:
- goto *jumptable[insn->code];
-
- /* ALU */
-#define ALU(OPCODE, OP) \
- ALU64_##OPCODE##_X: \
- DST = DST OP SRC; \
- CONT; \
- ALU_##OPCODE##_X: \
- DST = (u32) DST OP (u32) SRC; \
- CONT; \
- ALU64_##OPCODE##_K: \
- DST = DST OP IMM; \
- CONT; \
- ALU_##OPCODE##_K: \
- DST = (u32) DST OP (u32) IMM; \
- CONT;
-
- ALU(ADD, +)
- ALU(SUB, -)
- ALU(AND, &)
- ALU(OR, |)
- ALU(LSH, <<)
- ALU(RSH, >>)
- ALU(XOR, ^)
- ALU(MUL, *)
-#undef ALU
- ALU_NEG:
- DST = (u32) -DST;
- CONT;
- ALU64_NEG:
- DST = -DST;
- CONT;
- ALU_MOV_X:
- DST = (u32) SRC;
- CONT;
- ALU_MOV_K:
- DST = (u32) IMM;
- CONT;
- ALU64_MOV_X:
- DST = SRC;
- CONT;
- ALU64_MOV_K:
- DST = IMM;
- CONT;
- ALU64_ARSH_X:
- (*(s64 *) &DST) >>= SRC;
- CONT;
- ALU64_ARSH_K:
- (*(s64 *) &DST) >>= IMM;
- CONT;
- ALU64_MOD_X:
- if (unlikely(SRC == 0))
- return 0;
- tmp = DST;
- DST = do_div(tmp, SRC);
- CONT;
- ALU_MOD_X:
- if (unlikely(SRC == 0))
- return 0;
- tmp = (u32) DST;
- DST = do_div(tmp, (u32) SRC);
- CONT;
- ALU64_MOD_K:
- tmp = DST;
- DST = do_div(tmp, IMM);
- CONT;
- ALU_MOD_K:
- tmp = (u32) DST;
- DST = do_div(tmp, (u32) IMM);
- CONT;
- ALU64_DIV_X:
- if (unlikely(SRC == 0))
- return 0;
- do_div(DST, SRC);
- CONT;
- ALU_DIV_X:
- if (unlikely(SRC == 0))
- return 0;
- tmp = (u32) DST;
- do_div(tmp, (u32) SRC);
- DST = (u32) tmp;
- CONT;
- ALU64_DIV_K:
- do_div(DST, IMM);
- CONT;
- ALU_DIV_K:
- tmp = (u32) DST;
- do_div(tmp, (u32) IMM);
- DST = (u32) tmp;
- CONT;
- ALU_END_TO_BE:
- switch (IMM) {
- case 16:
- DST = (__force u16) cpu_to_be16(DST);
- break;
- case 32:
- DST = (__force u32) cpu_to_be32(DST);
- break;
- case 64:
- DST = (__force u64) cpu_to_be64(DST);
- break;
- }
- CONT;
- ALU_END_TO_LE:
- switch (IMM) {
- case 16:
- DST = (__force u16) cpu_to_le16(DST);
- break;
- case 32:
- DST = (__force u32) cpu_to_le32(DST);
- break;
- case 64:
- DST = (__force u64) cpu_to_le64(DST);
- break;
- }
- CONT;
-
- /* CALL */
- JMP_CALL:
- /* Function call scratches BPF_R1-BPF_R5 registers,
- * preserves BPF_R6-BPF_R9, and stores return value
- * into BPF_R0.
- */
- BPF_R0 = (__bpf_call_base + insn->imm)(BPF_R1, BPF_R2, BPF_R3,
- BPF_R4, BPF_R5);
- CONT;
-
- /* JMP */
- JMP_JA:
- insn += insn->off;
- CONT;
- JMP_JEQ_X:
- if (DST == SRC) {
- insn += insn->off;
- CONT_JMP;
- }
- CONT;
- JMP_JEQ_K:
- if (DST == IMM) {
- insn += insn->off;
- CONT_JMP;
- }
- CONT;
- JMP_JNE_X:
- if (DST != SRC) {
- insn += insn->off;
- CONT_JMP;
- }
- CONT;
- JMP_JNE_K:
- if (DST != IMM) {
- insn += insn->off;
- CONT_JMP;
- }
- CONT;
- JMP_JGT_X:
- if (DST > SRC) {
- insn += insn->off;
- CONT_JMP;
- }
- CONT;
- JMP_JGT_K:
- if (DST > IMM) {
- insn += insn->off;
- CONT_JMP;
- }
- CONT;
- JMP_JGE_X:
- if (DST >= SRC) {
- insn += insn->off;
- CONT_JMP;
- }
- CONT;
- JMP_JGE_K:
- if (DST >= IMM) {
- insn += insn->off;
- CONT_JMP;
- }
- CONT;
- JMP_JSGT_X:
- if (((s64) DST) > ((s64) SRC)) {
- insn += insn->off;
- CONT_JMP;
- }
- CONT;
- JMP_JSGT_K:
- if (((s64) DST) > ((s64) IMM)) {
- insn += insn->off;
- CONT_JMP;
- }
- CONT;
- JMP_JSGE_X:
- if (((s64) DST) >= ((s64) SRC)) {
- insn += insn->off;
- CONT_JMP;
- }
- CONT;
- JMP_JSGE_K:
- if (((s64) DST) >= ((s64) IMM)) {
- insn += insn->off;
- CONT_JMP;
- }
- CONT;
- JMP_JSET_X:
- if (DST & SRC) {
- insn += insn->off;
- CONT_JMP;
- }
- CONT;
- JMP_JSET_K:
- if (DST & IMM) {
- insn += insn->off;
- CONT_JMP;
- }
- CONT;
- JMP_EXIT:
- return BPF_R0;
-
- /* STX and ST and LDX*/
-#define LDST(SIZEOP, SIZE) \
- STX_MEM_##SIZEOP: \
- *(SIZE *)(unsigned long) (DST + insn->off) = SRC; \
- CONT; \
- ST_MEM_##SIZEOP: \
- *(SIZE *)(unsigned long) (DST + insn->off) = IMM; \
- CONT; \
- LDX_MEM_##SIZEOP: \
- DST = *(SIZE *)(unsigned long) (SRC + insn->off); \
- CONT;
-
- LDST(B, u8)
- LDST(H, u16)
- LDST(W, u32)
- LDST(DW, u64)
-#undef LDST
- STX_XADD_W: /* lock xadd *(u32 *)(dst_reg + off16) += src_reg */
- atomic_add((u32) SRC, (atomic_t *)(unsigned long)
- (DST + insn->off));
- CONT;
- STX_XADD_DW: /* lock xadd *(u64 *)(dst_reg + off16) += src_reg */
- atomic64_add((u64) SRC, (atomic64_t *)(unsigned long)
- (DST + insn->off));
- CONT;
- LD_ABS_W: /* BPF_R0 = ntohl(*(u32 *) (skb->data + imm32)) */
- off = IMM;
-load_word:
- /* BPF_LD + BPD_ABS and BPF_LD + BPF_IND insns are
- * only appearing in the programs where ctx ==
- * skb. All programs keep 'ctx' in regs[BPF_REG_CTX]
- * == BPF_R6, sk_convert_filter() saves it in BPF_R6,
- * internal BPF verifier will check that BPF_R6 ==
- * ctx.
- *
- * BPF_ABS and BPF_IND are wrappers of function calls,
- * so they scratch BPF_R1-BPF_R5 registers, preserve
- * BPF_R6-BPF_R9, and store return value into BPF_R0.
- *
- * Implicit input:
- * ctx == skb == BPF_R6 == CTX
- *
- * Explicit input:
- * SRC == any register
- * IMM == 32-bit immediate
- *
- * Output:
- * BPF_R0 - 8/16/32-bit skb data converted to cpu endianness
- */
-
- ptr = load_pointer((struct sk_buff *) (unsigned long) CTX, off, 4, &tmp);
- if (likely(ptr != NULL)) {
- BPF_R0 = get_unaligned_be32(ptr);
- CONT;
- }
-
- return 0;
- LD_ABS_H: /* BPF_R0 = ntohs(*(u16 *) (skb->data + imm32)) */
- off = IMM;
-load_half:
- ptr = load_pointer((struct sk_buff *) (unsigned long) CTX, off, 2, &tmp);
- if (likely(ptr != NULL)) {
- BPF_R0 = get_unaligned_be16(ptr);
- CONT;
- }
-
- return 0;
- LD_ABS_B: /* BPF_R0 = *(u8 *) (skb->data + imm32) */
- off = IMM;
-load_byte:
- ptr = load_pointer((struct sk_buff *) (unsigned long) CTX, off, 1, &tmp);
- if (likely(ptr != NULL)) {
- BPF_R0 = *(u8 *)ptr;
- CONT;
- }
-
- return 0;
- LD_IND_W: /* BPF_R0 = ntohl(*(u32 *) (skb->data + src_reg + imm32)) */
- off = IMM + SRC;
- goto load_word;
- LD_IND_H: /* BPF_R0 = ntohs(*(u16 *) (skb->data + src_reg + imm32)) */
- off = IMM + SRC;
- goto load_half;
- LD_IND_B: /* BPF_R0 = *(u8 *) (skb->data + src_reg + imm32) */
- off = IMM + SRC;
- goto load_byte;
-
- default_label:
- /* If we ever reach this, we have a bug somewhere. */
- WARN_RATELIMIT(1, "unknown opcode %02x\n", insn->code);
- return 0;
-}
-
/* Helper to find the offset of pkt_type in sk_buff structure. We want
* to make sure its still a 3bit field starting at a byte boundary;
* taken from arch/x86/net/bpf_jit_comp.c.
@@ -667,9 +174,9 @@ static u64 __get_random_u32(u64 ctx, u64 a, u64 x, u64 r4, u64 r5)
}
static bool convert_bpf_extensions(struct sock_filter *fp,
- struct sock_filter_int **insnp)
+ struct bpf_insn **insnp)
{
- struct sock_filter_int *insn = *insnp;
+ struct bpf_insn *insn = *insnp;
switch (fp->k) {
case SKF_AD_OFF + SKF_AD_PROTOCOL:
@@ -805,7 +312,7 @@ static bool convert_bpf_extensions(struct sock_filter *fp,
}
/**
- * sk_convert_filter - convert filter program
+ * bpf_convert_filter - convert filter program
* @prog: the user passed filter program
* @len: the length of the user passed filter program
* @new_prog: buffer where converted program will be stored
@@ -815,12 +322,12 @@ static bool convert_bpf_extensions(struct sock_filter *fp,
* Conversion workflow:
*
* 1) First pass for calculating the new program length:
- * sk_convert_filter(old_prog, old_len, NULL, &new_len)
+ * bpf_convert_filter(old_prog, old_len, NULL, &new_len)
*
* 2) 2nd pass to remap in two passes: 1st pass finds new
* jump offsets, 2nd pass remapping:
- * new_prog = kmalloc(sizeof(struct sock_filter_int) * new_len);
- * sk_convert_filter(old_prog, old_len, new_prog, &new_len);
+ * new_prog = kmalloc(sizeof(struct bpf_insn) * new_len);
+ * bpf_convert_filter(old_prog, old_len, new_prog, &new_len);
*
* User BPF's register A is mapped to our BPF register 6, user BPF
* register X is mapped to BPF register 7; frame pointer is always
@@ -828,11 +335,11 @@ static bool convert_bpf_extensions(struct sock_filter *fp,
* for socket filters: ctx == 'struct sk_buff *', for seccomp:
* ctx == 'struct seccomp_data *'.
*/
-int sk_convert_filter(struct sock_filter *prog, int len,
- struct sock_filter_int *new_prog, int *new_len)
+int bpf_convert_filter(struct sock_filter *prog, int len,
+ struct bpf_insn *new_prog, int *new_len)
{
int new_flen = 0, pass = 0, target, i;
- struct sock_filter_int *new_insn;
+ struct bpf_insn *new_insn;
struct sock_filter *fp;
int *addrs = NULL;
u8 bpf_src;
@@ -858,8 +365,8 @@ do_pass:
new_insn++;
for (i = 0; i < len; fp++, i++) {
- struct sock_filter_int tmp_insns[6] = { };
- struct sock_filter_int *insn = tmp_insns;
+ struct bpf_insn tmp_insns[6] = { };
+ struct bpf_insn *insn = tmp_insns;
if (addrs)
addrs[i] = new_insn - new_prog;
@@ -1094,7 +601,7 @@ err:
* a cell if not previously written, and we check all branches to be sure
* a malicious user doesn't try to abuse us.
*/
-static int check_load_and_stores(struct sock_filter *filter, int flen)
+static int check_load_and_stores(const struct sock_filter *filter, int flen)
{
u16 *masks, memvalid = 0; /* One bit per cell, 16 cells */
int pc, ret = 0;
@@ -1214,7 +721,7 @@ static bool chk_code_allowed(u16 code_to_probe)
}
/**
- * sk_chk_filter - verify socket filter code
+ * bpf_check_classic - verify socket filter code
* @filter: filter to verify
* @flen: length of filter
*
@@ -1227,7 +734,7 @@ static bool chk_code_allowed(u16 code_to_probe)
*
* Returns 0 if the rule set is legal or -EINVAL if not.
*/
-int sk_chk_filter(struct sock_filter *filter, unsigned int flen)
+int bpf_check_classic(const struct sock_filter *filter, unsigned int flen)
{
bool anc_found;
int pc;
@@ -1237,7 +744,7 @@ int sk_chk_filter(struct sock_filter *filter, unsigned int flen)
/* Check the filter code now */
for (pc = 0; pc < flen; pc++) {
- struct sock_filter *ftest = &filter[pc];
+ const struct sock_filter *ftest = &filter[pc];
/* May we actually operate on this code? */
if (!chk_code_allowed(ftest->code))
@@ -1301,12 +808,12 @@ int sk_chk_filter(struct sock_filter *filter, unsigned int flen)
return -EINVAL;
}
-EXPORT_SYMBOL(sk_chk_filter);
+EXPORT_SYMBOL(bpf_check_classic);
-static int sk_store_orig_filter(struct sk_filter *fp,
- const struct sock_fprog *fprog)
+static int bpf_prog_store_orig_filter(struct bpf_prog *fp,
+ const struct sock_fprog *fprog)
{
- unsigned int fsize = sk_filter_proglen(fprog);
+ unsigned int fsize = bpf_classic_proglen(fprog);
struct sock_fprog_kern *fkprog;
fp->orig_prog = kmalloc(sizeof(*fkprog), GFP_KERNEL);
@@ -1324,7 +831,7 @@ static int sk_store_orig_filter(struct sk_filter *fp,
return 0;
}
-static void sk_release_orig_filter(struct sk_filter *fp)
+static void bpf_release_orig_filter(struct bpf_prog *fp)
{
struct sock_fprog_kern *fprog = fp->orig_prog;
@@ -1334,6 +841,18 @@ static void sk_release_orig_filter(struct sk_filter *fp)
}
}
+static void __bpf_prog_release(struct bpf_prog *prog)
+{
+ bpf_release_orig_filter(prog);
+ bpf_prog_free(prog);
+}
+
+static void __sk_filter_release(struct sk_filter *fp)
+{
+ __bpf_prog_release(fp->prog);
+ kfree(fp);
+}
+
/**
* sk_filter_release_rcu - Release a socket filter by rcu_head
* @rcu: rcu_head that contains the sk_filter to free
@@ -1342,8 +861,7 @@ static void sk_filter_release_rcu(struct rcu_head *rcu)
{
struct sk_filter *fp = container_of(rcu, struct sk_filter, rcu);
- sk_release_orig_filter(fp);
- sk_filter_free(fp);
+ __sk_filter_release(fp);
}
/**
@@ -1360,44 +878,33 @@ static void sk_filter_release(struct sk_filter *fp)
void sk_filter_uncharge(struct sock *sk, struct sk_filter *fp)
{
- atomic_sub(sk_filter_size(fp->len), &sk->sk_omem_alloc);
- sk_filter_release(fp);
-}
+ u32 filter_size = bpf_prog_size(fp->prog->len);
-void sk_filter_charge(struct sock *sk, struct sk_filter *fp)
-{
- atomic_inc(&fp->refcnt);
- atomic_add(sk_filter_size(fp->len), &sk->sk_omem_alloc);
+ atomic_sub(filter_size, &sk->sk_omem_alloc);
+ sk_filter_release(fp);
}
-static struct sk_filter *__sk_migrate_realloc(struct sk_filter *fp,
- struct sock *sk,
- unsigned int len)
+/* try to charge the socket memory if there is space available
+ * return true on success
+ */
+bool sk_filter_charge(struct sock *sk, struct sk_filter *fp)
{
- struct sk_filter *fp_new;
-
- if (sk == NULL)
- return krealloc(fp, len, GFP_KERNEL);
-
- fp_new = sock_kmalloc(sk, len, GFP_KERNEL);
- if (fp_new) {
- *fp_new = *fp;
- /* As we're keeping orig_prog in fp_new along,
- * we need to make sure we're not evicting it
- * from the old fp.
- */
- fp->orig_prog = NULL;
- sk_filter_uncharge(sk, fp);
+ u32 filter_size = bpf_prog_size(fp->prog->len);
+
+ /* same check as in sock_kmalloc() */
+ if (filter_size <= sysctl_optmem_max &&
+ atomic_read(&sk->sk_omem_alloc) + filter_size < sysctl_optmem_max) {
+ atomic_inc(&fp->refcnt);
+ atomic_add(filter_size, &sk->sk_omem_alloc);
+ return true;
}
-
- return fp_new;
+ return false;
}
-static struct sk_filter *__sk_migrate_filter(struct sk_filter *fp,
- struct sock *sk)
+static struct bpf_prog *bpf_migrate_filter(struct bpf_prog *fp)
{
struct sock_filter *old_prog;
- struct sk_filter *old_fp;
+ struct bpf_prog *old_fp;
int err, new_len, old_len = fp->len;
/* We are free to overwrite insns et al right here as it
@@ -1406,7 +913,7 @@ static struct sk_filter *__sk_migrate_filter(struct sk_filter *fp,
* representation.
*/
BUILD_BUG_ON(sizeof(struct sock_filter) !=
- sizeof(struct sock_filter_int));
+ sizeof(struct bpf_insn));
/* Conversion cannot happen on overlapping memory areas,
* so we need to keep the user BPF around until the 2nd
@@ -1420,13 +927,13 @@ static struct sk_filter *__sk_migrate_filter(struct sk_filter *fp,
}
/* 1st pass: calculate the new program length. */
- err = sk_convert_filter(old_prog, old_len, NULL, &new_len);
+ err = bpf_convert_filter(old_prog, old_len, NULL, &new_len);
if (err)
goto out_err_free;
/* Expand fp for appending the new filter representation. */
old_fp = fp;
- fp = __sk_migrate_realloc(old_fp, sk, sk_filter_size(new_len));
+ fp = krealloc(old_fp, bpf_prog_size(new_len), GFP_KERNEL);
if (!fp) {
/* The old_fp is still around in case we couldn't
* allocate new memory, so uncharge on that one.
@@ -1438,17 +945,17 @@ static struct sk_filter *__sk_migrate_filter(struct sk_filter *fp,
fp->len = new_len;
- /* 2nd pass: remap sock_filter insns into sock_filter_int insns. */
- err = sk_convert_filter(old_prog, old_len, fp->insnsi, &new_len);
+ /* 2nd pass: remap sock_filter insns into bpf_insn insns. */
+ err = bpf_convert_filter(old_prog, old_len, fp->insnsi, &new_len);
if (err)
- /* 2nd sk_convert_filter() can fail only if it fails
+ /* 2nd bpf_convert_filter() can fail only if it fails
* to allocate memory, remapping must succeed. Note,
* that at this time old_fp has already been released
- * by __sk_migrate_realloc().
+ * by krealloc().
*/
goto out_err_free;
- sk_filter_select_runtime(fp);
+ bpf_prog_select_runtime(fp);
kfree(old_prog);
return fp;
@@ -1456,55 +963,20 @@ static struct sk_filter *__sk_migrate_filter(struct sk_filter *fp,
out_err_free:
kfree(old_prog);
out_err:
- /* Rollback filter setup. */
- if (sk != NULL)
- sk_filter_uncharge(sk, fp);
- else
- kfree(fp);
+ __bpf_prog_release(fp);
return ERR_PTR(err);
}
-void __weak bpf_int_jit_compile(struct sk_filter *prog)
-{
-}
-
-/**
- * sk_filter_select_runtime - select execution runtime for BPF program
- * @fp: sk_filter populated with internal BPF program
- *
- * try to JIT internal BPF program, if JIT is not available select interpreter
- * BPF program will be executed via SK_RUN_FILTER() macro
- */
-void sk_filter_select_runtime(struct sk_filter *fp)
-{
- fp->bpf_func = (void *) __sk_run_filter;
-
- /* Probe if internal BPF can be JITed */
- bpf_int_jit_compile(fp);
-}
-EXPORT_SYMBOL_GPL(sk_filter_select_runtime);
-
-/* free internal BPF program */
-void sk_filter_free(struct sk_filter *fp)
-{
- bpf_jit_free(fp);
-}
-EXPORT_SYMBOL_GPL(sk_filter_free);
-
-static struct sk_filter *__sk_prepare_filter(struct sk_filter *fp,
- struct sock *sk)
+static struct bpf_prog *bpf_prepare_filter(struct bpf_prog *fp)
{
int err;
fp->bpf_func = NULL;
fp->jited = 0;
- err = sk_chk_filter(fp->insns, fp->len);
+ err = bpf_check_classic(fp->insns, fp->len);
if (err) {
- if (sk != NULL)
- sk_filter_uncharge(sk, fp);
- else
- kfree(fp);
+ __bpf_prog_release(fp);
return ERR_PTR(err);
}
@@ -1517,13 +989,13 @@ static struct sk_filter *__sk_prepare_filter(struct sk_filter *fp,
* internal BPF translation for the optimized interpreter.
*/
if (!fp->jited)
- fp = __sk_migrate_filter(fp, sk);
+ fp = bpf_migrate_filter(fp);
return fp;
}
/**
- * sk_unattached_filter_create - create an unattached filter
+ * bpf_prog_create - create an unattached filter
* @pfp: the unattached filter that is created
* @fprog: the filter program
*
@@ -1532,23 +1004,21 @@ static struct sk_filter *__sk_prepare_filter(struct sk_filter *fp,
* If an error occurs or there is insufficient memory for the filter
* a negative errno code is returned. On success the return is zero.
*/
-int sk_unattached_filter_create(struct sk_filter **pfp,
- struct sock_fprog_kern *fprog)
+int bpf_prog_create(struct bpf_prog **pfp, struct sock_fprog_kern *fprog)
{
- unsigned int fsize = sk_filter_proglen(fprog);
- struct sk_filter *fp;
+ unsigned int fsize = bpf_classic_proglen(fprog);
+ struct bpf_prog *fp;
/* Make sure new filter is there and in the right amounts. */
if (fprog->filter == NULL)
return -EINVAL;
- fp = kmalloc(sk_filter_size(fprog->len), GFP_KERNEL);
+ fp = kmalloc(bpf_prog_size(fprog->len), GFP_KERNEL);
if (!fp)
return -ENOMEM;
memcpy(fp->insns, fprog->filter, fsize);
- atomic_set(&fp->refcnt, 1);
fp->len = fprog->len;
/* Since unattached filters are not copied back to user
* space through sk_get_filter(), we do not need to hold
@@ -1556,23 +1026,23 @@ int sk_unattached_filter_create(struct sk_filter **pfp,
*/
fp->orig_prog = NULL;
- /* __sk_prepare_filter() already takes care of uncharging
+ /* bpf_prepare_filter() already takes care of freeing
* memory in case something goes wrong.
*/
- fp = __sk_prepare_filter(fp, NULL);
+ fp = bpf_prepare_filter(fp);
if (IS_ERR(fp))
return PTR_ERR(fp);
*pfp = fp;
return 0;
}
-EXPORT_SYMBOL_GPL(sk_unattached_filter_create);
+EXPORT_SYMBOL_GPL(bpf_prog_create);
-void sk_unattached_filter_destroy(struct sk_filter *fp)
+void bpf_prog_destroy(struct bpf_prog *fp)
{
- sk_filter_release(fp);
+ __bpf_prog_release(fp);
}
-EXPORT_SYMBOL_GPL(sk_unattached_filter_destroy);
+EXPORT_SYMBOL_GPL(bpf_prog_destroy);
/**
* sk_attach_filter - attach a socket filter
@@ -1587,8 +1057,9 @@ EXPORT_SYMBOL_GPL(sk_unattached_filter_destroy);
int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk)
{
struct sk_filter *fp, *old_fp;
- unsigned int fsize = sk_filter_proglen(fprog);
- unsigned int sk_fsize = sk_filter_size(fprog->len);
+ unsigned int fsize = bpf_classic_proglen(fprog);
+ unsigned int bpf_fsize = bpf_prog_size(fprog->len);
+ struct bpf_prog *prog;
int err;
if (sock_flag(sk, SOCK_FILTER_LOCKED))
@@ -1598,30 +1069,43 @@ int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk)
if (fprog->filter == NULL)
return -EINVAL;
- fp = sock_kmalloc(sk, sk_fsize, GFP_KERNEL);
- if (!fp)
+ prog = kmalloc(bpf_fsize, GFP_KERNEL);
+ if (!prog)
return -ENOMEM;
- if (copy_from_user(fp->insns, fprog->filter, fsize)) {
- sock_kfree_s(sk, fp, sk_fsize);
+ if (copy_from_user(prog->insns, fprog->filter, fsize)) {
+ kfree(prog);
return -EFAULT;
}
- atomic_set(&fp->refcnt, 1);
- fp->len = fprog->len;
+ prog->len = fprog->len;
- err = sk_store_orig_filter(fp, fprog);
+ err = bpf_prog_store_orig_filter(prog, fprog);
if (err) {
- sk_filter_uncharge(sk, fp);
+ kfree(prog);
return -ENOMEM;
}
- /* __sk_prepare_filter() already takes care of uncharging
+ /* bpf_prepare_filter() already takes care of freeing
* memory in case something goes wrong.
*/
- fp = __sk_prepare_filter(fp, sk);
- if (IS_ERR(fp))
- return PTR_ERR(fp);
+ prog = bpf_prepare_filter(prog);
+ if (IS_ERR(prog))
+ return PTR_ERR(prog);
+
+ fp = kmalloc(sizeof(*fp), GFP_KERNEL);
+ if (!fp) {
+ __bpf_prog_release(prog);
+ return -ENOMEM;
+ }
+ fp->prog = prog;
+
+ atomic_set(&fp->refcnt, 0);
+
+ if (!sk_filter_charge(sk, fp)) {
+ __sk_filter_release(fp);
+ return -ENOMEM;
+ }
old_fp = rcu_dereference_protected(sk->sk_filter,
sock_owned_by_user(sk));
@@ -1670,7 +1154,7 @@ int sk_get_filter(struct sock *sk, struct sock_filter __user *ubuf,
/* We're copying the filter that has been originally attached,
* so no conversion/decode needed anymore.
*/
- fprog = filter->orig_prog;
+ fprog = filter->prog->orig_prog;
ret = fprog->len;
if (!len)
@@ -1682,7 +1166,7 @@ int sk_get_filter(struct sock *sk, struct sock_filter __user *ubuf,
goto out;
ret = -EFAULT;
- if (copy_to_user(ubuf, fprog->filter, sk_filter_proglen(fprog)))
+ if (copy_to_user(ubuf, fprog->filter, bpf_classic_proglen(fprog)))
goto out;
/* Instead of bytes, the API requests to return the number
diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c
index 107ed12a5323..5f362c1d0332 100644
--- a/net/core/flow_dissector.c
+++ b/net/core/flow_dissector.c
@@ -80,6 +80,8 @@ ip:
case htons(ETH_P_IPV6): {
const struct ipv6hdr *iph;
struct ipv6hdr _iph;
+ __be32 flow_label;
+
ipv6:
iph = skb_header_pointer(skb, nhoff, sizeof(_iph), &_iph);
if (!iph)
@@ -89,6 +91,21 @@ ipv6:
flow->src = (__force __be32)ipv6_addr_hash(&iph->saddr);
flow->dst = (__force __be32)ipv6_addr_hash(&iph->daddr);
nhoff += sizeof(struct ipv6hdr);
+
+ flow_label = ip6_flowlabel(iph);
+ if (flow_label) {
+ /* Awesome, IPv6 packet has a flow label so we can
+ * use that to represent the ports without any
+ * further dissection.
+ */
+ flow->n_proto = proto;
+ flow->ip_proto = ip_proto;
+ flow->ports = flow_label;
+ flow->thoff = (u16)nhoff;
+
+ return true;
+ }
+
break;
}
case htons(ETH_P_8021AD):
@@ -175,6 +192,7 @@ ipv6:
break;
}
+ flow->n_proto = proto;
flow->ip_proto = ip_proto;
flow->ports = skb_flow_get_ports(skb, nhoff, ip_proto);
flow->thoff = (u16) nhoff;
@@ -195,12 +213,33 @@ static __always_inline u32 __flow_hash_3words(u32 a, u32 b, u32 c)
return jhash_3words(a, b, c, hashrnd);
}
-static __always_inline u32 __flow_hash_1word(u32 a)
+static inline u32 __flow_hash_from_keys(struct flow_keys *keys)
{
- __flow_hash_secret_init();
- return jhash_1word(a, hashrnd);
+ u32 hash;
+
+ /* get a consistent hash (same value on both flow directions) */
+ if (((__force u32)keys->dst < (__force u32)keys->src) ||
+ (((__force u32)keys->dst == (__force u32)keys->src) &&
+ ((__force u16)keys->port16[1] < (__force u16)keys->port16[0]))) {
+ swap(keys->dst, keys->src);
+ swap(keys->port16[0], keys->port16[1]);
+ }
+
+ hash = __flow_hash_3words((__force u32)keys->dst,
+ (__force u32)keys->src,
+ (__force u32)keys->ports);
+ if (!hash)
+ hash = 1;
+
+ return hash;
}
+u32 flow_hash_from_keys(struct flow_keys *keys)
+{
+ return __flow_hash_from_keys(keys);
+}
+EXPORT_SYMBOL(flow_hash_from_keys);
+
/*
* __skb_get_hash: calculate a flow hash based on src/dst addresses
* and src/dst port numbers. Sets hash in skb to non-zero hash value
@@ -210,7 +249,6 @@ static __always_inline u32 __flow_hash_1word(u32 a)
void __skb_get_hash(struct sk_buff *skb)
{
struct flow_keys keys;
- u32 hash;
if (!skb_flow_dissect(skb, &keys))
return;
@@ -218,21 +256,9 @@ void __skb_get_hash(struct sk_buff *skb)
if (keys.ports)
skb->l4_hash = 1;
- /* get a consistent hash (same value on both flow directions) */
- if (((__force u32)keys.dst < (__force u32)keys.src) ||
- (((__force u32)keys.dst == (__force u32)keys.src) &&
- ((__force u16)keys.port16[1] < (__force u16)keys.port16[0]))) {
- swap(keys.dst, keys.src);
- swap(keys.port16[0], keys.port16[1]);
- }
-
- hash = __flow_hash_3words((__force u32)keys.dst,
- (__force u32)keys.src,
- (__force u32)keys.ports);
- if (!hash)
- hash = 1;
+ skb->sw_hash = 1;
- skb->hash = hash;
+ skb->hash = __flow_hash_from_keys(&keys);
}
EXPORT_SYMBOL(__skb_get_hash);
@@ -240,7 +266,7 @@ EXPORT_SYMBOL(__skb_get_hash);
* Returns a Tx hash based on the given packet descriptor a Tx queues' number
* to be used as a distribution range.
*/
-u16 __skb_tx_hash(const struct net_device *dev, const struct sk_buff *skb,
+u16 __skb_tx_hash(const struct net_device *dev, struct sk_buff *skb,
unsigned int num_tx_queues)
{
u32 hash;
@@ -260,13 +286,7 @@ u16 __skb_tx_hash(const struct net_device *dev, const struct sk_buff *skb,
qcount = dev->tc_to_txq[tc].count;
}
- if (skb->sk && skb->sk->sk_hash)
- hash = skb->sk->sk_hash;
- else
- hash = (__force u16) skb->protocol;
- hash = __flow_hash_1word(hash);
-
- return (u16) (((u64) hash * qcount) >> 32) + qoffset;
+ return (u16) (((u64)skb_get_hash(skb) * qcount) >> 32) + qoffset;
}
EXPORT_SYMBOL(__skb_tx_hash);
@@ -338,17 +358,10 @@ static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
if (map) {
if (map->len == 1)
queue_index = map->queues[0];
- else {
- u32 hash;
- if (skb->sk && skb->sk->sk_hash)
- hash = skb->sk->sk_hash;
- else
- hash = (__force u16) skb->protocol ^
- skb->hash;
- hash = __flow_hash_1word(hash);
+ else
queue_index = map->queues[
- ((u64)hash * map->len) >> 32];
- }
+ ((u64)skb_get_hash(skb) * map->len) >> 32];
+
if (unlikely(queue_index >= dev->real_num_tx_queues))
queue_index = -1;
}
diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index 1cac29ebb05b..9dd06699b09c 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -43,12 +43,12 @@ static ssize_t netdev_show(const struct device *dev,
struct device_attribute *attr, char *buf,
ssize_t (*format)(const struct net_device *, char *))
{
- struct net_device *net = to_net_dev(dev);
+ struct net_device *ndev = to_net_dev(dev);
ssize_t ret = -EINVAL;
read_lock(&dev_base_lock);
- if (dev_isalive(net))
- ret = (*format)(net, buf);
+ if (dev_isalive(ndev))
+ ret = (*format)(ndev, buf);
read_unlock(&dev_base_lock);
return ret;
@@ -56,9 +56,9 @@ static ssize_t netdev_show(const struct device *dev,
/* generate a show function for simple field */
#define NETDEVICE_SHOW(field, format_string) \
-static ssize_t format_##field(const struct net_device *net, char *buf) \
+static ssize_t format_##field(const struct net_device *dev, char *buf) \
{ \
- return sprintf(buf, format_string, net->field); \
+ return sprintf(buf, format_string, dev->field); \
} \
static ssize_t field##_show(struct device *dev, \
struct device_attribute *attr, char *buf) \
@@ -112,16 +112,35 @@ NETDEVICE_SHOW_RO(ifindex, fmt_dec);
NETDEVICE_SHOW_RO(type, fmt_dec);
NETDEVICE_SHOW_RO(link_mode, fmt_dec);
+static ssize_t format_name_assign_type(const struct net_device *dev, char *buf)
+{
+ return sprintf(buf, fmt_dec, dev->name_assign_type);
+}
+
+static ssize_t name_assign_type_show(struct device *dev,
+ struct device_attribute *attr,
+ char *buf)
+{
+ struct net_device *ndev = to_net_dev(dev);
+ ssize_t ret = -EINVAL;
+
+ if (ndev->name_assign_type != NET_NAME_UNKNOWN)
+ ret = netdev_show(dev, attr, buf, format_name_assign_type);
+
+ return ret;
+}
+static DEVICE_ATTR_RO(name_assign_type);
+
/* use same locking rules as GIFHWADDR ioctl's */
static ssize_t address_show(struct device *dev, struct device_attribute *attr,
char *buf)
{
- struct net_device *net = to_net_dev(dev);
+ struct net_device *ndev = to_net_dev(dev);
ssize_t ret = -EINVAL;
read_lock(&dev_base_lock);
- if (dev_isalive(net))
- ret = sysfs_format_mac(buf, net->dev_addr, net->addr_len);
+ if (dev_isalive(ndev))
+ ret = sysfs_format_mac(buf, ndev->dev_addr, ndev->addr_len);
read_unlock(&dev_base_lock);
return ret;
}
@@ -130,18 +149,18 @@ static DEVICE_ATTR_RO(address);
static ssize_t broadcast_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
- struct net_device *net = to_net_dev(dev);
- if (dev_isalive(net))
- return sysfs_format_mac(buf, net->broadcast, net->addr_len);
+ struct net_device *ndev = to_net_dev(dev);
+ if (dev_isalive(ndev))
+ return sysfs_format_mac(buf, ndev->broadcast, ndev->addr_len);
return -EINVAL;
}
static DEVICE_ATTR_RO(broadcast);
-static int change_carrier(struct net_device *net, unsigned long new_carrier)
+static int change_carrier(struct net_device *dev, unsigned long new_carrier)
{
- if (!netif_running(net))
+ if (!netif_running(dev))
return -EINVAL;
- return dev_change_carrier(net, (bool) new_carrier);
+ return dev_change_carrier(dev, (bool) new_carrier);
}
static ssize_t carrier_store(struct device *dev, struct device_attribute *attr,
@@ -265,9 +284,9 @@ static DEVICE_ATTR_RO(carrier_changes);
/* read-write attributes */
-static int change_mtu(struct net_device *net, unsigned long new_mtu)
+static int change_mtu(struct net_device *dev, unsigned long new_mtu)
{
- return dev_set_mtu(net, (int) new_mtu);
+ return dev_set_mtu(dev, (int) new_mtu);
}
static ssize_t mtu_store(struct device *dev, struct device_attribute *attr,
@@ -277,9 +296,9 @@ static ssize_t mtu_store(struct device *dev, struct device_attribute *attr,
}
NETDEVICE_SHOW_RW(mtu, fmt_dec);
-static int change_flags(struct net_device *net, unsigned long new_flags)
+static int change_flags(struct net_device *dev, unsigned long new_flags)
{
- return dev_change_flags(net, (unsigned int) new_flags);
+ return dev_change_flags(dev, (unsigned int) new_flags);
}
static ssize_t flags_store(struct device *dev, struct device_attribute *attr,
@@ -289,9 +308,9 @@ static ssize_t flags_store(struct device *dev, struct device_attribute *attr,
}
NETDEVICE_SHOW_RW(flags, fmt_hex);
-static int change_tx_queue_len(struct net_device *net, unsigned long new_len)
+static int change_tx_queue_len(struct net_device *dev, unsigned long new_len)
{
- net->tx_queue_len = new_len;
+ dev->tx_queue_len = new_len;
return 0;
}
@@ -344,9 +363,9 @@ static ssize_t ifalias_show(struct device *dev,
}
static DEVICE_ATTR_RW(ifalias);
-static int change_group(struct net_device *net, unsigned long new_group)
+static int change_group(struct net_device *dev, unsigned long new_group)
{
- dev_set_group(net, (int) new_group);
+ dev_set_group(dev, (int) new_group);
return 0;
}
@@ -387,6 +406,7 @@ static struct attribute *net_class_attrs[] = {
&dev_attr_dev_port.attr,
&dev_attr_iflink.attr,
&dev_attr_ifindex.attr,
+ &dev_attr_name_assign_type.attr,
&dev_attr_addr_assign_type.attr,
&dev_attr_addr_len.attr,
&dev_attr_link_mode.attr,
@@ -776,20 +796,20 @@ static struct kobj_type rx_queue_ktype = {
.namespace = rx_queue_namespace
};
-static int rx_queue_add_kobject(struct net_device *net, int index)
+static int rx_queue_add_kobject(struct net_device *dev, int index)
{
- struct netdev_rx_queue *queue = net->_rx + index;
+ struct netdev_rx_queue *queue = dev->_rx + index;
struct kobject *kobj = &queue->kobj;
int error = 0;
- kobj->kset = net->queues_kset;
+ kobj->kset = dev->queues_kset;
error = kobject_init_and_add(kobj, &rx_queue_ktype, NULL,
"rx-%u", index);
if (error)
goto exit;
- if (net->sysfs_rx_queue_group) {
- error = sysfs_create_group(kobj, net->sysfs_rx_queue_group);
+ if (dev->sysfs_rx_queue_group) {
+ error = sysfs_create_group(kobj, dev->sysfs_rx_queue_group);
if (error)
goto exit;
}
@@ -805,18 +825,18 @@ exit:
#endif /* CONFIG_SYSFS */
int
-net_rx_queue_update_kobjects(struct net_device *net, int old_num, int new_num)
+net_rx_queue_update_kobjects(struct net_device *dev, int old_num, int new_num)
{
#ifdef CONFIG_SYSFS
int i;
int error = 0;
#ifndef CONFIG_RPS
- if (!net->sysfs_rx_queue_group)
+ if (!dev->sysfs_rx_queue_group)
return 0;
#endif
for (i = old_num; i < new_num; i++) {
- error = rx_queue_add_kobject(net, i);
+ error = rx_queue_add_kobject(dev, i);
if (error) {
new_num = old_num;
break;
@@ -824,10 +844,10 @@ net_rx_queue_update_kobjects(struct net_device *net, int old_num, int new_num)
}
while (--i >= new_num) {
- if (net->sysfs_rx_queue_group)
- sysfs_remove_group(&net->_rx[i].kobj,
- net->sysfs_rx_queue_group);
- kobject_put(&net->_rx[i].kobj);
+ if (dev->sysfs_rx_queue_group)
+ sysfs_remove_group(&dev->_rx[i].kobj,
+ dev->sysfs_rx_queue_group);
+ kobject_put(&dev->_rx[i].kobj);
}
return error;
@@ -1135,13 +1155,13 @@ static struct kobj_type netdev_queue_ktype = {
.namespace = netdev_queue_namespace,
};
-static int netdev_queue_add_kobject(struct net_device *net, int index)
+static int netdev_queue_add_kobject(struct net_device *dev, int index)
{
- struct netdev_queue *queue = net->_tx + index;
+ struct netdev_queue *queue = dev->_tx + index;
struct kobject *kobj = &queue->kobj;
int error = 0;
- kobj->kset = net->queues_kset;
+ kobj->kset = dev->queues_kset;
error = kobject_init_and_add(kobj, &netdev_queue_ktype, NULL,
"tx-%u", index);
if (error)
@@ -1164,14 +1184,14 @@ exit:
#endif /* CONFIG_SYSFS */
int
-netdev_queue_update_kobjects(struct net_device *net, int old_num, int new_num)
+netdev_queue_update_kobjects(struct net_device *dev, int old_num, int new_num)
{
#ifdef CONFIG_SYSFS
int i;
int error = 0;
for (i = old_num; i < new_num; i++) {
- error = netdev_queue_add_kobject(net, i);
+ error = netdev_queue_add_kobject(dev, i);
if (error) {
new_num = old_num;
break;
@@ -1179,7 +1199,7 @@ netdev_queue_update_kobjects(struct net_device *net, int old_num, int new_num)
}
while (--i >= new_num) {
- struct netdev_queue *queue = net->_tx + i;
+ struct netdev_queue *queue = dev->_tx + i;
#ifdef CONFIG_BQL
sysfs_remove_group(&queue->kobj, &dql_group);
@@ -1193,25 +1213,25 @@ netdev_queue_update_kobjects(struct net_device *net, int old_num, int new_num)
#endif /* CONFIG_SYSFS */
}
-static int register_queue_kobjects(struct net_device *net)
+static int register_queue_kobjects(struct net_device *dev)
{
int error = 0, txq = 0, rxq = 0, real_rx = 0, real_tx = 0;
#ifdef CONFIG_SYSFS
- net->queues_kset = kset_create_and_add("queues",
- NULL, &net->dev.kobj);
- if (!net->queues_kset)
+ dev->queues_kset = kset_create_and_add("queues",
+ NULL, &dev->dev.kobj);
+ if (!dev->queues_kset)
return -ENOMEM;
- real_rx = net->real_num_rx_queues;
+ real_rx = dev->real_num_rx_queues;
#endif
- real_tx = net->real_num_tx_queues;
+ real_tx = dev->real_num_tx_queues;
- error = net_rx_queue_update_kobjects(net, 0, real_rx);
+ error = net_rx_queue_update_kobjects(dev, 0, real_rx);
if (error)
goto error;
rxq = real_rx;
- error = netdev_queue_update_kobjects(net, 0, real_tx);
+ error = netdev_queue_update_kobjects(dev, 0, real_tx);
if (error)
goto error;
txq = real_tx;
@@ -1219,24 +1239,24 @@ static int register_queue_kobjects(struct net_device *net)
return 0;
error:
- netdev_queue_update_kobjects(net, txq, 0);
- net_rx_queue_update_kobjects(net, rxq, 0);
+ netdev_queue_update_kobjects(dev, txq, 0);
+ net_rx_queue_update_kobjects(dev, rxq, 0);
return error;
}
-static void remove_queue_kobjects(struct net_device *net)
+static void remove_queue_kobjects(struct net_device *dev)
{
int real_rx = 0, real_tx = 0;
#ifdef CONFIG_SYSFS
- real_rx = net->real_num_rx_queues;
+ real_rx = dev->real_num_rx_queues;
#endif
- real_tx = net->real_num_tx_queues;
+ real_tx = dev->real_num_tx_queues;
- net_rx_queue_update_kobjects(net, real_rx, 0);
- netdev_queue_update_kobjects(net, real_tx, 0);
+ net_rx_queue_update_kobjects(dev, real_rx, 0);
+ netdev_queue_update_kobjects(dev, real_tx, 0);
#ifdef CONFIG_SYSFS
- kset_unregister(net->queues_kset);
+ kset_unregister(dev->queues_kset);
#endif
}
@@ -1329,13 +1349,13 @@ static struct class net_class = {
/* Delete sysfs entries but hold kobject reference until after all
* netdev references are gone.
*/
-void netdev_unregister_kobject(struct net_device * net)
+void netdev_unregister_kobject(struct net_device *ndev)
{
- struct device *dev = &(net->dev);
+ struct device *dev = &(ndev->dev);
kobject_get(&dev->kobj);
- remove_queue_kobjects(net);
+ remove_queue_kobjects(ndev);
pm_runtime_set_memalloc_noio(dev, false);
@@ -1343,18 +1363,18 @@ void netdev_unregister_kobject(struct net_device * net)
}
/* Create sysfs entries for network device. */
-int netdev_register_kobject(struct net_device *net)
+int netdev_register_kobject(struct net_device *ndev)
{
- struct device *dev = &(net->dev);
- const struct attribute_group **groups = net->sysfs_groups;
+ struct device *dev = &(ndev->dev);
+ const struct attribute_group **groups = ndev->sysfs_groups;
int error = 0;
device_initialize(dev);
dev->class = &net_class;
- dev->platform_data = net;
+ dev->platform_data = ndev;
dev->groups = groups;
- dev_set_name(dev, "%s", net->name);
+ dev_set_name(dev, "%s", ndev->name);
#ifdef CONFIG_SYSFS
/* Allow for a device specific group */
@@ -1364,10 +1384,10 @@ int netdev_register_kobject(struct net_device *net)
*groups++ = &netstat_group;
#if IS_ENABLED(CONFIG_WIRELESS_EXT) || IS_ENABLED(CONFIG_CFG80211)
- if (net->ieee80211_ptr)
+ if (ndev->ieee80211_ptr)
*groups++ = &wireless_group;
#if IS_ENABLED(CONFIG_WIRELESS_EXT)
- else if (net->wireless_handlers)
+ else if (ndev->wireless_handlers)
*groups++ = &wireless_group;
#endif
#endif
@@ -1377,7 +1397,7 @@ int netdev_register_kobject(struct net_device *net)
if (error)
return error;
- error = register_queue_kobjects(net);
+ error = register_queue_kobjects(ndev);
if (error) {
device_del(dev);
return error;
diff --git a/net/core/netpoll.c b/net/core/netpoll.c
index e33937fb32a0..907fb5e36c02 100644
--- a/net/core/netpoll.c
+++ b/net/core/netpoll.c
@@ -822,7 +822,8 @@ void __netpoll_cleanup(struct netpoll *np)
RCU_INIT_POINTER(np->dev->npinfo, NULL);
call_rcu_bh(&npinfo->rcu, rcu_cleanup_netpoll_info);
- }
+ } else
+ RCU_INIT_POINTER(np->dev->npinfo, NULL);
}
EXPORT_SYMBOL_GPL(__netpoll_cleanup);
diff --git a/net/core/pktgen.c b/net/core/pktgen.c
index fc17a9d309ac..8b849ddfef2e 100644
--- a/net/core/pktgen.c
+++ b/net/core/pktgen.c
@@ -69,8 +69,9 @@
* for running devices in the if_list and sends packets until count is 0 it
* also the thread checks the thread->control which is used for inter-process
* communication. controlling process "posts" operations to the threads this
- * way. The if_lock should be possible to remove when add/rem_device is merged
- * into this too.
+ * way.
+ * The if_list is RCU protected, and the if_lock remains to protect updating
+ * of if_list, from "add_device" as it is invoked from userspace (via proc write).
*
* By design there should only be *one* "controlling" process. In practice
* multiple write accesses gives unpredictable result. Understood by "write"
@@ -208,7 +209,7 @@
#define T_REMDEVALL (1<<2) /* Remove all devs */
#define T_REMDEV (1<<3) /* Remove one dev */
-/* If lock -- can be removed after some work */
+/* If lock -- protects updating of if_list */
#define if_lock(t) spin_lock(&(t->if_lock));
#define if_unlock(t) spin_unlock(&(t->if_lock));
@@ -241,6 +242,7 @@ struct pktgen_dev {
struct proc_dir_entry *entry; /* proc file */
struct pktgen_thread *pg_thread;/* the owner */
struct list_head list; /* chaining in the thread's run-queue */
+ struct rcu_head rcu; /* freed by RCU */
int running; /* if false, the test will stop */
@@ -802,7 +804,6 @@ static int strn_len(const char __user * user_buffer, unsigned int maxlen)
case '\t':
case ' ':
goto done_str;
- break;
default:
break;
}
@@ -1737,14 +1738,14 @@ static int pktgen_thread_show(struct seq_file *seq, void *v)
seq_puts(seq, "Running: ");
- if_lock(t);
- list_for_each_entry(pkt_dev, &t->if_list, list)
+ rcu_read_lock();
+ list_for_each_entry_rcu(pkt_dev, &t->if_list, list)
if (pkt_dev->running)
seq_printf(seq, "%s ", pkt_dev->odevname);
seq_puts(seq, "\nStopped: ");
- list_for_each_entry(pkt_dev, &t->if_list, list)
+ list_for_each_entry_rcu(pkt_dev, &t->if_list, list)
if (!pkt_dev->running)
seq_printf(seq, "%s ", pkt_dev->odevname);
@@ -1753,7 +1754,7 @@ static int pktgen_thread_show(struct seq_file *seq, void *v)
else
seq_puts(seq, "\nResult: NA\n");
- if_unlock(t);
+ rcu_read_unlock();
return 0;
}
@@ -1878,10 +1879,8 @@ static struct pktgen_dev *__pktgen_NN_threads(const struct pktgen_net *pn,
pkt_dev = pktgen_find_dev(t, ifname, exact);
if (pkt_dev) {
if (remove) {
- if_lock(t);
pkt_dev->removal_mark = 1;
t->control |= T_REMDEV;
- if_unlock(t);
}
break;
}
@@ -1931,7 +1930,8 @@ static void pktgen_change_name(const struct pktgen_net *pn, struct net_device *d
list_for_each_entry(t, &pn->pktgen_threads, th_list) {
struct pktgen_dev *pkt_dev;
- list_for_each_entry(pkt_dev, &t->if_list, list) {
+ rcu_read_lock();
+ list_for_each_entry_rcu(pkt_dev, &t->if_list, list) {
if (pkt_dev->odev != dev)
continue;
@@ -1946,6 +1946,7 @@ static void pktgen_change_name(const struct pktgen_net *pn, struct net_device *d
dev->name);
break;
}
+ rcu_read_unlock();
}
}
@@ -2997,8 +2998,8 @@ static void pktgen_run(struct pktgen_thread *t)
func_enter();
- if_lock(t);
- list_for_each_entry(pkt_dev, &t->if_list, list) {
+ rcu_read_lock();
+ list_for_each_entry_rcu(pkt_dev, &t->if_list, list) {
/*
* setup odev and create initial packet.
@@ -3007,18 +3008,18 @@ static void pktgen_run(struct pktgen_thread *t)
if (pkt_dev->odev) {
pktgen_clear_counters(pkt_dev);
- pkt_dev->running = 1; /* Cranke yeself! */
pkt_dev->skb = NULL;
pkt_dev->started_at = pkt_dev->next_tx = ktime_get();
set_pkt_overhead(pkt_dev);
strcpy(pkt_dev->result, "Starting");
+ pkt_dev->running = 1; /* Cranke yeself! */
started++;
} else
strcpy(pkt_dev->result, "Error starting");
}
- if_unlock(t);
+ rcu_read_unlock();
if (started)
t->control &= ~(T_STOP);
}
@@ -3041,27 +3042,25 @@ static int thread_is_running(const struct pktgen_thread *t)
{
const struct pktgen_dev *pkt_dev;
- list_for_each_entry(pkt_dev, &t->if_list, list)
- if (pkt_dev->running)
+ rcu_read_lock();
+ list_for_each_entry_rcu(pkt_dev, &t->if_list, list)
+ if (pkt_dev->running) {
+ rcu_read_unlock();
return 1;
+ }
+ rcu_read_unlock();
return 0;
}
static int pktgen_wait_thread_run(struct pktgen_thread *t)
{
- if_lock(t);
-
while (thread_is_running(t)) {
- if_unlock(t);
-
msleep_interruptible(100);
if (signal_pending(current))
goto signal;
- if_lock(t);
}
- if_unlock(t);
return 1;
signal:
return 0;
@@ -3166,10 +3165,10 @@ static int pktgen_stop_device(struct pktgen_dev *pkt_dev)
return -EINVAL;
}
+ pkt_dev->running = 0;
kfree_skb(pkt_dev->skb);
pkt_dev->skb = NULL;
pkt_dev->stopped_at = ktime_get();
- pkt_dev->running = 0;
show_results(pkt_dev, nr_frags);
@@ -3180,9 +3179,8 @@ static struct pktgen_dev *next_to_run(struct pktgen_thread *t)
{
struct pktgen_dev *pkt_dev, *best = NULL;
- if_lock(t);
-
- list_for_each_entry(pkt_dev, &t->if_list, list) {
+ rcu_read_lock();
+ list_for_each_entry_rcu(pkt_dev, &t->if_list, list) {
if (!pkt_dev->running)
continue;
if (best == NULL)
@@ -3190,7 +3188,8 @@ static struct pktgen_dev *next_to_run(struct pktgen_thread *t)
else if (ktime_compare(pkt_dev->next_tx, best->next_tx) < 0)
best = pkt_dev;
}
- if_unlock(t);
+ rcu_read_unlock();
+
return best;
}
@@ -3200,13 +3199,13 @@ static void pktgen_stop(struct pktgen_thread *t)
func_enter();
- if_lock(t);
+ rcu_read_lock();
- list_for_each_entry(pkt_dev, &t->if_list, list) {
+ list_for_each_entry_rcu(pkt_dev, &t->if_list, list) {
pktgen_stop_device(pkt_dev);
}
- if_unlock(t);
+ rcu_read_unlock();
}
/*
@@ -3220,8 +3219,6 @@ static void pktgen_rem_one_if(struct pktgen_thread *t)
func_enter();
- if_lock(t);
-
list_for_each_safe(q, n, &t->if_list) {
cur = list_entry(q, struct pktgen_dev, list);
@@ -3235,8 +3232,6 @@ static void pktgen_rem_one_if(struct pktgen_thread *t)
break;
}
-
- if_unlock(t);
}
static void pktgen_rem_all_ifs(struct pktgen_thread *t)
@@ -3248,8 +3243,6 @@ static void pktgen_rem_all_ifs(struct pktgen_thread *t)
/* Remove all devices, free mem */
- if_lock(t);
-
list_for_each_safe(q, n, &t->if_list) {
cur = list_entry(q, struct pktgen_dev, list);
@@ -3258,8 +3251,6 @@ static void pktgen_rem_all_ifs(struct pktgen_thread *t)
pktgen_remove_device(t, cur);
}
-
- if_unlock(t);
}
static void pktgen_rem_thread(struct pktgen_thread *t)
@@ -3407,10 +3398,10 @@ static int pktgen_thread_worker(void *arg)
pr_debug("starting pktgen/%d: pid=%d\n", cpu, task_pid_nr(current));
- set_current_state(TASK_INTERRUPTIBLE);
-
set_freezable();
+ __set_current_state(TASK_RUNNING);
+
while (!kthread_should_stop()) {
pkt_dev = next_to_run(t);
@@ -3424,8 +3415,6 @@ static int pktgen_thread_worker(void *arg)
continue;
}
- __set_current_state(TASK_RUNNING);
-
if (likely(pkt_dev)) {
pktgen_xmit(pkt_dev);
@@ -3456,9 +3445,8 @@ static int pktgen_thread_worker(void *arg)
}
try_to_freeze();
-
- set_current_state(TASK_INTERRUPTIBLE);
}
+ set_current_state(TASK_INTERRUPTIBLE);
pr_debug("%s stopping all device\n", t->tsk->comm);
pktgen_stop(t);
@@ -3485,8 +3473,8 @@ static struct pktgen_dev *pktgen_find_dev(struct pktgen_thread *t,
struct pktgen_dev *p, *pkt_dev = NULL;
size_t len = strlen(ifname);
- if_lock(t);
- list_for_each_entry(p, &t->if_list, list)
+ rcu_read_lock();
+ list_for_each_entry_rcu(p, &t->if_list, list)
if (strncmp(p->odevname, ifname, len) == 0) {
if (p->odevname[len]) {
if (exact || p->odevname[len] != '@')
@@ -3496,7 +3484,7 @@ static struct pktgen_dev *pktgen_find_dev(struct pktgen_thread *t,
break;
}
- if_unlock(t);
+ rcu_read_unlock();
pr_debug("find_dev(%s) returning %p\n", ifname, pkt_dev);
return pkt_dev;
}
@@ -3510,6 +3498,12 @@ static int add_dev_to_thread(struct pktgen_thread *t,
{
int rv = 0;
+ /* This function cannot be called concurrently, as it is called
+ * under pktgen_thread_lock mutex, but it can run from
+ * userspace on another CPU than the kthread. The if_lock()
+ * is used here to sync with concurrent instances of
+ * _rem_dev_from_if_list() invoked via kthread, which is also
+ * updating the if_list */
if_lock(t);
if (pkt_dev->pg_thread) {
@@ -3518,9 +3512,9 @@ static int add_dev_to_thread(struct pktgen_thread *t,
goto out;
}
- list_add(&pkt_dev->list, &t->if_list);
- pkt_dev->pg_thread = t;
pkt_dev->running = 0;
+ pkt_dev->pg_thread = t;
+ list_add_rcu(&pkt_dev->list, &t->if_list);
out:
if_unlock(t);
@@ -3675,11 +3669,13 @@ static void _rem_dev_from_if_list(struct pktgen_thread *t,
struct list_head *q, *n;
struct pktgen_dev *p;
+ if_lock(t);
list_for_each_safe(q, n, &t->if_list) {
p = list_entry(q, struct pktgen_dev, list);
if (p == pkt_dev)
- list_del(&p->list);
+ list_del_rcu(&p->list);
}
+ if_unlock(t);
}
static int pktgen_remove_device(struct pktgen_thread *t,
@@ -3699,20 +3695,22 @@ static int pktgen_remove_device(struct pktgen_thread *t,
pkt_dev->odev = NULL;
}
- /* And update the thread if_list */
-
- _rem_dev_from_if_list(t, pkt_dev);
-
+ /* Remove proc before if_list entry, because add_device uses
+ * list to determine if interface already exists, avoiding a race
+ * with proc_create_data() */
if (pkt_dev->entry)
proc_remove(pkt_dev->entry);
+ /* And update the thread if_list */
+ _rem_dev_from_if_list(t, pkt_dev);
+
#ifdef CONFIG_XFRM
free_SAs(pkt_dev);
#endif
vfree(pkt_dev->flows);
if (pkt_dev->page)
put_page(pkt_dev->page);
- kfree(pkt_dev);
+ kfree_rcu(pkt_dev, rcu);
return 0;
}
@@ -3812,6 +3810,7 @@ static void __exit pg_cleanup(void)
{
unregister_netdevice_notifier(&pktgen_notifier_block);
unregister_pernet_subsys(&pg_net_ops);
+ /* Don't need rcu_barrier() due to use of kfree_rcu() */
}
module_init(pg_init);
diff --git a/net/core/ptp_classifier.c b/net/core/ptp_classifier.c
index d3027a73fd4b..4eab4a94a59d 100644
--- a/net/core/ptp_classifier.c
+++ b/net/core/ptp_classifier.c
@@ -52,14 +52,43 @@
* test_8021q:
* jneq #0x8100, test_ieee1588 ; ETH_P_8021Q ?
* ldh [16] ; load inner type
- * jneq #0x88f7, drop_ieee1588 ; ETH_P_1588 ?
+ * jneq #0x88f7, test_8021q_ipv4 ; ETH_P_1588 ?
* ldb [18] ; load payload
* and #0x8 ; as we don't have ports here, test
* jneq #0x0, drop_ieee1588 ; for PTP_GEN_BIT and drop these
* ldh [18] ; reload payload
* and #0xf ; mask PTP_CLASS_VMASK
- * or #0x40 ; PTP_CLASS_V2_VLAN
+ * or #0x70 ; PTP_CLASS_VLAN|PTP_CLASS_L2
+ * ret a ; return PTP class
+ *
+ * ; PTP over UDP over IPv4 over 802.1Q over Ethernet
+ * test_8021q_ipv4:
+ * jneq #0x800, test_8021q_ipv6 ; ETH_P_IP ?
+ * ldb [27] ; load proto
+ * jneq #17, drop_8021q_ipv4 ; IPPROTO_UDP ?
+ * ldh [24] ; load frag offset field
+ * jset #0x1fff, drop_8021q_ipv4; don't allow fragments
+ * ldxb 4*([18]&0xf) ; load IP header len
+ * ldh [x + 20] ; load UDP dst port
+ * jneq #319, drop_8021q_ipv4 ; is port PTP_EV_PORT ?
+ * ldh [x + 26] ; load payload
+ * and #0xf ; mask PTP_CLASS_VMASK
+ * or #0x50 ; PTP_CLASS_VLAN|PTP_CLASS_IPV4
+ * ret a ; return PTP class
+ * drop_8021q_ipv4: ret #0x0 ; PTP_CLASS_NONE
+ *
+ * ; PTP over UDP over IPv6 over 802.1Q over Ethernet
+ * test_8021q_ipv6:
+ * jneq #0x86dd, drop_8021q_ipv6 ; ETH_P_IPV6 ?
+ * ldb [24] ; load proto
+ * jneq #17, drop_8021q_ipv6 ; IPPROTO_UDP ?
+ * ldh [60] ; load UDP dst port
+ * jneq #319, drop_8021q_ipv6 ; is port PTP_EV_PORT ?
+ * ldh [66] ; load payload
+ * and #0xf ; mask PTP_CLASS_VMASK
+ * or #0x60 ; PTP_CLASS_VLAN|PTP_CLASS_IPV6
* ret a ; return PTP class
+ * drop_8021q_ipv6: ret #0x0 ; PTP_CLASS_NONE
*
* ; PTP over Ethernet
* test_ieee1588:
@@ -78,11 +107,11 @@
#include <linux/filter.h>
#include <linux/ptp_classify.h>
-static struct sk_filter *ptp_insns __read_mostly;
+static struct bpf_prog *ptp_insns __read_mostly;
unsigned int ptp_classify_raw(const struct sk_buff *skb)
{
- return SK_RUN_FILTER(ptp_insns, skb);
+ return BPF_PROG_RUN(ptp_insns, skb);
}
EXPORT_SYMBOL_GPL(ptp_classify_raw);
@@ -113,16 +142,39 @@ void __init ptp_classifier_init(void)
{ 0x44, 0, 0, 0x00000020 },
{ 0x16, 0, 0, 0x00000000 },
{ 0x06, 0, 0, 0x00000000 },
- { 0x15, 0, 9, 0x00008100 },
+ { 0x15, 0, 32, 0x00008100 },
{ 0x28, 0, 0, 0x00000010 },
- { 0x15, 0, 15, 0x000088f7 },
+ { 0x15, 0, 7, 0x000088f7 },
{ 0x30, 0, 0, 0x00000012 },
{ 0x54, 0, 0, 0x00000008 },
- { 0x15, 0, 12, 0x00000000 },
+ { 0x15, 0, 35, 0x00000000 },
{ 0x28, 0, 0, 0x00000012 },
{ 0x54, 0, 0, 0x0000000f },
- { 0x44, 0, 0, 0x00000040 },
+ { 0x44, 0, 0, 0x00000070 },
+ { 0x16, 0, 0, 0x00000000 },
+ { 0x15, 0, 12, 0x00000800 },
+ { 0x30, 0, 0, 0x0000001b },
+ { 0x15, 0, 9, 0x00000011 },
+ { 0x28, 0, 0, 0x00000018 },
+ { 0x45, 7, 0, 0x00001fff },
+ { 0xb1, 0, 0, 0x00000012 },
+ { 0x48, 0, 0, 0x00000014 },
+ { 0x15, 0, 4, 0x0000013f },
+ { 0x48, 0, 0, 0x0000001a },
+ { 0x54, 0, 0, 0x0000000f },
+ { 0x44, 0, 0, 0x00000050 },
+ { 0x16, 0, 0, 0x00000000 },
+ { 0x06, 0, 0, 0x00000000 },
+ { 0x15, 0, 8, 0x000086dd },
+ { 0x30, 0, 0, 0x00000018 },
+ { 0x15, 0, 6, 0x00000011 },
+ { 0x28, 0, 0, 0x0000003c },
+ { 0x15, 0, 4, 0x0000013f },
+ { 0x28, 0, 0, 0x00000042 },
+ { 0x54, 0, 0, 0x0000000f },
+ { 0x44, 0, 0, 0x00000060 },
{ 0x16, 0, 0, 0x00000000 },
+ { 0x06, 0, 0, 0x00000000 },
{ 0x15, 0, 7, 0x000088f7 },
{ 0x30, 0, 0, 0x0000000e },
{ 0x54, 0, 0, 0x00000008 },
@@ -137,5 +189,5 @@ void __init ptp_classifier_init(void)
.len = ARRAY_SIZE(ptp_filter), .filter = ptp_filter,
};
- BUG_ON(sk_unattached_filter_create(&ptp_insns, &ptp_prog));
+ BUG_ON(bpf_prog_create(&ptp_insns, &ptp_prog));
}
diff --git a/net/core/request_sock.c b/net/core/request_sock.c
index 467f326126e0..04db318e6218 100644
--- a/net/core/request_sock.c
+++ b/net/core/request_sock.c
@@ -41,27 +41,27 @@ int reqsk_queue_alloc(struct request_sock_queue *queue,
unsigned int nr_table_entries)
{
size_t lopt_size = sizeof(struct listen_sock);
- struct listen_sock *lopt;
+ struct listen_sock *lopt = NULL;
nr_table_entries = min_t(u32, nr_table_entries, sysctl_max_syn_backlog);
nr_table_entries = max_t(u32, nr_table_entries, 8);
nr_table_entries = roundup_pow_of_two(nr_table_entries + 1);
lopt_size += nr_table_entries * sizeof(struct request_sock *);
- if (lopt_size > PAGE_SIZE)
+
+ if (lopt_size <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER))
+ lopt = kzalloc(lopt_size, GFP_KERNEL |
+ __GFP_NOWARN |
+ __GFP_NORETRY);
+ if (!lopt)
lopt = vzalloc(lopt_size);
- else
- lopt = kzalloc(lopt_size, GFP_KERNEL);
- if (lopt == NULL)
+ if (!lopt)
return -ENOMEM;
- for (lopt->max_qlen_log = 3;
- (1 << lopt->max_qlen_log) < nr_table_entries;
- lopt->max_qlen_log++);
-
get_random_bytes(&lopt->hash_rnd, sizeof(lopt->hash_rnd));
rwlock_init(&queue->syn_wait_lock);
queue->rskq_accept_head = NULL;
lopt->nr_table_entries = nr_table_entries;
+ lopt->max_qlen_log = ilog2(nr_table_entries);
write_lock_bh(&queue->syn_wait_lock);
queue->listen_opt = lopt;
@@ -72,22 +72,8 @@ int reqsk_queue_alloc(struct request_sock_queue *queue,
void __reqsk_queue_destroy(struct request_sock_queue *queue)
{
- struct listen_sock *lopt;
- size_t lopt_size;
-
- /*
- * this is an error recovery path only
- * no locking needed and the lopt is not NULL
- */
-
- lopt = queue->listen_opt;
- lopt_size = sizeof(struct listen_sock) +
- lopt->nr_table_entries * sizeof(struct request_sock *);
-
- if (lopt_size > PAGE_SIZE)
- vfree(lopt);
- else
- kfree(lopt);
+ /* This is an error recovery path only, no locking needed */
+ kvfree(queue->listen_opt);
}
static inline struct listen_sock *reqsk_queue_yank_listen_sk(
@@ -107,8 +93,6 @@ void reqsk_queue_destroy(struct request_sock_queue *queue)
{
/* make all the listen_opt local to us */
struct listen_sock *lopt = reqsk_queue_yank_listen_sk(queue);
- size_t lopt_size = sizeof(struct listen_sock) +
- lopt->nr_table_entries * sizeof(struct request_sock *);
if (lopt->qlen != 0) {
unsigned int i;
@@ -125,10 +109,7 @@ void reqsk_queue_destroy(struct request_sock_queue *queue)
}
WARN_ON(lopt->qlen != 0);
- if (lopt_size > PAGE_SIZE)
- vfree(lopt);
- else
- kfree(lopt);
+ kvfree(lopt);
}
/*
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 1063996f8317..8d39071f32d7 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -299,7 +299,12 @@ int __rtnl_link_register(struct rtnl_link_ops *ops)
if (rtnl_link_ops_get(ops->kind))
return -EEXIST;
- if (!ops->dellink)
+ /* The check for setup is here because if ops
+ * does not have that filled up, it is not possible
+ * to use the ops for creating device. So do not
+ * fill up dellink as well. That disables rtnl_dellink.
+ */
+ if (ops->setup && !ops->dellink)
ops->dellink = unregister_netdevice_queue;
list_add_tail(&ops->list, &link_ops);
@@ -1777,7 +1782,7 @@ static int rtnl_dellink(struct sk_buff *skb, struct nlmsghdr *nlh)
return -ENODEV;
ops = dev->rtnl_link_ops;
- if (!ops)
+ if (!ops || !ops->dellink)
return -EOPNOTSUPP;
ops->dellink(dev, &list_kill);
@@ -1805,7 +1810,8 @@ int rtnl_configure_link(struct net_device *dev, const struct ifinfomsg *ifm)
EXPORT_SYMBOL(rtnl_configure_link);
struct net_device *rtnl_create_link(struct net *net,
- char *ifname, const struct rtnl_link_ops *ops, struct nlattr *tb[])
+ char *ifname, unsigned char name_assign_type,
+ const struct rtnl_link_ops *ops, struct nlattr *tb[])
{
int err;
struct net_device *dev;
@@ -1823,8 +1829,8 @@ struct net_device *rtnl_create_link(struct net *net,
num_rx_queues = ops->get_num_rx_queues();
err = -ENOMEM;
- dev = alloc_netdev_mqs(ops->priv_size, ifname, ops->setup,
- num_tx_queues, num_rx_queues);
+ dev = alloc_netdev_mqs(ops->priv_size, ifname, name_assign_type,
+ ops->setup, num_tx_queues, num_rx_queues);
if (!dev)
goto err;
@@ -1889,6 +1895,7 @@ static int rtnl_newlink(struct sk_buff *skb, struct nlmsghdr *nlh)
char ifname[IFNAMSIZ];
struct nlattr *tb[IFLA_MAX+1];
struct nlattr *linkinfo[IFLA_INFO_MAX+1];
+ unsigned char name_assign_type = NET_NAME_USER;
int err;
#ifdef CONFIG_MODULES
@@ -2038,14 +2045,19 @@ replay:
return -EOPNOTSUPP;
}
- if (!ifname[0])
+ if (!ops->setup)
+ return -EOPNOTSUPP;
+
+ if (!ifname[0]) {
snprintf(ifname, IFNAMSIZ, "%s%%d", ops->kind);
+ name_assign_type = NET_NAME_ENUM;
+ }
dest_net = rtnl_link_get_net(net, tb);
if (IS_ERR(dest_net))
return PTR_ERR(dest_net);
- dev = rtnl_create_link(dest_net, ifname, ops, tb);
+ dev = rtnl_create_link(dest_net, ifname, name_assign_type, ops, tb);
if (IS_ERR(dev)) {
err = PTR_ERR(dev);
goto out;
@@ -2380,22 +2392,20 @@ int ndo_dflt_fdb_del(struct ndmsg *ndm,
struct net_device *dev,
const unsigned char *addr)
{
- int err = -EOPNOTSUPP;
+ int err = -EINVAL;
/* If aging addresses are supported device will need to
* implement its own handler for this.
*/
if (!(ndm->ndm_state & NUD_PERMANENT)) {
pr_info("%s: FDB only supports static addresses\n", dev->name);
- return -EINVAL;
+ return err;
}
if (is_unicast_ether_addr(addr) || is_link_local_ether_addr(addr))
err = dev_uc_del(dev, addr);
else if (is_multicast_ether_addr(addr))
err = dev_mc_del(dev, addr);
- else
- err = -EINVAL;
return err;
}
@@ -2509,6 +2519,7 @@ skip:
int ndo_dflt_fdb_dump(struct sk_buff *skb,
struct netlink_callback *cb,
struct net_device *dev,
+ struct net_device *filter_dev,
int idx)
{
int err;
@@ -2526,28 +2537,72 @@ EXPORT_SYMBOL(ndo_dflt_fdb_dump);
static int rtnl_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb)
{
- int idx = 0;
- struct net *net = sock_net(skb->sk);
struct net_device *dev;
+ struct nlattr *tb[IFLA_MAX+1];
+ struct net_device *bdev = NULL;
+ struct net_device *br_dev = NULL;
+ const struct net_device_ops *ops = NULL;
+ const struct net_device_ops *cops = NULL;
+ struct ifinfomsg *ifm = nlmsg_data(cb->nlh);
+ struct net *net = sock_net(skb->sk);
+ int brport_idx = 0;
+ int br_idx = 0;
+ int idx = 0;
- rcu_read_lock();
- for_each_netdev_rcu(net, dev) {
- if (dev->priv_flags & IFF_BRIDGE_PORT) {
- struct net_device *br_dev;
- const struct net_device_ops *ops;
+ if (nlmsg_parse(cb->nlh, sizeof(struct ifinfomsg), tb, IFLA_MAX,
+ ifla_policy) == 0) {
+ if (tb[IFLA_MASTER])
+ br_idx = nla_get_u32(tb[IFLA_MASTER]);
+ }
+
+ brport_idx = ifm->ifi_index;
- br_dev = netdev_master_upper_dev_get(dev);
- ops = br_dev->netdev_ops;
- if (ops->ndo_fdb_dump)
- idx = ops->ndo_fdb_dump(skb, cb, dev, idx);
+ if (br_idx) {
+ br_dev = __dev_get_by_index(net, br_idx);
+ if (!br_dev)
+ return -ENODEV;
+
+ ops = br_dev->netdev_ops;
+ bdev = br_dev;
+ }
+
+ for_each_netdev(net, dev) {
+ if (brport_idx && (dev->ifindex != brport_idx))
+ continue;
+
+ if (!br_idx) { /* user did not specify a specific bridge */
+ if (dev->priv_flags & IFF_BRIDGE_PORT) {
+ br_dev = netdev_master_upper_dev_get(dev);
+ cops = br_dev->netdev_ops;
+ }
+
+ bdev = dev;
+ } else {
+ if (dev != br_dev &&
+ !(dev->priv_flags & IFF_BRIDGE_PORT))
+ continue;
+
+ if (br_dev != netdev_master_upper_dev_get(dev) &&
+ !(dev->priv_flags & IFF_EBRIDGE))
+ continue;
+
+ bdev = br_dev;
+ cops = ops;
+ }
+
+ if (dev->priv_flags & IFF_BRIDGE_PORT) {
+ if (cops && cops->ndo_fdb_dump)
+ idx = cops->ndo_fdb_dump(skb, cb, br_dev, dev,
+ idx);
}
+ idx = ndo_dflt_fdb_dump(skb, cb, dev, NULL, idx);
if (dev->netdev_ops->ndo_fdb_dump)
- idx = dev->netdev_ops->ndo_fdb_dump(skb, cb, dev, idx);
- else
- idx = ndo_dflt_fdb_dump(skb, cb, dev, idx);
+ idx = dev->netdev_ops->ndo_fdb_dump(skb, cb, bdev, dev,
+ idx);
+
+ cops = NULL;
}
- rcu_read_unlock();
cb->args[0] = idx;
return skb->len;
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index c1a33033cbe2..224506a6fa80 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -2976,9 +2976,9 @@ struct sk_buff *skb_segment(struct sk_buff *head_skb,
tail = nskb;
__copy_skb_header(nskb, head_skb);
- nskb->mac_len = head_skb->mac_len;
skb_headers_offset_update(nskb, skb_headroom(nskb) - headroom);
+ skb_reset_mac_len(nskb);
skb_copy_from_linear_data_offset(head_skb, -tnl_hlen,
nskb->data - tnl_hlen,
@@ -3490,10 +3490,10 @@ int sock_queue_err_skb(struct sock *sk, struct sk_buff *skb)
}
EXPORT_SYMBOL(sock_queue_err_skb);
-void skb_tstamp_tx(struct sk_buff *orig_skb,
- struct skb_shared_hwtstamps *hwtstamps)
+void __skb_tstamp_tx(struct sk_buff *orig_skb,
+ struct skb_shared_hwtstamps *hwtstamps,
+ struct sock *sk, int tstype)
{
- struct sock *sk = orig_skb->sk;
struct sock_exterr_skb *serr;
struct sk_buff *skb;
int err;
@@ -3521,12 +3521,26 @@ void skb_tstamp_tx(struct sk_buff *orig_skb,
memset(serr, 0, sizeof(*serr));
serr->ee.ee_errno = ENOMSG;
serr->ee.ee_origin = SO_EE_ORIGIN_TIMESTAMPING;
+ serr->ee.ee_info = tstype;
+ if (sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID) {
+ serr->ee.ee_data = skb_shinfo(skb)->tskey;
+ if (sk->sk_protocol == IPPROTO_TCP)
+ serr->ee.ee_data -= sk->sk_tskey;
+ }
err = sock_queue_err_skb(sk, skb);
if (err)
kfree_skb(skb);
}
+EXPORT_SYMBOL_GPL(__skb_tstamp_tx);
+
+void skb_tstamp_tx(struct sk_buff *orig_skb,
+ struct skb_shared_hwtstamps *hwtstamps)
+{
+ return __skb_tstamp_tx(orig_skb, hwtstamps, orig_skb->sk,
+ SCM_TSTAMP_SND);
+}
EXPORT_SYMBOL_GPL(skb_tstamp_tx);
void skb_complete_wifi_ack(struct sk_buff *skb, bool acked)
diff --git a/net/core/sock.c b/net/core/sock.c
index 026e01f70274..2714811afbd8 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -491,7 +491,7 @@ int sk_receive_skb(struct sock *sk, struct sk_buff *skb, const int nested)
skb->dev = NULL;
- if (sk_rcvqueues_full(sk, skb, sk->sk_rcvbuf)) {
+ if (sk_rcvqueues_full(sk, sk->sk_rcvbuf)) {
atomic_inc(&sk->sk_drops);
goto discard_and_relse;
}
@@ -848,24 +848,25 @@ set_rcvbuf:
ret = -EINVAL;
break;
}
- sock_valbool_flag(sk, SOCK_TIMESTAMPING_TX_HARDWARE,
- val & SOF_TIMESTAMPING_TX_HARDWARE);
- sock_valbool_flag(sk, SOCK_TIMESTAMPING_TX_SOFTWARE,
- val & SOF_TIMESTAMPING_TX_SOFTWARE);
- sock_valbool_flag(sk, SOCK_TIMESTAMPING_RX_HARDWARE,
- val & SOF_TIMESTAMPING_RX_HARDWARE);
+ if (val & SOF_TIMESTAMPING_OPT_ID &&
+ !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) {
+ if (sk->sk_protocol == IPPROTO_TCP) {
+ if (sk->sk_state != TCP_ESTABLISHED) {
+ ret = -EINVAL;
+ break;
+ }
+ sk->sk_tskey = tcp_sk(sk)->snd_una;
+ } else {
+ sk->sk_tskey = 0;
+ }
+ }
+ sk->sk_tsflags = val;
if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
sock_enable_timestamp(sk,
SOCK_TIMESTAMPING_RX_SOFTWARE);
else
sock_disable_timestamp(sk,
(1UL << SOCK_TIMESTAMPING_RX_SOFTWARE));
- sock_valbool_flag(sk, SOCK_TIMESTAMPING_SOFTWARE,
- val & SOF_TIMESTAMPING_SOFTWARE);
- sock_valbool_flag(sk, SOCK_TIMESTAMPING_SYS_HARDWARE,
- val & SOF_TIMESTAMPING_SYS_HARDWARE);
- sock_valbool_flag(sk, SOCK_TIMESTAMPING_RAW_HARDWARE,
- val & SOF_TIMESTAMPING_RAW_HARDWARE);
break;
case SO_RCVLOWAT:
@@ -1091,21 +1092,7 @@ int sock_getsockopt(struct socket *sock, int level, int optname,
break;
case SO_TIMESTAMPING:
- v.val = 0;
- if (sock_flag(sk, SOCK_TIMESTAMPING_TX_HARDWARE))
- v.val |= SOF_TIMESTAMPING_TX_HARDWARE;
- if (sock_flag(sk, SOCK_TIMESTAMPING_TX_SOFTWARE))
- v.val |= SOF_TIMESTAMPING_TX_SOFTWARE;
- if (sock_flag(sk, SOCK_TIMESTAMPING_RX_HARDWARE))
- v.val |= SOF_TIMESTAMPING_RX_HARDWARE;
- if (sock_flag(sk, SOCK_TIMESTAMPING_RX_SOFTWARE))
- v.val |= SOF_TIMESTAMPING_RX_SOFTWARE;
- if (sock_flag(sk, SOCK_TIMESTAMPING_SOFTWARE))
- v.val |= SOF_TIMESTAMPING_SOFTWARE;
- if (sock_flag(sk, SOCK_TIMESTAMPING_SYS_HARDWARE))
- v.val |= SOF_TIMESTAMPING_SYS_HARDWARE;
- if (sock_flag(sk, SOCK_TIMESTAMPING_RAW_HARDWARE))
- v.val |= SOF_TIMESTAMPING_RAW_HARDWARE;
+ v.val = sk->sk_tsflags;
break;
case SO_RCVTIMEO:
@@ -1478,6 +1465,7 @@ static void sk_update_clone(const struct sock *sk, struct sock *newsk)
struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
{
struct sock *newsk;
+ bool is_charged = true;
newsk = sk_prot_alloc(sk->sk_prot, priority, sk->sk_family);
if (newsk != NULL) {
@@ -1522,9 +1510,13 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
filter = rcu_dereference_protected(newsk->sk_filter, 1);
if (filter != NULL)
- sk_filter_charge(newsk, filter);
+ /* though it's an empty new sock, the charging may fail
+ * if sysctl_optmem_max was changed between creation of
+ * original socket and cloning
+ */
+ is_charged = sk_filter_charge(newsk, filter);
- if (unlikely(xfrm_sk_clone_policy(newsk))) {
+ if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk))) {
/* It is still raw copy of parent, so invalidate
* destructor and make plain sk_free() */
newsk->sk_destruct = NULL;
diff --git a/net/core/sock_diag.c b/net/core/sock_diag.c
index a4216a4c9572..ad704c757bb4 100644
--- a/net/core/sock_diag.c
+++ b/net/core/sock_diag.c
@@ -68,8 +68,8 @@ int sock_diag_put_filterinfo(bool may_report_filterinfo, struct sock *sk,
if (!filter)
goto out;
- fprog = filter->orig_prog;
- flen = sk_filter_proglen(fprog);
+ fprog = filter->prog->orig_prog;
+ flen = bpf_classic_proglen(fprog);
attr = nla_reserve(skb, attrtype, flen);
if (attr == NULL) {
diff --git a/net/core/timestamping.c b/net/core/timestamping.c
index 6521dfd8b7c8..a8770391ea5b 100644
--- a/net/core/timestamping.c
+++ b/net/core/timestamping.c
@@ -43,31 +43,22 @@ void skb_clone_tx_timestamp(struct sk_buff *skb)
return;
type = classify(skb);
+ if (type == PTP_CLASS_NONE)
+ return;
+
+ phydev = skb->dev->phydev;
+ if (likely(phydev->drv->txtstamp)) {
+ if (!atomic_inc_not_zero(&sk->sk_refcnt))
+ return;
- switch (type) {
- case PTP_CLASS_V1_IPV4:
- case PTP_CLASS_V1_IPV6:
- case PTP_CLASS_V2_IPV4:
- case PTP_CLASS_V2_IPV6:
- case PTP_CLASS_V2_L2:
- case PTP_CLASS_V2_VLAN:
- phydev = skb->dev->phydev;
- if (likely(phydev->drv->txtstamp)) {
- if (!atomic_inc_not_zero(&sk->sk_refcnt))
- return;
-
- clone = skb_clone(skb, GFP_ATOMIC);
- if (!clone) {
- sock_put(sk);
- return;
- }
-
- clone->sk = sk;
- phydev->drv->txtstamp(phydev, clone, type);
+ clone = skb_clone(skb, GFP_ATOMIC);
+ if (!clone) {
+ sock_put(sk);
+ return;
}
- break;
- default:
- break;
+
+ clone->sk = sk;
+ phydev->drv->txtstamp(phydev, clone, type);
}
}
EXPORT_SYMBOL_GPL(skb_clone_tx_timestamp);
@@ -114,20 +105,12 @@ bool skb_defer_rx_timestamp(struct sk_buff *skb)
__skb_pull(skb, ETH_HLEN);
- switch (type) {
- case PTP_CLASS_V1_IPV4:
- case PTP_CLASS_V1_IPV6:
- case PTP_CLASS_V2_IPV4:
- case PTP_CLASS_V2_IPV6:
- case PTP_CLASS_V2_L2:
- case PTP_CLASS_V2_VLAN:
- phydev = skb->dev->phydev;
- if (likely(phydev->drv->rxtstamp))
- return phydev->drv->rxtstamp(phydev, skb, type);
- break;
- default:
- break;
- }
+ if (type == PTP_CLASS_NONE)
+ return false;
+
+ phydev = skb->dev->phydev;
+ if (likely(phydev->drv->rxtstamp))
+ return phydev->drv->rxtstamp(phydev, skb, type);
return false;
}