From fe0b082fedd1d09c73c48883f04a9fe2967b5899 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Sat, 23 Jun 2018 13:46:39 -0700 Subject: net_sched: remove unused htb drop_list After commit a09ceb0e0814 ("sched: remove qdisc->drop"), it is no longer used. Cc: Florian Westphal Signed-off-by: Cong Wang Signed-off-by: David S. Miller --- net/sched/sch_htb.c | 13 ------------- 1 file changed, 13 deletions(-) (limited to 'net/sched') diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c index 2a4ab7caf553..43c4bfe625a9 100644 --- a/net/sched/sch_htb.c +++ b/net/sched/sch_htb.c @@ -126,7 +126,6 @@ struct htb_class { union { struct htb_class_leaf { - struct list_head drop_list; int deficit[TC_HTB_MAXDEPTH]; struct Qdisc *q; } leaf; @@ -171,7 +170,6 @@ struct htb_sched { struct qdisc_watchdog watchdog; s64 now; /* cached dequeue time */ - struct list_head drops[TC_HTB_NUMPRIO];/* active leaves (for drops) */ /* time of nearest event per level (row) */ s64 near_ev_cache[TC_HTB_MAXDEPTH]; @@ -562,8 +560,6 @@ static inline void htb_activate(struct htb_sched *q, struct htb_class *cl) if (!cl->prio_activity) { cl->prio_activity = 1 << cl->prio; htb_activate_prios(q, cl); - list_add_tail(&cl->un.leaf.drop_list, - q->drops + cl->prio); } } @@ -579,7 +575,6 @@ static inline void htb_deactivate(struct htb_sched *q, struct htb_class *cl) htb_deactivate_prios(q, cl); cl->prio_activity = 0; - list_del_init(&cl->un.leaf.drop_list); } static void htb_enqueue_tail(struct sk_buff *skb, struct Qdisc *sch, @@ -981,7 +976,6 @@ static void htb_reset(struct Qdisc *sch) else { if (cl->un.leaf.q) qdisc_reset(cl->un.leaf.q); - INIT_LIST_HEAD(&cl->un.leaf.drop_list); } cl->prio_activity = 0; cl->cmode = HTB_CAN_SEND; @@ -993,8 +987,6 @@ static void htb_reset(struct Qdisc *sch) sch->qstats.backlog = 0; memset(q->hlevel, 0, sizeof(q->hlevel)); memset(q->row_mask, 0, sizeof(q->row_mask)); - for (i = 0; i < TC_HTB_NUMPRIO; i++) - INIT_LIST_HEAD(q->drops + i); } static const struct nla_policy htb_policy[TCA_HTB_MAX + 1] = { @@ -1024,7 +1016,6 @@ static int htb_init(struct Qdisc *sch, struct nlattr *opt, struct nlattr *tb[TCA_HTB_MAX + 1]; struct tc_htb_glob *gopt; int err; - int i; qdisc_watchdog_init(&q->watchdog, sch); INIT_WORK(&q->work, htb_work_func); @@ -1050,8 +1041,6 @@ static int htb_init(struct Qdisc *sch, struct nlattr *opt, err = qdisc_class_hash_init(&q->clhash); if (err < 0) return err; - for (i = 0; i < TC_HTB_NUMPRIO; i++) - INIT_LIST_HEAD(q->drops + i); qdisc_skb_head_init(&q->direct_queue); @@ -1224,7 +1213,6 @@ static void htb_parent_to_leaf(struct htb_sched *q, struct htb_class *cl, parent->level = 0; memset(&parent->un.inner, 0, sizeof(parent->un.inner)); - INIT_LIST_HEAD(&parent->un.leaf.drop_list); parent->un.leaf.q = new_q ? new_q : &noop_qdisc; parent->tokens = parent->buffer; parent->ctokens = parent->cbuffer; @@ -1418,7 +1406,6 @@ static int htb_change_class(struct Qdisc *sch, u32 classid, } cl->children = 0; - INIT_LIST_HEAD(&cl->un.leaf.drop_list); RB_CLEAR_NODE(&cl->pq_node); for (prio = 0; prio < TC_HTB_NUMPRIO; prio++) -- cgit v1.2.3 From 60513bd82c825b659c05957e4f8106ba06f0797f Mon Sep 17 00:00:00 2001 From: John Hurley Date: Mon, 25 Jun 2018 14:30:04 -0700 Subject: net: sched: pass extack pointer to block binds and cb registration Pass the extack struct from a tc qdisc add to the block bind function and, in turn, to the setup_tc ndo of the binding device via the tc_block_offload struct. Pass this back to any block callback registrations to allow netlink logging of failures in the bind process.
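For illustration, a minimal sketch of the driver-side pattern this enables, using a hypothetical foo driver and callback (the real conversions follow in the diff):

/* Hypothetical driver callback, mirroring the converted drivers below. */
static int foo_setup_tc_block_cb(enum tc_setup_type type, void *type_data,
				 void *cb_priv);

static int foo_setup_tc_block(struct net_device *dev,
			      struct tc_block_offload *f)
{
	struct foo_priv *priv = netdev_priv(dev);

	if (f->binder_type != TCF_BLOCK_BINDER_TYPE_CLSACT_INGRESS)
		return -EOPNOTSUPP;

	switch (f->command) {
	case TC_BLOCK_BIND:
		/* f->extack lets the core attach a netlink error message
		 * if this registration is rejected.
		 */
		return tcf_block_cb_register(f->block, foo_setup_tc_block_cb,
					     priv, priv, f->extack);
	case TC_BLOCK_UNBIND:
		tcf_block_cb_unregister(f->block, foo_setup_tc_block_cb, priv);
		return 0;
	default:
		return -EOPNOTSUPP;
	}
}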
Signed-off-by: John Hurley Signed-off-by: Jakub Kicinski Acked-by: Jiri Pirko Signed-off-by: David S. Miller --- drivers/net/ethernet/broadcom/bnxt/bnxt.c | 2 +- drivers/net/ethernet/broadcom/bnxt/bnxt_vfr.c | 2 +- drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c | 2 +- drivers/net/ethernet/intel/i40e/i40e_main.c | 2 +- drivers/net/ethernet/intel/i40evf/i40evf_main.c | 2 +- drivers/net/ethernet/intel/igb/igb_main.c | 2 +- drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 2 +- drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 2 +- drivers/net/ethernet/mellanox/mlx5/core/en_rep.c | 2 +- drivers/net/ethernet/mellanox/mlxsw/spectrum.c | 10 +++++---- drivers/net/ethernet/netronome/nfp/bpf/main.c | 2 +- .../net/ethernet/netronome/nfp/flower/offload.c | 2 +- drivers/net/ethernet/stmicro/stmmac/stmmac_main.c | 2 +- drivers/net/netdevsim/netdev.c | 2 +- include/net/pkt_cls.h | 11 ++++++---- net/dsa/slave.c | 2 +- net/sched/cls_api.c | 25 ++++++++++++++-------- 17 files changed, 43 insertions(+), 31 deletions(-) (limited to 'net/sched') diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index 176fc9f4d7de..b5fc6414a951 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -7984,7 +7984,7 @@ static int bnxt_setup_tc_block(struct net_device *dev, switch (f->command) { case TC_BLOCK_BIND: return tcf_block_cb_register(f->block, bnxt_setup_tc_block_cb, - bp, bp); + bp, bp, f->extack); case TC_BLOCK_UNBIND: tcf_block_cb_unregister(f->block, bnxt_setup_tc_block_cb, bp); return 0; diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_vfr.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_vfr.c index 05d405905906..0745f2dfc80c 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_vfr.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_vfr.c @@ -173,7 +173,7 @@ static int bnxt_vf_rep_setup_tc_block(struct net_device *dev, case TC_BLOCK_BIND: return tcf_block_cb_register(f->block, bnxt_vf_rep_setup_tc_block_cb, - vf_rep, vf_rep); + vf_rep, vf_rep, f->extack); case TC_BLOCK_UNBIND: tcf_block_cb_unregister(f->block, bnxt_vf_rep_setup_tc_block_cb, vf_rep); diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c index bc03c175a3cd..96bc177d54de 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c +++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c @@ -3016,7 +3016,7 @@ static int cxgb_setup_tc_block(struct net_device *dev, switch (f->command) { case TC_BLOCK_BIND: return tcf_block_cb_register(f->block, cxgb_setup_tc_block_cb, - pi, dev); + pi, dev, f->extack); case TC_BLOCK_UNBIND: tcf_block_cb_unregister(f->block, cxgb_setup_tc_block_cb, pi); return 0; diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c index 7ad2b1b0b125..426b0ccb1fc6 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_main.c +++ b/drivers/net/ethernet/intel/i40e/i40e_main.c @@ -7554,7 +7554,7 @@ static int i40e_setup_tc_block(struct net_device *dev, switch (f->command) { case TC_BLOCK_BIND: return tcf_block_cb_register(f->block, i40e_setup_tc_block_cb, - np, np); + np, np, f->extack); case TC_BLOCK_UNBIND: tcf_block_cb_unregister(f->block, i40e_setup_tc_block_cb, np); return 0; diff --git a/drivers/net/ethernet/intel/i40evf/i40evf_main.c b/drivers/net/ethernet/intel/i40evf/i40evf_main.c index dc56a8667495..5906c1c1d19d 100644 --- a/drivers/net/ethernet/intel/i40evf/i40evf_main.c +++ b/drivers/net/ethernet/intel/i40evf/i40evf_main.c @@ -2926,7 
+2926,7 @@ static int i40evf_setup_tc_block(struct net_device *dev, switch (f->command) { case TC_BLOCK_BIND: return tcf_block_cb_register(f->block, i40evf_setup_tc_block_cb, - adapter, adapter); + adapter, adapter, f->extack); case TC_BLOCK_UNBIND: tcf_block_cb_unregister(f->block, i40evf_setup_tc_block_cb, adapter); diff --git a/drivers/net/ethernet/intel/igb/igb_main.c b/drivers/net/ethernet/intel/igb/igb_main.c index 6a78d8272eb2..f1e3397bd405 100644 --- a/drivers/net/ethernet/intel/igb/igb_main.c +++ b/drivers/net/ethernet/intel/igb/igb_main.c @@ -2728,7 +2728,7 @@ static int igb_setup_tc_block(struct igb_adapter *adapter, switch (f->command) { case TC_BLOCK_BIND: return tcf_block_cb_register(f->block, igb_setup_tc_block_cb, - adapter, adapter); + adapter, adapter, f->extack); case TC_BLOCK_UNBIND: tcf_block_cb_unregister(f->block, igb_setup_tc_block_cb, adapter); diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c index 3e87dbbc9024..d29bd8fc3ff3 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c @@ -9325,7 +9325,7 @@ static int ixgbe_setup_tc_block(struct net_device *dev, switch (f->command) { case TC_BLOCK_BIND: return tcf_block_cb_register(f->block, ixgbe_setup_tc_block_cb, - adapter, adapter); + adapter, adapter, f->extack); case TC_BLOCK_UNBIND: tcf_block_cb_unregister(f->block, ixgbe_setup_tc_block_cb, adapter); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index 56c1b6f5593e..134f20a182b5 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -3371,7 +3371,7 @@ static int mlx5e_setup_tc_block(struct net_device *dev, switch (f->command) { case TC_BLOCK_BIND: return tcf_block_cb_register(f->block, mlx5e_setup_tc_block_cb, - priv, priv); + priv, priv, f->extack); case TC_BLOCK_UNBIND: tcf_block_cb_unregister(f->block, mlx5e_setup_tc_block_cb, priv); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c index 57987f6546e8..3f2fe95e01d9 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c @@ -797,7 +797,7 @@ static int mlx5e_rep_setup_tc_block(struct net_device *dev, switch (f->command) { case TC_BLOCK_BIND: return tcf_block_cb_register(f->block, mlx5e_rep_setup_tc_cb, - priv, priv); + priv, priv, f->extack); case TC_BLOCK_UNBIND: tcf_block_cb_unregister(f->block, mlx5e_rep_setup_tc_cb, priv); return 0; diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c index 968b88af2ef5..d2bc335dda11 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c @@ -1503,7 +1503,8 @@ static int mlxsw_sp_setup_tc_block_cb_flower(enum tc_setup_type type, static int mlxsw_sp_setup_tc_block_flower_bind(struct mlxsw_sp_port *mlxsw_sp_port, - struct tcf_block *block, bool ingress) + struct tcf_block *block, bool ingress, + struct netlink_ext_ack *extack) { struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp; struct mlxsw_sp_acl_block *acl_block; @@ -1518,7 +1519,7 @@ mlxsw_sp_setup_tc_block_flower_bind(struct mlxsw_sp_port *mlxsw_sp_port, return -ENOMEM; block_cb = __tcf_block_cb_register(block, mlxsw_sp_setup_tc_block_cb_flower, - mlxsw_sp, acl_block); + mlxsw_sp, acl_block, extack); if (IS_ERR(block_cb)) { err = 
PTR_ERR(block_cb); goto err_cb_register; @@ -1596,11 +1597,12 @@ static int mlxsw_sp_setup_tc_block(struct mlxsw_sp_port *mlxsw_sp_port, switch (f->command) { case TC_BLOCK_BIND: err = tcf_block_cb_register(f->block, cb, mlxsw_sp_port, - mlxsw_sp_port); + mlxsw_sp_port, f->extack); if (err) return err; err = mlxsw_sp_setup_tc_block_flower_bind(mlxsw_sp_port, - f->block, ingress); + f->block, ingress, + f->extack); if (err) { tcf_block_cb_unregister(f->block, cb, mlxsw_sp_port); return err; diff --git a/drivers/net/ethernet/netronome/nfp/bpf/main.c b/drivers/net/ethernet/netronome/nfp/bpf/main.c index fcdfb8e7fdea..bf46f7bff912 100644 --- a/drivers/net/ethernet/netronome/nfp/bpf/main.c +++ b/drivers/net/ethernet/netronome/nfp/bpf/main.c @@ -206,7 +206,7 @@ static int nfp_bpf_setup_tc_block(struct net_device *netdev, case TC_BLOCK_BIND: return tcf_block_cb_register(f->block, nfp_bpf_setup_tc_block_cb, - nn, nn); + nn, nn, f->extack); case TC_BLOCK_UNBIND: tcf_block_cb_unregister(f->block, nfp_bpf_setup_tc_block_cb, diff --git a/drivers/net/ethernet/netronome/nfp/flower/offload.c b/drivers/net/ethernet/netronome/nfp/flower/offload.c index c0e74aa4cb5e..a427dab4bf49 100644 --- a/drivers/net/ethernet/netronome/nfp/flower/offload.c +++ b/drivers/net/ethernet/netronome/nfp/flower/offload.c @@ -627,7 +627,7 @@ static int nfp_flower_setup_tc_block(struct net_device *netdev, case TC_BLOCK_BIND: return tcf_block_cb_register(f->block, nfp_flower_setup_tc_block_cb, - repr, repr); + repr, repr, f->extack); case TC_BLOCK_UNBIND: tcf_block_cb_unregister(f->block, nfp_flower_setup_tc_block_cb, diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c index cba46b62a1cd..2354e30caa78 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c @@ -3776,7 +3776,7 @@ static int stmmac_setup_tc_block(struct stmmac_priv *priv, switch (f->command) { case TC_BLOCK_BIND: return tcf_block_cb_register(f->block, stmmac_setup_tc_block_cb, - priv, priv); + priv, priv, f->extack); case TC_BLOCK_UNBIND: tcf_block_cb_unregister(f->block, stmmac_setup_tc_block_cb, priv); return 0; diff --git a/drivers/net/netdevsim/netdev.c b/drivers/net/netdevsim/netdev.c index ec68f38213d9..c9dacc6fcd59 100644 --- a/drivers/net/netdevsim/netdev.c +++ b/drivers/net/netdevsim/netdev.c @@ -260,7 +260,7 @@ nsim_setup_tc_block(struct net_device *dev, struct tc_block_offload *f) switch (f->command) { case TC_BLOCK_BIND: return tcf_block_cb_register(f->block, nsim_setup_tc_block_cb, - ns, ns); + ns, ns, f->extack); case TC_BLOCK_UNBIND: tcf_block_cb_unregister(f->block, nsim_setup_tc_block_cb, ns); return 0; diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h index a3c1a2c47cd4..a2c6d35ba057 100644 --- a/include/net/pkt_cls.h +++ b/include/net/pkt_cls.h @@ -73,10 +73,11 @@ void tcf_block_cb_incref(struct tcf_block_cb *block_cb); unsigned int tcf_block_cb_decref(struct tcf_block_cb *block_cb); struct tcf_block_cb *__tcf_block_cb_register(struct tcf_block *block, tc_setup_cb_t *cb, void *cb_ident, - void *cb_priv); + void *cb_priv, + struct netlink_ext_ack *extack); int tcf_block_cb_register(struct tcf_block *block, tc_setup_cb_t *cb, void *cb_ident, - void *cb_priv); + void *cb_priv, struct netlink_ext_ack *extack); void __tcf_block_cb_unregister(struct tcf_block_cb *block_cb); void tcf_block_cb_unregister(struct tcf_block *block, tc_setup_cb_t *cb, void *cb_ident); @@ -161,7 +162,8 @@ unsigned int tcf_block_cb_decref(struct 
tcf_block_cb *block_cb) static inline struct tcf_block_cb *__tcf_block_cb_register(struct tcf_block *block, tc_setup_cb_t *cb, void *cb_ident, - void *cb_priv) + void *cb_priv, + struct netlink_ext_ack *extack) { return NULL; } @@ -169,7 +171,7 @@ struct tcf_block_cb *__tcf_block_cb_register(struct tcf_block *block, static inline int tcf_block_cb_register(struct tcf_block *block, tc_setup_cb_t *cb, void *cb_ident, - void *cb_priv) + void *cb_priv, struct netlink_ext_ack *extack) { return 0; } @@ -596,6 +598,7 @@ struct tc_block_offload { enum tc_block_command command; enum tcf_block_binder_type binder_type; struct tcf_block *block; + struct netlink_ext_ack *extack; }; struct tc_cls_common_offload { diff --git a/net/dsa/slave.c b/net/dsa/slave.c index 1e3b6a6d8a40..71536c435132 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -900,7 +900,7 @@ static int dsa_slave_setup_tc_block(struct net_device *dev, switch (f->command) { case TC_BLOCK_BIND: - return tcf_block_cb_register(f->block, cb, dev, dev); + return tcf_block_cb_register(f->block, cb, dev, dev, f->extack); case TC_BLOCK_UNBIND: tcf_block_cb_unregister(f->block, cb, dev); return 0; diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index cdc3c87c53e6..8c9fb4b827a1 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -277,18 +277,21 @@ static bool tcf_block_offload_in_use(struct tcf_block *block) static int tcf_block_offload_cmd(struct tcf_block *block, struct net_device *dev, struct tcf_block_ext_info *ei, - enum tc_block_command command) + enum tc_block_command command, + struct netlink_ext_ack *extack) { struct tc_block_offload bo = {}; bo.command = command; bo.binder_type = ei->binder_type; bo.block = block; + bo.extack = extack; return dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_BLOCK, &bo); } static int tcf_block_offload_bind(struct tcf_block *block, struct Qdisc *q, - struct tcf_block_ext_info *ei) + struct tcf_block_ext_info *ei, + struct netlink_ext_ack *extack) { struct net_device *dev = q->dev_queue->dev; int err; @@ -299,10 +302,12 @@ static int tcf_block_offload_bind(struct tcf_block *block, struct Qdisc *q, /* If tc offload feature is disabled and the block we try to bind * to already has some offloaded filters, forbid to bind. 
*/ - if (!tc_can_offload(dev) && tcf_block_offload_in_use(block)) + if (!tc_can_offload(dev) && tcf_block_offload_in_use(block)) { + NL_SET_ERR_MSG(extack, "Bind to offloaded block failed as dev has offload disabled"); return -EOPNOTSUPP; + } - err = tcf_block_offload_cmd(block, dev, ei, TC_BLOCK_BIND); + err = tcf_block_offload_cmd(block, dev, ei, TC_BLOCK_BIND, extack); if (err == -EOPNOTSUPP) goto no_offload_dev_inc; return err; @@ -322,7 +327,7 @@ static void tcf_block_offload_unbind(struct tcf_block *block, struct Qdisc *q, if (!dev->netdev_ops->ndo_setup_tc) goto no_offload_dev_dec; - err = tcf_block_offload_cmd(block, dev, ei, TC_BLOCK_UNBIND); + err = tcf_block_offload_cmd(block, dev, ei, TC_BLOCK_UNBIND, NULL); if (err == -EOPNOTSUPP) goto no_offload_dev_dec; return; @@ -612,7 +617,7 @@ int tcf_block_get_ext(struct tcf_block **p_block, struct Qdisc *q, if (err) goto err_chain_head_change_cb_add; - err = tcf_block_offload_bind(block, q, ei); + err = tcf_block_offload_bind(block, q, ei, extack); if (err) goto err_block_offload_bind; @@ -748,7 +753,8 @@ EXPORT_SYMBOL(tcf_block_cb_decref); struct tcf_block_cb *__tcf_block_cb_register(struct tcf_block *block, tc_setup_cb_t *cb, void *cb_ident, - void *cb_priv) + void *cb_priv, + struct netlink_ext_ack *extack) { struct tcf_block_cb *block_cb; @@ -772,11 +778,12 @@ EXPORT_SYMBOL(__tcf_block_cb_register); int tcf_block_cb_register(struct tcf_block *block, tc_setup_cb_t *cb, void *cb_ident, - void *cb_priv) + void *cb_priv, struct netlink_ext_ack *extack) { struct tcf_block_cb *block_cb; - block_cb = __tcf_block_cb_register(block, cb, cb_ident, cb_priv); + block_cb = __tcf_block_cb_register(block, cb, cb_ident, cb_priv, + extack); return IS_ERR(block_cb) ? PTR_ERR(block_cb) : 0; } EXPORT_SYMBOL(tcf_block_cb_register); -- cgit v1.2.3 From 31533cba4327aefeafe8a7d57de0c737a3b2faa6 Mon Sep 17 00:00:00 2001 From: John Hurley Date: Mon, 25 Jun 2018 14:30:06 -0700 Subject: net: sched: cls_flower: implement offload tcf_proto_op Add the reoffload tcf_proto_op in flower to generate an offload message for each filter in the given tcf_proto. Call the specified callback with this new offload message. The function only returns an error if the callback rejects adding a 'hardware only' rule. A filter contains a flag to indicate if it is in hardware or not. To ensure the reoffload function properly maintains this flag, keep a reference counter for the number of instances of the filter that are in hardware. Only update the flag when this counter changes from or to 0. Add a generic helper function to implement this behaviour. Signed-off-by: John Hurley Signed-off-by: Jakub Kicinski Acked-by: Jiri Pirko Signed-off-by: David S. 
Miller --- include/net/sch_generic.h | 15 +++++++++++++++ net/sched/cls_flower.c | 44 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 59 insertions(+) (limited to 'net/sched') diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index 18adc9142b18..7432100027b7 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -336,6 +336,21 @@ static inline void tcf_block_offload_dec(struct tcf_block *block, u32 *flags) block->offloadcnt--; } +static inline void +tc_cls_offload_cnt_update(struct tcf_block *block, unsigned int *cnt, + u32 *flags, bool add) +{ + if (add) { + if (!*cnt) + tcf_block_offload_inc(block, flags); + (*cnt)++; + } else { + (*cnt)--; + if (!*cnt) + tcf_block_offload_dec(block, flags); + } +} + static inline void qdisc_cb_private_validate(const struct sk_buff *skb, int sz) { struct qdisc_skb_cb *qcb; diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index 9e8b26a80fb3..352876bb901b 100644 --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@ -87,6 +87,7 @@ struct cls_fl_filter { struct list_head list; u32 handle; u32 flags; + unsigned int in_hw_count; struct rcu_work rwork; struct net_device *hw_dev; }; @@ -289,6 +290,7 @@ static int fl_hw_replace_filter(struct tcf_proto *tp, fl_hw_destroy_filter(tp, f, NULL); return err; } else if (err > 0) { + f->in_hw_count = err; tcf_block_offload_inc(block, &f->flags); } @@ -1087,6 +1089,47 @@ skip: } } +static int fl_reoffload(struct tcf_proto *tp, bool add, tc_setup_cb_t *cb, + void *cb_priv, struct netlink_ext_ack *extack) +{ + struct cls_fl_head *head = rtnl_dereference(tp->root); + struct tc_cls_flower_offload cls_flower = {}; + struct tcf_block *block = tp->chain->block; + struct fl_flow_mask *mask; + struct cls_fl_filter *f; + int err; + + list_for_each_entry(mask, &head->masks, list) { + list_for_each_entry(f, &mask->filters, list) { + if (tc_skip_hw(f->flags)) + continue; + + tc_cls_common_offload_init(&cls_flower.common, tp, + f->flags, extack); + cls_flower.command = add ? + TC_CLSFLOWER_REPLACE : TC_CLSFLOWER_DESTROY; + cls_flower.cookie = (unsigned long)f; + cls_flower.dissector = &mask->dissector; + cls_flower.mask = &f->mkey; + cls_flower.key = &f->key; + cls_flower.exts = &f->exts; + cls_flower.classid = f->res.classid; + + err = cb(TC_SETUP_CLSFLOWER, &cls_flower, cb_priv); + if (err) { + if (add && tc_skip_sw(f->flags)) + return err; + continue; + } + + tc_cls_offload_cnt_update(block, &f->in_hw_count, + &f->flags, add); + } + } + + return 0; +} + static int fl_dump_key_val(struct sk_buff *skb, void *val, int val_type, void *mask, int mask_type, int len) @@ -1438,6 +1481,7 @@ static struct tcf_proto_ops cls_fl_ops __read_mostly = { .change = fl_change, .delete = fl_delete, .walk = fl_walk, + .reoffload = fl_reoffload, .dump = fl_dump, .bind_class = fl_bind_class, .owner = THIS_MODULE, -- cgit v1.2.3 From 0efd1b3a13bfabc8b70e79bd22aa413d6d2ad7a5 Mon Sep 17 00:00:00 2001 From: John Hurley Date: Mon, 25 Jun 2018 14:30:07 -0700 Subject: net: sched: cls_matchall: implement offload tcf_proto_op Add the reoffload tcf_proto_op in matchall to generate an offload message for each filter in the given tcf_proto. Call the specified callback with this new offload message. The function only returns an error if the callback rejects adding a 'hardware only' rule. Ensure matchall flags correctly report if the rule is in hw by keeping a reference counter for the number of instances of the rule offloaded. Only update the flag when this counter changes from or to 0. 
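A minimal illustration of the new helper's counting semantics (hypothetical standalone snippet; block is assumed to be a valid struct tcf_block pointer, and tcf_block_offload_inc()/dec() set and clear TCA_CLS_FLAGS_IN_HW as shown above):

/* Hypothetical standalone snippet: the flag only flips on 0 <-> 1. */
unsigned int in_hw_count = 0;
u32 flags = 0;

tc_cls_offload_cnt_update(block, &in_hw_count, &flags, true);  /* 0 -> 1: sets TCA_CLS_FLAGS_IN_HW */
tc_cls_offload_cnt_update(block, &in_hw_count, &flags, true);  /* 1 -> 2: flag unchanged */
tc_cls_offload_cnt_update(block, &in_hw_count, &flags, false); /* 2 -> 1: flag unchanged */
tc_cls_offload_cnt_update(block, &in_hw_count, &flags, false); /* 1 -> 0: clears TCA_CLS_FLAGS_IN_HW */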
Signed-off-by: John Hurley Signed-off-by: Jakub Kicinski Acked-by: Jiri Pirko Signed-off-by: David S. Miller --- net/sched/cls_matchall.c | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) (limited to 'net/sched') diff --git a/net/sched/cls_matchall.c b/net/sched/cls_matchall.c index 47b207ef7762..af16f36ed578 100644 --- a/net/sched/cls_matchall.c +++ b/net/sched/cls_matchall.c @@ -21,6 +21,7 @@ struct cls_mall_head { struct tcf_result res; u32 handle; u32 flags; + unsigned int in_hw_count; struct rcu_work rwork; }; @@ -95,6 +96,7 @@ static int mall_replace_hw_filter(struct tcf_proto *tp, mall_destroy_hw_filter(tp, head, cookie, NULL); return err; } else if (err > 0) { + head->in_hw_count = err; tcf_block_offload_inc(block, &head->flags); } @@ -235,6 +237,35 @@ skip: arg->count++; } +static int mall_reoffload(struct tcf_proto *tp, bool add, tc_setup_cb_t *cb, + void *cb_priv, struct netlink_ext_ack *extack) +{ + struct cls_mall_head *head = rtnl_dereference(tp->root); + struct tc_cls_matchall_offload cls_mall = {}; + struct tcf_block *block = tp->chain->block; + int err; + + if (tc_skip_hw(head->flags)) + return 0; + + tc_cls_common_offload_init(&cls_mall.common, tp, head->flags, extack); + cls_mall.command = add ? + TC_CLSMATCHALL_REPLACE : TC_CLSMATCHALL_DESTROY; + cls_mall.exts = &head->exts; + cls_mall.cookie = (unsigned long)head; + + err = cb(TC_SETUP_CLSMATCHALL, &cls_mall, cb_priv); + if (err) { + if (add && tc_skip_sw(head->flags)) + return err; + return 0; + } + + tc_cls_offload_cnt_update(block, &head->in_hw_count, &head->flags, add); + + return 0; +} + static int mall_dump(struct net *net, struct tcf_proto *tp, void *fh, struct sk_buff *skb, struct tcmsg *t) { @@ -289,6 +320,7 @@ static struct tcf_proto_ops cls_mall_ops __read_mostly = { .change = mall_change, .delete = mall_delete, .walk = mall_walk, + .reoffload = mall_reoffload, .dump = mall_dump, .bind_class = mall_bind_class, .owner = THIS_MODULE, -- cgit v1.2.3 From 530d995123fe647d28566d81ff9562fe6cbaff94 Mon Sep 17 00:00:00 2001 From: John Hurley Date: Mon, 25 Jun 2018 14:30:08 -0700 Subject: net: sched: cls_u32: implement offload tcf_proto_op Add the offload tcf_proto_op in cls_u32 to generate an offload message for each filter and the hashtable in the given tcf_proto. Call the specified callback with this new offload message. The function only returns an error if the callback rejects adding a 'hardware only' rule. A filter contains a flag to indicate if it is in hardware or not. To ensure the offload function properly maintains this flag, keep a reference counter for the number of instances of the filter that are in hardware. Only update the flag when this counter changes from or to 0. Signed-off-by: John Hurley Signed-off-by: Jakub Kicinski Signed-off-by: David S. 
Miller --- net/sched/cls_u32.c | 111 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 111 insertions(+) (limited to 'net/sched') diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c index fb861f90fde6..d5d2a6dc3921 100644 --- a/net/sched/cls_u32.c +++ b/net/sched/cls_u32.c @@ -62,6 +62,7 @@ struct tc_u_knode { struct tc_u32_pcnt __percpu *pf; #endif u32 flags; + unsigned int in_hw_count; #ifdef CONFIG_CLS_U32_MARK u32 val; u32 mask; @@ -571,6 +572,7 @@ static int u32_replace_hw_knode(struct tcf_proto *tp, struct tc_u_knode *n, u32_remove_hw_knode(tp, n, NULL); return err; } else if (err > 0) { + n->in_hw_count = err; tcf_block_offload_inc(block, &n->flags); } @@ -1199,6 +1201,114 @@ static void u32_walk(struct tcf_proto *tp, struct tcf_walker *arg) } } +static int u32_reoffload_hnode(struct tcf_proto *tp, struct tc_u_hnode *ht, + bool add, tc_setup_cb_t *cb, void *cb_priv, + struct netlink_ext_ack *extack) +{ + struct tc_cls_u32_offload cls_u32 = {}; + int err; + + tc_cls_common_offload_init(&cls_u32.common, tp, ht->flags, extack); + cls_u32.command = add ? TC_CLSU32_NEW_HNODE : TC_CLSU32_DELETE_HNODE; + cls_u32.hnode.divisor = ht->divisor; + cls_u32.hnode.handle = ht->handle; + cls_u32.hnode.prio = ht->prio; + + err = cb(TC_SETUP_CLSU32, &cls_u32, cb_priv); + if (err && add && tc_skip_sw(ht->flags)) + return err; + + return 0; +} + +static int u32_reoffload_knode(struct tcf_proto *tp, struct tc_u_knode *n, + bool add, tc_setup_cb_t *cb, void *cb_priv, + struct netlink_ext_ack *extack) +{ + struct tc_u_hnode *ht = rtnl_dereference(n->ht_down); + struct tcf_block *block = tp->chain->block; + struct tc_cls_u32_offload cls_u32 = {}; + int err; + + tc_cls_common_offload_init(&cls_u32.common, tp, n->flags, extack); + cls_u32.command = add ? + TC_CLSU32_REPLACE_KNODE : TC_CLSU32_DELETE_KNODE; + cls_u32.knode.handle = n->handle; + + if (add) { + cls_u32.knode.fshift = n->fshift; +#ifdef CONFIG_CLS_U32_MARK + cls_u32.knode.val = n->val; + cls_u32.knode.mask = n->mask; +#else + cls_u32.knode.val = 0; + cls_u32.knode.mask = 0; +#endif + cls_u32.knode.sel = &n->sel; + cls_u32.knode.exts = &n->exts; + if (n->ht_down) + cls_u32.knode.link_handle = ht->handle; + } + + err = cb(TC_SETUP_CLSU32, &cls_u32, cb_priv); + if (err) { + if (add && tc_skip_sw(n->flags)) + return err; + return 0; + } + + tc_cls_offload_cnt_update(block, &n->in_hw_count, &n->flags, add); + + return 0; +} + +static int u32_reoffload(struct tcf_proto *tp, bool add, tc_setup_cb_t *cb, + void *cb_priv, struct netlink_ext_ack *extack) +{ + struct tc_u_common *tp_c = tp->data; + struct tc_u_hnode *ht; + struct tc_u_knode *n; + unsigned int h; + int err; + + for (ht = rtnl_dereference(tp_c->hlist); + ht; + ht = rtnl_dereference(ht->next)) { + if (ht->prio != tp->prio) + continue; + + /* When adding filters to a new dev, try to offload the + * hashtable first. When removing, do the filters before the + * hashtable. 
+ */ + if (add && !tc_skip_hw(ht->flags)) { + err = u32_reoffload_hnode(tp, ht, add, cb, cb_priv, + extack); + if (err) + return err; + } + + for (h = 0; h <= ht->divisor; h++) { + for (n = rtnl_dereference(ht->ht[h]); + n; + n = rtnl_dereference(n->next)) { + if (tc_skip_hw(n->flags)) + continue; + + err = u32_reoffload_knode(tp, n, add, cb, + cb_priv, extack); + if (err) + return err; + } + } + + if (!add && !tc_skip_hw(ht->flags)) + u32_reoffload_hnode(tp, ht, add, cb, cb_priv, extack); + } + + return 0; +} + static void u32_bind_class(void *fh, u32 classid, unsigned long cl) { struct tc_u_knode *n = fh; @@ -1336,6 +1446,7 @@ static struct tcf_proto_ops cls_u32_ops __read_mostly = { .change = u32_change, .delete = u32_delete, .walk = u32_walk, + .reoffload = u32_reoffload, .dump = u32_dump, .bind_class = u32_bind_class, .owner = THIS_MODULE, -- cgit v1.2.3 From 7e916b76805f11c1686a43ab5ead9a9b1a0a5945 Mon Sep 17 00:00:00 2001 From: John Hurley Date: Mon, 25 Jun 2018 14:30:09 -0700 Subject: net: sched: cls_bpf: implement offload tcf_proto_op Add the offload tcf_proto_op in cls_bpf to generate an offload message for each bpf prog in the given tcf_proto. Call the specified callback with this new offload message. The function only returns an error if the callback rejects adding a 'hardware only' prog. A prog contains a flag to indicate if it is in hardware or not. To ensure the offload function properly maintains this flag, keep a reference counter for the number of instances of the prog that are in hardware. Only update the flag when this counter changes from or to 0. Signed-off-by: John Hurley Signed-off-by: Jakub Kicinski Signed-off-by: David S. Miller --- net/sched/cls_bpf.c | 39 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) (limited to 'net/sched') diff --git a/net/sched/cls_bpf.c b/net/sched/cls_bpf.c index 1aa7f6511065..66e0ac9811f9 100644 --- a/net/sched/cls_bpf.c +++ b/net/sched/cls_bpf.c @@ -43,6 +43,7 @@ struct cls_bpf_prog { struct tcf_result res; bool exts_integrated; u32 gen_flags; + unsigned int in_hw_count; struct tcf_exts exts; u32 handle; u16 bpf_num_ops; @@ -174,6 +175,7 @@ static int cls_bpf_offload_cmd(struct tcf_proto *tp, struct cls_bpf_prog *prog, cls_bpf_offload_cmd(tp, oldprog, prog, extack); return err; } else if (err > 0) { + prog->in_hw_count = err; tcf_block_offload_inc(block, &prog->gen_flags); } } @@ -652,6 +654,42 @@ skip: } } +static int cls_bpf_reoffload(struct tcf_proto *tp, bool add, tc_setup_cb_t *cb, + void *cb_priv, struct netlink_ext_ack *extack) +{ + struct cls_bpf_head *head = rtnl_dereference(tp->root); + struct tcf_block *block = tp->chain->block; + struct tc_cls_bpf_offload cls_bpf = {}; + struct cls_bpf_prog *prog; + int err; + + list_for_each_entry(prog, &head->plist, link) { + if (tc_skip_hw(prog->gen_flags)) + continue; + + tc_cls_common_offload_init(&cls_bpf.common, tp, prog->gen_flags, + extack); + cls_bpf.command = TC_CLSBPF_OFFLOAD; + cls_bpf.exts = &prog->exts; + cls_bpf.prog = add ? prog->filter : NULL; + cls_bpf.oldprog = add ? 
NULL : prog->filter; + cls_bpf.name = prog->bpf_name; + cls_bpf.exts_integrated = prog->exts_integrated; + + err = cb(TC_SETUP_CLSBPF, &cls_bpf, cb_priv); + if (err) { + if (add && tc_skip_sw(prog->gen_flags)) + return err; + continue; + } + + tc_cls_offload_cnt_update(block, &prog->in_hw_count, + &prog->gen_flags, add); + } + + return 0; +} + static struct tcf_proto_ops cls_bpf_ops __read_mostly = { .kind = "bpf", .owner = THIS_MODULE, @@ -662,6 +700,7 @@ static struct tcf_proto_ops cls_bpf_ops __read_mostly = { .change = cls_bpf_change, .delete = cls_bpf_delete, .walk = cls_bpf_walk, + .reoffload = cls_bpf_reoffload, .dump = cls_bpf_dump, .bind_class = cls_bpf_bind_class, }; -- cgit v1.2.3 From 326367427cc09d38e4c1d145131ee2e228ac94c5 Mon Sep 17 00:00:00 2001 From: John Hurley Date: Mon, 25 Jun 2018 14:30:10 -0700 Subject: net: sched: call reoffload op on block callback reg Call the reoffload tcf_proto_op on all tcf_proto nodes in all chains of a block when a callback tries to register to a block that already has offloaded rules. If all existing rules cannot be offloaded then the registration is rejected. This replaces the previous policy of rejecting such callback registration outright. On unregistration of a callback, the rules are flushed for that given cb. The implementation of block sharing in the NFP driver, for example, duplicates shared rules to all devs bound to a block. This meant that rules could still exist in hw even after a device is unbound from a block (assuming the block still remains active). Signed-off-by: John Hurley Signed-off-by: Jakub Kicinski Acked-by: Jiri Pirko Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlxsw/spectrum.c | 4 +- include/net/pkt_cls.h | 6 ++- net/sched/cls_api.c | 54 ++++++++++++++++++++++---- 3 files changed, 52 insertions(+), 12 deletions(-) (limited to 'net/sched') diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c index d2bc335dda11..52437363766a 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c @@ -1542,7 +1542,7 @@ mlxsw_sp_setup_tc_block_flower_bind(struct mlxsw_sp_port *mlxsw_sp_port, err_block_bind: if (!tcf_block_cb_decref(block_cb)) { - __tcf_block_cb_unregister(block_cb); + __tcf_block_cb_unregister(block, block_cb); err_cb_register: mlxsw_sp_acl_block_destroy(acl_block); } @@ -1572,7 +1572,7 @@ mlxsw_sp_setup_tc_block_flower_unbind(struct mlxsw_sp_port *mlxsw_sp_port, err = mlxsw_sp_acl_block_unbind(mlxsw_sp, acl_block, mlxsw_sp_port, ingress); if (!err && !tcf_block_cb_decref(block_cb)) { - __tcf_block_cb_unregister(block_cb); + __tcf_block_cb_unregister(block, block_cb); mlxsw_sp_acl_block_destroy(acl_block); } } diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h index a2c6d35ba057..4070b8eb6d14 100644 --- a/include/net/pkt_cls.h +++ b/include/net/pkt_cls.h @@ -78,7 +78,8 @@ struct tcf_block_cb *__tcf_block_cb_register(struct tcf_block *block, int tcf_block_cb_register(struct tcf_block *block, tc_setup_cb_t *cb, void *cb_ident, void *cb_priv, struct netlink_ext_ack *extack); -void __tcf_block_cb_unregister(struct tcf_block_cb *block_cb); +void __tcf_block_cb_unregister(struct tcf_block *block, + struct tcf_block_cb *block_cb); void tcf_block_cb_unregister(struct tcf_block *block, tc_setup_cb_t *cb, void *cb_ident); @@ -177,7 +178,8 @@ int tcf_block_cb_register(struct tcf_block *block, } static inline -void __tcf_block_cb_unregister(struct tcf_block_cb *block_cb) +void 
__tcf_block_cb_unregister(struct tcf_block *block, + struct tcf_block_cb *block_cb) { } diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index 8c9fb4b827a1..bbf8dda96b0e 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -751,19 +751,53 @@ unsigned int tcf_block_cb_decref(struct tcf_block_cb *block_cb) } EXPORT_SYMBOL(tcf_block_cb_decref); +static int +tcf_block_playback_offloads(struct tcf_block *block, tc_setup_cb_t *cb, + void *cb_priv, bool add, bool offload_in_use, + struct netlink_ext_ack *extack) +{ + struct tcf_chain *chain; + struct tcf_proto *tp; + int err; + + list_for_each_entry(chain, &block->chain_list, list) { + for (tp = rtnl_dereference(chain->filter_chain); tp; + tp = rtnl_dereference(tp->next)) { + if (tp->ops->reoffload) { + err = tp->ops->reoffload(tp, add, cb, cb_priv, + extack); + if (err && add) + goto err_playback_remove; + } else if (add && offload_in_use) { + err = -EOPNOTSUPP; + NL_SET_ERR_MSG(extack, "Filter HW offload failed - classifier without re-offloading support"); + goto err_playback_remove; + } + } + } + + return 0; + +err_playback_remove: + tcf_block_playback_offloads(block, cb, cb_priv, false, offload_in_use, + extack); + return err; +} + struct tcf_block_cb *__tcf_block_cb_register(struct tcf_block *block, tc_setup_cb_t *cb, void *cb_ident, void *cb_priv, struct netlink_ext_ack *extack) { struct tcf_block_cb *block_cb; + int err; - /* At this point, playback of previous block cb calls is not supported, - * so forbid to register to block which already has some offloaded - * filters present. - */ - if (tcf_block_offload_in_use(block)) - return ERR_PTR(-EOPNOTSUPP); + /* Replay any already present rules */ + err = tcf_block_playback_offloads(block, cb, cb_priv, true, + tcf_block_offload_in_use(block), + extack); + if (err) + return ERR_PTR(err); block_cb = kzalloc(sizeof(*block_cb), GFP_KERNEL); if (!block_cb) @@ -788,8 +822,12 @@ int tcf_block_cb_register(struct tcf_block *block, } EXPORT_SYMBOL(tcf_block_cb_register); -void __tcf_block_cb_unregister(struct tcf_block_cb *block_cb) +void __tcf_block_cb_unregister(struct tcf_block *block, + struct tcf_block_cb *block_cb) { + tcf_block_playback_offloads(block, block_cb->cb, block_cb->cb_priv, + false, tcf_block_offload_in_use(block), + NULL); list_del(&block_cb->list); kfree(block_cb); } @@ -803,7 +841,7 @@ void tcf_block_cb_unregister(struct tcf_block *block, block_cb = tcf_block_cb_lookup(block, cb, cb_ident); if (!block_cb) return; - __tcf_block_cb_unregister(block_cb); + __tcf_block_cb_unregister(block, block_cb); } EXPORT_SYMBOL(tcf_block_cb_unregister); -- cgit v1.2.3 From 0a9fe5c375b57fab6d18ed0a6a7f935eefb09db3 Mon Sep 17 00:00:00 2001 From: Yousuk Seung Date: Wed, 27 Jun 2018 10:32:19 -0700 Subject: netem: slotting with non-uniform distribution Extend slotting with support for non-uniform distributions. This is similar to netem's non-uniform distribution delay feature. Commit f043efeae2f1 ("netem: support delivering packets in delayed time slots") added the slotting feature to approximate the behaviors of media with packet aggregation but only supported a uniform distribution for delays between transmission attempts. Tests with TCP BBR with emulated wifi links with non-uniform distributions produced more useful results. Syntax: slot dist DISTRIBUTION DELAY JITTER [packets MAX_PACKETS] \ [bytes MAX_BYTES] The syntax and use of the distribution table is the same as in the non-uniform distribution delay feature. A file DISTRIBUTION must be present in TC_LIB_DIR (e.g. 
/usr/lib/tc) containing numbers scaled by NETEM_DIST_SCALE. A random value x is selected from the table and it takes DELAY + ( x * JITTER ) as delay. Correlation between values is not supported. Examples: Normal distribution delay with mean = 800us and stdev = 100us. > tc qdisc add dev eth0 root netem slot dist normal 800us 100us Optionally set the max slot size in bytes and/or packets. > tc qdisc add dev eth0 root netem slot dist normal 800us 100us \ bytes 64k packets 42 Signed-off-by: Yousuk Seung Acked-by: Eric Dumazet Acked-by: Neal Cardwell Signed-off-by: David S. Miller --- include/uapi/linux/pkt_sched.h | 3 ++ net/sched/sch_netem.c | 73 ++++++++++++++++++++++++++++-------------- 2 files changed, 52 insertions(+), 24 deletions(-) (limited to 'net/sched') diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h index 37b5096ae97b..bad3c03bcf43 100644 --- a/include/uapi/linux/pkt_sched.h +++ b/include/uapi/linux/pkt_sched.h @@ -539,6 +539,7 @@ enum { TCA_NETEM_LATENCY64, TCA_NETEM_JITTER64, TCA_NETEM_SLOT, + TCA_NETEM_SLOT_DIST, __TCA_NETEM_MAX, }; @@ -581,6 +582,8 @@ struct tc_netem_slot { __s64 max_delay; __s32 max_packets; __s32 max_bytes; + __s64 dist_delay; /* nsec */ + __s64 dist_jitter; /* nsec */ }; enum { diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c index 7d6801fc5340..ad18a2052416 100644 --- a/net/sched/sch_netem.c +++ b/net/sched/sch_netem.c @@ -68,6 +68,11 @@ Fabio Ludovici */ +struct disttable { + u32 size; + s16 table[0]; +}; + struct netem_sched_data { /* internal t(ime)fifo qdisc uses t_root and sch->limit */ struct rb_root t_root; @@ -99,10 +104,7 @@ struct netem_sched_data { u32 rho; } delay_cor, loss_cor, dup_cor, reorder_cor, corrupt_cor; - struct disttable { - u32 size; - s16 table[0]; - } *delay_dist; + struct disttable *delay_dist; enum { CLG_RANDOM, @@ -142,6 +144,7 @@ struct netem_sched_data { s32 bytes_left; } slot; + struct disttable *slot_dist; }; /* Time stamp put into socket buffer control block @@ -180,7 +183,7 @@ static u32 get_crandom(struct crndstate *state) u64 value, rho; unsigned long answer; - if (state->rho == 0) /* no correlation */ + if (!state || state->rho == 0) /* no correlation */ return prandom_u32(); value = prandom_u32(); @@ -601,10 +604,19 @@ finish_segs: static void get_slot_next(struct netem_sched_data *q, u64 now) { - q->slot.slot_next = now + q->slot_config.min_delay + - (prandom_u32() * - (q->slot_config.max_delay - - q->slot_config.min_delay) >> 32); + s64 next_delay; + + if (!q->slot_dist) + next_delay = q->slot_config.min_delay + + (prandom_u32() * + (q->slot_config.max_delay - + q->slot_config.min_delay) >> 32); + else + next_delay = tabledist(q->slot_config.dist_delay, + (s32)(q->slot_config.dist_jitter), + NULL, q->slot_dist); + + q->slot.slot_next = now + next_delay; q->slot.packets_left = q->slot_config.max_packets; q->slot.bytes_left = q->slot_config.max_bytes; } @@ -721,9 +733,9 @@ static void dist_free(struct disttable *d) * signed 16 bit values. 
*/ -static int get_dist_table(struct Qdisc *sch, const struct nlattr *attr) +static int get_dist_table(struct Qdisc *sch, struct disttable **tbl, + const struct nlattr *attr) { - struct netem_sched_data *q = qdisc_priv(sch); size_t n = nla_len(attr)/sizeof(__s16); const __s16 *data = nla_data(attr); spinlock_t *root_lock; @@ -744,7 +756,7 @@ static int get_dist_table(struct Qdisc *sch, const struct nlattr *attr) root_lock = qdisc_root_sleeping_lock(sch); spin_lock_bh(root_lock); - swap(q->delay_dist, d); + swap(*tbl, d); spin_unlock_bh(root_lock); dist_free(d); @@ -762,7 +774,8 @@ static void get_slot(struct netem_sched_data *q, const struct nlattr *attr) q->slot_config.max_bytes = INT_MAX; q->slot.packets_left = q->slot_config.max_packets; q->slot.bytes_left = q->slot_config.max_bytes; - if (q->slot_config.min_delay | q->slot_config.max_delay) + if (q->slot_config.min_delay | q->slot_config.max_delay | + q->slot_config.dist_jitter) q->slot.slot_next = ktime_get_ns(); else q->slot.slot_next = 0; @@ -926,16 +939,17 @@ static int netem_change(struct Qdisc *sch, struct nlattr *opt, } if (tb[TCA_NETEM_DELAY_DIST]) { - ret = get_dist_table(sch, tb[TCA_NETEM_DELAY_DIST]); - if (ret) { - /* recover clg and loss_model, in case of - * q->clg and q->loss_model were modified - * in get_loss_clg() - */ - q->clg = old_clg; - q->loss_model = old_loss_model; - return ret; - } + ret = get_dist_table(sch, &q->delay_dist, + tb[TCA_NETEM_DELAY_DIST]); + if (ret) + goto get_table_failure; + } + + if (tb[TCA_NETEM_SLOT_DIST]) { + ret = get_dist_table(sch, &q->slot_dist, + tb[TCA_NETEM_SLOT_DIST]); + if (ret) + goto get_table_failure; } sch->limit = qopt->limit; @@ -983,6 +997,15 @@ static int netem_change(struct Qdisc *sch, struct nlattr *opt, get_slot(q, tb[TCA_NETEM_SLOT]); return ret; + +get_table_failure: + /* recover clg and loss_model, in case of + * q->clg and q->loss_model were modified + * in get_loss_clg() + */ + q->clg = old_clg; + q->loss_model = old_loss_model; + return ret; } static int netem_init(struct Qdisc *sch, struct nlattr *opt, @@ -1011,6 +1034,7 @@ static void netem_destroy(struct Qdisc *sch) if (q->qdisc) qdisc_destroy(q->qdisc); dist_free(q->delay_dist); + dist_free(q->slot_dist); } static int dump_loss_model(const struct netem_sched_data *q, @@ -1127,7 +1151,8 @@ static int netem_dump(struct Qdisc *sch, struct sk_buff *skb) if (dump_loss_model(q, skb) != 0) goto nla_put_failure; - if (q->slot_config.min_delay | q->slot_config.max_delay) { + if (q->slot_config.min_delay | q->slot_config.max_delay | + q->slot_config.dist_jitter) { slot = q->slot_config; if (slot.max_packets == INT_MAX) slot.max_packets = 0; -- cgit v1.2.3 From 80f0f574cc615b2c61bdfb0e3c2449478d63c488 Mon Sep 17 00:00:00 2001 From: Roman Mashak Date: Wed, 27 Jun 2018 13:33:30 -0400 Subject: net sched actions: fix coding style in pedit action Fix coding style issues in tc pedit action detected by the checkpatch script. Reviewed-by: Simon Horman Signed-off-by: Roman Mashak Signed-off-by: David S. 
Miller --- net/sched/act_pedit.c | 29 ++++++++++++++++------------- 1 file changed, 16 insertions(+), 13 deletions(-) (limited to 'net/sched') diff --git a/net/sched/act_pedit.c b/net/sched/act_pedit.c index 8a925c72db5f..e4b29ee79ba8 100644 --- a/net/sched/act_pedit.c +++ b/net/sched/act_pedit.c @@ -136,15 +136,15 @@ static int tcf_pedit_init(struct net *net, struct nlattr *nla, { struct tc_action_net *tn = net_generic(net, pedit_net_id); struct nlattr *tb[TCA_PEDIT_MAX + 1]; - struct nlattr *pattr; - struct tc_pedit *parm; - int ret = 0, err; - struct tcf_pedit *p; struct tc_pedit_key *keys = NULL; struct tcf_pedit_key_ex *keys_ex; + struct tc_pedit *parm; + struct nlattr *pattr; + struct tcf_pedit *p; + int ret = 0, err; int ksize; - if (nla == NULL) + if (!nla) return -EINVAL; err = nla_parse_nested(tb, TCA_PEDIT_MAX, nla, pedit_policy, NULL); @@ -175,7 +175,7 @@ static int tcf_pedit_init(struct net *net, struct nlattr *nla, return ret; p = to_pedit(*a); keys = kmalloc(ksize, GFP_KERNEL); - if (keys == NULL) { + if (!keys) { tcf_idr_release(*a, bind); kfree(keys_ex); return -ENOMEM; @@ -220,6 +220,7 @@ static void tcf_pedit_cleanup(struct tc_action *a) { struct tcf_pedit *p = to_pedit(a); struct tc_pedit_key *keys = p->tcfp_keys; + kfree(keys); kfree(p->tcfp_keys_ex); } @@ -284,7 +285,8 @@ static int tcf_pedit(struct sk_buff *skb, const struct tc_action *a, if (p->tcfp_nkeys > 0) { struct tc_pedit_key *tkey = p->tcfp_keys; struct tcf_pedit_key_ex *tkey_ex = p->tcfp_keys_ex; - enum pedit_header_type htype = TCA_PEDIT_KEY_EX_HDR_TYPE_NETWORK; + enum pedit_header_type htype = + TCA_PEDIT_KEY_EX_HDR_TYPE_NETWORK; enum pedit_cmd cmd = TCA_PEDIT_KEY_EX_CMD_SET; for (i = p->tcfp_nkeys; i > 0; i--, tkey++) { @@ -316,16 +318,15 @@ static int tcf_pedit(struct sk_buff *skb, const struct tc_action *a, hoffset + tkey->at); goto bad; } - d = skb_header_pointer(skb, hoffset + tkey->at, 1, - &_d); + d = skb_header_pointer(skb, hoffset + tkey->at, + 1, &_d); if (!d) goto bad; offset += (*d & tkey->offmask) >> tkey->shift; } if (offset % 4) { - pr_info("tc filter pedit" - " offset must be on 32 bit boundaries\n"); + pr_info("tc filter pedit offset must be on 32 bit boundaries\n"); goto bad; } @@ -335,7 +336,8 @@ static int tcf_pedit(struct sk_buff *skb, const struct tc_action *a, goto bad; } - ptr = skb_header_pointer(skb, hoffset + offset, 4, &_data); + ptr = skb_header_pointer(skb, hoffset + offset, + 4, &_data); if (!ptr) goto bad; /* just do it, baby */ @@ -358,8 +360,9 @@ static int tcf_pedit(struct sk_buff *skb, const struct tc_action *a, } goto done; - } else + } else { WARN(1, "pedit BUG: index %d\n", p->tcf_index); + } bad: p->tcf_qstats.overlimits++; -- cgit v1.2.3 From 544377cd2545f33cc6cd5458301749d828adacb0 Mon Sep 17 00:00:00 2001 From: Roman Mashak Date: Wed, 27 Jun 2018 13:33:32 -0400 Subject: net sched actions: fix sparse warning The variable _data in include/asm-generic/sections.h defines sections, this causes sparse warning in pedit: net/sched/act_pedit.c:293:35: warning: symbol '_data' shadows an earlier one ./include/asm-generic/sections.h:36:13: originally declared here Therefore rename the variable. Reviewed-by: Simon Horman Signed-off-by: Roman Mashak Signed-off-by: David S. 
Miller --- net/sched/act_pedit.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net/sched') diff --git a/net/sched/act_pedit.c b/net/sched/act_pedit.c index e4b29ee79ba8..9c2d8a31a5c5 100644 --- a/net/sched/act_pedit.c +++ b/net/sched/act_pedit.c @@ -290,7 +290,7 @@ static int tcf_pedit(struct sk_buff *skb, const struct tc_action *a, enum pedit_cmd cmd = TCA_PEDIT_KEY_EX_CMD_SET; for (i = p->tcfp_nkeys; i > 0; i--, tkey++) { - u32 *ptr, _data; + u32 *ptr, hdata; int offset = tkey->off; int hoffset; u32 val; @@ -337,7 +337,7 @@ static int tcf_pedit(struct sk_buff *skb, const struct tc_action *a, } ptr = skb_header_pointer(skb, hoffset + offset, - 4, &_data); + 4, &hdata); if (!ptr) goto bad; /* just do it, baby */ @@ -355,7 +355,7 @@ static int tcf_pedit(struct sk_buff *skb, const struct tc_action *a, } *ptr = ((*ptr & tkey->mask) ^ val); - if (ptr == &_data) + if (ptr == &hdata) skb_store_bits(skb, hoffset + offset, ptr, 4); } -- cgit v1.2.3 From 6ff7586e382cb4274adefd56501d428ea39a5af3 Mon Sep 17 00:00:00 2001 From: Roman Mashak Date: Wed, 27 Jun 2018 13:33:33 -0400 Subject: net sched actions: use sizeof operator for buffer length Replace constant integer with sizeof() to clearly indicate the destination buffer length in skb_header_pointer() calls. Reviewed-by: Simon Horman Signed-off-by: Roman Mashak Signed-off-by: David S. Miller --- net/sched/act_pedit.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net/sched') diff --git a/net/sched/act_pedit.c b/net/sched/act_pedit.c index 9c2d8a31a5c5..3b775f54cee5 100644 --- a/net/sched/act_pedit.c +++ b/net/sched/act_pedit.c @@ -319,7 +319,7 @@ static int tcf_pedit(struct sk_buff *skb, const struct tc_action *a, goto bad; } d = skb_header_pointer(skb, hoffset + tkey->at, - 1, &_d); + sizeof(_d), &_d); if (!d) goto bad; offset += (*d & tkey->offmask) >> tkey->shift; @@ -337,7 +337,7 @@ static int tcf_pedit(struct sk_buff *skb, const struct tc_action *a, } ptr = skb_header_pointer(skb, hoffset + offset, - 4, &hdata); + sizeof(hdata), &hdata); if (!ptr) goto bad; /* just do it, baby */ -- cgit v1.2.3 From 95b0d2dc13c7e7ea51675836680732e8c16e378a Mon Sep 17 00:00:00 2001 From: Roman Mashak Date: Wed, 27 Jun 2018 13:33:34 -0400 Subject: net sched actions: fix misleading text strings in pedit action Change "tc filter pedit .." to "tc actions pedit .." in error messages to clearly refer to pedit action. Reviewed-by: Simon Horman Signed-off-by: Roman Mashak Signed-off-by: David S. 
Miller --- net/sched/act_pedit.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'net/sched') diff --git a/net/sched/act_pedit.c b/net/sched/act_pedit.c index 3b775f54cee5..caa6927a992c 100644 --- a/net/sched/act_pedit.c +++ b/net/sched/act_pedit.c @@ -305,7 +305,7 @@ static int tcf_pedit(struct sk_buff *skb, const struct tc_action *a, rc = pedit_skb_hdr_offset(skb, htype, &hoffset); if (rc) { - pr_info("tc filter pedit bad header type specified (0x%x)\n", + pr_info("tc action pedit bad header type specified (0x%x)\n", htype); goto bad; } @@ -314,7 +314,7 @@ static int tcf_pedit(struct sk_buff *skb, const struct tc_action *a, char *d, _d; if (!offset_valid(skb, hoffset + tkey->at)) { - pr_info("tc filter pedit 'at' offset %d out of bounds\n", + pr_info("tc action pedit 'at' offset %d out of bounds\n", hoffset + tkey->at); goto bad; } @@ -326,12 +326,12 @@ static int tcf_pedit(struct sk_buff *skb, const struct tc_action *a, if (offset % 4) { - pr_info("tc filter pedit offset must be on 32 bit boundaries\n"); + pr_info("tc action pedit offset must be on 32 bit boundaries\n"); goto bad; } if (!offset_valid(skb, hoffset + offset)) { - pr_info("tc filter pedit offset %d out of bounds\n", + pr_info("tc action pedit offset %d out of bounds\n", hoffset + offset); goto bad; } @@ -349,7 +349,7 @@ static int tcf_pedit(struct sk_buff *skb, const struct tc_action *a, val = (*ptr + tkey->val) & ~tkey->mask; break; default: - pr_info("tc filter pedit bad command (%d)\n", + pr_info("tc action pedit bad command (%d)\n", cmd); goto bad; } -- cgit v1.2.3 From 430527415398cf7e741f5e2f11324a8df9093327 Mon Sep 17 00:00:00 2001 From: Roman Mashak Date: Wed, 27 Jun 2018 13:33:35 -0400 Subject: net sched actions: avoid bitwise operation on signed value in pedit Since char can be unsigned or signed, and bitwise operators may have implementation-dependent results when performed on signed operands, declare a 'u8 *' operand instead. Suggested-by: Davide Caratti Signed-off-by: Roman Mashak Signed-off-by: David S. Miller --- net/sched/act_pedit.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/sched') diff --git a/net/sched/act_pedit.c b/net/sched/act_pedit.c index caa6927a992c..ab151346d3d4 100644 --- a/net/sched/act_pedit.c +++ b/net/sched/act_pedit.c @@ -311,7 +311,7 @@ static int tcf_pedit(struct sk_buff *skb, const struct tc_action *a, } if (tkey->offmask) { - char *d, _d; + u8 *d, _d; if (!offset_valid(skb, hoffset + tkey->at)) { pr_info("tc action pedit 'at' offset %d out of bounds\n", hoffset + tkey->at); goto bad; } -- cgit v1.2.3 From a1165b591925551d7c8f1ed45484ebc2847ec8fa Mon Sep 17 00:00:00 2001 From: Simon Horman Date: Tue, 26 Jun 2018 21:39:34 -0700 Subject: net/sched: act_tunnel_key: disambiguate metadata dst error cases Metadata may be NULL for one of two reasons: * Missing user input * Failure to allocate the metadata dst Disambiguate these cases by returning -EINVAL for the former and -ENOMEM for the latter rather than -EINVAL for both cases. This is in preparation for using extended ack to provide more information to users when parsing their input. Signed-off-by: Simon Horman Reviewed-by: Jakub Kicinski Signed-off-by: David S.
Miller --- net/sched/act_tunnel_key.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'net/sched') diff --git a/net/sched/act_tunnel_key.c b/net/sched/act_tunnel_key.c index 626dac81a48a..2edd389e7c92 100644 --- a/net/sched/act_tunnel_key.c +++ b/net/sched/act_tunnel_key.c @@ -143,10 +143,13 @@ static int tunnel_key_init(struct net *net, struct nlattr *nla, metadata = __ipv6_tun_set_dst(&saddr, &daddr, 0, 0, dst_port, 0, flags, key_id, 0); + } else { + ret = -EINVAL; + goto err_out; } if (!metadata) { - ret = -EINVAL; + ret = -ENOMEM; goto err_out; } -- cgit v1.2.3 From 9d7298cd1dc55ebe053686f9bce74bfdcc812399 Mon Sep 17 00:00:00 2001 From: Simon Horman Date: Tue, 26 Jun 2018 21:39:35 -0700 Subject: net/sched: act_tunnel_key: add extended ack support Add extended ack support for the tunnel key action by using NL_SET_ERR_MSG during validation of user input. Cc: Alexander Aring Signed-off-by: Simon Horman Signed-off-by: Pieter Jansen van Vuuren Reviewed-by: Jakub Kicinski Reviewed-by: David Ahern Signed-off-by: David S. Miller --- net/sched/act_tunnel_key.c | 27 +++++++++++++++++++++------ 1 file changed, 21 insertions(+), 6 deletions(-) (limited to 'net/sched') diff --git a/net/sched/act_tunnel_key.c b/net/sched/act_tunnel_key.c index 2edd389e7c92..20e98ed8d498 100644 --- a/net/sched/act_tunnel_key.c +++ b/net/sched/act_tunnel_key.c @@ -86,16 +86,22 @@ static int tunnel_key_init(struct net *net, struct nlattr *nla, int ret = 0; int err; - if (!nla) + if (!nla) { + NL_SET_ERR_MSG(extack, "Tunnel requires attributes to be passed"); return -EINVAL; + } err = nla_parse_nested(tb, TCA_TUNNEL_KEY_MAX, nla, tunnel_key_policy, - NULL); - if (err < 0) + extack); + if (err < 0) { + NL_SET_ERR_MSG(extack, "Failed to parse nested tunnel key attributes"); return err; + } - if (!tb[TCA_TUNNEL_KEY_PARMS]) + if (!tb[TCA_TUNNEL_KEY_PARMS]) { + NL_SET_ERR_MSG(extack, "Missing tunnel key parameters"); return -EINVAL; + } parm = nla_data(tb[TCA_TUNNEL_KEY_PARMS]); exists = tcf_idr_check(tn, parm->index, a, bind); @@ -107,6 +113,7 @@ static int tunnel_key_init(struct net *net, struct nlattr *nla, break; case TCA_TUNNEL_KEY_ACT_SET: if (!tb[TCA_TUNNEL_KEY_ENC_KEY_ID]) { + NL_SET_ERR_MSG(extack, "Missing tunnel key id"); ret = -EINVAL; goto err_out; } @@ -144,11 +151,13 @@ static int tunnel_key_init(struct net *net, struct nlattr *nla, 0, flags, key_id, 0); } else { + NL_SET_ERR_MSG(extack, "Missing either ipv4 or ipv6 src and dst"); ret = -EINVAL; goto err_out; } if (!metadata) { + NL_SET_ERR_MSG(extack, "Cannot allocate tunnel metadata dst"); ret = -ENOMEM; goto err_out; } @@ -156,6 +165,7 @@ static int tunnel_key_init(struct net *net, struct nlattr *nla, metadata->u.tun_info.mode |= IP_TUNNEL_INFO_TX; break; default: + NL_SET_ERR_MSG(extack, "Unknown tunnel key action"); ret = -EINVAL; goto err_out; } @@ -163,14 +173,18 @@ static int tunnel_key_init(struct net *net, struct nlattr *nla, if (!exists) { ret = tcf_idr_create(tn, parm->index, est, a, &act_tunnel_key_ops, bind, true); - if (ret) + if (ret) { + NL_SET_ERR_MSG(extack, "Cannot create TC IDR"); return ret; + } ret = ACT_P_CREATED; } else { tcf_idr_release(*a, bind); - if (!ovr) + if (!ovr) { + NL_SET_ERR_MSG(extack, "TC IDR already exists"); return -EEXIST; + } } t = to_tunnel_key(*a); @@ -180,6 +194,7 @@ static int tunnel_key_init(struct net *net, struct nlattr *nla, if (unlikely(!params_new)) { if (ret == ACT_P_CREATED) tcf_idr_release(*a, bind); + NL_SET_ERR_MSG(extack, "Cannot allocate tunnel key parameters"); return -ENOMEM; } 
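For illustration, a hypothetical session with an iproute2 build that prints extack strings, submitting a tunnel_key set that reaches the kernel with only a source address:

# tc actions add action tunnel_key set src_ip 10.0.0.1 id 11
Error: Missing either ipv4 or ipv6 src and dst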
-- cgit v1.2.3 From 0ed5269f9e41f495c8e9020c85f5e1644c1afc57 Mon Sep 17 00:00:00 2001 From: Simon Horman Date: Tue, 26 Jun 2018 21:39:37 -0700 Subject: net/sched: add tunnel option support to act_tunnel_key Allow setting tunnel options using the act_tunnel_key action. Options are expressed as class:type:data and multiple options may be listed using a comma delimiter. # ip link add name geneve0 type geneve dstport 0 external # tc qdisc add dev eth0 ingress # tc filter add dev eth0 protocol ip parent ffff: \ flower indev eth0 \ ip_proto udp \ action tunnel_key \ set src_ip 10.0.99.192 \ dst_ip 10.0.99.193 \ dst_port 6081 \ id 11 \ geneve_opts 0102:80:00800022,0102:80:00800022 \ action mirred egress redirect dev geneve0 Signed-off-by: Simon Horman Signed-off-by: Pieter Jansen van Vuuren Reviewed-by: Jakub Kicinski Signed-off-by: David S. Miller --- include/uapi/linux/tc_act/tc_tunnel_key.h | 26 ++++ net/sched/act_tunnel_key.c | 214 +++++++++++++++++++++++++++++- 2 files changed, 236 insertions(+), 4 deletions(-) (limited to 'net/sched') diff --git a/include/uapi/linux/tc_act/tc_tunnel_key.h b/include/uapi/linux/tc_act/tc_tunnel_key.h index 72bbefe5d1d1..e284fec8c467 100644 --- a/include/uapi/linux/tc_act/tc_tunnel_key.h +++ b/include/uapi/linux/tc_act/tc_tunnel_key.h @@ -36,9 +36,35 @@ enum { TCA_TUNNEL_KEY_PAD, TCA_TUNNEL_KEY_ENC_DST_PORT, /* be16 */ TCA_TUNNEL_KEY_NO_CSUM, /* u8 */ + TCA_TUNNEL_KEY_ENC_OPTS, /* Nested TCA_TUNNEL_KEY_ENC_OPTS_ + * attributes + */ __TCA_TUNNEL_KEY_MAX, }; #define TCA_TUNNEL_KEY_MAX (__TCA_TUNNEL_KEY_MAX - 1) +enum { + TCA_TUNNEL_KEY_ENC_OPTS_UNSPEC, + TCA_TUNNEL_KEY_ENC_OPTS_GENEVE, /* Nested + * TCA_TUNNEL_KEY_ENC_OPTS_ + * attributes + */ + __TCA_TUNNEL_KEY_ENC_OPTS_MAX, +}; + +#define TCA_TUNNEL_KEY_ENC_OPTS_MAX (__TCA_TUNNEL_KEY_ENC_OPTS_MAX - 1) + +enum { + TCA_TUNNEL_KEY_ENC_OPT_GENEVE_UNSPEC, + TCA_TUNNEL_KEY_ENC_OPT_GENEVE_CLASS, /* be16 */ + TCA_TUNNEL_KEY_ENC_OPT_GENEVE_TYPE, /* u8 */ + TCA_TUNNEL_KEY_ENC_OPT_GENEVE_DATA, /* 4 to 128 bytes */ + + __TCA_TUNNEL_KEY_ENC_OPT_GENEVE_MAX, +}; + +#define TCA_TUNNEL_KEY_ENC_OPT_GENEVE_MAX \ + (__TCA_TUNNEL_KEY_ENC_OPT_GENEVE_MAX - 1) + #endif diff --git a/net/sched/act_tunnel_key.c b/net/sched/act_tunnel_key.c index 20e98ed8d498..ea203e386a92 100644 --- a/net/sched/act_tunnel_key.c +++ b/net/sched/act_tunnel_key.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include @@ -57,6 +58,135 @@ static int tunnel_key_act(struct sk_buff *skb, const struct tc_action *a, return action; } +static const struct nla_policy +enc_opts_policy[TCA_TUNNEL_KEY_ENC_OPTS_MAX + 1] = { + [TCA_TUNNEL_KEY_ENC_OPTS_GENEVE] = { .type = NLA_NESTED }, +}; + +static const struct nla_policy +geneve_opt_policy[TCA_TUNNEL_KEY_ENC_OPT_GENEVE_MAX + 1] = { + [TCA_TUNNEL_KEY_ENC_OPT_GENEVE_CLASS] = { .type = NLA_U16 }, + [TCA_TUNNEL_KEY_ENC_OPT_GENEVE_TYPE] = { .type = NLA_U8 }, + [TCA_TUNNEL_KEY_ENC_OPT_GENEVE_DATA] = { .type = NLA_BINARY, + .len = 128 }, +}; + +static int +tunnel_key_copy_geneve_opt(const struct nlattr *nla, void *dst, int dst_len, + struct netlink_ext_ack *extack) +{ + struct nlattr *tb[TCA_TUNNEL_KEY_ENC_OPT_GENEVE_MAX + 1]; + int err, data_len, opt_len; + u8 *data; + + err = nla_parse_nested(tb, TCA_TUNNEL_KEY_ENC_OPT_GENEVE_MAX, + nla, geneve_opt_policy, extack); + if (err < 0) + return err; + + if (!tb[TCA_TUNNEL_KEY_ENC_OPT_GENEVE_CLASS] || + !tb[TCA_TUNNEL_KEY_ENC_OPT_GENEVE_TYPE] || + !tb[TCA_TUNNEL_KEY_ENC_OPT_GENEVE_DATA]) { + NL_SET_ERR_MSG(extack, "Missing tunnel key geneve option class, 
type or data"); + return -EINVAL; + } + + data = nla_data(tb[TCA_TUNNEL_KEY_ENC_OPT_GENEVE_DATA]); + data_len = nla_len(tb[TCA_TUNNEL_KEY_ENC_OPT_GENEVE_DATA]); + if (data_len < 4) { + NL_SET_ERR_MSG(extack, "Tunnel key geneve option data is less than 4 bytes long"); + return -ERANGE; + } + if (data_len % 4) { + NL_SET_ERR_MSG(extack, "Tunnel key geneve option data is not a multiple of 4 bytes long"); + return -ERANGE; + } + + opt_len = sizeof(struct geneve_opt) + data_len; + if (dst) { + struct geneve_opt *opt = dst; + + WARN_ON(dst_len < opt_len); + + opt->opt_class = + nla_get_be16(tb[TCA_TUNNEL_KEY_ENC_OPT_GENEVE_CLASS]); + opt->type = nla_get_u8(tb[TCA_TUNNEL_KEY_ENC_OPT_GENEVE_TYPE]); + opt->length = data_len / 4; /* length is in units of 4 bytes */ + opt->r1 = 0; + opt->r2 = 0; + opt->r3 = 0; + + memcpy(opt + 1, data, data_len); + } + + return opt_len; +} + +static int tunnel_key_copy_opts(const struct nlattr *nla, u8 *dst, + int dst_len, struct netlink_ext_ack *extack) +{ + int err, rem, opt_len, len = nla_len(nla), opts_len = 0; + const struct nlattr *attr, *head = nla_data(nla); + + err = nla_validate(head, len, TCA_TUNNEL_KEY_ENC_OPTS_MAX, + enc_opts_policy, extack); + if (err) + return err; + + nla_for_each_attr(attr, head, len, rem) { + switch (nla_type(attr)) { + case TCA_TUNNEL_KEY_ENC_OPTS_GENEVE: + opt_len = tunnel_key_copy_geneve_opt(attr, dst, + dst_len, extack); + if (opt_len < 0) + return opt_len; + opts_len += opt_len; + if (dst) { + dst_len -= opt_len; + dst += opt_len; + } + break; + } + } + + if (!opts_len) { + NL_SET_ERR_MSG(extack, "Empty list of tunnel options"); + return -EINVAL; + } + + if (rem > 0) { + NL_SET_ERR_MSG(extack, "Trailing data after parsing tunnel key options attributes"); + return -EINVAL; + } + + return opts_len; +} + +static int tunnel_key_get_opts_len(struct nlattr *nla, + struct netlink_ext_ack *extack) +{ + return tunnel_key_copy_opts(nla, NULL, 0, extack); +} + +static int tunnel_key_opts_set(struct nlattr *nla, struct ip_tunnel_info *info, + int opts_len, struct netlink_ext_ack *extack) +{ + info->options_len = opts_len; + switch (nla_type(nla_data(nla))) { + case TCA_TUNNEL_KEY_ENC_OPTS_GENEVE: +#if IS_ENABLED(CONFIG_INET) + info->key.tun_flags |= TUNNEL_GENEVE_OPT; + return tunnel_key_copy_opts(nla, ip_tunnel_info_opts(info), + opts_len, extack); +#else + return -EAFNOSUPPORT; +#endif + default: + NL_SET_ERR_MSG(extack, "Cannot set tunnel options for unknown tunnel type"); + return -EINVAL; + } +} + static const struct nla_policy tunnel_key_policy[TCA_TUNNEL_KEY_MAX + 1] = { [TCA_TUNNEL_KEY_PARMS] = { .len = sizeof(struct tc_tunnel_key) }, [TCA_TUNNEL_KEY_ENC_IPV4_SRC] = { .type = NLA_U32 }, @@ -66,6 +196,7 @@ static const struct nla_policy tunnel_key_policy[TCA_TUNNEL_KEY_MAX + 1] = { [TCA_TUNNEL_KEY_ENC_KEY_ID] = { .type = NLA_U32 }, [TCA_TUNNEL_KEY_ENC_DST_PORT] = {.type = NLA_U16}, [TCA_TUNNEL_KEY_NO_CSUM] = { .type = NLA_U8 }, + [TCA_TUNNEL_KEY_ENC_OPTS] = { .type = NLA_NESTED }, }; static int tunnel_key_init(struct net *net, struct nlattr *nla, @@ -81,6 +212,7 @@ static int tunnel_key_init(struct net *net, struct nlattr *nla, struct tcf_tunnel_key *t; bool exists = false; __be16 dst_port = 0; + int opts_len = 0; __be64 key_id; __be16 flags; int ret = 0; @@ -128,6 +260,15 @@ static int tunnel_key_init(struct net *net, struct nlattr *nla, if (tb[TCA_TUNNEL_KEY_ENC_DST_PORT]) dst_port = nla_get_be16(tb[TCA_TUNNEL_KEY_ENC_DST_PORT]); + if (tb[TCA_TUNNEL_KEY_ENC_OPTS]) { + opts_len = tunnel_key_get_opts_len(tb[TCA_TUNNEL_KEY_ENC_OPTS], 
+ extack); + if (opts_len < 0) { + ret = opts_len; + goto err_out; + } + } + if (tb[TCA_TUNNEL_KEY_ENC_IPV4_SRC] && tb[TCA_TUNNEL_KEY_ENC_IPV4_DST]) { __be32 saddr; @@ -138,7 +279,7 @@ static int tunnel_key_init(struct net *net, struct nlattr *nla, metadata = __ip_tun_set_dst(saddr, daddr, 0, 0, dst_port, flags, - key_id, 0); + key_id, opts_len); } else if (tb[TCA_TUNNEL_KEY_ENC_IPV6_SRC] && tb[TCA_TUNNEL_KEY_ENC_IPV6_DST]) { struct in6_addr saddr; @@ -162,6 +303,14 @@ static int tunnel_key_init(struct net *net, struct nlattr *nla, goto err_out; } + if (opts_len) { + ret = tunnel_key_opts_set(tb[TCA_TUNNEL_KEY_ENC_OPTS], + &metadata->u.tun_info, + opts_len, extack); + if (ret < 0) + goto err_out; + } + metadata->u.tun_info.mode |= IP_TUNNEL_INFO_TX; break; default: @@ -234,6 +383,61 @@ static void tunnel_key_release(struct tc_action *a) } } +static int tunnel_key_geneve_opts_dump(struct sk_buff *skb, + const struct ip_tunnel_info *info) +{ + int len = info->options_len; + u8 *src = (u8 *)(info + 1); + struct nlattr *start; + + start = nla_nest_start(skb, TCA_TUNNEL_KEY_ENC_OPTS_GENEVE); + if (!start) + return -EMSGSIZE; + + while (len > 0) { + struct geneve_opt *opt = (struct geneve_opt *)src; + + if (nla_put_be16(skb, TCA_TUNNEL_KEY_ENC_OPT_GENEVE_CLASS, + opt->opt_class) || + nla_put_u8(skb, TCA_TUNNEL_KEY_ENC_OPT_GENEVE_TYPE, + opt->type) || + nla_put(skb, TCA_TUNNEL_KEY_ENC_OPT_GENEVE_DATA, + opt->length * 4, opt + 1)) + return -EMSGSIZE; + + len -= sizeof(struct geneve_opt) + opt->length * 4; + src += sizeof(struct geneve_opt) + opt->length * 4; + } + + nla_nest_end(skb, start); + return 0; +} + +static int tunnel_key_opts_dump(struct sk_buff *skb, + const struct ip_tunnel_info *info) +{ + struct nlattr *start; + int err; + + if (!info->options_len) + return 0; + + start = nla_nest_start(skb, TCA_TUNNEL_KEY_ENC_OPTS); + if (!start) + return -EMSGSIZE; + + if (info->key.tun_flags & TUNNEL_GENEVE_OPT) { + err = tunnel_key_geneve_opts_dump(skb, info); + if (err) + return err; + } else { + return -EINVAL; + } + + nla_nest_end(skb, start); + return 0; +} + static int tunnel_key_dump_addresses(struct sk_buff *skb, const struct ip_tunnel_info *info) { @@ -284,8 +488,9 @@ static int tunnel_key_dump(struct sk_buff *skb, struct tc_action *a, goto nla_put_failure; if (params->tcft_action == TCA_TUNNEL_KEY_ACT_SET) { - struct ip_tunnel_key *key = - ¶ms->tcft_enc_metadata->u.tun_info.key; + struct ip_tunnel_info *info = + ¶ms->tcft_enc_metadata->u.tun_info; + struct ip_tunnel_key *key = &info->key; __be32 key_id = tunnel_id_to_key32(key->tun_id); if (nla_put_be32(skb, TCA_TUNNEL_KEY_ENC_KEY_ID, key_id) || @@ -293,7 +498,8 @@ static int tunnel_key_dump(struct sk_buff *skb, struct tc_action *a, ¶ms->tcft_enc_metadata->u.tun_info) || nla_put_be16(skb, TCA_TUNNEL_KEY_ENC_DST_PORT, key->tp_dst) || nla_put_u8(skb, TCA_TUNNEL_KEY_NO_CSUM, - !(key->tun_flags & TUNNEL_CSUM))) + !(key->tun_flags & TUNNEL_CSUM)) || + tunnel_key_opts_dump(skb, info)) goto nla_put_failure; } -- cgit v1.2.3 From 9868c0b2eb18470a91d6f0f0df318738a50554e2 Mon Sep 17 00:00:00 2001 From: Roman Mashak Date: Mon, 2 Jul 2018 00:02:02 -0400 Subject: net sched actions: add extack messages in pedit action Signed-off-by: Roman Mashak Signed-off-by: David S. 
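Before the diff itself, a hedged illustration of the two helpers it leans on (the "demo" names are invented for the example): NL_SET_ERR_MSG_MOD() prefixes the message with KBUILD_MODNAME, and NL_SET_ERR_MSG_ATTR() additionally records which attribute was at fault so user space can locate it in the request:

#include <net/netlink.h>

/* Illustration only; not code from this series. */
static int demo_validate(struct nlattr *attr, struct netlink_ext_ack *extack)
{
	if (!attr) {
		/* Reported to user space as "<modname>: Missing demo attribute". */
		NL_SET_ERR_MSG_MOD(extack, "Missing demo attribute");
		return -EINVAL;
	}

	if (nla_len(attr) < (int)sizeof(u32)) {
		/* Also stores the offset of 'attr' inside the request. */
		NL_SET_ERR_MSG_ATTR(extack, attr, "Demo attribute too short");
		return -EINVAL;
	}

	return 0;
}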
Miller --- net/sched/act_pedit.c | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) (limited to 'net/sched') diff --git a/net/sched/act_pedit.c b/net/sched/act_pedit.c index ab151346d3d4..55bc96b610e8 100644 --- a/net/sched/act_pedit.c +++ b/net/sched/act_pedit.c @@ -144,8 +144,10 @@ static int tcf_pedit_init(struct net *net, struct nlattr *nla, int ret = 0, err; int ksize; - if (!nla) + if (!nla) { + NL_SET_ERR_MSG_MOD(extack, "Pedit requires attributes to be passed"); return -EINVAL; + } err = nla_parse_nested(tb, TCA_PEDIT_MAX, nla, pedit_policy, NULL); if (err < 0) @@ -154,21 +156,27 @@ static int tcf_pedit_init(struct net *net, struct nlattr *nla, pattr = tb[TCA_PEDIT_PARMS]; if (!pattr) pattr = tb[TCA_PEDIT_PARMS_EX]; - if (!pattr) + if (!pattr) { + NL_SET_ERR_MSG_MOD(extack, "Missing required TCA_PEDIT_PARMS or TCA_PEDIT_PARMS_EX pedit attribute"); return -EINVAL; + } parm = nla_data(pattr); ksize = parm->nkeys * sizeof(struct tc_pedit_key); - if (nla_len(pattr) < sizeof(*parm) + ksize) + if (nla_len(pattr) < sizeof(*parm) + ksize) { + NL_SET_ERR_MSG_ATTR(extack, pattr, "Length of TCA_PEDIT_PARMS or TCA_PEDIT_PARMS_EX pedit attribute is invalid"); return -EINVAL; + } keys_ex = tcf_pedit_keys_ex_parse(tb[TCA_PEDIT_KEYS_EX], parm->nkeys); if (IS_ERR(keys_ex)) return PTR_ERR(keys_ex); if (!tcf_idr_check(tn, parm->index, a, bind)) { - if (!parm->nkeys) + if (!parm->nkeys) { + NL_SET_ERR_MSG_MOD(extack, "Pedit requires keys to be passed"); return -EINVAL; + } ret = tcf_idr_create(tn, parm->index, est, a, &act_pedit_ops, bind, false); if (ret) -- cgit v1.2.3 From e7e3728bd776d1d1450212ad266832f1003f833f Mon Sep 17 00:00:00 2001 From: Qiaobin Fu Date: Sun, 1 Jul 2018 15:16:27 -0400 Subject: net:sched: add action inheritdsfield to skbedit The new action inheritdsfield copies the field DS of IPv4 and IPv6 packets into skb->priority. This enables later classification of packets based on the DS field. v5: *Update the drop counter for TC_ACT_SHOT v4: *Not allow setting flags other than the expected ones. *Allow dumping the pure flags. v3: *Use optional flags, so that it won't break old versions of tc. *Allow users to set both SKBEDIT_F_PRIORITY and SKBEDIT_F_INHERITDSFIELD flags. v2: *Fix the style issue *Move the code from skbmod to skbedit Original idea by Jamal Hadi Salim Signed-off-by: Qiaobin Fu Reviewed-by: Michel Machado Acked-by: Jamal Hadi Salim Reviewed-by: Marcelo Ricardo Leitner Acked-by: Davide Caratti Signed-off-by: David S. 
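The heart of the change below is a two-bit shift: the IPv4 TOS and IPv6 Traffic Class octet carries DSCP in its upper six bits and ECN in the lower two, so shifting right by two yields the DS field that lands in skb->priority. A standalone sketch of the arithmetic (the sample value is arbitrary):

#include <stdio.h>

int main(void)
{
	unsigned char dsfield = 0xb8;		/* EF PHB: DSCP 46, ECN 00 */
	unsigned int priority = dsfield >> 2;	/* drop the two ECN bits */

	/* Mirrors what skb->priority receives for this packet: 46. */
	printf("dsfield 0x%02x -> priority %u\n", dsfield, priority);
	return 0;
}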
Miller --- include/uapi/linux/tc_act/tc_skbedit.h | 2 ++ net/sched/act_skbedit.c | 41 ++++++++++++++++++++++++++++++++++ 2 files changed, 43 insertions(+) (limited to 'net/sched') diff --git a/include/uapi/linux/tc_act/tc_skbedit.h b/include/uapi/linux/tc_act/tc_skbedit.h index fbcfe27a4e6c..6de6071ebed6 100644 --- a/include/uapi/linux/tc_act/tc_skbedit.h +++ b/include/uapi/linux/tc_act/tc_skbedit.h @@ -30,6 +30,7 @@ #define SKBEDIT_F_MARK 0x4 #define SKBEDIT_F_PTYPE 0x8 #define SKBEDIT_F_MASK 0x10 +#define SKBEDIT_F_INHERITDSFIELD 0x20 struct tc_skbedit { tc_gen; @@ -45,6 +46,7 @@ enum { TCA_SKBEDIT_PAD, TCA_SKBEDIT_PTYPE, TCA_SKBEDIT_MASK, + TCA_SKBEDIT_FLAGS, __TCA_SKBEDIT_MAX }; #define TCA_SKBEDIT_MAX (__TCA_SKBEDIT_MAX - 1) diff --git a/net/sched/act_skbedit.c b/net/sched/act_skbedit.c index 6138d1d71900..dfaf5d8028dd 100644 --- a/net/sched/act_skbedit.c +++ b/net/sched/act_skbedit.c @@ -23,6 +23,9 @@ #include #include #include +#include +#include +#include #include #include @@ -41,6 +44,25 @@ static int tcf_skbedit(struct sk_buff *skb, const struct tc_action *a, if (d->flags & SKBEDIT_F_PRIORITY) skb->priority = d->priority; + if (d->flags & SKBEDIT_F_INHERITDSFIELD) { + int wlen = skb_network_offset(skb); + + switch (tc_skb_protocol(skb)) { + case htons(ETH_P_IP): + wlen += sizeof(struct iphdr); + if (!pskb_may_pull(skb, wlen)) + goto err; + skb->priority = ipv4_get_dsfield(ip_hdr(skb)) >> 2; + break; + + case htons(ETH_P_IPV6): + wlen += sizeof(struct ipv6hdr); + if (!pskb_may_pull(skb, wlen)) + goto err; + skb->priority = ipv6_get_dsfield(ipv6_hdr(skb)) >> 2; + break; + } + } if (d->flags & SKBEDIT_F_QUEUE_MAPPING && skb->dev->real_num_tx_queues > d->queue_mapping) skb_set_queue_mapping(skb, d->queue_mapping); @@ -53,6 +75,11 @@ static int tcf_skbedit(struct sk_buff *skb, const struct tc_action *a, spin_unlock(&d->tcf_lock); return d->tcf_action; + +err: + d->tcf_qstats.drops++; + spin_unlock(&d->tcf_lock); + return TC_ACT_SHOT; } static const struct nla_policy skbedit_policy[TCA_SKBEDIT_MAX + 1] = { @@ -62,6 +89,7 @@ static const struct nla_policy skbedit_policy[TCA_SKBEDIT_MAX + 1] = { [TCA_SKBEDIT_MARK] = { .len = sizeof(u32) }, [TCA_SKBEDIT_PTYPE] = { .len = sizeof(u16) }, [TCA_SKBEDIT_MASK] = { .len = sizeof(u32) }, + [TCA_SKBEDIT_FLAGS] = { .len = sizeof(u64) }, }; static int tcf_skbedit_init(struct net *net, struct nlattr *nla, @@ -114,6 +142,13 @@ static int tcf_skbedit_init(struct net *net, struct nlattr *nla, mask = nla_data(tb[TCA_SKBEDIT_MASK]); } + if (tb[TCA_SKBEDIT_FLAGS] != NULL) { + u64 *pure_flags = nla_data(tb[TCA_SKBEDIT_FLAGS]); + + if (*pure_flags & SKBEDIT_F_INHERITDSFIELD) + flags |= SKBEDIT_F_INHERITDSFIELD; + } + parm = nla_data(tb[TCA_SKBEDIT_PARMS]); exists = tcf_idr_check(tn, parm->index, a, bind); @@ -178,6 +213,7 @@ static int tcf_skbedit_dump(struct sk_buff *skb, struct tc_action *a, .action = d->tcf_action, }; struct tcf_t t; + u64 pure_flags = 0; if (nla_put(skb, TCA_SKBEDIT_PARMS, sizeof(opt), &opt)) goto nla_put_failure; @@ -196,6 +232,11 @@ static int tcf_skbedit_dump(struct sk_buff *skb, struct tc_action *a, if ((d->flags & SKBEDIT_F_MASK) && nla_put_u32(skb, TCA_SKBEDIT_MASK, d->mask)) goto nla_put_failure; + if (d->flags & SKBEDIT_F_INHERITDSFIELD) + pure_flags |= SKBEDIT_F_INHERITDSFIELD; + if (pure_flags != 0 && + nla_put(skb, TCA_SKBEDIT_FLAGS, sizeof(pure_flags), &pure_flags)) + goto nla_put_failure; tcf_tm_dump(&t, &d->tcf_tm); if (nla_put_64bit(skb, TCA_SKBEDIT_TM, sizeof(t), &t, TCA_SKBEDIT_PAD)) -- cgit v1.2.3 From 
30e99ed6dbdde68f5ad23db3a5872c3c247526b6 Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Tue, 3 Jul 2018 13:45:12 +0000 Subject: net: sched: act_pedit: fix possible memory leak in tcf_pedit_init() 'keys_ex' is allocated by tcf_pedit_keys_ex_parse() in tcf_pedit_init(), but not all of the error handling paths free it, which may cause a memory leak. This patch fixes it. Fixes: 71d0ed7079df ("net/act_pedit: Support using offset relative to the conventional network headers") Signed-off-by: Wei Yongjun Acked-by: Cong Wang Signed-off-by: David S. Miller --- net/sched/act_pedit.c | 25 ++++++++++++++++--------- 1 file changed, 16 insertions(+), 9 deletions(-) (limited to 'net/sched') diff --git a/net/sched/act_pedit.c b/net/sched/act_pedit.c index 55bc96b610e8..e43aef28fdac 100644 --- a/net/sched/act_pedit.c +++ b/net/sched/act_pedit.c @@ -175,32 +175,35 @@ static int tcf_pedit_init(struct net *net, struct nlattr *nla, if (!tcf_idr_check(tn, parm->index, a, bind)) { if (!parm->nkeys) { NL_SET_ERR_MSG_MOD(extack, "Pedit requires keys to be passed"); - return -EINVAL; + ret = -EINVAL; + goto out_free; } ret = tcf_idr_create(tn, parm->index, est, a, &act_pedit_ops, bind, false); if (ret) - return ret; + goto out_free; p = to_pedit(*a); keys = kmalloc(ksize, GFP_KERNEL); if (!keys) { tcf_idr_release(*a, bind); - kfree(keys_ex); - return -ENOMEM; + ret = -ENOMEM; + goto out_free; } ret = ACT_P_CREATED; } else { if (bind) - return 0; + goto out_free; tcf_idr_release(*a, bind); - if (!ovr) - return -EEXIST; + if (!ovr) { + ret = -EEXIST; + goto out_free; + } p = to_pedit(*a); if (p->tcfp_nkeys && p->tcfp_nkeys != parm->nkeys) { keys = kmalloc(ksize, GFP_KERNEL); if (!keys) { - kfree(keys_ex); - return -ENOMEM; + ret = -ENOMEM; + goto out_free; } } } @@ -222,6 +225,10 @@ static int tcf_pedit_init(struct net *net, struct nlattr *nla, if (ret == ACT_P_CREATED) tcf_idr_insert(tn, *a); return ret; +out_free: + kfree(keys_ex); + return ret; + } static void tcf_pedit_cleanup(struct tc_action *a) -- cgit v1.2.3 From 860b642b9c33ea4a6ae2f416607b0b98a9d11bb0 Mon Sep 17 00:00:00 2001 From: Vinicius Costa Gomes Date: Tue, 3 Jul 2018 15:42:52 -0700 Subject: net/sched: Allow creating a Qdisc watchdog with other clocks This adds 'qdisc_watchdog_init_clockid()', which allows a clockid to be passed so that other time references can be used when scheduling the Qdisc to run. Signed-off-by: Vinicius Costa Gomes Signed-off-by: David S.
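A hedged sketch of a caller, where the qdisc and its private struct are hypothetical and only qdisc_watchdog_init_clockid() comes from this patch:

#include <net/pkt_sched.h>

struct demo_sched_data {
	struct qdisc_watchdog watchdog;
};

static int demo_init(struct Qdisc *sch, struct nlattr *opt,
		     struct netlink_ext_ack *extack)
{
	struct demo_sched_data *q = qdisc_priv(sch);

	/* Arm watchdog wakeups against CLOCK_TAI instead of the previously
	 * hard-coded CLOCK_MONOTONIC; existing qdiscs keep using
	 * qdisc_watchdog_init(), which now simply wraps this helper. */
	qdisc_watchdog_init_clockid(&q->watchdog, sch, CLOCK_TAI);
	return 0;
}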
Miller --- include/net/pkt_sched.h | 2 ++ net/sched/sch_api.c | 11 +++++++++-- 2 files changed, 11 insertions(+), 2 deletions(-) (limited to 'net/sched') diff --git a/include/net/pkt_sched.h b/include/net/pkt_sched.h index 815b92a23936..2466ea143d01 100644 --- a/include/net/pkt_sched.h +++ b/include/net/pkt_sched.h @@ -72,6 +72,8 @@ struct qdisc_watchdog { struct Qdisc *qdisc; }; +void qdisc_watchdog_init_clockid(struct qdisc_watchdog *wd, struct Qdisc *qdisc, + clockid_t clockid); void qdisc_watchdog_init(struct qdisc_watchdog *wd, struct Qdisc *qdisc); void qdisc_watchdog_schedule_ns(struct qdisc_watchdog *wd, u64 expires); diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index 54eca685420f..98541c6399db 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ -596,12 +596,19 @@ static enum hrtimer_restart qdisc_watchdog(struct hrtimer *timer) return HRTIMER_NORESTART; } -void qdisc_watchdog_init(struct qdisc_watchdog *wd, struct Qdisc *qdisc) +void qdisc_watchdog_init_clockid(struct qdisc_watchdog *wd, struct Qdisc *qdisc, + clockid_t clockid) { - hrtimer_init(&wd->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED); + hrtimer_init(&wd->timer, clockid, HRTIMER_MODE_ABS_PINNED); wd->timer.function = qdisc_watchdog; wd->qdisc = qdisc; } +EXPORT_SYMBOL(qdisc_watchdog_init_clockid); + +void qdisc_watchdog_init(struct qdisc_watchdog *wd, struct Qdisc *qdisc) +{ + qdisc_watchdog_init_clockid(wd, qdisc, CLOCK_MONOTONIC); +} EXPORT_SYMBOL(qdisc_watchdog_init); void qdisc_watchdog_schedule_ns(struct qdisc_watchdog *wd, u64 expires) -- cgit v1.2.3 From 25db26a91364db00f5a30da2fea8e9afe14a163c Mon Sep 17 00:00:00 2001 From: Vinicius Costa Gomes Date: Tue, 3 Jul 2018 15:42:53 -0700 Subject: net/sched: Introduce the ETF Qdisc The ETF (Earliest TxTime First) qdisc uses the information added earlier in this series (the socket option SO_TXTIME and the new role of sk_buff->tstamp) to schedule packets transmission based on absolute time. For some workloads, just bandwidth enforcement is not enough, and precise control of the transmission of packets is necessary. Example: $ tc qdisc replace dev enp2s0 parent root handle 100 mqprio num_tc 3 \ map 2 2 1 0 2 2 2 2 2 2 2 2 2 2 2 2 queues 1@0 1@1 2@2 hw 0 $ tc qdisc add dev enp2s0 parent 100:1 etf delta 100000 \ clockid CLOCK_TAI In this example, the Qdisc will provide SW best-effort for the control of the transmission time to the network adapter, the time stamp in the socket will be in reference to the clockid CLOCK_TAI and packets will leave the qdisc "delta" (100000) nanoseconds before its transmission time. The ETF qdisc will buffer packets sorted by their txtime. It will drop packets on enqueue() if their skbuff clockid does not match the clock reference of the Qdisc. Moreover, on dequeue(), a packet will be dropped if it expires while being enqueued. The qdisc also supports the SO_TXTIME deadline mode. For this mode, it will dequeue a packet as soon as possible and change the skb timestamp to 'now' during etf_dequeue(). Note that both the qdisc's and the SO_TXTIME ABIs allow for a clockid to be configured, but it's been decided that usage of CLOCK_TAI should be enforced until we decide to allow for other clockids to be used. The rationale here is that PTP times are usually in the TAI scale, thus no other clocks should be necessary. For now, the qdisc will return EINVAL if any clocks other than CLOCK_TAI are used. Signed-off-by: Jesus Sanchez-Palencia Signed-off-by: Vinicius Costa Gomes Signed-off-by: David S. 
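On the application side, feeding this qdisc means pairing the SO_TXTIME socket option with a per-packet SCM_TXTIME control message. A minimal user-space sketch (error handling elided; it assumes headers that carry the SO_TXTIME definitions from this series):

#include <string.h>
#include <time.h>
#include <sys/socket.h>
#include <linux/net_tstamp.h>

/* Opt the socket into SO_TXTIME on CLOCK_TAI, matching the qdisc. */
static int enable_txtime(int fd)
{
	struct sock_txtime cfg = { .clockid = CLOCK_TAI, .flags = 0 };

	return setsockopt(fd, SOL_SOCKET, SO_TXTIME, &cfg, sizeof(cfg));
}

/* Stamp one datagram with an absolute CLOCK_TAI transmission time. */
static ssize_t send_at(int fd, const void *buf, size_t len, __u64 txtime)
{
	char control[CMSG_SPACE(sizeof(txtime))] = {};
	struct iovec iov = { .iov_base = (void *)buf, .iov_len = len };
	struct msghdr msg = {
		.msg_iov = &iov, .msg_iovlen = 1,
		.msg_control = control, .msg_controllen = sizeof(control),
	};
	struct cmsghdr *cm = CMSG_FIRSTHDR(&msg);

	cm->cmsg_level = SOL_SOCKET;
	cm->cmsg_type = SCM_TXTIME;
	cm->cmsg_len = CMSG_LEN(sizeof(txtime));
	memcpy(CMSG_DATA(cm), &txtime, sizeof(txtime));

	return sendmsg(fd, &msg, 0);
}

Note that the clockid passed to SO_TXTIME has to match the qdisc's; is_packet_valid() below drops packets whose socket clock differs.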
Miller --- include/linux/netdevice.h | 1 + include/uapi/linux/pkt_sched.h | 17 ++ net/sched/Kconfig | 11 ++ net/sched/Makefile | 1 + net/sched/sch_etf.c | 384 +++++++++++++++++++++++++++++++++++++++++ 5 files changed, 414 insertions(+) create mode 100644 net/sched/sch_etf.c (limited to 'net/sched') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index c1ef749b6f9f..f06ee8f91e74 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -798,6 +798,7 @@ enum tc_setup_type { TC_SETUP_QDISC_RED, TC_SETUP_QDISC_PRIO, TC_SETUP_QDISC_MQ, + TC_SETUP_QDISC_ETF, }; /* These structures hold the attributes of bpf state that are being passed diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h index bad3c03bcf43..d5e933ce1447 100644 --- a/include/uapi/linux/pkt_sched.h +++ b/include/uapi/linux/pkt_sched.h @@ -937,4 +937,21 @@ enum { #define TCA_CBS_MAX (__TCA_CBS_MAX - 1) + +/* ETF */ +struct tc_etf_qopt { + __s32 delta; + __s32 clockid; + __u32 flags; +#define TC_ETF_DEADLINE_MODE_ON BIT(0) +}; + +enum { + TCA_ETF_UNSPEC, + TCA_ETF_PARMS, + __TCA_ETF_MAX, +}; + +#define TCA_ETF_MAX (__TCA_ETF_MAX - 1) + #endif diff --git a/net/sched/Kconfig b/net/sched/Kconfig index a01169fb5325..fcc89706745b 100644 --- a/net/sched/Kconfig +++ b/net/sched/Kconfig @@ -183,6 +183,17 @@ config NET_SCH_CBS To compile this code as a module, choose M here: the module will be called sch_cbs. +config NET_SCH_ETF + tristate "Earliest TxTime First (ETF)" + help + Say Y here if you want to use the Earliest TxTime First (ETF) packet + scheduling algorithm. + + See the top of for more details. + + To compile this code as a module, choose M here: the + module will be called sch_etf. + config NET_SCH_GRED tristate "Generic Random Early Detection (GRED)" ---help--- diff --git a/net/sched/Makefile b/net/sched/Makefile index 8811d3804878..9a5a7077d217 100644 --- a/net/sched/Makefile +++ b/net/sched/Makefile @@ -54,6 +54,7 @@ obj-$(CONFIG_NET_SCH_FQ) += sch_fq.o obj-$(CONFIG_NET_SCH_HHF) += sch_hhf.o obj-$(CONFIG_NET_SCH_PIE) += sch_pie.o obj-$(CONFIG_NET_SCH_CBS) += sch_cbs.o +obj-$(CONFIG_NET_SCH_ETF) += sch_etf.o obj-$(CONFIG_NET_CLS_U32) += cls_u32.o obj-$(CONFIG_NET_CLS_ROUTE4) += cls_route.o diff --git a/net/sched/sch_etf.c b/net/sched/sch_etf.c new file mode 100644 index 000000000000..4b7f4903ac17 --- /dev/null +++ b/net/sched/sch_etf.c @@ -0,0 +1,384 @@ +// SPDX-License-Identifier: GPL-2.0 + +/* net/sched/sch_etf.c Earliest TxTime First queueing discipline. + * + * Authors: Jesus Sanchez-Palencia + * Vinicius Costa Gomes + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define DEADLINE_MODE_IS_ON(x) ((x)->flags & TC_ETF_DEADLINE_MODE_ON) + +struct etf_sched_data { + bool deadline_mode; + int clockid; + int queue; + s32 delta; /* in ns */ + ktime_t last; /* The txtime of the last skb sent to the netdevice. */ + struct rb_root head; + struct qdisc_watchdog watchdog; + ktime_t (*get_time)(void); +}; + +static const struct nla_policy etf_policy[TCA_ETF_MAX + 1] = { + [TCA_ETF_PARMS] = { .len = sizeof(struct tc_etf_qopt) }, +}; + +static inline int validate_input_params(struct tc_etf_qopt *qopt, + struct netlink_ext_ack *extack) +{ + /* Check if params comply to the following rules: + * * Clockid and delta must be valid. + * + * * Dynamic clockids are not supported. + * + * * Delta must be a positive integer. 
+ */ + if (qopt->clockid < 0) { + NL_SET_ERR_MSG(extack, "Dynamic clockids are not supported"); + return -ENOTSUPP; + } + + if (qopt->clockid != CLOCK_TAI) { + NL_SET_ERR_MSG(extack, "Invalid clockid. CLOCK_TAI must be used"); + return -EINVAL; + } + + if (qopt->delta < 0) { + NL_SET_ERR_MSG(extack, "Delta must be positive"); + return -EINVAL; + } + + return 0; +} + +static bool is_packet_valid(struct Qdisc *sch, struct sk_buff *nskb) +{ + struct etf_sched_data *q = qdisc_priv(sch); + ktime_t txtime = nskb->tstamp; + struct sock *sk = nskb->sk; + ktime_t now; + + if (!sk) + return false; + + if (!sock_flag(sk, SOCK_TXTIME)) + return false; + + /* We don't perform crosstimestamping. + * Drop if packet's clockid differs from qdisc's. + */ + if (sk->sk_clockid != q->clockid) + return false; + + if (sk->sk_txtime_deadline_mode != q->deadline_mode) + return false; + + now = q->get_time(); + if (ktime_before(txtime, now) || ktime_before(txtime, q->last)) + return false; + + return true; +} + +static struct sk_buff *etf_peek_timesortedlist(struct Qdisc *sch) +{ + struct etf_sched_data *q = qdisc_priv(sch); + struct rb_node *p; + + p = rb_first(&q->head); + if (!p) + return NULL; + + return rb_to_skb(p); +} + +static void reset_watchdog(struct Qdisc *sch) +{ + struct etf_sched_data *q = qdisc_priv(sch); + struct sk_buff *skb = etf_peek_timesortedlist(sch); + ktime_t next; + + if (!skb) + return; + + next = ktime_sub_ns(skb->tstamp, q->delta); + qdisc_watchdog_schedule_ns(&q->watchdog, ktime_to_ns(next)); +} + +static int etf_enqueue_timesortedlist(struct sk_buff *nskb, struct Qdisc *sch, + struct sk_buff **to_free) +{ + struct etf_sched_data *q = qdisc_priv(sch); + struct rb_node **p = &q->head.rb_node, *parent = NULL; + ktime_t txtime = nskb->tstamp; + + if (!is_packet_valid(sch, nskb)) + return qdisc_drop(nskb, sch, to_free); + + while (*p) { + struct sk_buff *skb; + + parent = *p; + skb = rb_to_skb(parent); + if (ktime_after(txtime, skb->tstamp)) + p = &parent->rb_right; + else + p = &parent->rb_left; + } + rb_link_node(&nskb->rbnode, parent, p); + rb_insert_color(&nskb->rbnode, &q->head); + + qdisc_qstats_backlog_inc(sch, nskb); + sch->q.qlen++; + + /* Now we may need to re-arm the qdisc watchdog for the next packet. */ + reset_watchdog(sch); + + return NET_XMIT_SUCCESS; +} + +static void timesortedlist_erase(struct Qdisc *sch, struct sk_buff *skb, + bool drop) +{ + struct etf_sched_data *q = qdisc_priv(sch); + + rb_erase(&skb->rbnode, &q->head); + + /* The rbnode field in the skb re-uses these fields, now that + * we are done with the rbnode, reset them. + */ + skb->next = NULL; + skb->prev = NULL; + skb->dev = qdisc_dev(sch); + + qdisc_qstats_backlog_dec(sch, skb); + + if (drop) { + struct sk_buff *to_free = NULL; + + qdisc_drop(skb, sch, &to_free); + kfree_skb_list(to_free); + qdisc_qstats_overlimit(sch); + } else { + qdisc_bstats_update(sch, skb); + + q->last = skb->tstamp; + } + + sch->q.qlen--; +} + +static struct sk_buff *etf_dequeue_timesortedlist(struct Qdisc *sch) +{ + struct etf_sched_data *q = qdisc_priv(sch); + struct sk_buff *skb; + ktime_t now, next; + + skb = etf_peek_timesortedlist(sch); + if (!skb) + return NULL; + + now = q->get_time(); + + /* Drop if packet has expired while in queue. */ + /* FIXME: Must return error on the socket's error queue */ + if (ktime_before(skb->tstamp, now)) { + timesortedlist_erase(sch, skb, true); + skb = NULL; + goto out; + } + + /* When in deadline mode, dequeue as soon as possible and change the + * txtime from deadline to (now + delta). 
+ */ + if (q->deadline_mode) { + timesortedlist_erase(sch, skb, false); + skb->tstamp = now; + goto out; + } + + next = ktime_sub_ns(skb->tstamp, q->delta); + + /* Dequeue only if now is within the [txtime - delta, txtime] range. */ + if (ktime_after(now, next)) + timesortedlist_erase(sch, skb, false); + else + skb = NULL; + +out: + /* Now we may need to re-arm the qdisc watchdog for the next packet. */ + reset_watchdog(sch); + + return skb; +} + +static int etf_init(struct Qdisc *sch, struct nlattr *opt, + struct netlink_ext_ack *extack) +{ + struct etf_sched_data *q = qdisc_priv(sch); + struct net_device *dev = qdisc_dev(sch); + struct nlattr *tb[TCA_ETF_MAX + 1]; + struct tc_etf_qopt *qopt; + int err; + + if (!opt) { + NL_SET_ERR_MSG(extack, + "Missing ETF qdisc options which are mandatory"); + return -EINVAL; + } + + err = nla_parse_nested(tb, TCA_ETF_MAX, opt, etf_policy, extack); + if (err < 0) + return err; + + if (!tb[TCA_ETF_PARMS]) { + NL_SET_ERR_MSG(extack, "Missing mandatory ETF parameters"); + return -EINVAL; + } + + qopt = nla_data(tb[TCA_ETF_PARMS]); + + pr_debug("delta %d clockid %d deadline %s\n", + qopt->delta, qopt->clockid, + DEADLINE_MODE_IS_ON(qopt) ? "on" : "off"); + + err = validate_input_params(qopt, extack); + if (err < 0) + return err; + + q->queue = sch->dev_queue - netdev_get_tx_queue(dev, 0); + + /* Everything went OK, save the parameters used. */ + q->delta = qopt->delta; + q->clockid = qopt->clockid; + q->deadline_mode = DEADLINE_MODE_IS_ON(qopt); + + switch (q->clockid) { + case CLOCK_REALTIME: + q->get_time = ktime_get_real; + break; + case CLOCK_MONOTONIC: + q->get_time = ktime_get; + break; + case CLOCK_BOOTTIME: + q->get_time = ktime_get_boottime; + break; + case CLOCK_TAI: + q->get_time = ktime_get_clocktai; + break; + default: + NL_SET_ERR_MSG(extack, "Clockid is not supported"); + return -ENOTSUPP; + } + + qdisc_watchdog_init_clockid(&q->watchdog, sch, q->clockid); + + return 0; +} + +static void timesortedlist_clear(struct Qdisc *sch) +{ + struct etf_sched_data *q = qdisc_priv(sch); + struct rb_node *p = rb_first(&q->head); + + while (p) { + struct sk_buff *skb = rb_to_skb(p); + + p = rb_next(p); + + rb_erase(&skb->rbnode, &q->head); + rtnl_kfree_skbs(skb, skb); + sch->q.qlen--; + } +} + +static void etf_reset(struct Qdisc *sch) +{ + struct etf_sched_data *q = qdisc_priv(sch); + + /* Only cancel watchdog if it's been initialized. */ + if (q->watchdog.qdisc == sch) + qdisc_watchdog_cancel(&q->watchdog); + + /* No matter which mode we are on, it's safe to clear both lists. */ + timesortedlist_clear(sch); + __qdisc_reset_queue(&sch->q); + + sch->qstats.backlog = 0; + sch->q.qlen = 0; + + q->last = 0; +} + +static void etf_destroy(struct Qdisc *sch) +{ + struct etf_sched_data *q = qdisc_priv(sch); + + /* Only cancel watchdog if it's been initialized. 
*/ + if (q->watchdog.qdisc == sch) + qdisc_watchdog_cancel(&q->watchdog); +} + +static int etf_dump(struct Qdisc *sch, struct sk_buff *skb) +{ + struct etf_sched_data *q = qdisc_priv(sch); + struct tc_etf_qopt opt = { }; + struct nlattr *nest; + + nest = nla_nest_start(skb, TCA_OPTIONS); + if (!nest) + goto nla_put_failure; + + opt.delta = q->delta; + opt.clockid = q->clockid; + if (q->deadline_mode) + opt.flags |= TC_ETF_DEADLINE_MODE_ON; + + if (nla_put(skb, TCA_ETF_PARMS, sizeof(opt), &opt)) + goto nla_put_failure; + + return nla_nest_end(skb, nest); + +nla_put_failure: + nla_nest_cancel(skb, nest); + return -1; +} + +static struct Qdisc_ops etf_qdisc_ops __read_mostly = { + .id = "etf", + .priv_size = sizeof(struct etf_sched_data), + .enqueue = etf_enqueue_timesortedlist, + .dequeue = etf_dequeue_timesortedlist, + .peek = etf_peek_timesortedlist, + .init = etf_init, + .reset = etf_reset, + .destroy = etf_destroy, + .dump = etf_dump, + .owner = THIS_MODULE, +}; + +static int __init etf_module_init(void) +{ + return register_qdisc(&etf_qdisc_ops); +} + +static void __exit etf_module_exit(void) +{ + unregister_qdisc(&etf_qdisc_ops); +} +module_init(etf_module_init) +module_exit(etf_module_exit) +MODULE_LICENSE("GPL"); -- cgit v1.2.3 From 88cab77162e86e0f6a2b7e4f859c1435c4e24feb Mon Sep 17 00:00:00 2001 From: Jesus Sanchez-Palencia Date: Tue, 3 Jul 2018 15:42:54 -0700 Subject: net/sched: Add HW offloading capability to ETF Add infra so etf qdisc supports HW offload of time-based transmission. For hw offload, the time sorted list is still used, so packets are dequeued always in order of txtime. Example: $ tc qdisc replace dev enp2s0 parent root handle 100 mqprio num_tc 3 \ map 2 2 1 0 2 2 2 2 2 2 2 2 2 2 2 2 queues 1@0 1@1 2@2 hw 0 $ tc qdisc add dev enp2s0 parent 100:1 etf offload delta 100000 \ clockid CLOCK_REALTIME In this example, the Qdisc will use HW offload for the control of the transmission time through the network adapter. The hrtimer used for packets scheduling inside the qdisc will use the clockid CLOCK_REALTIME as reference and packets leave the Qdisc "delta" (100000) nanoseconds before their transmission time. Because this will be using HW offload and since dynamic clocks are not supported by the hrtimer, the system clock and the PHC clock must be synchronized for this mode to behave as expected. Signed-off-by: Jesus Sanchez-Palencia Signed-off-by: David S. 
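On the driver side, the shape of the hook is roughly the following (a hedged sketch for a hypothetical "demo" driver; demo_enable_launchtime() is a made-up per-queue hardware knob, while TC_SETUP_QDISC_ETF and struct tc_etf_qopt_offload are real and introduced here):

#include <linux/netdevice.h>
#include <net/pkt_sched.h>

static int demo_setup_tc_etf(struct net_device *dev,
			     struct tc_etf_qopt_offload *qopt)
{
	/* Enable or disable launch-time support on the given tx queue. */
	return demo_enable_launchtime(dev, qopt->queue, qopt->enable);
}

static int demo_setup_tc(struct net_device *dev, enum tc_setup_type type,
			 void *type_data)
{
	switch (type) {
	case TC_SETUP_QDISC_ETF:
		return demo_setup_tc_etf(dev, type_data);
	default:
		return -EOPNOTSUPP;
	}
}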
Miller --- include/net/pkt_sched.h | 5 +++ include/uapi/linux/pkt_sched.h | 1 + net/sched/sch_etf.c | 71 +++++++++++++++++++++++++++++++++++++++++- 3 files changed, 76 insertions(+), 1 deletion(-) (limited to 'net/sched') diff --git a/include/net/pkt_sched.h b/include/net/pkt_sched.h index 2466ea143d01..7dc769e5452b 100644 --- a/include/net/pkt_sched.h +++ b/include/net/pkt_sched.h @@ -155,4 +155,9 @@ struct tc_cbs_qopt_offload { s32 sendslope; }; +struct tc_etf_qopt_offload { + u8 enable; + s32 queue; +}; + #endif diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h index d5e933ce1447..949118461009 100644 --- a/include/uapi/linux/pkt_sched.h +++ b/include/uapi/linux/pkt_sched.h @@ -944,6 +944,7 @@ struct tc_etf_qopt { __s32 clockid; __u32 flags; #define TC_ETF_DEADLINE_MODE_ON BIT(0) +#define TC_ETF_OFFLOAD_ON BIT(1) }; enum { diff --git a/net/sched/sch_etf.c b/net/sched/sch_etf.c index 4b7f4903ac17..932a136db568 100644 --- a/net/sched/sch_etf.c +++ b/net/sched/sch_etf.c @@ -20,8 +20,10 @@ #include #define DEADLINE_MODE_IS_ON(x) ((x)->flags & TC_ETF_DEADLINE_MODE_ON) +#define OFFLOAD_IS_ON(x) ((x)->flags & TC_ETF_OFFLOAD_ON) struct etf_sched_data { + bool offload; bool deadline_mode; int clockid; int queue; @@ -45,6 +47,9 @@ static inline int validate_input_params(struct tc_etf_qopt *qopt, * * Dynamic clockids are not supported. * * * Delta must be a positive integer. + * + * Also note that for the HW offload case, we must + * expect that system clocks have been synchronized to PHC. */ if (qopt->clockid < 0) { NL_SET_ERR_MSG(extack, "Dynamic clockids are not supported"); @@ -225,6 +230,56 @@ out: return skb; } +static void etf_disable_offload(struct net_device *dev, + struct etf_sched_data *q) +{ + struct tc_etf_qopt_offload etf = { }; + const struct net_device_ops *ops; + int err; + + if (!q->offload) + return; + + ops = dev->netdev_ops; + if (!ops->ndo_setup_tc) + return; + + etf.queue = q->queue; + etf.enable = 0; + + err = ops->ndo_setup_tc(dev, TC_SETUP_QDISC_ETF, &etf); + if (err < 0) + pr_warn("Couldn't disable ETF offload for queue %d\n", + etf.queue); +} + +static int etf_enable_offload(struct net_device *dev, struct etf_sched_data *q, + struct netlink_ext_ack *extack) +{ + const struct net_device_ops *ops = dev->netdev_ops; + struct tc_etf_qopt_offload etf = { }; + int err; + + if (q->offload) + return 0; + + if (!ops->ndo_setup_tc) { + NL_SET_ERR_MSG(extack, "Specified device does not support ETF offload"); + return -EOPNOTSUPP; + } + + etf.queue = q->queue; + etf.enable = 1; + + err = ops->ndo_setup_tc(dev, TC_SETUP_QDISC_ETF, &etf); + if (err < 0) { + NL_SET_ERR_MSG(extack, "Specified device failed to setup ETF hardware offload"); + return err; + } + + return 0; +} + static int etf_init(struct Qdisc *sch, struct nlattr *opt, struct netlink_ext_ack *extack) { @@ -251,8 +306,9 @@ static int etf_init(struct Qdisc *sch, struct nlattr *opt, qopt = nla_data(tb[TCA_ETF_PARMS]); - pr_debug("delta %d clockid %d deadline %s\n", + pr_debug("delta %d clockid %d offload %s deadline %s\n", qopt->delta, qopt->clockid, + OFFLOAD_IS_ON(qopt) ? "on" : "off", DEADLINE_MODE_IS_ON(qopt) ? "on" : "off"); err = validate_input_params(qopt, extack); @@ -261,9 +317,16 @@ static int etf_init(struct Qdisc *sch, struct nlattr *opt, q->queue = sch->dev_queue - netdev_get_tx_queue(dev, 0); + if (OFFLOAD_IS_ON(qopt)) { + err = etf_enable_offload(dev, q, extack); + if (err < 0) + return err; + } + /* Everything went OK, save the parameters used. 
*/ q->delta = qopt->delta; q->clockid = qopt->clockid; + q->offload = OFFLOAD_IS_ON(qopt); q->deadline_mode = DEADLINE_MODE_IS_ON(qopt); switch (q->clockid) { @@ -326,10 +389,13 @@ static void etf_reset(struct Qdisc *sch) static void etf_destroy(struct Qdisc *sch) { struct etf_sched_data *q = qdisc_priv(sch); + struct net_device *dev = qdisc_dev(sch); /* Only cancel watchdog if it's been initialized. */ if (q->watchdog.qdisc == sch) qdisc_watchdog_cancel(&q->watchdog); + + etf_disable_offload(dev, q); } static int etf_dump(struct Qdisc *sch, struct sk_buff *skb) @@ -344,6 +410,9 @@ static int etf_dump(struct Qdisc *sch, struct sk_buff *skb) opt.delta = q->delta; opt.clockid = q->clockid; + if (q->offload) + opt.flags |= TC_ETF_OFFLOAD_ON; + if (q->deadline_mode) opt.flags |= TC_ETF_DEADLINE_MODE_ON; -- cgit v1.2.3 From 4b15c7075352668d4467ced7594b676707d11cae Mon Sep 17 00:00:00 2001 From: Jesus Sanchez-Palencia Date: Tue, 3 Jul 2018 15:43:00 -0700 Subject: net/sched: Make etf report drops on error_queue Use the socket error queue for reporting dropped packets if the socket has enabled that feature through the SO_TXTIME API. Packets are dropped either on enqueue() if they aren't accepted by the qdisc or on dequeue() if the system misses their deadline. Those are reported as different errors so applications can react accordingly. Userspace can retrieve the errors through the socket error queue and the corresponding cmsg interfaces. A struct sock_extended_err* is used for returning the error data, and the packet's timestamp can be retrieved by adding both ee_data and ee_info fields as e.g.: ((__u64) serr->ee_data << 32) + serr->ee_info This feature is disabled by default and must be explicitly enabled by applications. Enabling it can bring some overhead for the Tx cycles of the application. Signed-off-by: Jesus Sanchez-Palencia Signed-off-by: David S. 
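A user-space sketch of draining these reports (simplified: it assumes the socket already enabled SOF_TXTIME_REPORT_ERRORS via SO_TXTIME and that every control message on the error queue is an extended-error record):

#include <stdio.h>
#include <sys/socket.h>
#include <linux/errqueue.h>
#include <linux/types.h>

static void drain_txtime_errors(int fd)
{
	char data[256], control[256];
	struct iovec iov = { .iov_base = data, .iov_len = sizeof(data) };
	struct msghdr msg = {
		.msg_iov = &iov, .msg_iovlen = 1,
		.msg_control = control, .msg_controllen = sizeof(control),
	};
	struct cmsghdr *cm;

	if (recvmsg(fd, &msg, MSG_ERRQUEUE) < 0)
		return;

	for (cm = CMSG_FIRSTHDR(&msg); cm; cm = CMSG_NXTHDR(&msg, cm)) {
		struct sock_extended_err *serr = (void *)CMSG_DATA(cm);
		__u64 txtime;

		if (serr->ee_origin != SO_EE_ORIGIN_TXTIME)
			continue;

		/* Reassemble the packet's txtime exactly as described above. */
		txtime = ((__u64)serr->ee_data << 32) + serr->ee_info;
		printf("txtime %llu dropped: %s\n",
		       (unsigned long long)txtime,
		       serr->ee_code == SO_EE_CODE_TXTIME_MISSED ?
		       "deadline missed" : "invalid parameters");
	}
}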
Miller --- include/net/sock.h | 3 ++- include/uapi/linux/errqueue.h | 4 ++++ include/uapi/linux/net_tstamp.h | 5 ++++- net/core/sock.c | 4 ++++ net/sched/sch_etf.c | 35 +++++++++++++++++++++++++++++++++-- 5 files changed, 47 insertions(+), 4 deletions(-) (limited to 'net/sched') diff --git a/include/net/sock.h b/include/net/sock.h index 68347b9821c6..e0eac9ef44b5 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -481,7 +481,8 @@ struct sock { u8 sk_clockid; u8 sk_txtime_deadline_mode : 1, - sk_txtime_unused : 7; + sk_txtime_report_errors : 1, + sk_txtime_unused : 6; struct socket *sk_socket; void *sk_user_data; diff --git a/include/uapi/linux/errqueue.h b/include/uapi/linux/errqueue.h index dc64cfaf13da..c0151200f7d1 100644 --- a/include/uapi/linux/errqueue.h +++ b/include/uapi/linux/errqueue.h @@ -20,12 +20,16 @@ struct sock_extended_err { #define SO_EE_ORIGIN_ICMP6 3 #define SO_EE_ORIGIN_TXSTATUS 4 #define SO_EE_ORIGIN_ZEROCOPY 5 +#define SO_EE_ORIGIN_TXTIME 6 #define SO_EE_ORIGIN_TIMESTAMPING SO_EE_ORIGIN_TXSTATUS #define SO_EE_OFFENDER(ee) ((struct sockaddr*)((ee)+1)) #define SO_EE_CODE_ZEROCOPY_COPIED 1 +#define SO_EE_CODE_TXTIME_INVALID_PARAM 1 +#define SO_EE_CODE_TXTIME_MISSED 2 + /** * struct scm_timestamping - timestamps exposed through cmsg * diff --git a/include/uapi/linux/net_tstamp.h b/include/uapi/linux/net_tstamp.h index c9a77c353b98..f8f4539f1135 100644 --- a/include/uapi/linux/net_tstamp.h +++ b/include/uapi/linux/net_tstamp.h @@ -147,8 +147,11 @@ struct scm_ts_pktinfo { */ enum txtime_flags { SOF_TXTIME_DEADLINE_MODE = (1 << 0), + SOF_TXTIME_REPORT_ERRORS = (1 << 1), - SOF_TXTIME_FLAGS_MASK = (SOF_TXTIME_DEADLINE_MODE) + SOF_TXTIME_FLAGS_LAST = SOF_TXTIME_REPORT_ERRORS, + SOF_TXTIME_FLAGS_MASK = (SOF_TXTIME_FLAGS_LAST - 1) | + SOF_TXTIME_FLAGS_LAST }; struct sock_txtime { diff --git a/net/core/sock.c b/net/core/sock.c index fe64b839f1b2..03fdea5b0f57 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1087,6 +1087,8 @@ set_rcvbuf: sk->sk_clockid = sk_txtime.clockid; sk->sk_txtime_deadline_mode = !!(sk_txtime.flags & SOF_TXTIME_DEADLINE_MODE); + sk->sk_txtime_report_errors = + !!(sk_txtime.flags & SOF_TXTIME_REPORT_ERRORS); } break; @@ -1429,6 +1431,8 @@ int sock_getsockopt(struct socket *sock, int level, int optname, v.txtime.clockid = sk->sk_clockid; v.txtime.flags |= sk->sk_txtime_deadline_mode ? SOF_TXTIME_DEADLINE_MODE : 0; + v.txtime.flags |= sk->sk_txtime_report_errors ? 
+ SOF_TXTIME_REPORT_ERRORS : 0; break; default: diff --git a/net/sched/sch_etf.c b/net/sched/sch_etf.c index 932a136db568..1538d6fa8165 100644 --- a/net/sched/sch_etf.c +++ b/net/sched/sch_etf.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include @@ -123,6 +124,32 @@ static void reset_watchdog(struct Qdisc *sch) qdisc_watchdog_schedule_ns(&q->watchdog, ktime_to_ns(next)); } +static void report_sock_error(struct sk_buff *skb, u32 err, u8 code) +{ + struct sock_exterr_skb *serr; + struct sk_buff *clone; + ktime_t txtime = skb->tstamp; + + if (!skb->sk || !(skb->sk->sk_txtime_report_errors)) + return; + + clone = skb_clone(skb, GFP_ATOMIC); + if (!clone) + return; + + serr = SKB_EXT_ERR(clone); + serr->ee.ee_errno = err; + serr->ee.ee_origin = SO_EE_ORIGIN_TXTIME; + serr->ee.ee_type = 0; + serr->ee.ee_code = code; + serr->ee.ee_pad = 0; + serr->ee.ee_data = (txtime >> 32); /* high part of tstamp */ + serr->ee.ee_info = txtime; /* low part of tstamp */ + + if (sock_queue_err_skb(skb->sk, clone)) + kfree_skb(clone); +} + static int etf_enqueue_timesortedlist(struct sk_buff *nskb, struct Qdisc *sch, struct sk_buff **to_free) { @@ -130,8 +157,11 @@ static int etf_enqueue_timesortedlist(struct sk_buff *nskb, struct Qdisc *sch, struct rb_node **p = &q->head.rb_node, *parent = NULL; ktime_t txtime = nskb->tstamp; - if (!is_packet_valid(sch, nskb)) + if (!is_packet_valid(sch, nskb)) { + report_sock_error(nskb, EINVAL, + SO_EE_CODE_TXTIME_INVALID_PARAM); return qdisc_drop(nskb, sch, to_free); + } while (*p) { struct sk_buff *skb; @@ -174,6 +204,8 @@ static void timesortedlist_erase(struct Qdisc *sch, struct sk_buff *skb, if (drop) { struct sk_buff *to_free = NULL; + report_sock_error(skb, ECANCELED, SO_EE_CODE_TXTIME_MISSED); + qdisc_drop(skb, sch, &to_free); kfree_skb_list(to_free); qdisc_qstats_overlimit(sch); @@ -199,7 +231,6 @@ static struct sk_buff *etf_dequeue_timesortedlist(struct Qdisc *sch) now = q->get_time(); /* Drop if packet has expired while in queue. */ - /* FIXME: Must return error on the socket's error queue */ if (ktime_before(skb->tstamp, now)) { timesortedlist_erase(sch, skb, true); skb = NULL; -- cgit v1.2.3 From aaab08344d2670e5c119b7b497d5063d7ddb8364 Mon Sep 17 00:00:00 2001 From: Jianbo Liu Date: Fri, 6 Jul 2018 05:38:13 +0000 Subject: net/sched: flower: Add support for matching on vlan ethertype As the flow dissector stores the vlan ethertype, tc flower can now match on it. This prepares for supporting QinQ. Signed-off-by: Jianbo Liu Acked-by: Jiri Pirko Signed-off-by: David S. Miller
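The functional change in this hunk is the switch from a hard-coded 802.1Q test to eth_type_vlan(). For reference, that helper (from include/linux/if_vlan.h) behaves like this mirror of its logic:

#include <linux/if_ether.h>
#include <linux/types.h>

/* Mirror of eth_type_vlan() for illustration: both VLAN TPIDs match. */
static bool demo_eth_type_vlan(__be16 ethertype)
{
	switch (ethertype) {
	case htons(ETH_P_8021Q):	/* 0x8100, customer (C) tag */
	case htons(ETH_P_8021AD):	/* 0x88a8, service (S) tag */
		return true;
	default:
		return false;
	}
}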
--- net/sched/cls_flower.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'net/sched') diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index 352876bb901b..da9ec30763fe 100644 --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@ -500,6 +500,7 @@ static int fl_set_key_mpls(struct nlattr **tb, } static void fl_set_key_vlan(struct nlattr **tb, + __be16 ethertype, struct flow_dissector_key_vlan *key_val, struct flow_dissector_key_vlan *key_mask) { @@ -516,6 +517,8 @@ static void fl_set_key_vlan(struct nlattr **tb, VLAN_PRIORITY_MASK; key_mask->vlan_priority = VLAN_PRIORITY_MASK; } + key_val->vlan_tpid = ethertype; + key_mask->vlan_tpid = cpu_to_be16(~0); } static void fl_set_key_flag(u32 flower_key, u32 flower_mask, @@ -592,8 +595,8 @@ static int fl_set_key(struct net *net, struct nlattr **tb, if (tb[TCA_FLOWER_KEY_ETH_TYPE]) { ethertype = nla_get_be16(tb[TCA_FLOWER_KEY_ETH_TYPE]); - if (ethertype == htons(ETH_P_8021Q)) { - fl_set_key_vlan(tb, &key->vlan, &mask->vlan); + if (eth_type_vlan(ethertype)) { + fl_set_key_vlan(tb, ethertype, &key->vlan, &mask->vlan); fl_set_key_val(tb, &key->basic.n_proto, TCA_FLOWER_KEY_VLAN_ETH_TYPE, &mask->basic.n_proto, TCA_FLOWER_UNSPEC, -- cgit v1.2.3 From d30695126f0ac5bca85d09c7946ad9a1deab5d25 Mon Sep 17 00:00:00 2001 From: Jianbo Liu Date: Fri, 6 Jul 2018 05:38:15 +0000 Subject: net/sched: flower: Dump the ethertype encapsulated in vlan Currently the encapsulated ethertype is not dumped, as it is the same as the TCA_FLOWER_KEY_ETH_TYPE key value. That makes the dump inconsistent with the input, so also dump it with TCA_FLOWER_KEY_VLAN_ETH_TYPE. Signed-off-by: Jianbo Liu Acked-by: Jiri Pirko Signed-off-by: David S. Miller --- net/sched/cls_flower.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'net/sched') diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index da9ec30763fe..e93b13d2cb81 100644 --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@ -1313,6 +1313,10 @@ static int fl_dump(struct net *net, struct tcf_proto *tp, void *fh, if (fl_dump_key_vlan(skb, &key->vlan, &mask->vlan)) goto nla_put_failure; + if (mask->vlan.vlan_tpid && + nla_put_be16(skb, TCA_FLOWER_KEY_VLAN_ETH_TYPE, key->basic.n_proto)) + goto nla_put_failure; + if ((key->basic.n_proto == htons(ETH_P_IP) || key->basic.n_proto == htons(ETH_P_IPV6)) && (fl_dump_key_val(skb, &key->basic.ip_proto, TCA_FLOWER_KEY_IP_PROTO, -- cgit v1.2.3 From d64efd0926ba4f32e657e615a4f4a6170d5cc0fa Mon Sep 17 00:00:00 2001 From: Jianbo Liu Date: Fri, 6 Jul 2018 05:38:16 +0000 Subject: net/sched: flower: Add support for matching on QinQ vlan headers As dissecting of QinQ inner and outer vlan headers is now supported, users can add rules to match on QinQ vlan headers. Signed-off-by: Jianbo Liu Acked-by: Jiri Pirko Signed-off-by: David S.
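For orientation, the double-tagged frame these keys describe can be pictured with the following illustrative struct (not kernel code); the outer S-tag maps to the existing vlan_* keys and the inner C-tag to the new cvlan_* keys:

#include <linux/types.h>

struct demo_qinq_ethhdr {
	__u8	h_dest[6];
	__u8	h_source[6];
	__be16	outer_tpid;	/* typically 0x88a8 -> vlan_tpid          */
	__be16	outer_tci;	/* PCP/DEI/VID -> vlan_prio and vlan_id   */
	__be16	inner_tpid;	/* typically 0x8100 -> cvlan_tpid         */
	__be16	inner_tci;	/* PCP/DEI/VID -> cvlan_prio and cvlan_id */
	__be16	h_proto;	/* the encapsulated ethertype             */
} __attribute__((packed));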
Miller --- include/uapi/linux/pkt_cls.h | 4 +++ net/sched/cls_flower.c | 65 ++++++++++++++++++++++++++++++++++---------- 2 files changed, 55 insertions(+), 14 deletions(-) (limited to 'net/sched') diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h index 84e4c1d0f874..c4262d911596 100644 --- a/include/uapi/linux/pkt_cls.h +++ b/include/uapi/linux/pkt_cls.h @@ -469,6 +469,10 @@ enum { TCA_FLOWER_KEY_IP_TTL, /* u8 */ TCA_FLOWER_KEY_IP_TTL_MASK, /* u8 */ + TCA_FLOWER_KEY_CVLAN_ID, /* be16 */ + TCA_FLOWER_KEY_CVLAN_PRIO, /* u8 */ + TCA_FLOWER_KEY_CVLAN_ETH_TYPE, /* be16 */ + __TCA_FLOWER_MAX, }; diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index e93b13d2cb81..487a152a852c 100644 --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@ -35,6 +35,7 @@ struct fl_flow_key { struct flow_dissector_key_basic basic; struct flow_dissector_key_eth_addrs eth; struct flow_dissector_key_vlan vlan; + struct flow_dissector_key_vlan cvlan; union { struct flow_dissector_key_ipv4_addrs ipv4; struct flow_dissector_key_ipv6_addrs ipv6; @@ -449,6 +450,9 @@ static const struct nla_policy fl_policy[TCA_FLOWER_MAX + 1] = { [TCA_FLOWER_KEY_IP_TOS_MASK] = { .type = NLA_U8 }, [TCA_FLOWER_KEY_IP_TTL] = { .type = NLA_U8 }, [TCA_FLOWER_KEY_IP_TTL_MASK] = { .type = NLA_U8 }, + [TCA_FLOWER_KEY_CVLAN_ID] = { .type = NLA_U16 }, + [TCA_FLOWER_KEY_CVLAN_PRIO] = { .type = NLA_U8 }, + [TCA_FLOWER_KEY_CVLAN_ETH_TYPE] = { .type = NLA_U16 }, }; static void fl_set_key_val(struct nlattr **tb, @@ -501,19 +505,20 @@ static int fl_set_key_mpls(struct nlattr **tb, static void fl_set_key_vlan(struct nlattr **tb, __be16 ethertype, + int vlan_id_key, int vlan_prio_key, struct flow_dissector_key_vlan *key_val, struct flow_dissector_key_vlan *key_mask) { #define VLAN_PRIORITY_MASK 0x7 - if (tb[TCA_FLOWER_KEY_VLAN_ID]) { + if (tb[vlan_id_key]) { key_val->vlan_id = - nla_get_u16(tb[TCA_FLOWER_KEY_VLAN_ID]) & VLAN_VID_MASK; + nla_get_u16(tb[vlan_id_key]) & VLAN_VID_MASK; key_mask->vlan_id = VLAN_VID_MASK; } - if (tb[TCA_FLOWER_KEY_VLAN_PRIO]) { + if (tb[vlan_prio_key]) { key_val->vlan_priority = - nla_get_u8(tb[TCA_FLOWER_KEY_VLAN_PRIO]) & + nla_get_u8(tb[vlan_prio_key]) & VLAN_PRIORITY_MASK; key_mask->vlan_priority = VLAN_PRIORITY_MASK; } @@ -596,11 +601,25 @@ static int fl_set_key(struct net *net, struct nlattr **tb, ethertype = nla_get_be16(tb[TCA_FLOWER_KEY_ETH_TYPE]); if (eth_type_vlan(ethertype)) { - fl_set_key_vlan(tb, ethertype, &key->vlan, &mask->vlan); - fl_set_key_val(tb, &key->basic.n_proto, - TCA_FLOWER_KEY_VLAN_ETH_TYPE, - &mask->basic.n_proto, TCA_FLOWER_UNSPEC, - sizeof(key->basic.n_proto)); + fl_set_key_vlan(tb, ethertype, TCA_FLOWER_KEY_VLAN_ID, + TCA_FLOWER_KEY_VLAN_PRIO, &key->vlan, + &mask->vlan); + + ethertype = nla_get_be16(tb[TCA_FLOWER_KEY_VLAN_ETH_TYPE]); + if (eth_type_vlan(ethertype)) { + fl_set_key_vlan(tb, ethertype, + TCA_FLOWER_KEY_CVLAN_ID, + TCA_FLOWER_KEY_CVLAN_PRIO, + &key->cvlan, &mask->cvlan); + fl_set_key_val(tb, &key->basic.n_proto, + TCA_FLOWER_KEY_CVLAN_ETH_TYPE, + &mask->basic.n_proto, + TCA_FLOWER_UNSPEC, + sizeof(key->basic.n_proto)); + } else { + key->basic.n_proto = ethertype; + mask->basic.n_proto = cpu_to_be16(~0); + } } else { key->basic.n_proto = ethertype; mask->basic.n_proto = cpu_to_be16(~0); @@ -825,6 +844,8 @@ static void fl_init_dissector(struct fl_flow_mask *mask) FLOW_DISSECTOR_KEY_MPLS, mpls); FL_KEY_SET_IF_MASKED(&mask->key, keys, cnt, FLOW_DISSECTOR_KEY_VLAN, vlan); + FL_KEY_SET_IF_MASKED(&mask->key, keys, cnt, + FLOW_DISSECTOR_KEY_CVLAN, cvlan); 
FL_KEY_SET_IF_MASKED(&mask->key, keys, cnt, FLOW_DISSECTOR_KEY_ENC_KEYID, enc_key_id); FL_KEY_SET_IF_MASKED(&mask->key, keys, cnt, @@ -1201,6 +1222,7 @@ static int fl_dump_key_ip(struct sk_buff *skb, } static int fl_dump_key_vlan(struct sk_buff *skb, + int vlan_id_key, int vlan_prio_key, struct flow_dissector_key_vlan *vlan_key, struct flow_dissector_key_vlan *vlan_mask) { @@ -1209,13 +1231,13 @@ static int fl_dump_key_vlan(struct sk_buff *skb, if (!memchr_inv(vlan_mask, 0, sizeof(*vlan_mask))) return 0; if (vlan_mask->vlan_id) { - err = nla_put_u16(skb, TCA_FLOWER_KEY_VLAN_ID, + err = nla_put_u16(skb, vlan_id_key, vlan_key->vlan_id); if (err) return err; } if (vlan_mask->vlan_priority) { - err = nla_put_u8(skb, TCA_FLOWER_KEY_VLAN_PRIO, + err = nla_put_u8(skb, vlan_prio_key, vlan_key->vlan_priority); if (err) return err; @@ -1310,13 +1332,28 @@ static int fl_dump(struct net *net, struct tcf_proto *tp, void *fh, if (fl_dump_key_mpls(skb, &key->mpls, &mask->mpls)) goto nla_put_failure; - if (fl_dump_key_vlan(skb, &key->vlan, &mask->vlan)) + if (fl_dump_key_vlan(skb, TCA_FLOWER_KEY_VLAN_ID, + TCA_FLOWER_KEY_VLAN_PRIO, &key->vlan, &mask->vlan)) goto nla_put_failure; - if (mask->vlan.vlan_tpid && - nla_put_be16(skb, TCA_FLOWER_KEY_VLAN_ETH_TYPE, key->basic.n_proto)) + if (fl_dump_key_vlan(skb, TCA_FLOWER_KEY_CVLAN_ID, + TCA_FLOWER_KEY_CVLAN_PRIO, + &key->cvlan, &mask->cvlan) || + (mask->cvlan.vlan_tpid && + nla_put_u16(skb, TCA_FLOWER_KEY_VLAN_ETH_TYPE, + key->cvlan.vlan_tpid))) goto nla_put_failure; + if (mask->cvlan.vlan_tpid) { + if (nla_put_be16(skb, TCA_FLOWER_KEY_CVLAN_ETH_TYPE, + key->basic.n_proto)) + goto nla_put_failure; + } else if (mask->vlan.vlan_tpid) { + if (nla_put_be16(skb, TCA_FLOWER_KEY_VLAN_ETH_TYPE, + key->basic.n_proto)) + goto nla_put_failure; + } + if ((key->basic.n_proto == htons(ETH_P_IP) || key->basic.n_proto == htons(ETH_P_IPV6)) && (fl_dump_key_val(skb, &key->basic.ip_proto, TCA_FLOWER_KEY_IP_PROTO, -- cgit v1.2.3 From eec94fdb04806790c7b7e6ea347820064cc6d467 Mon Sep 17 00:00:00 2001 From: Vlad Buslov Date: Thu, 5 Jul 2018 17:24:23 +0300 Subject: net: sched: use rcu for action cookie update Implement functions to atomically update and free action cookie using rcu mechanism. Reviewed-by: Marcelo Ricardo Leitner Signed-off-by: Vlad Buslov Signed-off-by: Jiri Pirko Signed-off-by: David S. 
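The idiom being introduced is the usual publish-then-defer-free RCU pattern. A generic hedged sketch, with hypothetical names mirroring tcf_set_action_cookie() below:

#include <linux/kernel.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>

struct demo_obj {
	int payload;
	struct rcu_head rcu;
};

static void demo_free_rcu(struct rcu_head *p)
{
	kfree(container_of(p, struct demo_obj, rcu));
}

static void demo_update(struct demo_obj __rcu **slot, struct demo_obj *new_obj)
{
	struct demo_obj *old;

	/* Atomically publish the replacement... */
	old = xchg(slot, new_obj);
	/* ...and free the old object only after current readers are done. */
	if (old)
		call_rcu(&old->rcu, demo_free_rcu);
}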
Miller --- include/net/act_api.h | 2 +- include/net/pkt_cls.h | 1 + net/sched/act_api.c | 44 ++++++++++++++++++++++++++++++-------------- 3 files changed, 32 insertions(+), 15 deletions(-) (limited to 'net/sched') diff --git a/include/net/act_api.h b/include/net/act_api.h index 5ff11adbe2a6..ffc3ef321776 100644 --- a/include/net/act_api.h +++ b/include/net/act_api.h @@ -37,7 +37,7 @@ struct tc_action { spinlock_t tcfa_lock; struct gnet_stats_basic_cpu __percpu *cpu_bstats; struct gnet_stats_queue __percpu *cpu_qstats; - struct tc_cookie *act_cookie; + struct tc_cookie __rcu *act_cookie; struct tcf_chain *goto_chain; }; #define tcf_index common.tcfa_index diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h index 6641584b27f1..2081e4219f81 100644 --- a/include/net/pkt_cls.h +++ b/include/net/pkt_cls.h @@ -781,6 +781,7 @@ struct tc_mqprio_qopt_offload { struct tc_cookie { u8 *data; u32 len; + struct rcu_head rcu; }; struct tc_qopt_offload_stats { diff --git a/net/sched/act_api.c b/net/sched/act_api.c index 3f4cf930f809..02670c7489e3 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -55,6 +55,24 @@ static void tcf_action_goto_chain_exec(const struct tc_action *a, res->goto_tp = rcu_dereference_bh(chain->filter_chain); } +static void tcf_free_cookie_rcu(struct rcu_head *p) +{ + struct tc_cookie *cookie = container_of(p, struct tc_cookie, rcu); + + kfree(cookie->data); + kfree(cookie); +} + +static void tcf_set_action_cookie(struct tc_cookie __rcu **old_cookie, + struct tc_cookie *new_cookie) +{ + struct tc_cookie *old; + + old = xchg(old_cookie, new_cookie); + if (old) + call_rcu(&old->rcu, tcf_free_cookie_rcu); +} + /* XXX: For standalone actions, we don't need a RCU grace period either, because * actions are always connected to filters and filters are already destroyed in * RCU callbacks, so after a RCU grace period actions are already disconnected @@ -65,10 +83,7 @@ static void free_tcf(struct tc_action *p) free_percpu(p->cpu_bstats); free_percpu(p->cpu_qstats); - if (p->act_cookie) { - kfree(p->act_cookie->data); - kfree(p->act_cookie); - } + tcf_set_action_cookie(&p->act_cookie, NULL); if (p->goto_chain) tcf_action_goto_chain_fini(p); @@ -567,16 +582,22 @@ tcf_action_dump_1(struct sk_buff *skb, struct tc_action *a, int bind, int ref) int err = -EINVAL; unsigned char *b = skb_tail_pointer(skb); struct nlattr *nest; + struct tc_cookie *cookie; if (nla_put_string(skb, TCA_KIND, a->ops->kind)) goto nla_put_failure; if (tcf_action_copy_stats(skb, a, 0)) goto nla_put_failure; - if (a->act_cookie) { - if (nla_put(skb, TCA_ACT_COOKIE, a->act_cookie->len, - a->act_cookie->data)) + + rcu_read_lock(); + cookie = rcu_dereference(a->act_cookie); + if (cookie) { + if (nla_put(skb, TCA_ACT_COOKIE, cookie->len, cookie->data)) { + rcu_read_unlock(); goto nla_put_failure; + } } + rcu_read_unlock(); nest = nla_nest_start(skb, TCA_OPTIONS); if (nest == NULL) @@ -719,13 +740,8 @@ struct tc_action *tcf_action_init_1(struct net *net, struct tcf_proto *tp, if (err < 0) goto err_mod; - if (name == NULL && tb[TCA_ACT_COOKIE]) { - if (a->act_cookie) { - kfree(a->act_cookie->data); - kfree(a->act_cookie); - } - a->act_cookie = cookie; - } + if (!name && tb[TCA_ACT_COOKIE]) + tcf_set_action_cookie(&a->act_cookie, cookie); /* module count goes up only when brand new policy is created * if it exists and is only bound to in a_o->init() then -- cgit v1.2.3 From 036bb44327f50273e85ee4a2c9b56eebce1c0838 Mon Sep 17 00:00:00 2001 From: Vlad Buslov Date: Thu, 5 Jul 2018 17:24:24 +0300 Subject: net: sched: 
change type of reference and bind counters Change type of action reference counter to refcount_t. Change type of action bind counter to atomic_t. This type is used to allow decrementing bind counter without testing for 0 result. Reviewed-by: Marcelo Ricardo Leitner Signed-off-by: Vlad Buslov Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- include/net/act_api.h | 5 +++-- net/sched/act_api.c | 32 ++++++++++++++++++++++---------- net/sched/act_bpf.c | 4 ++-- net/sched/act_connmark.c | 4 ++-- net/sched/act_csum.c | 4 ++-- net/sched/act_gact.c | 4 ++-- net/sched/act_ife.c | 4 ++-- net/sched/act_ipt.c | 4 ++-- net/sched/act_mirred.c | 4 ++-- net/sched/act_nat.c | 4 ++-- net/sched/act_pedit.c | 4 ++-- net/sched/act_police.c | 4 ++-- net/sched/act_sample.c | 4 ++-- net/sched/act_simple.c | 4 ++-- net/sched/act_skbedit.c | 4 ++-- net/sched/act_skbmod.c | 4 ++-- net/sched/act_tunnel_key.c | 4 ++-- net/sched/act_vlan.c | 4 ++-- 18 files changed, 57 insertions(+), 44 deletions(-) (limited to 'net/sched') diff --git a/include/net/act_api.h b/include/net/act_api.h index ffc3ef321776..2759226527a2 100644 --- a/include/net/act_api.h +++ b/include/net/act_api.h @@ -6,6 +6,7 @@ * Public action API for classifiers/qdiscs */ +#include #include #include #include @@ -26,8 +27,8 @@ struct tc_action { struct tcf_idrinfo *idrinfo; u32 tcfa_index; - int tcfa_refcnt; - int tcfa_bindcnt; + refcount_t tcfa_refcnt; + atomic_t tcfa_bindcnt; u32 tcfa_capab; int tcfa_action; struct tcf_t tcfa_tm; diff --git a/net/sched/act_api.c b/net/sched/act_api.c index 02670c7489e3..4f064ecab882 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -105,14 +105,26 @@ int __tcf_idr_release(struct tc_action *p, bool bind, bool strict) ASSERT_RTNL(); + /* Release with strict==1 and bind==0 is only called through act API + * interface (classifiers always bind). Only case when action with + * positive reference count and zero bind count can exist is when it was + * also created with act API (unbinding last classifier will destroy the + * action if it was created by classifier). So only case when bind count + * can be changed after initial check is when unbound action is + * destroyed by act API while classifier binds to action with same id + * concurrently. This result either creation of new action(same behavior + * as before), or reusing existing action if concurrent process + * increments reference count before action is deleted. Both scenarios + * are acceptable. 
+ */ if (p) { if (bind) - p->tcfa_bindcnt--; - else if (strict && p->tcfa_bindcnt > 0) + atomic_dec(&p->tcfa_bindcnt); + else if (strict && atomic_read(&p->tcfa_bindcnt) > 0) return -EPERM; - p->tcfa_refcnt--; - if (p->tcfa_bindcnt <= 0 && p->tcfa_refcnt <= 0) { + if (atomic_read(&p->tcfa_bindcnt) <= 0 && + refcount_dec_and_test(&p->tcfa_refcnt)) { if (p->ops->cleanup) p->ops->cleanup(p); tcf_idr_remove(p->idrinfo, p); @@ -304,8 +316,8 @@ bool tcf_idr_check(struct tc_action_net *tn, u32 index, struct tc_action **a, if (index && p) { if (bind) - p->tcfa_bindcnt++; - p->tcfa_refcnt++; + atomic_inc(&p->tcfa_bindcnt); + refcount_inc(&p->tcfa_refcnt); *a = p; return true; } @@ -324,9 +336,9 @@ int tcf_idr_create(struct tc_action_net *tn, u32 index, struct nlattr *est, if (unlikely(!p)) return -ENOMEM; - p->tcfa_refcnt = 1; + refcount_set(&p->tcfa_refcnt, 1); if (bind) - p->tcfa_bindcnt = 1; + atomic_set(&p->tcfa_bindcnt, 1); if (cpustats) { p->cpu_bstats = netdev_alloc_pcpu_stats(struct gnet_stats_basic_cpu); @@ -782,7 +794,7 @@ static void cleanup_a(struct list_head *actions, int ovr) return; list_for_each_entry(a, actions, list) - a->tcfa_refcnt--; + refcount_dec(&a->tcfa_refcnt); } int tcf_action_init(struct net *net, struct tcf_proto *tp, struct nlattr *nla, @@ -810,7 +822,7 @@ int tcf_action_init(struct net *net, struct tcf_proto *tp, struct nlattr *nla, act->order = i; sz += tcf_action_fill_size(act); if (ovr) - act->tcfa_refcnt++; + refcount_inc(&act->tcfa_refcnt); list_add_tail(&act->list, actions); } diff --git a/net/sched/act_bpf.c b/net/sched/act_bpf.c index 18089c02e557..15a2a53cbde1 100644 --- a/net/sched/act_bpf.c +++ b/net/sched/act_bpf.c @@ -141,8 +141,8 @@ static int tcf_bpf_dump(struct sk_buff *skb, struct tc_action *act, struct tcf_bpf *prog = to_bpf(act); struct tc_act_bpf opt = { .index = prog->tcf_index, - .refcnt = prog->tcf_refcnt - ref, - .bindcnt = prog->tcf_bindcnt - bind, + .refcnt = refcount_read(&prog->tcf_refcnt) - ref, + .bindcnt = atomic_read(&prog->tcf_bindcnt) - bind, .action = prog->tcf_action, }; struct tcf_t tm; diff --git a/net/sched/act_connmark.c b/net/sched/act_connmark.c index e4b880fa51fe..188865034f9a 100644 --- a/net/sched/act_connmark.c +++ b/net/sched/act_connmark.c @@ -154,8 +154,8 @@ static inline int tcf_connmark_dump(struct sk_buff *skb, struct tc_action *a, struct tc_connmark opt = { .index = ci->tcf_index, - .refcnt = ci->tcf_refcnt - ref, - .bindcnt = ci->tcf_bindcnt - bind, + .refcnt = refcount_read(&ci->tcf_refcnt) - ref, + .bindcnt = atomic_read(&ci->tcf_bindcnt) - bind, .action = ci->tcf_action, .zone = ci->zone, }; diff --git a/net/sched/act_csum.c b/net/sched/act_csum.c index 526a8e491626..da865f7b390a 100644 --- a/net/sched/act_csum.c +++ b/net/sched/act_csum.c @@ -597,8 +597,8 @@ static int tcf_csum_dump(struct sk_buff *skb, struct tc_action *a, int bind, struct tcf_csum_params *params; struct tc_csum opt = { .index = p->tcf_index, - .refcnt = p->tcf_refcnt - ref, - .bindcnt = p->tcf_bindcnt - bind, + .refcnt = refcount_read(&p->tcf_refcnt) - ref, + .bindcnt = atomic_read(&p->tcf_bindcnt) - bind, }; struct tcf_t t; diff --git a/net/sched/act_gact.c b/net/sched/act_gact.c index 4dc4f153cad8..ca83debd5a70 100644 --- a/net/sched/act_gact.c +++ b/net/sched/act_gact.c @@ -169,8 +169,8 @@ static int tcf_gact_dump(struct sk_buff *skb, struct tc_action *a, struct tcf_gact *gact = to_gact(a); struct tc_gact opt = { .index = gact->tcf_index, - .refcnt = gact->tcf_refcnt - ref, - .bindcnt = gact->tcf_bindcnt - bind, + .refcnt = 
refcount_read(&gact->tcf_refcnt) - ref, + .bindcnt = atomic_read(&gact->tcf_bindcnt) - bind, .action = gact->tcf_action, }; struct tcf_t t; diff --git a/net/sched/act_ife.c b/net/sched/act_ife.c index 20d7d36b2fc9..3536a23f46b5 100644 --- a/net/sched/act_ife.c +++ b/net/sched/act_ife.c @@ -596,8 +596,8 @@ static int tcf_ife_dump(struct sk_buff *skb, struct tc_action *a, int bind, struct tcf_ife_params *p = rtnl_dereference(ife->params); struct tc_ife opt = { .index = ife->tcf_index, - .refcnt = ife->tcf_refcnt - ref, - .bindcnt = ife->tcf_bindcnt - bind, + .refcnt = refcount_read(&ife->tcf_refcnt) - ref, + .bindcnt = atomic_read(&ife->tcf_bindcnt) - bind, .action = ife->tcf_action, .flags = p->flags, }; diff --git a/net/sched/act_ipt.c b/net/sched/act_ipt.c index 14c312d7908f..7bce88dc11c9 100644 --- a/net/sched/act_ipt.c +++ b/net/sched/act_ipt.c @@ -280,8 +280,8 @@ static int tcf_ipt_dump(struct sk_buff *skb, struct tc_action *a, int bind, if (unlikely(!t)) goto nla_put_failure; - c.bindcnt = ipt->tcf_bindcnt - bind; - c.refcnt = ipt->tcf_refcnt - ref; + c.bindcnt = atomic_read(&ipt->tcf_bindcnt) - bind; + c.refcnt = refcount_read(&ipt->tcf_refcnt) - ref; strcpy(t->u.user.name, ipt->tcfi_t->u.kernel.target->name); if (nla_put(skb, TCA_IPT_TARG, ipt->tcfi_t->u.user.target_size, t) || diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c index fd34015331ab..82a8bdd67c47 100644 --- a/net/sched/act_mirred.c +++ b/net/sched/act_mirred.c @@ -250,8 +250,8 @@ static int tcf_mirred_dump(struct sk_buff *skb, struct tc_action *a, int bind, struct tc_mirred opt = { .index = m->tcf_index, .action = m->tcf_action, - .refcnt = m->tcf_refcnt - ref, - .bindcnt = m->tcf_bindcnt - bind, + .refcnt = refcount_read(&m->tcf_refcnt) - ref, + .bindcnt = atomic_read(&m->tcf_bindcnt) - bind, .eaction = m->tcfm_eaction, .ifindex = dev ? 
dev->ifindex : 0, }; diff --git a/net/sched/act_nat.c b/net/sched/act_nat.c index 4b5848b6c252..457c2ae3de46 100644 --- a/net/sched/act_nat.c +++ b/net/sched/act_nat.c @@ -257,8 +257,8 @@ static int tcf_nat_dump(struct sk_buff *skb, struct tc_action *a, .index = p->tcf_index, .action = p->tcf_action, - .refcnt = p->tcf_refcnt - ref, - .bindcnt = p->tcf_bindcnt - bind, + .refcnt = refcount_read(&p->tcf_refcnt) - ref, + .bindcnt = atomic_read(&p->tcf_bindcnt) - bind, }; struct tcf_t t; diff --git a/net/sched/act_pedit.c b/net/sched/act_pedit.c index e43aef28fdac..889690e0ec39 100644 --- a/net/sched/act_pedit.c +++ b/net/sched/act_pedit.c @@ -409,8 +409,8 @@ static int tcf_pedit_dump(struct sk_buff *skb, struct tc_action *a, opt->nkeys = p->tcfp_nkeys; opt->flags = p->tcfp_flags; opt->action = p->tcf_action; - opt->refcnt = p->tcf_refcnt - ref; - opt->bindcnt = p->tcf_bindcnt - bind; + opt->refcnt = refcount_read(&p->tcf_refcnt) - ref; + opt->bindcnt = atomic_read(&p->tcf_bindcnt) - bind; if (p->tcfp_keys_ex) { tcf_pedit_key_ex_dump(skb, p->tcfp_keys_ex, p->tcfp_nkeys); diff --git a/net/sched/act_police.c b/net/sched/act_police.c index 4e72bc2a0dfb..a789b8060968 100644 --- a/net/sched/act_police.c +++ b/net/sched/act_police.c @@ -274,8 +274,8 @@ static int tcf_act_police_dump(struct sk_buff *skb, struct tc_action *a, .action = police->tcf_action, .mtu = police->tcfp_mtu, .burst = PSCHED_NS2TICKS(police->tcfp_burst), - .refcnt = police->tcf_refcnt - ref, - .bindcnt = police->tcf_bindcnt - bind, + .refcnt = refcount_read(&police->tcf_refcnt) - ref, + .bindcnt = atomic_read(&police->tcf_bindcnt) - bind, }; struct tcf_t t; diff --git a/net/sched/act_sample.c b/net/sched/act_sample.c index 5db358497c9e..4a46978db092 100644 --- a/net/sched/act_sample.c +++ b/net/sched/act_sample.c @@ -173,8 +173,8 @@ static int tcf_sample_dump(struct sk_buff *skb, struct tc_action *a, struct tc_sample opt = { .index = s->tcf_index, .action = s->tcf_action, - .refcnt = s->tcf_refcnt - ref, - .bindcnt = s->tcf_bindcnt - bind, + .refcnt = refcount_read(&s->tcf_refcnt) - ref, + .bindcnt = atomic_read(&s->tcf_bindcnt) - bind, }; struct tcf_t t; diff --git a/net/sched/act_simple.c b/net/sched/act_simple.c index 98c4afe7c15b..c3a761097b01 100644 --- a/net/sched/act_simple.c +++ b/net/sched/act_simple.c @@ -145,8 +145,8 @@ static int tcf_simp_dump(struct sk_buff *skb, struct tc_action *a, struct tcf_defact *d = to_defact(a); struct tc_defact opt = { .index = d->tcf_index, - .refcnt = d->tcf_refcnt - ref, - .bindcnt = d->tcf_bindcnt - bind, + .refcnt = refcount_read(&d->tcf_refcnt) - ref, + .bindcnt = atomic_read(&d->tcf_bindcnt) - bind, .action = d->tcf_action, }; struct tcf_t t; diff --git a/net/sched/act_skbedit.c b/net/sched/act_skbedit.c index dfaf5d8028dd..cfd20d3d2ca9 100644 --- a/net/sched/act_skbedit.c +++ b/net/sched/act_skbedit.c @@ -208,8 +208,8 @@ static int tcf_skbedit_dump(struct sk_buff *skb, struct tc_action *a, struct tcf_skbedit *d = to_skbedit(a); struct tc_skbedit opt = { .index = d->tcf_index, - .refcnt = d->tcf_refcnt - ref, - .bindcnt = d->tcf_bindcnt - bind, + .refcnt = refcount_read(&d->tcf_refcnt) - ref, + .bindcnt = atomic_read(&d->tcf_bindcnt) - bind, .action = d->tcf_action, }; struct tcf_t t; diff --git a/net/sched/act_skbmod.c b/net/sched/act_skbmod.c index ad050d7d4b46..ff90d720eda3 100644 --- a/net/sched/act_skbmod.c +++ b/net/sched/act_skbmod.c @@ -205,8 +205,8 @@ static int tcf_skbmod_dump(struct sk_buff *skb, struct tc_action *a, struct tcf_skbmod_params *p = 
rtnl_dereference(d->skbmod_p); struct tc_skbmod opt = { .index = d->tcf_index, - .refcnt = d->tcf_refcnt - ref, - .bindcnt = d->tcf_bindcnt - bind, + .refcnt = refcount_read(&d->tcf_refcnt) - ref, + .bindcnt = atomic_read(&d->tcf_bindcnt) - bind, .action = d->tcf_action, }; struct tcf_t t; diff --git a/net/sched/act_tunnel_key.c b/net/sched/act_tunnel_key.c index ea203e386a92..2354f07eba15 100644 --- a/net/sched/act_tunnel_key.c +++ b/net/sched/act_tunnel_key.c @@ -474,8 +474,8 @@ static int tunnel_key_dump(struct sk_buff *skb, struct tc_action *a, struct tcf_tunnel_key_params *params; struct tc_tunnel_key opt = { .index = t->tcf_index, - .refcnt = t->tcf_refcnt - ref, - .bindcnt = t->tcf_bindcnt - bind, + .refcnt = refcount_read(&t->tcf_refcnt) - ref, + .bindcnt = atomic_read(&t->tcf_bindcnt) - bind, }; struct tcf_t tm; diff --git a/net/sched/act_vlan.c b/net/sched/act_vlan.c index 1fb39e1f9d07..799e3deb44ac 100644 --- a/net/sched/act_vlan.c +++ b/net/sched/act_vlan.c @@ -239,8 +239,8 @@ static int tcf_vlan_dump(struct sk_buff *skb, struct tc_action *a, struct tcf_vlan_params *p = rtnl_dereference(v->vlan_p); struct tc_vlan opt = { .index = v->tcf_index, - .refcnt = v->tcf_refcnt - ref, - .bindcnt = v->tcf_bindcnt - bind, + .refcnt = refcount_read(&v->tcf_refcnt) - ref, + .bindcnt = atomic_read(&v->tcf_bindcnt) - bind, .action = v->tcf_action, .v_action = p->tcfv_action, }; -- cgit v1.2.3 From 789871bb2a0381425b106d2a995bde1460d35a34 Mon Sep 17 00:00:00 2001 From: Vlad Buslov Date: Thu, 5 Jul 2018 17:24:25 +0300 Subject: net: sched: implement unlocked action init API Add an additional 'rtnl_held' argument to act API init functions. It is required to implement actions that need to release the rtnl lock before loading a kernel module and reacquire it afterwards. Reviewed-by: Marcelo Ricardo Leitner Signed-off-by: Vlad Buslov Signed-off-by: Jiri Pirko Signed-off-by: David S.
Miller --- include/net/act_api.h | 6 ++++-- net/sched/act_api.c | 18 +++++++++++------- net/sched/act_bpf.c | 3 ++- net/sched/act_connmark.c | 2 +- net/sched/act_csum.c | 3 ++- net/sched/act_gact.c | 3 ++- net/sched/act_ife.c | 3 ++- net/sched/act_ipt.c | 6 ++++-- net/sched/act_mirred.c | 5 +++-- net/sched/act_nat.c | 2 +- net/sched/act_pedit.c | 3 ++- net/sched/act_police.c | 2 +- net/sched/act_sample.c | 3 ++- net/sched/act_simple.c | 3 ++- net/sched/act_skbedit.c | 3 ++- net/sched/act_skbmod.c | 3 ++- net/sched/act_tunnel_key.c | 3 ++- net/sched/act_vlan.c | 3 ++- net/sched/cls_api.c | 5 +++-- 19 files changed, 50 insertions(+), 29 deletions(-) (limited to 'net/sched') diff --git a/include/net/act_api.h b/include/net/act_api.h index 2759226527a2..27823f4e24c4 100644 --- a/include/net/act_api.h +++ b/include/net/act_api.h @@ -92,7 +92,8 @@ struct tc_action_ops { struct netlink_ext_ack *extack); int (*init)(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **act, int ovr, - int bind, struct netlink_ext_ack *extack); + int bind, bool rtnl_held, + struct netlink_ext_ack *extack); int (*walk)(struct net *, struct sk_buff *, struct netlink_callback *, int, const struct tc_action_ops *, @@ -168,10 +169,11 @@ int tcf_action_exec(struct sk_buff *skb, struct tc_action **actions, int tcf_action_init(struct net *net, struct tcf_proto *tp, struct nlattr *nla, struct nlattr *est, char *name, int ovr, int bind, struct list_head *actions, size_t *attr_size, - struct netlink_ext_ack *extack); + bool rtnl_held, struct netlink_ext_ack *extack); struct tc_action *tcf_action_init_1(struct net *net, struct tcf_proto *tp, struct nlattr *nla, struct nlattr *est, char *name, int ovr, int bind, + bool rtnl_held, struct netlink_ext_ack *extack); int tcf_action_dump(struct sk_buff *skb, struct list_head *, int, int); int tcf_action_dump_old(struct sk_buff *skb, struct tc_action *a, int, int); diff --git a/net/sched/act_api.c b/net/sched/act_api.c index 4f064ecab882..256b0c93916c 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -671,6 +671,7 @@ static struct tc_cookie *nla_memdup_cookie(struct nlattr **tb) struct tc_action *tcf_action_init_1(struct net *net, struct tcf_proto *tp, struct nlattr *nla, struct nlattr *est, char *name, int ovr, int bind, + bool rtnl_held, struct netlink_ext_ack *extack) { struct tc_action *a; @@ -721,9 +722,11 @@ struct tc_action *tcf_action_init_1(struct net *net, struct tcf_proto *tp, a_o = tc_lookup_action_n(act_name); if (a_o == NULL) { #ifdef CONFIG_MODULES - rtnl_unlock(); + if (rtnl_held) + rtnl_unlock(); request_module("act_%s", act_name); - rtnl_lock(); + if (rtnl_held) + rtnl_lock(); a_o = tc_lookup_action_n(act_name); @@ -746,9 +749,10 @@ struct tc_action *tcf_action_init_1(struct net *net, struct tcf_proto *tp, /* backward compatibility for policer */ if (name == NULL) err = a_o->init(net, tb[TCA_ACT_OPTIONS], est, &a, ovr, bind, - extack); + rtnl_held, extack); else - err = a_o->init(net, nla, est, &a, ovr, bind, extack); + err = a_o->init(net, nla, est, &a, ovr, bind, rtnl_held, + extack); if (err < 0) goto err_mod; @@ -800,7 +804,7 @@ static void cleanup_a(struct list_head *actions, int ovr) int tcf_action_init(struct net *net, struct tcf_proto *tp, struct nlattr *nla, struct nlattr *est, char *name, int ovr, int bind, struct list_head *actions, size_t *attr_size, - struct netlink_ext_ack *extack) + bool rtnl_held, struct netlink_ext_ack *extack) { struct nlattr *tb[TCA_ACT_MAX_PRIO + 1]; struct tc_action *act; @@ -814,7 +818,7 @@ int 
tcf_action_init(struct net *net, struct tcf_proto *tp, struct nlattr *nla, for (i = 1; i <= TCA_ACT_MAX_PRIO && tb[i]; i++) { act = tcf_action_init_1(net, tp, tb[i], est, name, ovr, bind, - extack); + rtnl_held, extack); if (IS_ERR(act)) { err = PTR_ERR(act); goto err; @@ -1173,7 +1177,7 @@ static int tcf_action_add(struct net *net, struct nlattr *nla, LIST_HEAD(actions); ret = tcf_action_init(net, NULL, nla, NULL, NULL, ovr, 0, &actions, - &attr_size, extack); + &attr_size, true, extack); if (ret) return ret; diff --git a/net/sched/act_bpf.c b/net/sched/act_bpf.c index 15a2a53cbde1..8ebf40a3506c 100644 --- a/net/sched/act_bpf.c +++ b/net/sched/act_bpf.c @@ -276,7 +276,8 @@ static void tcf_bpf_prog_fill_cfg(const struct tcf_bpf *prog, static int tcf_bpf_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **act, - int replace, int bind, struct netlink_ext_ack *extack) + int replace, int bind, bool rtnl_held, + struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, bpf_net_id); struct nlattr *tb[TCA_ACT_BPF_MAX + 1]; diff --git a/net/sched/act_connmark.c b/net/sched/act_connmark.c index 188865034f9a..e3787aa0025a 100644 --- a/net/sched/act_connmark.c +++ b/net/sched/act_connmark.c @@ -96,7 +96,7 @@ static const struct nla_policy connmark_policy[TCA_CONNMARK_MAX + 1] = { static int tcf_connmark_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, - int ovr, int bind, + int ovr, int bind, bool rtnl_held, struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, connmark_net_id); diff --git a/net/sched/act_csum.c b/net/sched/act_csum.c index da865f7b390a..334261943f9f 100644 --- a/net/sched/act_csum.c +++ b/net/sched/act_csum.c @@ -46,7 +46,8 @@ static struct tc_action_ops act_csum_ops; static int tcf_csum_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, int ovr, - int bind, struct netlink_ext_ack *extack) + int bind, bool rtnl_held, + struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, csum_net_id); struct tcf_csum_params *params_old, *params_new; diff --git a/net/sched/act_gact.c b/net/sched/act_gact.c index ca83debd5a70..b4dfb2b4addc 100644 --- a/net/sched/act_gact.c +++ b/net/sched/act_gact.c @@ -56,7 +56,8 @@ static const struct nla_policy gact_policy[TCA_GACT_MAX + 1] = { static int tcf_gact_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, - int ovr, int bind, struct netlink_ext_ack *extack) + int ovr, int bind, bool rtnl_held, + struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, gact_net_id); struct nlattr *tb[TCA_GACT_MAX + 1]; diff --git a/net/sched/act_ife.c b/net/sched/act_ife.c index 3536a23f46b5..576ffbba61c3 100644 --- a/net/sched/act_ife.c +++ b/net/sched/act_ife.c @@ -448,7 +448,8 @@ static int populate_metalist(struct tcf_ife_info *ife, struct nlattr **tb, static int tcf_ife_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, - int ovr, int bind, struct netlink_ext_ack *extack) + int ovr, int bind, bool rtnl_held, + struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, ife_net_id); struct nlattr *tb[TCA_IFE_MAX + 1]; diff --git a/net/sched/act_ipt.c b/net/sched/act_ipt.c index 7bce88dc11c9..9c21663a86a6 100644 --- a/net/sched/act_ipt.c +++ b/net/sched/act_ipt.c @@ -196,7 +196,8 @@ err1: static int tcf_ipt_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, int ovr, - int 
bind, struct netlink_ext_ack *extack) + int bind, bool rtnl_held, + struct netlink_ext_ack *extack) { return __tcf_ipt_init(net, ipt_net_id, nla, est, a, &act_ipt_ops, ovr, bind); @@ -204,7 +205,8 @@ static int tcf_ipt_init(struct net *net, struct nlattr *nla, static int tcf_xt_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, int ovr, - int bind, struct netlink_ext_ack *extack) + int bind, bool unlocked, + struct netlink_ext_ack *extack) { return __tcf_ipt_init(net, xt_net_id, nla, est, a, &act_xt_ops, ovr, bind); diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c index 82a8bdd67c47..5434f08f2eb7 100644 --- a/net/sched/act_mirred.c +++ b/net/sched/act_mirred.c @@ -68,8 +68,9 @@ static unsigned int mirred_net_id; static struct tc_action_ops act_mirred_ops; static int tcf_mirred_init(struct net *net, struct nlattr *nla, - struct nlattr *est, struct tc_action **a, int ovr, - int bind, struct netlink_ext_ack *extack) + struct nlattr *est, struct tc_action **a, + int ovr, int bind, bool rtnl_held, + struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, mirred_net_id); struct nlattr *tb[TCA_MIRRED_MAX + 1]; diff --git a/net/sched/act_nat.c b/net/sched/act_nat.c index 457c2ae3de46..e6487ad1e4a8 100644 --- a/net/sched/act_nat.c +++ b/net/sched/act_nat.c @@ -38,7 +38,7 @@ static const struct nla_policy nat_policy[TCA_NAT_MAX + 1] = { static int tcf_nat_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, int ovr, int bind, - struct netlink_ext_ack *extack) + bool rtnl_held, struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, nat_net_id); struct nlattr *tb[TCA_NAT_MAX + 1]; diff --git a/net/sched/act_pedit.c b/net/sched/act_pedit.c index 889690e0ec39..f7965f35585b 100644 --- a/net/sched/act_pedit.c +++ b/net/sched/act_pedit.c @@ -132,7 +132,8 @@ static int tcf_pedit_key_ex_dump(struct sk_buff *skb, static int tcf_pedit_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, - int ovr, int bind, struct netlink_ext_ack *extack) + int ovr, int bind, bool rtnl_held, + struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, pedit_net_id); struct nlattr *tb[TCA_PEDIT_MAX + 1]; diff --git a/net/sched/act_police.c b/net/sched/act_police.c index a789b8060968..0e1c2fb0ebea 100644 --- a/net/sched/act_police.c +++ b/net/sched/act_police.c @@ -75,7 +75,7 @@ static const struct nla_policy police_policy[TCA_POLICE_MAX + 1] = { static int tcf_act_police_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, - int ovr, int bind, + int ovr, int bind, bool rtnl_held, struct netlink_ext_ack *extack) { int ret = 0, err; diff --git a/net/sched/act_sample.c b/net/sched/act_sample.c index 4a46978db092..316fc645595d 100644 --- a/net/sched/act_sample.c +++ b/net/sched/act_sample.c @@ -37,7 +37,8 @@ static const struct nla_policy sample_policy[TCA_SAMPLE_MAX + 1] = { static int tcf_sample_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, int ovr, - int bind, struct netlink_ext_ack *extack) + int bind, bool rtnl_held, + struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, sample_net_id); struct nlattr *tb[TCA_SAMPLE_MAX + 1]; diff --git a/net/sched/act_simple.c b/net/sched/act_simple.c index c3a761097b01..dc591cc87f4a 100644 --- a/net/sched/act_simple.c +++ b/net/sched/act_simple.c @@ -79,7 +79,8 @@ static const struct nla_policy simple_policy[TCA_DEF_MAX + 1] = { 
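/*
 * Illustrative aside, not a hunk of this patch: the 'rtnl_held' flag
 * that every init callback now receives exists because
 * tcf_action_init_1() must not hold rtnl across the sleeping
 * request_module() call, and may only drop/retake the lock when the
 * caller actually holds it. A sketch of that pattern, with the helper
 * name lookup_or_load_act() being hypothetical:
 */
static struct tc_action_ops *lookup_or_load_act(const char *kind, bool rtnl_held)
{
	struct tc_action_ops *ops = tc_lookup_action_n(kind);

#ifdef CONFIG_MODULES
	if (!ops) {
		if (rtnl_held)
			rtnl_unlock();	/* only release what we actually hold */
		request_module("act_%s", kind);
		if (rtnl_held)
			rtnl_lock();
		/* the module, if it loaded, has registered its ops by now */
		ops = tc_lookup_action_n(kind);
	}
#endif
	return ops;	/* may still be NULL if no module provides the kind */
}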
static int tcf_simp_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, - int ovr, int bind, struct netlink_ext_ack *extack) + int ovr, int bind, bool rtnl_held, + struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, simp_net_id); struct nlattr *tb[TCA_DEF_MAX + 1]; diff --git a/net/sched/act_skbedit.c b/net/sched/act_skbedit.c index cfd20d3d2ca9..c4ae4bd830aa 100644 --- a/net/sched/act_skbedit.c +++ b/net/sched/act_skbedit.c @@ -94,7 +94,8 @@ static const struct nla_policy skbedit_policy[TCA_SKBEDIT_MAX + 1] = { static int tcf_skbedit_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, - int ovr, int bind, struct netlink_ext_ack *extack) + int ovr, int bind, bool rtnl_held, + struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, skbedit_net_id); struct nlattr *tb[TCA_SKBEDIT_MAX + 1]; diff --git a/net/sched/act_skbmod.c b/net/sched/act_skbmod.c index ff90d720eda3..026d6f58eda1 100644 --- a/net/sched/act_skbmod.c +++ b/net/sched/act_skbmod.c @@ -84,7 +84,8 @@ static const struct nla_policy skbmod_policy[TCA_SKBMOD_MAX + 1] = { static int tcf_skbmod_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, - int ovr, int bind, struct netlink_ext_ack *extack) + int ovr, int bind, bool rtnl_held, + struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, skbmod_net_id); struct nlattr *tb[TCA_SKBMOD_MAX + 1]; diff --git a/net/sched/act_tunnel_key.c b/net/sched/act_tunnel_key.c index 2354f07eba15..15ea5ce0f9ed 100644 --- a/net/sched/act_tunnel_key.c +++ b/net/sched/act_tunnel_key.c @@ -201,7 +201,8 @@ static const struct nla_policy tunnel_key_policy[TCA_TUNNEL_KEY_MAX + 1] = { static int tunnel_key_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, - int ovr, int bind, struct netlink_ext_ack *extack) + int ovr, int bind, bool rtnl_held, + struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, tunnel_key_net_id); struct nlattr *tb[TCA_TUNNEL_KEY_MAX + 1]; diff --git a/net/sched/act_vlan.c b/net/sched/act_vlan.c index 799e3deb44ac..c61775250722 100644 --- a/net/sched/act_vlan.c +++ b/net/sched/act_vlan.c @@ -109,7 +109,8 @@ static const struct nla_policy vlan_policy[TCA_VLAN_MAX + 1] = { static int tcf_vlan_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, - int ovr, int bind, struct netlink_ext_ack *extack) + int ovr, int bind, bool rtnl_held, + struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, vlan_net_id); struct nlattr *tb[TCA_VLAN_MAX + 1]; diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index bbf8dda96b0e..ebc2b9dd783f 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -1632,7 +1632,7 @@ int tcf_exts_validate(struct net *net, struct tcf_proto *tp, struct nlattr **tb, if (exts->police && tb[exts->police]) { act = tcf_action_init_1(net, tp, tb[exts->police], rate_tlv, "police", ovr, - TCA_ACT_BIND, extack); + TCA_ACT_BIND, true, extack); if (IS_ERR(act)) return PTR_ERR(act); @@ -1645,7 +1645,8 @@ int tcf_exts_validate(struct net *net, struct tcf_proto *tp, struct nlattr **tb, err = tcf_action_init(net, tp, tb[exts->action], rate_tlv, NULL, ovr, TCA_ACT_BIND, - &actions, &attr_size, extack); + &actions, &attr_size, true, + extack); if (err) return err; list_for_each_entry(act, &actions, list) -- cgit v1.2.3 From 3f7c72bc4227b169ba2c924a7987324e24bbc4b2 Mon Sep 17 00:00:00 2001 From: Vlad Buslov Date: 
Thu, 5 Jul 2018 17:24:26 +0300 Subject: net: sched: always take reference to action Without rtnl lock protection it is no longer safe to use a pointer to a tc action without holding a reference to it (it can be destroyed concurrently). Remove the unsafe action idr lookup function. In its place, implement a safe tcf idr check function that atomically looks up the action in the idr and increments its reference and bind counters. Implement both action search and check using the new safe function. The reference taken by idr check is temporary and should not be accounted to userspace clients (both logically and to preserve current API behavior). Subtract this temporary reference when dumping the action to userspace, using the existing tca_get_fill function arguments. Reviewed-by: Marcelo Ricardo Leitner Signed-off-by: Vlad Buslov Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- net/sched/act_api.c | 46 ++++++++++++++++++++-------------------------- 1 file changed, 20 insertions(+), 26 deletions(-) (limited to 'net/sched') diff --git a/net/sched/act_api.c b/net/sched/act_api.c index 256b0c93916c..aa304d36fee0 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -284,44 +284,38 @@ int tcf_generic_walker(struct tc_action_net *tn, struct sk_buff *skb, } EXPORT_SYMBOL(tcf_generic_walker); -static struct tc_action *tcf_idr_lookup(u32 index, struct tcf_idrinfo *idrinfo) +static bool __tcf_idr_check(struct tc_action_net *tn, u32 index, + struct tc_action **a, int bind) { - struct tc_action *p = NULL; + struct tcf_idrinfo *idrinfo = tn->idrinfo; + struct tc_action *p; spin_lock(&idrinfo->lock); p = idr_find(&idrinfo->action_idr, index); + if (p) { + refcount_inc(&p->tcfa_refcnt); + if (bind) + atomic_inc(&p->tcfa_bindcnt); + } spin_unlock(&idrinfo->lock); - return p; + if (p) { + *a = p; + return true; + } + return false; } int tcf_idr_search(struct tc_action_net *tn, struct tc_action **a, u32 index) { - struct tcf_idrinfo *idrinfo = tn->idrinfo; - struct tc_action *p = tcf_idr_lookup(index, idrinfo); - - if (p) { - *a = p; - return 1; - } - return 0; + return __tcf_idr_check(tn, index, a, 0); } EXPORT_SYMBOL(tcf_idr_search); bool tcf_idr_check(struct tc_action_net *tn, u32 index, struct tc_action **a, int bind) { - struct tcf_idrinfo *idrinfo = tn->idrinfo; - struct tc_action *p = tcf_idr_lookup(index, idrinfo); - - if (index && p) { - if (bind) - atomic_inc(&p->tcfa_bindcnt); - refcount_inc(&p->tcfa_refcnt); - *a = p; - return true; - } - return false; + return __tcf_idr_check(tn, index, a, bind); } EXPORT_SYMBOL(tcf_idr_check); @@ -932,7 +926,7 @@ tcf_get_notify(struct net *net, u32 portid, struct nlmsghdr *n, if (!skb) return -ENOBUFS; if (tca_get_fill(skb, actions, portid, n->nlmsg_seq, 0, event, - 0, 0) <= 0) { + 0, 1) <= 0) { NL_SET_ERR_MSG(extack, "Failed to fill netlink attributes while adding TC action"); kfree_skb(skb); return -EINVAL; @@ -1072,7 +1066,7 @@ tcf_del_notify(struct net *net, struct nlmsghdr *n, struct list_head *actions, return -ENOBUFS; if (tca_get_fill(skb, actions, portid, n->nlmsg_seq, 0, RTM_DELACTION, - 0, 1) <= 0) { + 0, 2) <= 0) { NL_SET_ERR_MSG(extack, "Failed to fill netlink TC action attributes"); kfree_skb(skb); return -EINVAL; @@ -1131,14 +1125,14 @@ tca_action_gd(struct net *net, struct nlattr *nla, struct nlmsghdr *n, if (event == RTM_GETACTION) ret = tcf_get_notify(net, portid, n, &actions, event, extack); else { /* delete */ + cleanup_a(&actions, 1); /* lookup took reference */ ret = tcf_del_notify(net, n, &actions, portid, attr_size, extack); if (ret) goto err; return ret; } err: -
if (event != RTM_GETACTION) - tcf_action_destroy(&actions, 0); + tcf_action_destroy(&actions, 0); return ret; } -- cgit v1.2.3 From 2a2ea349704fffade9526d5122299edbbfd122ca Mon Sep 17 00:00:00 2001 From: Vlad Buslov Date: Thu, 5 Jul 2018 17:24:27 +0300 Subject: net: sched: implement action API that deletes action by index Implement a new action API function that atomically finds and deletes an action from the idr by index. Intended to be used by lockless actions that do not rely on rtnl lock. Reviewed-by: Marcelo Ricardo Leitner Signed-off-by: Vlad Buslov Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- include/net/act_api.h | 1 + net/sched/act_api.c | 39 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 40 insertions(+) (limited to 'net/sched') diff --git a/include/net/act_api.h b/include/net/act_api.h index 27823f4e24c4..a8eaae67c264 100644 --- a/include/net/act_api.h +++ b/include/net/act_api.h @@ -153,6 +153,7 @@ int tcf_idr_create(struct tc_action_net *tn, u32 index, struct nlattr *est, int bind, bool cpustats); void tcf_idr_insert(struct tc_action_net *tn, struct tc_action *a); +int tcf_idr_delete_index(struct tc_action_net *tn, u32 index); int __tcf_idr_release(struct tc_action *a, bool bind, bool strict); static inline int tcf_idr_release(struct tc_action *a, bool bind) diff --git a/net/sched/act_api.c b/net/sched/act_api.c index aa304d36fee0..0f31f09946ab 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -319,6 +319,45 @@ bool tcf_idr_check(struct tc_action_net *tn, u32 index, struct tc_action **a, } EXPORT_SYMBOL(tcf_idr_check); +int tcf_idr_delete_index(struct tc_action_net *tn, u32 index) +{ + struct tcf_idrinfo *idrinfo = tn->idrinfo; + struct tc_action *p; + int ret = 0; + + spin_lock(&idrinfo->lock); + p = idr_find(&idrinfo->action_idr, index); + if (!p) { + spin_unlock(&idrinfo->lock); + return -ENOENT; + } + + if (!atomic_read(&p->tcfa_bindcnt)) { + if (refcount_dec_and_test(&p->tcfa_refcnt)) { + struct module *owner = p->ops->owner; + + WARN_ON(p != idr_remove(&idrinfo->action_idr, + p->tcfa_index)); + spin_unlock(&idrinfo->lock); + + if (p->ops->cleanup) + p->ops->cleanup(p); + + gen_kill_estimator(&p->tcfa_rate_est); + free_tcf(p); + module_put(owner); + return 0; + } + ret = 0; + } else { + ret = -EPERM; + } + + spin_unlock(&idrinfo->lock); + return ret; +} +EXPORT_SYMBOL(tcf_idr_delete_index); + int tcf_idr_create(struct tc_action_net *tn, u32 index, struct nlattr *est, struct tc_action **a, const struct tc_action_ops *ops, int bind, bool cpustats) -- cgit v1.2.3 From b409074e6693bcdaa7abbee2a035f22a9eabda53 Mon Sep 17 00:00:00 2001 From: Vlad Buslov Date: Thu, 5 Jul 2018 17:24:28 +0300 Subject: net: sched: add 'delete' function to action ops Extend action ops with a 'delete' function. Each action type implements its own delete function that doesn't depend on rtnl lock. The delete function is required so that actions can be deleted without holding rtnl lock. It uses the action API function that atomically deletes an action only if it is still in the action idr. This implementation prevents concurrent threads from deleting the same action twice. Reviewed-by: Marcelo Ricardo Leitner Signed-off-by: Vlad Buslov Signed-off-by: Jiri Pirko Signed-off-by: David S.
Miller --- include/net/act_api.h | 1 + net/sched/act_bpf.c | 8 ++++++++ net/sched/act_connmark.c | 8 ++++++++ net/sched/act_csum.c | 8 ++++++++ net/sched/act_gact.c | 8 ++++++++ net/sched/act_ife.c | 8 ++++++++ net/sched/act_ipt.c | 16 ++++++++++++++++ net/sched/act_mirred.c | 8 ++++++++ net/sched/act_nat.c | 8 ++++++++ net/sched/act_pedit.c | 8 ++++++++ net/sched/act_police.c | 8 ++++++++ net/sched/act_sample.c | 8 ++++++++ net/sched/act_simple.c | 8 ++++++++ net/sched/act_skbedit.c | 8 ++++++++ net/sched/act_skbmod.c | 8 ++++++++ net/sched/act_tunnel_key.c | 8 ++++++++ net/sched/act_vlan.c | 8 ++++++++ 17 files changed, 137 insertions(+) (limited to 'net/sched') diff --git a/include/net/act_api.h b/include/net/act_api.h index a8eaae67c264..b9ed2b8256a5 100644 --- a/include/net/act_api.h +++ b/include/net/act_api.h @@ -101,6 +101,7 @@ struct tc_action_ops { void (*stats_update)(struct tc_action *, u64, u32, u64); size_t (*get_fill_size)(const struct tc_action *act); struct net_device *(*get_dev)(const struct tc_action *a); + int (*delete)(struct net *net, u32 index); }; struct tc_action_net { diff --git a/net/sched/act_bpf.c b/net/sched/act_bpf.c index 8ebf40a3506c..7941dd66ff83 100644 --- a/net/sched/act_bpf.c +++ b/net/sched/act_bpf.c @@ -388,6 +388,13 @@ static int tcf_bpf_search(struct net *net, struct tc_action **a, u32 index, return tcf_idr_search(tn, a, index); } +static int tcf_bpf_delete(struct net *net, u32 index) +{ + struct tc_action_net *tn = net_generic(net, bpf_net_id); + + return tcf_idr_delete_index(tn, index); +} + static struct tc_action_ops act_bpf_ops __read_mostly = { .kind = "bpf", .type = TCA_ACT_BPF, @@ -398,6 +405,7 @@ static struct tc_action_ops act_bpf_ops __read_mostly = { .init = tcf_bpf_init, .walk = tcf_bpf_walker, .lookup = tcf_bpf_search, + .delete = tcf_bpf_delete, .size = sizeof(struct tcf_bpf), }; diff --git a/net/sched/act_connmark.c b/net/sched/act_connmark.c index e3787aa0025a..143c2d3de723 100644 --- a/net/sched/act_connmark.c +++ b/net/sched/act_connmark.c @@ -193,6 +193,13 @@ static int tcf_connmark_search(struct net *net, struct tc_action **a, u32 index, return tcf_idr_search(tn, a, index); } +static int tcf_connmark_delete(struct net *net, u32 index) +{ + struct tc_action_net *tn = net_generic(net, connmark_net_id); + + return tcf_idr_delete_index(tn, index); +} + static struct tc_action_ops act_connmark_ops = { .kind = "connmark", .type = TCA_ACT_CONNMARK, @@ -202,6 +209,7 @@ static struct tc_action_ops act_connmark_ops = { .init = tcf_connmark_init, .walk = tcf_connmark_walker, .lookup = tcf_connmark_search, + .delete = tcf_connmark_delete, .size = sizeof(struct tcf_connmark_info), }; diff --git a/net/sched/act_csum.c b/net/sched/act_csum.c index 334261943f9f..3768539340e0 100644 --- a/net/sched/act_csum.c +++ b/net/sched/act_csum.c @@ -654,6 +654,13 @@ static size_t tcf_csum_get_fill_size(const struct tc_action *act) return nla_total_size(sizeof(struct tc_csum)); } +static int tcf_csum_delete(struct net *net, u32 index) +{ + struct tc_action_net *tn = net_generic(net, csum_net_id); + + return tcf_idr_delete_index(tn, index); +} + static struct tc_action_ops act_csum_ops = { .kind = "csum", .type = TCA_ACT_CSUM, @@ -665,6 +672,7 @@ static struct tc_action_ops act_csum_ops = { .walk = tcf_csum_walker, .lookup = tcf_csum_search, .get_fill_size = tcf_csum_get_fill_size, + .delete = tcf_csum_delete, .size = sizeof(struct tcf_csum), }; diff --git a/net/sched/act_gact.c b/net/sched/act_gact.c index b4dfb2b4addc..a431a711f0dd 100644 --- 
a/net/sched/act_gact.c +++ b/net/sched/act_gact.c @@ -231,6 +231,13 @@ static size_t tcf_gact_get_fill_size(const struct tc_action *act) return sz; } +static int tcf_gact_delete(struct net *net, u32 index) +{ + struct tc_action_net *tn = net_generic(net, gact_net_id); + + return tcf_idr_delete_index(tn, index); +} + static struct tc_action_ops act_gact_ops = { .kind = "gact", .type = TCA_ACT_GACT, @@ -242,6 +249,7 @@ static struct tc_action_ops act_gact_ops = { .walk = tcf_gact_walker, .lookup = tcf_gact_search, .get_fill_size = tcf_gact_get_fill_size, + .delete = tcf_gact_delete, .size = sizeof(struct tcf_gact), }; diff --git a/net/sched/act_ife.c b/net/sched/act_ife.c index 576ffbba61c3..89a761395c94 100644 --- a/net/sched/act_ife.c +++ b/net/sched/act_ife.c @@ -844,6 +844,13 @@ static int tcf_ife_search(struct net *net, struct tc_action **a, u32 index, return tcf_idr_search(tn, a, index); } +static int tcf_ife_delete(struct net *net, u32 index) +{ + struct tc_action_net *tn = net_generic(net, ife_net_id); + + return tcf_idr_delete_index(tn, index); +} + static struct tc_action_ops act_ife_ops = { .kind = "ife", .type = TCA_ACT_IFE, @@ -854,6 +861,7 @@ static struct tc_action_ops act_ife_ops = { .init = tcf_ife_init, .walk = tcf_ife_walker, .lookup = tcf_ife_search, + .delete = tcf_ife_delete, .size = sizeof(struct tcf_ife_info), }; diff --git a/net/sched/act_ipt.c b/net/sched/act_ipt.c index 9c21663a86a6..6c234411c771 100644 --- a/net/sched/act_ipt.c +++ b/net/sched/act_ipt.c @@ -324,6 +324,13 @@ static int tcf_ipt_search(struct net *net, struct tc_action **a, u32 index, return tcf_idr_search(tn, a, index); } +static int tcf_ipt_delete(struct net *net, u32 index) +{ + struct tc_action_net *tn = net_generic(net, ipt_net_id); + + return tcf_idr_delete_index(tn, index); +} + static struct tc_action_ops act_ipt_ops = { .kind = "ipt", .type = TCA_ACT_IPT, @@ -334,6 +341,7 @@ static struct tc_action_ops act_ipt_ops = { .init = tcf_ipt_init, .walk = tcf_ipt_walker, .lookup = tcf_ipt_search, + .delete = tcf_ipt_delete, .size = sizeof(struct tcf_ipt), }; @@ -374,6 +382,13 @@ static int tcf_xt_search(struct net *net, struct tc_action **a, u32 index, return tcf_idr_search(tn, a, index); } +static int tcf_xt_delete(struct net *net, u32 index) +{ + struct tc_action_net *tn = net_generic(net, xt_net_id); + + return tcf_idr_delete_index(tn, index); +} + static struct tc_action_ops act_xt_ops = { .kind = "xt", .type = TCA_ACT_XT, @@ -384,6 +399,7 @@ static struct tc_action_ops act_xt_ops = { .init = tcf_xt_init, .walk = tcf_xt_walker, .lookup = tcf_xt_search, + .delete = tcf_xt_delete, .size = sizeof(struct tcf_ipt), }; diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c index 5434f08f2eb7..3d8300bce7e4 100644 --- a/net/sched/act_mirred.c +++ b/net/sched/act_mirred.c @@ -322,6 +322,13 @@ static struct net_device *tcf_mirred_get_dev(const struct tc_action *a) return rtnl_dereference(m->tcfm_dev); } +static int tcf_mirred_delete(struct net *net, u32 index) +{ + struct tc_action_net *tn = net_generic(net, mirred_net_id); + + return tcf_idr_delete_index(tn, index); +} + static struct tc_action_ops act_mirred_ops = { .kind = "mirred", .type = TCA_ACT_MIRRED, @@ -335,6 +342,7 @@ static struct tc_action_ops act_mirred_ops = { .lookup = tcf_mirred_search, .size = sizeof(struct tcf_mirred), .get_dev = tcf_mirred_get_dev, + .delete = tcf_mirred_delete, }; static __net_init int mirred_init_net(struct net *net) diff --git a/net/sched/act_nat.c b/net/sched/act_nat.c index e6487ad1e4a8..9eb27c89dc46 100644 
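/*
 * Illustrative aside: every hunk in this patch adds the same thin
 * wrapper, differing only in the pernet id. For a hypothetical action
 * module 'foo', the shape is:
 */
static int tcf_foo_delete(struct net *net, u32 index)
{
	struct tc_action_net *tn = net_generic(net, foo_net_id);

	return tcf_idr_delete_index(tn, index);
}
/* ...wired up as '.delete = tcf_foo_delete' in the module's tc_action_ops. */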
--- a/net/sched/act_nat.c +++ b/net/sched/act_nat.c @@ -294,6 +294,13 @@ static int tcf_nat_search(struct net *net, struct tc_action **a, u32 index, return tcf_idr_search(tn, a, index); } +static int tcf_nat_delete(struct net *net, u32 index) +{ + struct tc_action_net *tn = net_generic(net, nat_net_id); + + return tcf_idr_delete_index(tn, index); +} + static struct tc_action_ops act_nat_ops = { .kind = "nat", .type = TCA_ACT_NAT, @@ -303,6 +310,7 @@ static struct tc_action_ops act_nat_ops = { .init = tcf_nat_init, .walk = tcf_nat_walker, .lookup = tcf_nat_search, + .delete = tcf_nat_delete, .size = sizeof(struct tcf_nat), }; diff --git a/net/sched/act_pedit.c b/net/sched/act_pedit.c index f7965f35585b..45871052840f 100644 --- a/net/sched/act_pedit.c +++ b/net/sched/act_pedit.c @@ -454,6 +454,13 @@ static int tcf_pedit_search(struct net *net, struct tc_action **a, u32 index, return tcf_idr_search(tn, a, index); } +static int tcf_pedit_delete(struct net *net, u32 index) +{ + struct tc_action_net *tn = net_generic(net, pedit_net_id); + + return tcf_idr_delete_index(tn, index); +} + static struct tc_action_ops act_pedit_ops = { .kind = "pedit", .type = TCA_ACT_PEDIT, @@ -464,6 +471,7 @@ static struct tc_action_ops act_pedit_ops = { .init = tcf_pedit_init, .walk = tcf_pedit_walker, .lookup = tcf_pedit_search, + .delete = tcf_pedit_delete, .size = sizeof(struct tcf_pedit), }; diff --git a/net/sched/act_police.c b/net/sched/act_police.c index 0e1c2fb0ebea..c955fb0d4f3f 100644 --- a/net/sched/act_police.c +++ b/net/sched/act_police.c @@ -314,6 +314,13 @@ static int tcf_police_search(struct net *net, struct tc_action **a, u32 index, return tcf_idr_search(tn, a, index); } +static int tcf_police_delete(struct net *net, u32 index) +{ + struct tc_action_net *tn = net_generic(net, police_net_id); + + return tcf_idr_delete_index(tn, index); +} + MODULE_AUTHOR("Alexey Kuznetsov"); MODULE_DESCRIPTION("Policing actions"); MODULE_LICENSE("GPL"); @@ -327,6 +334,7 @@ static struct tc_action_ops act_police_ops = { .init = tcf_act_police_init, .walk = tcf_act_police_walker, .lookup = tcf_police_search, + .delete = tcf_police_delete, .size = sizeof(struct tcf_police), }; diff --git a/net/sched/act_sample.c b/net/sched/act_sample.c index 316fc645595d..6f79d2afcba2 100644 --- a/net/sched/act_sample.c +++ b/net/sched/act_sample.c @@ -220,6 +220,13 @@ static int tcf_sample_search(struct net *net, struct tc_action **a, u32 index, return tcf_idr_search(tn, a, index); } +static int tcf_sample_delete(struct net *net, u32 index) +{ + struct tc_action_net *tn = net_generic(net, sample_net_id); + + return tcf_idr_delete_index(tn, index); +} + static struct tc_action_ops act_sample_ops = { .kind = "sample", .type = TCA_ACT_SAMPLE, @@ -230,6 +237,7 @@ static struct tc_action_ops act_sample_ops = { .cleanup = tcf_sample_cleanup, .walk = tcf_sample_walker, .lookup = tcf_sample_search, + .delete = tcf_sample_delete, .size = sizeof(struct tcf_sample), }; diff --git a/net/sched/act_simple.c b/net/sched/act_simple.c index dc591cc87f4a..446c750f3d3c 100644 --- a/net/sched/act_simple.c +++ b/net/sched/act_simple.c @@ -184,6 +184,13 @@ static int tcf_simp_search(struct net *net, struct tc_action **a, u32 index, return tcf_idr_search(tn, a, index); } +static int tcf_simp_delete(struct net *net, u32 index) +{ + struct tc_action_net *tn = net_generic(net, simp_net_id); + + return tcf_idr_delete_index(tn, index); +} + static struct tc_action_ops act_simp_ops = { .kind = "simple", .type = TCA_ACT_SIMP, @@ -194,6 +201,7 @@ static struct 
tc_action_ops act_simp_ops = { .init = tcf_simp_init, .walk = tcf_simp_walker, .lookup = tcf_simp_search, + .delete = tcf_simp_delete, .size = sizeof(struct tcf_defact), }; diff --git a/net/sched/act_skbedit.c b/net/sched/act_skbedit.c index c4ae4bd830aa..b3eaa120c7f4 100644 --- a/net/sched/act_skbedit.c +++ b/net/sched/act_skbedit.c @@ -267,6 +267,13 @@ static int tcf_skbedit_search(struct net *net, struct tc_action **a, u32 index, return tcf_idr_search(tn, a, index); } +static int tcf_skbedit_delete(struct net *net, u32 index) +{ + struct tc_action_net *tn = net_generic(net, skbedit_net_id); + + return tcf_idr_delete_index(tn, index); +} + static struct tc_action_ops act_skbedit_ops = { .kind = "skbedit", .type = TCA_ACT_SKBEDIT, @@ -276,6 +283,7 @@ static struct tc_action_ops act_skbedit_ops = { .init = tcf_skbedit_init, .walk = tcf_skbedit_walker, .lookup = tcf_skbedit_search, + .delete = tcf_skbedit_delete, .size = sizeof(struct tcf_skbedit), }; diff --git a/net/sched/act_skbmod.c b/net/sched/act_skbmod.c index 026d6f58eda1..30be3f767495 100644 --- a/net/sched/act_skbmod.c +++ b/net/sched/act_skbmod.c @@ -253,6 +253,13 @@ static int tcf_skbmod_search(struct net *net, struct tc_action **a, u32 index, return tcf_idr_search(tn, a, index); } +static int tcf_skbmod_delete(struct net *net, u32 index) +{ + struct tc_action_net *tn = net_generic(net, skbmod_net_id); + + return tcf_idr_delete_index(tn, index); +} + static struct tc_action_ops act_skbmod_ops = { .kind = "skbmod", .type = TCA_ACT_SKBMOD, @@ -263,6 +270,7 @@ static struct tc_action_ops act_skbmod_ops = { .cleanup = tcf_skbmod_cleanup, .walk = tcf_skbmod_walker, .lookup = tcf_skbmod_search, + .delete = tcf_skbmod_delete, .size = sizeof(struct tcf_skbmod), }; diff --git a/net/sched/act_tunnel_key.c b/net/sched/act_tunnel_key.c index 15ea5ce0f9ed..655ed0b3fc67 100644 --- a/net/sched/act_tunnel_key.c +++ b/net/sched/act_tunnel_key.c @@ -534,6 +534,13 @@ static int tunnel_key_search(struct net *net, struct tc_action **a, u32 index, return tcf_idr_search(tn, a, index); } +static int tunnel_key_delete(struct net *net, u32 index) +{ + struct tc_action_net *tn = net_generic(net, tunnel_key_net_id); + + return tcf_idr_delete_index(tn, index); +} + static struct tc_action_ops act_tunnel_key_ops = { .kind = "tunnel_key", .type = TCA_ACT_TUNNEL_KEY, @@ -544,6 +551,7 @@ static struct tc_action_ops act_tunnel_key_ops = { .cleanup = tunnel_key_release, .walk = tunnel_key_walker, .lookup = tunnel_key_search, + .delete = tunnel_key_delete, .size = sizeof(struct tcf_tunnel_key), }; diff --git a/net/sched/act_vlan.c b/net/sched/act_vlan.c index c61775250722..e334d2751784 100644 --- a/net/sched/act_vlan.c +++ b/net/sched/act_vlan.c @@ -287,6 +287,13 @@ static int tcf_vlan_search(struct net *net, struct tc_action **a, u32 index, return tcf_idr_search(tn, a, index); } +static int tcf_vlan_delete(struct net *net, u32 index) +{ + struct tc_action_net *tn = net_generic(net, vlan_net_id); + + return tcf_idr_delete_index(tn, index); +} + static struct tc_action_ops act_vlan_ops = { .kind = "vlan", .type = TCA_ACT_VLAN, @@ -297,6 +304,7 @@ static struct tc_action_ops act_vlan_ops = { .cleanup = tcf_vlan_cleanup, .walk = tcf_vlan_walker, .lookup = tcf_vlan_search, + .delete = tcf_vlan_delete, .size = sizeof(struct tcf_vlan), }; -- cgit v1.2.3 From 16af6067392c40e454e49eec834843ab03643d96 Mon Sep 17 00:00:00 2001 From: Vlad Buslov Date: Thu, 5 Jul 2018 17:24:29 +0300 Subject: net: sched: implement reference counted action release Implement helper delete 
function that uses new action ops 'delete', instead of destroying action directly. This is required so act API could delete actions by index, without holding any references to action that is being deleted. Implement function __tcf_action_put() that releases reference to action and frees it, if necessary. Refactor action deletion code to use new put function and not to rely on rtnl lock. Remove rtnl lock assertions that are no longer needed. Reviewed-by: Marcelo Ricardo Leitner Signed-off-by: Vlad Buslov Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- net/sched/act_api.c | 84 +++++++++++++++++++++++++++++++++++++++-------------- net/sched/cls_api.c | 1 - 2 files changed, 62 insertions(+), 23 deletions(-) (limited to 'net/sched') diff --git a/net/sched/act_api.c b/net/sched/act_api.c index 0f31f09946ab..a023873db713 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -90,21 +90,39 @@ static void free_tcf(struct tc_action *p) kfree(p); } -static void tcf_idr_remove(struct tcf_idrinfo *idrinfo, struct tc_action *p) +static void tcf_action_cleanup(struct tc_action *p) { - spin_lock(&idrinfo->lock); - idr_remove(&idrinfo->action_idr, p->tcfa_index); - spin_unlock(&idrinfo->lock); + if (p->ops->cleanup) + p->ops->cleanup(p); + gen_kill_estimator(&p->tcfa_rate_est); free_tcf(p); } +static int __tcf_action_put(struct tc_action *p, bool bind) +{ + struct tcf_idrinfo *idrinfo = p->idrinfo; + + if (refcount_dec_and_lock(&p->tcfa_refcnt, &idrinfo->lock)) { + if (bind) + atomic_dec(&p->tcfa_bindcnt); + idr_remove(&idrinfo->action_idr, p->tcfa_index); + spin_unlock(&idrinfo->lock); + + tcf_action_cleanup(p); + return 1; + } + + if (bind) + atomic_dec(&p->tcfa_bindcnt); + + return 0; +} + int __tcf_idr_release(struct tc_action *p, bool bind, bool strict) { int ret = 0; - ASSERT_RTNL(); - /* Release with strict==1 and bind==0 is only called through act API * interface (classifiers always bind). Only case when action with * positive reference count and zero bind count can exist is when it was @@ -118,18 +136,11 @@ int __tcf_idr_release(struct tc_action *p, bool bind, bool strict) * are acceptable. 
*/ if (p) { - if (bind) - atomic_dec(&p->tcfa_bindcnt); - else if (strict && atomic_read(&p->tcfa_bindcnt) > 0) + if (!bind && strict && atomic_read(&p->tcfa_bindcnt) > 0) return -EPERM; - if (atomic_read(&p->tcfa_bindcnt) <= 0 && - refcount_dec_and_test(&p->tcfa_refcnt)) { - if (p->ops->cleanup) - p->ops->cleanup(p); - tcf_idr_remove(p->idrinfo, p); + if (__tcf_action_put(p, bind)) ret = ACT_P_DELETED; - } } return ret; @@ -340,11 +351,7 @@ int tcf_idr_delete_index(struct tc_action_net *tn, u32 index) p->tcfa_index)); spin_unlock(&idrinfo->lock); - if (p->ops->cleanup) - p->ops->cleanup(p); - - gen_kill_estimator(&p->tcfa_rate_est); - free_tcf(p); + tcf_action_cleanup(p); module_put(owner); return 0; } @@ -615,6 +622,11 @@ int tcf_action_destroy(struct list_head *actions, int bind) return ret; } +static int tcf_action_put(struct tc_action *p) +{ + return __tcf_action_put(p, false); +} + int tcf_action_dump_old(struct sk_buff *skb, struct tc_action *a, int bind, int ref) { @@ -1092,6 +1104,35 @@ err_out: return err; } +static int tcf_action_delete(struct net *net, struct list_head *actions, + struct netlink_ext_ack *extack) +{ + struct tc_action *a, *tmp; + u32 act_index; + int ret; + + list_for_each_entry_safe(a, tmp, actions, list) { + const struct tc_action_ops *ops = a->ops; + + /* Actions can be deleted concurrently so we must save their + * type and id to search again after reference is released. + */ + act_index = a->tcfa_index; + + list_del(&a->list); + if (tcf_action_put(a)) { + /* last reference, action was deleted concurrently */ + module_put(ops->owner); + } else { + /* now do the delete */ + ret = ops->delete(net, act_index); + if (ret < 0) + return ret; + } + } + return 0; +} + static int tcf_del_notify(struct net *net, struct nlmsghdr *n, struct list_head *actions, u32 portid, size_t attr_size, struct netlink_ext_ack *extack) @@ -1112,7 +1153,7 @@ tcf_del_notify(struct net *net, struct nlmsghdr *n, struct list_head *actions, } /* now do the delete */ - ret = tcf_action_destroy(actions, 0); + ret = tcf_action_delete(net, actions, extack); if (ret < 0) { NL_SET_ERR_MSG(extack, "Failed to delete TC action"); kfree_skb(skb); @@ -1164,7 +1205,6 @@ tca_action_gd(struct net *net, struct nlattr *nla, struct nlmsghdr *n, if (event == RTM_GETACTION) ret = tcf_get_notify(net, portid, n, &actions, event, extack); else { /* delete */ - cleanup_a(&actions, 1); /* lookup took reference */ ret = tcf_del_notify(net, n, &actions, portid, attr_size, extack); if (ret) goto err; return ret; } err: diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index ebc2b9dd783f..9041f0e43e9a 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -1611,7 +1611,6 @@ void tcf_exts_destroy(struct tcf_exts *exts) #ifdef CONFIG_NET_CLS_ACT LIST_HEAD(actions); - ASSERT_RTNL(); tcf_exts_to_list(exts, &actions); tcf_action_destroy(&actions, TCA_ACT_UNBIND); kfree(exts->actions); -- cgit v1.2.3 From 4e8ddd7f1758ca4ddd0c1f7cf3e66fce736241d2 Mon Sep 17 00:00:00 2001 From: Vlad Buslov Date: Thu, 5 Jul 2018 17:24:30 +0300 Subject: net: sched: don't release reference on action overwrite Return from action init function with reference to action taken, even when overwriting existing action. Action init API initializes its fourth argument (pointer to pointer to tc action) to either existing action with same index or newly created action. In case of an existing index (and a bind argument of zero), the init function returns without incrementing the action reference counter; the sketch below illustrates the resulting hazard.
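A minimal sketch of that hazard, using hypothetical caller code (example_overwrite() and its locals are illustrative, not part of the patch):

static int example_overwrite(struct net *net, const struct tc_action_ops *a_o,
			     struct nlattr *nla, struct nlattr *est,
			     struct netlink_ext_ack *extack)
{
	struct tc_action *a = NULL;
	int err;

	/* ovr == 1, bind == 0: before this patch, an existing action was
	 * returned in 'a' without its tcfa_refcnt being incremented */
	err = a_o->init(net, nla, est, &a, 1, 0, true, extack);
	if (err < 0)
		return err;

	/* window: a concurrent delete can now drop the last reference and
	 * free the action while this caller still holds the bare pointer */
	return a->tcfa_index;	/* potential use-after-free before the fix */
}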
Caller of action init then proceeds working with action, without actually holding reference to it. This means that action could be deleted concurrently. Change action init behavior to always take reference to action before returning successfully, in order to protect from concurrent deletion. Reviewed-by: Marcelo Ricardo Leitner Signed-off-by: Vlad Buslov Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- net/sched/act_api.c | 2 -- net/sched/act_bpf.c | 8 ++++---- net/sched/act_connmark.c | 5 +++-- net/sched/act_csum.c | 8 ++++---- net/sched/act_gact.c | 5 +++-- net/sched/act_ife.c | 10 +++++----- net/sched/act_ipt.c | 5 +++-- net/sched/act_mirred.c | 5 ++--- net/sched/act_nat.c | 5 +++-- net/sched/act_pedit.c | 2 +- net/sched/act_police.c | 8 +++----- net/sched/act_sample.c | 8 +++----- net/sched/act_simple.c | 5 +++-- net/sched/act_skbedit.c | 5 +++-- net/sched/act_skbmod.c | 8 +++----- net/sched/act_tunnel_key.c | 11 ++++------- net/sched/act_vlan.c | 8 +++----- 17 files changed, 50 insertions(+), 58 deletions(-) (limited to 'net/sched') diff --git a/net/sched/act_api.c b/net/sched/act_api.c index a023873db713..f019f0464cec 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -870,8 +870,6 @@ int tcf_action_init(struct net *net, struct tcf_proto *tp, struct nlattr *nla, } act->order = i; sz += tcf_action_fill_size(act); - if (ovr) - refcount_inc(&act->tcfa_refcnt); list_add_tail(&act->list, actions); } diff --git a/net/sched/act_bpf.c b/net/sched/act_bpf.c index 7941dd66ff83..d3f4ac6f2c4b 100644 --- a/net/sched/act_bpf.c +++ b/net/sched/act_bpf.c @@ -311,9 +311,10 @@ static int tcf_bpf_init(struct net *net, struct nlattr *nla, if (bind) return 0; - tcf_idr_release(*act, bind); - if (!replace) + if (!replace) { + tcf_idr_release(*act, bind); return -EEXIST; + } } is_bpf = tb[TCA_ACT_BPF_OPS_LEN] && tb[TCA_ACT_BPF_OPS]; @@ -356,8 +357,7 @@ static int tcf_bpf_init(struct net *net, struct nlattr *nla, return res; out: - if (res == ACT_P_CREATED) - tcf_idr_release(*act, bind); + tcf_idr_release(*act, bind); return ret; } diff --git a/net/sched/act_connmark.c b/net/sched/act_connmark.c index 143c2d3de723..701e90244eff 100644 --- a/net/sched/act_connmark.c +++ b/net/sched/act_connmark.c @@ -135,9 +135,10 @@ static int tcf_connmark_init(struct net *net, struct nlattr *nla, ci = to_connmark(*a); if (bind) return 0; - tcf_idr_release(*a, bind); - if (!ovr) + if (!ovr) { + tcf_idr_release(*a, bind); return -EEXIST; + } /* replacing action and zone */ ci->tcf_action = parm->action; ci->zone = parm->zone; diff --git a/net/sched/act_csum.c b/net/sched/act_csum.c index 3768539340e0..5dbee136b0a1 100644 --- a/net/sched/act_csum.c +++ b/net/sched/act_csum.c @@ -76,9 +76,10 @@ static int tcf_csum_init(struct net *net, struct nlattr *nla, } else { if (bind)/* dont override defaults */ return 0; - tcf_idr_release(*a, bind); - if (!ovr) + if (!ovr) { + tcf_idr_release(*a, bind); return -EEXIST; + } } p = to_tcf_csum(*a); @@ -86,8 +87,7 @@ static int tcf_csum_init(struct net *net, struct nlattr *nla, params_new = kzalloc(sizeof(*params_new), GFP_KERNEL); if (unlikely(!params_new)) { - if (ret == ACT_P_CREATED) - tcf_idr_release(*a, bind); + tcf_idr_release(*a, bind); return -ENOMEM; } params_old = rtnl_dereference(p->params); diff --git a/net/sched/act_gact.c b/net/sched/act_gact.c index a431a711f0dd..11c4de3f344e 100644 --- a/net/sched/act_gact.c +++ b/net/sched/act_gact.c @@ -100,9 +100,10 @@ static int tcf_gact_init(struct net *net, struct nlattr *nla, } else { if (bind)/* dont override 
defaults */ return 0; - tcf_idr_release(*a, bind); - if (!ovr) + if (!ovr) { + tcf_idr_release(*a, bind); return -EEXIST; + } } gact = to_gact(*a); diff --git a/net/sched/act_ife.c b/net/sched/act_ife.c index 89a761395c94..acea3feae762 100644 --- a/net/sched/act_ife.c +++ b/net/sched/act_ife.c @@ -498,12 +498,10 @@ static int tcf_ife_init(struct net *net, struct nlattr *nla, return ret; } ret = ACT_P_CREATED; - } else { + } else if (!ovr) { tcf_idr_release(*a, bind); - if (!ovr) { - kfree(p); - return -EEXIST; - } + kfree(p); + return -EEXIST; } ife = to_ife(*a); @@ -548,6 +546,8 @@ metadata_parse_err: if (exists) spin_unlock_bh(&ife->tcf_lock); + tcf_idr_release(*a, bind); + kfree(p); return err; } diff --git a/net/sched/act_ipt.c b/net/sched/act_ipt.c index 6c234411c771..85e85dfba401 100644 --- a/net/sched/act_ipt.c +++ b/net/sched/act_ipt.c @@ -145,10 +145,11 @@ static int __tcf_ipt_init(struct net *net, unsigned int id, struct nlattr *nla, } else { if (bind)/* dont override defaults */ return 0; - tcf_idr_release(*a, bind); - if (!ovr) + if (!ovr) { + tcf_idr_release(*a, bind); return -EEXIST; + } } hook = nla_get_u32(tb[TCA_IPT_HOOK]); diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c index 3d8300bce7e4..e08aed06d7f8 100644 --- a/net/sched/act_mirred.c +++ b/net/sched/act_mirred.c @@ -132,10 +132,9 @@ static int tcf_mirred_init(struct net *net, struct nlattr *nla, if (ret) return ret; ret = ACT_P_CREATED; - } else { + } else if (!ovr) { tcf_idr_release(*a, bind); - if (!ovr) - return -EEXIST; + return -EEXIST; } m = to_mirred(*a); diff --git a/net/sched/act_nat.c b/net/sched/act_nat.c index 9eb27c89dc46..1f91e8e66c0f 100644 --- a/net/sched/act_nat.c +++ b/net/sched/act_nat.c @@ -66,9 +66,10 @@ static int tcf_nat_init(struct net *net, struct nlattr *nla, struct nlattr *est, } else { if (bind) return 0; - tcf_idr_release(*a, bind); - if (!ovr) + if (!ovr) { + tcf_idr_release(*a, bind); return -EEXIST; + } } p = to_tcf_nat(*a); diff --git a/net/sched/act_pedit.c b/net/sched/act_pedit.c index 45871052840f..3a0e2f762f4e 100644 --- a/net/sched/act_pedit.c +++ b/net/sched/act_pedit.c @@ -194,8 +194,8 @@ static int tcf_pedit_init(struct net *net, struct nlattr *nla, } else { if (bind) goto out_free; - tcf_idr_release(*a, bind); if (!ovr) { + tcf_idr_release(*a, bind); ret = -EEXIST; goto out_free; } diff --git a/net/sched/act_police.c b/net/sched/act_police.c index c955fb0d4f3f..99335cca739e 100644 --- a/net/sched/act_police.c +++ b/net/sched/act_police.c @@ -111,10 +111,9 @@ static int tcf_act_police_init(struct net *net, struct nlattr *nla, if (ret) return ret; ret = ACT_P_CREATED; - } else { + } else if (!ovr) { tcf_idr_release(*a, bind); - if (!ovr) - return -EEXIST; + return -EEXIST; } police = to_police(*a); @@ -195,8 +194,7 @@ static int tcf_act_police_init(struct net *net, struct nlattr *nla, failure: qdisc_put_rtab(P_tab); qdisc_put_rtab(R_tab); - if (ret == ACT_P_CREATED) - tcf_idr_release(*a, bind); + tcf_idr_release(*a, bind); return err; } diff --git a/net/sched/act_sample.c b/net/sched/act_sample.c index 6f79d2afcba2..a8582e1347db 100644 --- a/net/sched/act_sample.c +++ b/net/sched/act_sample.c @@ -69,10 +69,9 @@ static int tcf_sample_init(struct net *net, struct nlattr *nla, if (ret) return ret; ret = ACT_P_CREATED; - } else { + } else if (!ovr) { tcf_idr_release(*a, bind); - if (!ovr) - return -EEXIST; + return -EEXIST; } s = to_sample(*a); @@ -81,8 +80,7 @@ static int tcf_sample_init(struct net *net, struct nlattr *nla, s->psample_group_num = 
nla_get_u32(tb[TCA_SAMPLE_PSAMPLE_GROUP]); psample_group = psample_group_get(net, s->psample_group_num); if (!psample_group) { - if (ret == ACT_P_CREATED) - tcf_idr_release(*a, bind); + tcf_idr_release(*a, bind); return -ENOMEM; } RCU_INIT_POINTER(s->psample_group, psample_group); diff --git a/net/sched/act_simple.c b/net/sched/act_simple.c index 446c750f3d3c..2da47c682a30 100644 --- a/net/sched/act_simple.c +++ b/net/sched/act_simple.c @@ -127,9 +127,10 @@ static int tcf_simp_init(struct net *net, struct nlattr *nla, } else { d = to_defact(*a); - tcf_idr_release(*a, bind); - if (!ovr) + if (!ovr) { + tcf_idr_release(*a, bind); return -EEXIST; + } reset_policy(d, tb[TCA_DEF_DATA], parm); } diff --git a/net/sched/act_skbedit.c b/net/sched/act_skbedit.c index b3eaa120c7f4..4616a2c1821f 100644 --- a/net/sched/act_skbedit.c +++ b/net/sched/act_skbedit.c @@ -172,9 +172,10 @@ static int tcf_skbedit_init(struct net *net, struct nlattr *nla, ret = ACT_P_CREATED; } else { d = to_skbedit(*a); - tcf_idr_release(*a, bind); - if (!ovr) + if (!ovr) { + tcf_idr_release(*a, bind); return -EEXIST; + } } spin_lock_bh(&d->tcf_lock); diff --git a/net/sched/act_skbmod.c b/net/sched/act_skbmod.c index 30be3f767495..e844381af066 100644 --- a/net/sched/act_skbmod.c +++ b/net/sched/act_skbmod.c @@ -145,10 +145,9 @@ static int tcf_skbmod_init(struct net *net, struct nlattr *nla, return ret; ret = ACT_P_CREATED; - } else { + } else if (!ovr) { tcf_idr_release(*a, bind); - if (!ovr) - return -EEXIST; + return -EEXIST; } d = to_skbmod(*a); @@ -156,8 +155,7 @@ static int tcf_skbmod_init(struct net *net, struct nlattr *nla, ASSERT_RTNL(); p = kzalloc(sizeof(struct tcf_skbmod_params), GFP_KERNEL); if (unlikely(!p)) { - if (ret == ACT_P_CREATED) - tcf_idr_release(*a, bind); + tcf_idr_release(*a, bind); return -ENOMEM; } diff --git a/net/sched/act_tunnel_key.c b/net/sched/act_tunnel_key.c index 655ed0b3fc67..ab5bf5c13f87 100644 --- a/net/sched/act_tunnel_key.c +++ b/net/sched/act_tunnel_key.c @@ -329,12 +329,10 @@ static int tunnel_key_init(struct net *net, struct nlattr *nla, } ret = ACT_P_CREATED; - } else { + } else if (!ovr) { tcf_idr_release(*a, bind); - if (!ovr) { - NL_SET_ERR_MSG(extack, "TC IDR already exists"); - return -EEXIST; - } + NL_SET_ERR_MSG(extack, "TC IDR already exists"); + return -EEXIST; } t = to_tunnel_key(*a); @@ -342,8 +340,7 @@ static int tunnel_key_init(struct net *net, struct nlattr *nla, ASSERT_RTNL(); params_new = kzalloc(sizeof(*params_new), GFP_KERNEL); if (unlikely(!params_new)) { - if (ret == ACT_P_CREATED) - tcf_idr_release(*a, bind); + tcf_idr_release(*a, bind); NL_SET_ERR_MSG(extack, "Cannot allocate tunnel key parameters"); return -ENOMEM; } diff --git a/net/sched/act_vlan.c b/net/sched/act_vlan.c index e334d2751784..9b600faaccbb 100644 --- a/net/sched/act_vlan.c +++ b/net/sched/act_vlan.c @@ -187,10 +187,9 @@ static int tcf_vlan_init(struct net *net, struct nlattr *nla, return ret; ret = ACT_P_CREATED; - } else { + } else if (!ovr) { tcf_idr_release(*a, bind); - if (!ovr) - return -EEXIST; + return -EEXIST; } v = to_vlan(*a); @@ -198,8 +197,7 @@ static int tcf_vlan_init(struct net *net, struct nlattr *nla, ASSERT_RTNL(); p = kzalloc(sizeof(*p), GFP_KERNEL); if (!p) { - if (ret == ACT_P_CREATED) - tcf_idr_release(*a, bind); + tcf_idr_release(*a, bind); return -ENOMEM; } -- cgit v1.2.3 From cae422f379f37fe9105d2a113259788f989e7df5 Mon Sep 17 00:00:00 2001 From: Vlad Buslov Date: Thu, 5 Jul 2018 17:24:31 +0300 Subject: net: sched: use reference counting action init Change action API to 
assume that action init function always takes reference to action, even when overwriting existing action. This is necessary because action API continues to use action pointer after init function is done. At this point action becomes accessible for concurrent modifications, so user must always hold reference to it. Implement helper put list function to atomically release list of actions after action API init code is done using them. Reviewed-by: Marcelo Ricardo Leitner Signed-off-by: Vlad Buslov Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- net/sched/act_api.c | 35 +++++++++++++++++------------------ 1 file changed, 17 insertions(+), 18 deletions(-) (limited to 'net/sched') diff --git a/net/sched/act_api.c b/net/sched/act_api.c index f019f0464cec..eefe8c2fe667 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -627,6 +627,18 @@ static int tcf_action_put(struct tc_action *p) return __tcf_action_put(p, false); } +static void tcf_action_put_lst(struct list_head *actions) +{ + struct tc_action *a, *tmp; + + list_for_each_entry_safe(a, tmp, actions, list) { + const struct tc_action_ops *ops = a->ops; + + if (tcf_action_put(a)) + module_put(ops->owner); + } +} + int tcf_action_dump_old(struct sk_buff *skb, struct tc_action *a, int bind, int ref) { @@ -835,17 +847,6 @@ err_out: return ERR_PTR(err); } -static void cleanup_a(struct list_head *actions, int ovr) -{ - struct tc_action *a; - - if (!ovr) - return; - - list_for_each_entry(a, actions, list) - refcount_dec(&a->tcfa_refcnt); -} - int tcf_action_init(struct net *net, struct tcf_proto *tp, struct nlattr *nla, struct nlattr *est, char *name, int ovr, int bind, struct list_head *actions, size_t *attr_size, @@ -874,11 +875,6 @@ int tcf_action_init(struct net *net, struct tcf_proto *tp, struct nlattr *nla, } *attr_size = tcf_action_full_attrs_size(sz); - - /* Remove the temp refcnt which was necessary to protect against - * destroying an existing action which was being replaced - */ - cleanup_a(actions, ovr); return 0; err: @@ -1209,7 +1205,7 @@ tca_action_gd(struct net *net, struct nlattr *nla, struct nlmsghdr *n, return ret; } err: - tcf_action_destroy(&actions, 0); + tcf_action_put_lst(&actions); return ret; } @@ -1251,8 +1247,11 @@ static int tcf_action_add(struct net *net, struct nlattr *nla, &attr_size, true, extack); if (ret) return ret; + ret = tcf_add_notify(net, n, &actions, portid, attr_size, extack); + if (ovr) + tcf_action_put_lst(&actions); - return tcf_add_notify(net, n, &actions, portid, attr_size, extack); + return ret; } static u32 tcaa_root_flags_allowed = TCA_FLAG_LARGE_DUMP_ON; -- cgit v1.2.3 From 0190c1d452a91c38a3462abdd81752be1b9006a8 Mon Sep 17 00:00:00 2001 From: Vlad Buslov Date: Thu, 5 Jul 2018 17:24:32 +0300 Subject: net: sched: atomically check-allocate action Implement function that atomically checks if action exists and either takes reference to it, or allocates idr slot for action index to prevent concurrent allocations of actions with same index. Use EBUSY error pointer to indicate that idr slot is reserved. Implement cleanup helper function that removes temporary error pointer from idr. (in case of error between idr allocation and insertion of newly created action to specified index) Refactor all action init functions to insert new action to idr using this API. Reviewed-by: Marcelo Ricardo Leitner Signed-off-by: Vlad Buslov Signed-off-by: Jiri Pirko Signed-off-by: David S. 
Miller --- include/net/act_api.h | 3 ++ net/sched/act_api.c | 92 ++++++++++++++++++++++++++++++++++++---------- net/sched/act_bpf.c | 11 ++++-- net/sched/act_connmark.c | 10 +++-- net/sched/act_csum.c | 11 ++++-- net/sched/act_gact.c | 11 ++++-- net/sched/act_ife.c | 6 ++- net/sched/act_ipt.c | 13 ++++++- net/sched/act_mirred.c | 16 ++++++-- net/sched/act_nat.c | 11 ++++-- net/sched/act_pedit.c | 12 ++++-- net/sched/act_police.c | 9 ++++- net/sched/act_sample.c | 11 ++++-- net/sched/act_simple.c | 11 +++++- net/sched/act_skbedit.c | 11 +++++- net/sched/act_skbmod.c | 11 +++++- net/sched/act_tunnel_key.c | 9 ++++- net/sched/act_vlan.c | 17 ++++++++- 18 files changed, 216 insertions(+), 59 deletions(-) (limited to 'net/sched') diff --git a/include/net/act_api.h b/include/net/act_api.h index b9ed2b8256a5..8090de2edab7 100644 --- a/include/net/act_api.h +++ b/include/net/act_api.h @@ -154,6 +154,9 @@ int tcf_idr_create(struct tc_action_net *tn, u32 index, struct nlattr *est, int bind, bool cpustats); void tcf_idr_insert(struct tc_action_net *tn, struct tc_action *a); +void tcf_idr_cleanup(struct tc_action_net *tn, u32 index); +int tcf_idr_check_alloc(struct tc_action_net *tn, u32 *index, + struct tc_action **a, int bind); int tcf_idr_delete_index(struct tc_action_net *tn, u32 index); int __tcf_idr_release(struct tc_action *a, bool bind, bool strict); diff --git a/net/sched/act_api.c b/net/sched/act_api.c index eefe8c2fe667..9511502e1cbb 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -303,7 +303,9 @@ static bool __tcf_idr_check(struct tc_action_net *tn, u32 index, spin_lock(&idrinfo->lock); p = idr_find(&idrinfo->action_idr, index); - if (p) { + if (IS_ERR(p)) { + p = NULL; + } else if (p) { refcount_inc(&p->tcfa_refcnt); if (bind) atomic_inc(&p->tcfa_bindcnt); @@ -371,7 +373,6 @@ int tcf_idr_create(struct tc_action_net *tn, u32 index, struct nlattr *est, { struct tc_action *p = kzalloc(ops->size, GFP_KERNEL); struct tcf_idrinfo *idrinfo = tn->idrinfo; - struct idr *idr = &idrinfo->action_idr; int err = -ENOMEM; if (unlikely(!p)) @@ -389,20 +390,6 @@ int tcf_idr_create(struct tc_action_net *tn, u32 index, struct nlattr *est, goto err2; } spin_lock_init(&p->tcfa_lock); - idr_preload(GFP_KERNEL); - spin_lock(&idrinfo->lock); - /* user doesn't specify an index */ - if (!index) { - index = 1; - err = idr_alloc_u32(idr, NULL, &index, UINT_MAX, GFP_ATOMIC); - } else { - err = idr_alloc_u32(idr, NULL, &index, index, GFP_ATOMIC); - } - spin_unlock(&idrinfo->lock); - idr_preload_end(); - if (err) - goto err3; - p->tcfa_index = index; p->tcfa_tm.install = jiffies; p->tcfa_tm.lastuse = jiffies; @@ -412,7 +399,7 @@ int tcf_idr_create(struct tc_action_net *tn, u32 index, struct nlattr *est, &p->tcfa_rate_est, &p->tcfa_lock, NULL, est); if (err) - goto err4; + goto err3; } p->idrinfo = idrinfo; @@ -420,8 +407,6 @@ int tcf_idr_create(struct tc_action_net *tn, u32 index, struct nlattr *est, INIT_LIST_HEAD(&p->list); *a = p; return 0; -err4: - idr_remove(idr, index); err3: free_percpu(p->cpu_qstats); err2: @@ -437,11 +422,78 @@ void tcf_idr_insert(struct tc_action_net *tn, struct tc_action *a) struct tcf_idrinfo *idrinfo = tn->idrinfo; spin_lock(&idrinfo->lock); - idr_replace(&idrinfo->action_idr, a, a->tcfa_index); + /* Replace ERR_PTR(-EBUSY) allocated by tcf_idr_check_alloc */ + WARN_ON(!IS_ERR(idr_replace(&idrinfo->action_idr, a, a->tcfa_index))); spin_unlock(&idrinfo->lock); } EXPORT_SYMBOL(tcf_idr_insert); +/* Cleanup idr index that was allocated but not initialized. 
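+ *
+ * The intended calling pattern, as a minimal sketch assembled from the
+ * init functions converted later in this patch (attribute parsing, the
+ * bind/override handling and per-action setup are elided; see e.g. the
+ * tcf_connmark_init() hunk below for the real thing):
+ *
+ *	err = tcf_idr_check_alloc(tn, &parm->index, a, bind);
+ *	if (err < 0)
+ *		return err;
+ *	if (!err) {
+ *		err = tcf_idr_create(tn, parm->index, est, a, ops, bind, false);
+ *		if (err) {
+ *			tcf_idr_cleanup(tn, parm->index);
+ *			return err;
+ *		}
+ *		tcf_idr_insert(tn, *a);
+ *		return ACT_P_CREATED;
+ *	}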
*/ + +void tcf_idr_cleanup(struct tc_action_net *tn, u32 index) +{ + struct tcf_idrinfo *idrinfo = tn->idrinfo; + + spin_lock(&idrinfo->lock); + /* Remove ERR_PTR(-EBUSY) allocated by tcf_idr_check_alloc */ + WARN_ON(!IS_ERR(idr_remove(&idrinfo->action_idr, index))); + spin_unlock(&idrinfo->lock); +} +EXPORT_SYMBOL(tcf_idr_cleanup); + +/* Check if action with specified index exists. If an action is found, increment + * its reference and bind counters, and return 1. Otherwise insert a temporary + * error pointer (to prevent concurrent users from inserting actions with the same + * index) and return 0. + */ + +int tcf_idr_check_alloc(struct tc_action_net *tn, u32 *index, + struct tc_action **a, int bind) +{ + struct tcf_idrinfo *idrinfo = tn->idrinfo; + struct tc_action *p; + int ret; + +again: + spin_lock(&idrinfo->lock); + if (*index) { + p = idr_find(&idrinfo->action_idr, *index); + if (IS_ERR(p)) { + /* This means that another process allocated + * index but did not assign the pointer yet. + */ + spin_unlock(&idrinfo->lock); + goto again; + } + + if (p) { + refcount_inc(&p->tcfa_refcnt); + if (bind) + atomic_inc(&p->tcfa_bindcnt); + *a = p; + ret = 1; + } else { + *a = NULL; + ret = idr_alloc_u32(&idrinfo->action_idr, NULL, index, + *index, GFP_ATOMIC); + if (!ret) + idr_replace(&idrinfo->action_idr, + ERR_PTR(-EBUSY), *index); + } + } else { + *index = 1; + *a = NULL; + ret = idr_alloc_u32(&idrinfo->action_idr, NULL, index, + UINT_MAX, GFP_ATOMIC); + if (!ret) + idr_replace(&idrinfo->action_idr, ERR_PTR(-EBUSY), + *index); + } + spin_unlock(&idrinfo->lock); + return ret; +} +EXPORT_SYMBOL(tcf_idr_check_alloc); + void tcf_idrinfo_destroy(const struct tc_action_ops *ops, struct tcf_idrinfo *idrinfo) { diff --git a/net/sched/act_bpf.c b/net/sched/act_bpf.c index d3f4ac6f2c4b..06f743d8ed41 100644 --- a/net/sched/act_bpf.c +++ b/net/sched/act_bpf.c @@ -299,14 +299,17 @@ static int tcf_bpf_init(struct net *net, struct nlattr *nla, parm = nla_data(tb[TCA_ACT_BPF_PARMS]); - if (!tcf_idr_check(tn, parm->index, act, bind)) { + ret = tcf_idr_check_alloc(tn, &parm->index, act, bind); + if (!ret) { ret = tcf_idr_create(tn, parm->index, est, act, &act_bpf_ops, bind, true); - if (ret < 0) + if (ret < 0) { + tcf_idr_cleanup(tn, parm->index); return ret; + } res = ACT_P_CREATED; - } else { + } else if (ret > 0) { /* Don't override defaults. 
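 * (In this pattern ret > 0 means tcf_idr_check_alloc() found an existing
 * action and already took a reference to it, so the -EEXIST path below
 * must drop that reference again via tcf_idr_release().)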
*/ if (bind) return 0; @@ -315,6 +318,8 @@ static int tcf_bpf_init(struct net *net, struct nlattr *nla, tcf_idr_release(*act, bind); return -EEXIST; } + } else { + return ret; } is_bpf = tb[TCA_ACT_BPF_OPS_LEN] && tb[TCA_ACT_BPF_OPS]; diff --git a/net/sched/act_connmark.c b/net/sched/act_connmark.c index 701e90244eff..1e31f0e448e2 100644 --- a/net/sched/act_connmark.c +++ b/net/sched/act_connmark.c @@ -118,11 +118,14 @@ static int tcf_connmark_init(struct net *net, struct nlattr *nla, parm = nla_data(tb[TCA_CONNMARK_PARMS]); - if (!tcf_idr_check(tn, parm->index, a, bind)) { + ret = tcf_idr_check_alloc(tn, &parm->index, a, bind); + if (!ret) { ret = tcf_idr_create(tn, parm->index, est, a, &act_connmark_ops, bind, false); - if (ret) + if (ret) { + tcf_idr_cleanup(tn, parm->index); return ret; + } ci = to_connmark(*a); ci->tcf_action = parm->action; @@ -131,7 +134,7 @@ static int tcf_connmark_init(struct net *net, struct nlattr *nla, tcf_idr_insert(tn, *a); ret = ACT_P_CREATED; - } else { + } else if (ret > 0) { ci = to_connmark(*a); if (bind) return 0; @@ -142,6 +145,7 @@ static int tcf_connmark_init(struct net *net, struct nlattr *nla, /* replacing action and zone */ ci->tcf_action = parm->action; ci->zone = parm->zone; + ret = 0; } return ret; diff --git a/net/sched/act_csum.c b/net/sched/act_csum.c index 5dbee136b0a1..bd232d3bd022 100644 --- a/net/sched/act_csum.c +++ b/net/sched/act_csum.c @@ -67,19 +67,24 @@ static int tcf_csum_init(struct net *net, struct nlattr *nla, return -EINVAL; parm = nla_data(tb[TCA_CSUM_PARMS]); - if (!tcf_idr_check(tn, parm->index, a, bind)) { + err = tcf_idr_check_alloc(tn, &parm->index, a, bind); + if (!err) { ret = tcf_idr_create(tn, parm->index, est, a, &act_csum_ops, bind, true); - if (ret) + if (ret) { + tcf_idr_cleanup(tn, parm->index); return ret; + } ret = ACT_P_CREATED; - } else { + } else if (err > 0) { if (bind)/* dont override defaults */ return 0; if (!ovr) { tcf_idr_release(*a, bind); return -EEXIST; } + } else { + return err; } p = to_tcf_csum(*a); diff --git a/net/sched/act_gact.c b/net/sched/act_gact.c index 11c4de3f344e..661b72b9147d 100644 --- a/net/sched/act_gact.c +++ b/net/sched/act_gact.c @@ -91,19 +91,24 @@ static int tcf_gact_init(struct net *net, struct nlattr *nla, } #endif - if (!tcf_idr_check(tn, parm->index, a, bind)) { + err = tcf_idr_check_alloc(tn, &parm->index, a, bind); + if (!err) { ret = tcf_idr_create(tn, parm->index, est, a, &act_gact_ops, bind, true); - if (ret) + if (ret) { + tcf_idr_cleanup(tn, parm->index); return ret; + } ret = ACT_P_CREATED; - } else { + } else if (err > 0) { if (bind)/* dont override defaults */ return 0; if (!ovr) { tcf_idr_release(*a, bind); return -EEXIST; } + } else { + return err; } gact = to_gact(*a); diff --git a/net/sched/act_ife.c b/net/sched/act_ife.c index acea3feae762..a3eef00cd711 100644 --- a/net/sched/act_ife.c +++ b/net/sched/act_ife.c @@ -484,7 +484,10 @@ static int tcf_ife_init(struct net *net, struct nlattr *nla, if (!p) return -ENOMEM; - exists = tcf_idr_check(tn, parm->index, a, bind); + err = tcf_idr_check_alloc(tn, &parm->index, a, bind); + if (err < 0) + return err; + exists = err; if (exists && bind) { kfree(p); return 0; @@ -494,6 +497,7 @@ static int tcf_ife_init(struct net *net, struct nlattr *nla, ret = tcf_idr_create(tn, parm->index, est, a, &act_ife_ops, bind, true); if (ret) { + tcf_idr_cleanup(tn, parm->index); kfree(p); return ret; } diff --git a/net/sched/act_ipt.c b/net/sched/act_ipt.c index 85e85dfba401..0dc787a57798 100644 --- a/net/sched/act_ipt.c +++ 
b/net/sched/act_ipt.c @@ -119,13 +119,18 @@ static int __tcf_ipt_init(struct net *net, unsigned int id, struct nlattr *nla, if (tb[TCA_IPT_INDEX] != NULL) index = nla_get_u32(tb[TCA_IPT_INDEX]); - exists = tcf_idr_check(tn, index, a, bind); + err = tcf_idr_check_alloc(tn, &index, a, bind); + if (err < 0) + return err; + exists = err; if (exists && bind) return 0; if (tb[TCA_IPT_HOOK] == NULL || tb[TCA_IPT_TARG] == NULL) { if (exists) tcf_idr_release(*a, bind); + else + tcf_idr_cleanup(tn, index); return -EINVAL; } @@ -133,14 +138,18 @@ static int __tcf_ipt_init(struct net *net, unsigned int id, struct nlattr *nla, if (nla_len(tb[TCA_IPT_TARG]) < td->u.target_size) { if (exists) tcf_idr_release(*a, bind); + else + tcf_idr_cleanup(tn, index); return -EINVAL; } if (!exists) { ret = tcf_idr_create(tn, index, est, a, ops, bind, false); - if (ret) + if (ret) { + tcf_idr_cleanup(tn, index); return ret; + } ret = ACT_P_CREATED; } else { if (bind)/* dont override defaults */ diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c index e08aed06d7f8..6afd89a36c69 100644 --- a/net/sched/act_mirred.c +++ b/net/sched/act_mirred.c @@ -79,7 +79,7 @@ static int tcf_mirred_init(struct net *net, struct nlattr *nla, struct tcf_mirred *m; struct net_device *dev; bool exists = false; - int ret; + int ret, err; if (!nla) { NL_SET_ERR_MSG_MOD(extack, "Mirred requires attributes to be passed"); @@ -94,7 +94,10 @@ static int tcf_mirred_init(struct net *net, struct nlattr *nla, } parm = nla_data(tb[TCA_MIRRED_PARMS]); - exists = tcf_idr_check(tn, parm->index, a, bind); + err = tcf_idr_check_alloc(tn, &parm->index, a, bind); + if (err < 0) + return err; + exists = err; if (exists && bind) return 0; @@ -107,6 +110,8 @@ static int tcf_mirred_init(struct net *net, struct nlattr *nla, default: if (exists) tcf_idr_release(*a, bind); + else + tcf_idr_cleanup(tn, parm->index); NL_SET_ERR_MSG_MOD(extack, "Unknown mirred option"); return -EINVAL; } @@ -115,6 +120,8 @@ static int tcf_mirred_init(struct net *net, struct nlattr *nla, if (dev == NULL) { if (exists) tcf_idr_release(*a, bind); + else + tcf_idr_cleanup(tn, parm->index); return -ENODEV; } mac_header_xmit = dev_is_mac_header_xmit(dev); @@ -124,13 +131,16 @@ static int tcf_mirred_init(struct net *net, struct nlattr *nla, if (!exists) { if (!dev) { + tcf_idr_cleanup(tn, parm->index); NL_SET_ERR_MSG_MOD(extack, "Specified device does not exist"); return -EINVAL; } ret = tcf_idr_create(tn, parm->index, est, a, &act_mirred_ops, bind, true); - if (ret) + if (ret) { + tcf_idr_cleanup(tn, parm->index); return ret; + } ret = ACT_P_CREATED; } else if (!ovr) { tcf_idr_release(*a, bind); diff --git a/net/sched/act_nat.c b/net/sched/act_nat.c index 1f91e8e66c0f..4dd9188a72fd 100644 --- a/net/sched/act_nat.c +++ b/net/sched/act_nat.c @@ -57,19 +57,24 @@ static int tcf_nat_init(struct net *net, struct nlattr *nla, struct nlattr *est, return -EINVAL; parm = nla_data(tb[TCA_NAT_PARMS]); - if (!tcf_idr_check(tn, parm->index, a, bind)) { + err = tcf_idr_check_alloc(tn, &parm->index, a, bind); + if (!err) { ret = tcf_idr_create(tn, parm->index, est, a, &act_nat_ops, bind, false); - if (ret) + if (ret) { + tcf_idr_cleanup(tn, parm->index); return ret; + } ret = ACT_P_CREATED; - } else { + } else if (err > 0) { if (bind) return 0; if (!ovr) { tcf_idr_release(*a, bind); return -EEXIST; } + } else { + return err; } p = to_tcf_nat(*a); diff --git a/net/sched/act_pedit.c b/net/sched/act_pedit.c index 3a0e2f762f4e..cc8ffcd1ddb5 100644 --- a/net/sched/act_pedit.c +++ 
b/net/sched/act_pedit.c @@ -173,16 +173,20 @@ static int tcf_pedit_init(struct net *net, struct nlattr *nla, if (IS_ERR(keys_ex)) return PTR_ERR(keys_ex); - if (!tcf_idr_check(tn, parm->index, a, bind)) { + err = tcf_idr_check_alloc(tn, &parm->index, a, bind); + if (!err) { if (!parm->nkeys) { + tcf_idr_cleanup(tn, parm->index); NL_SET_ERR_MSG_MOD(extack, "Pedit requires keys to be passed"); ret = -EINVAL; goto out_free; } ret = tcf_idr_create(tn, parm->index, est, a, &act_pedit_ops, bind, false); - if (ret) + if (ret) { + tcf_idr_cleanup(tn, parm->index); goto out_free; + } p = to_pedit(*a); keys = kmalloc(ksize, GFP_KERNEL); if (!keys) { @@ -191,7 +195,7 @@ static int tcf_pedit_init(struct net *net, struct nlattr *nla, goto out_free; } ret = ACT_P_CREATED; - } else { + } else if (err > 0) { if (bind) goto out_free; if (!ovr) { @@ -207,6 +211,8 @@ static int tcf_pedit_init(struct net *net, struct nlattr *nla, goto out_free; } } + } else { + return err; } spin_lock_bh(&p->tcf_lock); diff --git a/net/sched/act_police.c b/net/sched/act_police.c index 99335cca739e..1f3192ea8df7 100644 --- a/net/sched/act_police.c +++ b/net/sched/act_police.c @@ -101,15 +101,20 @@ static int tcf_act_police_init(struct net *net, struct nlattr *nla, return -EINVAL; parm = nla_data(tb[TCA_POLICE_TBF]); - exists = tcf_idr_check(tn, parm->index, a, bind); + err = tcf_idr_check_alloc(tn, &parm->index, a, bind); + if (err < 0) + return err; + exists = err; if (exists && bind) return 0; if (!exists) { ret = tcf_idr_create(tn, parm->index, NULL, a, &act_police_ops, bind, false); - if (ret) + if (ret) { + tcf_idr_cleanup(tn, parm->index); return ret; + } ret = ACT_P_CREATED; } else if (!ovr) { tcf_idr_release(*a, bind); diff --git a/net/sched/act_sample.c b/net/sched/act_sample.c index a8582e1347db..3079e7be5bde 100644 --- a/net/sched/act_sample.c +++ b/net/sched/act_sample.c @@ -46,7 +46,7 @@ static int tcf_sample_init(struct net *net, struct nlattr *nla, struct tc_sample *parm; struct tcf_sample *s; bool exists = false; - int ret; + int ret, err; if (!nla) return -EINVAL; @@ -59,15 +59,20 @@ static int tcf_sample_init(struct net *net, struct nlattr *nla, parm = nla_data(tb[TCA_SAMPLE_PARMS]); - exists = tcf_idr_check(tn, parm->index, a, bind); + err = tcf_idr_check_alloc(tn, &parm->index, a, bind); + if (err < 0) + return err; + exists = err; if (exists && bind) return 0; if (!exists) { ret = tcf_idr_create(tn, parm->index, est, a, &act_sample_ops, bind, false); - if (ret) + if (ret) { + tcf_idr_cleanup(tn, parm->index); return ret; + } ret = ACT_P_CREATED; } else if (!ovr) { tcf_idr_release(*a, bind); diff --git a/net/sched/act_simple.c b/net/sched/act_simple.c index 2da47c682a30..aa51152e0066 100644 --- a/net/sched/act_simple.c +++ b/net/sched/act_simple.c @@ -100,21 +100,28 @@ static int tcf_simp_init(struct net *net, struct nlattr *nla, return -EINVAL; parm = nla_data(tb[TCA_DEF_PARMS]); - exists = tcf_idr_check(tn, parm->index, a, bind); + err = tcf_idr_check_alloc(tn, &parm->index, a, bind); + if (err < 0) + return err; + exists = err; if (exists && bind) return 0; if (tb[TCA_DEF_DATA] == NULL) { if (exists) tcf_idr_release(*a, bind); + else + tcf_idr_cleanup(tn, parm->index); return -EINVAL; } if (!exists) { ret = tcf_idr_create(tn, parm->index, est, a, &act_simp_ops, bind, false); - if (ret) + if (ret) { + tcf_idr_cleanup(tn, parm->index); return ret; + } d = to_defact(*a); ret = alloc_defdata(d, tb[TCA_DEF_DATA]); diff --git a/net/sched/act_skbedit.c b/net/sched/act_skbedit.c index 4616a2c1821f..86521a74ecdd 
100644 --- a/net/sched/act_skbedit.c +++ b/net/sched/act_skbedit.c @@ -152,21 +152,28 @@ static int tcf_skbedit_init(struct net *net, struct nlattr *nla, parm = nla_data(tb[TCA_SKBEDIT_PARMS]); - exists = tcf_idr_check(tn, parm->index, a, bind); + err = tcf_idr_check_alloc(tn, &parm->index, a, bind); + if (err < 0) + return err; + exists = err; if (exists && bind) return 0; if (!flags) { if (exists) tcf_idr_release(*a, bind); + else + tcf_idr_cleanup(tn, parm->index); return -EINVAL; } if (!exists) { ret = tcf_idr_create(tn, parm->index, est, a, &act_skbedit_ops, bind, false); - if (ret) + if (ret) { + tcf_idr_cleanup(tn, parm->index); return ret; + } d = to_skbedit(*a); ret = ACT_P_CREATED; diff --git a/net/sched/act_skbmod.c b/net/sched/act_skbmod.c index e844381af066..cdc6bacfb190 100644 --- a/net/sched/act_skbmod.c +++ b/net/sched/act_skbmod.c @@ -128,21 +128,28 @@ static int tcf_skbmod_init(struct net *net, struct nlattr *nla, if (parm->flags & SKBMOD_F_SWAPMAC) lflags = SKBMOD_F_SWAPMAC; - exists = tcf_idr_check(tn, parm->index, a, bind); + err = tcf_idr_check_alloc(tn, &parm->index, a, bind); + if (err < 0) + return err; + exists = err; if (exists && bind) return 0; if (!lflags) { if (exists) tcf_idr_release(*a, bind); + else + tcf_idr_cleanup(tn, parm->index); return -EINVAL; } if (!exists) { ret = tcf_idr_create(tn, parm->index, est, a, &act_skbmod_ops, bind, true); - if (ret) + if (ret) { + tcf_idr_cleanup(tn, parm->index); return ret; + } ret = ACT_P_CREATED; } else if (!ovr) { diff --git a/net/sched/act_tunnel_key.c b/net/sched/act_tunnel_key.c index ab5bf5c13f87..3ec585d58762 100644 --- a/net/sched/act_tunnel_key.c +++ b/net/sched/act_tunnel_key.c @@ -237,7 +237,10 @@ static int tunnel_key_init(struct net *net, struct nlattr *nla, } parm = nla_data(tb[TCA_TUNNEL_KEY_PARMS]); - exists = tcf_idr_check(tn, parm->index, a, bind); + err = tcf_idr_check_alloc(tn, &parm->index, a, bind); + if (err < 0) + return err; + exists = err; if (exists && bind) return 0; @@ -325,7 +328,7 @@ static int tunnel_key_init(struct net *net, struct nlattr *nla, &act_tunnel_key_ops, bind, true); if (ret) { NL_SET_ERR_MSG(extack, "Cannot create TC IDR"); - return ret; + goto err_out; } ret = ACT_P_CREATED; @@ -364,6 +367,8 @@ static int tunnel_key_init(struct net *net, struct nlattr *nla, err_out: if (exists) tcf_idr_release(*a, bind); + else + tcf_idr_cleanup(tn, parm->index); return ret; } diff --git a/net/sched/act_vlan.c b/net/sched/act_vlan.c index 9b600faaccbb..ad37f308175a 100644 --- a/net/sched/act_vlan.c +++ b/net/sched/act_vlan.c @@ -134,7 +134,10 @@ static int tcf_vlan_init(struct net *net, struct nlattr *nla, if (!tb[TCA_VLAN_PARMS]) return -EINVAL; parm = nla_data(tb[TCA_VLAN_PARMS]); - exists = tcf_idr_check(tn, parm->index, a, bind); + err = tcf_idr_check_alloc(tn, &parm->index, a, bind); + if (err < 0) + return err; + exists = err; if (exists && bind) return 0; @@ -146,12 +149,16 @@ static int tcf_vlan_init(struct net *net, struct nlattr *nla, if (!tb[TCA_VLAN_PUSH_VLAN_ID]) { if (exists) tcf_idr_release(*a, bind); + else + tcf_idr_cleanup(tn, parm->index); return -EINVAL; } push_vid = nla_get_u16(tb[TCA_VLAN_PUSH_VLAN_ID]); if (push_vid >= VLAN_VID_MASK) { if (exists) tcf_idr_release(*a, bind); + else + tcf_idr_cleanup(tn, parm->index); return -ERANGE; } @@ -164,6 +171,8 @@ static int tcf_vlan_init(struct net *net, struct nlattr *nla, default: if (exists) tcf_idr_release(*a, bind); + else + tcf_idr_cleanup(tn, parm->index); return -EPROTONOSUPPORT; } } else { @@ -176,6 +185,8 @@ static 
int tcf_vlan_init(struct net *net, struct nlattr *nla, default: if (exists) tcf_idr_release(*a, bind); + else + tcf_idr_cleanup(tn, parm->index); return -EINVAL; } action = parm->v_action; @@ -183,8 +194,10 @@ static int tcf_vlan_init(struct net *net, struct nlattr *nla, if (!exists) { ret = tcf_idr_create(tn, parm->index, est, a, &act_vlan_ops, bind, true); - if (ret) + if (ret) { + tcf_idr_cleanup(tn, parm->index); return ret; + } ret = ACT_P_CREATED; } else if (!ovr) { -- cgit v1.2.3 From 90b73b77d08ec395311411b545c756ca710aae59 Mon Sep 17 00:00:00 2001 From: Vlad Buslov Date: Thu, 5 Jul 2018 17:24:33 +0300 Subject: net: sched: change action API to use array of pointers to actions Act API used linked list to pass set of actions to functions. It is intrusive data structure that stores list nodes inside action structure itself, which means it is not safe to modify such list concurrently. However, action API doesn't use any linked list specific operations on this set of actions, so it can be safely refactored into plain pointer array. Refactor action API to use array of pointers to tc_actions instead of linked list. Change argument 'actions' type of exported action init, destroy and dump functions. Acked-by: Jiri Pirko Signed-off-by: Vlad Buslov Signed-off-by: David S. Miller --- include/net/act_api.h | 7 ++-- net/sched/act_api.c | 89 +++++++++++++++++++++++++++++---------------------- net/sched/cls_api.c | 21 ++++-------- 3 files changed, 60 insertions(+), 57 deletions(-) (limited to 'net/sched') diff --git a/include/net/act_api.h b/include/net/act_api.h index 8090de2edab7..683ce41053d9 100644 --- a/include/net/act_api.h +++ b/include/net/act_api.h @@ -168,19 +168,20 @@ static inline int tcf_idr_release(struct tc_action *a, bool bind) int tcf_register_action(struct tc_action_ops *a, struct pernet_operations *ops); int tcf_unregister_action(struct tc_action_ops *a, struct pernet_operations *ops); -int tcf_action_destroy(struct list_head *actions, int bind); +int tcf_action_destroy(struct tc_action *actions[], int bind); int tcf_action_exec(struct sk_buff *skb, struct tc_action **actions, int nr_actions, struct tcf_result *res); int tcf_action_init(struct net *net, struct tcf_proto *tp, struct nlattr *nla, struct nlattr *est, char *name, int ovr, int bind, - struct list_head *actions, size_t *attr_size, + struct tc_action *actions[], size_t *attr_size, bool rtnl_held, struct netlink_ext_ack *extack); struct tc_action *tcf_action_init_1(struct net *net, struct tcf_proto *tp, struct nlattr *nla, struct nlattr *est, char *name, int ovr, int bind, bool rtnl_held, struct netlink_ext_ack *extack); -int tcf_action_dump(struct sk_buff *skb, struct list_head *, int, int); +int tcf_action_dump(struct sk_buff *skb, struct tc_action *actions[], int bind, + int ref); int tcf_action_dump_old(struct sk_buff *skb, struct tc_action *a, int, int); int tcf_action_dump_1(struct sk_buff *skb, struct tc_action *a, int, int); int tcf_action_copy_stats(struct sk_buff *, struct tc_action *, int); diff --git a/net/sched/act_api.c b/net/sched/act_api.c index 9511502e1cbb..bf1c35f3deb6 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -657,13 +657,15 @@ repeat: } EXPORT_SYMBOL(tcf_action_exec); -int tcf_action_destroy(struct list_head *actions, int bind) +int tcf_action_destroy(struct tc_action *actions[], int bind) { const struct tc_action_ops *ops; - struct tc_action *a, *tmp; - int ret = 0; + struct tc_action *a; + int ret = 0, i; - list_for_each_entry_safe(a, tmp, actions, list) { + for (i = 0; i < 
TCA_ACT_MAX_PRIO && actions[i]; i++) { + a = actions[i]; + actions[i] = NULL; ops = a->ops; ret = __tcf_idr_release(a, bind, true); if (ret == ACT_P_DELETED) @@ -679,11 +681,12 @@ static int tcf_action_put(struct tc_action *p) return __tcf_action_put(p, false); } -static void tcf_action_put_lst(struct list_head *actions) +static void tcf_action_put_many(struct tc_action *actions[]) { - struct tc_action *a, *tmp; + int i; - list_for_each_entry_safe(a, tmp, actions, list) { + for (i = 0; i < TCA_ACT_MAX_PRIO && actions[i]; i++) { + struct tc_action *a = actions[i]; const struct tc_action_ops *ops = a->ops; if (tcf_action_put(a)) @@ -735,14 +738,15 @@ nla_put_failure: } EXPORT_SYMBOL(tcf_action_dump_1); -int tcf_action_dump(struct sk_buff *skb, struct list_head *actions, +int tcf_action_dump(struct sk_buff *skb, struct tc_action *actions[], int bind, int ref) { struct tc_action *a; - int err = -EINVAL; + int err = -EINVAL, i; struct nlattr *nest; - list_for_each_entry(a, actions, list) { + for (i = 0; i < TCA_ACT_MAX_PRIO && actions[i]; i++) { + a = actions[i]; nest = nla_nest_start(skb, a->order); if (nest == NULL) goto nla_put_failure; @@ -878,10 +882,9 @@ struct tc_action *tcf_action_init_1(struct net *net, struct tcf_proto *tp, if (TC_ACT_EXT_CMP(a->tcfa_action, TC_ACT_GOTO_CHAIN)) { err = tcf_action_goto_chain_init(a, tp); if (err) { - LIST_HEAD(actions); + struct tc_action *actions[] = { a, NULL }; - list_add_tail(&a->list, &actions); - tcf_action_destroy(&actions, bind); + tcf_action_destroy(actions, bind); NL_SET_ERR_MSG(extack, "Failed to init TC action chain"); return ERR_PTR(err); } @@ -899,9 +902,11 @@ err_out: return ERR_PTR(err); } +/* Returns numbers of initialized actions or negative error. */ + int tcf_action_init(struct net *net, struct tcf_proto *tp, struct nlattr *nla, struct nlattr *est, char *name, int ovr, int bind, - struct list_head *actions, size_t *attr_size, + struct tc_action *actions[], size_t *attr_size, bool rtnl_held, struct netlink_ext_ack *extack) { struct nlattr *tb[TCA_ACT_MAX_PRIO + 1]; @@ -923,11 +928,12 @@ int tcf_action_init(struct net *net, struct tcf_proto *tp, struct nlattr *nla, } act->order = i; sz += tcf_action_fill_size(act); - list_add_tail(&act->list, actions); + /* Start from index 0 */ + actions[i - 1] = act; } *attr_size = tcf_action_full_attrs_size(sz); - return 0; + return i - 1; err: tcf_action_destroy(actions, bind); @@ -978,7 +984,7 @@ errout: return -1; } -static int tca_get_fill(struct sk_buff *skb, struct list_head *actions, +static int tca_get_fill(struct sk_buff *skb, struct tc_action *actions[], u32 portid, u32 seq, u16 flags, int event, int bind, int ref) { @@ -1014,7 +1020,7 @@ out_nlmsg_trim: static int tcf_get_notify(struct net *net, u32 portid, struct nlmsghdr *n, - struct list_head *actions, int event, + struct tc_action *actions[], int event, struct netlink_ext_ack *extack) { struct sk_buff *skb; @@ -1150,14 +1156,14 @@ err_out: return err; } -static int tcf_action_delete(struct net *net, struct list_head *actions, - struct netlink_ext_ack *extack) +static int tcf_action_delete(struct net *net, struct tc_action *actions[], + int *acts_deleted, struct netlink_ext_ack *extack) { - struct tc_action *a, *tmp; u32 act_index; - int ret; + int ret, i; - list_for_each_entry_safe(a, tmp, actions, list) { + for (i = 0; i < TCA_ACT_MAX_PRIO && actions[i]; i++) { + struct tc_action *a = actions[i]; const struct tc_action_ops *ops = a->ops; /* Actions can be deleted concurrently so we must save their @@ -1165,23 +1171,26 @@ static int 
tcf_action_delete(struct net *net, struct list_head *actions, */ act_index = a->tcfa_index; - list_del(&a->list); if (tcf_action_put(a)) { /* last reference, action was deleted concurrently */ module_put(ops->owner); } else { /* now do the delete */ ret = ops->delete(net, act_index); - if (ret < 0) + if (ret < 0) { + *acts_deleted = i + 1; return ret; + } } } + *acts_deleted = i; return 0; } static int -tcf_del_notify(struct net *net, struct nlmsghdr *n, struct list_head *actions, - u32 portid, size_t attr_size, struct netlink_ext_ack *extack) +tcf_del_notify(struct net *net, struct nlmsghdr *n, struct tc_action *actions[], + int *acts_deleted, u32 portid, size_t attr_size, + struct netlink_ext_ack *extack) { int ret; struct sk_buff *skb; @@ -1199,7 +1208,7 @@ tcf_del_notify(struct net *net, struct nlmsghdr *n, struct list_head *actions, } /* now do the delete */ - ret = tcf_action_delete(net, actions, extack); + ret = tcf_action_delete(net, actions, acts_deleted, extack); if (ret < 0) { NL_SET_ERR_MSG(extack, "Failed to delete TC action"); kfree_skb(skb); @@ -1221,7 +1230,8 @@ tca_action_gd(struct net *net, struct nlattr *nla, struct nlmsghdr *n, struct nlattr *tb[TCA_ACT_MAX_PRIO + 1]; struct tc_action *act; size_t attr_size = 0; - LIST_HEAD(actions); + struct tc_action *actions[TCA_ACT_MAX_PRIO + 1] = {}; + int acts_deleted = 0; ret = nla_parse_nested(tb, TCA_ACT_MAX_PRIO, nla, NULL, extack); if (ret < 0) @@ -1243,26 +1253,27 @@ tca_action_gd(struct net *net, struct nlattr *nla, struct nlmsghdr *n, } act->order = i; attr_size += tcf_action_fill_size(act); - list_add_tail(&act->list, &actions); + actions[i - 1] = act; } attr_size = tcf_action_full_attrs_size(attr_size); if (event == RTM_GETACTION) - ret = tcf_get_notify(net, portid, n, &actions, event, extack); + ret = tcf_get_notify(net, portid, n, actions, event, extack); else { /* delete */ - ret = tcf_del_notify(net, n, &actions, portid, attr_size, extack); + ret = tcf_del_notify(net, n, actions, &acts_deleted, portid, + attr_size, extack); if (ret) goto err; return ret; } err: - tcf_action_put_lst(&actions); + tcf_action_put_many(&actions[acts_deleted]); return ret; } static int -tcf_add_notify(struct net *net, struct nlmsghdr *n, struct list_head *actions, +tcf_add_notify(struct net *net, struct nlmsghdr *n, struct tc_action *actions[], u32 portid, size_t attr_size, struct netlink_ext_ack *extack) { struct sk_buff *skb; @@ -1293,15 +1304,15 @@ static int tcf_action_add(struct net *net, struct nlattr *nla, { size_t attr_size = 0; int ret = 0; - LIST_HEAD(actions); + struct tc_action *actions[TCA_ACT_MAX_PRIO] = {}; - ret = tcf_action_init(net, NULL, nla, NULL, NULL, ovr, 0, &actions, + ret = tcf_action_init(net, NULL, nla, NULL, NULL, ovr, 0, actions, &attr_size, true, extack); - if (ret) + if (ret < 0) return ret; - ret = tcf_add_notify(net, n, &actions, portid, attr_size, extack); + ret = tcf_add_notify(net, n, actions, portid, attr_size, extack); if (ovr) - tcf_action_put_lst(&actions); + tcf_action_put_many(actions); return ret; } diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index 9041f0e43e9a..73d9967c3739 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -1609,10 +1609,7 @@ out: void tcf_exts_destroy(struct tcf_exts *exts) { #ifdef CONFIG_NET_CLS_ACT - LIST_HEAD(actions); - - tcf_exts_to_list(exts, &actions); - tcf_action_destroy(&actions, TCA_ACT_UNBIND); + tcf_action_destroy(exts->actions, TCA_ACT_UNBIND); kfree(exts->actions); exts->nr_actions = 0; #endif @@ -1639,18 +1636,15 @@ int 
tcf_exts_validate(struct net *net, struct tcf_proto *tp, struct nlattr **tb, exts->actions[0] = act; exts->nr_actions = 1; } else if (exts->action && tb[exts->action]) { - LIST_HEAD(actions); - int err, i = 0; + int err; err = tcf_action_init(net, tp, tb[exts->action], rate_tlv, NULL, ovr, TCA_ACT_BIND, - &actions, &attr_size, true, + exts->actions, &attr_size, true, extack); - if (err) + if (err < 0) return err; - list_for_each_entry(act, &actions, list) - exts->actions[i++] = act; - exts->nr_actions = i; + exts->nr_actions = err; } exts->net = net; } @@ -1699,14 +1693,11 @@ int tcf_exts_dump(struct sk_buff *skb, struct tcf_exts *exts) * tc data even if iproute2 was newer - jhs */ if (exts->type != TCA_OLD_COMPAT) { - LIST_HEAD(actions); - nest = nla_nest_start(skb, exts->action); if (nest == NULL) goto nla_put_failure; - tcf_exts_to_list(exts, &actions); - if (tcf_action_dump(skb, &actions, 0, 0) < 0) + if (tcf_action_dump(skb, exts->actions, 0, 0) < 0) goto nla_put_failure; nla_nest_end(skb, nest); } else if (exts->police) { -- cgit v1.2.3 From 0dbc81eab4d13f6d295da69c00e6efee2427b55c Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Sun, 8 Jul 2018 17:02:59 +0900 Subject: net: sched: Fix warnings from xchg() on RCU'd cookie pointer. The kbuild test robot reports: >> net/sched/act_api.c:71:15: sparse: incorrect type in initializer (different address spaces) @@ expected struct tc_cookie [noderef] *__ret @@ got [noderef] *__ret @@ net/sched/act_api.c:71:15: expected struct tc_cookie [noderef] *__ret net/sched/act_api.c:71:15: got struct tc_cookie *new_cookie >> net/sched/act_api.c:71:13: sparse: incorrect type in assignment (different address spaces) @@ expected struct tc_cookie *old @@ got struct tc_cookie [noderef] *[assigned] __ret >> net/sched/act_api.c:132:48: sparse: dereference of noderef expression Handle this in the usual way by force casting away the __rcu annotation when we are using xchg() on it. Fixes: eec94fdb0480 ("net: sched: use rcu for action cookie update") Reported-by: kbuild test robot Signed-off-by: David S. Miller --- net/sched/act_api.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/sched') diff --git a/net/sched/act_api.c b/net/sched/act_api.c index bf1c35f3deb6..66dc19746c63 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -68,7 +68,7 @@ static void tcf_set_action_cookie(struct tc_cookie __rcu **old_cookie, { struct tc_cookie *old; - old = xchg(old_cookie, new_cookie); + old = xchg((__force struct tc_cookie **)old_cookie, new_cookie); if (old) call_rcu(&old->rcu, tcf_free_cookie_rcu); } -- cgit v1.2.3 From 046f6fd5daefac7f5abdafb436b30f63bc7c602b Mon Sep 17 00:00:00 2001 From: Toke Høiland-Jørgensen Date: Fri, 6 Jul 2018 17:37:19 +0200 Subject: sched: Add Common Applications Kept Enhanced (cake) qdisc MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit sch_cake targets the home router use case and is intended to squeeze the most bandwidth and latency out of even the slowest ISP links and routers, while presenting an API simple enough that even an ISP can configure it. Example of use on a cable ISP uplink: tc qdisc add dev eth0 cake bandwidth 20Mbit nat docsis ack-filter To shape a cable download link (ifb and tc-mirred setup elided) tc qdisc add dev ifb0 cake bandwidth 200mbit nat docsis ingress wash CAKE is filled with: * A hybrid Codel/Blue AQM algorithm, "Cobalt", tied to an FQ_Codel derived Flow Queuing system, which autoconfigures based on the bandwidth. 
* A novel "triple-isolate" mode (the default) which balances per-host and per-flow FQ even through NAT. * An deficit based shaper, that can also be used in an unlimited mode. * 8 way set associative hashing to reduce flow collisions to a minimum. * A reasonable interpretation of various diffserv latency/loss tradeoffs. * Support for zeroing diffserv markings for entering and exiting traffic. * Support for interacting well with Docsis 3.0 shaper framing. * Extensive support for DSL framing types. * Support for ack filtering. * Extensive statistics for measuring, loss, ecn markings, latency variation. A paper describing the design of CAKE is available at https://arxiv.org/abs/1804.07617, and will be published at the 2018 IEEE International Symposium on Local and Metropolitan Area Networks (LANMAN). This patch adds the base shaper and packet scheduler, while subsequent commits add the optional (configurable) features. The full userspace API and most data structures are included in this commit, but options not understood in the base version will be ignored. Various versions baking have been available as an out of tree build for kernel versions going back to 3.10, as the embedded router world has been running a few years behind mainline Linux. A stable version has been generally available on lede-17.01 and later. sch_cake replaces a combination of iptables, tc filter, htb and fq_codel in the sqm-scripts, with sane defaults and vastly simpler configuration. CAKE's principal author is Jonathan Morton, with contributions from Kevin Darbyshire-Bryant, Toke Høiland-Jørgensen, Sebastian Moeller, Ryan Mounce, Tony Ambardar, Dean Scarff, Nils Andreas Svee, Dave Täht, and Loganaden Velvindron. Testing from Pete Heist, Georgios Amanakis, and the many other members of the cake@lists.bufferbloat.net mailing list. tc -s qdisc show dev eth2 qdisc cake 8017: root refcnt 2 bandwidth 1Gbit diffserv3 triple-isolate split-gso rtt 100.0ms noatm overhead 38 mpu 84 Sent 51504294511 bytes 37724591 pkt (dropped 6, overlimits 64958695 requeues 12) backlog 0b 0p requeues 12 memory used: 1053008b of 15140Kb capacity estimate: 970Mbit min/max network layer size: 28 / 1500 min/max overhead-adjusted size: 84 / 1538 average network hdr offset: 14 Bulk Best Effort Voice thresh 62500Kbit 1Gbit 250Mbit target 5.0ms 5.0ms 5.0ms interval 100.0ms 100.0ms 100.0ms pk_delay 5us 5us 6us av_delay 3us 2us 2us sp_delay 2us 1us 1us backlog 0b 0b 0b pkts 3164050 25030267 9530280 bytes 3227519915 35396974782 12879808898 way_inds 0 8 0 way_miss 21 366 25 way_cols 0 0 0 drops 5 0 1 marks 0 0 0 ack_drop 0 0 0 sp_flows 1 3 0 bk_flows 0 1 1 un_flows 0 0 0 max_len 68130 68130 68130 Tested-by: Pete Heist Tested-by: Georgios Amanakis Signed-off-by: Dave Taht Signed-off-by: Toke Høiland-Jørgensen Signed-off-by: David S. 
Miller --- include/uapi/linux/pkt_sched.h | 114 +++ net/sched/Kconfig | 11 + net/sched/Makefile | 1 + net/sched/sch_cake.c | 1867 ++++++++++++++++++++++++++++++++++++++++ 4 files changed, 1993 insertions(+) create mode 100644 net/sched/sch_cake.c (limited to 'net/sched') diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h index 949118461009..d9cc9dc4f547 100644 --- a/include/uapi/linux/pkt_sched.h +++ b/include/uapi/linux/pkt_sched.h @@ -955,4 +955,118 @@ enum { #define TCA_ETF_MAX (__TCA_ETF_MAX - 1) + +/* CAKE */ +enum { + TCA_CAKE_UNSPEC, + TCA_CAKE_PAD, + TCA_CAKE_BASE_RATE64, + TCA_CAKE_DIFFSERV_MODE, + TCA_CAKE_ATM, + TCA_CAKE_FLOW_MODE, + TCA_CAKE_OVERHEAD, + TCA_CAKE_RTT, + TCA_CAKE_TARGET, + TCA_CAKE_AUTORATE, + TCA_CAKE_MEMORY, + TCA_CAKE_NAT, + TCA_CAKE_RAW, + TCA_CAKE_WASH, + TCA_CAKE_MPU, + TCA_CAKE_INGRESS, + TCA_CAKE_ACK_FILTER, + TCA_CAKE_SPLIT_GSO, + __TCA_CAKE_MAX +}; +#define TCA_CAKE_MAX (__TCA_CAKE_MAX - 1) + +enum { + __TCA_CAKE_STATS_INVALID, + TCA_CAKE_STATS_PAD, + TCA_CAKE_STATS_CAPACITY_ESTIMATE64, + TCA_CAKE_STATS_MEMORY_LIMIT, + TCA_CAKE_STATS_MEMORY_USED, + TCA_CAKE_STATS_AVG_NETOFF, + TCA_CAKE_STATS_MIN_NETLEN, + TCA_CAKE_STATS_MAX_NETLEN, + TCA_CAKE_STATS_MIN_ADJLEN, + TCA_CAKE_STATS_MAX_ADJLEN, + TCA_CAKE_STATS_TIN_STATS, + TCA_CAKE_STATS_DEFICIT, + TCA_CAKE_STATS_COBALT_COUNT, + TCA_CAKE_STATS_DROPPING, + TCA_CAKE_STATS_DROP_NEXT_US, + TCA_CAKE_STATS_P_DROP, + TCA_CAKE_STATS_BLUE_TIMER_US, + __TCA_CAKE_STATS_MAX +}; +#define TCA_CAKE_STATS_MAX (__TCA_CAKE_STATS_MAX - 1) + +enum { + __TCA_CAKE_TIN_STATS_INVALID, + TCA_CAKE_TIN_STATS_PAD, + TCA_CAKE_TIN_STATS_SENT_PACKETS, + TCA_CAKE_TIN_STATS_SENT_BYTES64, + TCA_CAKE_TIN_STATS_DROPPED_PACKETS, + TCA_CAKE_TIN_STATS_DROPPED_BYTES64, + TCA_CAKE_TIN_STATS_ACKS_DROPPED_PACKETS, + TCA_CAKE_TIN_STATS_ACKS_DROPPED_BYTES64, + TCA_CAKE_TIN_STATS_ECN_MARKED_PACKETS, + TCA_CAKE_TIN_STATS_ECN_MARKED_BYTES64, + TCA_CAKE_TIN_STATS_BACKLOG_PACKETS, + TCA_CAKE_TIN_STATS_BACKLOG_BYTES, + TCA_CAKE_TIN_STATS_THRESHOLD_RATE64, + TCA_CAKE_TIN_STATS_TARGET_US, + TCA_CAKE_TIN_STATS_INTERVAL_US, + TCA_CAKE_TIN_STATS_WAY_INDIRECT_HITS, + TCA_CAKE_TIN_STATS_WAY_MISSES, + TCA_CAKE_TIN_STATS_WAY_COLLISIONS, + TCA_CAKE_TIN_STATS_PEAK_DELAY_US, + TCA_CAKE_TIN_STATS_AVG_DELAY_US, + TCA_CAKE_TIN_STATS_BASE_DELAY_US, + TCA_CAKE_TIN_STATS_SPARSE_FLOWS, + TCA_CAKE_TIN_STATS_BULK_FLOWS, + TCA_CAKE_TIN_STATS_UNRESPONSIVE_FLOWS, + TCA_CAKE_TIN_STATS_MAX_SKBLEN, + TCA_CAKE_TIN_STATS_FLOW_QUANTUM, + __TCA_CAKE_TIN_STATS_MAX +}; +#define TCA_CAKE_TIN_STATS_MAX (__TCA_CAKE_TIN_STATS_MAX - 1) +#define TC_CAKE_MAX_TINS (8) + +enum { + CAKE_FLOW_NONE = 0, + CAKE_FLOW_SRC_IP, + CAKE_FLOW_DST_IP, + CAKE_FLOW_HOSTS, /* = CAKE_FLOW_SRC_IP | CAKE_FLOW_DST_IP */ + CAKE_FLOW_FLOWS, + CAKE_FLOW_DUAL_SRC, /* = CAKE_FLOW_SRC_IP | CAKE_FLOW_FLOWS */ + CAKE_FLOW_DUAL_DST, /* = CAKE_FLOW_DST_IP | CAKE_FLOW_FLOWS */ + CAKE_FLOW_TRIPLE, /* = CAKE_FLOW_HOSTS | CAKE_FLOW_FLOWS */ + CAKE_FLOW_MAX, +}; + +enum { + CAKE_DIFFSERV_DIFFSERV3 = 0, + CAKE_DIFFSERV_DIFFSERV4, + CAKE_DIFFSERV_DIFFSERV8, + CAKE_DIFFSERV_BESTEFFORT, + CAKE_DIFFSERV_PRECEDENCE, + CAKE_DIFFSERV_MAX +}; + +enum { + CAKE_ACK_NONE = 0, + CAKE_ACK_FILTER, + CAKE_ACK_AGGRESSIVE, + CAKE_ACK_MAX +}; + +enum { + CAKE_ATM_NONE = 0, + CAKE_ATM_ATM, + CAKE_ATM_PTM, + CAKE_ATM_MAX +}; + #endif diff --git a/net/sched/Kconfig b/net/sched/Kconfig index fcc89706745b..7af246764a35 100644 --- a/net/sched/Kconfig +++ b/net/sched/Kconfig @@ -295,6 +295,17 @@ config NET_SCH_FQ_CODEL If unsure, 
say N. +config NET_SCH_CAKE + tristate "Common Applications Kept Enhanced (CAKE)" + help + Say Y here if you want to use the Common Applications Kept Enhanced + (CAKE) queue management algorithm. + + To compile this driver as a module, choose M here: the module + will be called sch_cake. + + If unsure, say N. + config NET_SCH_FQ tristate "Fair Queue" help diff --git a/net/sched/Makefile b/net/sched/Makefile index 9a5a7077d217..673ee7d26ff2 100644 --- a/net/sched/Makefile +++ b/net/sched/Makefile @@ -50,6 +50,7 @@ obj-$(CONFIG_NET_SCH_CHOKE) += sch_choke.o obj-$(CONFIG_NET_SCH_QFQ) += sch_qfq.o obj-$(CONFIG_NET_SCH_CODEL) += sch_codel.o obj-$(CONFIG_NET_SCH_FQ_CODEL) += sch_fq_codel.o +obj-$(CONFIG_NET_SCH_CAKE) += sch_cake.o obj-$(CONFIG_NET_SCH_FQ) += sch_fq.o obj-$(CONFIG_NET_SCH_HHF) += sch_hhf.o obj-$(CONFIG_NET_SCH_PIE) += sch_pie.o diff --git a/net/sched/sch_cake.c b/net/sched/sch_cake.c new file mode 100644 index 000000000000..ea0272615d63 --- /dev/null +++ b/net/sched/sch_cake.c @@ -0,0 +1,1867 @@ +// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause + +/* COMMON Applications Kept Enhanced (CAKE) discipline + * + * Copyright (C) 2014-2018 Jonathan Morton + * Copyright (C) 2015-2018 Toke Høiland-Jørgensen + * Copyright (C) 2014-2018 Dave Täht + * Copyright (C) 2015-2018 Sebastian Moeller + * (C) 2015-2018 Kevin Darbyshire-Bryant + * Copyright (C) 2017-2018 Ryan Mounce + * + * The CAKE Principles: + * (or, how to have your cake and eat it too) + * + * This is a combination of several shaping, AQM and FQ techniques into one + * easy-to-use package: + * + * - An overall bandwidth shaper, to move the bottleneck away from dumb CPE + * equipment and bloated MACs. This operates in deficit mode (as in sch_fq), + * eliminating the need for any sort of burst parameter (eg. token bucket + * depth). Burst support is limited to that necessary to overcome scheduling + * latency. + * + * - A Diffserv-aware priority queue, giving more priority to certain classes, + * up to a specified fraction of bandwidth. Above that bandwidth threshold, + * the priority is reduced to avoid starving other tins. + * + * - Each priority tin has a separate Flow Queue system, to isolate traffic + * flows from each other. This prevents a burst on one flow from increasing + * the delay to another. Flows are distributed to queues using a + * set-associative hash function. + * + * - Each queue is actively managed by Cobalt, which is a combination of the + * Codel and Blue AQM algorithms. This serves flows fairly, and signals + * congestion early via ECN (if available) and/or packet drops, to keep + * latency low. The codel parameters are auto-tuned based on the bandwidth + * setting, as is necessary at low bandwidths. + * + * The configuration parameters are kept deliberately simple for ease of use. + * Everything has sane defaults. Complete generality of configuration is *not* + * a goal. + * + * The priority queue operates according to a weighted DRR scheme, combined with + * a bandwidth tracker which reuses the shaper logic to detect which side of the + * bandwidth sharing threshold the tin is operating. This determines whether a + * priority-based weight (high) or a bandwidth-based weight (low) is used for + * that tin in the current pass. + * + * This qdisc was inspired by Eric Dumazet's fq_codel code, which he kindly + * granted us permission to leverage. 
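+ *
+ * For a concrete feel for the shaper arithmetic (illustrative numbers,
+ * not from this patch): at a configured rate of 20 Mbit/s, a 1538-byte
+ * wire frame occupies the link for 1538 * 8 / 20e6 s, roughly 615 us,
+ * so dequeuing it advances the tin's time_next_packet by that amount;
+ * the (len * rate_ns) >> rate_shft form used below is simply a
+ * fixed-point evaluation of this len / rate product.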
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define CAKE_SET_WAYS (8) +#define CAKE_MAX_TINS (8) +#define CAKE_QUEUES (1024) +#define CAKE_FLOW_MASK 63 +#define CAKE_FLOW_NAT_FLAG 64 + +/* struct cobalt_params - contains codel and blue parameters + * @interval: codel initial drop rate + * @target: maximum persistent sojourn time & blue update rate + * @mtu_time: serialisation delay of maximum-size packet + * @p_inc: increment of blue drop probability (0.32 fxp) + * @p_dec: decrement of blue drop probability (0.32 fxp) + */ +struct cobalt_params { + u64 interval; + u64 target; + u64 mtu_time; + u32 p_inc; + u32 p_dec; +}; + +/* struct cobalt_vars - contains codel and blue variables + * @count: codel dropping frequency + * @rec_inv_sqrt: reciprocal value of sqrt(count) >> 1 + * @drop_next: time to drop next packet, or when we dropped last + * @blue_timer: Blue time to next drop + * @p_drop: BLUE drop probability (0.32 fxp) + * @dropping: set if in dropping state + * @ecn_marked: set if marked + */ +struct cobalt_vars { + u32 count; + u32 rec_inv_sqrt; + ktime_t drop_next; + ktime_t blue_timer; + u32 p_drop; + bool dropping; + bool ecn_marked; +}; + +enum { + CAKE_SET_NONE = 0, + CAKE_SET_SPARSE, + CAKE_SET_SPARSE_WAIT, /* counted in SPARSE, actually in BULK */ + CAKE_SET_BULK, + CAKE_SET_DECAYING +}; + +struct cake_flow { + /* this stuff is all needed per-flow at dequeue time */ + struct sk_buff *head; + struct sk_buff *tail; + struct list_head flowchain; + s32 deficit; + u32 dropped; + struct cobalt_vars cvars; + u16 srchost; /* index into cake_host table */ + u16 dsthost; + u8 set; +}; /* please try to keep this structure <= 64 bytes */ + +struct cake_host { + u32 srchost_tag; + u32 dsthost_tag; + u16 srchost_refcnt; + u16 dsthost_refcnt; +}; + +struct cake_heap_entry { + u16 t:3, b:10; +}; + +struct cake_tin_data { + struct cake_flow flows[CAKE_QUEUES]; + u32 backlogs[CAKE_QUEUES]; + u32 tags[CAKE_QUEUES]; /* for set association */ + u16 overflow_idx[CAKE_QUEUES]; + struct cake_host hosts[CAKE_QUEUES]; /* for triple isolation */ + u16 flow_quantum; + + struct cobalt_params cparams; + u32 drop_overlimit; + u16 bulk_flow_count; + u16 sparse_flow_count; + u16 decaying_flow_count; + u16 unresponsive_flow_count; + + u32 max_skblen; + + struct list_head new_flows; + struct list_head old_flows; + struct list_head decaying_flows; + + /* time_next = time_this + ((len * rate_ns) >> rate_shft) */ + ktime_t time_next_packet; + u64 tin_rate_ns; + u64 tin_rate_bps; + u16 tin_rate_shft; + + u16 tin_quantum_prio; + u16 tin_quantum_band; + s32 tin_deficit; + u32 tin_backlog; + u32 tin_dropped; + u32 tin_ecn_mark; + + u32 packets; + u64 bytes; + + u32 ack_drops; + + /* moving averages */ + u64 avge_delay; + u64 peak_delay; + u64 base_delay; + + /* hash function stats */ + u32 way_directs; + u32 way_hits; + u32 way_misses; + u32 way_collisions; +}; /* number of tins is small, so size of this struct doesn't matter much */ + +struct cake_sched_data { + struct tcf_proto __rcu *filter_list; /* optional external classifier */ + struct tcf_block *block; + struct cake_tin_data *tins; + + struct cake_heap_entry overflow_heap[CAKE_QUEUES * CAKE_MAX_TINS]; + u16 overflow_timeout; + + u16 tin_cnt; + u8 tin_mode; + u8 flow_mode; + u8 ack_filter; + u8 atm_mode; + + /* time_next = time_this + ((len * rate_ns) >> rate_shft) */ + u16 rate_shft; + ktime_t 
time_next_packet; + ktime_t failsafe_next_packet; + u64 rate_ns; + u64 rate_bps; + u16 rate_flags; + s16 rate_overhead; + u16 rate_mpu; + u64 interval; + u64 target; + + /* resource tracking */ + u32 buffer_used; + u32 buffer_max_used; + u32 buffer_limit; + u32 buffer_config_limit; + + /* indices for dequeue */ + u16 cur_tin; + u16 cur_flow; + + struct qdisc_watchdog watchdog; + const u8 *tin_index; + const u8 *tin_order; + + /* bandwidth capacity estimate */ + ktime_t last_packet_time; + ktime_t avg_window_begin; + u64 avg_packet_interval; + u64 avg_window_bytes; + u64 avg_peak_bandwidth; + ktime_t last_reconfig_time; + + /* packet length stats */ + u32 avg_netoff; + u16 max_netlen; + u16 max_adjlen; + u16 min_netlen; + u16 min_adjlen; +}; + +enum { + CAKE_FLAG_OVERHEAD = BIT(0), + CAKE_FLAG_AUTORATE_INGRESS = BIT(1), + CAKE_FLAG_INGRESS = BIT(2), + CAKE_FLAG_WASH = BIT(3), + CAKE_FLAG_SPLIT_GSO = BIT(4) +}; + +/* COBALT operates the Codel and BLUE algorithms in parallel, in order to + * obtain the best features of each. Codel is excellent on flows which + * respond to congestion signals in a TCP-like way. BLUE is more effective on + * unresponsive flows. + */ + +struct cobalt_skb_cb { + ktime_t enqueue_time; +}; + +static u64 us_to_ns(u64 us) +{ + return us * NSEC_PER_USEC; +} + +static struct cobalt_skb_cb *get_cobalt_cb(const struct sk_buff *skb) +{ + qdisc_cb_private_validate(skb, sizeof(struct cobalt_skb_cb)); + return (struct cobalt_skb_cb *)qdisc_skb_cb(skb)->data; +} + +static ktime_t cobalt_get_enqueue_time(const struct sk_buff *skb) +{ + return get_cobalt_cb(skb)->enqueue_time; +} + +static void cobalt_set_enqueue_time(struct sk_buff *skb, + ktime_t now) +{ + get_cobalt_cb(skb)->enqueue_time = now; +} + +static u16 quantum_div[CAKE_QUEUES + 1] = {0}; + +#define REC_INV_SQRT_CACHE (16) +static u32 cobalt_rec_inv_sqrt_cache[REC_INV_SQRT_CACHE] = {0}; + +/* http://en.wikipedia.org/wiki/Methods_of_computing_square_roots + * new_invsqrt = (invsqrt / 2) * (3 - count * invsqrt^2) + * + * Here, invsqrt is a fixed point number (< 1.0), 32bit mantissa, aka Q0.32 + */ + +static void cobalt_newton_step(struct cobalt_vars *vars) +{ + u32 invsqrt, invsqrt2; + u64 val; + + invsqrt = vars->rec_inv_sqrt; + invsqrt2 = ((u64)invsqrt * invsqrt) >> 32; + val = (3LL << 32) - ((u64)vars->count * invsqrt2); + + val >>= 2; /* avoid overflow in following multiply */ + val = (val * invsqrt) >> (32 - 2 + 1); + + vars->rec_inv_sqrt = val; +} + +static void cobalt_invsqrt(struct cobalt_vars *vars) +{ + if (vars->count < REC_INV_SQRT_CACHE) + vars->rec_inv_sqrt = cobalt_rec_inv_sqrt_cache[vars->count]; + else + cobalt_newton_step(vars); +} + +/* There is a big difference in timing between the accurate values placed in + * the cache and the approximations given by a single Newton step for small + * count values, particularly when stepping from count 1 to 2 or vice versa. + * Above 16, a single Newton step gives sufficient accuracy in either + * direction, given the precision stored. + * + * The magnitude of the error when stepping up to count 2 is such as to give + * the value that *should* have been produced at count 4. 
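The size of that error is easy to reproduce outside the kernel. Below is a standalone userspace sketch of the same Q0.32 update (the helper name is illustrative, not the patch's); starting near 1.0, a single step at count 2 lands on about 0.5, the value that belongs to count 4, while the error shrinks as count grows:

	#include <stdint.h>
	#include <stdio.h>
	#include <math.h>

	/* one Newton-Raphson step for 1/sqrt(count) in Q0.32 fixed point,
	 * mirroring cobalt_newton_step(): x' = (x / 2) * (3 - count * x^2)
	 */
	static uint32_t newton_step(uint32_t count, uint32_t invsqrt)
	{
		uint64_t invsqrt2 = ((uint64_t)invsqrt * invsqrt) >> 32;
		uint64_t val = (3ULL << 32) - count * invsqrt2;

		val >>= 2;	/* avoid overflow in the following multiply */
		return (val * invsqrt) >> (32 - 2 + 1);
	}

	int main(void)
	{
		uint32_t x = ~0U;	/* ~1.0 in Q0.32, as in the cache */

		for (uint32_t count = 1; count <= 20; count++) {
			x = newton_step(count, x);
			printf("count=%2u approx=%.4f exact=%.4f\n",
			       count, x / 4294967296.0, 1.0 / sqrt(count));
		}
		return 0;
	}

Compile with -lm; the count=2 line shows the overshoot that the four-iteration cache seeding below papers over.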
+ */ + +static void cobalt_cache_init(void) +{ + struct cobalt_vars v; + + memset(&v, 0, sizeof(v)); + v.rec_inv_sqrt = ~0U; + cobalt_rec_inv_sqrt_cache[0] = v.rec_inv_sqrt; + + for (v.count = 1; v.count < REC_INV_SQRT_CACHE; v.count++) { + cobalt_newton_step(&v); + cobalt_newton_step(&v); + cobalt_newton_step(&v); + cobalt_newton_step(&v); + + cobalt_rec_inv_sqrt_cache[v.count] = v.rec_inv_sqrt; + } +} + +static void cobalt_vars_init(struct cobalt_vars *vars) +{ + memset(vars, 0, sizeof(*vars)); + + if (!cobalt_rec_inv_sqrt_cache[0]) { + cobalt_cache_init(); + cobalt_rec_inv_sqrt_cache[0] = ~0; + } +} + +/* CoDel control_law is t + interval/sqrt(count) + * We maintain in rec_inv_sqrt the reciprocal value of sqrt(count) to avoid + * both sqrt() and divide operation. + */ +static ktime_t cobalt_control(ktime_t t, + u64 interval, + u32 rec_inv_sqrt) +{ + return ktime_add_ns(t, reciprocal_scale(interval, + rec_inv_sqrt)); +} + +/* Call this when a packet had to be dropped due to queue overflow. Returns + * true if the BLUE state was quiescent before but active after this call. + */ +static bool cobalt_queue_full(struct cobalt_vars *vars, + struct cobalt_params *p, + ktime_t now) +{ + bool up = false; + + if (ktime_to_ns(ktime_sub(now, vars->blue_timer)) > p->target) { + up = !vars->p_drop; + vars->p_drop += p->p_inc; + if (vars->p_drop < p->p_inc) + vars->p_drop = ~0; + vars->blue_timer = now; + } + vars->dropping = true; + vars->drop_next = now; + if (!vars->count) + vars->count = 1; + + return up; +} + +/* Call this when the queue was serviced but turned out to be empty. Returns + * true if the BLUE state was active before but quiescent after this call. + */ +static bool cobalt_queue_empty(struct cobalt_vars *vars, + struct cobalt_params *p, + ktime_t now) +{ + bool down = false; + + if (vars->p_drop && + ktime_to_ns(ktime_sub(now, vars->blue_timer)) > p->target) { + if (vars->p_drop < p->p_dec) + vars->p_drop = 0; + else + vars->p_drop -= p->p_dec; + vars->blue_timer = now; + down = !vars->p_drop; + } + vars->dropping = false; + + if (vars->count && ktime_to_ns(ktime_sub(now, vars->drop_next)) >= 0) { + vars->count--; + cobalt_invsqrt(vars); + vars->drop_next = cobalt_control(vars->drop_next, + p->interval, + vars->rec_inv_sqrt); + } + + return down; +} + +/* Call this with a freshly dequeued packet for possible congestion marking. + * Returns true as an instruction to drop the packet, false for delivery. + */ +static bool cobalt_should_drop(struct cobalt_vars *vars, + struct cobalt_params *p, + ktime_t now, + struct sk_buff *skb) +{ + bool next_due, over_target, drop = false; + ktime_t schedule; + u64 sojourn; + +/* The 'schedule' variable records, in its sign, whether 'now' is before or + * after 'drop_next'. This allows 'drop_next' to be updated before the next + * scheduling decision is actually branched, without destroying that + * information. Similarly, the first 'schedule' value calculated is preserved + * in the boolean 'next_due'. + * + * As for 'drop_next', we take advantage of the fact that 'interval' is both + * the delay between first exceeding 'target' and the first signalling event, + * *and* the scaling factor for the signalling frequency. It's therefore very + * natural to use a single mechanism for both purposes, and eliminates a + * significant amount of reference Codel's spaghetti code. To help with this, + * both the '0' and '1' entries in the invsqrt cache are 0xFFFFFFFF, as close + * as possible to 1.0 in fixed-point. 
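For a feel of the resulting signalling schedule, here is a hedged standalone sketch (userspace C; the exact reciprocal square roots are computed directly here, whereas the code above steps them incrementally through the cache and Newton iterations). The gap between successive congestion signals shrinks as count grows, which is the interval/sqrt(count) control law in action:

	#include <stdint.h>
	#include <stdio.h>
	#include <math.h>

	/* CoDel control law: t_next = t + interval / sqrt(count), computed
	 * as in cobalt_control() above by multiplying with the Q0.32
	 * reciprocal square root instead of dividing
	 */
	static uint64_t control_law(uint64_t t, uint64_t interval,
				    uint32_t rec_inv_sqrt)
	{
		return t + ((interval * rec_inv_sqrt) >> 32);
	}

	int main(void)
	{
		const uint64_t interval = 100 * 1000 * 1000;	/* 100 ms in ns */
		uint64_t t = 0;

		for (uint32_t count = 1; count <= 8; count++) {
			/* Q0.32 approximation of 1/sqrt(count) */
			uint32_t r = (uint32_t)(4294967295.0 / sqrt(count));

			t = control_law(t, interval, r);
			printf("count=%u: next signal at %.1f ms\n",
			       count, t / 1e6);
		}
		return 0;
	}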
+ */ + + sojourn = ktime_to_ns(ktime_sub(now, cobalt_get_enqueue_time(skb))); + schedule = ktime_sub(now, vars->drop_next); + over_target = sojourn > p->target && + sojourn > p->mtu_time * 4; + next_due = vars->count && ktime_to_ns(schedule) >= 0; + + vars->ecn_marked = false; + + if (over_target) { + if (!vars->dropping) { + vars->dropping = true; + vars->drop_next = cobalt_control(now, + p->interval, + vars->rec_inv_sqrt); + } + if (!vars->count) + vars->count = 1; + } else if (vars->dropping) { + vars->dropping = false; + } + + if (next_due && vars->dropping) { + /* Use ECN mark if possible, otherwise drop */ + drop = !(vars->ecn_marked = INET_ECN_set_ce(skb)); + + vars->count++; + if (!vars->count) + vars->count--; + cobalt_invsqrt(vars); + vars->drop_next = cobalt_control(vars->drop_next, + p->interval, + vars->rec_inv_sqrt); + schedule = ktime_sub(now, vars->drop_next); + } else { + while (next_due) { + vars->count--; + cobalt_invsqrt(vars); + vars->drop_next = cobalt_control(vars->drop_next, + p->interval, + vars->rec_inv_sqrt); + schedule = ktime_sub(now, vars->drop_next); + next_due = vars->count && ktime_to_ns(schedule) >= 0; + } + } + + /* Simple BLUE implementation. Lack of ECN is deliberate. */ + if (vars->p_drop) + drop |= (prandom_u32() < vars->p_drop); + + /* Overload the drop_next field as an activity timeout */ + if (!vars->count) + vars->drop_next = ktime_add_ns(now, p->interval); + else if (ktime_to_ns(schedule) > 0 && !drop) + vars->drop_next = now; + + return drop; +} + +/* Cake has several subtle multiple bit settings. In these cases you + * would be matching triple isolate mode as well. + */ + +static bool cake_dsrc(int flow_mode) +{ + return (flow_mode & CAKE_FLOW_DUAL_SRC) == CAKE_FLOW_DUAL_SRC; +} + +static bool cake_ddst(int flow_mode) +{ + return (flow_mode & CAKE_FLOW_DUAL_DST) == CAKE_FLOW_DUAL_DST; +} + +static u32 cake_hash(struct cake_tin_data *q, const struct sk_buff *skb, + int flow_mode) +{ + u32 flow_hash = 0, srchost_hash, dsthost_hash; + u16 reduced_hash, srchost_idx, dsthost_idx; + struct flow_keys keys, host_keys; + + if (unlikely(flow_mode == CAKE_FLOW_NONE)) + return 0; + + skb_flow_dissect_flow_keys(skb, &keys, + FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL); + + /* flow_hash_from_keys() sorts the addresses by value, so we have + * to preserve their order in a separate data structure to treat + * src and dst host addresses as independently selectable. + */ + host_keys = keys; + host_keys.ports.ports = 0; + host_keys.basic.ip_proto = 0; + host_keys.keyid.keyid = 0; + host_keys.tags.flow_label = 0; + + switch (host_keys.control.addr_type) { + case FLOW_DISSECTOR_KEY_IPV4_ADDRS: + host_keys.addrs.v4addrs.src = 0; + dsthost_hash = flow_hash_from_keys(&host_keys); + host_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src; + host_keys.addrs.v4addrs.dst = 0; + srchost_hash = flow_hash_from_keys(&host_keys); + break; + + case FLOW_DISSECTOR_KEY_IPV6_ADDRS: + memset(&host_keys.addrs.v6addrs.src, 0, + sizeof(host_keys.addrs.v6addrs.src)); + dsthost_hash = flow_hash_from_keys(&host_keys); + host_keys.addrs.v6addrs.src = keys.addrs.v6addrs.src; + memset(&host_keys.addrs.v6addrs.dst, 0, + sizeof(host_keys.addrs.v6addrs.dst)); + srchost_hash = flow_hash_from_keys(&host_keys); + break; + + default: + dsthost_hash = 0; + srchost_hash = 0; + } + + /* This *must* be after the above switch, since as a + * side-effect it sorts the src and dst addresses. 
+ */ + if (flow_mode & CAKE_FLOW_FLOWS) + flow_hash = flow_hash_from_keys(&keys); + + if (!(flow_mode & CAKE_FLOW_FLOWS)) { + if (flow_mode & CAKE_FLOW_SRC_IP) + flow_hash ^= srchost_hash; + + if (flow_mode & CAKE_FLOW_DST_IP) + flow_hash ^= dsthost_hash; + } + + reduced_hash = flow_hash % CAKE_QUEUES; + + /* set-associative hashing */ + /* fast path if no hash collision (direct lookup succeeds) */ + if (likely(q->tags[reduced_hash] == flow_hash && + q->flows[reduced_hash].set)) { + q->way_directs++; + } else { + u32 inner_hash = reduced_hash % CAKE_SET_WAYS; + u32 outer_hash = reduced_hash - inner_hash; + bool allocate_src = false; + bool allocate_dst = false; + u32 i, k; + + /* check if any active queue in the set is reserved for + * this flow. + */ + for (i = 0, k = inner_hash; i < CAKE_SET_WAYS; + i++, k = (k + 1) % CAKE_SET_WAYS) { + if (q->tags[outer_hash + k] == flow_hash) { + if (i) + q->way_hits++; + + if (!q->flows[outer_hash + k].set) { + /* need to increment host refcnts */ + allocate_src = cake_dsrc(flow_mode); + allocate_dst = cake_ddst(flow_mode); + } + + goto found; + } + } + + /* no queue is reserved for this flow, look for an + * empty one. + */ + for (i = 0; i < CAKE_SET_WAYS; + i++, k = (k + 1) % CAKE_SET_WAYS) { + if (!q->flows[outer_hash + k].set) { + q->way_misses++; + allocate_src = cake_dsrc(flow_mode); + allocate_dst = cake_ddst(flow_mode); + goto found; + } + } + + /* With no empty queues, default to the original + * queue, accept the collision, update the host tags. + */ + q->way_collisions++; + q->hosts[q->flows[reduced_hash].srchost].srchost_refcnt--; + q->hosts[q->flows[reduced_hash].dsthost].dsthost_refcnt--; + allocate_src = cake_dsrc(flow_mode); + allocate_dst = cake_ddst(flow_mode); +found: + /* reserve queue for future packets in same flow */ + reduced_hash = outer_hash + k; + q->tags[reduced_hash] = flow_hash; + + if (allocate_src) { + srchost_idx = srchost_hash % CAKE_QUEUES; + inner_hash = srchost_idx % CAKE_SET_WAYS; + outer_hash = srchost_idx - inner_hash; + for (i = 0, k = inner_hash; i < CAKE_SET_WAYS; + i++, k = (k + 1) % CAKE_SET_WAYS) { + if (q->hosts[outer_hash + k].srchost_tag == + srchost_hash) + goto found_src; + } + for (i = 0; i < CAKE_SET_WAYS; + i++, k = (k + 1) % CAKE_SET_WAYS) { + if (!q->hosts[outer_hash + k].srchost_refcnt) + break; + } + q->hosts[outer_hash + k].srchost_tag = srchost_hash; +found_src: + srchost_idx = outer_hash + k; + q->hosts[srchost_idx].srchost_refcnt++; + q->flows[reduced_hash].srchost = srchost_idx; + } + + if (allocate_dst) { + dsthost_idx = dsthost_hash % CAKE_QUEUES; + inner_hash = dsthost_idx % CAKE_SET_WAYS; + outer_hash = dsthost_idx - inner_hash; + for (i = 0, k = inner_hash; i < CAKE_SET_WAYS; + i++, k = (k + 1) % CAKE_SET_WAYS) { + if (q->hosts[outer_hash + k].dsthost_tag == + dsthost_hash) + goto found_dst; + } + for (i = 0; i < CAKE_SET_WAYS; + i++, k = (k + 1) % CAKE_SET_WAYS) { + if (!q->hosts[outer_hash + k].dsthost_refcnt) + break; + } + q->hosts[outer_hash + k].dsthost_tag = dsthost_hash; +found_dst: + dsthost_idx = outer_hash + k; + q->hosts[dsthost_idx].dsthost_refcnt++; + q->flows[reduced_hash].dsthost = dsthost_idx; + } + } + + return reduced_hash; +} + +/* helper functions : might be changed when/if skb use a standard list_head */ +/* remove one skb from head of slot queue */ + +static struct sk_buff *dequeue_head(struct cake_flow *flow) +{ + struct sk_buff *skb = flow->head; + + if (skb) { + flow->head = skb->next; + skb->next = NULL; + } + + return skb; +} + +/* add skb to flow queue 
(tail add) */ + +static void flow_queue_add(struct cake_flow *flow, struct sk_buff *skb) +{ + if (!flow->head) + flow->head = skb; + else + flow->tail->next = skb; + flow->tail = skb; + skb->next = NULL; +} + +static u64 cake_ewma(u64 avg, u64 sample, u32 shift) +{ + avg -= avg >> shift; + avg += sample >> shift; + return avg; +} + +static void cake_heap_swap(struct cake_sched_data *q, u16 i, u16 j) +{ + struct cake_heap_entry ii = q->overflow_heap[i]; + struct cake_heap_entry jj = q->overflow_heap[j]; + + q->overflow_heap[i] = jj; + q->overflow_heap[j] = ii; + + q->tins[ii.t].overflow_idx[ii.b] = j; + q->tins[jj.t].overflow_idx[jj.b] = i; +} + +static u32 cake_heap_get_backlog(const struct cake_sched_data *q, u16 i) +{ + struct cake_heap_entry ii = q->overflow_heap[i]; + + return q->tins[ii.t].backlogs[ii.b]; +} + +static void cake_heapify(struct cake_sched_data *q, u16 i) +{ + static const u32 a = CAKE_MAX_TINS * CAKE_QUEUES; + u32 mb = cake_heap_get_backlog(q, i); + u32 m = i; + + while (m < a) { + u32 l = m + m + 1; + u32 r = l + 1; + + if (l < a) { + u32 lb = cake_heap_get_backlog(q, l); + + if (lb > mb) { + m = l; + mb = lb; + } + } + + if (r < a) { + u32 rb = cake_heap_get_backlog(q, r); + + if (rb > mb) { + m = r; + mb = rb; + } + } + + if (m != i) { + cake_heap_swap(q, i, m); + i = m; + } else { + break; + } + } +} + +static void cake_heapify_up(struct cake_sched_data *q, u16 i) +{ + while (i > 0 && i < CAKE_MAX_TINS * CAKE_QUEUES) { + u16 p = (i - 1) >> 1; + u32 ib = cake_heap_get_backlog(q, i); + u32 pb = cake_heap_get_backlog(q, p); + + if (ib > pb) { + cake_heap_swap(q, i, p); + i = p; + } else { + break; + } + } +} + +static int cake_advance_shaper(struct cake_sched_data *q, + struct cake_tin_data *b, + struct sk_buff *skb, + ktime_t now, bool drop) +{ + u32 len = qdisc_pkt_len(skb); + + /* charge packet bandwidth to this tin + * and to the global shaper. 
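The shaper arithmetic referenced throughout ("time_next = time_this + ((len * rate_ns) >> rate_shft)") can be tried in isolation. A standalone sketch, assuming the configured rate is a byte rate and skipping the clamping that cake_set_rate() applies later in this file:

	#include <stdint.h>
	#include <stdio.h>

	/* convert a byte rate into (rate_ns, rate_shft) so that a packet's
	 * cost is (len * rate_ns) >> rate_shft nanoseconds
	 */
	static void rate_to_ns(uint64_t rate_bytes_per_sec,
			       uint64_t *rate_ns, uint32_t *rate_shft)
	{
		uint64_t ns = ((uint64_t)1000000000 << 34) / rate_bytes_per_sec;
		uint32_t shft = 34;

		while (ns >> 34) {	/* keep the mantissa below 2^34 */
			ns >>= 1;
			shft--;
		}
		*rate_ns = ns;
		*rate_shft = shft;
	}

	int main(void)
	{
		uint64_t rate_ns;
		uint32_t shft;

		rate_to_ns(125000000, &rate_ns, &shft); /* 1 Gbit/s in bytes/s */
		printf("%llu ns per 1514-byte packet\n",
		       (unsigned long long)((1514 * rate_ns) >> shft));
		return 0;
	}

For 1 Gbit/s (125,000,000 bytes/s) this advances the virtual clock by 12112 ns per 1514-byte packet, matching the packet's wire serialisation delay.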
+ */ + if (q->rate_ns) { + u64 tin_dur = (len * b->tin_rate_ns) >> b->tin_rate_shft; + u64 global_dur = (len * q->rate_ns) >> q->rate_shft; + u64 failsafe_dur = global_dur + (global_dur >> 1); + + if (ktime_before(b->time_next_packet, now)) + b->time_next_packet = ktime_add_ns(b->time_next_packet, + tin_dur); + + else if (ktime_before(b->time_next_packet, + ktime_add_ns(now, tin_dur))) + b->time_next_packet = ktime_add_ns(now, tin_dur); + + q->time_next_packet = ktime_add_ns(q->time_next_packet, + global_dur); + if (!drop) + q->failsafe_next_packet = \ + ktime_add_ns(q->failsafe_next_packet, + failsafe_dur); + } + return len; +} + +static unsigned int cake_drop(struct Qdisc *sch, struct sk_buff **to_free) +{ + struct cake_sched_data *q = qdisc_priv(sch); + ktime_t now = ktime_get(); + u32 idx = 0, tin = 0, len; + struct cake_heap_entry qq; + struct cake_tin_data *b; + struct cake_flow *flow; + struct sk_buff *skb; + + if (!q->overflow_timeout) { + int i; + /* Build fresh max-heap */ + for (i = CAKE_MAX_TINS * CAKE_QUEUES / 2; i >= 0; i--) + cake_heapify(q, i); + } + q->overflow_timeout = 65535; + + /* select longest queue for pruning */ + qq = q->overflow_heap[0]; + tin = qq.t; + idx = qq.b; + + b = &q->tins[tin]; + flow = &b->flows[idx]; + skb = dequeue_head(flow); + if (unlikely(!skb)) { + /* heap has gone wrong, rebuild it next time */ + q->overflow_timeout = 0; + return idx + (tin << 16); + } + + if (cobalt_queue_full(&flow->cvars, &b->cparams, now)) + b->unresponsive_flow_count++; + + len = qdisc_pkt_len(skb); + q->buffer_used -= skb->truesize; + b->backlogs[idx] -= len; + b->tin_backlog -= len; + sch->qstats.backlog -= len; + qdisc_tree_reduce_backlog(sch, 1, len); + + flow->dropped++; + b->tin_dropped++; + sch->qstats.drops++; + + __qdisc_drop(skb, to_free); + sch->q.qlen--; + + cake_heapify(q, 0); + + return idx + (tin << 16); +} + +static u32 cake_classify(struct Qdisc *sch, struct cake_tin_data *t, + struct sk_buff *skb, int flow_mode, int *qerr) +{ + struct cake_sched_data *q = qdisc_priv(sch); + struct tcf_proto *filter; + struct tcf_result res; + int result; + + filter = rcu_dereference_bh(q->filter_list); + if (!filter) + return cake_hash(t, skb, flow_mode) + 1; + + *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; + result = tcf_classify(skb, filter, &res, false); + if (result >= 0) { +#ifdef CONFIG_NET_CLS_ACT + switch (result) { + case TC_ACT_STOLEN: + case TC_ACT_QUEUED: + case TC_ACT_TRAP: + *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN; + /* fall through */ + case TC_ACT_SHOT: + return 0; + } +#endif + if (TC_H_MIN(res.classid) <= CAKE_QUEUES) + return TC_H_MIN(res.classid); + } + return 0; +} + +static s32 cake_enqueue(struct sk_buff *skb, struct Qdisc *sch, + struct sk_buff **to_free) +{ + struct cake_sched_data *q = qdisc_priv(sch); + int len = qdisc_pkt_len(skb); + int uninitialized_var(ret); + ktime_t now = ktime_get(); + struct cake_tin_data *b; + struct cake_flow *flow; + u32 idx, tin; + + tin = 0; + b = &q->tins[tin]; + + /* choose flow to insert into */ + idx = cake_classify(sch, b, skb, q->flow_mode, &ret); + if (idx == 0) { + if (ret & __NET_XMIT_BYPASS) + qdisc_qstats_drop(sch); + __qdisc_drop(skb, to_free); + return ret; + } + idx--; + flow = &b->flows[idx]; + + /* ensure shaper state isn't stale */ + if (!b->tin_backlog) { + if (ktime_before(b->time_next_packet, now)) + b->time_next_packet = now; + + if (!sch->q.qlen) { + if (ktime_before(q->time_next_packet, now)) { + q->failsafe_next_packet = now; + q->time_next_packet = now; + } else if 
(ktime_after(q->time_next_packet, now) && + ktime_after(q->failsafe_next_packet, now)) { + u64 next = \ + min(ktime_to_ns(q->time_next_packet), + ktime_to_ns( + q->failsafe_next_packet)); + sch->qstats.overlimits++; + qdisc_watchdog_schedule_ns(&q->watchdog, next); + } + } + } + + if (unlikely(len > b->max_skblen)) + b->max_skblen = len; + + cobalt_set_enqueue_time(skb, now); + flow_queue_add(flow, skb); + + sch->q.qlen++; + q->buffer_used += skb->truesize; + + /* stats */ + b->packets++; + b->bytes += len; + b->backlogs[idx] += len; + b->tin_backlog += len; + sch->qstats.backlog += len; + q->avg_window_bytes += len; + + if (q->overflow_timeout) + cake_heapify_up(q, b->overflow_idx[idx]); + + /* incoming bandwidth capacity estimate */ + q->avg_window_bytes = 0; + q->last_packet_time = now; + + /* flowchain */ + if (!flow->set || flow->set == CAKE_SET_DECAYING) { + struct cake_host *srchost = &b->hosts[flow->srchost]; + struct cake_host *dsthost = &b->hosts[flow->dsthost]; + u16 host_load = 1; + + if (!flow->set) { + list_add_tail(&flow->flowchain, &b->new_flows); + } else { + b->decaying_flow_count--; + list_move_tail(&flow->flowchain, &b->new_flows); + } + flow->set = CAKE_SET_SPARSE; + b->sparse_flow_count++; + + if (cake_dsrc(q->flow_mode)) + host_load = max(host_load, srchost->srchost_refcnt); + + if (cake_ddst(q->flow_mode)) + host_load = max(host_load, dsthost->dsthost_refcnt); + + flow->deficit = (b->flow_quantum * + quantum_div[host_load]) >> 16; + } else if (flow->set == CAKE_SET_SPARSE_WAIT) { + /* this flow was empty, accounted as a sparse flow, but actually + * in the bulk rotation. + */ + flow->set = CAKE_SET_BULK; + b->sparse_flow_count--; + b->bulk_flow_count++; + } + + if (q->buffer_used > q->buffer_max_used) + q->buffer_max_used = q->buffer_used; + + if (q->buffer_used > q->buffer_limit) { + u32 dropped = 0; + + while (q->buffer_used > q->buffer_limit) { + dropped++; + cake_drop(sch, to_free); + } + b->drop_overlimit += dropped; + } + return NET_XMIT_SUCCESS; +} + +static struct sk_buff *cake_dequeue_one(struct Qdisc *sch) +{ + struct cake_sched_data *q = qdisc_priv(sch); + struct cake_tin_data *b = &q->tins[q->cur_tin]; + struct cake_flow *flow = &b->flows[q->cur_flow]; + struct sk_buff *skb = NULL; + u32 len; + + if (flow->head) { + skb = dequeue_head(flow); + len = qdisc_pkt_len(skb); + b->backlogs[q->cur_flow] -= len; + b->tin_backlog -= len; + sch->qstats.backlog -= len; + q->buffer_used -= skb->truesize; + sch->q.qlen--; + + if (q->overflow_timeout) + cake_heapify(q, b->overflow_idx[q->cur_flow]); + } + return skb; +} + +/* Discard leftover packets from a tin no longer in use. 
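The quantum_div[] table used in the flowchain setup above turns a division by host_load into a multiply and a shift: quantum_div[n] = 65535/n is precomputed (in cake_init() further down), so (quantum * quantum_div[n]) >> 16 is roughly quantum/n. A small standalone demonstration, with invented values:

	#include <stdint.h>
	#include <stdio.h>

	#define QUEUES 1024

	static uint32_t quantum_div[QUEUES + 1];

	int main(void)
	{
		const uint32_t quantum = 1514;	/* one MTU-sized credit */

		quantum_div[0] = ~0U;
		for (int i = 1; i <= QUEUES; i++)
			quantum_div[i] = 65535 / i;

		/* flows sharing a busy host get proportionally smaller quanta */
		for (int load = 1; load <= 4; load++)
			printf("host_load=%d quantum=%u\n", load,
			       (quantum * quantum_div[load]) >> 16);
		return 0;
	}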
*/ +static void cake_clear_tin(struct Qdisc *sch, u16 tin) +{ + struct cake_sched_data *q = qdisc_priv(sch); + struct sk_buff *skb; + + q->cur_tin = tin; + for (q->cur_flow = 0; q->cur_flow < CAKE_QUEUES; q->cur_flow++) + while (!!(skb = cake_dequeue_one(sch))) + kfree_skb(skb); +} + +static struct sk_buff *cake_dequeue(struct Qdisc *sch) +{ + struct cake_sched_data *q = qdisc_priv(sch); + struct cake_tin_data *b = &q->tins[q->cur_tin]; + struct cake_host *srchost, *dsthost; + ktime_t now = ktime_get(); + struct cake_flow *flow; + struct list_head *head; + bool first_flow = true; + struct sk_buff *skb; + u16 host_load; + u64 delay; + u32 len; + +begin: + if (!sch->q.qlen) + return NULL; + + /* global hard shaper */ + if (ktime_after(q->time_next_packet, now) && + ktime_after(q->failsafe_next_packet, now)) { + u64 next = min(ktime_to_ns(q->time_next_packet), + ktime_to_ns(q->failsafe_next_packet)); + + sch->qstats.overlimits++; + qdisc_watchdog_schedule_ns(&q->watchdog, next); + return NULL; + } + + /* Choose a class to work on. */ + if (!q->rate_ns) { + /* In unlimited mode, can't rely on shaper timings, just balance + * with DRR + */ + bool wrapped = false, empty = true; + + while (b->tin_deficit < 0 || + !(b->sparse_flow_count + b->bulk_flow_count)) { + if (b->tin_deficit <= 0) + b->tin_deficit += b->tin_quantum_band; + if (b->sparse_flow_count + b->bulk_flow_count) + empty = false; + + q->cur_tin++; + b++; + if (q->cur_tin >= q->tin_cnt) { + q->cur_tin = 0; + b = q->tins; + + if (wrapped) { + /* It's possible for q->qlen to be + * nonzero when we actually have no + * packets anywhere. + */ + if (empty) + return NULL; + } else { + wrapped = true; + } + } + } + } else { + /* In shaped mode, choose: + * - Highest-priority tin with queue and meeting schedule, or + * - The earliest-scheduled tin with queue. + */ + ktime_t best_time = KTIME_MAX; + int tin, best_tin = 0; + + for (tin = 0; tin < q->tin_cnt; tin++) { + b = q->tins + tin; + if ((b->sparse_flow_count + b->bulk_flow_count) > 0) { + ktime_t time_to_pkt = \ + ktime_sub(b->time_next_packet, now); + + if (ktime_to_ns(time_to_pkt) <= 0 || + ktime_compare(time_to_pkt, + best_time) <= 0) { + best_time = time_to_pkt; + best_tin = tin; + } + } + } + + q->cur_tin = best_tin; + b = q->tins + best_tin; + + /* No point in going further if no packets to deliver. 
*/ + if (unlikely(!(b->sparse_flow_count + b->bulk_flow_count))) + return NULL; + } + +retry: + /* service this class */ + head = &b->decaying_flows; + if (!first_flow || list_empty(head)) { + head = &b->new_flows; + if (list_empty(head)) { + head = &b->old_flows; + if (unlikely(list_empty(head))) { + head = &b->decaying_flows; + if (unlikely(list_empty(head))) + goto begin; + } + } + } + flow = list_first_entry(head, struct cake_flow, flowchain); + q->cur_flow = flow - b->flows; + first_flow = false; + + /* triple isolation (modified DRR++) */ + srchost = &b->hosts[flow->srchost]; + dsthost = &b->hosts[flow->dsthost]; + host_load = 1; + + if (cake_dsrc(q->flow_mode)) + host_load = max(host_load, srchost->srchost_refcnt); + + if (cake_ddst(q->flow_mode)) + host_load = max(host_load, dsthost->dsthost_refcnt); + + WARN_ON(host_load > CAKE_QUEUES); + + /* flow isolation (DRR++) */ + if (flow->deficit <= 0) { + /* The shifted prandom_u32() is a way to apply dithering to + * avoid accumulating roundoff errors + */ + flow->deficit += (b->flow_quantum * quantum_div[host_load] + + (prandom_u32() >> 16)) >> 16; + list_move_tail(&flow->flowchain, &b->old_flows); + + /* Keep all flows with deficits out of the sparse and decaying + * rotations. No non-empty flow can go into the decaying + * rotation, so they can't get deficits + */ + if (flow->set == CAKE_SET_SPARSE) { + if (flow->head) { + b->sparse_flow_count--; + b->bulk_flow_count++; + flow->set = CAKE_SET_BULK; + } else { + /* we've moved it to the bulk rotation for + * correct deficit accounting but we still want + * to count it as a sparse flow, not a bulk one. + */ + flow->set = CAKE_SET_SPARSE_WAIT; + } + } + goto retry; + } + + /* Retrieve a packet via the AQM */ + while (1) { + skb = cake_dequeue_one(sch); + if (!skb) { + /* this queue was actually empty */ + if (cobalt_queue_empty(&flow->cvars, &b->cparams, now)) + b->unresponsive_flow_count--; + + if (flow->cvars.p_drop || flow->cvars.count || + ktime_before(now, flow->cvars.drop_next)) { + /* keep in the flowchain until the state has + * decayed to rest + */ + list_move_tail(&flow->flowchain, + &b->decaying_flows); + if (flow->set == CAKE_SET_BULK) { + b->bulk_flow_count--; + b->decaying_flow_count++; + } else if (flow->set == CAKE_SET_SPARSE || + flow->set == CAKE_SET_SPARSE_WAIT) { + b->sparse_flow_count--; + b->decaying_flow_count++; + } + flow->set = CAKE_SET_DECAYING; + } else { + /* remove empty queue from the flowchain */ + list_del_init(&flow->flowchain); + if (flow->set == CAKE_SET_SPARSE || + flow->set == CAKE_SET_SPARSE_WAIT) + b->sparse_flow_count--; + else if (flow->set == CAKE_SET_BULK) + b->bulk_flow_count--; + else + b->decaying_flow_count--; + + flow->set = CAKE_SET_NONE; + srchost->srchost_refcnt--; + dsthost->dsthost_refcnt--; + } + goto begin; + } + + /* Last packet in queue may be marked, shouldn't be dropped */ + if (!cobalt_should_drop(&flow->cvars, &b->cparams, now, skb) || + !flow->head) + break; + + flow->dropped++; + b->tin_dropped++; + qdisc_tree_reduce_backlog(sch, 1, qdisc_pkt_len(skb)); + qdisc_qstats_drop(sch); + kfree_skb(skb); + } + + b->tin_ecn_mark += !!flow->cvars.ecn_marked; + qdisc_bstats_update(sch, skb); + + /* collect delay stats */ + delay = ktime_to_ns(ktime_sub(now, cobalt_get_enqueue_time(skb))); + b->avge_delay = cake_ewma(b->avge_delay, delay, 8); + b->peak_delay = cake_ewma(b->peak_delay, delay, + delay > b->peak_delay ? 2 : 8); + b->base_delay = cake_ewma(b->base_delay, delay, + delay < b->base_delay ? 
2 : 8); + + len = cake_advance_shaper(q, b, skb, now, false); + flow->deficit -= len; + b->tin_deficit -= len; + + if (ktime_after(q->time_next_packet, now) && sch->q.qlen) { + u64 next = min(ktime_to_ns(q->time_next_packet), + ktime_to_ns(q->failsafe_next_packet)); + + qdisc_watchdog_schedule_ns(&q->watchdog, next); + } else if (!sch->q.qlen) { + int i; + + for (i = 0; i < q->tin_cnt; i++) { + if (q->tins[i].decaying_flow_count) { + ktime_t next = \ + ktime_add_ns(now, + q->tins[i].cparams.target); + + qdisc_watchdog_schedule_ns(&q->watchdog, + ktime_to_ns(next)); + break; + } + } + } + + if (q->overflow_timeout) + q->overflow_timeout--; + + return skb; +} + +static void cake_reset(struct Qdisc *sch) +{ + u32 c; + + for (c = 0; c < CAKE_MAX_TINS; c++) + cake_clear_tin(sch, c); +} + +static const struct nla_policy cake_policy[TCA_CAKE_MAX + 1] = { + [TCA_CAKE_BASE_RATE64] = { .type = NLA_U64 }, + [TCA_CAKE_DIFFSERV_MODE] = { .type = NLA_U32 }, + [TCA_CAKE_ATM] = { .type = NLA_U32 }, + [TCA_CAKE_FLOW_MODE] = { .type = NLA_U32 }, + [TCA_CAKE_OVERHEAD] = { .type = NLA_S32 }, + [TCA_CAKE_RTT] = { .type = NLA_U32 }, + [TCA_CAKE_TARGET] = { .type = NLA_U32 }, + [TCA_CAKE_AUTORATE] = { .type = NLA_U32 }, + [TCA_CAKE_MEMORY] = { .type = NLA_U32 }, + [TCA_CAKE_NAT] = { .type = NLA_U32 }, + [TCA_CAKE_RAW] = { .type = NLA_U32 }, + [TCA_CAKE_WASH] = { .type = NLA_U32 }, + [TCA_CAKE_MPU] = { .type = NLA_U32 }, + [TCA_CAKE_INGRESS] = { .type = NLA_U32 }, + [TCA_CAKE_ACK_FILTER] = { .type = NLA_U32 }, +}; + +static void cake_set_rate(struct cake_tin_data *b, u64 rate, u32 mtu, + u64 target_ns, u64 rtt_est_ns) +{ + /* convert byte-rate into time-per-byte + * so it will always unwedge in reasonable time. + */ + static const u64 MIN_RATE = 64; + u32 byte_target = mtu; + u64 byte_target_ns; + u8 rate_shft = 0; + u64 rate_ns = 0; + + b->flow_quantum = 1514; + if (rate) { + b->flow_quantum = max(min(rate >> 12, 1514ULL), 300ULL); + rate_shft = 34; + rate_ns = ((u64)NSEC_PER_SEC) << rate_shft; + rate_ns = div64_u64(rate_ns, max(MIN_RATE, rate)); + while (!!(rate_ns >> 34)) { + rate_ns >>= 1; + rate_shft--; + } + } /* else unlimited, ie. 
zero delay */ + + b->tin_rate_bps = rate; + b->tin_rate_ns = rate_ns; + b->tin_rate_shft = rate_shft; + + byte_target_ns = (byte_target * rate_ns) >> rate_shft; + + b->cparams.target = max((byte_target_ns * 3) / 2, target_ns); + b->cparams.interval = max(rtt_est_ns + + b->cparams.target - target_ns, + b->cparams.target * 2); + b->cparams.mtu_time = byte_target_ns; + b->cparams.p_inc = 1 << 24; /* 1/256 */ + b->cparams.p_dec = 1 << 20; /* 1/4096 */ +} + +static void cake_reconfigure(struct Qdisc *sch) +{ + struct cake_sched_data *q = qdisc_priv(sch); + struct cake_tin_data *b = &q->tins[0]; + int c, ft = 0; + + q->tin_cnt = 1; + cake_set_rate(b, q->rate_bps, psched_mtu(qdisc_dev(sch)), + us_to_ns(q->target), us_to_ns(q->interval)); + b->tin_quantum_band = 65535; + b->tin_quantum_prio = 65535; + + for (c = q->tin_cnt; c < CAKE_MAX_TINS; c++) { + cake_clear_tin(sch, c); + q->tins[c].cparams.mtu_time = q->tins[ft].cparams.mtu_time; + } + + q->rate_ns = q->tins[ft].tin_rate_ns; + q->rate_shft = q->tins[ft].tin_rate_shft; + + if (q->buffer_config_limit) { + q->buffer_limit = q->buffer_config_limit; + } else if (q->rate_bps) { + u64 t = q->rate_bps * q->interval; + + do_div(t, USEC_PER_SEC / 4); + q->buffer_limit = max_t(u32, t, 4U << 20); + } else { + q->buffer_limit = ~0; + } + + sch->flags &= ~TCQ_F_CAN_BYPASS; + + q->buffer_limit = min(q->buffer_limit, + max(sch->limit * psched_mtu(qdisc_dev(sch)), + q->buffer_config_limit)); +} + +static int cake_change(struct Qdisc *sch, struct nlattr *opt, + struct netlink_ext_ack *extack) +{ + struct cake_sched_data *q = qdisc_priv(sch); + struct nlattr *tb[TCA_CAKE_MAX + 1]; + int err; + + if (!opt) + return -EINVAL; + + err = nla_parse_nested(tb, TCA_CAKE_MAX, opt, cake_policy, extack); + if (err < 0) + return err; + + if (tb[TCA_CAKE_BASE_RATE64]) + q->rate_bps = nla_get_u64(tb[TCA_CAKE_BASE_RATE64]); + + if (tb[TCA_CAKE_FLOW_MODE]) + q->flow_mode = (nla_get_u32(tb[TCA_CAKE_FLOW_MODE]) & + CAKE_FLOW_MASK); + + if (tb[TCA_CAKE_RTT]) { + q->interval = nla_get_u32(tb[TCA_CAKE_RTT]); + + if (!q->interval) + q->interval = 1; + } + + if (tb[TCA_CAKE_TARGET]) { + q->target = nla_get_u32(tb[TCA_CAKE_TARGET]); + + if (!q->target) + q->target = 1; + } + + if (tb[TCA_CAKE_MEMORY]) + q->buffer_config_limit = nla_get_u32(tb[TCA_CAKE_MEMORY]); + + if (q->tins) { + sch_tree_lock(sch); + cake_reconfigure(sch); + sch_tree_unlock(sch); + } + + return 0; +} + +static void cake_destroy(struct Qdisc *sch) +{ + struct cake_sched_data *q = qdisc_priv(sch); + + qdisc_watchdog_cancel(&q->watchdog); + tcf_block_put(q->block); + kvfree(q->tins); +} + +static int cake_init(struct Qdisc *sch, struct nlattr *opt, + struct netlink_ext_ack *extack) +{ + struct cake_sched_data *q = qdisc_priv(sch); + int i, j, err; + + sch->limit = 10240; + q->tin_mode = CAKE_DIFFSERV_BESTEFFORT; + q->flow_mode = CAKE_FLOW_TRIPLE; + + q->rate_bps = 0; /* unlimited by default */ + + q->interval = 100000; /* 100ms default */ + q->target = 5000; /* 5ms: codel RFC argues + * for 5 to 10% of interval + */ + + q->cur_tin = 0; + q->cur_flow = 0; + + qdisc_watchdog_init(&q->watchdog, sch); + + if (opt) { + int err = cake_change(sch, opt, extack); + + if (err) + return err; + } + + err = tcf_block_get(&q->block, &q->filter_list, sch, extack); + if (err) + return err; + + quantum_div[0] = ~0; + for (i = 1; i <= CAKE_QUEUES; i++) + quantum_div[i] = 65535 / i; + + q->tins = kvzalloc(CAKE_MAX_TINS * sizeof(struct cake_tin_data), + GFP_KERNEL); + if (!q->tins) + goto nomem; + + for (i = 0; i < CAKE_MAX_TINS; i++) { 
+ struct cake_tin_data *b = q->tins + i; + + INIT_LIST_HEAD(&b->new_flows); + INIT_LIST_HEAD(&b->old_flows); + INIT_LIST_HEAD(&b->decaying_flows); + b->sparse_flow_count = 0; + b->bulk_flow_count = 0; + b->decaying_flow_count = 0; + + for (j = 0; j < CAKE_QUEUES; j++) { + struct cake_flow *flow = b->flows + j; + u32 k = j * CAKE_MAX_TINS + i; + + INIT_LIST_HEAD(&flow->flowchain); + cobalt_vars_init(&flow->cvars); + + q->overflow_heap[k].t = i; + q->overflow_heap[k].b = j; + b->overflow_idx[j] = k; + } + } + + cake_reconfigure(sch); + q->avg_peak_bandwidth = q->rate_bps; + q->min_netlen = ~0; + q->min_adjlen = ~0; + return 0; + +nomem: + cake_destroy(sch); + return -ENOMEM; +} + +static int cake_dump(struct Qdisc *sch, struct sk_buff *skb) +{ + struct cake_sched_data *q = qdisc_priv(sch); + struct nlattr *opts; + + opts = nla_nest_start(skb, TCA_OPTIONS); + if (!opts) + goto nla_put_failure; + + if (nla_put_u64_64bit(skb, TCA_CAKE_BASE_RATE64, q->rate_bps, + TCA_CAKE_PAD)) + goto nla_put_failure; + + if (nla_put_u32(skb, TCA_CAKE_FLOW_MODE, + q->flow_mode & CAKE_FLOW_MASK)) + goto nla_put_failure; + + if (nla_put_u32(skb, TCA_CAKE_RTT, q->interval)) + goto nla_put_failure; + + if (nla_put_u32(skb, TCA_CAKE_TARGET, q->target)) + goto nla_put_failure; + + if (nla_put_u32(skb, TCA_CAKE_MEMORY, q->buffer_config_limit)) + goto nla_put_failure; + + return nla_nest_end(skb, opts); + +nla_put_failure: + return -1; +} + +static int cake_dump_stats(struct Qdisc *sch, struct gnet_dump *d) +{ + struct nlattr *stats = nla_nest_start(d->skb, TCA_STATS_APP); + struct cake_sched_data *q = qdisc_priv(sch); + struct nlattr *tstats, *ts; + int i; + + if (!stats) + return -1; + +#define PUT_STAT_U32(attr, data) do { \ + if (nla_put_u32(d->skb, TCA_CAKE_STATS_ ## attr, data)) \ + goto nla_put_failure; \ + } while (0) +#define PUT_STAT_U64(attr, data) do { \ + if (nla_put_u64_64bit(d->skb, TCA_CAKE_STATS_ ## attr, \ + data, TCA_CAKE_STATS_PAD)) \ + goto nla_put_failure; \ + } while (0) + + PUT_STAT_U64(CAPACITY_ESTIMATE64, q->avg_peak_bandwidth); + PUT_STAT_U32(MEMORY_LIMIT, q->buffer_limit); + PUT_STAT_U32(MEMORY_USED, q->buffer_max_used); + PUT_STAT_U32(AVG_NETOFF, ((q->avg_netoff + 0x8000) >> 16)); + PUT_STAT_U32(MAX_NETLEN, q->max_netlen); + PUT_STAT_U32(MAX_ADJLEN, q->max_adjlen); + PUT_STAT_U32(MIN_NETLEN, q->min_netlen); + PUT_STAT_U32(MIN_ADJLEN, q->min_adjlen); + +#undef PUT_STAT_U32 +#undef PUT_STAT_U64 + + tstats = nla_nest_start(d->skb, TCA_CAKE_STATS_TIN_STATS); + if (!tstats) + goto nla_put_failure; + +#define PUT_TSTAT_U32(attr, data) do { \ + if (nla_put_u32(d->skb, TCA_CAKE_TIN_STATS_ ## attr, data)) \ + goto nla_put_failure; \ + } while (0) +#define PUT_TSTAT_U64(attr, data) do { \ + if (nla_put_u64_64bit(d->skb, TCA_CAKE_TIN_STATS_ ## attr, \ + data, TCA_CAKE_TIN_STATS_PAD)) \ + goto nla_put_failure; \ + } while (0) + + for (i = 0; i < q->tin_cnt; i++) { + struct cake_tin_data *b = &q->tins[i]; + + ts = nla_nest_start(d->skb, i + 1); + if (!ts) + goto nla_put_failure; + + PUT_TSTAT_U64(THRESHOLD_RATE64, b->tin_rate_bps); + PUT_TSTAT_U64(SENT_BYTES64, b->bytes); + PUT_TSTAT_U32(BACKLOG_BYTES, b->tin_backlog); + + PUT_TSTAT_U32(TARGET_US, + ktime_to_us(ns_to_ktime(b->cparams.target))); + PUT_TSTAT_U32(INTERVAL_US, + ktime_to_us(ns_to_ktime(b->cparams.interval))); + + PUT_TSTAT_U32(SENT_PACKETS, b->packets); + PUT_TSTAT_U32(DROPPED_PACKETS, b->tin_dropped); + PUT_TSTAT_U32(ECN_MARKED_PACKETS, b->tin_ecn_mark); + PUT_TSTAT_U32(ACKS_DROPPED_PACKETS, b->ack_drops); + + PUT_TSTAT_U32(PEAK_DELAY_US, + 
ktime_to_us(ns_to_ktime(b->peak_delay))); + PUT_TSTAT_U32(AVG_DELAY_US, + ktime_to_us(ns_to_ktime(b->avge_delay))); + PUT_TSTAT_U32(BASE_DELAY_US, + ktime_to_us(ns_to_ktime(b->base_delay))); + + PUT_TSTAT_U32(WAY_INDIRECT_HITS, b->way_hits); + PUT_TSTAT_U32(WAY_MISSES, b->way_misses); + PUT_TSTAT_U32(WAY_COLLISIONS, b->way_collisions); + + PUT_TSTAT_U32(SPARSE_FLOWS, b->sparse_flow_count + + b->decaying_flow_count); + PUT_TSTAT_U32(BULK_FLOWS, b->bulk_flow_count); + PUT_TSTAT_U32(UNRESPONSIVE_FLOWS, b->unresponsive_flow_count); + PUT_TSTAT_U32(MAX_SKBLEN, b->max_skblen); + + PUT_TSTAT_U32(FLOW_QUANTUM, b->flow_quantum); + nla_nest_end(d->skb, ts); + } + +#undef PUT_TSTAT_U32 +#undef PUT_TSTAT_U64 + + nla_nest_end(d->skb, tstats); + return nla_nest_end(d->skb, stats); + +nla_put_failure: + nla_nest_cancel(d->skb, stats); + return -1; +} + +static struct Qdisc *cake_leaf(struct Qdisc *sch, unsigned long arg) +{ + return NULL; +} + +static unsigned long cake_find(struct Qdisc *sch, u32 classid) +{ + return 0; +} + +static unsigned long cake_bind(struct Qdisc *sch, unsigned long parent, + u32 classid) +{ + return 0; +} + +static void cake_unbind(struct Qdisc *q, unsigned long cl) +{ +} + +static struct tcf_block *cake_tcf_block(struct Qdisc *sch, unsigned long cl, + struct netlink_ext_ack *extack) +{ + struct cake_sched_data *q = qdisc_priv(sch); + + if (cl) + return NULL; + return q->block; +} + +static int cake_dump_class(struct Qdisc *sch, unsigned long cl, + struct sk_buff *skb, struct tcmsg *tcm) +{ + tcm->tcm_handle |= TC_H_MIN(cl); + return 0; +} + +static int cake_dump_class_stats(struct Qdisc *sch, unsigned long cl, + struct gnet_dump *d) +{ + struct cake_sched_data *q = qdisc_priv(sch); + const struct cake_flow *flow = NULL; + struct gnet_stats_queue qs = { 0 }; + struct nlattr *stats; + u32 idx = cl - 1; + + if (idx < CAKE_QUEUES * q->tin_cnt) { + const struct cake_tin_data *b = &q->tins[idx / CAKE_QUEUES]; + const struct sk_buff *skb; + + flow = &b->flows[idx % CAKE_QUEUES]; + + if (flow->head) { + sch_tree_lock(sch); + skb = flow->head; + while (skb) { + qs.qlen++; + skb = skb->next; + } + sch_tree_unlock(sch); + } + qs.backlog = b->backlogs[idx % CAKE_QUEUES]; + qs.drops = flow->dropped; + } + if (gnet_stats_copy_queue(d, NULL, &qs, qs.qlen) < 0) + return -1; + if (flow) { + ktime_t now = ktime_get(); + + stats = nla_nest_start(d->skb, TCA_STATS_APP); + if (!stats) + return -1; + +#define PUT_STAT_U32(attr, data) do { \ + if (nla_put_u32(d->skb, TCA_CAKE_STATS_ ## attr, data)) \ + goto nla_put_failure; \ + } while (0) +#define PUT_STAT_S32(attr, data) do { \ + if (nla_put_s32(d->skb, TCA_CAKE_STATS_ ## attr, data)) \ + goto nla_put_failure; \ + } while (0) + + PUT_STAT_S32(DEFICIT, flow->deficit); + PUT_STAT_U32(DROPPING, flow->cvars.dropping); + PUT_STAT_U32(COBALT_COUNT, flow->cvars.count); + PUT_STAT_U32(P_DROP, flow->cvars.p_drop); + if (flow->cvars.p_drop) { + PUT_STAT_S32(BLUE_TIMER_US, + ktime_to_us( + ktime_sub(now, + flow->cvars.blue_timer))); + } + if (flow->cvars.dropping) { + PUT_STAT_S32(DROP_NEXT_US, + ktime_to_us( + ktime_sub(now, + flow->cvars.drop_next))); + } + + if (nla_nest_end(d->skb, stats) < 0) + return -1; + } + + return 0; + +nla_put_failure: + nla_nest_cancel(d->skb, stats); + return -1; +} + +static void cake_walk(struct Qdisc *sch, struct qdisc_walker *arg) +{ + struct cake_sched_data *q = qdisc_priv(sch); + unsigned int i, j; + + if (arg->stop) + return; + + for (i = 0; i < q->tin_cnt; i++) { + struct cake_tin_data *b = &q->tins[i]; + + for (j = 0; j 
< CAKE_QUEUES; j++) { + if (list_empty(&b->flows[j].flowchain) || + arg->count < arg->skip) { + arg->count++; + continue; + } + if (arg->fn(sch, i * CAKE_QUEUES + j + 1, arg) < 0) { + arg->stop = 1; + break; + } + arg->count++; + } + } +} + +static const struct Qdisc_class_ops cake_class_ops = { + .leaf = cake_leaf, + .find = cake_find, + .tcf_block = cake_tcf_block, + .bind_tcf = cake_bind, + .unbind_tcf = cake_unbind, + .dump = cake_dump_class, + .dump_stats = cake_dump_class_stats, + .walk = cake_walk, +}; + +static struct Qdisc_ops cake_qdisc_ops __read_mostly = { + .cl_ops = &cake_class_ops, + .id = "cake", + .priv_size = sizeof(struct cake_sched_data), + .enqueue = cake_enqueue, + .dequeue = cake_dequeue, + .peek = qdisc_peek_dequeued, + .init = cake_init, + .reset = cake_reset, + .destroy = cake_destroy, + .change = cake_change, + .dump = cake_dump, + .dump_stats = cake_dump_stats, + .owner = THIS_MODULE, +}; + +static int __init cake_module_init(void) +{ + return register_qdisc(&cake_qdisc_ops); +} + +static void __exit cake_module_exit(void) +{ + unregister_qdisc(&cake_qdisc_ops); +} + +module_init(cake_module_init) +module_exit(cake_module_exit) +MODULE_AUTHOR("Jonathan Morton"); +MODULE_LICENSE("Dual BSD/GPL"); +MODULE_DESCRIPTION("The CAKE shaper."); -- cgit v1.2.3 From 7298de9cd7255a783ba93533acbf1c2b0a9c582d Mon Sep 17 00:00:00 2001 From: Toke Høiland-Jørgensen Date: Fri, 6 Jul 2018 17:37:19 +0200 Subject: sch_cake: Add ingress mode MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The ingress mode is meant to be enabled when CAKE runs downlink of the actual bottleneck (such as on an IFB device). The mode changes the shaper to also account dropped packets to the shaped rate, as these have already traversed the bottleneck. Enabling ingress mode will also tune the AQM to always keep at least two packets queued *for each flow*. This is done by scaling the minimum queue occupancy level that will disable the AQM by the number of active bulk flows. The rationale for this is that retransmits are more expensive in ingress mode, since dropped packets have to traverse the bottleneck again when they are retransmitted; thus, being more lenient and keeping a minimum number of packets queued will improve throughput in cases where the number of active flows is so large that they saturate the bottleneck even at their minimum window size. This commit also adds a separate switch to enable ingress mode rate autoscaling. If enabled, the autoscaling code will observe the actual traffic rate and adjust the shaper rate to match it. This can help avoid latency increases in the case where the actual bottleneck rate decreases below the shaped rate. The scaling filters out spikes by an EWMA filter. Signed-off-by: Toke Høiland-Jørgensen Signed-off-by: David S.
Miller --- net/sched/sch_cake.c | 87 +++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 83 insertions(+), 4 deletions(-) (limited to 'net/sched') diff --git a/net/sched/sch_cake.c b/net/sched/sch_cake.c index ea0272615d63..2950a8d07887 100644 --- a/net/sched/sch_cake.c +++ b/net/sched/sch_cake.c @@ -435,7 +435,8 @@ static bool cobalt_queue_empty(struct cobalt_vars *vars, static bool cobalt_should_drop(struct cobalt_vars *vars, struct cobalt_params *p, ktime_t now, - struct sk_buff *skb) + struct sk_buff *skb, + u32 bulk_flows) { bool next_due, over_target, drop = false; ktime_t schedule; @@ -459,6 +460,7 @@ static bool cobalt_should_drop(struct cobalt_vars *vars, sojourn = ktime_to_ns(ktime_sub(now, cobalt_get_enqueue_time(skb))); schedule = ktime_sub(now, vars->drop_next); over_target = sojourn > p->target && + sojourn > p->mtu_time * bulk_flows * 2 && sojourn > p->mtu_time * 4; next_due = vars->count && ktime_to_ns(schedule) >= 0; @@ -881,6 +883,9 @@ static unsigned int cake_drop(struct Qdisc *sch, struct sk_buff **to_free) b->tin_dropped++; sch->qstats.drops++; + if (q->rate_flags & CAKE_FLAG_INGRESS) + cake_advance_shaper(q, b, skb, now, true); + __qdisc_drop(skb, to_free); sch->q.qlen--; @@ -921,6 +926,8 @@ static u32 cake_classify(struct Qdisc *sch, struct cake_tin_data *t, return 0; } +static void cake_reconfigure(struct Qdisc *sch); + static s32 cake_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free) { @@ -988,8 +995,46 @@ static s32 cake_enqueue(struct sk_buff *skb, struct Qdisc *sch, cake_heapify_up(q, b->overflow_idx[idx]); /* incoming bandwidth capacity estimate */ - q->avg_window_bytes = 0; - q->last_packet_time = now; + if (q->rate_flags & CAKE_FLAG_AUTORATE_INGRESS) { + u64 packet_interval = \ + ktime_to_ns(ktime_sub(now, q->last_packet_time)); + + if (packet_interval > NSEC_PER_SEC) + packet_interval = NSEC_PER_SEC; + + /* filter out short-term bursts, eg. wifi aggregation */ + q->avg_packet_interval = \ + cake_ewma(q->avg_packet_interval, + packet_interval, + (packet_interval > q->avg_packet_interval ? + 2 : 8)); + + q->last_packet_time = now; + + if (packet_interval > q->avg_packet_interval) { + u64 window_interval = \ + ktime_to_ns(ktime_sub(now, + q->avg_window_begin)); + u64 b = q->avg_window_bytes * (u64)NSEC_PER_SEC; + + do_div(b, window_interval); + q->avg_peak_bandwidth = + cake_ewma(q->avg_peak_bandwidth, b, + b > q->avg_peak_bandwidth ? 
2 : 8); + q->avg_window_bytes = 0; + q->avg_window_begin = now; + + if (ktime_after(now, + ktime_add_ms(q->last_reconfig_time, + 250))) { + q->rate_bps = (q->avg_peak_bandwidth * 15) >> 4; + cake_reconfigure(sch); + } + } + } else { + q->avg_window_bytes = 0; + q->last_packet_time = now; + } /* flowchain */ if (!flow->set || flow->set == CAKE_SET_DECAYING) { @@ -1268,15 +1313,27 @@ retry: } /* Last packet in queue may be marked, shouldn't be dropped */ - if (!cobalt_should_drop(&flow->cvars, &b->cparams, now, skb) || + if (!cobalt_should_drop(&flow->cvars, &b->cparams, now, skb, + (b->bulk_flow_count * + !!(q->rate_flags & + CAKE_FLAG_INGRESS))) || !flow->head) break; + /* drop this packet, get another one */ + if (q->rate_flags & CAKE_FLAG_INGRESS) { + len = cake_advance_shaper(q, b, skb, + now, true); + flow->deficit -= len; + b->tin_deficit -= len; + } flow->dropped++; b->tin_dropped++; qdisc_tree_reduce_backlog(sch, 1, qdisc_pkt_len(skb)); qdisc_qstats_drop(sch); kfree_skb(skb); + if (q->rate_flags & CAKE_FLAG_INGRESS) + goto retry; } b->tin_ecn_mark += !!flow->cvars.ecn_marked; @@ -1459,6 +1516,20 @@ static int cake_change(struct Qdisc *sch, struct nlattr *opt, q->target = 1; } + if (tb[TCA_CAKE_AUTORATE]) { + if (!!nla_get_u32(tb[TCA_CAKE_AUTORATE])) + q->rate_flags |= CAKE_FLAG_AUTORATE_INGRESS; + else + q->rate_flags &= ~CAKE_FLAG_AUTORATE_INGRESS; + } + + if (tb[TCA_CAKE_INGRESS]) { + if (!!nla_get_u32(tb[TCA_CAKE_INGRESS])) + q->rate_flags |= CAKE_FLAG_INGRESS; + else + q->rate_flags &= ~CAKE_FLAG_INGRESS; + } + if (tb[TCA_CAKE_MEMORY]) q->buffer_config_limit = nla_get_u32(tb[TCA_CAKE_MEMORY]); @@ -1582,6 +1653,14 @@ static int cake_dump(struct Qdisc *sch, struct sk_buff *skb) if (nla_put_u32(skb, TCA_CAKE_MEMORY, q->buffer_config_limit)) goto nla_put_failure; + if (nla_put_u32(skb, TCA_CAKE_AUTORATE, + !!(q->rate_flags & CAKE_FLAG_AUTORATE_INGRESS))) + goto nla_put_failure; + + if (nla_put_u32(skb, TCA_CAKE_INGRESS, + !!(q->rate_flags & CAKE_FLAG_INGRESS))) + goto nla_put_failure; + return nla_nest_end(skb, opts); nla_put_failure: -- cgit v1.2.3 From 8b7138814f29933898ecd31dfc83e35a30ee69f5 Mon Sep 17 00:00:00 2001 From: Toke Høiland-Jørgensen Date: Fri, 6 Jul 2018 17:37:19 +0200 Subject: sch_cake: Add optional ACK filter MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The ACK filter is an optional feature of CAKE which is designed to improve performance on links with very asymmetrical rate limits. On such links (which are unfortunately quite prevalent, especially for DSL and cable subscribers), the downstream throughput can be limited by the number of ACKs capable of being transmitted in the *upstream* direction. Filtering ACKs can, in general, have adverse effects on TCP performance because it interferes with ACK clocking (especially in slow start), and it reduces the flow's resiliency to ACKs being dropped further along the path. To alleviate these drawbacks, the ACK filter in CAKE tries its best to always keep enough ACKs queued to ensure forward progress in the TCP flow being filtered. It does this by only filtering redundant ACKs. In its default 'conservative' mode, the filter will always keep at least two redundant ACKs in the queue, while in 'aggressive' mode, it will filter down to a single ACK. The ACK filter works by inspecting the per-flow queue on every packet enqueue. 
Starting at the head of the queue, the filter looks for another eligible packet to drop (so the ACK being dropped is always closer to the head of the queue than the packet being enqueued). An ACK is eligible only if it ACKs *fewer* bytes than the new packet being enqueued, including any SACK options. This prevents duplicate ACKs from being filtered, to avoid interfering with retransmission logic. In addition, we check TCP header options and only drop those that are known to not interfere with sender state. In particular, packets with unknown option codes are never dropped. In aggressive mode, an eligible packet is always dropped, while in conservative mode, at least two ACKs are kept in the queue. Only pure ACKs (with no data segments) are considered eligible for dropping, but when an ACK with data segments is enqueued, this can cause another pure ACK to become eligible for dropping. The approach described above ensures that this ACK filter avoids most of the drawbacks of a naive filtering mechanism that only keeps flow state but does not inspect the queue. This is the rationale for including the ACK filter in CAKE itself rather than as a separate module (as the TC filter, for instance). Our performance evaluation has shown that on a 30/1 Mbps link with a bidirectional traffic test (RRUL), turning on the ACK filter on the upstream link improves downstream throughput by ~20% (both modes) and upstream throughput by ~12% in conservative mode and ~40% in aggressive mode, at the cost of ~5ms of inter-flow latency due to the increased congestion. In *really* pathological cases, the effect can be a lot more; for instance, the ACK filter increases the achievable downstream throughput on a link with 100 Kbps in the upstream direction by an order of magnitude (from ~2.5 Mbps to ~25 Mbps). Finally, even though we consider the ACK filter to be safer than most, we do not recommend turning it on everywhere: on more symmetrical link bandwidths the effect is negligible at best. Cc: Yuchung Cheng Cc: Neal Cardwell Signed-off-by: Toke Høiland-Jørgensen Signed-off-by: David S.
Miller --- net/sched/sch_cake.c | 454 ++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 452 insertions(+), 2 deletions(-) (limited to 'net/sched') diff --git a/net/sched/sch_cake.c b/net/sched/sch_cake.c index 2950a8d07887..930096d46c4f 100644 --- a/net/sched/sch_cake.c +++ b/net/sched/sch_cake.c @@ -725,6 +725,433 @@ static void flow_queue_add(struct cake_flow *flow, struct sk_buff *skb) skb->next = NULL; } +static struct iphdr *cake_get_iphdr(const struct sk_buff *skb, + struct ipv6hdr *buf) +{ + unsigned int offset = skb_network_offset(skb); + struct iphdr *iph; + + iph = skb_header_pointer(skb, offset, sizeof(struct iphdr), buf); + + if (!iph) + return NULL; + + if (iph->version == 4 && iph->protocol == IPPROTO_IPV6) + return skb_header_pointer(skb, offset + iph->ihl * 4, + sizeof(struct ipv6hdr), buf); + + else if (iph->version == 4) + return iph; + + else if (iph->version == 6) + return skb_header_pointer(skb, offset, sizeof(struct ipv6hdr), + buf); + + return NULL; +} + +static struct tcphdr *cake_get_tcphdr(const struct sk_buff *skb, + void *buf, unsigned int bufsize) +{ + unsigned int offset = skb_network_offset(skb); + const struct ipv6hdr *ipv6h; + const struct tcphdr *tcph; + const struct iphdr *iph; + struct ipv6hdr _ipv6h; + struct tcphdr _tcph; + + ipv6h = skb_header_pointer(skb, offset, sizeof(_ipv6h), &_ipv6h); + + if (!ipv6h) + return NULL; + + if (ipv6h->version == 4) { + iph = (struct iphdr *)ipv6h; + offset += iph->ihl * 4; + + /* special-case 6in4 tunnelling, as that is a common way to get + * v6 connectivity in the home + */ + if (iph->protocol == IPPROTO_IPV6) { + ipv6h = skb_header_pointer(skb, offset, + sizeof(_ipv6h), &_ipv6h); + + if (!ipv6h || ipv6h->nexthdr != IPPROTO_TCP) + return NULL; + + offset += sizeof(struct ipv6hdr); + + } else if (iph->protocol != IPPROTO_TCP) { + return NULL; + } + + } else if (ipv6h->version == 6) { + if (ipv6h->nexthdr != IPPROTO_TCP) + return NULL; + + offset += sizeof(struct ipv6hdr); + } else { + return NULL; + } + + tcph = skb_header_pointer(skb, offset, sizeof(_tcph), &_tcph); + if (!tcph) + return NULL; + + return skb_header_pointer(skb, offset, + min(__tcp_hdrlen(tcph), bufsize), buf); +} + +static const void *cake_get_tcpopt(const struct tcphdr *tcph, + int code, int *oplen) +{ + /* inspired by tcp_parse_options in tcp_input.c */ + int length = __tcp_hdrlen(tcph) - sizeof(struct tcphdr); + const u8 *ptr = (const u8 *)(tcph + 1); + + while (length > 0) { + int opcode = *ptr++; + int opsize; + + if (opcode == TCPOPT_EOL) + break; + if (opcode == TCPOPT_NOP) { + length--; + continue; + } + opsize = *ptr++; + if (opsize < 2 || opsize > length) + break; + + if (opcode == code) { + *oplen = opsize; + return ptr; + } + + ptr += opsize - 2; + length -= opsize; + } + + return NULL; +} + +/* Compare two SACK sequences. A sequence is considered greater if it SACKs more + * bytes than the other. In the case where both sequences ACKs bytes that the + * other doesn't, A is considered greater. DSACKs in A also makes A be + * considered greater. 
+ * + * @return -1, 0 or 1 as normal compare functions + */ +static int cake_tcph_sack_compare(const struct tcphdr *tcph_a, + const struct tcphdr *tcph_b) +{ + const struct tcp_sack_block_wire *sack_a, *sack_b; + u32 ack_seq_a = ntohl(tcph_a->ack_seq); + u32 bytes_a = 0, bytes_b = 0; + int oplen_a, oplen_b; + bool first = true; + + sack_a = cake_get_tcpopt(tcph_a, TCPOPT_SACK, &oplen_a); + sack_b = cake_get_tcpopt(tcph_b, TCPOPT_SACK, &oplen_b); + + /* pointers point to option contents */ + oplen_a -= TCPOLEN_SACK_BASE; + oplen_b -= TCPOLEN_SACK_BASE; + + if (sack_a && oplen_a >= sizeof(*sack_a) && + (!sack_b || oplen_b < sizeof(*sack_b))) + return -1; + else if (sack_b && oplen_b >= sizeof(*sack_b) && + (!sack_a || oplen_a < sizeof(*sack_a))) + return 1; + else if ((!sack_a || oplen_a < sizeof(*sack_a)) && + (!sack_b || oplen_b < sizeof(*sack_b))) + return 0; + + while (oplen_a >= sizeof(*sack_a)) { + const struct tcp_sack_block_wire *sack_tmp = sack_b; + u32 start_a = get_unaligned_be32(&sack_a->start_seq); + u32 end_a = get_unaligned_be32(&sack_a->end_seq); + int oplen_tmp = oplen_b; + bool found = false; + + /* DSACK; always considered greater to prevent dropping */ + if (before(start_a, ack_seq_a)) + return -1; + + bytes_a += end_a - start_a; + + while (oplen_tmp >= sizeof(*sack_tmp)) { + u32 start_b = get_unaligned_be32(&sack_tmp->start_seq); + u32 end_b = get_unaligned_be32(&sack_tmp->end_seq); + + /* first time through we count the total size */ + if (first) + bytes_b += end_b - start_b; + + if (!after(start_b, start_a) && !before(end_b, end_a)) { + found = true; + if (!first) + break; + } + oplen_tmp -= sizeof(*sack_tmp); + sack_tmp++; + } + + if (!found) + return -1; + + oplen_a -= sizeof(*sack_a); + sack_a++; + first = false; + } + + /* If we made it this far, all ranges SACKed by A are covered by B, so + * either the SACKs are equal, or B SACKs more bytes. + */ + return bytes_b > bytes_a ? 
1 : 0; +} + +static void cake_tcph_get_tstamp(const struct tcphdr *tcph, + u32 *tsval, u32 *tsecr) +{ + const u8 *ptr; + int opsize; + + ptr = cake_get_tcpopt(tcph, TCPOPT_TIMESTAMP, &opsize); + + if (ptr && opsize == TCPOLEN_TIMESTAMP) { + *tsval = get_unaligned_be32(ptr); + *tsecr = get_unaligned_be32(ptr + 4); + } +} + +static bool cake_tcph_may_drop(const struct tcphdr *tcph, + u32 tstamp_new, u32 tsecr_new) +{ + /* inspired by tcp_parse_options in tcp_input.c */ + int length = __tcp_hdrlen(tcph) - sizeof(struct tcphdr); + const u8 *ptr = (const u8 *)(tcph + 1); + u32 tstamp, tsecr; + + /* 3 reserved flags must be unset to avoid future breakage + * ACK must be set + * ECE/CWR are handled separately + * All other flags URG/PSH/RST/SYN/FIN must be unset + * 0x0FFF0000 = all TCP flags (confirm ACK=1, others zero) + * 0x00C00000 = CWR/ECE (handled separately) + * 0x0F3F0000 = 0x0FFF0000 & ~0x00C00000 + */ + if (((tcp_flag_word(tcph) & + cpu_to_be32(0x0F3F0000)) != TCP_FLAG_ACK)) + return false; + + while (length > 0) { + int opcode = *ptr++; + int opsize; + + if (opcode == TCPOPT_EOL) + break; + if (opcode == TCPOPT_NOP) { + length--; + continue; + } + opsize = *ptr++; + if (opsize < 2 || opsize > length) + break; + + switch (opcode) { + case TCPOPT_MD5SIG: /* doesn't influence state */ + break; + + case TCPOPT_SACK: /* stricter checking performed later */ + if (opsize % 8 != 2) + return false; + break; + + case TCPOPT_TIMESTAMP: + /* only drop timestamps lower than new */ + if (opsize != TCPOLEN_TIMESTAMP) + return false; + tstamp = get_unaligned_be32(ptr); + tsecr = get_unaligned_be32(ptr + 4); + if (after(tstamp, tstamp_new) || + after(tsecr, tsecr_new)) + return false; + break; + + case TCPOPT_MSS: /* these should only be set on SYN */ + case TCPOPT_WINDOW: + case TCPOPT_SACK_PERM: + case TCPOPT_FASTOPEN: + case TCPOPT_EXP: + default: /* don't drop if any unknown options are present */ + return false; + } + + ptr += opsize - 2; + length -= opsize; + } + + return true; +} + +static struct sk_buff *cake_ack_filter(struct cake_sched_data *q, + struct cake_flow *flow) +{ + bool aggressive = q->ack_filter == CAKE_ACK_AGGRESSIVE; + struct sk_buff *elig_ack = NULL, *elig_ack_prev = NULL; + struct sk_buff *skb_check, *skb_prev = NULL; + const struct ipv6hdr *ipv6h, *ipv6h_check; + unsigned char _tcph[64], _tcph_check[64]; + const struct tcphdr *tcph, *tcph_check; + const struct iphdr *iph, *iph_check; + struct ipv6hdr _iph, _iph_check; + const struct sk_buff *skb; + int seglen, num_found = 0; + u32 tstamp = 0, tsecr = 0; + __be32 elig_flags = 0; + int sack_comp; + + /* no other possible ACKs to filter */ + if (flow->head == flow->tail) + return NULL; + + skb = flow->tail; + tcph = cake_get_tcphdr(skb, _tcph, sizeof(_tcph)); + iph = cake_get_iphdr(skb, &_iph); + if (!tcph) + return NULL; + + cake_tcph_get_tstamp(tcph, &tstamp, &tsecr); + + /* the 'triggering' packet need only have the ACK flag set. + * also check that SYN is not set, as there won't be any previous ACKs. + */ + if ((tcp_flag_word(tcph) & + (TCP_FLAG_ACK | TCP_FLAG_SYN)) != TCP_FLAG_ACK) + return NULL; + + /* the 'triggering' ACK is at the tail of the queue, we have already + * returned if it is the only packet in the flow. loop through the rest + * of the queue looking for pure ACKs with the same 5-tuple as the + * triggering one. 
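Concretely, a "pure ACK" is a segment whose IP total length, minus the IP and TCP header lengths, leaves no payload, with only the ACK flag set. A hedged userspace restatement of that test (Linux uapi headers; ECE/CWR handling and the full option walk of cake_tcph_may_drop() above are omitted):

	#include <linux/ip.h>
	#include <linux/tcp.h>
	#include <arpa/inet.h>
	#include <stdbool.h>

	static bool is_pure_ack(const struct iphdr *iph,
				const struct tcphdr *tcph)
	{
		int seglen = ntohs(iph->tot_len) - iph->ihl * 4;

		/* ACK set; SYN/FIN/RST/PSH/URG clear; no payload after
		 * the TCP header
		 */
		return tcph->ack && !tcph->syn && !tcph->fin && !tcph->rst &&
		       !tcph->psh && !tcph->urg &&
		       seglen - tcph->doff * 4 == 0;
	}

	int main(void)
	{
		struct iphdr ip = { .ihl = 5, .version = 4,
				    .tot_len = htons(40) };
		struct tcphdr th = { .doff = 5, .ack = 1 };

		return !is_pure_ack(&ip, &th); /* exits 0: 20+20 B, no data */
	}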
+ */ + for (skb_check = flow->head; + skb_check && skb_check != skb; + skb_prev = skb_check, skb_check = skb_check->next) { + iph_check = cake_get_iphdr(skb_check, &_iph_check); + tcph_check = cake_get_tcphdr(skb_check, &_tcph_check, + sizeof(_tcph_check)); + + /* only TCP packets with matching 5-tuple are eligible, and only + * drop safe headers + */ + if (!tcph_check || iph->version != iph_check->version || + tcph_check->source != tcph->source || + tcph_check->dest != tcph->dest) + continue; + + if (iph_check->version == 4) { + if (iph_check->saddr != iph->saddr || + iph_check->daddr != iph->daddr) + continue; + + seglen = ntohs(iph_check->tot_len) - + (4 * iph_check->ihl); + } else if (iph_check->version == 6) { + ipv6h = (struct ipv6hdr *)iph; + ipv6h_check = (struct ipv6hdr *)iph_check; + + if (ipv6_addr_cmp(&ipv6h_check->saddr, &ipv6h->saddr) || + ipv6_addr_cmp(&ipv6h_check->daddr, &ipv6h->daddr)) + continue; + + seglen = ntohs(ipv6h_check->payload_len); + } else { + WARN_ON(1); /* shouldn't happen */ + continue; + } + + /* If the ECE/CWR flags changed from the previous eligible + * packet in the same flow, we should no longer be dropping that + * previous packet as this would lose information. + */ + if (elig_ack && (tcp_flag_word(tcph_check) & + (TCP_FLAG_ECE | TCP_FLAG_CWR)) != elig_flags) { + elig_ack = NULL; + elig_ack_prev = NULL; + num_found--; + } + + /* Check TCP options and flags, don't drop ACKs with segment + * data, and don't drop ACKs with a higher cumulative ACK + * counter than the triggering packet. Check ACK seqno here to + * avoid parsing SACK options of packets we are going to exclude + * anyway. + */ + if (!cake_tcph_may_drop(tcph_check, tstamp, tsecr) || + (seglen - __tcp_hdrlen(tcph_check)) != 0 || + after(ntohl(tcph_check->ack_seq), ntohl(tcph->ack_seq))) + continue; + + /* Check SACK options. The triggering packet must SACK more data + * than the ACK under consideration, or SACK the same range but + * have a larger cumulative ACK counter. The latter is a + * pathological case, but is contained in the following check + * anyway, just to be safe. + */ + sack_comp = cake_tcph_sack_compare(tcph_check, tcph); + + if (sack_comp < 0 || + (ntohl(tcph_check->ack_seq) == ntohl(tcph->ack_seq) && + sack_comp == 0)) + continue; + + /* At this point we have found an eligible pure ACK to drop; if + * we are in aggressive mode, we are done. Otherwise, keep + * searching unless this is the second eligible ACK we + * found. + * + * Since we want to drop the ACK closest to the head of the queue, + * save the first eligible ACK we find, even if we need to loop + * again. + */ + if (!elig_ack) { + elig_ack = skb_check; + elig_ack_prev = skb_prev; + elig_flags = (tcp_flag_word(tcph_check) + & (TCP_FLAG_ECE | TCP_FLAG_CWR)); + } + + if (num_found++ > 0) + goto found; + } + + /* We made it through the queue without finding two eligible ACKs. If + * we found a single eligible ACK we can drop it in aggressive mode if + * we can guarantee that this does not interfere with ECN flag + * information. We ensure this by dropping it only if the enqueued + * packet is consecutive with the eligible ACK, and their flags match.
+ */ + if (elig_ack && aggressive && elig_ack->next == skb && + (elig_flags == (tcp_flag_word(tcph) & + (TCP_FLAG_ECE | TCP_FLAG_CWR)))) + goto found; + + return NULL; + +found: + if (elig_ack_prev) + elig_ack_prev->next = elig_ack->next; + else + flow->head = elig_ack->next; + + elig_ack->next = NULL; + + return elig_ack; +} + static u64 cake_ewma(u64 avg, u64 sample, u32 shift) { avg -= avg >> shift; @@ -934,6 +1361,7 @@ static s32 cake_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct cake_sched_data *q = qdisc_priv(sch); int len = qdisc_pkt_len(skb); int uninitialized_var(ret); + struct sk_buff *ack = NULL; ktime_t now = ktime_get(); struct cake_tin_data *b; struct cake_flow *flow; @@ -980,8 +1408,24 @@ static s32 cake_enqueue(struct sk_buff *skb, struct Qdisc *sch, cobalt_set_enqueue_time(skb, now); flow_queue_add(flow, skb); - sch->q.qlen++; - q->buffer_used += skb->truesize; + if (q->ack_filter) + ack = cake_ack_filter(q, flow); + + if (ack) { + b->ack_drops++; + sch->qstats.drops++; + b->bytes += qdisc_pkt_len(ack); + len -= qdisc_pkt_len(ack); + q->buffer_used += skb->truesize - ack->truesize; + if (q->rate_flags & CAKE_FLAG_INGRESS) + cake_advance_shaper(q, b, ack, now, true); + + qdisc_tree_reduce_backlog(sch, 1, qdisc_pkt_len(ack)); + consume_skb(ack); + } else { + sch->q.qlen++; + q->buffer_used += skb->truesize; + } /* stats */ b->packets++; @@ -1530,6 +1974,9 @@ static int cake_change(struct Qdisc *sch, struct nlattr *opt, q->rate_flags &= ~CAKE_FLAG_INGRESS; } + if (tb[TCA_CAKE_ACK_FILTER]) + q->ack_filter = nla_get_u32(tb[TCA_CAKE_ACK_FILTER]); + if (tb[TCA_CAKE_MEMORY]) q->buffer_config_limit = nla_get_u32(tb[TCA_CAKE_MEMORY]); @@ -1661,6 +2108,9 @@ static int cake_dump(struct Qdisc *sch, struct sk_buff *skb) !!(q->rate_flags & CAKE_FLAG_INGRESS))) goto nla_put_failure; + if (nla_put_u32(skb, TCA_CAKE_ACK_FILTER, q->ack_filter)) + goto nla_put_failure; + return nla_nest_end(skb, opts); nla_put_failure: -- cgit v1.2.3 From ea82511518f4f2e5fe83d2fe1884ef5fc6be6204 Mon Sep 17 00:00:00 2001 From: Toke Høiland-Jørgensen Date: Fri, 6 Jul 2018 17:37:19 +0200 Subject: sch_cake: Add NAT awareness to packet classifier MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When CAKE is deployed on a gateway that also performs NAT (which is a common deployment mode), the host fairness mechanism cannot distinguish internal hosts from each other, and so fails to work correctly. To fix this, we add an optional NAT awareness mode, which will query the kernel conntrack mechanism to obtain the pre-NAT addresses for each packet and use that in the flow and host hashing. When the shaper is enabled and the host is already performing NAT, the cost of this lookup is negligible. However, in unlimited mode with no NAT being performed, there is a significant CPU cost at higher bandwidths. For this reason, the feature is turned off by default. Cc: netfilter-devel@vger.kernel.org Signed-off-by: Toke Høiland-Jørgensen Signed-off-by: David S. 
Miller --- net/sched/sch_cake.c | 51 +++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 49 insertions(+), 2 deletions(-) (limited to 'net/sched') diff --git a/net/sched/sch_cake.c b/net/sched/sch_cake.c index 930096d46c4f..633ca1578114 100644 --- a/net/sched/sch_cake.c +++ b/net/sched/sch_cake.c @@ -71,6 +71,10 @@ #include #include +#if IS_ENABLED(CONFIG_NF_CONNTRACK) +#include +#endif + #define CAKE_SET_WAYS (8) #define CAKE_MAX_TINS (8) #define CAKE_QUEUES (1024) @@ -516,6 +520,29 @@ static bool cobalt_should_drop(struct cobalt_vars *vars, return drop; } +static void cake_update_flowkeys(struct flow_keys *keys, + const struct sk_buff *skb) +{ +#if IS_ENABLED(CONFIG_NF_CONNTRACK) + struct nf_conntrack_tuple tuple = {}; + bool rev = !skb->_nfct; + + if (tc_skb_protocol(skb) != htons(ETH_P_IP)) + return; + + if (!nf_ct_get_tuple_skb(&tuple, skb)) + return; + + keys->addrs.v4addrs.src = rev ? tuple.dst.u3.ip : tuple.src.u3.ip; + keys->addrs.v4addrs.dst = rev ? tuple.src.u3.ip : tuple.dst.u3.ip; + + if (keys->ports.ports) { + keys->ports.src = rev ? tuple.dst.u.all : tuple.src.u.all; + keys->ports.dst = rev ? tuple.src.u.all : tuple.dst.u.all; + } +#endif +} + /* Cake has several subtle multiple bit settings. In these cases you * would be matching triple isolate mode as well. */ @@ -543,6 +570,9 @@ static u32 cake_hash(struct cake_tin_data *q, const struct sk_buff *skb, skb_flow_dissect_flow_keys(skb, &keys, FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL); + if (flow_mode & CAKE_FLOW_NAT_FLAG) + cake_update_flowkeys(&keys, skb); + /* flow_hash_from_keys() sorts the addresses by value, so we have * to preserve their order in a separate data structure to treat * src and dst host addresses as independently selectable. @@ -1939,12 +1969,25 @@ static int cake_change(struct Qdisc *sch, struct nlattr *opt, if (err < 0) return err; + if (tb[TCA_CAKE_NAT]) { +#if IS_ENABLED(CONFIG_NF_CONNTRACK) + q->flow_mode &= ~CAKE_FLOW_NAT_FLAG; + q->flow_mode |= CAKE_FLOW_NAT_FLAG * + !!nla_get_u32(tb[TCA_CAKE_NAT]); +#else + NL_SET_ERR_MSG_ATTR(extack, tb[TCA_CAKE_NAT], + "No conntrack support in kernel"); + return -EOPNOTSUPP; +#endif + } + if (tb[TCA_CAKE_BASE_RATE64]) q->rate_bps = nla_get_u64(tb[TCA_CAKE_BASE_RATE64]); if (tb[TCA_CAKE_FLOW_MODE]) - q->flow_mode = (nla_get_u32(tb[TCA_CAKE_FLOW_MODE]) & - CAKE_FLOW_MASK); + q->flow_mode = ((q->flow_mode & CAKE_FLOW_NAT_FLAG) | + (nla_get_u32(tb[TCA_CAKE_FLOW_MODE]) & + CAKE_FLOW_MASK)); if (tb[TCA_CAKE_RTT]) { q->interval = nla_get_u32(tb[TCA_CAKE_RTT]); @@ -2111,6 +2154,10 @@ static int cake_dump(struct Qdisc *sch, struct sk_buff *skb) if (nla_put_u32(skb, TCA_CAKE_ACK_FILTER, q->ack_filter)) goto nla_put_failure; + if (nla_put_u32(skb, TCA_CAKE_NAT, + !!(q->flow_mode & CAKE_FLOW_NAT_FLAG))) + goto nla_put_failure; + return nla_nest_end(skb, opts); nla_put_failure: -- cgit v1.2.3 From 83f8fd69af4f62136765b60fd0efa1c9167917c5 Mon Sep 17 00:00:00 2001 From: Toke Høiland-Jørgensen Date: Fri, 6 Jul 2018 17:37:19 +0200 Subject: sch_cake: Add DiffServ handling MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This adds support for DiffServ-based priority queueing to CAKE. If the shaper is in use, each priority tier gets its own virtual clock, which limits that tier's rate to a fraction of the overall shaped rate, to discourage trying to game the priority mechanism. 
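For concreteness, with the shaper set to 64 Mbit/s, a tier limited to 1/16 of the overall rate is paced by a 4 Mbit/s virtual clock, and a tier limited to 1/4 by a 16 Mbit/s one.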
CAKE defaults to a simple, three-tier mode that interprets most code points as "best effort", but places CS1 traffic into a low-priority "bulk" tier which is assigned 1/16 of the total rate, and a few code points indicating latency-sensitive or control traffic (specifically TOS4, VA, EF, CS6, CS7) into a "latency sensitive" high-priority tier, which is assigned 1/4 rate. The other supported DiffServ modes are a 4-tier mode matching the 802.11e precedence rules, as well as two 8-tier modes, one of which implements strict precedence of the eight priority levels. This commit also adds an optional DiffServ 'wash' mode, which will zero out the DSCP fields of any packet passing through CAKE. While this can technically be done with other mechanisms in the kernel, having the feature available in CAKE significantly decreases configuration complexity; and the implementation cost is low on top of the other DiffServ-handling code. Filters and applications can set the skb->priority field to override the DSCP-based classification into tiers. If TC_H_MAJ(skb->priority) matches CAKE's qdisc handle, the minor number will be interpreted as a priority tier if it is less than or equal to the number of configured priority tiers. Signed-off-by: Toke Høiland-Jørgensen Signed-off-by: David S. Miller --- net/sched/sch_cake.c | 439 +++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 423 insertions(+), 16 deletions(-) (limited to 'net/sched') diff --git a/net/sched/sch_cake.c b/net/sched/sch_cake.c index 633ca1578114..43eeca81b247 100644 --- a/net/sched/sch_cake.c +++ b/net/sched/sch_cake.c @@ -296,6 +296,68 @@ static void cobalt_set_enqueue_time(struct sk_buff *skb, static u16 quantum_div[CAKE_QUEUES + 1] = {0}; +/* Diffserv lookup tables */ + +static const u8 precedence[] = { + 0, 0, 0, 0, 0, 0, 0, 0, + 1, 1, 1, 1, 1, 1, 1, 1, + 2, 2, 2, 2, 2, 2, 2, 2, + 3, 3, 3, 3, 3, 3, 3, 3, + 4, 4, 4, 4, 4, 4, 4, 4, + 5, 5, 5, 5, 5, 5, 5, 5, + 6, 6, 6, 6, 6, 6, 6, 6, + 7, 7, 7, 7, 7, 7, 7, 7, +}; + +static const u8 diffserv8[] = { + 2, 5, 1, 2, 4, 2, 2, 2, + 0, 2, 1, 2, 1, 2, 1, 2, + 5, 2, 4, 2, 4, 2, 4, 2, + 3, 2, 3, 2, 3, 2, 3, 2, + 6, 2, 3, 2, 3, 2, 3, 2, + 6, 2, 2, 2, 6, 2, 6, 2, + 7, 2, 2, 2, 2, 2, 2, 2, + 7, 2, 2, 2, 2, 2, 2, 2, +}; + +static const u8 diffserv4[] = { + 0, 2, 0, 0, 2, 0, 0, 0, + 1, 0, 0, 0, 0, 0, 0, 0, + 2, 0, 2, 0, 2, 0, 2, 0, + 2, 0, 2, 0, 2, 0, 2, 0, + 3, 0, 2, 0, 2, 0, 2, 0, + 3, 0, 0, 0, 3, 0, 3, 0, + 3, 0, 0, 0, 0, 0, 0, 0, + 3, 0, 0, 0, 0, 0, 0, 0, +}; + +static const u8 diffserv3[] = { + 0, 0, 0, 0, 2, 0, 0, 0, + 1, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 2, 0, 2, 0, + 2, 0, 0, 0, 0, 0, 0, 0, + 2, 0, 0, 0, 0, 0, 0, 0, +}; + +static const u8 besteffort[] = { + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, +}; + +/* tin priority order for stats dumping */ + +static const u8 normal_order[] = {0, 1, 2, 3, 4, 5, 6, 7}; +static const u8 bulk_order[] = {1, 0, 2, 3}; + #define REC_INV_SQRT_CACHE (16) static u32 cobalt_rec_inv_sqrt_cache[REC_INV_SQRT_CACHE] = {0}; @@ -1351,20 +1413,91 @@ static unsigned int cake_drop(struct Qdisc *sch, struct sk_buff **to_free) return idx + (tin << 16); } -static u32 cake_classify(struct Qdisc *sch, struct cake_tin_data *t, +static void cake_wash_diffserv(struct sk_buff *skb) +{ + switch (skb->protocol) { + case htons(ETH_P_IP): 
+ ipv4_change_dsfield(ip_hdr(skb), INET_ECN_MASK, 0); + break; + case htons(ETH_P_IPV6): + ipv6_change_dsfield(ipv6_hdr(skb), INET_ECN_MASK, 0); + break; + default: + break; + } +} + +static u8 cake_handle_diffserv(struct sk_buff *skb, u16 wash) +{ + u8 dscp; + + switch (skb->protocol) { + case htons(ETH_P_IP): + dscp = ipv4_get_dsfield(ip_hdr(skb)) >> 2; + if (wash && dscp) + ipv4_change_dsfield(ip_hdr(skb), INET_ECN_MASK, 0); + return dscp; + + case htons(ETH_P_IPV6): + dscp = ipv6_get_dsfield(ipv6_hdr(skb)) >> 2; + if (wash && dscp) + ipv6_change_dsfield(ipv6_hdr(skb), INET_ECN_MASK, 0); + return dscp; + + case htons(ETH_P_ARP): + return 0x38; /* CS7 - Net Control */ + + default: + /* If there is no Diffserv field, treat as best-effort */ + return 0; + } +} + +static struct cake_tin_data *cake_select_tin(struct Qdisc *sch, + struct sk_buff *skb) +{ + struct cake_sched_data *q = qdisc_priv(sch); + u32 tin; + + if (TC_H_MAJ(skb->priority) == sch->handle && + TC_H_MIN(skb->priority) > 0 && + TC_H_MIN(skb->priority) <= q->tin_cnt) { + tin = TC_H_MIN(skb->priority) - 1; + + if (q->rate_flags & CAKE_FLAG_WASH) + cake_wash_diffserv(skb); + } else if (q->tin_mode != CAKE_DIFFSERV_BESTEFFORT) { + /* extract the Diffserv Precedence field, if it exists */ + /* and clear DSCP bits if washing */ + tin = q->tin_index[cake_handle_diffserv(skb, + q->rate_flags & CAKE_FLAG_WASH)]; + if (unlikely(tin >= q->tin_cnt)) + tin = 0; + } else { + tin = 0; + if (q->rate_flags & CAKE_FLAG_WASH) + cake_wash_diffserv(skb); + } + + return &q->tins[tin]; +} + +static u32 cake_classify(struct Qdisc *sch, struct cake_tin_data **t, struct sk_buff *skb, int flow_mode, int *qerr) { struct cake_sched_data *q = qdisc_priv(sch); struct tcf_proto *filter; struct tcf_result res; + u32 flow = 0; int result; filter = rcu_dereference_bh(q->filter_list); if (!filter) - return cake_hash(t, skb, flow_mode) + 1; + goto hash; *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; result = tcf_classify(skb, filter, &res, false); + if (result >= 0) { #ifdef CONFIG_NET_CLS_ACT switch (result) { @@ -1378,9 +1511,11 @@ static u32 cake_classify(struct Qdisc *sch, struct cake_tin_data *t, } #endif if (TC_H_MIN(res.classid) <= CAKE_QUEUES) - return TC_H_MIN(res.classid); + flow = TC_H_MIN(res.classid); } - return 0; +hash: + *t = cake_select_tin(sch, skb); + return flow ?: cake_hash(*t, skb, flow_mode) + 1; } static void cake_reconfigure(struct Qdisc *sch); @@ -1395,13 +1530,10 @@ static s32 cake_enqueue(struct sk_buff *skb, struct Qdisc *sch, ktime_t now = ktime_get(); struct cake_tin_data *b; struct cake_flow *flow; - u32 idx, tin; - - tin = 0; - b = &q->tins[tin]; + u32 idx; /* choose flow to insert into */ - idx = cake_classify(sch, b, skb, q->flow_mode, &ret); + idx = cake_classify(sch, &b, skb, q->flow_mode, &ret); if (idx == 0) { if (ret & __NET_XMIT_BYPASS) qdisc_qstats_drop(sch); @@ -1917,18 +2049,275 @@ static void cake_set_rate(struct cake_tin_data *b, u64 rate, u32 mtu, b->cparams.p_dec = 1 << 20; /* 1/4096 */ } -static void cake_reconfigure(struct Qdisc *sch) +static int cake_config_besteffort(struct Qdisc *sch) { struct cake_sched_data *q = qdisc_priv(sch); struct cake_tin_data *b = &q->tins[0]; - int c, ft = 0; + u32 mtu = psched_mtu(qdisc_dev(sch)); + u64 rate = q->rate_bps; q->tin_cnt = 1; - cake_set_rate(b, q->rate_bps, psched_mtu(qdisc_dev(sch)), + + q->tin_index = besteffort; + q->tin_order = normal_order; + + cake_set_rate(b, rate, mtu, us_to_ns(q->target), us_to_ns(q->interval)); b->tin_quantum_band = 65535; b->tin_quantum_prio = 
65535; + return 0; +} + +static int cake_config_precedence(struct Qdisc *sch) +{ + /* convert high-level (user visible) parameters into internal format */ + struct cake_sched_data *q = qdisc_priv(sch); + u32 mtu = psched_mtu(qdisc_dev(sch)); + u64 rate = q->rate_bps; + u32 quantum1 = 256; + u32 quantum2 = 256; + u32 i; + + q->tin_cnt = 8; + q->tin_index = precedence; + q->tin_order = normal_order; + + for (i = 0; i < q->tin_cnt; i++) { + struct cake_tin_data *b = &q->tins[i]; + + cake_set_rate(b, rate, mtu, us_to_ns(q->target), + us_to_ns(q->interval)); + + b->tin_quantum_prio = max_t(u16, 1U, quantum1); + b->tin_quantum_band = max_t(u16, 1U, quantum2); + + /* calculate next class's parameters */ + rate *= 7; + rate >>= 3; + + quantum1 *= 3; + quantum1 >>= 1; + + quantum2 *= 7; + quantum2 >>= 3; + } + + return 0; +} + +/* List of known Diffserv codepoints: + * + * Least Effort (CS1) + * Best Effort (CS0) + * Max Reliability & LLT "Lo" (TOS1) + * Max Throughput (TOS2) + * Min Delay (TOS4) + * LLT "La" (TOS5) + * Assured Forwarding 1 (AF1x) - x3 + * Assured Forwarding 2 (AF2x) - x3 + * Assured Forwarding 3 (AF3x) - x3 + * Assured Forwarding 4 (AF4x) - x3 + * Precedence Class 2 (CS2) + * Precedence Class 3 (CS3) + * Precedence Class 4 (CS4) + * Precedence Class 5 (CS5) + * Precedence Class 6 (CS6) + * Precedence Class 7 (CS7) + * Voice Admit (VA) + * Expedited Forwarding (EF) + + * Total 25 codepoints. + */ + +/* List of traffic classes in RFC 4594: + * (roughly descending order of contended priority) + * (roughly ascending order of uncontended throughput) + * + * Network Control (CS6,CS7) - routing traffic + * Telephony (EF,VA) - aka. VoIP streams + * Signalling (CS5) - VoIP setup + * Multimedia Conferencing (AF4x) - aka. video calls + * Realtime Interactive (CS4) - eg. games + * Multimedia Streaming (AF3x) - eg. YouTube, NetFlix, Twitch + * Broadcast Video (CS3) + * Low Latency Data (AF2x,TOS4) - eg. database + * Ops, Admin, Management (CS2,TOS1) - eg. ssh + * Standard Service (CS0 & unrecognised codepoints) + * High Throughput Data (AF1x,TOS2) - eg. web traffic + * Low Priority Data (CS1) - eg. BitTorrent + + * Total 12 traffic classes. + */ + +static int cake_config_diffserv8(struct Qdisc *sch) +{ +/* Pruned list of traffic classes for typical applications: + * + * Network Control (CS6, CS7) + * Minimum Latency (EF, VA, CS5, CS4) + * Interactive Shell (CS2, TOS1) + * Low Latency Transactions (AF2x, TOS4) + * Video Streaming (AF4x, AF3x, CS3) + * Bog Standard (CS0 etc.) + * High Throughput (AF1x, TOS2) + * Background Traffic (CS1) + * + * Total 8 traffic classes. 
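+ * + * (As in the precedence mode above, the loop below assigns each + * successive class 7/8 of the previous class's rate: rate *= 7; + * rate >>= 3.)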
+ */ + + struct cake_sched_data *q = qdisc_priv(sch); + u32 mtu = psched_mtu(qdisc_dev(sch)); + u64 rate = q->rate_bps; + u32 quantum1 = 256; + u32 quantum2 = 256; + u32 i; + + q->tin_cnt = 8; + + /* codepoint to class mapping */ + q->tin_index = diffserv8; + q->tin_order = normal_order; + + /* class characteristics */ + for (i = 0; i < q->tin_cnt; i++) { + struct cake_tin_data *b = &q->tins[i]; + + cake_set_rate(b, rate, mtu, us_to_ns(q->target), + us_to_ns(q->interval)); + + b->tin_quantum_prio = max_t(u16, 1U, quantum1); + b->tin_quantum_band = max_t(u16, 1U, quantum2); + + /* calculate next class's parameters */ + rate *= 7; + rate >>= 3; + + quantum1 *= 3; + quantum1 >>= 1; + + quantum2 *= 7; + quantum2 >>= 3; + } + + return 0; +} + +static int cake_config_diffserv4(struct Qdisc *sch) +{ +/* Further pruned list of traffic classes for four-class system: + * + * Latency Sensitive (CS7, CS6, EF, VA, CS5, CS4) + * Streaming Media (AF4x, AF3x, CS3, AF2x, TOS4, CS2, TOS1) + * Best Effort (CS0, AF1x, TOS2, and those not specified) + * Background Traffic (CS1) + * + * Total 4 traffic classes. + */ + + struct cake_sched_data *q = qdisc_priv(sch); + u32 mtu = psched_mtu(qdisc_dev(sch)); + u64 rate = q->rate_bps; + u32 quantum = 1024; + + q->tin_cnt = 4; + + /* codepoint to class mapping */ + q->tin_index = diffserv4; + q->tin_order = bulk_order; + + /* class characteristics */ + cake_set_rate(&q->tins[0], rate, mtu, + us_to_ns(q->target), us_to_ns(q->interval)); + cake_set_rate(&q->tins[1], rate >> 4, mtu, + us_to_ns(q->target), us_to_ns(q->interval)); + cake_set_rate(&q->tins[2], rate >> 1, mtu, + us_to_ns(q->target), us_to_ns(q->interval)); + cake_set_rate(&q->tins[3], rate >> 2, mtu, + us_to_ns(q->target), us_to_ns(q->interval)); + + /* priority weights */ + q->tins[0].tin_quantum_prio = quantum; + q->tins[1].tin_quantum_prio = quantum >> 4; + q->tins[2].tin_quantum_prio = quantum << 2; + q->tins[3].tin_quantum_prio = quantum << 4; + + /* bandwidth-sharing weights */ + q->tins[0].tin_quantum_band = quantum; + q->tins[1].tin_quantum_band = quantum >> 4; + q->tins[2].tin_quantum_band = quantum >> 1; + q->tins[3].tin_quantum_band = quantum >> 2; + + return 0; +} + +static int cake_config_diffserv3(struct Qdisc *sch) +{ +/* Simplified Diffserv structure with 3 tins. 
+ * Low Priority (CS1) + * Best Effort + * Latency Sensitive (TOS4, VA, EF, CS6, CS7) + */ + struct cake_sched_data *q = qdisc_priv(sch); + u32 mtu = psched_mtu(qdisc_dev(sch)); + u64 rate = q->rate_bps; + u32 quantum = 1024; + + q->tin_cnt = 3; + + /* codepoint to class mapping */ + q->tin_index = diffserv3; + q->tin_order = bulk_order; + + /* class characteristics */ + cake_set_rate(&q->tins[0], rate, mtu, + us_to_ns(q->target), us_to_ns(q->interval)); + cake_set_rate(&q->tins[1], rate >> 4, mtu, + us_to_ns(q->target), us_to_ns(q->interval)); + cake_set_rate(&q->tins[2], rate >> 2, mtu, + us_to_ns(q->target), us_to_ns(q->interval)); + + /* priority weights */ + q->tins[0].tin_quantum_prio = quantum; + q->tins[1].tin_quantum_prio = quantum >> 4; + q->tins[2].tin_quantum_prio = quantum << 4; + + /* bandwidth-sharing weights */ + q->tins[0].tin_quantum_band = quantum; + q->tins[1].tin_quantum_band = quantum >> 4; + q->tins[2].tin_quantum_band = quantum >> 2; + + return 0; +} + +static void cake_reconfigure(struct Qdisc *sch) +{ + struct cake_sched_data *q = qdisc_priv(sch); + int c, ft; + + switch (q->tin_mode) { + case CAKE_DIFFSERV_BESTEFFORT: + ft = cake_config_besteffort(sch); + break; + + case CAKE_DIFFSERV_PRECEDENCE: + ft = cake_config_precedence(sch); + break; + + case CAKE_DIFFSERV_DIFFSERV8: + ft = cake_config_diffserv8(sch); + break; + + case CAKE_DIFFSERV_DIFFSERV4: + ft = cake_config_diffserv4(sch); + break; + + case CAKE_DIFFSERV_DIFFSERV3: + default: + ft = cake_config_diffserv3(sch); + break; + } + for (c = q->tin_cnt; c < CAKE_MAX_TINS; c++) { cake_clear_tin(sch, c); q->tins[c].cparams.mtu_time = q->tins[ft].cparams.mtu_time; @@ -1984,6 +2373,16 @@ static int cake_change(struct Qdisc *sch, struct nlattr *opt, if (tb[TCA_CAKE_BASE_RATE64]) q->rate_bps = nla_get_u64(tb[TCA_CAKE_BASE_RATE64]); + if (tb[TCA_CAKE_DIFFSERV_MODE]) + q->tin_mode = nla_get_u32(tb[TCA_CAKE_DIFFSERV_MODE]); + + if (tb[TCA_CAKE_WASH]) { + if (!!nla_get_u32(tb[TCA_CAKE_WASH])) + q->rate_flags |= CAKE_FLAG_WASH; + else + q->rate_flags &= ~CAKE_FLAG_WASH; + } + if (tb[TCA_CAKE_FLOW_MODE]) q->flow_mode = ((q->flow_mode & CAKE_FLOW_NAT_FLAG) | (nla_get_u32(tb[TCA_CAKE_FLOW_MODE]) & @@ -2048,7 +2447,7 @@ static int cake_init(struct Qdisc *sch, struct nlattr *opt, int i, j, err; sch->limit = 10240; - q->tin_mode = CAKE_DIFFSERV_BESTEFFORT; + q->tin_mode = CAKE_DIFFSERV_DIFFSERV3; q->flow_mode = CAKE_FLOW_TRIPLE; q->rate_bps = 0; /* unlimited by default */ @@ -2158,6 +2557,13 @@ static int cake_dump(struct Qdisc *sch, struct sk_buff *skb) !!(q->flow_mode & CAKE_FLOW_NAT_FLAG))) goto nla_put_failure; + if (nla_put_u32(skb, TCA_CAKE_DIFFSERV_MODE, q->tin_mode)) + goto nla_put_failure; + + if (nla_put_u32(skb, TCA_CAKE_WASH, + !!(q->rate_flags & CAKE_FLAG_WASH))) + goto nla_put_failure; + return nla_nest_end(skb, opts); nla_put_failure: @@ -2211,7 +2617,7 @@ static int cake_dump_stats(struct Qdisc *sch, struct gnet_dump *d) } while (0) for (i = 0; i < q->tin_cnt; i++) { - struct cake_tin_data *b = &q->tins[i]; + struct cake_tin_data *b = &q->tins[q->tin_order[i]]; ts = nla_nest_start(d->skb, i + 1); if (!ts) @@ -2310,7 +2716,8 @@ static int cake_dump_class_stats(struct Qdisc *sch, unsigned long cl, u32 idx = cl - 1; if (idx < CAKE_QUEUES * q->tin_cnt) { - const struct cake_tin_data *b = &q->tins[idx / CAKE_QUEUES]; + const struct cake_tin_data *b = \ + &q->tins[q->tin_order[idx / CAKE_QUEUES]]; const struct sk_buff *skb; flow = &b->flows[idx % CAKE_QUEUES]; @@ -2382,7 +2789,7 @@ static void cake_walk(struct Qdisc 
*sch, struct qdisc_walker *arg) return; for (i = 0; i < q->tin_cnt; i++) { - struct cake_tin_data *b = &q->tins[i]; + struct cake_tin_data *b = &q->tins[q->tin_order[i]]; for (j = 0; j < CAKE_QUEUES; j++) { if (list_empty(&b->flows[j].flowchain) || -- cgit v1.2.3 From a729b7f0bd5bf4919306556aed614438f5174537 Mon Sep 17 00:00:00 2001 From: Toke Høiland-Jørgensen Date: Fri, 6 Jul 2018 17:37:19 +0200 Subject: sch_cake: Add overhead compensation support to the rate shaper MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This commit adds configurable overhead compensation support to the rate shaper. With this feature, userspace can configure the actual bottleneck link overhead and encapsulation mode used, which will be used by the shaper to calculate the precise duration of each packet on the wire. This feature is needed because CAKE is often deployed one or two hops upstream of the actual bottleneck (which can be, e.g., inside a DSL or cable modem). In this case, the link layer characteristics and overhead reported by the kernel do not match the actual bottleneck. Being able to set the actual values in use makes it possible to configure the shaper rate much closer to the actual bottleneck rate (our experience shows it is possible to get to within 0.1% of the actual physical bottleneck rate), thus keeping latency low without sacrificing bandwidth. The overhead compensation has three tunables: a fixed per-packet overhead size (which, if set, will be accounted from the IP packet header), a minimum packet size (MPU), and a framing mode supporting either ATM or PTM framing. We include a set of common keywords in TC to help users configure the right parameters. If no overhead value is set, the value reported by the kernel is used. Signed-off-by: Toke Høiland-Jørgensen Signed-off-by: David S. Miller --- net/sched/sch_cake.c | 124 ++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 123 insertions(+), 1 deletion(-) (limited to 'net/sched') diff --git a/net/sched/sch_cake.c b/net/sched/sch_cake.c index 43eeca81b247..199670e1eb94 100644 --- a/net/sched/sch_cake.c +++ b/net/sched/sch_cake.c @@ -270,6 +270,7 @@ enum { struct cobalt_skb_cb { ktime_t enqueue_time; + u32 adjusted_len; }; static u64 us_to_ns(u64 us) @@ -1251,6 +1252,88 @@ static u64 cake_ewma(u64 avg, u64 sample, u32 shift) return avg; } +static u32 cake_calc_overhead(struct cake_sched_data *q, u32 len, u32 off) +{ + if (q->rate_flags & CAKE_FLAG_OVERHEAD) + len -= off; + + if (q->max_netlen < len) + q->max_netlen = len; + if (q->min_netlen > len) + q->min_netlen = len; + + len += q->rate_overhead; + + if (len < q->rate_mpu) + len = q->rate_mpu; + + if (q->atm_mode == CAKE_ATM_ATM) { + len += 47; + len /= 48; + len *= 53; + } else if (q->atm_mode == CAKE_ATM_PTM) { + /* Add one byte per 64 bytes or part thereof. + * This is conservative and easier to calculate than the + * precise value.
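+ * (PTM framing models the 64b/65b line encoding used on e.g. VDSL2 + * links, hence the extra byte per 64 bytes of packet data below.)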
+ */ + len += (len + 63) / 64; + } + + if (q->max_adjlen < len) + q->max_adjlen = len; + if (q->min_adjlen > len) + q->min_adjlen = len; + + return len; +} + +static u32 cake_overhead(struct cake_sched_data *q, const struct sk_buff *skb) +{ + const struct skb_shared_info *shinfo = skb_shinfo(skb); + unsigned int hdr_len, last_len = 0; + u32 off = skb_network_offset(skb); + u32 len = qdisc_pkt_len(skb); + u16 segs = 1; + + q->avg_netoff = cake_ewma(q->avg_netoff, off << 16, 8); + + if (!shinfo->gso_size) + return cake_calc_overhead(q, len, off); + + /* borrowed from qdisc_pkt_len_init() */ + hdr_len = skb_transport_header(skb) - skb_mac_header(skb); + + /* + transport layer */ + if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | + SKB_GSO_TCPV6))) { + const struct tcphdr *th; + struct tcphdr _tcphdr; + + th = skb_header_pointer(skb, skb_transport_offset(skb), + sizeof(_tcphdr), &_tcphdr); + if (likely(th)) + hdr_len += __tcp_hdrlen(th); + } else { + struct udphdr _udphdr; + + if (skb_header_pointer(skb, skb_transport_offset(skb), + sizeof(_udphdr), &_udphdr)) + hdr_len += sizeof(struct udphdr); + } + + if (unlikely(shinfo->gso_type & SKB_GSO_DODGY)) + segs = DIV_ROUND_UP(skb->len - hdr_len, + shinfo->gso_size); + else + segs = shinfo->gso_segs; + + len = shinfo->gso_size + hdr_len; + last_len = skb->len - shinfo->gso_size * (segs - 1); + + return (cake_calc_overhead(q, len, off) * (segs - 1) + + cake_calc_overhead(q, last_len, off)); +} + static void cake_heap_swap(struct cake_sched_data *q, u16 i, u16 j) { struct cake_heap_entry ii = q->overflow_heap[i]; @@ -1328,7 +1411,7 @@ static int cake_advance_shaper(struct cake_sched_data *q, struct sk_buff *skb, ktime_t now, bool drop) { - u32 len = qdisc_pkt_len(skb); + u32 len = get_cobalt_cb(skb)->adjusted_len; /* charge packet bandwidth to this tin * and to the global shaper. 
@@ -1568,6 +1651,7 @@ static s32 cake_enqueue(struct sk_buff *skb, struct Qdisc *sch, b->max_skblen = len; cobalt_set_enqueue_time(skb, now); + get_cobalt_cb(skb)->adjusted_len = cake_overhead(q, skb); flow_queue_add(flow, skb); if (q->ack_filter) @@ -2388,6 +2472,31 @@ static int cake_change(struct Qdisc *sch, struct nlattr *opt, (nla_get_u32(tb[TCA_CAKE_FLOW_MODE]) & CAKE_FLOW_MASK)); + if (tb[TCA_CAKE_ATM]) + q->atm_mode = nla_get_u32(tb[TCA_CAKE_ATM]); + + if (tb[TCA_CAKE_OVERHEAD]) { + q->rate_overhead = nla_get_s32(tb[TCA_CAKE_OVERHEAD]); + q->rate_flags |= CAKE_FLAG_OVERHEAD; + + q->max_netlen = 0; + q->max_adjlen = 0; + q->min_netlen = ~0; + q->min_adjlen = ~0; + } + + if (tb[TCA_CAKE_RAW]) { + q->rate_flags &= ~CAKE_FLAG_OVERHEAD; + + q->max_netlen = 0; + q->max_adjlen = 0; + q->min_netlen = ~0; + q->min_adjlen = ~0; + } + + if (tb[TCA_CAKE_MPU]) + q->rate_mpu = nla_get_u32(tb[TCA_CAKE_MPU]); + if (tb[TCA_CAKE_RTT]) { q->interval = nla_get_u32(tb[TCA_CAKE_RTT]); @@ -2564,6 +2673,19 @@ static int cake_dump(struct Qdisc *sch, struct sk_buff *skb) !!(q->rate_flags & CAKE_FLAG_WASH))) goto nla_put_failure; + if (nla_put_u32(skb, TCA_CAKE_OVERHEAD, q->rate_overhead)) + goto nla_put_failure; + + if (!(q->rate_flags & CAKE_FLAG_OVERHEAD)) + if (nla_put_u32(skb, TCA_CAKE_RAW, 0)) + goto nla_put_failure; + + if (nla_put_u32(skb, TCA_CAKE_ATM, q->atm_mode)) + goto nla_put_failure; + + if (nla_put_u32(skb, TCA_CAKE_MPU, q->rate_mpu)) + goto nla_put_failure; + return nla_nest_end(skb, opts); nla_put_failure: -- cgit v1.2.3 From 0c850344d3882886f842bf0b50a9ff23001adb7e Mon Sep 17 00:00:00 2001 From: Toke Høiland-Jørgensen Date: Fri, 6 Jul 2018 17:37:19 +0200 Subject: sch_cake: Conditionally split GSO segments MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit At lower bandwidths, the transmission time of a single GSO segment can add an unacceptable amount of latency due to HOL blocking. Furthermore, with a software shaper, any tuning mechanism employed by the kernel to control the maximum size of GSO segments is thrown off by the artificial limit on bandwidth. For this reason, we split GSO segments into their individual packets iff the shaper is active and configured to a bandwidth <= 1 Gbps. Signed-off-by: Toke Høiland-Jørgensen Signed-off-by: David S. 
Miller --- net/sched/sch_cake.c | 99 ++++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 73 insertions(+), 26 deletions(-) (limited to 'net/sched') diff --git a/net/sched/sch_cake.c b/net/sched/sch_cake.c index 199670e1eb94..30695691e9ff 100644 --- a/net/sched/sch_cake.c +++ b/net/sched/sch_cake.c @@ -80,6 +80,7 @@ #define CAKE_QUEUES (1024) #define CAKE_FLOW_MASK 63 #define CAKE_FLOW_NAT_FLAG 64 +#define CAKE_SPLIT_GSO_THRESHOLD (125000000) /* 1Gbps */ /* struct cobalt_params - contains codel and blue parameters * @interval: codel initial drop rate @@ -1650,36 +1651,73 @@ static s32 cake_enqueue(struct sk_buff *skb, struct Qdisc *sch, if (unlikely(len > b->max_skblen)) b->max_skblen = len; - cobalt_set_enqueue_time(skb, now); - get_cobalt_cb(skb)->adjusted_len = cake_overhead(q, skb); - flow_queue_add(flow, skb); - - if (q->ack_filter) - ack = cake_ack_filter(q, flow); + if (skb_is_gso(skb) && q->rate_flags & CAKE_FLAG_SPLIT_GSO) { + struct sk_buff *segs, *nskb; + netdev_features_t features = netif_skb_features(skb); + unsigned int slen = 0; + + segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK); + if (IS_ERR_OR_NULL(segs)) + return qdisc_drop(skb, sch, to_free); + + while (segs) { + nskb = segs->next; + segs->next = NULL; + qdisc_skb_cb(segs)->pkt_len = segs->len; + cobalt_set_enqueue_time(segs, now); + get_cobalt_cb(segs)->adjusted_len = cake_overhead(q, + segs); + flow_queue_add(flow, segs); + + sch->q.qlen++; + slen += segs->len; + q->buffer_used += segs->truesize; + b->packets++; + segs = nskb; + } - if (ack) { - b->ack_drops++; - sch->qstats.drops++; - b->bytes += qdisc_pkt_len(ack); - len -= qdisc_pkt_len(ack); - q->buffer_used += skb->truesize - ack->truesize; - if (q->rate_flags & CAKE_FLAG_INGRESS) - cake_advance_shaper(q, b, ack, now, true); + /* stats */ + b->bytes += slen; + b->backlogs[idx] += slen; + b->tin_backlog += slen; + sch->qstats.backlog += slen; + q->avg_window_bytes += slen; - qdisc_tree_reduce_backlog(sch, 1, qdisc_pkt_len(ack)); - consume_skb(ack); + qdisc_tree_reduce_backlog(sch, 1, len); + consume_skb(skb); } else { - sch->q.qlen++; - q->buffer_used += skb->truesize; - } + /* not splitting */ + cobalt_set_enqueue_time(skb, now); + get_cobalt_cb(skb)->adjusted_len = cake_overhead(q, skb); + flow_queue_add(flow, skb); + + if (q->ack_filter) + ack = cake_ack_filter(q, flow); + + if (ack) { + b->ack_drops++; + sch->qstats.drops++; + b->bytes += qdisc_pkt_len(ack); + len -= qdisc_pkt_len(ack); + q->buffer_used += skb->truesize - ack->truesize; + if (q->rate_flags & CAKE_FLAG_INGRESS) + cake_advance_shaper(q, b, ack, now, true); + + qdisc_tree_reduce_backlog(sch, 1, qdisc_pkt_len(ack)); + consume_skb(ack); + } else { + sch->q.qlen++; + q->buffer_used += skb->truesize; + } - /* stats */ - b->packets++; - b->bytes += len; - b->backlogs[idx] += len; - b->tin_backlog += len; - sch->qstats.backlog += len; - q->avg_window_bytes += len; + /* stats */ + b->packets++; + b->bytes += len; + b->backlogs[idx] += len; + b->tin_backlog += len; + sch->qstats.backlog += len; + q->avg_window_bytes += len; + } if (q->overflow_timeout) cake_heapify_up(q, b->overflow_idx[idx]); @@ -2531,6 +2569,11 @@ static int cake_change(struct Qdisc *sch, struct nlattr *opt, if (tb[TCA_CAKE_MEMORY]) q->buffer_config_limit = nla_get_u32(tb[TCA_CAKE_MEMORY]); + if (q->rate_bps && q->rate_bps <= CAKE_SPLIT_GSO_THRESHOLD) + q->rate_flags |= CAKE_FLAG_SPLIT_GSO; + else + q->rate_flags &= ~CAKE_FLAG_SPLIT_GSO; + if (q->tins) { sch_tree_lock(sch); cake_reconfigure(sch); @@ -2686,6 
+2729,10 @@ static int cake_dump(struct Qdisc *sch, struct sk_buff *skb) if (nla_put_u32(skb, TCA_CAKE_MPU, q->rate_mpu)) goto nla_put_failure; + if (nla_put_u32(skb, TCA_CAKE_SPLIT_GSO, + !!(q->rate_flags & CAKE_FLAG_SPLIT_GSO))) + goto nla_put_failure; + return nla_nest_end(skb, opts); nla_put_failure: -- cgit v1.2.3 From 5e9a0fe492f89ff1c7583ee6ea89dc37b8c2e5c2 Mon Sep 17 00:00:00 2001 From: Jianbo Liu Date: Mon, 9 Jul 2018 02:26:20 +0000 Subject: net/sched: flower: Fix null pointer dereference when run tc vlan command Zahari issued a tc vlan command without setting vlan_ethtype, which will crash the kernel. To avoid this, we must check tb[TCA_FLOWER_KEY_VLAN_ETH_TYPE] is not null before using it. Also, we don't need to dump vlan_ethtype or cvlan_ethtype in this case. Fixes: d64efd0926ba ('net/sched: flower: Add supprt for matching on QinQ vlan headers') Signed-off-by: Jianbo Liu Reported-by: Zahari Doychev Signed-off-by: David S. Miller --- net/sched/cls_flower.c | 48 ++++++++++++++++++++++++++---------------------- 1 file changed, 26 insertions(+), 22 deletions(-) (limited to 'net/sched') diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index 487a152a852c..8b2474293db1 100644 --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@ -605,20 +605,22 @@ static int fl_set_key(struct net *net, struct nlattr **tb, TCA_FLOWER_KEY_VLAN_PRIO, &key->vlan, &mask->vlan); - ethertype = nla_get_be16(tb[TCA_FLOWER_KEY_VLAN_ETH_TYPE]); - if (eth_type_vlan(ethertype)) { - fl_set_key_vlan(tb, ethertype, - TCA_FLOWER_KEY_CVLAN_ID, - TCA_FLOWER_KEY_CVLAN_PRIO, - &key->cvlan, &mask->cvlan); - fl_set_key_val(tb, &key->basic.n_proto, - TCA_FLOWER_KEY_CVLAN_ETH_TYPE, - &mask->basic.n_proto, - TCA_FLOWER_UNSPEC, - sizeof(key->basic.n_proto)); - } else { - key->basic.n_proto = ethertype; - mask->basic.n_proto = cpu_to_be16(~0); + if (tb[TCA_FLOWER_KEY_VLAN_ETH_TYPE]) { + ethertype = nla_get_be16(tb[TCA_FLOWER_KEY_VLAN_ETH_TYPE]); + if (eth_type_vlan(ethertype)) { + fl_set_key_vlan(tb, ethertype, + TCA_FLOWER_KEY_CVLAN_ID, + TCA_FLOWER_KEY_CVLAN_PRIO, + &key->cvlan, &mask->cvlan); + fl_set_key_val(tb, &key->basic.n_proto, + TCA_FLOWER_KEY_CVLAN_ETH_TYPE, + &mask->basic.n_proto, + TCA_FLOWER_UNSPEC, + sizeof(key->basic.n_proto)); + } else { + key->basic.n_proto = ethertype; + mask->basic.n_proto = cpu_to_be16(~0); + } } } else { key->basic.n_proto = ethertype; @@ -1344,14 +1346,16 @@ static int fl_dump(struct net *net, struct tcf_proto *tp, void *fh, key->cvlan.vlan_tpid))) goto nla_put_failure; - if (mask->cvlan.vlan_tpid) { - if (nla_put_be16(skb, TCA_FLOWER_KEY_CVLAN_ETH_TYPE, - key->basic.n_proto)) - goto nla_put_failure; - } else if (mask->vlan.vlan_tpid) { - if (nla_put_be16(skb, TCA_FLOWER_KEY_VLAN_ETH_TYPE, - key->basic.n_proto)) - goto nla_put_failure; + if (mask->basic.n_proto) { + if (mask->cvlan.vlan_tpid) { + if (nla_put_be16(skb, TCA_FLOWER_KEY_CVLAN_ETH_TYPE, + key->basic.n_proto)) + goto nla_put_failure; + } else if (mask->vlan.vlan_tpid) { + if (nla_put_be16(skb, TCA_FLOWER_KEY_VLAN_ETH_TYPE, + key->basic.n_proto)) + goto nla_put_failure; + } } if ((key->basic.n_proto == htons(ETH_P_IP) || -- cgit v1.2.3 From 01e866bf07fbb10e96bff46ea1e5e0410d6e40b9 Mon Sep 17 00:00:00 2001 From: Vlad Buslov Date: Mon, 9 Jul 2018 14:33:26 +0300 Subject: net: sched: act_ife: fix memory leak in ife init Free params if tcf_idr_check_alloc() returns an error. Fixes: 0190c1d452a9 ("net: sched: atomically check-allocate action") Reported-by: Dan Carpenter Signed-off-by: Vlad Buslov Signed-off-by: David S.
Miller --- net/sched/act_ife.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'net/sched') diff --git a/net/sched/act_ife.c b/net/sched/act_ife.c index a3eef00cd711..3d6e265758c0 100644 --- a/net/sched/act_ife.c +++ b/net/sched/act_ife.c @@ -485,8 +485,10 @@ static int tcf_ife_init(struct net *net, struct nlattr *nla, return -ENOMEM; err = tcf_idr_check_alloc(tn, &parm->index, a, bind); - if (err < 0) + if (err < 0) { + kfree(p); return err; + } exists = err; if (exists && bind) { kfree(p); -- cgit v1.2.3 From e0479b670d394d478907bd4fc22daab6516953c7 Mon Sep 17 00:00:00 2001 From: Vlad Buslov Date: Mon, 9 Jul 2018 20:26:47 +0300 Subject: net: sched: fix unprotected access to rcu cookie pointer Fix action attribute size calculation function to take rcu read lock and access act_cookie pointer with rcu dereference. Fixes: eec94fdb0480 ("net: sched: use rcu for action cookie update") Reported-by: Marcelo Ricardo Leitner Signed-off-by: Vlad Buslov Reviewed-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller --- net/sched/act_api.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'net/sched') diff --git a/net/sched/act_api.c b/net/sched/act_api.c index 66dc19746c63..148a89ab789b 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -149,10 +149,15 @@ EXPORT_SYMBOL(__tcf_idr_release); static size_t tcf_action_shared_attrs_size(const struct tc_action *act) { + struct tc_cookie *act_cookie; u32 cookie_len = 0; - if (act->act_cookie) - cookie_len = nla_total_size(act->act_cookie->len); + rcu_read_lock(); + act_cookie = rcu_dereference(act->act_cookie); + + if (act_cookie) + cookie_len = nla_total_size(act_cookie->len); + rcu_read_unlock(); return nla_total_size(0) /* action number nested */ + nla_total_size(IFNAMSIZ) /* TCA_ACT_KIND */ -- cgit v1.2.3 From 6f3dfb0dc831953187fea8e3b798768611441321 Mon Sep 17 00:00:00 2001 From: Davide Caratti Date: Wed, 11 Jul 2018 16:04:49 +0200 Subject: net/sched: skbedit: use per-cpu counters use per-CPU counters, instead of sharing a single set of stats with all cores: this removes the need of spinlocks when stats are read/updated. Signed-off-by: Davide Caratti Signed-off-by: David S. 
Miller --- net/sched/act_skbedit.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'net/sched') diff --git a/net/sched/act_skbedit.c b/net/sched/act_skbedit.c index 86521a74ecdd..8651b5bd6b59 100644 --- a/net/sched/act_skbedit.c +++ b/net/sched/act_skbedit.c @@ -38,10 +38,10 @@ static int tcf_skbedit(struct sk_buff *skb, const struct tc_action *a, { struct tcf_skbedit *d = to_skbedit(a); - spin_lock(&d->tcf_lock); tcf_lastuse_update(&d->tcf_tm); - bstats_update(&d->tcf_bstats, skb); + bstats_cpu_update(this_cpu_ptr(d->common.cpu_bstats), skb); + spin_lock(&d->tcf_lock); if (d->flags & SKBEDIT_F_PRIORITY) skb->priority = d->priority; if (d->flags & SKBEDIT_F_INHERITDSFIELD) { @@ -77,8 +77,8 @@ static int tcf_skbedit(struct sk_buff *skb, const struct tc_action *a, return d->tcf_action; err: - d->tcf_qstats.drops++; spin_unlock(&d->tcf_lock); + qstats_drop_inc(this_cpu_ptr(d->common.cpu_qstats)); return TC_ACT_SHOT; } @@ -169,7 +169,7 @@ static int tcf_skbedit_init(struct net *net, struct nlattr *nla, if (!exists) { ret = tcf_idr_create(tn, parm->index, est, a, - &act_skbedit_ops, bind, false); + &act_skbedit_ops, bind, true); if (ret) { tcf_idr_cleanup(tn, parm->index); return ret; -- cgit v1.2.3 From c749cdda9089eb1fdb6a9ab98f945124d12f2595 Mon Sep 17 00:00:00 2001 From: Davide Caratti Date: Wed, 11 Jul 2018 16:04:50 +0200 Subject: net/sched: act_skbedit: don't use spinlock in the data path use RCU instead of spin_{,un}lock_bh, to protect concurrent read/write on act_skbedit configuration. This reduces the effects of contention in the data path, in case multiple readers are present. Signed-off-by: Davide Caratti Signed-off-by: David S. Miller --- include/net/tc_act/tc_skbedit.h | 37 ++++++++++---- net/sched/act_skbedit.c | 107 +++++++++++++++++++++++++--------------- 2 files changed, 95 insertions(+), 49 deletions(-) (limited to 'net/sched') diff --git a/include/net/tc_act/tc_skbedit.h b/include/net/tc_act/tc_skbedit.h index 19cd3d345804..911bbac838a2 100644 --- a/include/net/tc_act/tc_skbedit.h +++ b/include/net/tc_act/tc_skbedit.h @@ -22,14 +22,19 @@ #include #include +struct tcf_skbedit_params { + u32 flags; + u32 priority; + u32 mark; + u32 mask; + u16 queue_mapping; + u16 ptype; + struct rcu_head rcu; +}; + struct tcf_skbedit { - struct tc_action common; - u32 flags; - u32 priority; - u32 mark; - u32 mask; - u16 queue_mapping; - u16 ptype; + struct tc_action common; + struct tcf_skbedit_params __rcu *params; }; #define to_skbedit(a) ((struct tcf_skbedit *)a) @@ -37,15 +42,27 @@ struct tcf_skbedit { static inline bool is_tcf_skbedit_mark(const struct tc_action *a) { #ifdef CONFIG_NET_CLS_ACT - if (a->ops && a->ops->type == TCA_ACT_SKBEDIT) - return to_skbedit(a)->flags == SKBEDIT_F_MARK; + u32 flags; + + if (a->ops && a->ops->type == TCA_ACT_SKBEDIT) { + rcu_read_lock(); + flags = rcu_dereference(to_skbedit(a)->params)->flags; + rcu_read_unlock(); + return flags == SKBEDIT_F_MARK; + } #endif return false; } static inline u32 tcf_skbedit_mark(const struct tc_action *a) { - return to_skbedit(a)->mark; + u32 mark; + + rcu_read_lock(); + mark = rcu_dereference(to_skbedit(a)->params)->mark; + rcu_read_unlock(); + + return mark; } #endif /* __NET_TC_SKBEDIT_H */ diff --git a/net/sched/act_skbedit.c b/net/sched/act_skbedit.c index 8651b5bd6b59..da56e6938c9e 100644 --- a/net/sched/act_skbedit.c +++ b/net/sched/act_skbedit.c @@ -37,14 +37,19 @@ static int tcf_skbedit(struct sk_buff *skb, const struct tc_action *a, struct tcf_result *res) { struct tcf_skbedit *d = 
to_skbedit(a); + struct tcf_skbedit_params *params; + int action; tcf_lastuse_update(&d->tcf_tm); bstats_cpu_update(this_cpu_ptr(d->common.cpu_bstats), skb); - spin_lock(&d->tcf_lock); - if (d->flags & SKBEDIT_F_PRIORITY) - skb->priority = d->priority; - if (d->flags & SKBEDIT_F_INHERITDSFIELD) { + rcu_read_lock(); + params = rcu_dereference(d->params); + action = READ_ONCE(d->tcf_action); + + if (params->flags & SKBEDIT_F_PRIORITY) + skb->priority = params->priority; + if (params->flags & SKBEDIT_F_INHERITDSFIELD) { int wlen = skb_network_offset(skb); switch (tc_skb_protocol(skb)) { @@ -63,23 +68,23 @@ static int tcf_skbedit(struct sk_buff *skb, const struct tc_action *a, break; } } - if (d->flags & SKBEDIT_F_QUEUE_MAPPING && - skb->dev->real_num_tx_queues > d->queue_mapping) - skb_set_queue_mapping(skb, d->queue_mapping); - if (d->flags & SKBEDIT_F_MARK) { - skb->mark &= ~d->mask; - skb->mark |= d->mark & d->mask; + if (params->flags & SKBEDIT_F_QUEUE_MAPPING && + skb->dev->real_num_tx_queues > params->queue_mapping) + skb_set_queue_mapping(skb, params->queue_mapping); + if (params->flags & SKBEDIT_F_MARK) { + skb->mark &= ~params->mask; + skb->mark |= params->mark & params->mask; } - if (d->flags & SKBEDIT_F_PTYPE) - skb->pkt_type = d->ptype; - - spin_unlock(&d->tcf_lock); - return d->tcf_action; + if (params->flags & SKBEDIT_F_PTYPE) + skb->pkt_type = params->ptype; +unlock: + rcu_read_unlock(); + return action; err: - spin_unlock(&d->tcf_lock); qstats_drop_inc(this_cpu_ptr(d->common.cpu_qstats)); - return TC_ACT_SHOT; + action = TC_ACT_SHOT; + goto unlock; } static const struct nla_policy skbedit_policy[TCA_SKBEDIT_MAX + 1] = { @@ -98,6 +103,7 @@ static int tcf_skbedit_init(struct net *net, struct nlattr *nla, struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, skbedit_net_id); + struct tcf_skbedit_params *params_old, *params_new; struct nlattr *tb[TCA_SKBEDIT_MAX + 1]; struct tc_skbedit *parm; struct tcf_skbedit *d; @@ -185,25 +191,34 @@ static int tcf_skbedit_init(struct net *net, struct nlattr *nla, } } - spin_lock_bh(&d->tcf_lock); + ASSERT_RTNL(); - d->flags = flags; + params_new = kzalloc(sizeof(*params_new), GFP_KERNEL); + if (unlikely(!params_new)) { + if (ret == ACT_P_CREATED) + tcf_idr_release(*a, bind); + return -ENOMEM; + } + + params_new->flags = flags; if (flags & SKBEDIT_F_PRIORITY) - d->priority = *priority; + params_new->priority = *priority; if (flags & SKBEDIT_F_QUEUE_MAPPING) - d->queue_mapping = *queue_mapping; + params_new->queue_mapping = *queue_mapping; if (flags & SKBEDIT_F_MARK) - d->mark = *mark; + params_new->mark = *mark; if (flags & SKBEDIT_F_PTYPE) - d->ptype = *ptype; + params_new->ptype = *ptype; /* default behaviour is to use all the bits */ - d->mask = 0xffffffff; + params_new->mask = 0xffffffff; if (flags & SKBEDIT_F_MASK) - d->mask = *mask; + params_new->mask = *mask; d->tcf_action = parm->action; - - spin_unlock_bh(&d->tcf_lock); + params_old = rtnl_dereference(d->params); + rcu_assign_pointer(d->params, params_new); + if (params_old) + kfree_rcu(params_old, rcu); if (ret == ACT_P_CREATED) tcf_idr_insert(tn, *a); @@ -215,33 +230,36 @@ static int tcf_skbedit_dump(struct sk_buff *skb, struct tc_action *a, { unsigned char *b = skb_tail_pointer(skb); struct tcf_skbedit *d = to_skbedit(a); + struct tcf_skbedit_params *params; struct tc_skbedit opt = { .index = d->tcf_index, .refcnt = refcount_read(&d->tcf_refcnt) - ref, .bindcnt = atomic_read(&d->tcf_bindcnt) - bind, .action = d->tcf_action, }; - struct tcf_t t; u64 pure_flags 
= 0; + struct tcf_t t; + + params = rtnl_dereference(d->params); if (nla_put(skb, TCA_SKBEDIT_PARMS, sizeof(opt), &opt)) goto nla_put_failure; - if ((d->flags & SKBEDIT_F_PRIORITY) && - nla_put_u32(skb, TCA_SKBEDIT_PRIORITY, d->priority)) + if ((params->flags & SKBEDIT_F_PRIORITY) && + nla_put_u32(skb, TCA_SKBEDIT_PRIORITY, params->priority)) goto nla_put_failure; - if ((d->flags & SKBEDIT_F_QUEUE_MAPPING) && - nla_put_u16(skb, TCA_SKBEDIT_QUEUE_MAPPING, d->queue_mapping)) + if ((params->flags & SKBEDIT_F_QUEUE_MAPPING) && + nla_put_u16(skb, TCA_SKBEDIT_QUEUE_MAPPING, params->queue_mapping)) goto nla_put_failure; - if ((d->flags & SKBEDIT_F_MARK) && - nla_put_u32(skb, TCA_SKBEDIT_MARK, d->mark)) + if ((params->flags & SKBEDIT_F_MARK) && + nla_put_u32(skb, TCA_SKBEDIT_MARK, params->mark)) goto nla_put_failure; - if ((d->flags & SKBEDIT_F_PTYPE) && - nla_put_u16(skb, TCA_SKBEDIT_PTYPE, d->ptype)) + if ((params->flags & SKBEDIT_F_PTYPE) && + nla_put_u16(skb, TCA_SKBEDIT_PTYPE, params->ptype)) goto nla_put_failure; - if ((d->flags & SKBEDIT_F_MASK) && - nla_put_u32(skb, TCA_SKBEDIT_MASK, d->mask)) + if ((params->flags & SKBEDIT_F_MASK) && + nla_put_u32(skb, TCA_SKBEDIT_MASK, params->mask)) goto nla_put_failure; - if (d->flags & SKBEDIT_F_INHERITDSFIELD) + if (params->flags & SKBEDIT_F_INHERITDSFIELD) pure_flags |= SKBEDIT_F_INHERITDSFIELD; if (pure_flags != 0 && nla_put(skb, TCA_SKBEDIT_FLAGS, sizeof(pure_flags), &pure_flags)) @@ -257,6 +275,16 @@ nla_put_failure: return -1; } +static void tcf_skbedit_cleanup(struct tc_action *a) +{ + struct tcf_skbedit *d = to_skbedit(a); + struct tcf_skbedit_params *params; + + params = rcu_dereference_protected(d->params, 1); + if (params) + kfree_rcu(params, rcu); +} + static int tcf_skbedit_walker(struct net *net, struct sk_buff *skb, struct netlink_callback *cb, int type, const struct tc_action_ops *ops, @@ -289,6 +317,7 @@ static struct tc_action_ops act_skbedit_ops = { .act = tcf_skbedit, .dump = tcf_skbedit_dump, .init = tcf_skbedit_init, + .cleanup = tcf_skbedit_cleanup, .walk = tcf_skbedit_walker, .lookup = tcf_skbedit_search, .delete = tcf_skbedit_delete, -- cgit v1.2.3 From 01683a1469995cc7aaf833d6f8b3f1c1d2fc3b92 Mon Sep 17 00:00:00 2001 From: Vlad Buslov Date: Mon, 9 Jul 2018 13:29:11 +0300 Subject: net: sched: refactor flower walk to iterate over idr Extend struct tcf_walker with additional 'cookie' field. It is intended to be used by classifier walk implementations to continue iteration directly from particular filter, instead of iterating 'skip' number of times. Change flower walk implementation to save filter handle in 'cookie'. Each time flower walk is called, it looks up filter with saved handle directly with idr, instead of iterating over filter linked list 'skip' number of times. This change improves complexity of dumping flower classifier from quadratic to linearithmic. (assuming idr lookup has logarithmic complexity) Reviewed-by: Jiri Pirko Signed-off-by: Vlad Buslov Reported-by: Simon Horman Reviewed-by: Simon Horman Signed-off-by: David S. 
Miller --- include/net/pkt_cls.h | 1 + net/sched/cls_api.c | 2 ++ net/sched/cls_flower.c | 20 +++++++++----------- 3 files changed, 12 insertions(+), 11 deletions(-) (limited to 'net/sched') diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h index 2081e4219f81..e4252a176eec 100644 --- a/include/net/pkt_cls.h +++ b/include/net/pkt_cls.h @@ -13,6 +13,7 @@ struct tcf_walker { int stop; int skip; int count; + unsigned long cookie; int (*fn)(struct tcf_proto *, void *node, struct tcf_walker *); }; diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index 73d9967c3739..c51b1b12450d 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -1508,7 +1508,9 @@ static bool tcf_chain_dump(struct tcf_chain *chain, struct Qdisc *q, u32 parent, arg.w.stop = 0; arg.w.skip = cb->args[1] - 1; arg.w.count = 0; + arg.w.cookie = cb->args[2]; tp->ops->walk(tp, &arg.w); + cb->args[2] = arg.w.cookie; cb->args[1] = arg.w.count + 1; if (arg.w.stop) return false; diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index 8b2474293db1..c53fdd411f90 100644 --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@ -1099,19 +1099,17 @@ static void fl_walk(struct tcf_proto *tp, struct tcf_walker *arg) { struct cls_fl_head *head = rtnl_dereference(tp->root); struct cls_fl_filter *f; - struct fl_flow_mask *mask; - list_for_each_entry_rcu(mask, &head->masks, list) { - list_for_each_entry_rcu(f, &mask->filters, list) { - if (arg->count < arg->skip) - goto skip; - if (arg->fn(tp, f, arg) < 0) { - arg->stop = 1; - break; - } -skip: - arg->count++; + arg->count = arg->skip; + + while ((f = idr_get_next_ul(&head->handle_idr, + &arg->cookie)) != NULL) { + if (arg->fn(tp, f, arg) < 0) { + arg->stop = 1; + break; } + arg->cookie = f->handle + 1; + arg->count++; } } -- cgit v1.2.3 From 301f935be9e09a1bf188bd8262a4db0aeeac2b50 Mon Sep 17 00:00:00 2001 From: Toke Høiland-Jørgensen Date: Mon, 16 Jul 2018 16:45:09 +0200 Subject: sch_cake: Fix tin order when set through skb->priority MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In diffserv mode, CAKE stores tins in a different order internally than the logical order exposed to userspace. The order remapping was missing in the handling of 'tc filter' priority mappings through skb->priority, resulting in bulk and best effort mappings being reversed relative to how they are displayed. Fix this by adding the missing mapping when reading skb->priority. Fixes: 83f8fd69af4f ("sch_cake: Add DiffServ handling") Signed-off-by: Toke Høiland-Jørgensen Signed-off-by: David S. Miller --- net/sched/sch_cake.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/sched') diff --git a/net/sched/sch_cake.c b/net/sched/sch_cake.c index 30695691e9ff..539c9490c308 100644 --- a/net/sched/sch_cake.c +++ b/net/sched/sch_cake.c @@ -1546,7 +1546,7 @@ static struct cake_tin_data *cake_select_tin(struct Qdisc *sch, if (TC_H_MAJ(skb->priority) == sch->handle && TC_H_MIN(skb->priority) > 0 && TC_H_MIN(skb->priority) <= q->tin_cnt) { - tin = TC_H_MIN(skb->priority) - 1; + tin = q->tin_order[TC_H_MIN(skb->priority) - 1]; if (q->rate_flags & CAKE_FLAG_WASH) cake_wash_diffserv(skb); -- cgit v1.2.3 From 07a557f47d7e09b2c60ad4d51b1ac8b035b75f73 Mon Sep 17 00:00:00 2001 From: Or Gerlitz Date: Tue, 17 Jul 2018 19:27:16 +0300 Subject: net/sched: tunnel_key: Allow to set tos and ttl for tc based ip tunnels Allow user-space to provide tos and ttl to be set for the tunnel headers. 
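A usage sketch, assuming the matching iproute2 keywords (device names and values here are illustrative):

    tc filter add dev eth0 ingress flower \
        action tunnel_key set src_ip 10.0.0.1 dst_ip 10.0.0.2 \
        id 42 dst_port 4789 tos 0x10 ttl 64 \
        action mirred egress redirect dev vxlan0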
Signed-off-by: Or Gerlitz Reviewed-by: Roi Dayan Acked-by: Jiri Pirko Signed-off-by: David S. Miller --- include/uapi/linux/tc_act/tc_tunnel_key.h | 2 ++ net/sched/act_tunnel_key.c | 20 ++++++++++++++++++-- 2 files changed, 20 insertions(+), 2 deletions(-) (limited to 'net/sched') diff --git a/include/uapi/linux/tc_act/tc_tunnel_key.h b/include/uapi/linux/tc_act/tc_tunnel_key.h index e284fec8c467..be384d63e1b5 100644 --- a/include/uapi/linux/tc_act/tc_tunnel_key.h +++ b/include/uapi/linux/tc_act/tc_tunnel_key.h @@ -39,6 +39,8 @@ enum { TCA_TUNNEL_KEY_ENC_OPTS, /* Nested TCA_TUNNEL_KEY_ENC_OPTS_ * attributes */ + TCA_TUNNEL_KEY_ENC_TOS, /* u8 */ + TCA_TUNNEL_KEY_ENC_TTL, /* u8 */ __TCA_TUNNEL_KEY_MAX, }; diff --git a/net/sched/act_tunnel_key.c b/net/sched/act_tunnel_key.c index 3ec585d58762..22f26e9ea8f1 100644 --- a/net/sched/act_tunnel_key.c +++ b/net/sched/act_tunnel_key.c @@ -197,6 +197,8 @@ static const struct nla_policy tunnel_key_policy[TCA_TUNNEL_KEY_MAX + 1] = { [TCA_TUNNEL_KEY_ENC_DST_PORT] = {.type = NLA_U16}, [TCA_TUNNEL_KEY_NO_CSUM] = { .type = NLA_U8 }, [TCA_TUNNEL_KEY_ENC_OPTS] = { .type = NLA_NESTED }, + [TCA_TUNNEL_KEY_ENC_TOS] = { .type = NLA_U8 }, + [TCA_TUNNEL_KEY_ENC_TTL] = { .type = NLA_U8 }, }; static int tunnel_key_init(struct net *net, struct nlattr *nla, @@ -216,6 +218,7 @@ static int tunnel_key_init(struct net *net, struct nlattr *nla, int opts_len = 0; __be64 key_id; __be16 flags; + u8 tos, ttl; int ret = 0; int err; @@ -273,6 +276,13 @@ static int tunnel_key_init(struct net *net, struct nlattr *nla, } } + tos = 0; + if (tb[TCA_TUNNEL_KEY_ENC_TOS]) + tos = nla_get_u8(tb[TCA_TUNNEL_KEY_ENC_TOS]); + ttl = 0; + if (tb[TCA_TUNNEL_KEY_ENC_TTL]) + ttl = nla_get_u8(tb[TCA_TUNNEL_KEY_ENC_TTL]); + if (tb[TCA_TUNNEL_KEY_ENC_IPV4_SRC] && tb[TCA_TUNNEL_KEY_ENC_IPV4_DST]) { __be32 saddr; @@ -281,7 +291,7 @@ static int tunnel_key_init(struct net *net, struct nlattr *nla, saddr = nla_get_in_addr(tb[TCA_TUNNEL_KEY_ENC_IPV4_SRC]); daddr = nla_get_in_addr(tb[TCA_TUNNEL_KEY_ENC_IPV4_DST]); - metadata = __ip_tun_set_dst(saddr, daddr, 0, 0, + metadata = __ip_tun_set_dst(saddr, daddr, tos, ttl, dst_port, flags, key_id, opts_len); } else if (tb[TCA_TUNNEL_KEY_ENC_IPV6_SRC] && @@ -292,7 +302,7 @@ static int tunnel_key_init(struct net *net, struct nlattr *nla, saddr = nla_get_in6_addr(tb[TCA_TUNNEL_KEY_ENC_IPV6_SRC]); daddr = nla_get_in6_addr(tb[TCA_TUNNEL_KEY_ENC_IPV6_DST]); - metadata = __ipv6_tun_set_dst(&saddr, &daddr, 0, 0, dst_port, + metadata = __ipv6_tun_set_dst(&saddr, &daddr, tos, ttl, dst_port, 0, flags, key_id, 0); } else { @@ -504,6 +514,12 @@ static int tunnel_key_dump(struct sk_buff *skb, struct tc_action *a, !(key->tun_flags & TUNNEL_CSUM)) || tunnel_key_opts_dump(skb, info)) goto nla_put_failure; + + if (key->tos && nla_put_u8(skb, TCA_TUNNEL_KEY_ENC_TOS, key->tos)) + goto nla_put_failure; + + if (key->ttl && nla_put_u8(skb, TCA_TUNNEL_KEY_ENC_TTL, key->ttl)) + goto nla_put_failure; } tcf_tm_dump(&tm, &t->tcf_tm); -- cgit v1.2.3 From 0e2c17b64d5c7f57bcd7054ef87797376dcdee26 Mon Sep 17 00:00:00 2001 From: Or Gerlitz Date: Tue, 17 Jul 2018 19:27:18 +0300 Subject: net/sched: cls_flower: Support matching on ip tos and ttl for tunnels Allow users to set rules matching on ipv4 tos and ttl or ipv6 traffic-class and hoplimit of tunnel headers. Signed-off-by: Or Gerlitz Reviewed-by: Roi Dayan Acked-by: Jiri Pirko Signed-off-by: David S. 
Miller --- include/uapi/linux/pkt_cls.h | 5 +++++ net/sched/cls_flower.c | 43 ++++++++++++++++++++++++++++--------------- 2 files changed, 33 insertions(+), 15 deletions(-) (limited to 'net/sched') diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h index c4262d911596..b4512254036b 100644 --- a/include/uapi/linux/pkt_cls.h +++ b/include/uapi/linux/pkt_cls.h @@ -473,6 +473,11 @@ enum { TCA_FLOWER_KEY_CVLAN_PRIO, /* u8 */ TCA_FLOWER_KEY_CVLAN_ETH_TYPE, /* be16 */ + TCA_FLOWER_KEY_ENC_IP_TOS, /* u8 */ + TCA_FLOWER_KEY_ENC_IP_TOS_MASK, /* u8 */ + TCA_FLOWER_KEY_ENC_IP_TTL, /* u8 */ + TCA_FLOWER_KEY_ENC_IP_TTL_MASK, /* u8 */ + __TCA_FLOWER_MAX, }; diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index c53fdd411f90..38d74803e2df 100644 --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@ -52,6 +52,7 @@ struct fl_flow_key { struct flow_dissector_key_mpls mpls; struct flow_dissector_key_tcp tcp; struct flow_dissector_key_ip ip; + struct flow_dissector_key_ip enc_ip; } __aligned(BITS_PER_LONG / 8); /* Ensure that we can do comparisons as longs. */ struct fl_flow_mask_range { @@ -453,6 +454,10 @@ static const struct nla_policy fl_policy[TCA_FLOWER_MAX + 1] = { [TCA_FLOWER_KEY_CVLAN_ID] = { .type = NLA_U16 }, [TCA_FLOWER_KEY_CVLAN_PRIO] = { .type = NLA_U8 }, [TCA_FLOWER_KEY_CVLAN_ETH_TYPE] = { .type = NLA_U16 }, + [TCA_FLOWER_KEY_ENC_IP_TOS] = { .type = NLA_U8 }, + [TCA_FLOWER_KEY_ENC_IP_TOS_MASK] = { .type = NLA_U8 }, + [TCA_FLOWER_KEY_ENC_IP_TTL] = { .type = NLA_U8 }, + [TCA_FLOWER_KEY_ENC_IP_TTL_MASK] = { .type = NLA_U8 }, }; static void fl_set_key_val(struct nlattr **tb, @@ -561,17 +566,17 @@ static int fl_set_key_flags(struct nlattr **tb, return 0; } -static void fl_set_key_ip(struct nlattr **tb, +static void fl_set_key_ip(struct nlattr **tb, bool encap, struct flow_dissector_key_ip *key, struct flow_dissector_key_ip *mask) { - fl_set_key_val(tb, &key->tos, TCA_FLOWER_KEY_IP_TOS, - &mask->tos, TCA_FLOWER_KEY_IP_TOS_MASK, - sizeof(key->tos)); + int tos_key = encap ? TCA_FLOWER_KEY_ENC_IP_TOS : TCA_FLOWER_KEY_IP_TOS; + int ttl_key = encap ? TCA_FLOWER_KEY_ENC_IP_TTL : TCA_FLOWER_KEY_IP_TTL; + int tos_mask = encap ? TCA_FLOWER_KEY_ENC_IP_TOS_MASK : TCA_FLOWER_KEY_IP_TOS_MASK; + int ttl_mask = encap ? 
TCA_FLOWER_KEY_ENC_IP_TTL_MASK : TCA_FLOWER_KEY_IP_TTL_MASK; - fl_set_key_val(tb, &key->ttl, TCA_FLOWER_KEY_IP_TTL, - &mask->ttl, TCA_FLOWER_KEY_IP_TTL_MASK, - sizeof(key->ttl)); + fl_set_key_val(tb, &key->tos, tos_key, &mask->tos, tos_mask, sizeof(key->tos)); + fl_set_key_val(tb, &key->ttl, ttl_key, &mask->ttl, ttl_mask, sizeof(key->ttl)); } static int fl_set_key(struct net *net, struct nlattr **tb, @@ -633,7 +638,7 @@ static int fl_set_key(struct net *net, struct nlattr **tb, fl_set_key_val(tb, &key->basic.ip_proto, TCA_FLOWER_KEY_IP_PROTO, &mask->basic.ip_proto, TCA_FLOWER_UNSPEC, sizeof(key->basic.ip_proto)); - fl_set_key_ip(tb, &key->ip, &mask->ip); + fl_set_key_ip(tb, false, &key->ip, &mask->ip); } if (tb[TCA_FLOWER_KEY_IPV4_SRC] || tb[TCA_FLOWER_KEY_IPV4_DST]) { @@ -768,6 +773,8 @@ static int fl_set_key(struct net *net, struct nlattr **tb, &mask->enc_tp.dst, TCA_FLOWER_KEY_ENC_UDP_DST_PORT_MASK, sizeof(key->enc_tp.dst)); + fl_set_key_ip(tb, true, &key->enc_ip, &mask->enc_ip); + if (tb[TCA_FLOWER_KEY_FLAGS]) ret = fl_set_key_flags(tb, &key->control.flags, &mask->control.flags); @@ -860,6 +867,8 @@ static void fl_init_dissector(struct fl_flow_mask *mask) enc_control); FL_KEY_SET_IF_MASKED(&mask->key, keys, cnt, FLOW_DISSECTOR_KEY_ENC_PORTS, enc_tp); + FL_KEY_SET_IF_MASKED(&mask->key, keys, cnt, + FLOW_DISSECTOR_KEY_ENC_IP, enc_ip); skb_flow_dissector_init(&mask->dissector, keys, cnt); } @@ -1208,14 +1217,17 @@ static int fl_dump_key_mpls(struct sk_buff *skb, return 0; } -static int fl_dump_key_ip(struct sk_buff *skb, +static int fl_dump_key_ip(struct sk_buff *skb, bool encap, struct flow_dissector_key_ip *key, struct flow_dissector_key_ip *mask) { - if (fl_dump_key_val(skb, &key->tos, TCA_FLOWER_KEY_IP_TOS, &mask->tos, - TCA_FLOWER_KEY_IP_TOS_MASK, sizeof(key->tos)) || - fl_dump_key_val(skb, &key->ttl, TCA_FLOWER_KEY_IP_TTL, &mask->ttl, - TCA_FLOWER_KEY_IP_TTL_MASK, sizeof(key->ttl))) + int tos_key = encap ? TCA_FLOWER_KEY_ENC_IP_TOS : TCA_FLOWER_KEY_IP_TOS; + int ttl_key = encap ? TCA_FLOWER_KEY_ENC_IP_TTL : TCA_FLOWER_KEY_IP_TTL; + int tos_mask = encap ? TCA_FLOWER_KEY_ENC_IP_TOS_MASK : TCA_FLOWER_KEY_IP_TOS_MASK; + int ttl_mask = encap ? TCA_FLOWER_KEY_ENC_IP_TTL_MASK : TCA_FLOWER_KEY_IP_TTL_MASK; + + if (fl_dump_key_val(skb, &key->tos, tos_key, &mask->tos, tos_mask, sizeof(key->tos)) || + fl_dump_key_val(skb, &key->ttl, ttl_key, &mask->ttl, ttl_mask, sizeof(key->ttl))) return -1; return 0; @@ -1361,7 +1373,7 @@ static int fl_dump(struct net *net, struct tcf_proto *tp, void *fh, (fl_dump_key_val(skb, &key->basic.ip_proto, TCA_FLOWER_KEY_IP_PROTO, &mask->basic.ip_proto, TCA_FLOWER_UNSPEC, sizeof(key->basic.ip_proto)) || - fl_dump_key_ip(skb, &key->ip, &mask->ip))) + fl_dump_key_ip(skb, false, &key->ip, &mask->ip))) goto nla_put_failure; if (key->control.addr_type == FLOW_DISSECTOR_KEY_IPV4_ADDRS && @@ -1486,7 +1498,8 @@ static int fl_dump(struct net *net, struct tcf_proto *tp, void *fh, TCA_FLOWER_KEY_ENC_UDP_DST_PORT, &mask->enc_tp.dst, TCA_FLOWER_KEY_ENC_UDP_DST_PORT_MASK, - sizeof(key->enc_tp.dst))) + sizeof(key->enc_tp.dst)) || + fl_dump_key_ip(skb, true, &key->enc_ip, &mask->enc_ip)) goto nla_put_failure; if (fl_dump_key_flags(skb, key->control.flags, mask->control.flags)) -- cgit v1.2.3 From baa2d2b17ee93e2a5b8accc2dd5328db17caf90e Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Wed, 18 Jul 2018 23:14:17 -0500 Subject: net: sched: use PTR_ERR_OR_ZERO macro in tcf_block_cb_register This line makes up what macro PTR_ERR_OR_ZERO already does. 
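In other words, the open-coded conditional removed below is exactly what PTR_ERR_OR_ZERO expands to: -errno when handed an error pointer, zero otherwise. A standalone userspace rendition of the err-pointer helpers (simplified, with lowercase names to stress that these are illustrations rather than the kernel macros):

#include <stdio.h>

#define MAX_ERRNO 4095

/* "Error pointers" encode -errno in the last page of the address space. */
static int is_err(const void *ptr)
{
	return (unsigned long)ptr >= (unsigned long)-MAX_ERRNO;
}

static long ptr_err(const void *ptr)
{
	return (long)ptr;
}

/* The helper the patch switches to: -errno for error pointers, else 0. */
static long ptr_err_or_zero(const void *ptr)
{
	return is_err(ptr) ? ptr_err(ptr) : 0;
}

int main(void)
{
	int obj = 0;
	void *ok = &obj;
	void *bad = (void *)(unsigned long)-12;	/* like ERR_PTR(-ENOMEM) */

	printf("%ld %ld\n", ptr_err_or_zero(ok), ptr_err_or_zero(bad));	/* 0 -12 */
	return 0;
}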
So, make use of PTR_ERR_OR_ZERO rather than an open-code version. This code was detected with the help of Coccinelle. Signed-off-by: Gustavo A. R. Silva Signed-off-by: David S. Miller --- net/sched/cls_api.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/sched') diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index 623fe2cfe529..620067209ba8 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -818,7 +818,7 @@ int tcf_block_cb_register(struct tcf_block *block, block_cb = __tcf_block_cb_register(block, cb, cb_ident, cb_priv, extack); - return IS_ERR(block_cb) ? PTR_ERR(block_cb) : 0; + return PTR_ERR_OR_ZERO(block_cb); } EXPORT_SYMBOL(tcf_block_cb_register); -- cgit v1.2.3 From f34e8bff58f071e1a3452a5a6e29dac218c83f69 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Mon, 23 Jul 2018 09:23:04 +0200 Subject: net: sched: push ops lookup bits into tcf_proto_lookup_ops() Push all bits that take care of ops lookup, including module loading outside tcf_proto_create() function, into tcf_proto_lookup_ops() Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- net/sched/cls_api.c | 53 +++++++++++++++++++++++++++++++---------------------- 1 file changed, 31 insertions(+), 22 deletions(-) (limited to 'net/sched') diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index 620067209ba8..1e8d69790d82 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -39,7 +39,7 @@ static DEFINE_RWLOCK(cls_mod_lock); /* Find classifier type by string name */ -static const struct tcf_proto_ops *tcf_proto_lookup_ops(const char *kind) +static const struct tcf_proto_ops *__tcf_proto_lookup_ops(const char *kind) { const struct tcf_proto_ops *t, *res = NULL; @@ -57,6 +57,33 @@ static const struct tcf_proto_ops *tcf_proto_lookup_ops(const char *kind) return res; } +static const struct tcf_proto_ops * +tcf_proto_lookup_ops(const char *kind, struct netlink_ext_ack *extack) +{ + const struct tcf_proto_ops *ops; + + ops = __tcf_proto_lookup_ops(kind); + if (ops) + return ops; +#ifdef CONFIG_MODULES + rtnl_unlock(); + request_module("cls_%s", kind); + rtnl_lock(); + ops = __tcf_proto_lookup_ops(kind); + /* We dropped the RTNL semaphore in order to perform + * the module load. So, even if we succeeded in loading + * the module we have to replay the request. We indicate + * this using -EAGAIN. + */ + if (ops) { + module_put(ops->owner); + return ERR_PTR(-EAGAIN); + } +#endif + NL_SET_ERR_MSG(extack, "TC classifier not found"); + return ERR_PTR(-ENOENT); +} + /* Register(unregister) new classifier type */ int register_tcf_proto_ops(struct tcf_proto_ops *ops) @@ -133,27 +160,9 @@ static struct tcf_proto *tcf_proto_create(const char *kind, u32 protocol, if (!tp) return ERR_PTR(-ENOBUFS); - err = -ENOENT; - tp->ops = tcf_proto_lookup_ops(kind); - if (!tp->ops) { -#ifdef CONFIG_MODULES - rtnl_unlock(); - request_module("cls_%s", kind); - rtnl_lock(); - tp->ops = tcf_proto_lookup_ops(kind); - /* We dropped the RTNL semaphore in order to perform - * the module load. So, even if we succeeded in loading - * the module we have to replay the request. We indicate - * this using -EAGAIN. 
- */ - if (tp->ops) { - module_put(tp->ops->owner); - err = -EAGAIN; - } else { - NL_SET_ERR_MSG(extack, "TC classifier not found"); - err = -ENOENT; - } -#endif + tp->ops = tcf_proto_lookup_ops(kind, extack); + if (IS_ERR(tp->ops)) { + err = PTR_ERR(tp->ops); goto errout; } tp->classify = tp->ops->classify; -- cgit v1.2.3 From f71e0ca4db187af7c44987e9d21e9042c3046070 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Mon, 23 Jul 2018 09:23:05 +0200 Subject: net: sched: Avoid implicit chain 0 creation Currently, chain 0 is implicitly created during block creation. However that does not align with chain object exposure, creation and destruction api introduced later on. So make the chain 0 behave the same way as any other chain and only create it when it is needed. Since chain 0 is somehow special as the qdiscs need to hold pointer to the first chain tp, this requires to move the chain head change callback infra to the block structure. Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- include/net/sch_generic.h | 5 ++- net/sched/cls_api.c | 86 +++++++++++++++++++++-------------------------- 2 files changed, 43 insertions(+), 48 deletions(-) (limited to 'net/sched') diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index 7432100027b7..86f4651784e8 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -300,7 +300,6 @@ typedef void tcf_chain_head_change_t(struct tcf_proto *tp_head, void *priv); struct tcf_chain { struct tcf_proto __rcu *filter_chain; - struct list_head filter_chain_list; struct list_head list; struct tcf_block *block; u32 index; /* chain index */ @@ -318,6 +317,10 @@ struct tcf_block { bool keep_dst; unsigned int offloadcnt; /* Number of oddloaded filters */ unsigned int nooffloaddevcnt; /* Number of devs unable to do offload */ + struct { + struct tcf_chain *chain; + struct list_head filter_chain_list; + } chain0; }; static inline void tcf_block_offload_inc(struct tcf_block *block, u32 *flags) diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index 1e8d69790d82..eb0bf9037ef9 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -204,11 +204,12 @@ static struct tcf_chain *tcf_chain_create(struct tcf_block *block, chain = kzalloc(sizeof(*chain), GFP_KERNEL); if (!chain) return NULL; - INIT_LIST_HEAD(&chain->filter_chain_list); list_add_tail(&chain->list, &block->chain_list); chain->block = block; chain->index = chain_index; chain->refcnt = 1; + if (!chain->index) + block->chain0.chain = chain; return chain; } @@ -218,12 +219,16 @@ static void tcf_chain_head_change_item(struct tcf_filter_chain_list_item *item, if (item->chain_head_change) item->chain_head_change(tp_head, item->chain_head_change_priv); } -static void tcf_chain_head_change(struct tcf_chain *chain, - struct tcf_proto *tp_head) + +static void tcf_chain0_head_change(struct tcf_chain *chain, + struct tcf_proto *tp_head) { struct tcf_filter_chain_list_item *item; + struct tcf_block *block = chain->block; - list_for_each_entry(item, &chain->filter_chain_list, list) + if (chain->index) + return; + list_for_each_entry(item, &block->chain0.filter_chain_list, list) tcf_chain_head_change_item(item, tp_head); } @@ -231,7 +236,7 @@ static void tcf_chain_flush(struct tcf_chain *chain) { struct tcf_proto *tp = rtnl_dereference(chain->filter_chain); - tcf_chain_head_change(chain, NULL); + tcf_chain0_head_change(chain, NULL); while (tp) { RCU_INIT_POINTER(chain->filter_chain, tp->next); tcf_proto_destroy(tp, NULL); @@ -245,8 +250,10 @@ static void tcf_chain_destroy(struct 
tcf_chain *chain) struct tcf_block *block = chain->block; list_del(&chain->list); + if (!chain->index) + block->chain0.chain = NULL; kfree(chain); - if (list_empty(&block->chain_list)) + if (list_empty(&block->chain_list) && block->refcnt == 0) kfree(block); } @@ -346,10 +353,11 @@ no_offload_dev_dec: } static int -tcf_chain_head_change_cb_add(struct tcf_chain *chain, - struct tcf_block_ext_info *ei, - struct netlink_ext_ack *extack) +tcf_chain0_head_change_cb_add(struct tcf_block *block, + struct tcf_block_ext_info *ei, + struct netlink_ext_ack *extack) { + struct tcf_chain *chain0 = block->chain0.chain; struct tcf_filter_chain_list_item *item; item = kmalloc(sizeof(*item), GFP_KERNEL); @@ -359,23 +367,25 @@ tcf_chain_head_change_cb_add(struct tcf_chain *chain, } item->chain_head_change = ei->chain_head_change; item->chain_head_change_priv = ei->chain_head_change_priv; - if (chain->filter_chain) - tcf_chain_head_change_item(item, chain->filter_chain); - list_add(&item->list, &chain->filter_chain_list); + if (chain0 && chain0->filter_chain) + tcf_chain_head_change_item(item, chain0->filter_chain); + list_add(&item->list, &block->chain0.filter_chain_list); return 0; } static void -tcf_chain_head_change_cb_del(struct tcf_chain *chain, - struct tcf_block_ext_info *ei) +tcf_chain0_head_change_cb_del(struct tcf_block *block, + struct tcf_block_ext_info *ei) { + struct tcf_chain *chain0 = block->chain0.chain; struct tcf_filter_chain_list_item *item; - list_for_each_entry(item, &chain->filter_chain_list, list) { + list_for_each_entry(item, &block->chain0.filter_chain_list, list) { if ((!ei->chain_head_change && !ei->chain_head_change_priv) || (item->chain_head_change == ei->chain_head_change && item->chain_head_change_priv == ei->chain_head_change_priv)) { - tcf_chain_head_change_item(item, NULL); + if (chain0) + tcf_chain_head_change_item(item, NULL); list_del(&item->list); kfree(item); return; @@ -411,8 +421,6 @@ static struct tcf_block *tcf_block_create(struct net *net, struct Qdisc *q, struct netlink_ext_ack *extack) { struct tcf_block *block; - struct tcf_chain *chain; - int err; block = kzalloc(sizeof(*block), GFP_KERNEL); if (!block) { @@ -422,14 +430,8 @@ static struct tcf_block *tcf_block_create(struct net *net, struct Qdisc *q, INIT_LIST_HEAD(&block->chain_list); INIT_LIST_HEAD(&block->cb_list); INIT_LIST_HEAD(&block->owner_list); + INIT_LIST_HEAD(&block->chain0.filter_chain_list); - /* Create chain 0 by default, it has to be always present. 
*/ - chain = tcf_chain_create(block, 0); - if (!chain) { - NL_SET_ERR_MSG(extack, "Failed to create new tcf chain"); - err = -ENOMEM; - goto err_chain_create; - } block->refcnt = 1; block->net = net; block->index = block_index; @@ -438,10 +440,6 @@ static struct tcf_block *tcf_block_create(struct net *net, struct Qdisc *q, if (!tcf_block_shared(block)) block->q = q; return block; - -err_chain_create: - kfree(block); - return ERR_PTR(err); } static struct tcf_block *tcf_block_lookup(struct net *net, u32 block_index) @@ -523,11 +521,6 @@ static struct tcf_block *tcf_block_find(struct net *net, struct Qdisc **q, return block; } -static struct tcf_chain *tcf_block_chain_zero(struct tcf_block *block) -{ - return list_first_entry(&block->chain_list, struct tcf_chain, list); -} - struct tcf_block_owner_item { struct list_head list; struct Qdisc *q; @@ -621,10 +614,9 @@ int tcf_block_get_ext(struct tcf_block **p_block, struct Qdisc *q, tcf_block_owner_netif_keep_dst(block, q, ei->binder_type); - err = tcf_chain_head_change_cb_add(tcf_block_chain_zero(block), - ei, extack); + err = tcf_chain0_head_change_cb_add(block, ei, extack); if (err) - goto err_chain_head_change_cb_add; + goto err_chain0_head_change_cb_add; err = tcf_block_offload_bind(block, q, ei, extack); if (err) @@ -634,15 +626,14 @@ int tcf_block_get_ext(struct tcf_block **p_block, struct Qdisc *q, return 0; err_block_offload_bind: - tcf_chain_head_change_cb_del(tcf_block_chain_zero(block), ei); -err_chain_head_change_cb_add: + tcf_chain0_head_change_cb_del(block, ei); +err_chain0_head_change_cb_add: tcf_block_owner_del(block, q, ei->binder_type); err_block_owner_add: if (created) { if (tcf_block_shared(block)) tcf_block_remove(block, net); err_block_insert: - kfree(tcf_block_chain_zero(block)); kfree(block); } else { block->refcnt--; @@ -682,10 +673,10 @@ void tcf_block_put_ext(struct tcf_block *block, struct Qdisc *q, if (!block) return; - tcf_chain_head_change_cb_del(tcf_block_chain_zero(block), ei); + tcf_chain0_head_change_cb_del(block, ei); tcf_block_owner_del(block, q, ei->binder_type); - if (--block->refcnt == 0) { + if (block->refcnt == 1) { if (tcf_block_shared(block)) tcf_block_remove(block, block->net); @@ -701,13 +692,14 @@ void tcf_block_put_ext(struct tcf_block *block, struct Qdisc *q, tcf_block_offload_unbind(block, q, ei); - if (block->refcnt == 0) { + if (block->refcnt == 1) { /* At this point, all the chains should have refcnt >= 1. */ list_for_each_entry_safe(chain, tmp, &block->chain_list, list) tcf_chain_put(chain); - /* Finally, put chain 0 and allow block to be freed. 
*/ - tcf_chain_put(tcf_block_chain_zero(block)); + block->refcnt--; + if (list_empty(&block->chain_list)) + kfree(block); } } EXPORT_SYMBOL(tcf_block_put_ext); @@ -947,7 +939,7 @@ static void tcf_chain_tp_insert(struct tcf_chain *chain, struct tcf_proto *tp) { if (*chain_info->pprev == chain->filter_chain) - tcf_chain_head_change(chain, tp); + tcf_chain0_head_change(chain, tp); RCU_INIT_POINTER(tp->next, tcf_chain_tp_prev(chain_info)); rcu_assign_pointer(*chain_info->pprev, tp); tcf_chain_hold(chain); @@ -960,7 +952,7 @@ static void tcf_chain_tp_remove(struct tcf_chain *chain, struct tcf_proto *next = rtnl_dereference(chain_info->next); if (tp == chain->filter_chain) - tcf_chain_head_change(chain, next); + tcf_chain0_head_change(chain, next); RCU_INIT_POINTER(*chain_info->pprev, next); tcf_chain_put(chain); } -- cgit v1.2.3 From 32a4f5ecd7381f30ae3bb36dea77a150ba68af2e Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Mon, 23 Jul 2018 09:23:06 +0200 Subject: net: sched: introduce chain object to uapi Allow the user to create, destroy, get and dump chain objects. Do that by extending the rtnl commands with chain-specific ones. The user will now be able to explicitly create or destroy chains (so far this was done only automatically according to the filter/act needs and refcounting). Also, the user will receive a notification about any chain creation or destruction. Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- include/net/sch_generic.h | 1 + include/uapi/linux/rtnetlink.h | 7 + net/sched/cls_api.c | 308 +++++++++++++++++++++++++++++++++++++++-- security/selinux/nlmsgtab.c | 2 +- 4 files changed, 309 insertions(+), 9 deletions(-) (limited to 'net/sched') diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index 86f4651784e8..81ec8276db9c 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -304,6 +304,7 @@ struct tcf_chain { struct tcf_block *block; u32 index; /* chain index */ unsigned int refcnt; + bool explicitly_created; }; struct tcf_block { diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h index 7d8502313c99..46399367627f 100644 --- a/include/uapi/linux/rtnetlink.h +++ b/include/uapi/linux/rtnetlink.h @@ -150,6 +150,13 @@ enum { RTM_NEWCACHEREPORT = 96, #define RTM_NEWCACHEREPORT RTM_NEWCACHEREPORT + RTM_NEWCHAIN = 100, +#define RTM_NEWCHAIN RTM_NEWCHAIN + RTM_DELCHAIN, +#define RTM_DELCHAIN RTM_DELCHAIN + RTM_GETCHAIN, +#define RTM_GETCHAIN RTM_GETCHAIN + __RTM_MAX, #define RTM_MAX (((__RTM_MAX + 3) & ~3) - 1) }; diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index eb0bf9037ef9..e65b390336aa 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -262,29 +262,57 @@ static void tcf_chain_hold(struct tcf_chain *chain) ++chain->refcnt; } -struct tcf_chain *tcf_chain_get(struct tcf_block *block, u32 chain_index, - bool create) +static struct tcf_chain *tcf_chain_lookup(struct tcf_block *block, + u32 chain_index) { struct tcf_chain *chain; list_for_each_entry(chain, &block->chain_list, list) { - if (chain->index == chain_index) { - tcf_chain_hold(chain); + if (chain->index == chain_index) return chain; - } + } + return NULL; +} + +static int tc_chain_notify(struct tcf_chain *chain, struct sk_buff *oskb, + u32 seq, u16 flags, int event, bool unicast); + +struct tcf_chain *tcf_chain_get(struct tcf_block *block, u32 chain_index, + bool create) +{ + struct tcf_chain *chain = tcf_chain_lookup(block, chain_index); + + if (chain) { + tcf_chain_hold(chain); + return chain; } - return create ?
tcf_chain_create(block, chain_index) : NULL; + if (!create) + return NULL; + chain = tcf_chain_create(block, chain_index); + if (!chain) + return NULL; + tc_chain_notify(chain, NULL, 0, NLM_F_CREATE | NLM_F_EXCL, + RTM_NEWCHAIN, false); + return chain; } EXPORT_SYMBOL(tcf_chain_get); void tcf_chain_put(struct tcf_chain *chain) { - if (--chain->refcnt == 0) + if (--chain->refcnt == 0) { + tc_chain_notify(chain, NULL, 0, 0, RTM_DELCHAIN, false); tcf_chain_destroy(chain); + } } EXPORT_SYMBOL(tcf_chain_put); +static void tcf_chain_put_explicitly_created(struct tcf_chain *chain) +{ + if (chain->explicitly_created) + tcf_chain_put(chain); +} + static bool tcf_block_offload_in_use(struct tcf_block *block) { return block->offloadcnt; @@ -694,8 +722,10 @@ void tcf_block_put_ext(struct tcf_block *block, struct Qdisc *q, if (block->refcnt == 1) { /* At this point, all the chains should have refcnt >= 1. */ - list_for_each_entry_safe(chain, tmp, &block->chain_list, list) + list_for_each_entry_safe(chain, tmp, &block->chain_list, list) { + tcf_chain_put_explicitly_created(chain); tcf_chain_put(chain); + } block->refcnt--; if (list_empty(&block->chain_list)) @@ -1609,6 +1639,264 @@ out: return skb->len; } +static int tc_chain_fill_node(struct tcf_chain *chain, struct net *net, + struct sk_buff *skb, struct tcf_block *block, + u32 portid, u32 seq, u16 flags, int event) +{ + unsigned char *b = skb_tail_pointer(skb); + struct nlmsghdr *nlh; + struct tcmsg *tcm; + + nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags); + if (!nlh) + goto out_nlmsg_trim; + tcm = nlmsg_data(nlh); + tcm->tcm_family = AF_UNSPEC; + tcm->tcm__pad1 = 0; + tcm->tcm__pad2 = 0; + tcm->tcm_handle = 0; + if (block->q) { + tcm->tcm_ifindex = qdisc_dev(block->q)->ifindex; + tcm->tcm_parent = block->q->handle; + } else { + tcm->tcm_ifindex = TCM_IFINDEX_MAGIC_BLOCK; + tcm->tcm_block_index = block->index; + } + + if (nla_put_u32(skb, TCA_CHAIN, chain->index)) + goto nla_put_failure; + + nlh->nlmsg_len = skb_tail_pointer(skb) - b; + return skb->len; + +out_nlmsg_trim: +nla_put_failure: + nlmsg_trim(skb, b); + return -EMSGSIZE; +} + +static int tc_chain_notify(struct tcf_chain *chain, struct sk_buff *oskb, + u32 seq, u16 flags, int event, bool unicast) +{ + u32 portid = oskb ? 
NETLINK_CB(oskb).portid : 0; + struct tcf_block *block = chain->block; + struct net *net = block->net; + struct sk_buff *skb; + + skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); + if (!skb) + return -ENOBUFS; + + if (tc_chain_fill_node(chain, net, skb, block, portid, + seq, flags, event) <= 0) { + kfree_skb(skb); + return -EINVAL; + } + + if (unicast) + return netlink_unicast(net->rtnl, skb, portid, MSG_DONTWAIT); + + return rtnetlink_send(skb, net, portid, RTNLGRP_TC, flags & NLM_F_ECHO); +} + +/* Add/delete/get a chain */ + +static int tc_ctl_chain(struct sk_buff *skb, struct nlmsghdr *n, + struct netlink_ext_ack *extack) +{ + struct net *net = sock_net(skb->sk); + struct nlattr *tca[TCA_MAX + 1]; + struct tcmsg *t; + u32 parent; + u32 chain_index; + struct Qdisc *q = NULL; + struct tcf_chain *chain = NULL; + struct tcf_block *block; + unsigned long cl; + int err; + + if (n->nlmsg_type != RTM_GETCHAIN && + !netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN)) + return -EPERM; + +replay: + err = nlmsg_parse(n, sizeof(*t), tca, TCA_MAX, NULL, extack); + if (err < 0) + return err; + + t = nlmsg_data(n); + parent = t->tcm_parent; + cl = 0; + + block = tcf_block_find(net, &q, &parent, &cl, + t->tcm_ifindex, t->tcm_block_index, extack); + if (IS_ERR(block)) + return PTR_ERR(block); + + chain_index = tca[TCA_CHAIN] ? nla_get_u32(tca[TCA_CHAIN]) : 0; + if (chain_index > TC_ACT_EXT_VAL_MASK) { + NL_SET_ERR_MSG(extack, "Specified chain index exceeds upper limit"); + return -EINVAL; + } + chain = tcf_chain_lookup(block, chain_index); + if (n->nlmsg_type == RTM_NEWCHAIN) { + if (chain) { + NL_SET_ERR_MSG(extack, "Filter chain already exists"); + return -EEXIST; + } + if (!(n->nlmsg_flags & NLM_F_CREATE)) { + NL_SET_ERR_MSG(extack, "Need both RTM_NEWCHAIN and NLM_F_CREATE to create a new chain"); + return -ENOENT; + } + chain = tcf_chain_create(block, chain_index); + if (!chain) { + NL_SET_ERR_MSG(extack, "Failed to create filter chain"); + return -ENOMEM; + } + } else { + if (!chain) { + NL_SET_ERR_MSG(extack, "Cannot find specified filter chain"); + return -EINVAL; + } + tcf_chain_hold(chain); + } + + switch (n->nlmsg_type) { + case RTM_NEWCHAIN: + /* In case the chain was successfully added, take a reference + * to the chain. This ensures that an empty chain + * does not disappear at the end of this function. + */ + tcf_chain_hold(chain); + chain->explicitly_created = true; + tc_chain_notify(chain, NULL, 0, NLM_F_CREATE | NLM_F_EXCL, + RTM_NEWCHAIN, false); + break; + case RTM_DELCHAIN: + /* Flush the chain first as the user requested chain removal. */ + tcf_chain_flush(chain); + /* In case the chain was successfully deleted, put a reference + * to the chain previously taken during addition. + */ + tcf_chain_put_explicitly_created(chain); + break; + case RTM_GETCHAIN: + err = tc_chain_notify(chain, skb, n->nlmsg_seq, + n->nlmsg_seq, n->nlmsg_type, true); + if (err < 0) + NL_SET_ERR_MSG(extack, "Failed to send chain notify message"); + break; + default: + err = -EOPNOTSUPP; + NL_SET_ERR_MSG(extack, "Unsupported message type"); + goto errout; + } + +errout: + tcf_chain_put(chain); + if (err == -EAGAIN) + /* Replay the request.
*/ + goto replay; + return err; +} + +/* called with RTNL */ +static int tc_dump_chain(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct net *net = sock_net(skb->sk); + struct nlattr *tca[TCA_MAX + 1]; + struct Qdisc *q = NULL; + struct tcf_block *block; + struct tcf_chain *chain; + struct tcmsg *tcm = nlmsg_data(cb->nlh); + long index_start; + long index; + u32 parent; + int err; + + if (nlmsg_len(cb->nlh) < sizeof(*tcm)) + return skb->len; + + err = nlmsg_parse(cb->nlh, sizeof(*tcm), tca, TCA_MAX, NULL, NULL); + if (err) + return err; + + if (tcm->tcm_ifindex == TCM_IFINDEX_MAGIC_BLOCK) { + block = tcf_block_lookup(net, tcm->tcm_block_index); + if (!block) + goto out; + /* If we work with block index, q is NULL and parent value + * will never be used in the following code. The check + * in tcf_fill_node prevents it. However, compiler does not + * see that far, so set parent to zero to silence the warning + * about parent being uninitialized. + */ + parent = 0; + } else { + const struct Qdisc_class_ops *cops; + struct net_device *dev; + unsigned long cl = 0; + + dev = __dev_get_by_index(net, tcm->tcm_ifindex); + if (!dev) + return skb->len; + + parent = tcm->tcm_parent; + if (!parent) { + q = dev->qdisc; + parent = q->handle; + } else { + q = qdisc_lookup(dev, TC_H_MAJ(tcm->tcm_parent)); + } + if (!q) + goto out; + cops = q->ops->cl_ops; + if (!cops) + goto out; + if (!cops->tcf_block) + goto out; + if (TC_H_MIN(tcm->tcm_parent)) { + cl = cops->find(q, tcm->tcm_parent); + if (cl == 0) + goto out; + } + block = cops->tcf_block(q, cl, NULL); + if (!block) + goto out; + if (tcf_block_shared(block)) + q = NULL; + } + + index_start = cb->args[0]; + index = 0; + + list_for_each_entry(chain, &block->chain_list, list) { + if ((tca[TCA_CHAIN] && + nla_get_u32(tca[TCA_CHAIN]) != chain->index)) + continue; + if (index < index_start) { + index++; + continue; + } + err = tc_chain_fill_node(chain, net, skb, block, + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, NLM_F_MULTI, + RTM_NEWCHAIN); + if (err <= 0) + break; + index++; + } + + cb->args[0] = index; + +out: + /* If we did no progress, the error (EMSGSIZE) is real */ + if (skb->len == 0 && err) + return err; + return skb->len; +} + void tcf_exts_destroy(struct tcf_exts *exts) { #ifdef CONFIG_NET_CLS_ACT @@ -1825,6 +2113,10 @@ static int __init tc_filter_init(void) rtnl_register(PF_UNSPEC, RTM_DELTFILTER, tc_del_tfilter, NULL, 0); rtnl_register(PF_UNSPEC, RTM_GETTFILTER, tc_get_tfilter, tc_dump_tfilter, 0); + rtnl_register(PF_UNSPEC, RTM_NEWCHAIN, tc_ctl_chain, NULL, 0); + rtnl_register(PF_UNSPEC, RTM_DELCHAIN, tc_ctl_chain, NULL, 0); + rtnl_register(PF_UNSPEC, RTM_GETCHAIN, tc_ctl_chain, + tc_dump_chain, 0); return 0; diff --git a/security/selinux/nlmsgtab.c b/security/selinux/nlmsgtab.c index 7b7433a1a34c..74b951f55608 100644 --- a/security/selinux/nlmsgtab.c +++ b/security/selinux/nlmsgtab.c @@ -159,7 +159,7 @@ int selinux_nlmsg_lookup(u16 sclass, u16 nlmsg_type, u32 *perm) switch (sclass) { case SECCLASS_NETLINK_ROUTE_SOCKET: /* RTM_MAX always point to RTM_SETxxxx, ie RTM_NEWxxx + 3 */ - BUILD_BUG_ON(RTM_MAX != (RTM_NEWCACHEREPORT + 3)); + BUILD_BUG_ON(RTM_MAX != (RTM_NEWCHAIN + 3)); err = nlmsg_perm(nlmsg_type, perm, nlmsg_route_perms, sizeof(nlmsg_route_perms)); break; -- cgit v1.2.3 From 9f407f1768d3e1a5ddd7bd49fa4d1f5a26e10ed2 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Mon, 23 Jul 2018 09:23:07 +0200 Subject: net: sched: introduce chain templates Allow user to set a template for newly created chains. 
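Conceptually, a template publishes a mask up front, and every filter later added to the chain must keep its own mask within that template mask; cls_flower (further down in this series) enforces this with a word-wise check that the filter mask sets no bit the template mask leaves clear. A standalone sketch of that subset test, reduced to a single 64-bit word for brevity (illustrative only, not the kernel code):

#include <stdint.h>
#include <stdio.h>

/* A filter mask "fits" a template mask iff it matches no bit that the
 * template leaves unconstrained: (~template & filter) must be empty. */
static int mask_fits_tmplt(uint64_t tmplt_mask, uint64_t filter_mask)
{
	return (~tmplt_mask & filter_mask) == 0;
}

int main(void)
{
	uint64_t tmplt = 0xffff0000u;	/* the template matches on these bits only */

	printf("%d\n", mask_fits_tmplt(tmplt, 0x00ff0000u));	/* 1: within template */
	printf("%d\n", mask_fits_tmplt(tmplt, 0x000000ffu));	/* 0: outside template */
	return 0;
}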
Templates lock down the chain for particular classifier type/options combinations. The classifier needs to support templates; otherwise, the kernel replies with an error. Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- include/net/sch_generic.h | 12 +++++++++ net/sched/cls_api.c | 65 +++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 77 insertions(+) (limited to 'net/sched') diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index 81ec8276db9c..085c509c8674 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -238,6 +238,8 @@ struct tcf_result { }; }; +struct tcf_chain; + struct tcf_proto_ops { struct list_head head; char kind[IFNAMSIZ]; @@ -263,10 +265,18 @@ struct tcf_proto_ops { tc_setup_cb_t *cb, void *cb_priv, struct netlink_ext_ack *extack); void (*bind_class)(void *, u32, unsigned long); + void * (*tmplt_create)(struct net *net, + struct tcf_chain *chain, + struct nlattr **tca, + struct netlink_ext_ack *extack); + void (*tmplt_destroy)(void *tmplt_priv); /* rtnetlink specific */ int (*dump)(struct net*, struct tcf_proto*, void *, struct sk_buff *skb, struct tcmsg*); + int (*tmplt_dump)(struct sk_buff *skb, + struct net *net, + void *tmplt_priv); struct module *owner; }; @@ -305,6 +315,8 @@ struct tcf_chain { u32 index; /* chain index */ unsigned int refcnt; bool explicitly_created; + const struct tcf_proto_ops *tmplt_ops; + void *tmplt_priv; }; struct tcf_block { diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index e65b390336aa..5f7098b5405e 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -298,10 +298,13 @@ struct tcf_chain *tcf_chain_get(struct tcf_block *block, u32 chain_index, } EXPORT_SYMBOL(tcf_chain_get); +static void tc_chain_tmplt_del(struct tcf_chain *chain); + void tcf_chain_put(struct tcf_chain *chain) { if (--chain->refcnt == 0) { tc_chain_notify(chain, NULL, 0, 0, RTM_DELCHAIN, false); + tc_chain_tmplt_del(chain); tcf_chain_destroy(chain); } } @@ -1258,6 +1261,12 @@ replay: goto errout; } + if (chain->tmplt_ops && chain->tmplt_ops != tp->ops) { + NL_SET_ERR_MSG(extack, "Chain template is set to a different filter kind"); + err = -EINVAL; + goto errout; + } + err = tp->ops->change(net, skb, tp, cl, t->tcm_handle, tca, &fh, n->nlmsg_flags & NLM_F_CREATE ? TCA_ACT_NOREPLACE : TCA_ACT_REPLACE, extack); @@ -1644,8 +1653,13 @@ static int tc_chain_fill_node(struct tcf_chain *chain, struct net *net, u32 portid, u32 seq, u16 flags, int event) { unsigned char *b = skb_tail_pointer(skb); + const struct tcf_proto_ops *ops; struct nlmsghdr *nlh; struct tcmsg *tcm; + void *priv; + + ops = chain->tmplt_ops; + priv = chain->tmplt_priv; nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags); if (!nlh) @@ -1666,6 +1680,13 @@ static int tc_chain_fill_node(struct tcf_chain *chain, struct net *net, if (nla_put_u32(skb, TCA_CHAIN, chain->index)) goto nla_put_failure; + if (ops) { + if (nla_put_string(skb, TCA_KIND, ops->kind)) + goto nla_put_failure; + if (ops->tmplt_dump(skb, net, priv) < 0) + goto nla_put_failure; + } + nlh->nlmsg_len = skb_tail_pointer(skb) - b; return skb->len; @@ -1699,6 +1720,47 @@ static int tc_chain_notify(struct tcf_chain *chain, struct sk_buff *oskb, return rtnetlink_send(skb, net, portid, RTNLGRP_TC, flags & NLM_F_ECHO); } +static int tc_chain_tmplt_add(struct tcf_chain *chain, struct net *net, + struct nlattr **tca, + struct netlink_ext_ack *extack) +{ + const struct tcf_proto_ops *ops; + void *tmplt_priv; + + /* If kind is not set, user did not specify template.
*/ + if (!tca[TCA_KIND]) + return 0; + + ops = tcf_proto_lookup_ops(nla_data(tca[TCA_KIND]), extack); + if (IS_ERR(ops)) + return PTR_ERR(ops); + if (!ops->tmplt_create || !ops->tmplt_destroy || !ops->tmplt_dump) { + NL_SET_ERR_MSG(extack, "Chain templates are not supported with specified classifier"); + return -EOPNOTSUPP; + } + + tmplt_priv = ops->tmplt_create(net, chain, tca, extack); + if (IS_ERR(tmplt_priv)) { + module_put(ops->owner); + return PTR_ERR(tmplt_priv); + } + chain->tmplt_ops = ops; + chain->tmplt_priv = tmplt_priv; + return 0; +} + +static void tc_chain_tmplt_del(struct tcf_chain *chain) +{ + const struct tcf_proto_ops *ops = chain->tmplt_ops; + + /* If template ops are set, no work to do for us. */ + if (!ops) + return; + + ops->tmplt_destroy(chain->tmplt_priv); + module_put(ops->owner); +} + /* Add/delete/get a chain */ static int tc_ctl_chain(struct sk_buff *skb, struct nlmsghdr *n, @@ -1763,6 +1825,9 @@ replay: switch (n->nlmsg_type) { case RTM_NEWCHAIN: + err = tc_chain_tmplt_add(chain, net, tca, extack); + if (err) + goto errout; /* In case the chain was successfully added, take a reference * to the chain. This ensures that an empty chain * does not disappear at the end of this function. -- cgit v1.2.3 From f5749081f0d48ae585233232df6cfc4c7c9642f9 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Mon, 23 Jul 2018 09:23:08 +0200 Subject: net: sched: cls_flower: move key/mask dumping into a separate function Push key/mask dumping from fl_dump() into a separate function fl_dump_key(), that will be reused for template dumping. Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- net/sched/cls_flower.c | 62 ++++++++++++++++++++++++++++++-------------------- 1 file changed, 37 insertions(+), 25 deletions(-) (limited to 'net/sched') diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index 38d74803e2df..ab10a7c88359 100644 --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@ -1296,29 +1296,9 @@ static int fl_dump_key_flags(struct sk_buff *skb, u32 flags_key, u32 flags_mask) return nla_put(skb, TCA_FLOWER_KEY_FLAGS_MASK, 4, &_mask); } -static int fl_dump(struct net *net, struct tcf_proto *tp, void *fh, - struct sk_buff *skb, struct tcmsg *t) +static int fl_dump_key(struct sk_buff *skb, struct net *net, + struct fl_flow_key *key, struct fl_flow_key *mask) { - struct cls_fl_filter *f = fh; - struct nlattr *nest; - struct fl_flow_key *key, *mask; - - if (!f) - return skb->len; - - t->tcm_handle = f->handle; - - nest = nla_nest_start(skb, TCA_OPTIONS); - if (!nest) - goto nla_put_failure; - - if (f->res.classid && - nla_put_u32(skb, TCA_FLOWER_CLASSID, f->res.classid)) - goto nla_put_failure; - - key = &f->key; - mask = &f->mask->key; - if (mask->indev_ifindex) { struct net_device *dev; @@ -1327,9 +1307,6 @@ static int fl_dump(struct net *net, struct tcf_proto *tp, void *fh, goto nla_put_failure; } - if (!tc_skip_hw(f->flags)) - fl_hw_update_stats(tp, f); - if (fl_dump_key_val(skb, key->eth.dst, TCA_FLOWER_KEY_ETH_DST, mask->eth.dst, TCA_FLOWER_KEY_ETH_DST_MASK, sizeof(key->eth.dst)) || @@ -1505,6 +1482,41 @@ static int fl_dump(struct net *net, struct tcf_proto *tp, void *fh, if (fl_dump_key_flags(skb, key->control.flags, mask->control.flags)) goto nla_put_failure; + return 0; + +nla_put_failure: + return -EMSGSIZE; +} + +static int fl_dump(struct net *net, struct tcf_proto *tp, void *fh, + struct sk_buff *skb, struct tcmsg *t) +{ + struct cls_fl_filter *f = fh; + struct nlattr *nest; + struct fl_flow_key *key, *mask; + + if (!f) + return skb->len; + + 
t->tcm_handle = f->handle; + + nest = nla_nest_start(skb, TCA_OPTIONS); + if (!nest) + goto nla_put_failure; + + if (f->res.classid && + nla_put_u32(skb, TCA_FLOWER_CLASSID, f->res.classid)) + goto nla_put_failure; + + key = &f->key; + mask = &f->mask->key; + + if (fl_dump_key(skb, net, key, mask)) + goto nla_put_failure; + + if (!tc_skip_hw(f->flags)) + fl_hw_update_stats(tp, f); + if (f->flags && nla_put_u32(skb, TCA_FLOWER_FLAGS, f->flags)) goto nla_put_failure; -- cgit v1.2.3 From 33fb5cba11ff639c32f4f0104b04b2415fcd9ecc Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Mon, 23 Jul 2018 09:23:09 +0200 Subject: net: sched: cls_flower: change fl_init_dissector to accept mask and dissector This function is going to be used for templates as well, so we need to pass the pointer separately. Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- net/sched/cls_flower.c | 43 ++++++++++++++++++++++--------------------- 1 file changed, 22 insertions(+), 21 deletions(-) (limited to 'net/sched') diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index ab10a7c88359..bb7aa1e9d281 100644 --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@ -826,51 +826,52 @@ static int fl_init_mask_hashtable(struct fl_flow_mask *mask) FL_KEY_SET(keys, cnt, id, member); \ } while(0); -static void fl_init_dissector(struct fl_flow_mask *mask) +static void fl_init_dissector(struct flow_dissector *dissector, + struct fl_flow_key *mask) { struct flow_dissector_key keys[FLOW_DISSECTOR_KEY_MAX]; size_t cnt = 0; FL_KEY_SET(keys, cnt, FLOW_DISSECTOR_KEY_CONTROL, control); FL_KEY_SET(keys, cnt, FLOW_DISSECTOR_KEY_BASIC, basic); - FL_KEY_SET_IF_MASKED(&mask->key, keys, cnt, + FL_KEY_SET_IF_MASKED(mask, keys, cnt, FLOW_DISSECTOR_KEY_ETH_ADDRS, eth); - FL_KEY_SET_IF_MASKED(&mask->key, keys, cnt, + FL_KEY_SET_IF_MASKED(mask, keys, cnt, FLOW_DISSECTOR_KEY_IPV4_ADDRS, ipv4); - FL_KEY_SET_IF_MASKED(&mask->key, keys, cnt, + FL_KEY_SET_IF_MASKED(mask, keys, cnt, FLOW_DISSECTOR_KEY_IPV6_ADDRS, ipv6); - FL_KEY_SET_IF_MASKED(&mask->key, keys, cnt, + FL_KEY_SET_IF_MASKED(mask, keys, cnt, FLOW_DISSECTOR_KEY_PORTS, tp); - FL_KEY_SET_IF_MASKED(&mask->key, keys, cnt, + FL_KEY_SET_IF_MASKED(mask, keys, cnt, FLOW_DISSECTOR_KEY_IP, ip); - FL_KEY_SET_IF_MASKED(&mask->key, keys, cnt, + FL_KEY_SET_IF_MASKED(mask, keys, cnt, FLOW_DISSECTOR_KEY_TCP, tcp); - FL_KEY_SET_IF_MASKED(&mask->key, keys, cnt, + FL_KEY_SET_IF_MASKED(mask, keys, cnt, FLOW_DISSECTOR_KEY_ICMP, icmp); - FL_KEY_SET_IF_MASKED(&mask->key, keys, cnt, + FL_KEY_SET_IF_MASKED(mask, keys, cnt, FLOW_DISSECTOR_KEY_ARP, arp); - FL_KEY_SET_IF_MASKED(&mask->key, keys, cnt, + FL_KEY_SET_IF_MASKED(mask, keys, cnt, FLOW_DISSECTOR_KEY_MPLS, mpls); - FL_KEY_SET_IF_MASKED(&mask->key, keys, cnt, + FL_KEY_SET_IF_MASKED(mask, keys, cnt, FLOW_DISSECTOR_KEY_VLAN, vlan); - FL_KEY_SET_IF_MASKED(&mask->key, keys, cnt, + FL_KEY_SET_IF_MASKED(mask, keys, cnt, FLOW_DISSECTOR_KEY_CVLAN, cvlan); - FL_KEY_SET_IF_MASKED(&mask->key, keys, cnt, + FL_KEY_SET_IF_MASKED(mask, keys, cnt, FLOW_DISSECTOR_KEY_ENC_KEYID, enc_key_id); - FL_KEY_SET_IF_MASKED(&mask->key, keys, cnt, + FL_KEY_SET_IF_MASKED(mask, keys, cnt, FLOW_DISSECTOR_KEY_ENC_IPV4_ADDRS, enc_ipv4); - FL_KEY_SET_IF_MASKED(&mask->key, keys, cnt, + FL_KEY_SET_IF_MASKED(mask, keys, cnt, FLOW_DISSECTOR_KEY_ENC_IPV6_ADDRS, enc_ipv6); - if (FL_KEY_IS_MASKED(&mask->key, enc_ipv4) || - FL_KEY_IS_MASKED(&mask->key, enc_ipv6)) + if (FL_KEY_IS_MASKED(mask, enc_ipv4) || + FL_KEY_IS_MASKED(mask, enc_ipv6)) FL_KEY_SET(keys, cnt, 
FLOW_DISSECTOR_KEY_ENC_CONTROL, enc_control); - FL_KEY_SET_IF_MASKED(&mask->key, keys, cnt, + FL_KEY_SET_IF_MASKED(mask, keys, cnt, FLOW_DISSECTOR_KEY_ENC_PORTS, enc_tp); - FL_KEY_SET_IF_MASKED(&mask->key, keys, cnt, + FL_KEY_SET_IF_MASKED(mask, keys, cnt, FLOW_DISSECTOR_KEY_ENC_IP, enc_ip); - skb_flow_dissector_init(&mask->dissector, keys, cnt); + skb_flow_dissector_init(dissector, keys, cnt); } static struct fl_flow_mask *fl_create_new_mask(struct cls_fl_head *head, @@ -889,7 +890,7 @@ static struct fl_flow_mask *fl_create_new_mask(struct cls_fl_head *head, if (err) goto errout_free; - fl_init_dissector(newmask); + fl_init_dissector(&newmask->dissector, &newmask->key); INIT_LIST_HEAD_RCU(&newmask->filters); -- cgit v1.2.3 From b95ec7eb3b4d2f158dd15c912cf670b546f09571 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Mon, 23 Jul 2018 09:23:10 +0200 Subject: net: sched: cls_flower: implement chain templates Use the previously introduced template extension and implement callback to create, destroy and dump chain template. The existing parsing and dumping functions are re-used. Also, check if newly added filters fit the template if it is set. Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- net/sched/cls_flower.c | 106 ++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 105 insertions(+), 1 deletion(-) (limited to 'net/sched') diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index bb7aa1e9d281..f0c80758a594 100644 --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@ -72,6 +72,13 @@ struct fl_flow_mask { struct list_head list; }; +struct fl_flow_tmplt { + struct fl_flow_key dummy_key; + struct fl_flow_key mask; + struct flow_dissector dissector; + struct tcf_chain *chain; +}; + struct cls_fl_head { struct rhashtable ht; struct list_head masks; @@ -147,6 +154,23 @@ static void fl_set_masked_key(struct fl_flow_key *mkey, struct fl_flow_key *key, *lmkey++ = *lkey++ & *lmask++; } +static bool fl_mask_fits_tmplt(struct fl_flow_tmplt *tmplt, + struct fl_flow_mask *mask) +{ + const long *lmask = fl_key_get_start(&mask->key, mask); + const long *ltmplt; + int i; + + if (!tmplt) + return true; + ltmplt = fl_key_get_start(&tmplt->mask, mask); + for (i = 0; i < fl_mask_range(mask); i += sizeof(long)) { + if (~*ltmplt++ & *lmask++) + return false; + } + return true; +} + static void fl_clear_masked_range(struct fl_flow_key *key, struct fl_flow_mask *mask) { @@ -939,6 +963,7 @@ static int fl_set_parms(struct net *net, struct tcf_proto *tp, struct cls_fl_filter *f, struct fl_flow_mask *mask, unsigned long base, struct nlattr **tb, struct nlattr *est, bool ovr, + struct fl_flow_tmplt *tmplt, struct netlink_ext_ack *extack) { int err; @@ -959,6 +984,11 @@ static int fl_set_parms(struct net *net, struct tcf_proto *tp, fl_mask_update_range(mask); fl_set_masked_key(&f->mkey, &f->key, mask); + if (!fl_mask_fits_tmplt(tmplt, mask)) { + NL_SET_ERR_MSG_MOD(extack, "Mask does not fit the template"); + return -EINVAL; + } + return 0; } @@ -1024,7 +1054,7 @@ static int fl_change(struct net *net, struct sk_buff *in_skb, } err = fl_set_parms(net, tp, fnew, &mask, base, tb, tca[TCA_RATE], ovr, - extack); + tp->chain->tmplt_priv, extack); if (err) goto errout_idr; @@ -1164,6 +1194,52 @@ static int fl_reoffload(struct tcf_proto *tp, bool add, tc_setup_cb_t *cb, return 0; } +static void *fl_tmplt_create(struct net *net, struct tcf_chain *chain, + struct nlattr **tca, + struct netlink_ext_ack *extack) +{ + struct fl_flow_tmplt *tmplt; + struct nlattr **tb; + int err; + + if 
(!tca[TCA_OPTIONS]) + return ERR_PTR(-EINVAL); + + tb = kcalloc(TCA_FLOWER_MAX + 1, sizeof(struct nlattr *), GFP_KERNEL); + if (!tb) + return ERR_PTR(-ENOBUFS); + err = nla_parse_nested(tb, TCA_FLOWER_MAX, tca[TCA_OPTIONS], + fl_policy, NULL); + if (err) + goto errout_tb; + + tmplt = kzalloc(sizeof(*tmplt), GFP_KERNEL); + if (!tmplt) + goto errout_tb; + tmplt->chain = chain; + err = fl_set_key(net, tb, &tmplt->dummy_key, &tmplt->mask, extack); + if (err) + goto errout_tmplt; + kfree(tb); + + fl_init_dissector(&tmplt->dissector, &tmplt->mask); + + return tmplt; + +errout_tmplt: + kfree(tmplt); +errout_tb: + kfree(tb); + return ERR_PTR(err); +} + +static void fl_tmplt_destroy(void *tmplt_priv) +{ + struct fl_flow_tmplt *tmplt = tmplt_priv; + + kfree(tmplt); +} + static int fl_dump_key_val(struct sk_buff *skb, void *val, int val_type, void *mask, int mask_type, int len) @@ -1536,6 +1612,31 @@ nla_put_failure: return -1; } +static int fl_tmplt_dump(struct sk_buff *skb, struct net *net, void *tmplt_priv) +{ + struct fl_flow_tmplt *tmplt = tmplt_priv; + struct fl_flow_key *key, *mask; + struct nlattr *nest; + + nest = nla_nest_start(skb, TCA_OPTIONS); + if (!nest) + goto nla_put_failure; + + key = &tmplt->dummy_key; + mask = &tmplt->mask; + + if (fl_dump_key(skb, net, key, mask)) + goto nla_put_failure; + + nla_nest_end(skb, nest); + + return skb->len; + +nla_put_failure: + nla_nest_cancel(skb, nest); + return -EMSGSIZE; +} + static void fl_bind_class(void *fh, u32 classid, unsigned long cl) { struct cls_fl_filter *f = fh; @@ -1556,6 +1657,9 @@ static struct tcf_proto_ops cls_fl_ops __read_mostly = { .reoffload = fl_reoffload, .dump = fl_dump, .bind_class = fl_bind_class, + .tmplt_create = fl_tmplt_create, + .tmplt_destroy = fl_tmplt_destroy, + .tmplt_dump = fl_tmplt_dump, .owner = THIS_MODULE, }; -- cgit v1.2.3 From 34738452739069947e528123810533f28dd8332b Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Mon, 23 Jul 2018 09:23:11 +0200 Subject: net: sched: cls_flower: propagate chain template creation and destruction to drivers Introduce a couple of flower offload commands in order to propagate template creation/destruction events down to device drivers. Drivers may use this information to prepare HW in an optimal way for future filter insertions. Signed-off-by: Jiri Pirko Signed-off-by: David S.
Miller --- include/net/pkt_cls.h | 2 ++ net/sched/cls_flower.c | 39 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 41 insertions(+) (limited to 'net/sched') diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h index 4f405ca8346f..a3101582f642 100644 --- a/include/net/pkt_cls.h +++ b/include/net/pkt_cls.h @@ -721,6 +721,8 @@ enum tc_fl_command { TC_CLSFLOWER_REPLACE, TC_CLSFLOWER_DESTROY, TC_CLSFLOWER_STATS, + TC_CLSFLOWER_TMPLT_CREATE, + TC_CLSFLOWER_TMPLT_DESTROY, }; struct tc_cls_flower_offload { diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index f0c80758a594..6ccf60364297 100644 --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@ -1194,6 +1194,42 @@ static int fl_reoffload(struct tcf_proto *tp, bool add, tc_setup_cb_t *cb, return 0; } +static void fl_hw_create_tmplt(struct tcf_chain *chain, + struct fl_flow_tmplt *tmplt) +{ + struct tc_cls_flower_offload cls_flower = {}; + struct tcf_block *block = chain->block; + struct tcf_exts dummy_exts = { 0, }; + + cls_flower.common.chain_index = chain->index; + cls_flower.command = TC_CLSFLOWER_TMPLT_CREATE; + cls_flower.cookie = (unsigned long) tmplt; + cls_flower.dissector = &tmplt->dissector; + cls_flower.mask = &tmplt->mask; + cls_flower.key = &tmplt->dummy_key; + cls_flower.exts = &dummy_exts; + + /* We don't care if driver (any of them) fails to handle this + * call. It serves just as a hint for it. + */ + tc_setup_cb_call(block, NULL, TC_SETUP_CLSFLOWER, + &cls_flower, false); +} + +static void fl_hw_destroy_tmplt(struct tcf_chain *chain, + struct fl_flow_tmplt *tmplt) +{ + struct tc_cls_flower_offload cls_flower = {}; + struct tcf_block *block = chain->block; + + cls_flower.common.chain_index = chain->index; + cls_flower.command = TC_CLSFLOWER_TMPLT_DESTROY; + cls_flower.cookie = (unsigned long) tmplt; + + tc_setup_cb_call(block, NULL, TC_SETUP_CLSFLOWER, + &cls_flower, false); +} + static void *fl_tmplt_create(struct net *net, struct tcf_chain *chain, struct nlattr **tca, struct netlink_ext_ack *extack) @@ -1224,6 +1260,8 @@ static void *fl_tmplt_create(struct net *net, struct tcf_chain *chain, fl_init_dissector(&tmplt->dissector, &tmplt->mask); + fl_hw_create_tmplt(chain, tmplt); + return tmplt; errout_tmplt: @@ -1237,6 +1275,7 @@ static void fl_tmplt_destroy(void *tmplt_priv) { struct fl_flow_tmplt *tmplt = tmplt_priv; + fl_hw_destroy_tmplt(tmplt->chain, tmplt); kfree(tmplt); } -- cgit v1.2.3 From 50f699b1f8462959482251a6cd1b7bc6bbd20796 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Tue, 24 Jul 2018 12:29:01 -0700 Subject: sched: fix trailing whitespace Remove trailing whitespace and blank lines at EOF Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- net/sched/Kconfig | 4 ++-- net/sched/Makefile | 2 +- net/sched/act_connmark.c | 1 - net/sched/act_pedit.c | 1 - net/sched/cls_basic.c | 1 - 5 files changed, 3 insertions(+), 6 deletions(-) (limited to 'net/sched') diff --git a/net/sched/Kconfig b/net/sched/Kconfig index 7af246764a35..bba71225adbd 100644 --- a/net/sched/Kconfig +++ b/net/sched/Kconfig @@ -1,6 +1,6 @@ # # Traffic control configuration. -# +# menuconfig NET_SCHED bool "QoS and/or fair queueing" @@ -706,7 +706,7 @@ config NET_CLS_ACT config NET_ACT_POLICE tristate "Traffic Policing" - depends on NET_CLS_ACT + depends on NET_CLS_ACT ---help--- Say Y here if you want to do traffic policing, i.e. strict bandwidth limiting. 
This action replaces the existing policing diff --git a/net/sched/Makefile b/net/sched/Makefile index 673ee7d26ff2..910ec7463a36 100644 --- a/net/sched/Makefile +++ b/net/sched/Makefile @@ -33,7 +33,7 @@ obj-$(CONFIG_NET_SCH_HTB) += sch_htb.o obj-$(CONFIG_NET_SCH_HFSC) += sch_hfsc.o obj-$(CONFIG_NET_SCH_RED) += sch_red.o obj-$(CONFIG_NET_SCH_GRED) += sch_gred.o -obj-$(CONFIG_NET_SCH_INGRESS) += sch_ingress.o +obj-$(CONFIG_NET_SCH_INGRESS) += sch_ingress.o obj-$(CONFIG_NET_SCH_DSMARK) += sch_dsmark.o obj-$(CONFIG_NET_SCH_SFB) += sch_sfb.o obj-$(CONFIG_NET_SCH_SFQ) += sch_sfq.o diff --git a/net/sched/act_connmark.c b/net/sched/act_connmark.c index 1e31f0e448e2..2f9bc833d046 100644 --- a/net/sched/act_connmark.c +++ b/net/sched/act_connmark.c @@ -252,4 +252,3 @@ module_exit(connmark_cleanup_module); MODULE_AUTHOR("Felix Fietkau "); MODULE_DESCRIPTION("Connection tracking mark restoring"); MODULE_LICENSE("GPL"); - diff --git a/net/sched/act_pedit.c b/net/sched/act_pedit.c index cc8ffcd1ddb5..9ab5d81aff1a 100644 --- a/net/sched/act_pedit.c +++ b/net/sched/act_pedit.c @@ -516,4 +516,3 @@ static void __exit pedit_cleanup_module(void) module_init(pedit_init_module); module_exit(pedit_cleanup_module); - diff --git a/net/sched/cls_basic.c b/net/sched/cls_basic.c index 95367f37098d..6a5dce8baf19 100644 --- a/net/sched/cls_basic.c +++ b/net/sched/cls_basic.c @@ -324,4 +324,3 @@ static void __exit exit_basic(void) module_init(init_basic) module_exit(exit_basic) MODULE_LICENSE("GPL"); - -- cgit v1.2.3 From aea5f654e6b78a0c976f7a25950155932c77a53f Mon Sep 17 00:00:00 2001 From: Nishanth Devarajan Date: Mon, 23 Jul 2018 19:37:41 +0530 Subject: net/sched: add skbprio scheduler Skbprio (SKB Priority Queue) is a queueing discipline that prioritizes packets according to their skb->priority field. Under congestion, already-enqueued lower priority packets will be dropped to make space available for higher priority packets. Skbprio was conceived as a solution for denial-of-service defenses that need to route packets with different priorities as a means to overcome DoS attacks. v5 *Do not reference qdisc_dev(sch)->tx_queue_len for setting limit. Instead set default sch->limit to 64. v4 *Drop Documentation/networking/sch_skbprio.txt doc file to move it to tc man page for Skbprio, in iproute2. v3 *Drop max_limit parameter in struct skbprio_sched_data and instead use sch->limit. *Reference qdisc_dev(sch)->tx_queue_len only once, during initialisation for qdisc (previously being referenced every time qdisc changes). *Move qdisc's detailed description from in-code to Documentation/networking. *When qdisc is saturated, enqueue incoming packet first before dequeueing lowest priority packet in queue - improves usage of call stack registers. *Introduce and use overlimit stat to keep track of number of dropped packets. v2 *Use skb->priority field rather than DS field. Rename queueing discipline as SKB Priority Queue (previously Gatekeeper Priority Queue). *Queueing discipline is made classful to expose Skbprio's internal priority queues. Signed-off-by: Nishanth Devarajan Reviewed-by: Sachin Paryani Reviewed-by: Cody Doucette Reviewed-by: Michel Machado Acked-by: Cong Wang Signed-off-by: David S. 
Miller --- include/uapi/linux/pkt_sched.h | 15 ++ net/sched/Kconfig | 13 ++ net/sched/Makefile | 1 + net/sched/sch_skbprio.c | 320 +++++++++++++++++++++++++++++++++++++++++ 4 files changed, 349 insertions(+) create mode 100644 net/sched/sch_skbprio.c (limited to 'net/sched') diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h index d9cc9dc4f547..8975fd1a1421 100644 --- a/include/uapi/linux/pkt_sched.h +++ b/include/uapi/linux/pkt_sched.h @@ -124,6 +124,21 @@ struct tc_fifo_qopt { __u32 limit; /* Queue length: bytes for bfifo, packets for pfifo */ }; +/* SKBPRIO section */ + +/* + * Priorities go from zero to (SKBPRIO_MAX_PRIORITY - 1). + * SKBPRIO_MAX_PRIORITY should be at least 64 in order for skbprio to be able + * to map one to one the DS field of IPV4 and IPV6 headers. + * Memory allocation grows linearly with SKBPRIO_MAX_PRIORITY. + */ + +#define SKBPRIO_MAX_PRIORITY 64 + +struct tc_skbprio_qopt { + __u32 limit; /* Queue length in packets. */ +}; + /* PRIO section */ #define TCQ_PRIO_BANDS 16 diff --git a/net/sched/Kconfig b/net/sched/Kconfig index bba71225adbd..e95741388311 100644 --- a/net/sched/Kconfig +++ b/net/sched/Kconfig @@ -251,6 +251,19 @@ config NET_SCH_MQPRIO If unsure, say N. +config NET_SCH_SKBPRIO + tristate "SKB priority queue scheduler (SKBPRIO)" + help + Say Y here if you want to use the SKB priority queue + scheduler. This schedules packets according to skb->priority, + which is useful for request packets in DoS mitigation systems such + as Gatekeeper. + + To compile this driver as a module, choose M here: the module will + be called sch_skbprio. + + If unsure, say N. + config NET_SCH_CHOKE tristate "CHOose and Keep responsive flow scheduler (CHOKE)" help diff --git a/net/sched/Makefile b/net/sched/Makefile index 910ec7463a36..f0403f49edcb 100644 --- a/net/sched/Makefile +++ b/net/sched/Makefile @@ -46,6 +46,7 @@ obj-$(CONFIG_NET_SCH_NETEM) += sch_netem.o obj-$(CONFIG_NET_SCH_DRR) += sch_drr.o obj-$(CONFIG_NET_SCH_PLUG) += sch_plug.o obj-$(CONFIG_NET_SCH_MQPRIO) += sch_mqprio.o +obj-$(CONFIG_NET_SCH_SKBPRIO) += sch_skbprio.o obj-$(CONFIG_NET_SCH_CHOKE) += sch_choke.o obj-$(CONFIG_NET_SCH_QFQ) += sch_qfq.o obj-$(CONFIG_NET_SCH_CODEL) += sch_codel.o diff --git a/net/sched/sch_skbprio.c b/net/sched/sch_skbprio.c new file mode 100644 index 000000000000..52c0b6d8f1d7 --- /dev/null +++ b/net/sched/sch_skbprio.c @@ -0,0 +1,320 @@ +/* + * net/sched/sch_skbprio.c SKB Priority Queue. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Authors: Nishanth Devarajan, + * Cody Doucette, + * original idea by Michel Machado, Cody Doucette, and Qiaobin Fu + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* SKB Priority Queue + * ================================= + * + * Skbprio (SKB Priority Queue) is a queueing discipline that prioritizes + * packets according to their skb->priority field. Under congestion, + * Skbprio drops already-enqueued lower priority packets to make space + * available for higher priority packets; it was conceived as a solution + * for denial-of-service defenses that need to route packets with different + * priorities as a mean to overcome DoS attacks. + */ + +struct skbprio_sched_data { + /* Queue state. 
*/ + struct sk_buff_head qdiscs[SKBPRIO_MAX_PRIORITY]; + struct gnet_stats_queue qstats[SKBPRIO_MAX_PRIORITY]; + u16 highest_prio; + u16 lowest_prio; +}; + +static u16 calc_new_high_prio(const struct skbprio_sched_data *q) +{ + int prio; + + for (prio = q->highest_prio - 1; prio >= q->lowest_prio; prio--) { + if (!skb_queue_empty(&q->qdiscs[prio])) + return prio; + } + + /* SKB queue is empty, return 0 (default highest priority setting). */ + return 0; +} + +static u16 calc_new_low_prio(const struct skbprio_sched_data *q) +{ + int prio; + + for (prio = q->lowest_prio + 1; prio <= q->highest_prio; prio++) { + if (!skb_queue_empty(&q->qdiscs[prio])) + return prio; + } + + /* SKB queue is empty, return SKBPRIO_MAX_PRIORITY - 1 + * (default lowest priority setting). + */ + return SKBPRIO_MAX_PRIORITY - 1; +} + +static int skbprio_enqueue(struct sk_buff *skb, struct Qdisc *sch, + struct sk_buff **to_free) +{ + const unsigned int max_priority = SKBPRIO_MAX_PRIORITY - 1; + struct skbprio_sched_data *q = qdisc_priv(sch); + struct sk_buff_head *qdisc; + struct sk_buff_head *lp_qdisc; + struct sk_buff *to_drop; + u16 prio, lp; + + /* Obtain the priority of @skb. */ + prio = min(skb->priority, max_priority); + + qdisc = &q->qdiscs[prio]; + if (sch->q.qlen < sch->limit) { + __skb_queue_tail(qdisc, skb); + qdisc_qstats_backlog_inc(sch, skb); + q->qstats[prio].backlog += qdisc_pkt_len(skb); + + /* Check to update highest and lowest priorities. */ + if (prio > q->highest_prio) + q->highest_prio = prio; + + if (prio < q->lowest_prio) + q->lowest_prio = prio; + + sch->q.qlen++; + return NET_XMIT_SUCCESS; + } + + /* If this packet has the lowest priority, drop it. */ + lp = q->lowest_prio; + if (prio <= lp) { + q->qstats[prio].drops++; + q->qstats[prio].overlimits++; + return qdisc_drop(skb, sch, to_free); + } + + __skb_queue_tail(qdisc, skb); + qdisc_qstats_backlog_inc(sch, skb); + q->qstats[prio].backlog += qdisc_pkt_len(skb); + + /* Drop the packet at the tail of the lowest priority qdisc. */ + lp_qdisc = &q->qdiscs[lp]; + to_drop = __skb_dequeue_tail(lp_qdisc); + BUG_ON(!to_drop); + qdisc_qstats_backlog_dec(sch, to_drop); + qdisc_drop(to_drop, sch, to_free); + + q->qstats[lp].backlog -= qdisc_pkt_len(to_drop); + q->qstats[lp].drops++; + q->qstats[lp].overlimits++; + + /* Check to update highest and lowest priorities. */ + if (skb_queue_empty(lp_qdisc)) { + if (q->lowest_prio == q->highest_prio) { + /* The incoming packet is the only packet in queue. */ + BUG_ON(sch->q.qlen != 1); + q->lowest_prio = prio; + q->highest_prio = prio; + } else { + q->lowest_prio = calc_new_low_prio(q); + } + } + + if (prio > q->highest_prio) + q->highest_prio = prio; + + return NET_XMIT_CN; +} + +static struct sk_buff *skbprio_dequeue(struct Qdisc *sch) +{ + struct skbprio_sched_data *q = qdisc_priv(sch); + struct sk_buff_head *hpq = &q->qdiscs[q->highest_prio]; + struct sk_buff *skb = __skb_dequeue(hpq); + + if (unlikely(!skb)) + return NULL; + + sch->q.qlen--; + qdisc_qstats_backlog_dec(sch, skb); + qdisc_bstats_update(sch, skb); + + q->qstats[q->highest_prio].backlog -= qdisc_pkt_len(skb); + + /* Update highest priority field. 
*/ + if (skb_queue_empty(hpq)) { + if (q->lowest_prio == q->highest_prio) { + BUG_ON(sch->q.qlen); + q->highest_prio = 0; + q->lowest_prio = SKBPRIO_MAX_PRIORITY - 1; + } else { + q->highest_prio = calc_new_high_prio(q); + } + } + return skb; +} + +static int skbprio_change(struct Qdisc *sch, struct nlattr *opt, + struct netlink_ext_ack *extack) +{ + struct tc_skbprio_qopt *ctl = nla_data(opt); + + sch->limit = ctl->limit; + return 0; +} + +static int skbprio_init(struct Qdisc *sch, struct nlattr *opt, + struct netlink_ext_ack *extack) +{ + struct skbprio_sched_data *q = qdisc_priv(sch); + int prio; + + /* Initialise all queues, one for each possible priority. */ + for (prio = 0; prio < SKBPRIO_MAX_PRIORITY; prio++) + __skb_queue_head_init(&q->qdiscs[prio]); + + memset(&q->qstats, 0, sizeof(q->qstats)); + q->highest_prio = 0; + q->lowest_prio = SKBPRIO_MAX_PRIORITY - 1; + sch->limit = 64; + if (!opt) + return 0; + + return skbprio_change(sch, opt, extack); +} + +static int skbprio_dump(struct Qdisc *sch, struct sk_buff *skb) +{ + struct tc_skbprio_qopt opt; + + opt.limit = sch->limit; + + if (nla_put(skb, TCA_OPTIONS, sizeof(opt), &opt)) + return -1; + + return skb->len; +} + +static void skbprio_reset(struct Qdisc *sch) +{ + struct skbprio_sched_data *q = qdisc_priv(sch); + int prio; + + sch->qstats.backlog = 0; + sch->q.qlen = 0; + + for (prio = 0; prio < SKBPRIO_MAX_PRIORITY; prio++) + __skb_queue_purge(&q->qdiscs[prio]); + + memset(&q->qstats, 0, sizeof(q->qstats)); + q->highest_prio = 0; + q->lowest_prio = SKBPRIO_MAX_PRIORITY - 1; +} + +static void skbprio_destroy(struct Qdisc *sch) +{ + struct skbprio_sched_data *q = qdisc_priv(sch); + int prio; + + for (prio = 0; prio < SKBPRIO_MAX_PRIORITY; prio++) + __skb_queue_purge(&q->qdiscs[prio]); +} + +static struct Qdisc *skbprio_leaf(struct Qdisc *sch, unsigned long arg) +{ + return NULL; +} + +static unsigned long skbprio_find(struct Qdisc *sch, u32 classid) +{ + return 0; +} + +static int skbprio_dump_class(struct Qdisc *sch, unsigned long cl, + struct sk_buff *skb, struct tcmsg *tcm) +{ + tcm->tcm_handle |= TC_H_MIN(cl); + return 0; +} + +static int skbprio_dump_class_stats(struct Qdisc *sch, unsigned long cl, + struct gnet_dump *d) +{ + struct skbprio_sched_data *q = qdisc_priv(sch); + if (gnet_stats_copy_queue(d, NULL, &q->qstats[cl - 1], + q->qstats[cl - 1].qlen) < 0) + return -1; + return 0; +} + +static void skbprio_walk(struct Qdisc *sch, struct qdisc_walker *arg) +{ + unsigned int i; + + if (arg->stop) + return; + + for (i = 0; i < SKBPRIO_MAX_PRIORITY; i++) { + if (arg->count < arg->skip) { + arg->count++; + continue; + } + if (arg->fn(sch, i + 1, arg) < 0) { + arg->stop = 1; + break; + } + arg->count++; + } +} + +static const struct Qdisc_class_ops skbprio_class_ops = { + .leaf = skbprio_leaf, + .find = skbprio_find, + .dump = skbprio_dump_class, + .dump_stats = skbprio_dump_class_stats, + .walk = skbprio_walk, +}; + +static struct Qdisc_ops skbprio_qdisc_ops __read_mostly = { + .cl_ops = &skbprio_class_ops, + .id = "skbprio", + .priv_size = sizeof(struct skbprio_sched_data), + .enqueue = skbprio_enqueue, + .dequeue = skbprio_dequeue, + .peek = qdisc_peek_dequeued, + .init = skbprio_init, + .reset = skbprio_reset, + .change = skbprio_change, + .dump = skbprio_dump, + .destroy = skbprio_destroy, + .owner = THIS_MODULE, +}; + +static int __init skbprio_module_init(void) +{ + return register_qdisc(&skbprio_qdisc_ops); +} + +static void __exit skbprio_module_exit(void) +{ + unregister_qdisc(&skbprio_qdisc_ops); +} + 
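+
+/* Editorial sketch, not part of the original patch: one plausible way to
+ * exercise skbprio, assuming an interface named eth0, is
+ *
+ *	tc qdisc replace dev eth0 root skbprio limit 1000
+ *
+ * with applications marking their sockets so that skb->priority gets set,
+ * for instance (fd being a hypothetical connected socket):
+ *
+ *	int prio = 5;	(lands in q->qdiscs[5])
+ *	setsockopt(fd, SOL_SOCKET, SO_PRIORITY, &prio, sizeof(prio));
+ *
+ * Priorities at or above SKBPRIO_MAX_PRIORITY are clamped to
+ * SKBPRIO_MAX_PRIORITY - 1, the most preferred band, by the min() in
+ * skbprio_enqueue() above.
+ */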
+module_init(skbprio_module_init) +module_exit(skbprio_module_exit) + +MODULE_LICENSE("GPL"); -- cgit v1.2.3 From 158abbf170ecfc6d56abeddd0c66da753b3435df Mon Sep 17 00:00:00 2001 From: Jianbo Liu Date: Wed, 25 Jul 2018 02:31:25 +0000 Subject: net/sched: cls_flower: Use correct inline function for assignment of vlan tpid This fixes the following sparse warning: net/sched/cls_flower.c:1356:36: warning: incorrect type in argument 3 (different base types) net/sched/cls_flower.c:1356:36: expected unsigned short [unsigned] [usertype] value net/sched/cls_flower.c:1356:36: got restricted __be16 [usertype] vlan_tpid Signed-off-by: Jianbo Liu Reported-by: Or Gerlitz Reviewed-by: Or Gerlitz Signed-off-by: David S. Miller --- net/sched/cls_flower.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net/sched') diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index 6ccf60364297..e8bd08ba998a 100644 --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@ -1445,8 +1445,8 @@ static int fl_dump_key(struct sk_buff *skb, struct net *net, TCA_FLOWER_KEY_CVLAN_PRIO, &key->cvlan, &mask->cvlan) || (mask->cvlan.vlan_tpid && - nla_put_u16(skb, TCA_FLOWER_KEY_VLAN_ETH_TYPE, - key->cvlan.vlan_tpid))) + nla_put_be16(skb, TCA_FLOWER_KEY_VLAN_ETH_TYPE, + key->cvlan.vlan_tpid))) goto nla_put_failure; if (mask->basic.n_proto) { -- cgit v1.2.3 From 990e35ecba1cb8ebee4ad4a028735e24f4615417 Mon Sep 17 00:00:00 2001 From: Vinicius Costa Gomes Date: Mon, 23 Jul 2018 17:08:00 -0700 Subject: cbs: Add support for the graft function This will allow installing a child qdisc under cbs. The main use case is to install an ETF (Earliest TxTime First) qdisc under cbs, so there's another level of control for time-sensitive traffic. Signed-off-by: Vinicius Costa Gomes Signed-off-by: David S.
Miller --- net/sched/sch_cbs.c | 134 ++++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 125 insertions(+), 9 deletions(-) (limited to 'net/sched') diff --git a/net/sched/sch_cbs.c b/net/sched/sch_cbs.c index cdd96b9a27bc..e26a24017faa 100644 --- a/net/sched/sch_cbs.c +++ b/net/sched/sch_cbs.c @@ -78,18 +78,42 @@ struct cbs_sched_data { s64 sendslope; /* in bytes/s */ s64 idleslope; /* in bytes/s */ struct qdisc_watchdog watchdog; - int (*enqueue)(struct sk_buff *skb, struct Qdisc *sch); + int (*enqueue)(struct sk_buff *skb, struct Qdisc *sch, + struct sk_buff **to_free); struct sk_buff *(*dequeue)(struct Qdisc *sch); + struct Qdisc *qdisc; }; -static int cbs_enqueue_offload(struct sk_buff *skb, struct Qdisc *sch) +static int cbs_child_enqueue(struct sk_buff *skb, struct Qdisc *sch, + struct Qdisc *child, + struct sk_buff **to_free) { - return qdisc_enqueue_tail(skb, sch); + int err; + + err = child->ops->enqueue(skb, child, to_free); + if (err != NET_XMIT_SUCCESS) + return err; + + qdisc_qstats_backlog_inc(sch, skb); + sch->q.qlen++; + + return NET_XMIT_SUCCESS; } -static int cbs_enqueue_soft(struct sk_buff *skb, struct Qdisc *sch) +static int cbs_enqueue_offload(struct sk_buff *skb, struct Qdisc *sch, + struct sk_buff **to_free) { struct cbs_sched_data *q = qdisc_priv(sch); + struct Qdisc *qdisc = q->qdisc; + + return cbs_child_enqueue(skb, sch, qdisc, to_free); +} + +static int cbs_enqueue_soft(struct sk_buff *skb, struct Qdisc *sch, + struct sk_buff **to_free) +{ + struct cbs_sched_data *q = qdisc_priv(sch); + struct Qdisc *qdisc = q->qdisc; if (sch->q.qlen == 0 && q->credits > 0) { /* We need to stop accumulating credits when there's @@ -99,7 +123,7 @@ static int cbs_enqueue_soft(struct sk_buff *skb, struct Qdisc *sch) q->last = ktime_get_ns(); } - return qdisc_enqueue_tail(skb, sch); + return cbs_child_enqueue(skb, sch, qdisc, to_free); } static int cbs_enqueue(struct sk_buff *skb, struct Qdisc *sch, @@ -107,7 +131,7 @@ static int cbs_enqueue(struct sk_buff *skb, struct Qdisc *sch, { struct cbs_sched_data *q = qdisc_priv(sch); - return q->enqueue(skb, sch); + return q->enqueue(skb, sch, to_free); } /* timediff is in ns, slope is in bytes/s */ @@ -132,9 +156,25 @@ static s64 credits_from_len(unsigned int len, s64 slope, s64 port_rate) return div64_s64(len * slope, port_rate); } +static struct sk_buff *cbs_child_dequeue(struct Qdisc *sch, struct Qdisc *child) +{ + struct sk_buff *skb; + + skb = child->ops->dequeue(child); + if (!skb) + return NULL; + + qdisc_qstats_backlog_dec(sch, skb); + qdisc_bstats_update(sch, skb); + sch->q.qlen--; + + return skb; +} + static struct sk_buff *cbs_dequeue_soft(struct Qdisc *sch) { struct cbs_sched_data *q = qdisc_priv(sch); + struct Qdisc *qdisc = q->qdisc; s64 now = ktime_get_ns(); struct sk_buff *skb; s64 credits; @@ -157,8 +197,7 @@ static struct sk_buff *cbs_dequeue_soft(struct Qdisc *sch) return NULL; } } - - skb = qdisc_dequeue_head(sch); + skb = cbs_child_dequeue(sch, qdisc); if (!skb) return NULL; @@ -178,7 +217,10 @@ static struct sk_buff *cbs_dequeue_soft(struct Qdisc *sch) static struct sk_buff *cbs_dequeue_offload(struct Qdisc *sch) { - return qdisc_dequeue_head(sch); + struct cbs_sched_data *q = qdisc_priv(sch); + struct Qdisc *qdisc = q->qdisc; + + return cbs_child_dequeue(sch, qdisc); } static struct sk_buff *cbs_dequeue(struct Qdisc *sch) @@ -310,6 +352,13 @@ static int cbs_init(struct Qdisc *sch, struct nlattr *opt, return -EINVAL; } + q->qdisc = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, + sch->handle, 
extack); + if (!q->qdisc) + return -ENOMEM; + + qdisc_hash_add(q->qdisc, false); + q->queue = sch->dev_queue - netdev_get_tx_queue(dev, 0); q->enqueue = cbs_enqueue_soft; @@ -328,6 +377,9 @@ static void cbs_destroy(struct Qdisc *sch) qdisc_watchdog_cancel(&q->watchdog); cbs_disable_offload(dev, q); + + if (q->qdisc) + qdisc_destroy(q->qdisc); } static int cbs_dump(struct Qdisc *sch, struct sk_buff *skb) @@ -356,8 +408,72 @@ nla_put_failure: return -1; } +static int cbs_dump_class(struct Qdisc *sch, unsigned long cl, + struct sk_buff *skb, struct tcmsg *tcm) +{ + struct cbs_sched_data *q = qdisc_priv(sch); + + if (cl != 1 || !q->qdisc) /* only one class */ + return -ENOENT; + + tcm->tcm_handle |= TC_H_MIN(1); + tcm->tcm_info = q->qdisc->handle; + + return 0; +} + +static int cbs_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new, + struct Qdisc **old, struct netlink_ext_ack *extack) +{ + struct cbs_sched_data *q = qdisc_priv(sch); + + if (!new) { + new = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, + sch->handle, NULL); + if (!new) + new = &noop_qdisc; + } + + *old = qdisc_replace(sch, new, &q->qdisc); + return 0; +} + +static struct Qdisc *cbs_leaf(struct Qdisc *sch, unsigned long arg) +{ + struct cbs_sched_data *q = qdisc_priv(sch); + + return q->qdisc; +} + +static unsigned long cbs_find(struct Qdisc *sch, u32 classid) +{ + return 1; +} + +static void cbs_walk(struct Qdisc *sch, struct qdisc_walker *walker) +{ + if (!walker->stop) { + if (walker->count >= walker->skip) { + if (walker->fn(sch, 1, walker) < 0) { + walker->stop = 1; + return; + } + } + walker->count++; + } +} + +static const struct Qdisc_class_ops cbs_class_ops = { + .graft = cbs_graft, + .leaf = cbs_leaf, + .find = cbs_find, + .walk = cbs_walk, + .dump = cbs_dump_class, +}; + static struct Qdisc_ops cbs_qdisc_ops __read_mostly = { .id = "cbs", + .cl_ops = &cbs_class_ops, .priv_size = sizeof(struct cbs_sched_data), .enqueue = cbs_enqueue, .dequeue = cbs_dequeue, -- cgit v1.2.3 From 2ed9db3074fcd8d12709fe40ff0e691d74229818 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Wed, 25 Jul 2018 09:07:24 -0500 Subject: net: sched: cls_api: fix dead code in switch Code at line 1850 is unreachable. Fix this by removing the break statement above it, so the code for case RTM_GETCHAIN can be properly executed. Addresses-Coverity-ID: 1472050 ("Structurally dead code") Fixes: 32a4f5ecd738 ("net: sched: introduce chain object to uapi") Signed-off-by: Gustavo A. R. Silva Acked-by: Jiri Pirko Signed-off-by: David S. Miller --- net/sched/cls_api.c | 1 - 1 file changed, 1 deletion(-) (limited to 'net/sched') diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index 5f7098b5405e..f3d78c23338e 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -1846,7 +1846,6 @@ replay: tcf_chain_put_explicitly_created(chain); break; case RTM_GETCHAIN: - break; err = tc_chain_notify(chain, skb, n->nlmsg_seq, n->nlmsg_seq, n->nlmsg_type, true); if (err < 0) -- cgit v1.2.3 From c921d7db3d1248c9091af070a7fdce2e55baa86a Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Thu, 26 Jul 2018 18:27:58 +0200 Subject: net: sched: unmark chain as explicitly created on delete Once user manually deletes the chain using "chain del", the chain cannot be marked as explicitly created anymore. Signed-off-by: Jiri Pirko Fixes: 32a4f5ecd738 ("net: sched: introduce chain object to uapi") Signed-off-by: David S. 
Miller --- net/sched/cls_api.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net/sched') diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index f3d78c23338e..75cce2819de9 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -1844,6 +1844,7 @@ replay: * to the chain previously taken during addition. */ tcf_chain_put_explicitly_created(chain); + chain->explicitly_created = false; break; case RTM_GETCHAIN: err = tc_chain_notify(chain, skb, n->nlmsg_seq, -- cgit v1.2.3 From 1f3ed383fb9a073ae2e408cd7a0717b04c7c3a21 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Fri, 27 Jul 2018 09:45:05 +0200 Subject: net: sched: don't dump chains only held by actions In case a chain is empty and not explicitly created by a user, such a chain should not exist. The only exception is if there is an action "goto chain" pointing to it. In that case, don't show the chain in the dump. Track the chain references held by actions and use them to find out if a chain should or should not be shown in the chain dump. Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- include/net/pkt_cls.h | 3 ++ include/net/sch_generic.h | 1 + net/sched/act_api.c | 4 +-- net/sched/cls_api.c | 70 +++++++++++++++++++++++++++++++++++++++-------- 4 files changed, 64 insertions(+), 14 deletions(-) (limited to 'net/sched') diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h index a3101582f642..6d02f31abba8 100644 --- a/include/net/pkt_cls.h +++ b/include/net/pkt_cls.h @@ -39,7 +39,10 @@ bool tcf_queue_work(struct rcu_work *rwork, work_func_t func); #ifdef CONFIG_NET_CLS struct tcf_chain *tcf_chain_get(struct tcf_block *block, u32 chain_index, bool create); +struct tcf_chain *tcf_chain_get_by_act(struct tcf_block *block, + u32 chain_index); void tcf_chain_put(struct tcf_chain *chain); +void tcf_chain_put_by_act(struct tcf_chain *chain); void tcf_block_netif_keep_dst(struct tcf_block *block); int tcf_block_get(struct tcf_block **p_block, struct tcf_proto __rcu **p_filter_chain, struct Qdisc *q, diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index 085c509c8674..c5432362dc26 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -314,6 +314,7 @@ struct tcf_chain { struct tcf_block *block; u32 index; /* chain index */ unsigned int refcnt; + unsigned int action_refcnt; bool explicitly_created; const struct tcf_proto_ops *tmplt_ops; void *tmplt_priv; diff --git a/net/sched/act_api.c b/net/sched/act_api.c index 148a89ab789b..b43df1e25c6d 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -36,7 +36,7 @@ static int tcf_action_goto_chain_init(struct tc_action *a, struct tcf_proto *tp) if (!tp) return -EINVAL; - a->goto_chain = tcf_chain_get(tp->chain->block, chain_index, true); + a->goto_chain = tcf_chain_get_by_act(tp->chain->block, chain_index); if (!a->goto_chain) return -ENOMEM; return 0; @@ -44,7 +44,7 @@ static int tcf_action_goto_chain_init(struct tc_action *a, struct tcf_proto *tp) static void tcf_action_goto_chain_fini(struct tc_action *a) { - tcf_chain_put(a->goto_chain); + tcf_chain_put_by_act(a->goto_chain); } static void tcf_action_goto_chain_exec(const struct tc_action *a, diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index 75cce2819de9..e20aad1987b8 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -262,6 +262,25 @@ static void tcf_chain_hold(struct tcf_chain *chain) ++chain->refcnt; } +static void tcf_chain_hold_by_act(struct tcf_chain *chain) +{ + ++chain->action_refcnt; +} + +static void tcf_chain_release_by_act(struct tcf_chain *chain) +{ + --chain->action_refcnt; +}
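+
+/* Editorial note, not in the original patch: an action-held reference is
+ * what you get from a filter whose action jumps to a chain that has no
+ * filters of its own, e.g. (assuming an ingress qdisc on eth0):
+ *
+ *	tc filter add dev eth0 ingress matchall action goto chain 7
+ *
+ * This implicitly creates chain 7 with refcnt == action_refcnt == 1, a
+ * "zombie" in the sense checked below, until a filter is added to the
+ * chain itself.
+ */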
+ +static bool tcf_chain_is_zombie(struct tcf_chain *chain) +{ + /* In case all the references are action references, this + * chain is a zombie and should not be listed in the chain + * dump list. + */ + return chain->refcnt == chain->action_refcnt; +} + static struct tcf_chain *tcf_chain_lookup(struct tcf_block *block, u32 chain_index) { @@ -298,6 +317,15 @@ struct tcf_chain *tcf_chain_get(struct tcf_block *block, u32 chain_index, } EXPORT_SYMBOL(tcf_chain_get); +struct tcf_chain *tcf_chain_get_by_act(struct tcf_block *block, u32 chain_index) +{ + struct tcf_chain *chain = tcf_chain_get(block, chain_index, true); + + tcf_chain_hold_by_act(chain); + return chain; +} +EXPORT_SYMBOL(tcf_chain_get_by_act); + static void tc_chain_tmplt_del(struct tcf_chain *chain); void tcf_chain_put(struct tcf_chain *chain) @@ -310,6 +338,13 @@ void tcf_chain_put(struct tcf_chain *chain) } EXPORT_SYMBOL(tcf_chain_put); +void tcf_chain_put_by_act(struct tcf_chain *chain) +{ + tcf_chain_release_by_act(chain); + tcf_chain_put(chain); +} +EXPORT_SYMBOL(tcf_chain_put_by_act); + static void tcf_chain_put_explicitly_created(struct tcf_chain *chain) { if (chain->explicitly_created) @@ -1803,20 +1838,29 @@ replay: chain = tcf_chain_lookup(block, chain_index); if (n->nlmsg_type == RTM_NEWCHAIN) { if (chain) { - NL_SET_ERR_MSG(extack, "Filter chain already exists"); - return -EEXIST; - } - if (!(n->nlmsg_flags & NLM_F_CREATE)) { - NL_SET_ERR_MSG(extack, "Need both RTM_NEWCHAIN and NLM_F_CREATE to create a new chain"); - return -ENOENT; - } - chain = tcf_chain_create(block, chain_index); - if (!chain) { - NL_SET_ERR_MSG(extack, "Failed to create filter chain"); - return -ENOMEM; + if (tcf_chain_is_zombie(chain)) { + /* The chain exists only because there is + * some action referencing it, meaning it + * is a zombie. + */ + tcf_chain_hold(chain); + } else { + NL_SET_ERR_MSG(extack, "Filter chain already exists"); + return -EEXIST; + } + } else { + if (!(n->nlmsg_flags & NLM_F_CREATE)) { + NL_SET_ERR_MSG(extack, "Need both RTM_NEWCHAIN and NLM_F_CREATE to create a new chain"); + return -ENOENT; + } + chain = tcf_chain_create(block, chain_index); + if (!chain) { + NL_SET_ERR_MSG(extack, "Failed to create filter chain"); + return -ENOMEM; + } } } else { - if (!chain) { + if (!chain || tcf_chain_is_zombie(chain)) { NL_SET_ERR_MSG(extack, "Cannot find specified filter chain"); return -EINVAL; } @@ -1944,6 +1988,8 @@ static int tc_dump_chain(struct sk_buff *skb, struct netlink_callback *cb) index++; continue; } + if (tcf_chain_is_zombie(chain)) + continue; err = tc_chain_fill_node(chain, net, skb, block, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NLM_F_MULTI, -- cgit v1.2.3 From 2db6dc2662bab14e59517ab4b86a164cc4d2db42 Mon Sep 17 00:00:00 2001 From: Dave Taht Date: Thu, 26 Jul 2018 19:45:10 -0700 Subject: sch_cake: Make gso-splitting configurable from userspace This patch restores cake's deployed behavior at line rate to always split gso, and makes gso splitting configurable from userspace. Running cake unlimited (unshaped) at 1GigE, local traffic:

no-split-gso bql limit: 131966
split-gso bql limit:    ~42392-45420

On this 4-stream test, splitting gso apart results in halving the observed interpacket latency at no loss in throughput.
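The split itself is not new code; the patch only changes who decides. As an editorial, hedged sketch of the pre-existing split path in cake_enqueue() (names as in sch_cake.c, bookkeeping and statistics trimmed):

	if (skb_is_gso(skb) && (q->rate_flags & CAKE_FLAG_SPLIT_GSO)) {
		netdev_features_t features = netif_skb_features(skb);
		struct sk_buff *segs, *nskb;

		/* resegment the GSO superpacket into wire-sized skbs */
		segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
		if (IS_ERR_OR_NULL(segs))
			return qdisc_drop(skb, sch, to_free);
		for (nskb = segs; nskb; nskb = nskb->next)
			qdisc_skb_cb(nskb)->pkt_len = nskb->len;
		/* ... each segment is then enqueued on the flow queue */
	}

With TCA_CAKE_SPLIT_GSO exposed, userspace (the split-gso / no-split-gso keywords in iproute2) makes that call rather than the old CAKE_SPLIT_GSO_THRESHOLD rate check. The flent runs backing the new default: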
Summary of tcp_nup test run 'gso-split' (at 2018-07-26 16:03:51.824728):

                         avg       median    # data pts
 Ping (ms) ICMP   :      0.83      0.81  ms        341
 TCP upload avg   :    235.43    235.39  Mbits/s   301
 TCP upload sum   :    941.71    941.56  Mbits/s   301
 TCP upload::1    :    235.45    235.43  Mbits/s   271
 TCP upload::2    :    235.45    235.41  Mbits/s   289
 TCP upload::3    :    235.40    235.40  Mbits/s   288
 TCP upload::4    :    235.41    235.40  Mbits/s   291

versus

Summary of tcp_nup test run 'no-split-gso' (at 2018-07-26 16:37:23.563960):

                         avg       median    # data pts
 Ping (ms) ICMP   :      1.67      1.73  ms        348
 TCP upload avg   :    234.56    235.37  Mbits/s   301
 TCP upload sum   :    938.24    941.49  Mbits/s   301
 TCP upload::1    :    234.55    235.38  Mbits/s   285
 TCP upload::2    :    234.57    235.37  Mbits/s   286
 TCP upload::3    :    234.58    235.37  Mbits/s   274
 TCP upload::4    :    234.54    235.42  Mbits/s   288

Signed-off-by: David S. Miller --- net/sched/sch_cake.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) (limited to 'net/sched') diff --git a/net/sched/sch_cake.c b/net/sched/sch_cake.c index 539c9490c308..35fc7252187c 100644 --- a/net/sched/sch_cake.c +++ b/net/sched/sch_cake.c @@ -80,7 +80,6 @@ #define CAKE_QUEUES (1024) #define CAKE_FLOW_MASK 63 #define CAKE_FLOW_NAT_FLAG 64 -#define CAKE_SPLIT_GSO_THRESHOLD (125000000) /* 1Gbps */ /* struct cobalt_params - contains codel and blue parameters * @interval: codel initial drop rate @@ -2569,10 +2568,12 @@ static int cake_change(struct Qdisc *sch, struct nlattr *opt, if (tb[TCA_CAKE_MEMORY]) q->buffer_config_limit = nla_get_u32(tb[TCA_CAKE_MEMORY]); - if (q->rate_bps && q->rate_bps <= CAKE_SPLIT_GSO_THRESHOLD) - q->rate_flags |= CAKE_FLAG_SPLIT_GSO; - else - q->rate_flags &= ~CAKE_FLAG_SPLIT_GSO; + if (tb[TCA_CAKE_SPLIT_GSO]) { + if (!!nla_get_u32(tb[TCA_CAKE_SPLIT_GSO])) + q->rate_flags |= CAKE_FLAG_SPLIT_GSO; + else + q->rate_flags &= ~CAKE_FLAG_SPLIT_GSO; + } if (q->tins) { sch_tree_lock(sch); @@ -2608,7 +2609,7 @@ static int cake_init(struct Qdisc *sch, struct nlattr *opt, q->target = 5000; /* 5ms: codel RFC argues * for 5 to 10% of interval */ - + q->rate_flags |= CAKE_FLAG_SPLIT_GSO; q->cur_tin = 0; q->cur_flow = 0; -- cgit v1.2.3 From 0a80848ec5cc1294984e648b9a71aecf69c4bb73 Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Sat, 28 Jul 2018 18:29:01 +0800 Subject: act_pedit: remove unnecessary semicolon net/sched/act_pedit.c:289:2-3: Unneeded semicolon Remove unneeded semicolon. Generated by: scripts/coccinelle/misc/semicolon.cocci Signed-off-by: YueHaibing Signed-off-by: David S. Miller --- net/sched/act_pedit.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/sched') diff --git a/net/sched/act_pedit.c b/net/sched/act_pedit.c index 9ab5d81aff1a..43ba999b2d23 100644 --- a/net/sched/act_pedit.c +++ b/net/sched/act_pedit.c @@ -286,7 +286,7 @@ static int pedit_skb_hdr_offset(struct sk_buff *skb, default: ret = -EINVAL; break; - }; + } return ret; } -- cgit v1.2.3 From f9562fa4a5750d097f4468c0a7fc9a4e0d2dfdc3 Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Sat, 28 Jul 2018 18:35:15 +0800 Subject: cls_bpf: Use kmemdup instead of duplicating it in cls_bpf_prog_from_ops Replace calls to kmalloc followed by a memcpy with a direct call to kmemdup. Signed-off-by: YueHaibing Acked-by: Daniel Borkmann Signed-off-by: David S. Miller
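Editorially, the kzalloc()+memcpy() to kmemdup() conversion is mechanical and worth spelling out once, since the act_bpf patch below repeats it; a hedged, generic sketch (names invented for illustration):

	/* before: allocate, then copy by hand */
	buf = kzalloc(size, GFP_KERNEL);
	if (buf == NULL)
		return -ENOMEM;
	memcpy(buf, src, size);

	/* after: kmemdup() allocates and copies in one step */
	buf = kmemdup(src, size, GFP_KERNEL);
	if (buf == NULL)
		return -ENOMEM;

kzalloc()'s zeroing was redundant in this pattern anyway, as the memcpy() overwrote the whole buffer; that is what makes kmemdup() a safe drop-in.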
--- net/sched/cls_bpf.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'net/sched') diff --git a/net/sched/cls_bpf.c b/net/sched/cls_bpf.c index 66e0ac9811f9..fa6fe2fe0f32 100644 --- a/net/sched/cls_bpf.c +++ b/net/sched/cls_bpf.c @@ -349,12 +349,10 @@ static int cls_bpf_prog_from_ops(struct nlattr **tb, struct cls_bpf_prog *prog) if (bpf_size != nla_len(tb[TCA_BPF_OPS])) return -EINVAL; - bpf_ops = kzalloc(bpf_size, GFP_KERNEL); + bpf_ops = kmemdup(nla_data(tb[TCA_BPF_OPS]), bpf_size, GFP_KERNEL); if (bpf_ops == NULL) return -ENOMEM; - memcpy(bpf_ops, nla_data(tb[TCA_BPF_OPS]), bpf_size); - fprog_tmp.len = bpf_num_ops; fprog_tmp.filter = bpf_ops; -- cgit v1.2.3 From 3f6bcc5162a1ba4e99e867364919168c1d821308 Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Sat, 28 Jul 2018 18:38:06 +0800 Subject: act_bpf: Use kmemdup instead of duplicating it in tcf_bpf_init_from_ops Replace calls to kmalloc followed by a memcpy with a direct call to kmemdup. Signed-off-by: YueHaibing Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- net/sched/act_bpf.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'net/sched') diff --git a/net/sched/act_bpf.c b/net/sched/act_bpf.c index 06f743d8ed41..6203eb075c9a 100644 --- a/net/sched/act_bpf.c +++ b/net/sched/act_bpf.c @@ -196,12 +196,10 @@ static int tcf_bpf_init_from_ops(struct nlattr **tb, struct tcf_bpf_cfg *cfg) if (bpf_size != nla_len(tb[TCA_ACT_BPF_OPS])) return -EINVAL; - bpf_ops = kzalloc(bpf_size, GFP_KERNEL); + bpf_ops = kmemdup(nla_data(tb[TCA_ACT_BPF_OPS]), bpf_size, GFP_KERNEL); if (bpf_ops == NULL) return -ENOMEM; - memcpy(bpf_ops, nla_data(tb[TCA_ACT_BPF_OPS]), bpf_size); - fprog_tmp.len = bpf_num_ops; fprog_tmp.filter = bpf_ops; -- cgit v1.2.3 From 802bfb19152c0fb4137c6ba72bcf042ee023e743 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Mon, 30 Jul 2018 14:30:42 +0200 Subject: net/sched: user-space can't set unknown tcfa_action values Currently, when initializing an action, user space can specify and use arbitrary values for the tcfa_action field. If the value is unknown to the kernel, it is implicitly treated as TC_ACT_UNSPEC. This change explicitly checks for unknown values at action creation time, and explicitly converts them to TC_ACT_UNSPEC. No functional changes are introduced, but this will allow introducing tcfa_action values not exposed to user-space in a later patch. Note: we can't use the above to hide TC_ACT_REDIRECT from user-space, as the latter is already part of uAPI. v3 -> v4: - use a helper to check for action validity (JiriP) - emit an extack for invalid actions (JiriP) v4 -> v5: - keep messages on a single line, drop net_warn (Marcelo) Signed-off-by: Paolo Abeni Acked-by: Jiri Pirko Signed-off-by: David S. Miller --- include/uapi/linux/pkt_cls.h | 6 ++++-- net/sched/act_api.c | 14 ++++++++++++++ 2 files changed, 18 insertions(+), 2 deletions(-) (limited to 'net/sched') diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h index b4512254036b..48e5b5d49a34 100644 --- a/include/uapi/linux/pkt_cls.h +++ b/include/uapi/linux/pkt_cls.h @@ -45,6 +45,7 @@ enum { * the skb and act like everything * is alright. */ +#define TC_ACT_VALUE_MAX TC_ACT_TRAP /* There is a special kind of actions called "extended actions", * which need a value parameter.
These have a local opcode located in @@ -55,11 +56,12 @@ #define __TC_ACT_EXT_SHIFT 28 #define __TC_ACT_EXT(local) ((local) << __TC_ACT_EXT_SHIFT) #define TC_ACT_EXT_VAL_MASK ((1 << __TC_ACT_EXT_SHIFT) - 1) -#define TC_ACT_EXT_CMP(combined, opcode) \ - (((combined) & (~TC_ACT_EXT_VAL_MASK)) == opcode) +#define TC_ACT_EXT_OPCODE(combined) ((combined) & (~TC_ACT_EXT_VAL_MASK)) +#define TC_ACT_EXT_CMP(combined, opcode) (TC_ACT_EXT_OPCODE(combined) == opcode) #define TC_ACT_JUMP __TC_ACT_EXT(1) #define TC_ACT_GOTO_CHAIN __TC_ACT_EXT(2) +#define TC_ACT_EXT_OPCODE_MAX TC_ACT_GOTO_CHAIN /* Action type identifiers*/ enum { diff --git a/net/sched/act_api.c b/net/sched/act_api.c index b43df1e25c6d..229d63c99be2 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -786,6 +786,15 @@ static struct tc_cookie *nla_memdup_cookie(struct nlattr **tb) return c; } +static bool tcf_action_valid(int action) +{ + int opcode = TC_ACT_EXT_OPCODE(action); + + if (!opcode) + return action <= TC_ACT_VALUE_MAX; + return opcode <= TC_ACT_EXT_OPCODE_MAX || action == TC_ACT_UNSPEC; +} + struct tc_action *tcf_action_init_1(struct net *net, struct tcf_proto *tp, struct nlattr *nla, struct nlattr *est, char *name, int ovr, int bind, @@ -895,6 +904,11 @@ struct tc_action *tcf_action_init_1(struct net *net, struct tcf_proto *tp, } } + if (!tcf_action_valid(a->tcfa_action)) { + NL_SET_ERR_MSG(extack, "invalid action value, using TC_ACT_UNSPEC instead"); + a->tcfa_action = TC_ACT_UNSPEC; + } + return a; err_mod: -- cgit v1.2.3 From 7fd4b288ea6a3e45ad8afbcd5ec39554d57f1ae0 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Mon, 30 Jul 2018 14:30:43 +0200 Subject: tc/act: remove unneeded RCU lock in action callback Each lockless action currently does its own RCU locking in ->act(). This allows using plain RCU accessors, even if the context is really RCU BH. This change drops the per-action RCU lock, replaces the accessors with the _bh variants, cleans up the surrounding code a bit, and documents the RCU status in the relevant header. No functional nor performance change is intended. The goal of this patch is to clarify that the RCU critical section used by the tc actions extends up to the classifier's caller. v1 -> v2: - preserve rcu lock in act_bpf: it's needed by eBPF helpers, as pointed out by Daniel v3 -> v4: - fixed some typos in the commit message (JiriP) Signed-off-by: Paolo Abeni Acked-by: Jiri Pirko Signed-off-by: David S. Miller
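As an editorial before/after sketch of what this change does to each action's ->act() (generic field names, not any one action's exact code):

	/* before: every lockless action opened its own RCU section */
	rcu_read_lock();
	params = rcu_dereference(p->params);
	/* ... use params ... */
	rcu_read_unlock();

	/* after: lean on the softirq (RCU BH) read-side section that the
	 * qdisc/classifier caller already holds
	 */
	params = rcu_dereference_bh(p->params);
	/* ... use params ... */

This is only safe because ->act() and ->classify() are always invoked under the caller's RCU BH critical section, which is exactly what the header comments added below document.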
--- include/net/act_api.h | 2 +- include/net/sch_generic.h | 2 ++ net/sched/act_csum.c | 12 +++--------- net/sched/act_ife.c | 5 +---- net/sched/act_mirred.c | 4 +--- net/sched/act_sample.c | 4 +--- net/sched/act_skbedit.c | 10 +++------- net/sched/act_skbmod.c | 21 +++++++++------------ net/sched/act_tunnel_key.c | 6 +----- net/sched/act_vlan.c | 19 +++++++------------ 10 files changed, 29 insertions(+), 56 deletions(-) (limited to 'net/sched') diff --git a/include/net/act_api.h b/include/net/act_api.h index 683ce41053d9..8c9bc02d05e1 100644 --- a/include/net/act_api.h +++ b/include/net/act_api.h @@ -85,7 +85,7 @@ struct tc_action_ops { size_t size; struct module *owner; int (*act)(struct sk_buff *, const struct tc_action *, - struct tcf_result *); + struct tcf_result *); /* called under RCU BH lock */ int (*dump)(struct sk_buff *, struct tc_action *, int, int); void (*cleanup)(struct tc_action *); int (*lookup)(struct net *net, struct tc_action **a, u32 index, diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index c5432362dc26..bcae181c1857 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -285,6 +285,8 @@ struct tcf_proto { /* Fast access part */ struct tcf_proto __rcu *next; void __rcu *root; + + /* called under RCU BH lock */ int (*classify)(struct sk_buff *, const struct tcf_proto *, struct tcf_result *); diff --git a/net/sched/act_csum.c b/net/sched/act_csum.c index 4e8c383f379e..648a3a35b720 100644 --- a/net/sched/act_csum.c +++ b/net/sched/act_csum.c @@ -561,15 +561,14 @@ static int tcf_csum(struct sk_buff *skb, const struct tc_action *a, u32 update_flags; int action; - rcu_read_lock(); - params = rcu_dereference(p->params); + params = rcu_dereference_bh(p->params); tcf_lastuse_update(&p->tcf_tm); bstats_cpu_update(this_cpu_ptr(p->common.cpu_bstats), skb); action = READ_ONCE(p->tcf_action); if (unlikely(action == TC_ACT_SHOT)) - goto drop_stats; + goto drop; update_flags = params->update_flags; switch (tc_skb_protocol(skb)) { @@ -583,16 +582,11 @@ static int tcf_csum(struct sk_buff *skb, const struct tc_action *a, break; } -unlock: - rcu_read_unlock(); return action; drop: - action = TC_ACT_SHOT; - -drop_stats: qstats_drop_inc(this_cpu_ptr(p->common.cpu_qstats)); - goto unlock; + return TC_ACT_SHOT; } static int tcf_csum_dump(struct sk_buff *skb, struct tc_action *a, int bind, diff --git a/net/sched/act_ife.c b/net/sched/act_ife.c index 3d6e265758c0..df4060e32d43 100644 --- a/net/sched/act_ife.c +++ b/net/sched/act_ife.c @@ -820,14 +820,11 @@ static int tcf_ife_act(struct sk_buff *skb, const struct tc_action *a, struct tcf_ife_params *p; int ret; - rcu_read_lock(); - p = rcu_dereference(ife->params); + p = rcu_dereference_bh(ife->params); if (p->flags & IFE_ENCODE) { ret = tcf_ife_encode(skb, a, res, p); - rcu_read_unlock(); return ret; } - rcu_read_unlock(); return tcf_ife_decode(skb, a, res); } diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c index 6afd89a36c69..eeb335f03102 100644 --- a/net/sched/act_mirred.c +++ b/net/sched/act_mirred.c @@ -181,11 +181,10 @@ static int tcf_mirred(struct sk_buff *skb, const struct tc_action *a, tcf_lastuse_update(&m->tcf_tm); bstats_cpu_update(this_cpu_ptr(m->common.cpu_bstats), skb); - rcu_read_lock(); m_mac_header_xmit = READ_ONCE(m->tcfm_mac_header_xmit); m_eaction = READ_ONCE(m->tcfm_eaction); retval = READ_ONCE(m->tcf_action); - dev = rcu_dereference(m->tcfm_dev); + dev = rcu_dereference_bh(m->tcfm_dev); if (unlikely(!dev)) { pr_notice_once("tc mirred: target device is gone\n");
goto out; @@ -236,7 +235,6 @@ out: if (tcf_mirred_is_act_redirect(m_eaction)) retval = TC_ACT_SHOT; } - rcu_read_unlock(); return retval; } diff --git a/net/sched/act_sample.c b/net/sched/act_sample.c index 3079e7be5bde..2608ccc83e5e 100644 --- a/net/sched/act_sample.c +++ b/net/sched/act_sample.c @@ -140,8 +140,7 @@ static int tcf_sample_act(struct sk_buff *skb, const struct tc_action *a, bstats_cpu_update(this_cpu_ptr(s->common.cpu_bstats), skb); retval = READ_ONCE(s->tcf_action); - rcu_read_lock(); - psample_group = rcu_dereference(s->psample_group); + psample_group = rcu_dereference_bh(s->psample_group); /* randomly sample packets according to rate */ if (psample_group && (prandom_u32() % s->rate == 0)) { @@ -165,7 +164,6 @@ static int tcf_sample_act(struct sk_buff *skb, const struct tc_action *a, skb_pull(skb, skb->mac_len); } - rcu_read_unlock(); return retval; } diff --git a/net/sched/act_skbedit.c b/net/sched/act_skbedit.c index da56e6938c9e..a6db47ebec11 100644 --- a/net/sched/act_skbedit.c +++ b/net/sched/act_skbedit.c @@ -43,8 +43,7 @@ static int tcf_skbedit(struct sk_buff *skb, const struct tc_action *a, tcf_lastuse_update(&d->tcf_tm); bstats_cpu_update(this_cpu_ptr(d->common.cpu_bstats), skb); - rcu_read_lock(); - params = rcu_dereference(d->params); + params = rcu_dereference_bh(d->params); action = READ_ONCE(d->tcf_action); if (params->flags & SKBEDIT_F_PRIORITY) @@ -77,14 +76,11 @@ static int tcf_skbedit(struct sk_buff *skb, const struct tc_action *a, } if (params->flags & SKBEDIT_F_PTYPE) skb->pkt_type = params->ptype; - -unlock: - rcu_read_unlock(); return action; + err: qstats_drop_inc(this_cpu_ptr(d->common.cpu_qstats)); - action = TC_ACT_SHOT; - goto unlock; + return TC_ACT_SHOT; } static const struct nla_policy skbedit_policy[TCA_SKBEDIT_MAX + 1] = { diff --git a/net/sched/act_skbmod.c b/net/sched/act_skbmod.c index cdc6bacfb190..c437c6d51a71 100644 --- a/net/sched/act_skbmod.c +++ b/net/sched/act_skbmod.c @@ -41,20 +41,14 @@ static int tcf_skbmod_run(struct sk_buff *skb, const struct tc_action *a, * then MAX_EDIT_LEN needs to change appropriately */ err = skb_ensure_writable(skb, MAX_EDIT_LEN); - if (unlikely(err)) { /* best policy is to drop on the floor */ - qstats_overlimit_inc(this_cpu_ptr(d->common.cpu_qstats)); - return TC_ACT_SHOT; - } + if (unlikely(err)) /* best policy is to drop on the floor */ + goto drop; - rcu_read_lock(); action = READ_ONCE(d->tcf_action); - if (unlikely(action == TC_ACT_SHOT)) { - qstats_overlimit_inc(this_cpu_ptr(d->common.cpu_qstats)); - rcu_read_unlock(); - return action; - } + if (unlikely(action == TC_ACT_SHOT)) + goto drop; - p = rcu_dereference(d->skbmod_p); + p = rcu_dereference_bh(d->skbmod_p); flags = p->flags; if (flags & SKBMOD_F_DMAC) ether_addr_copy(eth_hdr(skb)->h_dest, p->eth_dst); @@ -62,7 +56,6 @@ static int tcf_skbmod_run(struct sk_buff *skb, const struct tc_action *a, ether_addr_copy(eth_hdr(skb)->h_source, p->eth_src); if (flags & SKBMOD_F_ETYPE) eth_hdr(skb)->h_proto = p->eth_type; - rcu_read_unlock(); if (flags & SKBMOD_F_SWAPMAC) { u16 tmpaddr[ETH_ALEN / 2]; /* ether_addr_copy() requirement */ @@ -73,6 +66,10 @@ static int tcf_skbmod_run(struct sk_buff *skb, const struct tc_action *a, } return action; + +drop: + qstats_overlimit_inc(this_cpu_ptr(d->common.cpu_qstats)); + return TC_ACT_SHOT; } static const struct nla_policy skbmod_policy[TCA_SKBMOD_MAX + 1] = { diff --git a/net/sched/act_tunnel_key.c b/net/sched/act_tunnel_key.c index f811850fd1d0..d42d9e112789 100644 --- a/net/sched/act_tunnel_key.c +++ 
b/net/sched/act_tunnel_key.c @@ -31,9 +31,7 @@ static int tunnel_key_act(struct sk_buff *skb, const struct tc_action *a, struct tcf_tunnel_key_params *params; int action; - rcu_read_lock(); - - params = rcu_dereference(t->params); + params = rcu_dereference_bh(t->params); tcf_lastuse_update(&t->tcf_tm); bstats_cpu_update(this_cpu_ptr(t->common.cpu_bstats), skb); @@ -53,8 +51,6 @@ static int tunnel_key_act(struct sk_buff *skb, const struct tc_action *a, break; } - rcu_read_unlock(); - return action; } diff --git a/net/sched/act_vlan.c b/net/sched/act_vlan.c index ad37f308175a..15a0ee214c9c 100644 --- a/net/sched/act_vlan.c +++ b/net/sched/act_vlan.c @@ -40,11 +40,9 @@ static int tcf_vlan(struct sk_buff *skb, const struct tc_action *a, if (skb_at_tc_ingress(skb)) skb_push_rcsum(skb, skb->mac_len); - rcu_read_lock(); - action = READ_ONCE(v->tcf_action); - p = rcu_dereference(v->vlan_p); + p = rcu_dereference_bh(v->vlan_p); switch (p->tcfv_action) { case TCA_VLAN_ACT_POP: @@ -61,7 +59,7 @@ static int tcf_vlan(struct sk_buff *skb, const struct tc_action *a, case TCA_VLAN_ACT_MODIFY: /* No-op if no vlan tag (either hw-accel or in-payload) */ if (!skb_vlan_tagged(skb)) - goto unlock; + goto out; /* extract existing tag (and guarantee no hw-accel tag) */ if (skb_vlan_tag_present(skb)) { tci = skb_vlan_tag_get(skb); @@ -86,18 +84,15 @@ static int tcf_vlan(struct sk_buff *skb, const struct tc_action *a, BUG(); } - goto unlock; - -drop: - action = TC_ACT_SHOT; - qstats_drop_inc(this_cpu_ptr(v->common.cpu_qstats)); - -unlock: - rcu_read_unlock(); +out: if (skb_at_tc_ingress(skb)) skb_pull_rcsum(skb, skb->mac_len); return action; + +drop: + qstats_drop_inc(this_cpu_ptr(v->common.cpu_qstats)); + return TC_ACT_SHOT; } static const struct nla_policy vlan_policy[TCA_VLAN_MAX + 1] = { -- cgit v1.2.3 From e5cf1baf92cb785b90390db1c624948e70c8b8bd Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Mon, 30 Jul 2018 14:30:45 +0200 Subject: act_mirred: use TC_ACT_REINSERT when possible When mirred is invoked from the ingress path, and it wants to redirect the processed packet, it can now use the TC_ACT_REINSERT action, filling the tcf_result accordingly, and avoiding a per-packet skb_clone(). Overall this gives a ~10% improvement in forwarding performance for the TC S/W data path, and TC S/W performance is now comparable to the kernel openvswitch datapath. v1 -> v2: use ACT_MIRRED instead of ACT_REDIRECT v2 -> v3: updated after action rename, fixed typo in the commit message v3 -> v4: updated again after action rename, added more comments to the code (JiriP), skip the optimization if the control action needs to touch the tcf_result (Paolo) v4 -> v5: fix sparse warning (kbuild bot) Signed-off-by: Paolo Abeni Acked-by: Jiri Pirko Signed-off-by: David S. Miller
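The core of the optimization, condensed here as a hedged editorial sketch of the diff that follows: a redirect caught at ingress may hand the original skb back to the caller instead of cloning it, because the verdict already steals the packet from the normal path:

	is_redirect = tcf_mirred_is_act_redirect(m_eaction);
	use_reinsert = skb_at_tc_ingress(skb) && is_redirect &&
		       tcf_mirred_can_reinsert(retval);
	if (!use_reinsert) {
		/* mirror, or egress redirect: the original skb keeps
		 * flowing, so a private copy is still required
		 */
		skb2 = skb_clone(skb, GFP_ATOMIC);
		if (!skb2)
			goto out;
	}

A mirror can never take the reinsert path since, by definition, both the original packet and its copy must survive.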
--- net/sched/act_mirred.c | 53 ++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 43 insertions(+), 10 deletions(-) (limited to 'net/sched') diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c index eeb335f03102..b26d060da08e 100644 --- a/net/sched/act_mirred.c +++ b/net/sched/act_mirred.c @@ -25,6 +25,7 @@ #include #include #include +#include #include #include @@ -49,6 +50,18 @@ static bool tcf_mirred_act_wants_ingress(int action) } } +static bool tcf_mirred_can_reinsert(int action) +{ + switch (action) { + case TC_ACT_SHOT: + case TC_ACT_STOLEN: + case TC_ACT_QUEUED: + case TC_ACT_TRAP: + return true; + } + return false; +} + static void tcf_mirred_release(struct tc_action *a) { struct tcf_mirred *m = to_mirred(a); @@ -171,10 +184,13 @@ static int tcf_mirred(struct sk_buff *skb, const struct tc_action *a, struct tcf_result *res) { struct tcf_mirred *m = to_mirred(a); + struct sk_buff *skb2 = skb; bool m_mac_header_xmit; struct net_device *dev; - struct sk_buff *skb2; int retval, err = 0; + bool use_reinsert; + bool want_ingress; + bool is_redirect; int m_eaction; int mac_len; @@ -196,16 +212,25 @@ static int tcf_mirred(struct sk_buff *skb, const struct tc_action *a, goto out; } - skb2 = skb_clone(skb, GFP_ATOMIC); - if (!skb2) - goto out; + /* we could easily avoid the clone only if called by ingress and clsact; + * since we can't easily detect the clsact caller, skip clone only for + * ingress - that covers the TC S/W datapath. + */ + is_redirect = tcf_mirred_is_act_redirect(m_eaction); + use_reinsert = skb_at_tc_ingress(skb) && is_redirect && + tcf_mirred_can_reinsert(retval); + if (!use_reinsert) { + skb2 = skb_clone(skb, GFP_ATOMIC); + if (!skb2) + goto out; + } /* If action's target direction differs from filter's direction, * and devices expect a mac header on xmit, then mac push/pull is * needed. */ - if (skb_at_tc_ingress(skb) != tcf_mirred_act_wants_ingress(m_eaction) && - m_mac_header_xmit) { + want_ingress = tcf_mirred_act_wants_ingress(m_eaction); + if (skb_at_tc_ingress(skb) != want_ingress && m_mac_header_xmit) { if (!skb_at_tc_ingress(skb)) { /* caught at egress, act ingress: pull mac */ mac_len = skb_network_header(skb) - skb_mac_header(skb); @@ -216,15 +241,23 @@ static int tcf_mirred(struct sk_buff *skb, const struct tc_action *a, } } + skb2->skb_iif = skb->dev->ifindex; + skb2->dev = dev; + /* mirror is always swallowed */ - if (tcf_mirred_is_act_redirect(m_eaction)) { + if (is_redirect) { skb2->tc_redirected = 1; skb2->tc_from_ingress = skb2->tc_at_ingress; + + /* let the caller reinsert the packet, if possible */ + if (use_reinsert) { + res->ingress = want_ingress; + res->qstats = this_cpu_ptr(m->common.cpu_qstats); + return TC_ACT_REINSERT; + } } - skb2->skb_iif = skb->dev->ifindex; - skb2->dev = dev; - if (!tcf_mirred_act_wants_ingress(m_eaction)) + if (!want_ingress) err = dev_queue_xmit(skb2); else err = netif_receive_skb(skb2); -- cgit v1.2.3 From 3d32f4c548bd8f3af58c59521bee9be127f3e87d Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Wed, 1 Aug 2018 12:36:55 +0200 Subject: net: sched: change name of zombie chain to "held_by_acts_only" As mentioned by Cong and Jakub during the review process, it is a bit odd to sometimes (act flow) create a new chain which would be immediately a "zombie". So just rename it to "held_by_acts_only". Signed-off-by: Jiri Pirko Suggested-by: Cong Wang Suggested-by: Jakub Kicinski Acked-by: Cong Wang Signed-off-by: David S.
Miller --- net/sched/cls_api.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) (limited to 'net/sched') diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index e20aad1987b8..2f78341f2888 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -272,11 +272,10 @@ static void tcf_chain_release_by_act(struct tcf_chain *chain) --chain->action_refcnt; } -static bool tcf_chain_is_zombie(struct tcf_chain *chain) +static bool tcf_chain_held_by_acts_only(struct tcf_chain *chain) { /* In case all the references are action references, this - * chain is a zombie and should not be listed in the chain - * dump list. + * chain should not be shown to the user. */ return chain->refcnt == chain->action_refcnt; } @@ -1838,10 +1837,9 @@ replay: chain = tcf_chain_lookup(block, chain_index); if (n->nlmsg_type == RTM_NEWCHAIN) { if (chain) { - if (tcf_chain_is_zombie(chain)) { + if (tcf_chain_held_by_acts_only(chain)) { /* The chain exists only because there is - * some action referencing it, meaning it - * is a zombie. + * some action referencing it. */ tcf_chain_hold(chain); } else { @@ -1860,7 +1858,7 @@ replay: } } } else { - if (!chain || tcf_chain_is_zombie(chain)) { + if (!chain || tcf_chain_held_by_acts_only(chain)) { NL_SET_ERR_MSG(extack, "Cannot find specified filter chain"); return -EINVAL; } @@ -1988,7 +1986,7 @@ static int tc_dump_chain(struct sk_buff *skb, struct netlink_callback *cb) index++; continue; } - if (tcf_chain_is_zombie(chain)) + if (tcf_chain_held_by_acts_only(chain)) continue; err = tc_chain_fill_node(chain, net, skb, block, NETLINK_CB(cb->skb).portid, -- cgit v1.2.3 From 5368140730e4a67169303edd3a13e31fd9b9d355 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Wed, 1 Aug 2018 12:36:56 +0200 Subject: net: sched: fix notifications for action-held chains Chains that only have action references serve as placeholders. Until a non-action reference is created, user should not be aware of the chain. Also he should not receive any notifications about it. So send notifications for the new chain only in case the chain gets the first non-action reference. Symmetrically to that, when the last non-action reference is dropped, send the notification about deleted chain. Reported-by: Cong Wang Signed-off-by: Jiri Pirko Acked-by: Cong Wang v1->v2: - made __tcf_chain_{get,put}() static as suggested by Cong Signed-off-by: David S. 
Miller --- net/sched/cls_api.c | 71 ++++++++++++++++++++++++++++++++--------------------- 1 file changed, 43 insertions(+), 28 deletions(-) (limited to 'net/sched') diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index 2f78341f2888..b194a5abfc6a 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -262,16 +262,6 @@ static void tcf_chain_hold(struct tcf_chain *chain) ++chain->refcnt; } -static void tcf_chain_hold_by_act(struct tcf_chain *chain) -{ - ++chain->action_refcnt; -} - -static void tcf_chain_release_by_act(struct tcf_chain *chain) -{ - --chain->action_refcnt; -} - static bool tcf_chain_held_by_acts_only(struct tcf_chain *chain) { /* In case all the references are action references, this @@ -295,52 +285,77 @@ static struct tcf_chain *tcf_chain_lookup(struct tcf_block *block, static int tc_chain_notify(struct tcf_chain *chain, struct sk_buff *oskb, u32 seq, u16 flags, int event, bool unicast); -struct tcf_chain *tcf_chain_get(struct tcf_block *block, u32 chain_index, - bool create) +static struct tcf_chain *__tcf_chain_get(struct tcf_block *block, + u32 chain_index, bool create, + bool by_act) { struct tcf_chain *chain = tcf_chain_lookup(block, chain_index); if (chain) { tcf_chain_hold(chain); - return chain; + } else { + if (!create) + return NULL; + chain = tcf_chain_create(block, chain_index); + if (!chain) + return NULL; } - if (!create) - return NULL; - chain = tcf_chain_create(block, chain_index); - if (!chain) - return NULL; - tc_chain_notify(chain, NULL, 0, NLM_F_CREATE | NLM_F_EXCL, - RTM_NEWCHAIN, false); + if (by_act) + ++chain->action_refcnt; + + /* Send notification only in case we got the first + * non-action reference. Until then, the chain acts only as + * a placeholder for actions pointing to it and user ought + * not know about them. + */ + if (chain->refcnt - chain->action_refcnt == 1 && !by_act) + tc_chain_notify(chain, NULL, 0, NLM_F_CREATE | NLM_F_EXCL, + RTM_NEWCHAIN, false); + return chain; } + +struct tcf_chain *tcf_chain_get(struct tcf_block *block, u32 chain_index, + bool create) +{ + return __tcf_chain_get(block, chain_index, create, false); +} EXPORT_SYMBOL(tcf_chain_get); struct tcf_chain *tcf_chain_get_by_act(struct tcf_block *block, u32 chain_index) { - struct tcf_chain *chain = tcf_chain_get(block, chain_index, true); - - tcf_chain_hold_by_act(chain); - return chain; + return __tcf_chain_get(block, chain_index, true, true); } EXPORT_SYMBOL(tcf_chain_get_by_act); static void tc_chain_tmplt_del(struct tcf_chain *chain); -void tcf_chain_put(struct tcf_chain *chain) +static void __tcf_chain_put(struct tcf_chain *chain, bool by_act) { - if (--chain->refcnt == 0) { + if (by_act) + chain->action_refcnt--; + chain->refcnt--; + + /* The last dropped non-action reference will trigger notification. 
*/ + if (chain->refcnt - chain->action_refcnt == 0 && !by_act) tc_chain_notify(chain, NULL, 0, 0, RTM_DELCHAIN, false); + + if (chain->refcnt == 0) { tc_chain_tmplt_del(chain); tcf_chain_destroy(chain); } } + +void tcf_chain_put(struct tcf_chain *chain) +{ + __tcf_chain_put(chain, false); +} EXPORT_SYMBOL(tcf_chain_put); void tcf_chain_put_by_act(struct tcf_chain *chain) { - tcf_chain_release_by_act(chain); - tcf_chain_put(chain); + __tcf_chain_put(chain, true); } EXPORT_SYMBOL(tcf_chain_put_by_act); -- cgit v1.2.3 From 290b1c8b1a902c0902df9ec05577ab209296f345 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Wed, 1 Aug 2018 12:36:57 +0200 Subject: net: sched: make tcf_chain_{get,put}() static These are no longer used outside of cls_api.c so make them static. Move tcf_chain_flush() to avoid fwd declaration of tcf_chain_put(). Signed-off-by: Jiri Pirko v1->v2: - new patch Signed-off-by: David S. Miller --- include/net/pkt_cls.h | 3 --- net/sched/cls_api.c | 34 ++++++++++++++++------------------ 2 files changed, 16 insertions(+), 21 deletions(-) (limited to 'net/sched') diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h index 22bfc3a13c25..ef727f71336e 100644 --- a/include/net/pkt_cls.h +++ b/include/net/pkt_cls.h @@ -40,11 +40,8 @@ struct tcf_block_cb; bool tcf_queue_work(struct rcu_work *rwork, work_func_t func); #ifdef CONFIG_NET_CLS -struct tcf_chain *tcf_chain_get(struct tcf_block *block, u32 chain_index, - bool create); struct tcf_chain *tcf_chain_get_by_act(struct tcf_block *block, u32 chain_index); -void tcf_chain_put(struct tcf_chain *chain); void tcf_chain_put_by_act(struct tcf_chain *chain); void tcf_block_netif_keep_dst(struct tcf_block *block); int tcf_block_get(struct tcf_block **p_block, diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index b194a5abfc6a..e8b0bbd0883f 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -232,19 +232,6 @@ static void tcf_chain0_head_change(struct tcf_chain *chain, tcf_chain_head_change_item(item, tp_head); } -static void tcf_chain_flush(struct tcf_chain *chain) -{ - struct tcf_proto *tp = rtnl_dereference(chain->filter_chain); - - tcf_chain0_head_change(chain, NULL); - while (tp) { - RCU_INIT_POINTER(chain->filter_chain, tp->next); - tcf_proto_destroy(tp, NULL); - tp = rtnl_dereference(chain->filter_chain); - tcf_chain_put(chain); - } -} - static void tcf_chain_destroy(struct tcf_chain *chain) { struct tcf_block *block = chain->block; @@ -316,12 +303,11 @@ static struct tcf_chain *__tcf_chain_get(struct tcf_block *block, return chain; } -struct tcf_chain *tcf_chain_get(struct tcf_block *block, u32 chain_index, - bool create) +static struct tcf_chain *tcf_chain_get(struct tcf_block *block, u32 chain_index, + bool create) { return __tcf_chain_get(block, chain_index, create, false); } -EXPORT_SYMBOL(tcf_chain_get); struct tcf_chain *tcf_chain_get_by_act(struct tcf_block *block, u32 chain_index) { @@ -347,11 +333,10 @@ static void __tcf_chain_put(struct tcf_chain *chain, bool by_act) } } -void tcf_chain_put(struct tcf_chain *chain) +static void tcf_chain_put(struct tcf_chain *chain) { __tcf_chain_put(chain, false); } -EXPORT_SYMBOL(tcf_chain_put); void tcf_chain_put_by_act(struct tcf_chain *chain) { @@ -365,6 +350,19 @@ static void tcf_chain_put_explicitly_created(struct tcf_chain *chain) tcf_chain_put(chain); } +static void tcf_chain_flush(struct tcf_chain *chain) +{ + struct tcf_proto *tp = rtnl_dereference(chain->filter_chain); + + tcf_chain0_head_change(chain, NULL); + while (tp) { + RCU_INIT_POINTER(chain->filter_chain, 
tp->next); + tcf_proto_destroy(tp, NULL); + tp = rtnl_dereference(chain->filter_chain); + tcf_chain_put(chain); + } +} + static bool tcf_block_offload_in_use(struct tcf_block *block) { return block->offloadcnt; -- cgit v1.2.3 From 5ca8a25c144dbb04511147601943691baab0aaca Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Fri, 3 Aug 2018 11:08:47 +0200 Subject: net: sched: fix flush on non-existing chain User was able to perform filter flush on chain 0 even if it didn't have any filters in it. With the patch that avoided implicit chain 0 creation, this changed. So in case user wants filter flush on chain which does not exist, just return success. There's no reason for non-0 chains to behave differently than chain 0, so do the same for them. Reported-by: Ido Schimmel Fixes: f71e0ca4db18 ("net: sched: Avoid implicit chain 0 creation") Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- net/sched/cls_api.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'net/sched') diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index e8b0bbd0883f..194c2e0b2737 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -1389,6 +1389,13 @@ static int tc_del_tfilter(struct sk_buff *skb, struct nlmsghdr *n, } chain = tcf_chain_get(block, chain_index, false); if (!chain) { + /* User requested flush on non-existent chain. Nothing to do, + * so just return success. + */ + if (prio == 0) { + err = 0; + goto errout; + } NL_SET_ERR_MSG(extack, "Cannot find specified filter chain"); err = -EINVAL; goto errout; -- cgit v1.2.3 From 1cbc36a53b60d43daa686280385b1ddbe51d5809 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Fri, 3 Aug 2018 22:27:55 +0300 Subject: net: sched: cls_flower: Fix an error code in fl_tmplt_create() We forgot to set the error code on this path, so we return NULL instead of an error pointer. In the current code kzalloc() won't fail for small allocations so this doesn't really affect runtime. Fixes: b95ec7eb3b4d ("net: sched: cls_flower: implement chain templates") Signed-off-by: Dan Carpenter Signed-off-by: David S. Miller --- net/sched/cls_flower.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'net/sched') diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index e8bd08ba998a..a3b69bb6f4b0 100644 --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@ -1250,8 +1250,10 @@ static void *fl_tmplt_create(struct net *net, struct tcf_chain *chain, goto errout_tb; tmplt = kzalloc(sizeof(*tmplt), GFP_KERNEL); - if (!tmplt) + if (!tmplt) { + err = -ENOMEM; goto errout_tb; + } tmplt->chain = chain; err = fl_set_key(net, tb, &tmplt->dummy_key, &tmplt->mask, extack); if (err) -- cgit v1.2.3 From 0a6e77784f490912d81b92cfd48424541c04691e Mon Sep 17 00:00:00 2001 From: Pieter Jansen van Vuuren Date: Tue, 7 Aug 2018 17:36:01 +0200 Subject: net/sched: allow flower to match tunnel options Allow matching on options in Geneve tunnel headers. This makes use of existing tunnel metadata support. The options can be described in the form CLASS:TYPE:DATA/CLASS_MASK:TYPE_MASK:DATA_MASK, where CLASS is represented as a 16bit hexadecimal value, TYPE as an 8bit hexadecimal value and DATA as a variable length hexadecimal value. e.g. 
# ip link add name geneve0 type geneve dstport 0 external # tc qdisc add dev geneve0 ingress # tc filter add dev geneve0 protocol ip parent ffff: \ flower \ enc_src_ip 10.0.99.192 \ enc_dst_ip 10.0.99.193 \ enc_key_id 11 \ geneve_opts 0102:80:1122334421314151/ffff:ff:ffffffffffffffff \ ip_proto udp \ action mirred egress redirect dev eth1 This patch adds support for matching Geneve options in the order supplied by the user. This leads to an efficient implementation in the software datapath (and in our opinion hardware datapaths that offload this feature). It is also compatible with Geneve options matching provided by the Open vSwitch kernel datapath which is relevant here as the Flower classifier may be used as a mechanism to program flows into hardware as a form of Open vSwitch datapath offload (sometimes referred to as OVS-TC). The netlink Kernel/Userspace API may be extended, for example by adding a flag, if other matching options are desired, for example matching given options in any order. This would require an implementation in the TC software datapath. And be done in a way that drivers that facilitate offload of the Flower classifier can reject or accept such flows based on hardware datapath capabilities. This approach was discussed and agreed on at Netconf 2017 in Seoul. Signed-off-by: Simon Horman Signed-off-by: Pieter Jansen van Vuuren Acked-by: Jakub Kicinski Signed-off-by: David S. Miller --- include/uapi/linux/pkt_cls.h | 26 +++++ net/sched/cls_flower.c | 244 ++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 269 insertions(+), 1 deletion(-) (limited to 'net/sched') diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h index 48e5b5d49a34..be382fb0592d 100644 --- a/include/uapi/linux/pkt_cls.h +++ b/include/uapi/linux/pkt_cls.h @@ -480,11 +480,37 @@ enum { TCA_FLOWER_KEY_ENC_IP_TTL, /* u8 */ TCA_FLOWER_KEY_ENC_IP_TTL_MASK, /* u8 */ + TCA_FLOWER_KEY_ENC_OPTS, + TCA_FLOWER_KEY_ENC_OPTS_MASK, + __TCA_FLOWER_MAX, }; #define TCA_FLOWER_MAX (__TCA_FLOWER_MAX - 1) +enum { + TCA_FLOWER_KEY_ENC_OPTS_UNSPEC, + TCA_FLOWER_KEY_ENC_OPTS_GENEVE, /* Nested + * TCA_FLOWER_KEY_ENC_OPT_GENEVE_ + * attributes + */ + __TCA_FLOWER_KEY_ENC_OPTS_MAX, +}; + +#define TCA_FLOWER_KEY_ENC_OPTS_MAX (__TCA_FLOWER_KEY_ENC_OPTS_MAX - 1) + +enum { + TCA_FLOWER_KEY_ENC_OPT_GENEVE_UNSPEC, + TCA_FLOWER_KEY_ENC_OPT_GENEVE_CLASS, /* u16 */ + TCA_FLOWER_KEY_ENC_OPT_GENEVE_TYPE, /* u8 */ + TCA_FLOWER_KEY_ENC_OPT_GENEVE_DATA, /* 4 to 128 bytes */ + + __TCA_FLOWER_KEY_ENC_OPT_GENEVE_MAX, +}; + +#define TCA_FLOWER_KEY_ENC_OPT_GENEVE_MAX \ + (__TCA_FLOWER_KEY_ENC_OPT_GENEVE_MAX - 1) + enum { TCA_FLOWER_KEY_FLAGS_IS_FRAGMENT = (1 << 0), TCA_FLOWER_KEY_FLAGS_FRAG_IS_FIRST = (1 << 1), diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index a3b69bb6f4b0..9da244235170 100644 --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@ -24,6 +24,7 @@ #include #include #include +#include #include #include @@ -53,6 +54,7 @@ struct fl_flow_key { struct flow_dissector_key_tcp tcp; struct flow_dissector_key_ip ip; struct flow_dissector_key_ip enc_ip; + struct flow_dissector_key_enc_opts enc_opts; } __aligned(BITS_PER_LONG / 8); /* Ensure that we can do comparisons as longs. 
*/ struct fl_flow_mask_range { @@ -482,6 +484,21 @@ static const struct nla_policy fl_policy[TCA_FLOWER_MAX + 1] = { [TCA_FLOWER_KEY_ENC_IP_TOS_MASK] = { .type = NLA_U8 }, [TCA_FLOWER_KEY_ENC_IP_TTL] = { .type = NLA_U8 }, [TCA_FLOWER_KEY_ENC_IP_TTL_MASK] = { .type = NLA_U8 }, + [TCA_FLOWER_KEY_ENC_OPTS] = { .type = NLA_NESTED }, + [TCA_FLOWER_KEY_ENC_OPTS_MASK] = { .type = NLA_NESTED }, +}; + +static const struct nla_policy +enc_opts_policy[TCA_FLOWER_KEY_ENC_OPTS_MAX + 1] = { + [TCA_FLOWER_KEY_ENC_OPTS_GENEVE] = { .type = NLA_NESTED }, +}; + +static const struct nla_policy +geneve_opt_policy[TCA_FLOWER_KEY_ENC_OPT_GENEVE_MAX + 1] = { + [TCA_FLOWER_KEY_ENC_OPT_GENEVE_CLASS] = { .type = NLA_U16 }, + [TCA_FLOWER_KEY_ENC_OPT_GENEVE_TYPE] = { .type = NLA_U8 }, + [TCA_FLOWER_KEY_ENC_OPT_GENEVE_DATA] = { .type = NLA_BINARY, + .len = 128 }, }; static void fl_set_key_val(struct nlattr **tb, @@ -603,6 +620,145 @@ static void fl_set_key_ip(struct nlattr **tb, bool encap, fl_set_key_val(tb, &key->ttl, ttl_key, &mask->ttl, ttl_mask, sizeof(key->ttl)); } +static int fl_set_geneve_opt(const struct nlattr *nla, struct fl_flow_key *key, + int depth, int option_len, + struct netlink_ext_ack *extack) +{ + struct nlattr *tb[TCA_FLOWER_KEY_ENC_OPT_GENEVE_MAX + 1]; + struct nlattr *class = NULL, *type = NULL, *data = NULL; + struct geneve_opt *opt; + int err, data_len = 0; + + if (option_len > sizeof(struct geneve_opt)) + data_len = option_len - sizeof(struct geneve_opt); + + opt = (struct geneve_opt *)&key->enc_opts.data[key->enc_opts.len]; + memset(opt, 0xff, option_len); + opt->length = data_len / 4; + opt->r1 = 0; + opt->r2 = 0; + opt->r3 = 0; + + /* If no mask has been provided we assume an exact match. */ + if (!depth) + return sizeof(struct geneve_opt) + data_len; + + if (nla_type(nla) != TCA_FLOWER_KEY_ENC_OPTS_GENEVE) { + NL_SET_ERR_MSG(extack, "Non-geneve option type for mask"); + return -EINVAL; + } + + err = nla_parse_nested(tb, TCA_FLOWER_KEY_ENC_OPT_GENEVE_MAX, + nla, geneve_opt_policy, extack); + if (err < 0) + return err; + + /* We are not allowed to omit any of CLASS, TYPE or DATA + * fields from the key. + */ + if (!option_len && + (!tb[TCA_FLOWER_KEY_ENC_OPT_GENEVE_CLASS] || + !tb[TCA_FLOWER_KEY_ENC_OPT_GENEVE_TYPE] || + !tb[TCA_FLOWER_KEY_ENC_OPT_GENEVE_DATA])) { + NL_SET_ERR_MSG(extack, "Missing tunnel key geneve option class, type or data"); + return -EINVAL; + } + + /* Omitting any of CLASS, TYPE or DATA fields is allowed + * for the mask.
+ */ + if (tb[TCA_FLOWER_KEY_ENC_OPT_GENEVE_DATA]) { + int new_len = key->enc_opts.len; + + data = tb[TCA_FLOWER_KEY_ENC_OPT_GENEVE_DATA]; + data_len = nla_len(data); + if (data_len < 4) { + NL_SET_ERR_MSG(extack, "Tunnel key geneve option data is less than 4 bytes long"); + return -ERANGE; + } + if (data_len % 4) { + NL_SET_ERR_MSG(extack, "Tunnel key geneve option data is not a multiple of 4 bytes long"); + return -ERANGE; + } + + new_len += sizeof(struct geneve_opt) + data_len; + BUILD_BUG_ON(FLOW_DIS_TUN_OPTS_MAX != IP_TUNNEL_OPTS_MAX); + if (new_len > FLOW_DIS_TUN_OPTS_MAX) { + NL_SET_ERR_MSG(extack, "Tunnel options exceed max size"); + return -ERANGE; + } + opt->length = data_len / 4; + memcpy(opt->opt_data, nla_data(data), data_len); + } + + if (tb[TCA_FLOWER_KEY_ENC_OPT_GENEVE_CLASS]) { + class = tb[TCA_FLOWER_KEY_ENC_OPT_GENEVE_CLASS]; + opt->opt_class = nla_get_be16(class); + } + + if (tb[TCA_FLOWER_KEY_ENC_OPT_GENEVE_TYPE]) { + type = tb[TCA_FLOWER_KEY_ENC_OPT_GENEVE_TYPE]; + opt->type = nla_get_u8(type); + } + + return sizeof(struct geneve_opt) + data_len; +} + +static int fl_set_enc_opt(struct nlattr **tb, struct fl_flow_key *key, + struct fl_flow_key *mask, + struct netlink_ext_ack *extack) +{ + const struct nlattr *nla_enc_key, *nla_opt_key, *nla_opt_msk = NULL; + int option_len, key_depth, msk_depth = 0; + + nla_enc_key = nla_data(tb[TCA_FLOWER_KEY_ENC_OPTS]); + + if (tb[TCA_FLOWER_KEY_ENC_OPTS_MASK]) { + nla_opt_msk = nla_data(tb[TCA_FLOWER_KEY_ENC_OPTS_MASK]); + msk_depth = nla_len(tb[TCA_FLOWER_KEY_ENC_OPTS_MASK]); + } + + nla_for_each_attr(nla_opt_key, nla_enc_key, + nla_len(tb[TCA_FLOWER_KEY_ENC_OPTS]), key_depth) { + switch (nla_type(nla_opt_key)) { + case TCA_FLOWER_KEY_ENC_OPTS_GENEVE: + option_len = 0; + key->enc_opts.dst_opt_type = TUNNEL_GENEVE_OPT; + option_len = fl_set_geneve_opt(nla_opt_key, key, + key_depth, option_len, + extack); + if (option_len < 0) + return option_len; + + key->enc_opts.len += option_len; + /* At the same time we need to parse through the mask + * in order to verify exact and mask attribute lengths. 
+ */ + mask->enc_opts.dst_opt_type = TUNNEL_GENEVE_OPT; + option_len = fl_set_geneve_opt(nla_opt_msk, mask, + msk_depth, option_len, + extack); + if (option_len < 0) + return option_len; + + mask->enc_opts.len += option_len; + if (key->enc_opts.len != mask->enc_opts.len) { + NL_SET_ERR_MSG(extack, "Key and mask misaligned"); + return -EINVAL; + } + + if (msk_depth) + nla_opt_msk = nla_next(nla_opt_msk, &msk_depth); + break; + default: + NL_SET_ERR_MSG(extack, "Unknown tunnel option type"); + return -EINVAL; + } + } + + return 0; +} + static int fl_set_key(struct net *net, struct nlattr **tb, struct fl_flow_key *key, struct fl_flow_key *mask, struct netlink_ext_ack *extack) @@ -799,6 +955,12 @@ static int fl_set_key(struct net *net, struct nlattr **tb, fl_set_key_ip(tb, true, &key->enc_ip, &mask->enc_ip); + if (tb[TCA_FLOWER_KEY_ENC_OPTS]) { + ret = fl_set_enc_opt(tb, key, mask, extack); + if (ret) + return ret; + } + if (tb[TCA_FLOWER_KEY_FLAGS]) ret = fl_set_key_flags(tb, &key->control.flags, &mask->control.flags); @@ -894,6 +1056,8 @@ static void fl_init_dissector(struct flow_dissector *dissector, FLOW_DISSECTOR_KEY_ENC_PORTS, enc_tp); FL_KEY_SET_IF_MASKED(mask, keys, cnt, FLOW_DISSECTOR_KEY_ENC_IP, enc_ip); + FL_KEY_SET_IF_MASKED(mask, keys, cnt, + FLOW_DISSECTOR_KEY_ENC_OPTS, enc_opts); skb_flow_dissector_init(dissector, keys, cnt); } @@ -1414,6 +1578,83 @@ static int fl_dump_key_flags(struct sk_buff *skb, u32 flags_key, u32 flags_mask) return nla_put(skb, TCA_FLOWER_KEY_FLAGS_MASK, 4, &_mask); } +static int fl_dump_key_geneve_opt(struct sk_buff *skb, + struct flow_dissector_key_enc_opts *enc_opts) +{ + struct geneve_opt *opt; + struct nlattr *nest; + int opt_off = 0; + + nest = nla_nest_start(skb, TCA_FLOWER_KEY_ENC_OPTS_GENEVE); + if (!nest) + goto nla_put_failure; + + while (enc_opts->len > opt_off) { + opt = (struct geneve_opt *)&enc_opts->data[opt_off]; + + if (nla_put_be16(skb, TCA_FLOWER_KEY_ENC_OPT_GENEVE_CLASS, + opt->opt_class)) + goto nla_put_failure; + if (nla_put_u8(skb, TCA_FLOWER_KEY_ENC_OPT_GENEVE_TYPE, + opt->type)) + goto nla_put_failure; + if (nla_put(skb, TCA_FLOWER_KEY_ENC_OPT_GENEVE_DATA, + opt->length * 4, opt->opt_data)) + goto nla_put_failure; + + opt_off += sizeof(struct geneve_opt) + opt->length * 4; + } + nla_nest_end(skb, nest); + return 0; + +nla_put_failure: + nla_nest_cancel(skb, nest); + return -EMSGSIZE; +} + +static int fl_dump_key_options(struct sk_buff *skb, int enc_opt_type, + struct flow_dissector_key_enc_opts *enc_opts) +{ + struct nlattr *nest; + int err; + + if (!enc_opts->len) + return 0; + + nest = nla_nest_start(skb, enc_opt_type); + if (!nest) + goto nla_put_failure; + + switch (enc_opts->dst_opt_type) { + case TUNNEL_GENEVE_OPT: + err = fl_dump_key_geneve_opt(skb, enc_opts); + if (err) + goto nla_put_failure; + break; + default: + goto nla_put_failure; + } + nla_nest_end(skb, nest); + return 0; + +nla_put_failure: + nla_nest_cancel(skb, nest); + return -EMSGSIZE; +} + +static int fl_dump_key_enc_opt(struct sk_buff *skb, + struct flow_dissector_key_enc_opts *key_opts, + struct flow_dissector_key_enc_opts *msk_opts) +{ + int err; + + err = fl_dump_key_options(skb, TCA_FLOWER_KEY_ENC_OPTS, key_opts); + if (err) + return err; + + return fl_dump_key_options(skb, TCA_FLOWER_KEY_ENC_OPTS_MASK, msk_opts); +} + static int fl_dump_key(struct sk_buff *skb, struct net *net, struct fl_flow_key *key, struct fl_flow_key *mask) { @@ -1594,7 +1835,8 @@ static int fl_dump_key(struct sk_buff *skb, struct net *net, &mask->enc_tp.dst, 
TCA_FLOWER_KEY_ENC_UDP_DST_PORT_MASK, sizeof(key->enc_tp.dst)) || - fl_dump_key_ip(skb, true, &key->enc_ip, &mask->enc_ip)) + fl_dump_key_ip(skb, true, &key->enc_ip, &mask->enc_ip) || + fl_dump_key_enc_opt(skb, &key->enc_opts, &mask->enc_opts)) goto nla_put_failure; if (fl_dump_key_flags(skb, key->control.flags, mask->control.flags)) -- cgit v1.2.3 From 9ca6163005e6abdf2a39eb581abc6060f677a2d7 Mon Sep 17 00:00:00 2001 From: Vlad Buslov Date: Mon, 6 Aug 2018 11:27:10 +0300 Subject: net: sched: cls_flower: set correct offload data in fl_reoffload fl_reoffload implementation sets following members of struct tc_cls_flower_offload incorrectly: - masked key instead of mask - key instead of masked key Fix fl_reoffload to provide correct data to offload callback. Fixes: 31533cba4327 ("net: sched: cls_flower: implement offload tcf_proto_op") Signed-off-by: Vlad Buslov Acked-by: Jiri Pirko Signed-off-by: David S. Miller --- net/sched/cls_flower.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net/sched') diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index 9da244235170..6fd9bdd93796 100644 --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@ -1338,8 +1338,8 @@ static int fl_reoffload(struct tcf_proto *tp, bool add, tc_setup_cb_t *cb, TC_CLSFLOWER_REPLACE : TC_CLSFLOWER_DESTROY; cls_flower.cookie = (unsigned long)f; cls_flower.dissector = &mask->dissector; - cls_flower.mask = &f->mkey; - cls_flower.key = &f->key; + cls_flower.mask = &mask->key; + cls_flower.key = &f->mkey; cls_flower.exts = &f->exts; cls_flower.classid = f->res.classid; -- cgit v1.2.3 From 63cc5bcc9fc467bbe61cc9ee52509294bdf04c4b Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Wed, 8 Aug 2018 14:04:13 +0200 Subject: net: sched: fix block->refcnt decrement Currently the refcnt is never decremented in case the value is not 1. Fix it by adding decrement in case the refcnt is not 1. Reported-by: Vlad Buslov Fixes: f71e0ca4db18 ("net: sched: Avoid implicit chain 0 creation") Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- net/sched/cls_api.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net/sched') diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index 194c2e0b2737..f922ce27ed5e 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -780,6 +780,8 @@ void tcf_block_put_ext(struct tcf_block *block, struct Qdisc *q, block->refcnt--; if (list_empty(&block->chain_list)) kfree(block); + } else { + block->refcnt--; } } EXPORT_SYMBOL(tcf_block_put_ext); -- cgit v1.2.3 From 2142236b45843dbcbe9691d24cf06caff91a78fd Mon Sep 17 00:00:00 2001 From: Vlad Buslov Date: Fri, 10 Aug 2018 20:51:41 +0300 Subject: net: sched: act_bpf: remove dependency on rtnl lock Use tcf spinlock to protect bpf action private data from concurrent modification during dump and init. Remove rtnl lock assertion that is no longer necessary. Signed-off-by: Vlad Buslov Signed-off-by: David S. 
Miller --- net/sched/act_bpf.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'net/sched') diff --git a/net/sched/act_bpf.c b/net/sched/act_bpf.c index 6203eb075c9a..9e8a33f9fee3 100644 --- a/net/sched/act_bpf.c +++ b/net/sched/act_bpf.c @@ -143,11 +143,12 @@ static int tcf_bpf_dump(struct sk_buff *skb, struct tc_action *act, .index = prog->tcf_index, .refcnt = refcount_read(&prog->tcf_refcnt) - ref, .bindcnt = atomic_read(&prog->tcf_bindcnt) - bind, - .action = prog->tcf_action, }; struct tcf_t tm; int ret; + spin_lock(&prog->tcf_lock); + opt.action = prog->tcf_action; if (nla_put(skb, TCA_ACT_BPF_PARMS, sizeof(opt), &opt)) goto nla_put_failure; @@ -163,9 +164,11 @@ static int tcf_bpf_dump(struct sk_buff *skb, struct tc_action *act, TCA_ACT_BPF_PAD)) goto nla_put_failure; + spin_unlock(&prog->tcf_lock); return skb->len; nla_put_failure: + spin_unlock(&prog->tcf_lock); nlmsg_trim(skb, tp); return -1; } @@ -264,7 +267,7 @@ static void tcf_bpf_prog_fill_cfg(const struct tcf_bpf *prog, { cfg->is_ebpf = tcf_bpf_is_ebpf(prog); /* updates to prog->filter are prevented, since it's called either - * with rtnl lock or during final cleanup in rcu callback + * with tcf lock or during final cleanup in rcu callback */ cfg->filter = rcu_dereference_protected(prog->filter, 1); @@ -336,8 +339,8 @@ static int tcf_bpf_init(struct net *net, struct nlattr *nla, goto out; prog = to_bpf(*act); - ASSERT_RTNL(); + spin_lock(&prog->tcf_lock); if (res != ACT_P_CREATED) tcf_bpf_prog_fill_cfg(prog, &old); @@ -349,6 +352,7 @@ static int tcf_bpf_init(struct net *net, struct nlattr *nla, prog->tcf_action = parm->action; rcu_assign_pointer(prog->filter, cfg.filter); + spin_unlock(&prog->tcf_lock); if (res == ACT_P_CREATED) { tcf_idr_insert(tn, *act); -- cgit v1.2.3 From b6a2b971c0b00253197682fbdf1c55fc0e2610a4 Mon Sep 17 00:00:00 2001 From: Vlad Buslov Date: Fri, 10 Aug 2018 20:51:42 +0300 Subject: net: sched: act_csum: remove dependency on rtnl lock Use tcf lock to protect csum action struct private data from concurrent modification in init and dump. Use rcu swap operation to reassign params pointer under protection of tcf lock. (old params value is not used by init, so there is no need of standalone rcu dereference step) Remove rtnl assertion that is no longer necessary. Signed-off-by: Vlad Buslov Signed-off-by: David S. 
Miller --- net/sched/act_csum.c | 24 +++++++++++++++--------- 1 file changed, 15 insertions(+), 9 deletions(-) (limited to 'net/sched') diff --git a/net/sched/act_csum.c b/net/sched/act_csum.c index 648a3a35b720..f01c59ba6d12 100644 --- a/net/sched/act_csum.c +++ b/net/sched/act_csum.c @@ -50,7 +50,7 @@ static int tcf_csum_init(struct net *net, struct nlattr *nla, struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, csum_net_id); - struct tcf_csum_params *params_old, *params_new; + struct tcf_csum_params *params_new; struct nlattr *tb[TCA_CSUM_MAX + 1]; struct tc_csum *parm; struct tcf_csum *p; @@ -88,20 +88,22 @@ static int tcf_csum_init(struct net *net, struct nlattr *nla, } p = to_tcf_csum(*a); - ASSERT_RTNL(); params_new = kzalloc(sizeof(*params_new), GFP_KERNEL); if (unlikely(!params_new)) { tcf_idr_release(*a, bind); return -ENOMEM; } - params_old = rtnl_dereference(p->params); + params_new->update_flags = parm->update_flags; + spin_lock(&p->tcf_lock); p->tcf_action = parm->action; - params_new->update_flags = parm->update_flags; - rcu_assign_pointer(p->params, params_new); - if (params_old) - kfree_rcu(params_old, rcu); + rcu_swap_protected(p->params, params_new, + lockdep_is_held(&p->tcf_lock)); + spin_unlock(&p->tcf_lock); + + if (params_new) + kfree_rcu(params_new, rcu); if (ret == ACT_P_CREATED) tcf_idr_insert(tn, *a); @@ -599,11 +601,13 @@ static int tcf_csum_dump(struct sk_buff *skb, struct tc_action *a, int bind, .index = p->tcf_index, .refcnt = refcount_read(&p->tcf_refcnt) - ref, .bindcnt = atomic_read(&p->tcf_bindcnt) - bind, - .action = p->tcf_action, }; struct tcf_t t; - params = rtnl_dereference(p->params); + spin_lock(&p->tcf_lock); + params = rcu_dereference_protected(p->params, + lockdep_is_held(&p->tcf_lock)); + opt.action = p->tcf_action; opt.update_flags = params->update_flags; if (nla_put(skb, TCA_CSUM_PARMS, sizeof(opt), &opt)) @@ -612,10 +616,12 @@ static int tcf_csum_dump(struct sk_buff *skb, struct tc_action *a, int bind, tcf_tm_dump(&t, &p->tcf_tm); if (nla_put_64bit(skb, TCA_CSUM_TM, sizeof(t), &t, TCA_CSUM_PAD)) goto nla_put_failure; + spin_unlock(&p->tcf_lock); return skb->len; nla_put_failure: + spin_unlock(&p->tcf_lock); nlmsg_trim(skb, b); return -1; } -- cgit v1.2.3 From e8917f437006686b8fa1b9e54f31d7abc0ea7e97 Mon Sep 17 00:00:00 2001 From: Vlad Buslov Date: Fri, 10 Aug 2018 20:51:43 +0300 Subject: net: sched: act_gact: remove dependency on rtnl lock Use tcf spinlock to protect gact action private state from concurrent modification during dump and init. Remove rtnl assertion that is no longer necessary. Signed-off-by: Vlad Buslov Signed-off-by: David S. 
Miller --- net/sched/act_gact.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'net/sched') diff --git a/net/sched/act_gact.c b/net/sched/act_gact.c index 661b72b9147d..bfccd34a3968 100644 --- a/net/sched/act_gact.c +++ b/net/sched/act_gact.c @@ -113,7 +113,7 @@ static int tcf_gact_init(struct net *net, struct nlattr *nla, gact = to_gact(*a); - ASSERT_RTNL(); + spin_lock(&gact->tcf_lock); gact->tcf_action = parm->action; #ifdef CONFIG_GACT_PROB if (p_parm) { @@ -126,6 +126,8 @@ static int tcf_gact_init(struct net *net, struct nlattr *nla, gact->tcfg_ptype = p_parm->ptype; } #endif + spin_unlock(&gact->tcf_lock); + if (ret == ACT_P_CREATED) tcf_idr_insert(tn, *a); return ret; @@ -178,10 +180,11 @@ static int tcf_gact_dump(struct sk_buff *skb, struct tc_action *a, .index = gact->tcf_index, .refcnt = refcount_read(&gact->tcf_refcnt) - ref, .bindcnt = atomic_read(&gact->tcf_bindcnt) - bind, - .action = gact->tcf_action, }; struct tcf_t t; + spin_lock(&gact->tcf_lock); + opt.action = gact->tcf_action; if (nla_put(skb, TCA_GACT_PARMS, sizeof(opt), &opt)) goto nla_put_failure; #ifdef CONFIG_GACT_PROB @@ -199,9 +202,12 @@ static int tcf_gact_dump(struct sk_buff *skb, struct tc_action *a, tcf_tm_dump(&t, &gact->tcf_tm); if (nla_put_64bit(skb, TCA_GACT_TM, sizeof(t), &t, TCA_GACT_PAD)) goto nla_put_failure; + spin_unlock(&gact->tcf_lock); + return skb->len; nla_put_failure: + spin_unlock(&gact->tcf_lock); nlmsg_trim(skb, b); return -1; } -- cgit v1.2.3 From 54d0d423a48aa0e61bb39665d20376ba7b940535 Mon Sep 17 00:00:00 2001 From: Vlad Buslov Date: Fri, 10 Aug 2018 20:51:44 +0300 Subject: net: sched: act_ife: remove dependency on rtnl lock Use tcf spinlock and rcu to protect params pointer from concurrent modification during dump and init. Use rcu swap operation to reassign params pointer under protection of tcf lock. (old params value is not used by init, so there is no need of standalone rcu dereference step) Ife action has meta-actions that are compiled as standalone modules. Rtnl mutex must be released while loading a kernel module. In order to support execution without rtnl mutex, propagate 'rtnl_held' argument to meta action loading functions. When requesting meta action module, conditionally release rtnl lock depending on 'rtnl_held' argument. Signed-off-by: Vlad Buslov Signed-off-by: David S. 
Miller --- net/sched/act_ife.c | 40 +++++++++++++++++++++++++--------------- 1 file changed, 25 insertions(+), 15 deletions(-) (limited to 'net/sched') diff --git a/net/sched/act_ife.c b/net/sched/act_ife.c index df4060e32d43..5d200495e467 100644 --- a/net/sched/act_ife.c +++ b/net/sched/act_ife.c @@ -268,7 +268,8 @@ static const char *ife_meta_id2name(u32 metaid) * under ife->tcf_lock for existing action */ static int load_metaops_and_vet(struct tcf_ife_info *ife, u32 metaid, - void *val, int len, bool exists) + void *val, int len, bool exists, + bool rtnl_held) { struct tcf_meta_ops *ops = find_ife_oplist(metaid); int ret = 0; @@ -278,9 +279,11 @@ static int load_metaops_and_vet(struct tcf_ife_info *ife, u32 metaid, #ifdef CONFIG_MODULES if (exists) spin_unlock_bh(&ife->tcf_lock); - rtnl_unlock(); + if (rtnl_held) + rtnl_unlock(); request_module("ife-meta-%s", ife_meta_id2name(metaid)); - rtnl_lock(); + if (rtnl_held) + rtnl_lock(); if (exists) spin_lock_bh(&ife->tcf_lock); ops = find_ife_oplist(metaid); @@ -421,7 +424,7 @@ static void tcf_ife_cleanup(struct tc_action *a) /* under ife->tcf_lock for existing action */ static int populate_metalist(struct tcf_ife_info *ife, struct nlattr **tb, - bool exists) + bool exists, bool rtnl_held) { int len = 0; int rc = 0; @@ -433,7 +436,8 @@ static int populate_metalist(struct tcf_ife_info *ife, struct nlattr **tb, val = nla_data(tb[i]); len = nla_len(tb[i]); - rc = load_metaops_and_vet(ife, i, val, len, exists); + rc = load_metaops_and_vet(ife, i, val, len, exists, + rtnl_held); if (rc != 0) return rc; @@ -454,7 +458,7 @@ static int tcf_ife_init(struct net *net, struct nlattr *nla, struct tc_action_net *tn = net_generic(net, ife_net_id); struct nlattr *tb[TCA_IFE_MAX + 1]; struct nlattr *tb2[IFE_META_MAX + 1]; - struct tcf_ife_params *p, *p_old; + struct tcf_ife_params *p; struct tcf_ife_info *ife; u16 ife_type = ETH_P_IFE; struct tc_ife *parm; @@ -558,7 +562,7 @@ metadata_parse_err: return err; } - err = populate_metalist(ife, tb2, exists); + err = populate_metalist(ife, tb2, exists, rtnl_held); if (err) goto metadata_parse_err; @@ -581,13 +585,13 @@ metadata_parse_err: } ife->tcf_action = parm->action; + /* protected by tcf_lock when modifying existing action */ + rcu_swap_protected(ife->params, p, 1); + if (exists) spin_unlock_bh(&ife->tcf_lock); - - p_old = rtnl_dereference(ife->params); - rcu_assign_pointer(ife->params, p); - if (p_old) - kfree_rcu(p_old, rcu); + if (p) + kfree_rcu(p, rcu); if (ret == ACT_P_CREATED) tcf_idr_insert(tn, *a); @@ -600,16 +604,20 @@ static int tcf_ife_dump(struct sk_buff *skb, struct tc_action *a, int bind, { unsigned char *b = skb_tail_pointer(skb); struct tcf_ife_info *ife = to_ife(a); - struct tcf_ife_params *p = rtnl_dereference(ife->params); + struct tcf_ife_params *p; struct tc_ife opt = { .index = ife->tcf_index, .refcnt = refcount_read(&ife->tcf_refcnt) - ref, .bindcnt = atomic_read(&ife->tcf_bindcnt) - bind, - .action = ife->tcf_action, - .flags = p->flags, }; struct tcf_t t; + spin_lock_bh(&ife->tcf_lock); + opt.action = ife->tcf_action; + p = rcu_dereference_protected(ife->params, + lockdep_is_held(&ife->tcf_lock)); + opt.flags = p->flags; + if (nla_put(skb, TCA_IFE_PARMS, sizeof(opt), &opt)) goto nla_put_failure; @@ -635,9 +643,11 @@ static int tcf_ife_dump(struct sk_buff *skb, struct tc_action *a, int bind, pr_info("Failed to dump metalist\n"); } + spin_unlock_bh(&ife->tcf_lock); return skb->len; nla_put_failure: + spin_unlock_bh(&ife->tcf_lock); nlmsg_trim(skb, b); return -1; } -- cgit v1.2.3 From 
ff25276de997f41197ebab91935627c249a30fc4 Mon Sep 17 00:00:00 2001 From: Vlad Buslov Date: Fri, 10 Aug 2018 20:51:45 +0300 Subject: net: sched: act_ipt: remove dependency on rtnl lock Use tcf spinlock to protect ipt action private data from concurrent modification during dump. Ipt init already takes tcf spinlock when modifying ipt state. Signed-off-by: Vlad Buslov Signed-off-by: David S. Miller --- net/sched/act_ipt.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net/sched') diff --git a/net/sched/act_ipt.c b/net/sched/act_ipt.c index 0dc787a57798..e149f0e66cb6 100644 --- a/net/sched/act_ipt.c +++ b/net/sched/act_ipt.c @@ -288,6 +288,7 @@ static int tcf_ipt_dump(struct sk_buff *skb, struct tc_action *a, int bind, * for foolproof you need to not assume this */ + spin_lock_bh(&ipt->tcf_lock); t = kmemdup(ipt->tcfi_t, ipt->tcfi_t->u.user.target_size, GFP_ATOMIC); if (unlikely(!t)) goto nla_put_failure; @@ -307,10 +308,12 @@ static int tcf_ipt_dump(struct sk_buff *skb, struct tc_action *a, int bind, if (nla_put_64bit(skb, TCA_IPT_TM, sizeof(tm), &tm, TCA_IPT_PAD)) goto nla_put_failure; + spin_unlock_bh(&ipt->tcf_lock); kfree(t); return skb->len; nla_put_failure: + spin_unlock_bh(&ipt->tcf_lock); nlmsg_trim(skb, b); kfree(t); return -1; -- cgit v1.2.3 From 67b0c1a3c9ced3726dea73000f8900f453fc894f Mon Sep 17 00:00:00 2001 From: Vlad Buslov Date: Fri, 10 Aug 2018 20:51:46 +0300 Subject: net: sched: act_pedit: remove dependency on rtnl lock Rearrange pedit init code to only access pedit action data while holding tcf spinlock. Change keys allocation type to atomic to allow it to execute while holding tcf spinlock. Take tcf spinlock in dump function when accessing pedit action data. Signed-off-by: Vlad Buslov Signed-off-by: David S. Miller --- net/sched/act_pedit.c | 40 ++++++++++++++++++++-------------------- 1 file changed, 20 insertions(+), 20 deletions(-) (limited to 'net/sched') diff --git a/net/sched/act_pedit.c b/net/sched/act_pedit.c index 43ba999b2d23..3f62da72ab6a 100644 --- a/net/sched/act_pedit.c +++ b/net/sched/act_pedit.c @@ -187,44 +187,38 @@ static int tcf_pedit_init(struct net *net, struct nlattr *nla, tcf_idr_cleanup(tn, parm->index); goto out_free; } - p = to_pedit(*a); - keys = kmalloc(ksize, GFP_KERNEL); - if (!keys) { - tcf_idr_release(*a, bind); - ret = -ENOMEM; - goto out_free; - } ret = ACT_P_CREATED; } else if (err > 0) { if (bind) goto out_free; if (!ovr) { - tcf_idr_release(*a, bind); ret = -EEXIST; - goto out_free; - } - p = to_pedit(*a); - if (p->tcfp_nkeys && p->tcfp_nkeys != parm->nkeys) { - keys = kmalloc(ksize, GFP_KERNEL); - if (!keys) { - ret = -ENOMEM; - goto out_free; - } + goto out_release; } } else { return err; } + p = to_pedit(*a); spin_lock_bh(&p->tcf_lock); - p->tcfp_flags = parm->flags; - p->tcf_action = parm->action; - if (keys) { + + if (ret == ACT_P_CREATED || + (p->tcfp_nkeys && p->tcfp_nkeys != parm->nkeys)) { + keys = kmalloc(ksize, GFP_ATOMIC); + if (!keys) { + spin_unlock_bh(&p->tcf_lock); + ret = -ENOMEM; + goto out_release; + } kfree(p->tcfp_keys); p->tcfp_keys = keys; p->tcfp_nkeys = parm->nkeys; } memcpy(p->tcfp_keys, parm->keys, ksize); + p->tcfp_flags = parm->flags; + p->tcf_action = parm->action; + kfree(p->tcfp_keys_ex); p->tcfp_keys_ex = keys_ex; @@ -232,6 +226,9 @@ static int tcf_pedit_init(struct net *net, struct nlattr *nla, if (ret == ACT_P_CREATED) tcf_idr_insert(tn, *a); return ret; + +out_release: + tcf_idr_release(*a, bind); out_free: kfree(keys_ex); return ret; @@ -410,6 +407,7 @@ static int tcf_pedit_dump(struct sk_buff 
*skb, struct tc_action *a, if (unlikely(!opt)) return -ENOBUFS; + spin_lock_bh(&p->tcf_lock); memcpy(opt->keys, p->tcfp_keys, p->tcfp_nkeys * sizeof(struct tc_pedit_key)); opt->index = p->tcf_index; @@ -432,11 +430,13 @@ static int tcf_pedit_dump(struct sk_buff *skb, struct tc_action *a, tcf_tm_dump(&t, &p->tcf_tm); if (nla_put_64bit(skb, TCA_PEDIT_TM, sizeof(t), &t, TCA_PEDIT_PAD)) goto nla_put_failure; + spin_unlock_bh(&p->tcf_lock); kfree(opt); return skb->len; nla_put_failure: + spin_unlock_bh(&p->tcf_lock); nlmsg_trim(skb, b); kfree(opt); return -1; -- cgit v1.2.3 From d7728495665601658c7f94f3b5fa4e3f54d71c18 Mon Sep 17 00:00:00 2001 From: Vlad Buslov Date: Fri, 10 Aug 2018 20:51:47 +0300 Subject: net: sched: act_sample: remove dependency on rtnl lock Use tcf spinlock to protect private sample action data from concurrent modification during dump and init. Signed-off-by: Vlad Buslov Signed-off-by: David S. Miller --- net/sched/act_sample.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) (limited to 'net/sched') diff --git a/net/sched/act_sample.c b/net/sched/act_sample.c index 2608ccc83e5e..81071afe1b43 100644 --- a/net/sched/act_sample.c +++ b/net/sched/act_sample.c @@ -80,11 +80,13 @@ static int tcf_sample_init(struct net *net, struct nlattr *nla, } s = to_sample(*a); + spin_lock(&s->tcf_lock); s->tcf_action = parm->action; s->rate = nla_get_u32(tb[TCA_SAMPLE_RATE]); s->psample_group_num = nla_get_u32(tb[TCA_SAMPLE_PSAMPLE_GROUP]); psample_group = psample_group_get(net, s->psample_group_num); if (!psample_group) { + spin_unlock(&s->tcf_lock); tcf_idr_release(*a, bind); return -ENOMEM; } @@ -94,6 +96,7 @@ static int tcf_sample_init(struct net *net, struct nlattr *nla, s->truncate = true; s->trunc_size = nla_get_u32(tb[TCA_SAMPLE_TRUNC_SIZE]); } + spin_unlock(&s->tcf_lock); if (ret == ACT_P_CREATED) tcf_idr_insert(tn, *a); @@ -105,7 +108,8 @@ static void tcf_sample_cleanup(struct tc_action *a) struct tcf_sample *s = to_sample(a); struct psample_group *psample_group; - psample_group = rtnl_dereference(s->psample_group); + /* last reference to action, no need to lock */ + psample_group = rcu_dereference_protected(s->psample_group, 1); RCU_INIT_POINTER(s->psample_group, NULL); if (psample_group) psample_group_put(psample_group); @@ -174,12 +178,13 @@ static int tcf_sample_dump(struct sk_buff *skb, struct tc_action *a, struct tcf_sample *s = to_sample(a); struct tc_sample opt = { .index = s->tcf_index, - .action = s->tcf_action, .refcnt = refcount_read(&s->tcf_refcnt) - ref, .bindcnt = atomic_read(&s->tcf_bindcnt) - bind, }; struct tcf_t t; + spin_lock(&s->tcf_lock); + opt.action = s->tcf_action; if (nla_put(skb, TCA_SAMPLE_PARMS, sizeof(opt), &opt)) goto nla_put_failure; @@ -196,9 +201,12 @@ static int tcf_sample_dump(struct sk_buff *skb, struct tc_action *a, if (nla_put_u32(skb, TCA_SAMPLE_PSAMPLE_GROUP, s->psample_group_num)) goto nla_put_failure; + spin_unlock(&s->tcf_lock); + return skb->len; nla_put_failure: + spin_unlock(&s->tcf_lock); nlmsg_trim(skb, b); return -1; } -- cgit v1.2.3 From 5e48180ed8bcfa60e02887ba801307caf14bbe40 Mon Sep 17 00:00:00 2001 From: Vlad Buslov Date: Fri, 10 Aug 2018 20:51:48 +0300 Subject: net: sched: act_simple: remove dependency on rtnl lock Use tcf spinlock to protect private simple action data from concurrent modification during dump. (simple init already uses tcf spinlock when changing action state) Signed-off-by: Vlad Buslov Signed-off-by: David S. 
Miller --- net/sched/act_simple.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'net/sched') diff --git a/net/sched/act_simple.c b/net/sched/act_simple.c index aa51152e0066..18e4452574cd 100644 --- a/net/sched/act_simple.c +++ b/net/sched/act_simple.c @@ -156,10 +156,11 @@ static int tcf_simp_dump(struct sk_buff *skb, struct tc_action *a, .index = d->tcf_index, .refcnt = refcount_read(&d->tcf_refcnt) - ref, .bindcnt = atomic_read(&d->tcf_bindcnt) - bind, - .action = d->tcf_action, }; struct tcf_t t; + spin_lock_bh(&d->tcf_lock); + opt.action = d->tcf_action; if (nla_put(skb, TCA_DEF_PARMS, sizeof(opt), &opt) || nla_put_string(skb, TCA_DEF_DATA, d->tcfd_defdata)) goto nla_put_failure; @@ -167,9 +168,12 @@ static int tcf_simp_dump(struct sk_buff *skb, struct tc_action *a, tcf_tm_dump(&t, &d->tcf_tm); if (nla_put_64bit(skb, TCA_DEF_TM, sizeof(t), &t, TCA_DEF_PAD)) goto nla_put_failure; + spin_unlock_bh(&d->tcf_lock); + return skb->len; nla_put_failure: + spin_unlock_bh(&d->tcf_lock); nlmsg_trim(skb, b); return -1; } -- cgit v1.2.3 From c8814552fe51358f5fc46bc1c4aa4bb68454f4eb Mon Sep 17 00:00:00 2001 From: Vlad Buslov Date: Fri, 10 Aug 2018 20:51:49 +0300 Subject: net: sched: act_skbmod: remove dependency on rtnl lock Move read of skbmod_p rcu pointer to be protected by tcf spinlock. Use tcf spinlock to protect private skbmod data from concurrent modification during dump. Signed-off-by: Vlad Buslov Signed-off-by: David S. Miller --- net/sched/act_skbmod.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) (limited to 'net/sched') diff --git a/net/sched/act_skbmod.c b/net/sched/act_skbmod.c index c437c6d51a71..e9c86ade3b40 100644 --- a/net/sched/act_skbmod.c +++ b/net/sched/act_skbmod.c @@ -156,7 +156,6 @@ static int tcf_skbmod_init(struct net *net, struct nlattr *nla, d = to_skbmod(*a); - ASSERT_RTNL(); p = kzalloc(sizeof(struct tcf_skbmod_params), GFP_KERNEL); if (unlikely(!p)) { tcf_idr_release(*a, bind); @@ -166,10 +165,10 @@ static int tcf_skbmod_init(struct net *net, struct nlattr *nla, p->flags = lflags; d->tcf_action = parm->action; - p_old = rtnl_dereference(d->skbmod_p); - if (ovr) spin_lock_bh(&d->tcf_lock); + /* Protected by tcf_lock if overwriting existing action. 
*/ + p_old = rcu_dereference_protected(d->skbmod_p, 1); if (lflags & SKBMOD_F_DMAC) ether_addr_copy(p->eth_dst, daddr); @@ -205,15 +204,18 @@ static int tcf_skbmod_dump(struct sk_buff *skb, struct tc_action *a, { struct tcf_skbmod *d = to_skbmod(a); unsigned char *b = skb_tail_pointer(skb); - struct tcf_skbmod_params *p = rtnl_dereference(d->skbmod_p); + struct tcf_skbmod_params *p; struct tc_skbmod opt = { .index = d->tcf_index, .refcnt = refcount_read(&d->tcf_refcnt) - ref, .bindcnt = atomic_read(&d->tcf_bindcnt) - bind, - .action = d->tcf_action, }; struct tcf_t t; + spin_lock_bh(&d->tcf_lock); + opt.action = d->tcf_action; + p = rcu_dereference_protected(d->skbmod_p, + lockdep_is_held(&d->tcf_lock)); opt.flags = p->flags; if (nla_put(skb, TCA_SKBMOD_PARMS, sizeof(opt), &opt)) goto nla_put_failure; @@ -231,8 +233,10 @@ static int tcf_skbmod_dump(struct sk_buff *skb, struct tc_action *a, if (nla_put_64bit(skb, TCA_SKBMOD_TM, sizeof(t), &t, TCA_SKBMOD_PAD)) goto nla_put_failure; + spin_unlock_bh(&d->tcf_lock); return skb->len; nla_put_failure: + spin_unlock_bh(&d->tcf_lock); nlmsg_trim(skb, b); return -1; } -- cgit v1.2.3 From 729e01260989cc06c8a78491b46545793aef323a Mon Sep 17 00:00:00 2001 From: Vlad Buslov Date: Fri, 10 Aug 2018 20:51:50 +0300 Subject: net: sched: act_tunnel_key: remove dependency on rtnl lock Use tcf lock to protect tunnel key action struct private data from concurrent modification in init and dump. Use rcu swap operation to reassign params pointer under protection of tcf lock. (old params value is not used by init, so there is no need of standalone rcu dereference step) Remove rtnl lock assertion that is no longer required. Signed-off-by: Vlad Buslov Signed-off-by: David S. Miller --- net/sched/act_tunnel_key.c | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) (limited to 'net/sched') diff --git a/net/sched/act_tunnel_key.c b/net/sched/act_tunnel_key.c index d42d9e112789..ba2ae9f75ef5 100644 --- a/net/sched/act_tunnel_key.c +++ b/net/sched/act_tunnel_key.c @@ -204,7 +204,6 @@ static int tunnel_key_init(struct net *net, struct nlattr *nla, { struct tc_action_net *tn = net_generic(net, tunnel_key_net_id); struct nlattr *tb[TCA_TUNNEL_KEY_MAX + 1]; - struct tcf_tunnel_key_params *params_old; struct tcf_tunnel_key_params *params_new; struct metadata_dst *metadata = NULL; struct tc_tunnel_key *parm; @@ -346,24 +345,22 @@ static int tunnel_key_init(struct net *net, struct nlattr *nla, t = to_tunnel_key(*a); - ASSERT_RTNL(); params_new = kzalloc(sizeof(*params_new), GFP_KERNEL); if (unlikely(!params_new)) { tcf_idr_release(*a, bind); NL_SET_ERR_MSG(extack, "Cannot allocate tunnel key parameters"); return -ENOMEM; } - - params_old = rtnl_dereference(t->params); - - t->tcf_action = parm->action; params_new->tcft_action = parm->t_action; params_new->tcft_enc_metadata = metadata; - rcu_assign_pointer(t->params, params_new); - - if (params_old) - kfree_rcu(params_old, rcu); + spin_lock(&t->tcf_lock); + t->tcf_action = parm->action; + rcu_swap_protected(t->params, params_new, + lockdep_is_held(&t->tcf_lock)); + spin_unlock(&t->tcf_lock); + if (params_new) + kfree_rcu(params_new, rcu); if (ret == ACT_P_CREATED) tcf_idr_insert(tn, *a); @@ -485,12 +482,13 @@ static int tunnel_key_dump(struct sk_buff *skb, struct tc_action *a, .index = t->tcf_index, .refcnt = refcount_read(&t->tcf_refcnt) - ref, .bindcnt = atomic_read(&t->tcf_bindcnt) - bind, - .action = t->tcf_action, }; struct tcf_t tm; - params = rtnl_dereference(t->params); - + 
spin_lock(&t->tcf_lock); + params = rcu_dereference_protected(t->params, + lockdep_is_held(&t->tcf_lock)); + opt.action = t->tcf_action; opt.t_action = params->tcft_action; if (nla_put(skb, TCA_TUNNEL_KEY_PARMS, sizeof(opt), &opt)) @@ -522,10 +520,12 @@ static int tunnel_key_dump(struct sk_buff *skb, struct tc_action *a, if (nla_put_64bit(skb, TCA_TUNNEL_KEY_TM, sizeof(tm), &tm, TCA_TUNNEL_KEY_PAD)) goto nla_put_failure; + spin_unlock(&t->tcf_lock); return skb->len; nla_put_failure: + spin_unlock(&t->tcf_lock); nlmsg_trim(skb, b); return -1; } -- cgit v1.2.3 From 764e9a24480f6ffba5493fb21e6a7b030d6b8b67 Mon Sep 17 00:00:00 2001 From: Vlad Buslov Date: Fri, 10 Aug 2018 20:51:51 +0300 Subject: net: sched: act_vlan: remove dependency on rtnl lock Use tcf spinlock to protect vlan action private data from concurrent modification during dump and init. Use rcu swap operation to reassign params pointer under protection of tcf lock. (old params value is not used by init, so there is no need for a standalone rcu dereference step) Remove rtnl assertion that is no longer necessary. Signed-off-by: Vlad Buslov Signed-off-by: David S. Miller --- net/sched/act_vlan.c | 27 +++++++++++++++------------ 1 file changed, 15 insertions(+), 12 deletions(-) (limited to 'net/sched') diff --git a/net/sched/act_vlan.c b/net/sched/act_vlan.c index 15a0ee214c9c..5bde17fe3608 100644 --- a/net/sched/act_vlan.c +++ b/net/sched/act_vlan.c @@ -109,7 +109,7 @@ static int tcf_vlan_init(struct net *net, struct nlattr *nla, { struct tc_action_net *tn = net_generic(net, vlan_net_id); struct nlattr *tb[TCA_VLAN_MAX + 1]; - struct tcf_vlan_params *p, *p_old; + struct tcf_vlan_params *p; struct tc_vlan *parm; struct tcf_vlan *v; int action; @@ -202,26 +202,24 @@ static int tcf_vlan_init(struct net *net, struct nlattr *nla, v = to_vlan(*a); - ASSERT_RTNL(); p = kzalloc(sizeof(*p), GFP_KERNEL); if (!p) { tcf_idr_release(*a, bind); return -ENOMEM; } - v->tcf_action = parm->action; - - p_old = rtnl_dereference(v->vlan_p); - p->tcfv_action = action; p->tcfv_push_vid = push_vid; p->tcfv_push_prio = push_prio; p->tcfv_push_proto = push_proto; - rcu_assign_pointer(v->vlan_p, p); + spin_lock(&v->tcf_lock); + v->tcf_action = parm->action; + rcu_swap_protected(v->vlan_p, p, lockdep_is_held(&v->tcf_lock)); + spin_unlock(&v->tcf_lock); - if (p_old) - kfree_rcu(p_old, rcu); + if (p) + kfree_rcu(p, rcu); if (ret == ACT_P_CREATED) tcf_idr_insert(tn, *a); @@ -243,16 +241,18 @@ static int tcf_vlan_dump(struct sk_buff *skb, struct tc_action *a, { unsigned char *b = skb_tail_pointer(skb); struct tcf_vlan *v = to_vlan(a); - struct tcf_vlan_params *p = rtnl_dereference(v->vlan_p); + struct tcf_vlan_params *p; struct tc_vlan opt = { .index = v->tcf_index, .refcnt = refcount_read(&v->tcf_refcnt) - ref, .bindcnt = atomic_read(&v->tcf_bindcnt) - bind, - .action = v->tcf_action, - .v_action = p->tcfv_action, }; struct tcf_t t; + spin_lock(&v->tcf_lock); + opt.action = v->tcf_action; + p = rcu_dereference_protected(v->vlan_p, lockdep_is_held(&v->tcf_lock)); + opt.v_action = p->tcfv_action; if (nla_put(skb, TCA_VLAN_PARMS, sizeof(opt), &opt)) goto nla_put_failure; @@ -268,9 +268,12 @@ static int tcf_vlan_dump(struct sk_buff *skb, struct tc_action *a, tcf_tm_dump(&t, &v->tcf_tm); if (nla_put_64bit(skb, TCA_VLAN_TM, sizeof(t), &t, TCA_VLAN_PAD)) goto nla_put_failure; + spin_unlock(&v->tcf_lock); + return skb->len; nla_put_failure: + spin_unlock(&v->tcf_lock); nlmsg_trim(skb, b); return -1; } -- cgit v1.2.3
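The params update pattern used by the csum, ife, tunnel_key and vlan conversions is the same each time, and is compact enough to sketch in isolation. A minimal sketch with hypothetical tcf_example types, not code from any single patch:

struct tcf_example_params {
	u32 flags;
	struct rcu_head rcu;
};

struct tcf_example {
	spinlock_t tcf_lock;
	struct tcf_example_params __rcu *params;
};

static void tcf_example_update(struct tcf_example *ex,
			       struct tcf_example_params *new)
{
	spin_lock(&ex->tcf_lock);
	/* Publish 'new'; after the swap, 'new' holds the old pointer. */
	rcu_swap_protected(ex->params, new,
			   lockdep_is_held(&ex->tcf_lock));
	spin_unlock(&ex->tcf_lock);

	/* Free the displaced params outside the lock, once datapath
	 * readers that used rcu_dereference() are done with them.
	 */
	if (new)
		kfree_rcu(new, rcu);
}

Dump paths take the same spinlock and use rcu_dereference_protected(), as the diffs above show, so a reader never observes a half-updated action.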
From 84a75b329be84c108a21ab9c02a52a9bf9e5a919 Mon Sep 17 00:00:00 2001 From: Vlad Buslov Date: Fri, 10 Aug 2018 20:51:52 +0300 Subject: net: sched: extend action ops with put_dev callback As a preparation for removing the dependency on rtnl lock from the rules update path, all users of shared objects must take a reference while working with them. Extend action ops with a put_dev() API to be used on the net device returned by get_dev(). Modify the mirred action (the only action that implements the get_dev callback): - Take a reference to the net device in get_dev. - Implement the put_dev API that releases the reference to the net device. Signed-off-by: Vlad Buslov Signed-off-by: David S. Miller --- include/net/act_api.h | 1 + net/sched/act_mirred.c | 12 +++++++++++- net/sched/cls_api.c | 1 + 3 files changed, 13 insertions(+), 1 deletion(-) (limited to 'net/sched') diff --git a/include/net/act_api.h b/include/net/act_api.h index 8c9bc02d05e1..1ad5b19e83a9 100644 --- a/include/net/act_api.h +++ b/include/net/act_api.h @@ -101,6 +101,7 @@ struct tc_action_ops { void (*stats_update)(struct tc_action *, u64, u32, u64); size_t (*get_fill_size)(const struct tc_action *act); struct net_device *(*get_dev)(const struct tc_action *a); + void (*put_dev)(struct net_device *dev); int (*delete)(struct net *net, u32 index); }; diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c index b26d060da08e..7a045cc7fe3b 100644 --- a/net/sched/act_mirred.c +++ b/net/sched/act_mirred.c @@ -358,8 +358,17 @@ static struct notifier_block mirred_device_notifier = { static struct net_device *tcf_mirred_get_dev(const struct tc_action *a) { struct tcf_mirred *m = to_mirred(a); + struct net_device *dev = rtnl_dereference(m->tcfm_dev); + + if (dev) + dev_hold(dev); - return rtnl_dereference(m->tcfm_dev); + return dev; +} + +static void tcf_mirred_put_dev(struct net_device *dev) +{ + dev_put(dev); } static int tcf_mirred_delete(struct net *net, u32 index) @@ -382,6 +391,7 @@ static struct tc_action_ops act_mirred_ops = { .lookup = tcf_mirred_search, .size = sizeof(struct tcf_mirred), .get_dev = tcf_mirred_get_dev, + .put_dev = tcf_mirred_put_dev, .delete = tcf_mirred_delete, }; diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index f922ce27ed5e..31bd1439cf60 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -2176,6 +2176,7 @@ static int tc_exts_setup_cb_egdev_call(struct tcf_exts *exts, if (!dev) continue; ret = tc_setup_cb_egdev_call(dev, type, type_data, err_stop); + a->ops->put_dev(dev); if (ret < 0) return ret; ok_count += ret; -- cgit v1.2.3 From 4e232818bd32b29f15bef532f320a14367d172b4 Mon Sep 17 00:00:00 2001 From: Vlad Buslov Date: Fri, 10 Aug 2018 20:51:53 +0300 Subject: net: sched: act_mirred: remove dependency on rtnl lock Re-introduce the mirred list spinlock, which was removed some time ago, in order to protect the list from concurrent modifications, instead of relying on rtnl lock. Use tcf spinlock to protect mirred action private data from concurrent modification in init and dump. Rearrange access to mirred data so that it is performed only while holding the lock. Rearrange net dev access to always hold a reference while working with it, instead of relying on rtnl lock. Signed-off-by: Vlad Buslov Signed-off-by: David S. 
Miller --- net/sched/act_mirred.c | 78 +++++++++++++++++++++++++++++++++----------------- 1 file changed, 51 insertions(+), 27 deletions(-) (limited to 'net/sched') diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c index 7a045cc7fe3b..327be257033d 100644 --- a/net/sched/act_mirred.c +++ b/net/sched/act_mirred.c @@ -30,6 +30,7 @@ #include static LIST_HEAD(mirred_list); +static DEFINE_SPINLOCK(mirred_list_lock); static bool tcf_mirred_is_act_redirect(int action) { @@ -62,13 +63,23 @@ static bool tcf_mirred_can_reinsert(int action) return false; } +static struct net_device *tcf_mirred_dev_dereference(struct tcf_mirred *m) +{ + return rcu_dereference_protected(m->tcfm_dev, + lockdep_is_held(&m->tcf_lock)); +} + static void tcf_mirred_release(struct tc_action *a) { struct tcf_mirred *m = to_mirred(a); struct net_device *dev; + spin_lock(&mirred_list_lock); list_del(&m->tcfm_list); - dev = rtnl_dereference(m->tcfm_dev); + spin_unlock(&mirred_list_lock); + + /* last reference to action, no need to lock */ + dev = rcu_dereference_protected(m->tcfm_dev, 1); if (dev) dev_put(dev); } @@ -128,22 +139,9 @@ static int tcf_mirred_init(struct net *net, struct nlattr *nla, NL_SET_ERR_MSG_MOD(extack, "Unknown mirred option"); return -EINVAL; } - if (parm->ifindex) { - dev = __dev_get_by_index(net, parm->ifindex); - if (dev == NULL) { - if (exists) - tcf_idr_release(*a, bind); - else - tcf_idr_cleanup(tn, parm->index); - return -ENODEV; - } - mac_header_xmit = dev_is_mac_header_xmit(dev); - } else { - dev = NULL; - } if (!exists) { - if (!dev) { + if (!parm->ifindex) { tcf_idr_cleanup(tn, parm->index); NL_SET_ERR_MSG_MOD(extack, "Specified device does not exist"); return -EINVAL; @@ -161,19 +159,31 @@ static int tcf_mirred_init(struct net *net, struct nlattr *nla, } m = to_mirred(*a); - ASSERT_RTNL(); + spin_lock(&m->tcf_lock); m->tcf_action = parm->action; m->tcfm_eaction = parm->eaction; - if (dev != NULL) { - if (ret != ACT_P_CREATED) - dev_put(rcu_dereference_protected(m->tcfm_dev, 1)); - dev_hold(dev); - rcu_assign_pointer(m->tcfm_dev, dev); + + if (parm->ifindex) { + dev = dev_get_by_index(net, parm->ifindex); + if (!dev) { + spin_unlock(&m->tcf_lock); + tcf_idr_release(*a, bind); + return -ENODEV; + } + mac_header_xmit = dev_is_mac_header_xmit(dev); + rcu_swap_protected(m->tcfm_dev, dev, + lockdep_is_held(&m->tcf_lock)); + if (dev) + dev_put(dev); m->tcfm_mac_header_xmit = mac_header_xmit; } + spin_unlock(&m->tcf_lock); if (ret == ACT_P_CREATED) { + spin_lock(&mirred_list_lock); list_add(&m->tcfm_list, &mirred_list); + spin_unlock(&mirred_list_lock); + tcf_idr_insert(tn, *a); } @@ -287,26 +297,33 @@ static int tcf_mirred_dump(struct sk_buff *skb, struct tc_action *a, int bind, { unsigned char *b = skb_tail_pointer(skb); struct tcf_mirred *m = to_mirred(a); - struct net_device *dev = rtnl_dereference(m->tcfm_dev); struct tc_mirred opt = { .index = m->tcf_index, - .action = m->tcf_action, .refcnt = refcount_read(&m->tcf_refcnt) - ref, .bindcnt = atomic_read(&m->tcf_bindcnt) - bind, - .eaction = m->tcfm_eaction, - .ifindex = dev ? 
dev->ifindex : 0, }; + struct net_device *dev; struct tcf_t t; + spin_lock(&m->tcf_lock); + opt.action = m->tcf_action; + opt.eaction = m->tcfm_eaction; + dev = tcf_mirred_dev_dereference(m); + if (dev) + opt.ifindex = dev->ifindex; + if (nla_put(skb, TCA_MIRRED_PARMS, sizeof(opt), &opt)) goto nla_put_failure; tcf_tm_dump(&t, &m->tcf_tm); if (nla_put_64bit(skb, TCA_MIRRED_TM, sizeof(t), &t, TCA_MIRRED_PAD)) goto nla_put_failure; + spin_unlock(&m->tcf_lock); + return skb->len; nla_put_failure: + spin_unlock(&m->tcf_lock); nlmsg_trim(skb, b); return -1; } @@ -337,15 +354,19 @@ static int mirred_device_event(struct notifier_block *unused, ASSERT_RTNL(); if (event == NETDEV_UNREGISTER) { + spin_lock(&mirred_list_lock); list_for_each_entry(m, &mirred_list, tcfm_list) { - if (rcu_access_pointer(m->tcfm_dev) == dev) { + spin_lock(&m->tcf_lock); + if (tcf_mirred_dev_dereference(m) == dev) { dev_put(dev); /* Note : no rcu grace period necessary, as * net_device are already rcu protected. */ RCU_INIT_POINTER(m->tcfm_dev, NULL); } + spin_unlock(&m->tcf_lock); } + spin_unlock(&mirred_list_lock); } return NOTIFY_DONE; @@ -358,10 +379,13 @@ static struct notifier_block mirred_device_notifier = { static struct net_device *tcf_mirred_get_dev(const struct tc_action *a) { struct tcf_mirred *m = to_mirred(a); - struct net_device *dev = rtnl_dereference(m->tcfm_dev); + struct net_device *dev; + rcu_read_lock(); + dev = rcu_dereference(m->tcfm_dev); if (dev) dev_hold(dev); + rcu_read_unlock(); return dev; } -- cgit v1.2.3 From e329bc427395e2d74f2bb685ef3dddda91a6695f Mon Sep 17 00:00:00 2001 From: Vlad Buslov Date: Fri, 10 Aug 2018 20:51:55 +0300 Subject: net: sched: act_police: remove dependency on rtnl lock Use tcf spinlock to protect police action private data from concurrent modification during dump. (init already uses tcf spinlock when changing police action state) Pass tcf spinlock as estimator lock argument to gen_replace_estimator() during action init. Signed-off-by: Vlad Buslov Signed-off-by: David S. 
Miller --- net/sched/act_police.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'net/sched') diff --git a/net/sched/act_police.c b/net/sched/act_police.c index 1f3192ea8df7..88c16d80c1cf 100644 --- a/net/sched/act_police.c +++ b/net/sched/act_police.c @@ -274,14 +274,15 @@ static int tcf_act_police_dump(struct sk_buff *skb, struct tc_action *a, struct tcf_police *police = to_police(a); struct tc_police opt = { .index = police->tcf_index, - .action = police->tcf_action, - .mtu = police->tcfp_mtu, - .burst = PSCHED_NS2TICKS(police->tcfp_burst), .refcnt = refcount_read(&police->tcf_refcnt) - ref, .bindcnt = atomic_read(&police->tcf_bindcnt) - bind, }; struct tcf_t t; + spin_lock_bh(&police->tcf_lock); + opt.action = police->tcf_action; + opt.mtu = police->tcfp_mtu; + opt.burst = PSCHED_NS2TICKS(police->tcfp_burst); if (police->rate_present) psched_ratecfg_getrate(&opt.rate, &police->rate); if (police->peak_present) @@ -301,10 +302,12 @@ static int tcf_act_police_dump(struct sk_buff *skb, struct tc_action *a, t.expires = jiffies_to_clock_t(police->tcf_tm.expires); if (nla_put_64bit(skb, TCA_POLICE_TM, sizeof(t), &t, TCA_POLICE_PAD)) goto nla_put_failure; + spin_unlock_bh(&police->tcf_lock); return skb->len; nla_put_failure: + spin_unlock_bh(&police->tcf_lock); nlmsg_trim(skb, b); return -1; } -- cgit v1.2.3 From 962ad1f937d86456e88d8cbcd93766746297f711 Mon Sep 17 00:00:00 2001 From: Jamal Hadi Salim Date: Sun, 12 Aug 2018 09:34:49 -0400 Subject: net: sched: act_connmark method rename for grep-ability and consistency Signed-off-by: Jamal Hadi Salim Signed-off-by: David S. Miller --- net/sched/act_connmark.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net/sched') diff --git a/net/sched/act_connmark.c b/net/sched/act_connmark.c index 2f9bc833d046..54c0bf54f2ac 100644 --- a/net/sched/act_connmark.c +++ b/net/sched/act_connmark.c @@ -31,8 +31,8 @@ static unsigned int connmark_net_id; static struct tc_action_ops act_connmark_ops; -static int tcf_connmark(struct sk_buff *skb, const struct tc_action *a, - struct tcf_result *res) +static int tcf_connmark_act(struct sk_buff *skb, const struct tc_action *a, + struct tcf_result *res) { const struct nf_conntrack_tuple_hash *thash; struct nf_conntrack_tuple tuple; @@ -209,7 +209,7 @@ static struct tc_action_ops act_connmark_ops = { .kind = "connmark", .type = TCA_ACT_CONNMARK, .owner = THIS_MODULE, - .act = tcf_connmark, + .act = tcf_connmark_act, .dump = tcf_connmark_dump, .init = tcf_connmark_init, .walk = tcf_connmark_walker, -- cgit v1.2.3 From 2fbec27f816bd95ebc468d17a784b20a72b95896 Mon Sep 17 00:00:00 2001 From: Jamal Hadi Salim Date: Sun, 12 Aug 2018 09:34:50 -0400 Subject: net: sched: act_bpf method rename for grep-ability and consistency Signed-off-by: Jamal Hadi Salim Signed-off-by: David S. 
Miller --- net/sched/act_bpf.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net/sched') diff --git a/net/sched/act_bpf.c b/net/sched/act_bpf.c index 9e8a33f9fee3..9b30e62805c7 100644 --- a/net/sched/act_bpf.c +++ b/net/sched/act_bpf.c @@ -34,8 +34,8 @@ struct tcf_bpf_cfg { static unsigned int bpf_net_id; static struct tc_action_ops act_bpf_ops; -static int tcf_bpf(struct sk_buff *skb, const struct tc_action *act, - struct tcf_result *res) +static int tcf_bpf_act(struct sk_buff *skb, const struct tc_action *act, + struct tcf_result *res) { bool at_ingress = skb_at_tc_ingress(skb); struct tcf_bpf *prog = to_bpf(act); @@ -406,7 +406,7 @@ static struct tc_action_ops act_bpf_ops __read_mostly = { .kind = "bpf", .type = TCA_ACT_BPF, .owner = THIS_MODULE, - .act = tcf_bpf, + .act = tcf_bpf_act, .dump = tcf_bpf_dump, .cleanup = tcf_bpf_cleanup, .init = tcf_bpf_init, -- cgit v1.2.3 From c831549c3f53537e3b8a205c5d67cbc16c054f6a Mon Sep 17 00:00:00 2001 From: Jamal Hadi Salim Date: Sun, 12 Aug 2018 09:34:51 -0400 Subject: net: sched: act_sum method rename for grep-ability and consistency Signed-off-by: Jamal Hadi Salim Signed-off-by: David S. Miller --- net/sched/act_csum.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net/sched') diff --git a/net/sched/act_csum.c b/net/sched/act_csum.c index f01c59ba6d12..5596fae4e478 100644 --- a/net/sched/act_csum.c +++ b/net/sched/act_csum.c @@ -555,8 +555,8 @@ fail: return 0; } -static int tcf_csum(struct sk_buff *skb, const struct tc_action *a, - struct tcf_result *res) +static int tcf_csum_act(struct sk_buff *skb, const struct tc_action *a, + struct tcf_result *res) { struct tcf_csum *p = to_tcf_csum(a); struct tcf_csum_params *params; @@ -670,7 +670,7 @@ static struct tc_action_ops act_csum_ops = { .kind = "csum", .type = TCA_ACT_CSUM, .owner = THIS_MODULE, - .act = tcf_csum, + .act = tcf_csum_act, .dump = tcf_csum_dump, .init = tcf_csum_init, .cleanup = tcf_csum_cleanup, -- cgit v1.2.3 From 1740005e2a0cae1ba87e3efe2690755fe169837b Mon Sep 17 00:00:00 2001 From: Jamal Hadi Salim Date: Sun, 12 Aug 2018 09:34:52 -0400 Subject: net: sched: act_gact method rename for grep-ability and consistency Signed-off-by: Jamal Hadi Salim Signed-off-by: David S. Miller --- net/sched/act_gact.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net/sched') diff --git a/net/sched/act_gact.c b/net/sched/act_gact.c index bfccd34a3968..52a3e474d822 100644 --- a/net/sched/act_gact.c +++ b/net/sched/act_gact.c @@ -133,8 +133,8 @@ static int tcf_gact_init(struct net *net, struct nlattr *nla, return ret; } -static int tcf_gact(struct sk_buff *skb, const struct tc_action *a, - struct tcf_result *res) +static int tcf_gact_act(struct sk_buff *skb, const struct tc_action *a, + struct tcf_result *res) { struct tcf_gact *gact = to_gact(a); int action = READ_ONCE(gact->tcf_action); @@ -254,7 +254,7 @@ static struct tc_action_ops act_gact_ops = { .kind = "gact", .type = TCA_ACT_GACT, .owner = THIS_MODULE, - .act = tcf_gact, + .act = tcf_gact_act, .stats_update = tcf_gact_stats_update, .dump = tcf_gact_dump, .init = tcf_gact_init, -- cgit v1.2.3 From 11b9695b3ff06990333d77963607944574953c98 Mon Sep 17 00:00:00 2001 From: Jamal Hadi Salim Date: Sun, 12 Aug 2018 09:34:53 -0400 Subject: net: sched: act_ipt method rename for grep-ability and consistency Signed-off-by: Jamal Hadi Salim Signed-off-by: David S. 
Miller --- net/sched/act_ipt.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'net/sched') diff --git a/net/sched/act_ipt.c b/net/sched/act_ipt.c index e149f0e66cb6..51f235bbeb5b 100644 --- a/net/sched/act_ipt.c +++ b/net/sched/act_ipt.c @@ -222,8 +222,8 @@ static int tcf_xt_init(struct net *net, struct nlattr *nla, bind); } -static int tcf_ipt(struct sk_buff *skb, const struct tc_action *a, - struct tcf_result *res) +static int tcf_ipt_act(struct sk_buff *skb, const struct tc_action *a, + struct tcf_result *res) { int ret = 0, result = 0; struct tcf_ipt *ipt = to_ipt(a); @@ -348,7 +348,7 @@ static struct tc_action_ops act_ipt_ops = { .kind = "ipt", .type = TCA_ACT_IPT, .owner = THIS_MODULE, - .act = tcf_ipt, + .act = tcf_ipt_act, .dump = tcf_ipt_dump, .cleanup = tcf_ipt_release, .init = tcf_ipt_init, @@ -406,7 +406,7 @@ static struct tc_action_ops act_xt_ops = { .kind = "xt", .type = TCA_ACT_XT, .owner = THIS_MODULE, - .act = tcf_ipt, + .act = tcf_ipt_act, .dump = tcf_ipt_dump, .cleanup = tcf_ipt_release, .init = tcf_xt_init, -- cgit v1.2.3 From 0390514fe15501e9ddc4e87bdeed35fec9fc4802 Mon Sep 17 00:00:00 2001 From: Jamal Hadi Salim Date: Sun, 12 Aug 2018 09:34:54 -0400 Subject: net: sched: act_nat method rename for grep-ability and consistency Signed-off-by: Jamal Hadi Salim Signed-off-by: David S. Miller --- net/sched/act_nat.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net/sched') diff --git a/net/sched/act_nat.c b/net/sched/act_nat.c index 4dd9188a72fd..822e903bfc25 100644 --- a/net/sched/act_nat.c +++ b/net/sched/act_nat.c @@ -93,8 +93,8 @@ static int tcf_nat_init(struct net *net, struct nlattr *nla, struct nlattr *est, return ret; } -static int tcf_nat(struct sk_buff *skb, const struct tc_action *a, - struct tcf_result *res) +static int tcf_nat_act(struct sk_buff *skb, const struct tc_action *a, + struct tcf_result *res) { struct tcf_nat *p = to_tcf_nat(a); struct iphdr *iph; @@ -311,7 +311,7 @@ static struct tc_action_ops act_nat_ops = { .kind = "nat", .type = TCA_ACT_NAT, .owner = THIS_MODULE, - .act = tcf_nat, + .act = tcf_nat_act, .dump = tcf_nat_dump, .init = tcf_nat_init, .walk = tcf_nat_walker, -- cgit v1.2.3 From 6a2b401cd17d41944672563b2edf65cdef44c242 Mon Sep 17 00:00:00 2001 From: Jamal Hadi Salim Date: Sun, 12 Aug 2018 09:34:55 -0400 Subject: net: sched: act_pedit method rename for grep-ability and consistency Signed-off-by: Jamal Hadi Salim Signed-off-by: David S. 
Miller --- net/sched/act_pedit.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net/sched') diff --git a/net/sched/act_pedit.c b/net/sched/act_pedit.c index 3f62da72ab6a..8a7a7cb94e83 100644 --- a/net/sched/act_pedit.c +++ b/net/sched/act_pedit.c @@ -288,8 +288,8 @@ static int pedit_skb_hdr_offset(struct sk_buff *skb, return ret; } -static int tcf_pedit(struct sk_buff *skb, const struct tc_action *a, - struct tcf_result *res) +static int tcf_pedit_act(struct sk_buff *skb, const struct tc_action *a, + struct tcf_result *res) { struct tcf_pedit *p = to_pedit(a); int i; @@ -471,7 +471,7 @@ static struct tc_action_ops act_pedit_ops = { .kind = "pedit", .type = TCA_ACT_PEDIT, .owner = THIS_MODULE, - .act = tcf_pedit, + .act = tcf_pedit_act, .dump = tcf_pedit_dump, .cleanup = tcf_pedit_cleanup, .init = tcf_pedit_init, -- cgit v1.2.3 From 2ac063474dc738700eab3425d1f7c4ad98776bcd Mon Sep 17 00:00:00 2001 From: Jamal Hadi Salim Date: Sun, 12 Aug 2018 09:34:56 -0400 Subject: net: sched: act_police method rename for grep-ability and consistency Signed-off-by: Jamal Hadi Salim Signed-off-by: David S. Miller --- net/sched/act_police.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'net/sched') diff --git a/net/sched/act_police.c b/net/sched/act_police.c index 88c16d80c1cf..06f0742db593 100644 --- a/net/sched/act_police.c +++ b/net/sched/act_police.c @@ -56,7 +56,7 @@ struct tc_police_compat { static unsigned int police_net_id; static struct tc_action_ops act_police_ops; -static int tcf_act_police_walker(struct net *net, struct sk_buff *skb, +static int tcf_police_walker(struct net *net, struct sk_buff *skb, struct netlink_callback *cb, int type, const struct tc_action_ops *ops, struct netlink_ext_ack *extack) @@ -73,7 +73,7 @@ static const struct nla_policy police_policy[TCA_POLICE_MAX + 1] = { [TCA_POLICE_RESULT] = { .type = NLA_U32 }, }; -static int tcf_act_police_init(struct net *net, struct nlattr *nla, +static int tcf_police_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, int ovr, int bind, bool rtnl_held, struct netlink_ext_ack *extack) @@ -203,7 +203,7 @@ failure: return err; } -static int tcf_act_police(struct sk_buff *skb, const struct tc_action *a, +static int tcf_police_act(struct sk_buff *skb, const struct tc_action *a, struct tcf_result *res) { struct tcf_police *police = to_police(a); @@ -267,7 +267,7 @@ static int tcf_act_police(struct sk_buff *skb, const struct tc_action *a, return police->tcf_action; } -static int tcf_act_police_dump(struct sk_buff *skb, struct tc_action *a, +static int tcf_police_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref) { unsigned char *b = skb_tail_pointer(skb); @@ -335,10 +335,10 @@ static struct tc_action_ops act_police_ops = { .kind = "police", .type = TCA_ID_POLICE, .owner = THIS_MODULE, - .act = tcf_act_police, - .dump = tcf_act_police_dump, - .init = tcf_act_police_init, - .walk = tcf_act_police_walker, + .act = tcf_police_act, + .dump = tcf_police_dump, + .init = tcf_police_init, + .walk = tcf_police_walker, .lookup = tcf_police_search, .delete = tcf_police_delete, .size = sizeof(struct tcf_police), -- cgit v1.2.3 From 798de374e50309dba39cee16527cb3534e84ba86 Mon Sep 17 00:00:00 2001 From: Jamal Hadi Salim Date: Sun, 12 Aug 2018 09:34:57 -0400 Subject: net: sched: act_simple method rename for grep-ability and consistency Signed-off-by: Jamal Hadi Salim Signed-off-by: David S. 
Miller --- net/sched/act_simple.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net/sched') diff --git a/net/sched/act_simple.c b/net/sched/act_simple.c index 18e4452574cd..e616523ba3c1 100644 --- a/net/sched/act_simple.c +++ b/net/sched/act_simple.c @@ -28,8 +28,8 @@ static unsigned int simp_net_id; static struct tc_action_ops act_simp_ops; #define SIMP_MAX_DATA 32 -static int tcf_simp(struct sk_buff *skb, const struct tc_action *a, - struct tcf_result *res) +static int tcf_simp_act(struct sk_buff *skb, const struct tc_action *a, + struct tcf_result *res) { struct tcf_defact *d = to_defact(a); @@ -207,7 +207,7 @@ static struct tc_action_ops act_simp_ops = { .kind = "simple", .type = TCA_ACT_SIMP, .owner = THIS_MODULE, - .act = tcf_simp, + .act = tcf_simp_act, .dump = tcf_simp_dump, .cleanup = tcf_simp_release, .init = tcf_simp_init, -- cgit v1.2.3 From 45da1dac612c0658ed946d573000e88f0d8ec5bc Mon Sep 17 00:00:00 2001 From: Jamal Hadi Salim Date: Sun, 12 Aug 2018 09:34:58 -0400 Subject: net: sched: act_skbedit method rename for grep-ability and consistency Signed-off-by: Jamal Hadi Salim Signed-off-by: David S. Miller --- net/sched/act_skbedit.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net/sched') diff --git a/net/sched/act_skbedit.c b/net/sched/act_skbedit.c index a6db47ebec11..926d7bc4a89d 100644 --- a/net/sched/act_skbedit.c +++ b/net/sched/act_skbedit.c @@ -33,8 +33,8 @@ static unsigned int skbedit_net_id; static struct tc_action_ops act_skbedit_ops; -static int tcf_skbedit(struct sk_buff *skb, const struct tc_action *a, - struct tcf_result *res) +static int tcf_skbedit_act(struct sk_buff *skb, const struct tc_action *a, + struct tcf_result *res) { struct tcf_skbedit *d = to_skbedit(a); struct tcf_skbedit_params *params; @@ -310,7 +310,7 @@ static struct tc_action_ops act_skbedit_ops = { .kind = "skbedit", .type = TCA_ACT_SKBEDIT, .owner = THIS_MODULE, - .act = tcf_skbedit, + .act = tcf_skbedit_act, .dump = tcf_skbedit_dump, .init = tcf_skbedit_init, .cleanup = tcf_skbedit_cleanup, -- cgit v1.2.3 From 353d2c253f4cc5c7b28a041a79949f46ed5edb25 Mon Sep 17 00:00:00 2001 From: Jamal Hadi Salim Date: Sun, 12 Aug 2018 09:34:59 -0400 Subject: net: sched: act_skbmod method rename for grep-ability and consistency Signed-off-by: Jamal Hadi Salim Signed-off-by: David S. Miller --- net/sched/act_skbmod.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net/sched') diff --git a/net/sched/act_skbmod.c b/net/sched/act_skbmod.c index e9c86ade3b40..d6a1af0c4171 100644 --- a/net/sched/act_skbmod.c +++ b/net/sched/act_skbmod.c @@ -24,7 +24,7 @@ static unsigned int skbmod_net_id; static struct tc_action_ops act_skbmod_ops; #define MAX_EDIT_LEN ETH_HLEN -static int tcf_skbmod_run(struct sk_buff *skb, const struct tc_action *a, +static int tcf_skbmod_act(struct sk_buff *skb, const struct tc_action *a, struct tcf_result *res) { struct tcf_skbmod *d = to_skbmod(a); @@ -270,7 +270,7 @@ static struct tc_action_ops act_skbmod_ops = { .kind = "skbmod", .type = TCA_ACT_SKBMOD, .owner = THIS_MODULE, - .act = tcf_skbmod_run, + .act = tcf_skbmod_act, .dump = tcf_skbmod_dump, .init = tcf_skbmod_init, .cleanup = tcf_skbmod_cleanup, -- cgit v1.2.3 From 8aa7f22e5649db6f994cda1212a37e7f8ae4e63e Mon Sep 17 00:00:00 2001 From: Jamal Hadi Salim Date: Sun, 12 Aug 2018 09:35:00 -0400 Subject: net: sched: act_vlan method rename for grep-ability and consistency Signed-off-by: Jamal Hadi Salim Signed-off-by: David S. 
Miller --- net/sched/act_vlan.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net/sched') diff --git a/net/sched/act_vlan.c b/net/sched/act_vlan.c index 5bde17fe3608..d1f5028384c9 100644 --- a/net/sched/act_vlan.c +++ b/net/sched/act_vlan.c @@ -22,8 +22,8 @@ static unsigned int vlan_net_id; static struct tc_action_ops act_vlan_ops; -static int tcf_vlan(struct sk_buff *skb, const struct tc_action *a, - struct tcf_result *res) +static int tcf_vlan_act(struct sk_buff *skb, const struct tc_action *a, + struct tcf_result *res) { struct tcf_vlan *v = to_vlan(a); struct tcf_vlan_params *p; @@ -307,7 +307,7 @@ static struct tc_action_ops act_vlan_ops = { .kind = "vlan", .type = TCA_ACT_VLAN, .owner = THIS_MODULE, - .act = tcf_vlan, + .act = tcf_vlan_act, .dump = tcf_vlan_dump, .init = tcf_vlan_init, .cleanup = tcf_vlan_cleanup, -- cgit v1.2.3 From 7c5790c4da0e5b96b147d683c94ba3ed93e5f0fe Mon Sep 17 00:00:00 2001 From: Jamal Hadi Salim Date: Sun, 12 Aug 2018 09:35:01 -0400 Subject: net: sched: act_mirred method rename for grep-ability and consistency Signed-off-by: Jamal Hadi Salim Signed-off-by: David S. Miller --- net/sched/act_mirred.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net/sched') diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c index 327be257033d..8ec216001077 100644 --- a/net/sched/act_mirred.c +++ b/net/sched/act_mirred.c @@ -190,8 +190,8 @@ static int tcf_mirred_init(struct net *net, struct nlattr *nla, return ret; } -static int tcf_mirred(struct sk_buff *skb, const struct tc_action *a, - struct tcf_result *res) +static int tcf_mirred_act(struct sk_buff *skb, const struct tc_action *a, + struct tcf_result *res) { struct tcf_mirred *m = to_mirred(a); struct sk_buff *skb2 = skb; @@ -406,7 +406,7 @@ static struct tc_action_ops act_mirred_ops = { .kind = "mirred", .type = TCA_ACT_MIRRED, .owner = THIS_MODULE, - .act = tcf_mirred, + .act = tcf_mirred_act, .stats_update = tcf_stats_update, .dump = tcf_mirred_dump, .cleanup = tcf_mirred_release, -- cgit v1.2.3 From 42c625a486f367ad57d4257de6d9459daf9484a0 Mon Sep 17 00:00:00 2001 From: Vlad Buslov Date: Mon, 13 Aug 2018 20:20:11 +0300 Subject: net: sched: act_ife: disable bh when taking ife_mod_lock Lockdep reports a deadlock for the following locking scenario in the ife action:

Task one:
1) Executes an ife action update.
2) Takes tcfa_lock.
3) Waits on ife_mod_lock, which is already taken by task two.

Task two:
1) Executes any path that obtains ife_mod_lock without disabling bh (any path that takes ife_mod_lock while holding tcfa_lock has bh disabled), such as loading a meta module or creating a new action.
2) Takes ife_mod_lock.
3) The task is preempted by the rate estimator timer.
4) The timer callback waits on tcfa_lock, which is held by task one.

In the described case the tasks deadlock because they take the same two locks in different order. To prevent the potential deadlock reported by lockdep, always disable bh when taking ife_mod_lock.
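This is the usual pattern for a rwlock that must never be held bare while a softirq can take a conflicting lock. A minimal sketch of the idea, with stand-in names (example_mod_lock and example_oplist are illustrative, not the kernel's identifiers):

	#include <linux/spinlock.h>
	#include <linux/list.h>

	static DEFINE_RWLOCK(example_mod_lock);	/* stands in for ife_mod_lock */
	static LIST_HEAD(example_oplist);	/* stands in for ifeoplist */

	static void example_lookup(void)
	{
		/*
		 * read_lock_bh() also disables softirqs on the local CPU, so
		 * the rate estimator timer cannot fire on top of this critical
		 * section and spin on a lock we would need next; the
		 * tcfa_lock -> ife_mod_lock ordering can no longer be inverted
		 * by an interrupting timer callback.
		 */
		read_lock_bh(&example_mod_lock);
		/* ... walk example_oplist ... */
		read_unlock_bh(&example_mod_lock);
	}

The write-side paths (meta op registration and unregistration) get the same treatment with write_lock_bh()/write_unlock_bh(), as the hunks in the diff below show.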
Lockdep warning: [ 508.101192] ===================================================== [ 508.107708] WARNING: SOFTIRQ-safe -> SOFTIRQ-unsafe lock order detected [ 508.114728] 4.18.0-rc8+ #646 Not tainted [ 508.119050] ----------------------------------------------------- [ 508.125559] tc/5460 [HC0[0]:SC0[2]:HE1:SE0] is trying to acquire: [ 508.132025] 000000005a938c68 (ife_mod_lock){++++}, at: find_ife_oplist+0x1e/0xc0 [act_ife] [ 508.140996] and this task is already holding: [ 508.147548] 00000000d46f6c56 (&(&p->tcfa_lock)->rlock){+.-.}, at: tcf_ife_init+0x6ae/0xf40 [act_ife] [ 508.157371] which would create a new lock dependency: [ 508.162828] (&(&p->tcfa_lock)->rlock){+.-.} -> (ife_mod_lock){++++} [ 508.169572] but this new dependency connects a SOFTIRQ-irq-safe lock: [ 508.178197] (&(&p->tcfa_lock)->rlock){+.-.} [ 508.178201] ... which became SOFTIRQ-irq-safe at: [ 508.189771] _raw_spin_lock+0x2c/0x40 [ 508.193906] est_fetch_counters+0x41/0xb0 [ 508.198391] est_timer+0x83/0x3c0 [ 508.202180] call_timer_fn+0x16a/0x5d0 [ 508.206400] run_timer_softirq+0x399/0x920 [ 508.210967] __do_softirq+0x157/0x97d [ 508.215102] irq_exit+0x152/0x1c0 [ 508.218888] smp_apic_timer_interrupt+0xc0/0x4e0 [ 508.223976] apic_timer_interrupt+0xf/0x20 [ 508.228540] cpuidle_enter_state+0xf8/0x5d0 [ 508.233198] do_idle+0x28a/0x350 [ 508.236881] cpu_startup_entry+0xc7/0xe0 [ 508.241296] start_secondary+0x2e8/0x3f0 [ 508.245678] secondary_startup_64+0xa5/0xb0 [ 508.250347] to a SOFTIRQ-irq-unsafe lock: (ife_mod_lock){++++} [ 508.256531] ... which became SOFTIRQ-irq-unsafe at: [ 508.267279] ... [ 508.267283] _raw_write_lock+0x2c/0x40 [ 508.273653] register_ife_op+0x118/0x2c0 [act_ife] [ 508.278926] do_one_initcall+0xf7/0x4d9 [ 508.283214] do_init_module+0x18b/0x44e [ 508.287521] load_module+0x4167/0x5730 [ 508.291739] __do_sys_finit_module+0x16d/0x1a0 [ 508.296654] do_syscall_64+0x7a/0x3f0 [ 508.300788] entry_SYSCALL_64_after_hwframe+0x49/0xbe [ 508.306302] other info that might help us debug this: [ 508.315286] Possible interrupt unsafe locking scenario: [ 508.322771] CPU0 CPU1 [ 508.327681] ---- ---- [ 508.332604] lock(ife_mod_lock); [ 508.336300] local_irq_disable(); [ 508.342608] lock(&(&p->tcfa_lock)->rlock); [ 508.349793] lock(ife_mod_lock); [ 508.355990] [ 508.358974] lock(&(&p->tcfa_lock)->rlock); [ 508.363803] *** DEADLOCK *** [ 508.370715] 2 locks held by tc/5460: [ 508.374680] #0: 00000000e27e4fa4 (rtnl_mutex){+.+.}, at: rtnetlink_rcv_msg+0x583/0x7b0 [ 508.383366] #1: 00000000d46f6c56 (&(&p->tcfa_lock)->rlock){+.-.}, at: tcf_ife_init+0x6ae/0xf40 [act_ife] [ 508.393648] the dependencies between SOFTIRQ-irq-safe lock and the holding lock: [ 508.403505] -> (&(&p->tcfa_lock)->rlock){+.-.} ops: 1001553 { [ 508.409646] HARDIRQ-ON-W at: [ 508.413136] _raw_spin_lock_bh+0x34/0x40 [ 508.419059] gnet_stats_start_copy_compat+0xa2/0x230 [ 508.426021] gnet_stats_start_copy+0x16/0x20 [ 508.432333] tcf_action_copy_stats+0x95/0x1d0 [ 508.438735] tcf_action_dump_1+0xb0/0x4e0 [ 508.444795] tcf_action_dump+0xca/0x200 [ 508.450673] tcf_exts_dump+0xd9/0x320 [ 508.456392] fl_dump+0x1b7/0x4a0 [cls_flower] [ 508.462798] tcf_fill_node+0x380/0x530 [ 508.468601] tfilter_notify+0xdf/0x1c0 [ 508.474404] tc_new_tfilter+0x84a/0xc90 [ 508.480270] rtnetlink_rcv_msg+0x5bd/0x7b0 [ 508.486419] netlink_rcv_skb+0x184/0x220 [ 508.492394] netlink_unicast+0x31b/0x460 [ 508.507411] netlink_sendmsg+0x3fb/0x840 [ 508.513390] sock_sendmsg+0x7b/0xd0 [ 508.518907] ___sys_sendmsg+0x4c6/0x610 [ 508.524797] __sys_sendmsg+0xd7/0x150 [ 508.530510] 
do_syscall_64+0x7a/0x3f0 [ 508.536201] entry_SYSCALL_64_after_hwframe+0x49/0xbe [ 508.543301] IN-SOFTIRQ-W at: [ 508.546834] _raw_spin_lock+0x2c/0x40 [ 508.552522] est_fetch_counters+0x41/0xb0 [ 508.558571] est_timer+0x83/0x3c0 [ 508.563912] call_timer_fn+0x16a/0x5d0 [ 508.569699] run_timer_softirq+0x399/0x920 [ 508.575840] __do_softirq+0x157/0x97d [ 508.581538] irq_exit+0x152/0x1c0 [ 508.586882] smp_apic_timer_interrupt+0xc0/0x4e0 [ 508.593533] apic_timer_interrupt+0xf/0x20 [ 508.599686] cpuidle_enter_state+0xf8/0x5d0 [ 508.605895] do_idle+0x28a/0x350 [ 508.611147] cpu_startup_entry+0xc7/0xe0 [ 508.617097] start_secondary+0x2e8/0x3f0 [ 508.623029] secondary_startup_64+0xa5/0xb0 [ 508.629245] INITIAL USE at: [ 508.632686] _raw_spin_lock_bh+0x34/0x40 [ 508.638557] gnet_stats_start_copy_compat+0xa2/0x230 [ 508.645491] gnet_stats_start_copy+0x16/0x20 [ 508.651719] tcf_action_copy_stats+0x95/0x1d0 [ 508.657992] tcf_action_dump_1+0xb0/0x4e0 [ 508.663937] tcf_action_dump+0xca/0x200 [ 508.669716] tcf_exts_dump+0xd9/0x320 [ 508.675337] fl_dump+0x1b7/0x4a0 [cls_flower] [ 508.681650] tcf_fill_node+0x380/0x530 [ 508.687366] tfilter_notify+0xdf/0x1c0 [ 508.693031] tc_new_tfilter+0x84a/0xc90 [ 508.698820] rtnetlink_rcv_msg+0x5bd/0x7b0 [ 508.704869] netlink_rcv_skb+0x184/0x220 [ 508.710758] netlink_unicast+0x31b/0x460 [ 508.716627] netlink_sendmsg+0x3fb/0x840 [ 508.722510] sock_sendmsg+0x7b/0xd0 [ 508.727931] ___sys_sendmsg+0x4c6/0x610 [ 508.733729] __sys_sendmsg+0xd7/0x150 [ 508.739346] do_syscall_64 +0x7a/0x3f0 [ 508.744943] entry_SYSCALL_64_after_hwframe+0x49/0xbe [ 508.751930] } [ 508.753964] ... key at: [] __key.61145+0x0/0x40 [ 508.760946] ... acquired at: [ 508.764294] _raw_read_lock+0x2f/0x40 [ 508.768513] find_ife_oplist+0x1e/0xc0 [act_ife] [ 508.773692] tcf_ife_init+0x82f/0xf40 [act_ife] [ 508.778785] tcf_action_init_1+0x510/0x750 [ 508.783468] tcf_action_init+0x1e8/0x340 [ 508.787938] tcf_action_add+0xc5/0x240 [ 508.792241] tc_ctl_action+0x203/0x2a0 [ 508.796550] rtnetlink_rcv_msg+0x5bd/0x7b0 [ 508.801200] netlink_rcv_skb+0x184/0x220 [ 508.805674] netlink_unicast+0x31b/0x460 [ 508.810129] netlink_sendmsg+0x3fb/0x840 [ 508.814611] sock_sendmsg+0x7b/0xd0 [ 508.818665] ___sys_sendmsg+0x4c6/0x610 [ 508.823029] __sys_sendmsg+0xd7/0x150 [ 508.827246] do_syscall_64+0x7a/0x3f0 [ 508.831483] entry_SYSCALL_64_after_hwframe+0x49/0xbe the dependencies between the lock to be acquired [ 508.838945] and SOFTIRQ-irq-unsafe lock: [ 508.851177] -> (ife_mod_lock){++++} ops: 95 { [ 508.855920] HARDIRQ-ON-W at: [ 508.859478] _raw_write_lock+0x2c/0x40 [ 508.865264] register_ife_op+0x118/0x2c0 [act_ife] [ 508.872071] do_one_initcall+0xf7/0x4d9 [ 508.877947] do_init_module+0x18b/0x44e [ 508.883819] load_module+0x4167/0x5730 [ 508.889595] __do_sys_finit_module+0x16d/0x1a0 [ 508.896043] do_syscall_64+0x7a/0x3f0 [ 508.901734] entry_SYSCALL_64_after_hwframe+0x49/0xbe [ 508.908827] HARDIRQ-ON-R at: [ 508.912359] _raw_read_lock+0x2f/0x40 [ 508.918043] find_ife_oplist+0x1e/0xc0 [act_ife] [ 508.924692] tcf_ife_init+0x82f/0xf40 [act_ife] [ 508.931252] tcf_action_init_1+0x510/0x750 [ 508.937393] tcf_action_init+0x1e8/0x340 [ 508.943366] tcf_action_add+0xc5/0x240 [ 508.949130] tc_ctl_action+0x203/0x2a0 [ 508.954922] rtnetlink_rcv_msg+0x5bd/0x7b0 [ 508.961024] netlink_rcv_skb+0x184/0x220 [ 508.966970] netlink_unicast+0x31b/0x460 [ 508.972915] netlink_sendmsg+0x3fb/0x840 [ 508.978859] sock_sendmsg+0x7b/0xd0 [ 508.984400] ___sys_sendmsg+0x4c6/0x610 [ 508.990264] __sys_sendmsg+0xd7/0x150 [ 508.995952] do_syscall_64+0x7a/0x3f0 [ 
509.001643] entry_SYSCALL_64_after_hwframe+0x49/0xbe [ 509.008722] SOFTIRQ-ON-W at:\ [ 509.012242] _raw_write_lock+0x2c/0x40 [ 509.018013] register_ife_op+0x118/0x2c0 [act_ife] [ 509.024841] do_one_initcall+0xf7/0x4d9 [ 509.030720] do_init_module+0x18b/0x44e [ 509.036604] load_module+0x4167/0x5730 [ 509.042397] __do_sys_finit_module+0x16d/0x1a0 [ 509.048865] do_syscall_64+0x7a/0x3f0 [ 509.054551] entry_SYSCALL_64_after_hwframe+0x49/0xbe [ 509.061636] SOFTIRQ-ON-R at: [ 509.065145] _raw_read_lock+0x2f/0x40 [ 509.070854] find_ife_oplist+0x1e/0xc0 [act_ife] [ 509.077515] tcf_ife_init+0x82f/0xf40 [act_ife] [ 509.084051] tcf_action_init_1+0x510/0x750 [ 509.090172] tcf_action_init+0x1e8/0x340 [ 509.096124] tcf_action_add+0xc5/0x240 [ 509.101891] tc_ctl_action+0x203/0x2a0 [ 509.107671] rtnetlink_rcv_msg+0x5bd/0x7b0 [ 509.113811] netlink_rcv_skb+0x184/0x220 [ 509.119768] netlink_unicast+0x31b/0x460 [ 509.125716] netlink_sendmsg+0x3fb/0x840 [ 509.131668] sock_sendmsg+0x7b/0xd0 [ 509.137167] ___sys_sendmsg+0x4c6/0x610 [ 509.143010] __sys_sendmsg+0xd7/0x150 [ 509.148718] do_syscall_64+0x7a/0x3f0 [ 509.154443] entry_SYSCALL_64_after_hwframe+0x49/0xbe [ 509.161533] INITIAL USE at: [ 509.164956] _raw_read_lock+0x2f/0x40 [ 509.170574] find_ife_oplist+0x1e/0xc0 [act_ife] [ 509.177134] tcf_ife_init+0x82f/0xf40 [act_ife] [ 509.183619] tcf_action_init_1+0x510/0x750 [ 509.189674] tcf_action_init+0x1e8/0x340 [ 509.195534] tcf_action_add+0xc5/0x240 [ 509.201229] tc_ctl_action+0x203/0x2a0 [ 509.206920] rtnetlink_rcv_msg+0x5bd/0x7b0 [ 509.212936] netlink_rcv_skb+0x184/0x220 [ 509.218818] netlink_unicast+0x31b/0x460 [ 509.224699] netlink_sendmsg+0x3fb/0x840 [ 509.230581] sock_sendmsg+0x7b/0xd0 [ 509.235984] ___sys_sendmsg+0x4c6/0x610 [ 509.241791] __sys_sendmsg+0xd7/0x150 [ 509.247425] do_syscall_64+0x7a/0x3f0 [ 509.253007] entry_SYSCALL_64_after_hwframe+0x49/0xbe [ 509.259975] } [ 509.261998] ... key at: [] ife_mod_lock+0x18/0xffffffffffff8dc0 [act_ife] [ 509.271569] ... acquired at: [ 509.274912] _raw_read_lock+0x2f/0x40 [ 509.279134] find_ife_oplist+0x1e/0xc0 [act_ife] [ 509.284324] tcf_ife_init+0x82f/0xf40 [act_ife] [ 509.289425] tcf_action_init_1+0x510/0x750 [ 509.294068] tcf_action_init+0x1e8/0x340 [ 509.298553] tcf_action_add+0xc5/0x240 [ 509.302854] tc_ctl_action+0x203/0x2a0 [ 509.307153] rtnetlink_rcv_msg+0x5bd/0x7b0 [ 509.311805] netlink_rcv_skb+0x184/0x220 [ 509.316282] netlink_unicast+0x31b/0x460 [ 509.320769] netlink_sendmsg+0x3fb/0x840 [ 509.325248] sock_sendmsg+0x7b/0xd0 [ 509.329290] ___sys_sendmsg+0x4c6/0x610 [ 509.333687] __sys_sendmsg+0xd7/0x150 [ 509.337902] do_syscall_64+0x7a/0x3f0 [ 509.342116] entry_SYSCALL_64_after_hwframe+0x49/0xbe [ 509.349601] stack backtrace: [ 509.354663] CPU: 6 PID: 5460 Comm: tc Not tainted 4.18.0-rc8+ #646 [ 509.361216] Hardware name: Supermicro SYS-2028TP-DECR/X10DRT-P, BIOS 2.0b 03/30/2017 Fixes: ef6980b6becb ("introduce IFE action") Signed-off-by: Vlad Buslov Acked-by: Jamal Hadi Salim Signed-off-by: David S. 
Miller --- net/sched/act_ife.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) (limited to 'net/sched') diff --git a/net/sched/act_ife.c b/net/sched/act_ife.c index 5d200495e467..fdb928ca81bb 100644 --- a/net/sched/act_ife.c +++ b/net/sched/act_ife.c @@ -167,16 +167,16 @@ static struct tcf_meta_ops *find_ife_oplist(u16 metaid) { struct tcf_meta_ops *o; - read_lock(&ife_mod_lock); + read_lock_bh(&ife_mod_lock); list_for_each_entry(o, &ifeoplist, list) { if (o->metaid == metaid) { if (!try_module_get(o->owner)) o = NULL; - read_unlock(&ife_mod_lock); + read_unlock_bh(&ife_mod_lock); return o; } } - read_unlock(&ife_mod_lock); + read_unlock_bh(&ife_mod_lock); return NULL; } @@ -190,12 +190,12 @@ int register_ife_op(struct tcf_meta_ops *mops) !mops->get || !mops->alloc) return -EINVAL; - write_lock(&ife_mod_lock); + write_lock_bh(&ife_mod_lock); list_for_each_entry(m, &ifeoplist, list) { if (m->metaid == mops->metaid || (strcmp(mops->name, m->name) == 0)) { - write_unlock(&ife_mod_lock); + write_unlock_bh(&ife_mod_lock); return -EEXIST; } } @@ -204,7 +204,7 @@ int register_ife_op(struct tcf_meta_ops *mops) mops->release = ife_release_meta_gen; list_add_tail(&mops->list, &ifeoplist); - write_unlock(&ife_mod_lock); + write_unlock_bh(&ife_mod_lock); return 0; } EXPORT_SYMBOL_GPL(unregister_ife_op); @@ -214,7 +214,7 @@ int unregister_ife_op(struct tcf_meta_ops *mops) struct tcf_meta_ops *m; int err = -ENOENT; - write_lock(&ife_mod_lock); + write_lock_bh(&ife_mod_lock); list_for_each_entry(m, &ifeoplist, list) { if (m->metaid == mops->metaid) { list_del(&mops->list); @@ -222,7 +222,7 @@ int unregister_ife_op(struct tcf_meta_ops *mops) break; } } - write_unlock(&ife_mod_lock); + write_unlock_bh(&ife_mod_lock); return err; } @@ -343,13 +343,13 @@ static int use_all_metadata(struct tcf_ife_info *ife) int rc = 0; int installed = 0; - read_lock(&ife_mod_lock); + read_lock_bh(&ife_mod_lock); list_for_each_entry(o, &ifeoplist, list) { rc = add_metainfo(ife, o->metaid, NULL, 0, true); if (rc == 0) installed += 1; } - read_unlock(&ife_mod_lock); + read_unlock_bh(&ife_mod_lock); if (installed) return 0; -- cgit v1.2.3 From 2df8bee5654bb2b7312662ca6810d4dc16b0b67f Mon Sep 17 00:00:00 2001 From: Hangbin Liu Date: Mon, 13 Aug 2018 18:44:03 +0800 Subject: net_sched: fix NULL pointer dereference when delete tcindex filter Li Shuang reported the following crash: [ 71.267724] BUG: unable to handle kernel NULL pointer dereference at 0000000000000004 [ 71.276456] PGD 800000085d9bd067 P4D 800000085d9bd067 PUD 859a0b067 PMD 0 [ 71.284127] Oops: 0000 [#1] SMP PTI [ 71.288015] CPU: 12 PID: 2386 Comm: tc Not tainted 4.18.0-rc8.latest+ #131 [ 71.295686] Hardware name: Dell Inc. 
PowerEdge R730/0WCJNT, BIOS 2.1.5 04/11/2016 [ 71.304037] RIP: 0010:tcindex_delete+0x72/0x280 [cls_tcindex] [ 71.310446] Code: 00 31 f6 48 87 75 20 48 85 f6 74 11 48 8b 47 18 48 8b 40 08 48 8b 40 50 e8 fb a6 f8 fc 48 85 db 0f 84 dc 00 00 00 48 8b 73 18 <8b> 56 04 48 8d 7e 04 85 d2 0f 84 7b 01 00 [ 71.331517] RSP: 0018:ffffb45207b3f898 EFLAGS: 00010282 [ 71.337345] RAX: ffff8ad3d72d6360 RBX: ffff8acc84393680 RCX: 000000000000002e [ 71.345306] RDX: ffff8ad3d72c8570 RSI: 0000000000000000 RDI: ffff8ad847a45800 [ 71.353277] RBP: ffff8acc84393688 R08: ffff8ad3d72c8400 R09: 0000000000000000 [ 71.361238] R10: ffff8ad3de786e00 R11: 0000000000000000 R12: ffffb45207b3f8c7 [ 71.369199] R13: ffff8ad3d93bd2a0 R14: 000000000000002e R15: ffff8ad3d72c9600 [ 71.377161] FS: 00007f9d3ec3e740(0000) GS:ffff8ad3df980000(0000) knlGS:0000000000000000 [ 71.386188] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 71.392597] CR2: 0000000000000004 CR3: 0000000852f06003 CR4: 00000000001606e0 [ 71.400558] Call Trace: [ 71.403299] tcindex_destroy_element+0x25/0x40 [cls_tcindex] [ 71.409611] tcindex_walk+0xbb/0x110 [cls_tcindex] [ 71.414953] tcindex_destroy+0x44/0x90 [cls_tcindex] [ 71.420492] ? tcindex_delete+0x280/0x280 [cls_tcindex] [ 71.426323] tcf_proto_destroy+0x16/0x40 [ 71.430696] tcf_chain_flush+0x51/0x70 [ 71.434876] tcf_block_put_ext.part.30+0x8f/0x1b0 [ 71.440122] tcf_block_put+0x4d/0x70 [ 71.444108] cbq_destroy+0x4d/0xd0 [sch_cbq] [ 71.448869] qdisc_destroy+0x62/0x130 [ 71.452951] dsmark_destroy+0x2a/0x70 [sch_dsmark] [ 71.458300] qdisc_destroy+0x62/0x130 [ 71.462373] qdisc_graft+0x3ba/0x470 [ 71.466359] tc_get_qdisc+0x2a6/0x2c0 [ 71.470443] ? cred_has_capability+0x7d/0x130 [ 71.475307] rtnetlink_rcv_msg+0x263/0x2d0 [ 71.479875] ? rtnl_calcit.isra.30+0x110/0x110 [ 71.484832] netlink_rcv_skb+0x4d/0x130 [ 71.489109] netlink_unicast+0x1a3/0x250 [ 71.493482] netlink_sendmsg+0x2ae/0x3a0 [ 71.497859] sock_sendmsg+0x36/0x40 [ 71.501748] ___sys_sendmsg+0x26f/0x2d0 [ 71.506029] ? handle_pte_fault+0x586/0xdf0 [ 71.510694] ? __handle_mm_fault+0x389/0x500 [ 71.515457] ? 
__sys_sendmsg+0x5e/0xa0 [ 71.519636] __sys_sendmsg+0x5e/0xa0 [ 71.523626] do_syscall_64+0x5b/0x180 [ 71.527711] entry_SYSCALL_64_after_hwframe+0x44/0xa9 [ 71.533345] RIP: 0033:0x7f9d3e257f10 [ 71.537331] Code: c3 48 8b 05 82 6f 2c 00 f7 db 64 89 18 48 83 cb ff eb dd 0f 1f 80 00 00 00 00 83 3d 8d d0 2c 00 00 75 10 b8 2e 00 00 00 0f 05 <48> 3d 01 f0 ff ff 73 31 c3 48 83 ec 08 e8 [ 71.558401] RSP: 002b:00007fff6f893398 EFLAGS: 00000246 ORIG_RAX: 000000000000002e [ 71.566848] RAX: ffffffffffffffda RBX: 000000005b71274d RCX: 00007f9d3e257f10 [ 71.574810] RDX: 0000000000000000 RSI: 00007fff6f8933e0 RDI: 0000000000000003 [ 71.582770] RBP: 00007fff6f8933e0 R08: 000000000000ffff R09: 0000000000000003 [ 71.590729] R10: 00007fff6f892e20 R11: 0000000000000246 R12: 0000000000000000 [ 71.598689] R13: 0000000000662ee0 R14: 0000000000000000 R15: 0000000000000000 [ 71.606651] Modules linked in: sch_cbq cls_tcindex sch_dsmark xt_CHECKSUM iptable_mangle ipt_MASQUERADE iptable_nat nf_nat_ipv4 nf_nat nf_conntrack_ipv4 nf_defrag_ipv4 xt_conntrack nf_coni [ 71.685425] libahci i2c_algo_bit i2c_core i40e libata dca mdio megaraid_sas dm_mirror dm_region_hash dm_log dm_mod [ 71.697075] CR2: 0000000000000004 [ 71.700792] ---[ end trace f604eb1acacd978b ]---

Reproducer:
tc qdisc add dev lo handle 1:0 root dsmark indices 64 set_tc_index
tc filter add dev lo parent 1:0 protocol ip prio 1 tcindex mask 0xfc shift 2
tc qdisc add dev lo parent 1:0 handle 2:0 cbq bandwidth 10Mbit cell 8 avpkt 1000 mpu 64
tc class add dev lo parent 2:0 classid 2:1 cbq bandwidth 10Mbit rate 1500Kbit avpkt 1000 prio 1 bounded isolated allot 1514 weight 1 maxburst 10
tc filter add dev lo parent 2:0 protocol ip prio 1 handle 0x2e tcindex classid 2:1 pass_on
tc qdisc add dev lo parent 2:1 pfifo limit 5
tc qdisc del dev lo root

This is because in tcindex_set_parms(), when there is no old_r, we attach the new exts to cr.exts and never copy them to the filter when r == &new_filter_result. Then in tcindex_delete() -> tcf_exts_get_net(), we get a NULL pointer dereference because the exts were never initialized. Fix it by moving tcf_exts_change() after the "if (old_r && old_r != r)" check. Then we no longer need "cr", as there is no errout path after that. Fixes: bf63ac73b3e13 ("net_sched: fix an oops in tcindex filter") Reported-by: Li Shuang Signed-off-by: Hangbin Liu Acked-by: Cong Wang Signed-off-by: David S.
Miller --- net/sched/cls_tcindex.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'net/sched') diff --git a/net/sched/cls_tcindex.c b/net/sched/cls_tcindex.c index 32f4bbd82f35..ddaa4e63ce94 100644 --- a/net/sched/cls_tcindex.c +++ b/net/sched/cls_tcindex.c @@ -447,11 +447,6 @@ tcindex_set_parms(struct net *net, struct tcf_proto *tp, unsigned long base, tcf_bind_filter(tp, &cr.res, base); } - if (old_r) - tcf_exts_change(&r->exts, &e); - else - tcf_exts_change(&cr.exts, &e); - if (old_r && old_r != r) { err = tcindex_filter_result_init(old_r); if (err < 0) { @@ -462,6 +457,8 @@ tcindex_set_parms(struct net *net, struct tcf_proto *tp, unsigned long base, oldp = p; r->res = cr.res; + tcf_exts_change(&r->exts, &e); + rcu_assign_pointer(tp->root, cp); if (r == &new_filter_result) { -- cgit v1.2.3 From 008369dcc5f7bfba526c98054f8525322acf0ea3 Mon Sep 17 00:00:00 2001 From: Hangbin Liu Date: Mon, 13 Aug 2018 18:44:04 +0800 Subject: net_sched: Fix missing res info when creating a new tc_index filter Li Shuang reported the following warning: [ 733.484610] WARNING: CPU: 6 PID: 21123 at net/sched/sch_cbq.c:1418 cbq_destroy_class+0x5d/0x70 [sch_cbq] [ 733.495190] Modules linked in: sch_cbq cls_tcindex sch_dsmark rpcsec_gss_krb5 auth_rpcgss nfsv4 dns_resolver nfs lockd grace fscache xt_CHECKSUM iptable_mangle ipt_MASQUERADE iptable_nat l [ 733.574155] syscopyarea sysfillrect sysimgblt fb_sys_fops ttm drm igb ixgbe ahci libahci i2c_algo_bit libata i40e i2c_core dca mdio megaraid_sas dm_mirror dm_region_hash dm_log dm_mod [ 733.592500] CPU: 6 PID: 21123 Comm: tc Not tainted 4.18.0-rc8.latest+ #131 [ 733.600169] Hardware name: Dell Inc. PowerEdge R730/0WCJNT, BIOS 2.1.5 04/11/2016 [ 733.608518] RIP: 0010:cbq_destroy_class+0x5d/0x70 [sch_cbq] [ 733.614734] Code: e7 d9 d2 48 8b 7b 48 e8 61 05 da d2 48 8d bb f8 00 00 00 e8 75 ae d5 d2 48 39 eb 74 0a 48 89 df 5b 5d e9 16 6c 94 d2 5b 5d c3 <0f> 0b eb b6 0f 1f 44 00 00 66 2e 0f 1f 84 [ 733.635798] RSP: 0018:ffffbfbb066bb9d8 EFLAGS: 00010202 [ 733.641627] RAX: 0000000000000001 RBX: ffff9cdd17392800 RCX: 000000008010000f [ 733.649588] RDX: ffff9cdd1df547e0 RSI: ffff9cdd17392800 RDI: ffff9cdd0f84c800 [ 733.657547] RBP: ffff9cdd0f84c800 R08: 0000000000000001 R09: 0000000000000000 [ 733.665508] R10: ffff9cdd0f84d000 R11: 0000000000000001 R12: 0000000000000001 [ 733.673469] R13: 0000000000000000 R14: 0000000000000001 R15: ffff9cdd17392200 [ 733.681430] FS: 00007f911890a740(0000) GS:ffff9cdd1f8c0000(0000) knlGS:0000000000000000 [ 733.690456] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 733.696864] CR2: 0000000000b5544c CR3: 0000000859374002 CR4: 00000000001606e0 [ 733.704826] Call Trace: [ 733.707554] cbq_destroy+0xa1/0xd0 [sch_cbq] [ 733.712318] qdisc_destroy+0x62/0x130 [ 733.716401] dsmark_destroy+0x2a/0x70 [sch_dsmark] [ 733.721745] qdisc_destroy+0x62/0x130 [ 733.725829] qdisc_graft+0x3ba/0x470 [ 733.729817] tc_get_qdisc+0x2a6/0x2c0 [ 733.733901] ? cred_has_capability+0x7d/0x130 [ 733.738761] rtnetlink_rcv_msg+0x263/0x2d0 [ 733.743330] ? rtnl_calcit.isra.30+0x110/0x110 [ 733.748287] netlink_rcv_skb+0x4d/0x130 [ 733.752576] netlink_unicast+0x1a3/0x250 [ 733.756949] netlink_sendmsg+0x2ae/0x3a0 [ 733.761324] sock_sendmsg+0x36/0x40 [ 733.765213] ___sys_sendmsg+0x26f/0x2d0 [ 733.769493] ? handle_pte_fault+0x586/0xdf0 [ 733.774158] ? __handle_mm_fault+0x389/0x500 [ 733.778919] ?
__sys_sendmsg+0x5e/0xa0 [ 733.783099] __sys_sendmsg+0x5e/0xa0 [ 733.787087] do_syscall_64+0x5b/0x180 [ 733.791171] entry_SYSCALL_64_after_hwframe+0x44/0xa9 [ 733.796805] RIP: 0033:0x7f9117f23f10 [ 733.800791] Code: c3 48 8b 05 82 6f 2c 00 f7 db 64 89 18 48 83 cb ff eb dd 0f 1f 80 00 00 00 00 83 3d 8d d0 2c 00 00 75 10 b8 2e 00 00 00 0f 05 <48> 3d 01 f0 ff ff 73 31 c3 48 83 ec 08 e8 [ 733.821873] RSP: 002b:00007ffe96818398 EFLAGS: 00000246 ORIG_RAX: 000000000000002e [ 733.830319] RAX: ffffffffffffffda RBX: 000000005b71244c RCX: 00007f9117f23f10 [ 733.838280] RDX: 0000000000000000 RSI: 00007ffe968183e0 RDI: 0000000000000003 [ 733.846241] RBP: 00007ffe968183e0 R08: 000000000000ffff R09: 0000000000000003 [ 733.854202] R10: 00007ffe96817e20 R11: 0000000000000246 R12: 0000000000000000 [ 733.862161] R13: 0000000000662ee0 R14: 0000000000000000 R15: 0000000000000000 [ 733.870121] ---[ end trace 28edd4aad712ddca ]--- This is because we didn't update f->result.res when creating a new filter. Then in tcindex_delete() -> tcf_unbind_filter(), we will fail to find the res and unbind the filter, which triggers the WARN_ON() in cbq_destroy_class(). Fix it by updating f->result.res when creating a new filter. Fixes: 6e0565697a106 ("net_sched: fix another crash in cls_tcindex") Reported-by: Li Shuang Signed-off-by: Hangbin Liu Acked-by: Cong Wang Signed-off-by: David S. Miller --- net/sched/cls_tcindex.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net/sched') diff --git a/net/sched/cls_tcindex.c b/net/sched/cls_tcindex.c index ddaa4e63ce94..9ccc93f257db 100644 --- a/net/sched/cls_tcindex.c +++ b/net/sched/cls_tcindex.c @@ -465,6 +465,7 @@ tcindex_set_parms(struct net *net, struct tcf_proto *tp, unsigned long base, struct tcindex_filter *nfp; struct tcindex_filter __rcu **fp; + f->result.res = r->res; tcf_exts_change(&f->result.exts, &r->exts); fp = cp->h + (handle % cp->hash); -- cgit v1.2.3
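For context on why a missing res surfaces as this particular warning: tcf_unbind_filter() can only release the class reference recorded in the filter's own tcf_result. A paraphrased sketch of that contract (illustrative, condensed from the pkt_cls.h/sch_cbq.c of that era rather than quoted verbatim):

	/* Paraphrased sketch; names and layout are illustrative. */
	static void example_unbind_filter(struct Qdisc *q, struct tcf_result *r)
	{
		unsigned long cl = r->class;	/* stays 0 if res was never copied */

		r->class = 0;
		if (cl)				/* skipped for the affected filter */
			q->ops->cl_ops->unbind_tcf(q, cl);
	}

	/*
	 * With the unbind skipped, cbq's per-class filter refcount never
	 * drops back to zero, and class teardown then hits the equivalent
	 * of WARN_ON(cl->filters) in cbq_destroy_class(). Copying r->res
	 * into f->result.res before linking the new filter, as the
	 * one-line hunk above does, records the class so the unbind path
	 * can find it.
	 */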