Diffstat (limited to 'net/sched/sch_taprio.c')
-rw-r--r-- | net/sched/sch_taprio.c | 747
1 file changed, 507 insertions(+), 240 deletions(-)
diff --git a/net/sched/sch_taprio.c b/net/sched/sch_taprio.c
index c7041999eb5d..9ecfb8f5902a 100644
--- a/net/sched/sch_taprio.c
+++ b/net/sched/sch_taprio.c
@@ -13,13 +13,18 @@
 #include <linux/list.h>
 #include <linux/errno.h>
 #include <linux/skbuff.h>
+#include <linux/math64.h>
 #include <linux/module.h>
 #include <linux/spinlock.h>
+#include <linux/rcupdate.h>
 #include <net/netlink.h>
 #include <net/pkt_sched.h>
 #include <net/pkt_cls.h>
 #include <net/sch_generic.h>
 
+static LIST_HEAD(taprio_list);
+static DEFINE_SPINLOCK(taprio_list_lock);
+
 #define TAPRIO_ALL_GATES_OPEN -1
 
 struct sched_entry {
@@ -37,24 +42,88 @@ struct sched_entry {
 	u8 command;
 };
 
+struct sched_gate_list {
+	struct rcu_head rcu;
+	struct list_head entries;
+	size_t num_entries;
+	ktime_t cycle_close_time;
+	s64 cycle_time;
+	s64 cycle_time_extension;
+	s64 base_time;
+};
+
 struct taprio_sched {
 	struct Qdisc **qdiscs;
 	struct Qdisc *root;
-	s64 base_time;
 	int clockid;
-	int picos_per_byte; /* Using picoseconds because for 10Gbps+
-			     * speeds it's sub-nanoseconds per byte
-			     */
-	size_t num_entries;
+	atomic64_t picos_per_byte; /* Using picoseconds because for 10Gbps+
+				    * speeds it's sub-nanoseconds per byte
+				    */
 
 	/* Protects the update side of the RCU protected current_entry */
 	spinlock_t current_entry_lock;
 	struct sched_entry __rcu *current_entry;
-	struct list_head entries;
+	struct sched_gate_list __rcu *oper_sched;
+	struct sched_gate_list __rcu *admin_sched;
 	ktime_t (*get_time)(void);
 	struct hrtimer advance_timer;
+	struct list_head taprio_list;
 };
 
+static ktime_t sched_base_time(const struct sched_gate_list *sched)
+{
+	if (!sched)
+		return KTIME_MAX;
+
+	return ns_to_ktime(sched->base_time);
+}
+
+static void taprio_free_sched_cb(struct rcu_head *head)
+{
+	struct sched_gate_list *sched = container_of(head, struct sched_gate_list, rcu);
+	struct sched_entry *entry, *n;
+
+	if (!sched)
+		return;
+
+	list_for_each_entry_safe(entry, n, &sched->entries, list) {
+		list_del(&entry->list);
+		kfree(entry);
+	}
+
+	kfree(sched);
+}
+
+static void switch_schedules(struct taprio_sched *q,
+			     struct sched_gate_list **admin,
+			     struct sched_gate_list **oper)
+{
+	rcu_assign_pointer(q->oper_sched, *admin);
+	rcu_assign_pointer(q->admin_sched, NULL);
+
+	if (*oper)
+		call_rcu(&(*oper)->rcu, taprio_free_sched_cb);
+
+	*oper = *admin;
+	*admin = NULL;
+}
+
+static ktime_t get_cycle_time(struct sched_gate_list *sched)
+{
+	struct sched_entry *entry;
+	ktime_t cycle = 0;
+
+	if (sched->cycle_time != 0)
+		return sched->cycle_time;
+
+	list_for_each_entry(entry, &sched->entries, list)
+		cycle = ktime_add_ns(cycle, entry->interval);
+
+	sched->cycle_time = cycle;
+
+	return cycle;
+}
+
 static int taprio_enqueue(struct sk_buff *skb, struct Qdisc *sch,
 			  struct sk_buff **to_free)
 {
@@ -85,7 +154,7 @@ static struct sk_buff *taprio_peek(struct Qdisc *sch)
 
 	rcu_read_lock();
 	entry = rcu_dereference(q->current_entry);
-	gate_mask = entry ? entry->gate_mask : -1;
+	gate_mask = entry ? entry->gate_mask : TAPRIO_ALL_GATES_OPEN;
 	rcu_read_unlock();
 
 	if (!gate_mask)
@@ -107,7 +176,7 @@ static struct sk_buff *taprio_peek(struct Qdisc *sch)
 		tc = netdev_get_prio_tc_map(dev, prio);
 
 		if (!(gate_mask & BIT(tc)))
-			return NULL;
+			continue;
 
 		return skb;
 	}
@@ -117,18 +186,30 @@
 static inline int length_to_duration(struct taprio_sched *q, int len)
 {
-	return (len * q->picos_per_byte) / 1000;
+	return div_u64(len * atomic64_read(&q->picos_per_byte), 1000);
+}
+
+static void taprio_set_budget(struct taprio_sched *q, struct sched_entry *entry)
+{
+	atomic_set(&entry->budget,
+		   div64_u64((u64)entry->interval * 1000,
+			     atomic64_read(&q->picos_per_byte)));
 }
 
 static struct sk_buff *taprio_dequeue(struct Qdisc *sch)
 {
 	struct taprio_sched *q = qdisc_priv(sch);
 	struct net_device *dev = qdisc_dev(sch);
+	struct sk_buff *skb = NULL;
 	struct sched_entry *entry;
-	struct sk_buff *skb;
 	u32 gate_mask;
 	int i;
 
+	if (atomic64_read(&q->picos_per_byte) == -1) {
+		WARN_ONCE(1, "taprio: dequeue() called with unknown picos per byte.");
+		return NULL;
+	}
+
 	rcu_read_lock();
 	entry = rcu_dereference(q->current_entry);
 	/* if there's no entry, it means that the schedule didn't
@@ -137,10 +218,9 @@
 	 * "AdminGateSates"
 	 */
 	gate_mask = entry ? entry->gate_mask : TAPRIO_ALL_GATES_OPEN;
-	rcu_read_unlock();
 
 	if (!gate_mask)
-		return NULL;
+		goto done;
 
 	for (i = 0; i < dev->num_tx_queues; i++) {
 		struct Qdisc *child = q->qdiscs[i];
@@ -171,39 +251,81 @@
 		 */
 		if (gate_mask != TAPRIO_ALL_GATES_OPEN &&
 		    ktime_after(guard, entry->close_time))
-			return NULL;
+			continue;
 
 		/* ... and no budget. */
 		if (gate_mask != TAPRIO_ALL_GATES_OPEN &&
 		    atomic_sub_return(len, &entry->budget) < 0)
-			return NULL;
+			continue;
 
 		skb = child->ops->dequeue(child);
 		if (unlikely(!skb))
-			return NULL;
+			goto done;
 
 		qdisc_bstats_update(sch, skb);
 		qdisc_qstats_backlog_dec(sch, skb);
 		sch->q.qlen--;
 
-		return skb;
+		goto done;
 	}
 
-	return NULL;
+done:
+	rcu_read_unlock();
+
+	return skb;
 }
 
-static bool should_restart_cycle(const struct taprio_sched *q,
+static bool should_restart_cycle(const struct sched_gate_list *oper,
 				 const struct sched_entry *entry)
 {
-	WARN_ON(!entry);
+	if (list_is_last(&entry->list, &oper->entries))
+		return true;
+
+	if (ktime_compare(entry->close_time, oper->cycle_close_time) == 0)
+		return true;
+
+	return false;
+}
+
+static bool should_change_schedules(const struct sched_gate_list *admin,
+				    const struct sched_gate_list *oper,
+				    ktime_t close_time)
+{
+	ktime_t next_base_time, extension_time;
+
+	if (!admin)
+		return false;
 
-	return list_is_last(&entry->list, &q->entries);
+	next_base_time = sched_base_time(admin);
+
+	/* This is the simple case, the close_time would fall after
+	 * the next schedule base_time.
+	 */
+	if (ktime_compare(next_base_time, close_time) <= 0)
+		return true;
+
+	/* This is the cycle_time_extension case, if the close_time
+	 * plus the amount that can be extended would fall after the
+	 * next schedule base_time, we can extend the current schedule
+	 * for that amount.
+	 */
+	extension_time = ktime_add_ns(close_time, oper->cycle_time_extension);
+
+	/* FIXME: the IEEE 802.1Q-2018 Specification isn't clear about
+	 * how precisely the extension should be made. So after
+	 * conformance testing, this logic may change.
+	 */
+	if (ktime_compare(next_base_time, extension_time) <= 0)
+		return true;
+
+	return false;
 }
 
 static enum hrtimer_restart advance_sched(struct hrtimer *timer)
 {
 	struct taprio_sched *q = container_of(timer, struct taprio_sched,
 					      advance_timer);
+	struct sched_gate_list *oper, *admin;
 	struct sched_entry *entry, *next;
 	struct Qdisc *sch = q->root;
 	ktime_t close_time;
@@ -211,29 +333,49 @@
 	spin_lock(&q->current_entry_lock);
 	entry = rcu_dereference_protected(q->current_entry,
 					  lockdep_is_held(&q->current_entry_lock));
+	oper = rcu_dereference_protected(q->oper_sched,
+					 lockdep_is_held(&q->current_entry_lock));
+	admin = rcu_dereference_protected(q->admin_sched,
+					  lockdep_is_held(&q->current_entry_lock));
 
-	/* This is the case that it's the first time that the schedule
-	 * runs, so it only happens once per schedule. The first entry
-	 * is pre-calculated during the schedule initialization.
+	if (!oper)
+		switch_schedules(q, &admin, &oper);
+
+	/* This can happen in two cases: 1. this is the very first run
+	 * of this function (i.e. we weren't running any schedule
+	 * previously); 2. The previous schedule just ended. The first
+	 * entry of all schedules are pre-calculated during the
+	 * schedule initialization.
 	 */
-	if (unlikely(!entry)) {
-		next = list_first_entry(&q->entries, struct sched_entry,
+	if (unlikely(!entry || entry->close_time == oper->base_time)) {
+		next = list_first_entry(&oper->entries, struct sched_entry,
 					list);
 		close_time = next->close_time;
 		goto first_run;
 	}
 
-	if (should_restart_cycle(q, entry))
-		next = list_first_entry(&q->entries, struct sched_entry,
+	if (should_restart_cycle(oper, entry)) {
+		next = list_first_entry(&oper->entries, struct sched_entry,
 					list);
-	else
+		oper->cycle_close_time = ktime_add_ns(oper->cycle_close_time,
+						      oper->cycle_time);
+	} else {
 		next = list_next_entry(entry, list);
+	}
 
 	close_time = ktime_add_ns(entry->close_time, next->interval);
+	close_time = min_t(ktime_t, close_time, oper->cycle_close_time);
+
+	if (should_change_schedules(admin, oper, close_time)) {
+		/* Set things so the next time this runs, the new
+		 * schedule runs.
+		 */
+		close_time = sched_base_time(admin);
+		switch_schedules(q, &admin, &oper);
+	}
 
 	next->close_time = close_time;
-	atomic_set(&next->budget,
-		   (next->interval * 1000) / q->picos_per_byte);
+	taprio_set_budget(q, next);
 
 first_run:
 	rcu_assign_pointer(q->current_entry, next);
@@ -263,10 +405,12 @@ static const struct nla_policy taprio_policy[TCA_TAPRIO_ATTR_MAX + 1] = {
 	[TCA_TAPRIO_ATTR_PRIOMAP]	       = {
 		.len = sizeof(struct tc_mqprio_qopt)
 	},
-	[TCA_TAPRIO_ATTR_SCHED_ENTRY_LIST]     = { .type = NLA_NESTED },
-	[TCA_TAPRIO_ATTR_SCHED_BASE_TIME]      = { .type = NLA_S64 },
-	[TCA_TAPRIO_ATTR_SCHED_SINGLE_ENTRY]   = { .type = NLA_NESTED },
-	[TCA_TAPRIO_ATTR_SCHED_CLOCKID]        = { .type = NLA_S32 },
+	[TCA_TAPRIO_ATTR_SCHED_ENTRY_LIST]           = { .type = NLA_NESTED },
+	[TCA_TAPRIO_ATTR_SCHED_BASE_TIME]            = { .type = NLA_S64 },
+	[TCA_TAPRIO_ATTR_SCHED_SINGLE_ENTRY]         = { .type = NLA_NESTED },
+	[TCA_TAPRIO_ATTR_SCHED_CLOCKID]              = { .type = NLA_S32 },
+	[TCA_TAPRIO_ATTR_SCHED_CYCLE_TIME]           = { .type = NLA_S64 },
+	[TCA_TAPRIO_ATTR_SCHED_CYCLE_TIME_EXTENSION] = { .type = NLA_S64 },
 };
 
 static int fill_sched_entry(struct nlattr **tb, struct sched_entry *entry,
@@ -302,8 +446,8 @@ static int parse_sched_entry(struct nlattr *n, struct sched_entry *entry,
 	struct nlattr *tb[TCA_TAPRIO_SCHED_ENTRY_MAX + 1] = { };
 	int err;
 
-	err = nla_parse_nested(tb, TCA_TAPRIO_SCHED_ENTRY_MAX, n,
-			       entry_policy, NULL);
+	err = nla_parse_nested_deprecated(tb, TCA_TAPRIO_SCHED_ENTRY_MAX, n,
+					  entry_policy, NULL);
 	if (err < 0) {
 		NL_SET_ERR_MSG(extack, "Could not parse nested entry");
 		return -EINVAL;
@@ -314,70 +458,8 @@ static int parse_sched_entry(struct nlattr *n, struct sched_entry *entry,
 	return fill_sched_entry(tb, entry, extack);
 }
 
-/* Returns the number of entries in case of success */
-static int parse_sched_single_entry(struct nlattr *n,
-				    struct taprio_sched *q,
-				    struct netlink_ext_ack *extack)
-{
-	struct nlattr *tb_entry[TCA_TAPRIO_SCHED_ENTRY_MAX + 1] = { };
-	struct nlattr *tb_list[TCA_TAPRIO_SCHED_MAX + 1] = { };
-	struct sched_entry *entry;
-	bool found = false;
-	u32 index;
-	int err;
-
-	err = nla_parse_nested(tb_list, TCA_TAPRIO_SCHED_MAX,
-			       n, entry_list_policy, NULL);
-	if (err < 0) {
-		NL_SET_ERR_MSG(extack, "Could not parse nested entry");
-		return -EINVAL;
-	}
-
-	if (!tb_list[TCA_TAPRIO_SCHED_ENTRY]) {
-		NL_SET_ERR_MSG(extack, "Single-entry must include an entry");
-		return -EINVAL;
-	}
-
-	err = nla_parse_nested(tb_entry, TCA_TAPRIO_SCHED_ENTRY_MAX,
-			       tb_list[TCA_TAPRIO_SCHED_ENTRY],
-			       entry_policy, NULL);
-	if (err < 0) {
-		NL_SET_ERR_MSG(extack, "Could not parse nested entry");
-		return -EINVAL;
-	}
-
-	if (!tb_entry[TCA_TAPRIO_SCHED_ENTRY_INDEX]) {
-		NL_SET_ERR_MSG(extack, "Entry must specify an index\n");
-		return -EINVAL;
-	}
-
-	index = nla_get_u32(tb_entry[TCA_TAPRIO_SCHED_ENTRY_INDEX]);
-	if (index >= q->num_entries) {
-		NL_SET_ERR_MSG(extack, "Index for single entry exceeds number of entries in schedule");
-		return -EINVAL;
-	}
-
-	list_for_each_entry(entry, &q->entries, list) {
-		if (entry->index == index) {
-			found = true;
-			break;
-		}
-	}
-
-	if (!found) {
-		NL_SET_ERR_MSG(extack, "Could not find entry");
-		return -ENOENT;
-	}
-
-	err = fill_sched_entry(tb_entry, entry, extack);
-	if (err < 0)
-		return err;
-
-	return q->num_entries;
-}
-
 static int parse_sched_list(struct nlattr *list,
-			    struct taprio_sched *q,
+			    struct sched_gate_list *sched,
 			    struct netlink_ext_ack *extack)
 {
 	struct nlattr *n;
@@ -407,64 +489,42 @@ static int parse_sched_list(struct nlattr *list,
 			return err;
 		}
 
-		list_add_tail(&entry->list, &q->entries);
+		list_add_tail(&entry->list, &sched->entries);
 		i++;
 	}
 
-	q->num_entries = i;
+	sched->num_entries = i;
 
 	return i;
 }
 
-/* Returns the number of entries in case of success */
-static int parse_taprio_opt(struct nlattr **tb, struct taprio_sched *q,
-			    struct netlink_ext_ack *extack)
+static int parse_taprio_schedule(struct nlattr **tb,
+				 struct sched_gate_list *new,
+				 struct netlink_ext_ack *extack)
 {
 	int err = 0;
-	int clockid;
 
-	if (tb[TCA_TAPRIO_ATTR_SCHED_ENTRY_LIST] &&
-	    tb[TCA_TAPRIO_ATTR_SCHED_SINGLE_ENTRY])
-		return -EINVAL;
-
-	if (tb[TCA_TAPRIO_ATTR_SCHED_SINGLE_ENTRY] && q->num_entries == 0)
-		return -EINVAL;
-
-	if (q->clockid == -1 && !tb[TCA_TAPRIO_ATTR_SCHED_CLOCKID])
-		return -EINVAL;
+	if (tb[TCA_TAPRIO_ATTR_SCHED_SINGLE_ENTRY]) {
+		NL_SET_ERR_MSG(extack, "Adding a single entry is not supported");
+		return -ENOTSUPP;
+	}
 
 	if (tb[TCA_TAPRIO_ATTR_SCHED_BASE_TIME])
-		q->base_time = nla_get_s64(
-			tb[TCA_TAPRIO_ATTR_SCHED_BASE_TIME]);
+		new->base_time = nla_get_s64(tb[TCA_TAPRIO_ATTR_SCHED_BASE_TIME]);
 
-	if (tb[TCA_TAPRIO_ATTR_SCHED_CLOCKID]) {
-		clockid = nla_get_s32(tb[TCA_TAPRIO_ATTR_SCHED_CLOCKID]);
+	if (tb[TCA_TAPRIO_ATTR_SCHED_CYCLE_TIME_EXTENSION])
+		new->cycle_time_extension = nla_get_s64(tb[TCA_TAPRIO_ATTR_SCHED_CYCLE_TIME_EXTENSION]);
 
-		/* We only support static clockids and we don't allow
-		 * for it to be modified after the first init.
-		 */
-		if (clockid < 0 || (q->clockid != -1 && q->clockid != clockid))
-			return -EINVAL;
-
-		q->clockid = clockid;
-	}
+	if (tb[TCA_TAPRIO_ATTR_SCHED_CYCLE_TIME])
+		new->cycle_time = nla_get_s64(tb[TCA_TAPRIO_ATTR_SCHED_CYCLE_TIME]);
 
 	if (tb[TCA_TAPRIO_ATTR_SCHED_ENTRY_LIST])
 		err = parse_sched_list(
-			tb[TCA_TAPRIO_ATTR_SCHED_ENTRY_LIST], q, extack);
-	else if (tb[TCA_TAPRIO_ATTR_SCHED_SINGLE_ENTRY])
-		err = parse_sched_single_entry(
-			tb[TCA_TAPRIO_ATTR_SCHED_SINGLE_ENTRY], q, extack);
-
-	/* parse_sched_* return the number of entries in the schedule,
-	 * a schedule with zero entries is an error.
-	 */
-	if (err == 0) {
-		NL_SET_ERR_MSG(extack, "The schedule should contain at least one entry");
-		return -EINVAL;
-	}
+		tb[TCA_TAPRIO_ATTR_SCHED_ENTRY_LIST], new, extack);
+	if (err < 0)
+		return err;
 
-	return err;
+	return 0;
 }
 
 static int taprio_parse_mqprio_opt(struct net_device *dev,
@@ -473,11 +533,17 @@
 {
 	int i, j;
 
-	if (!qopt) {
+	if (!qopt && !dev->num_tc) {
 		NL_SET_ERR_MSG(extack, "'mqprio' configuration is necessary");
 		return -EINVAL;
 	}
 
+	/* If num_tc is already set, it means that the user already
+	 * configured the mqprio part
+	 */
+	if (dev->num_tc)
+		return 0;
+
 	/* Verify num_tc is not out of max range */
 	if (qopt->num_tc > TC_MAX_QUEUE) {
 		NL_SET_ERR_MSG(extack, "Number of traffic classes is outside valid range");
@@ -523,76 +589,141 @@
 	return 0;
 }
 
-static ktime_t taprio_get_start_time(struct Qdisc *sch)
+static int taprio_get_start_time(struct Qdisc *sch,
+				 struct sched_gate_list *sched,
+				 ktime_t *start)
 {
 	struct taprio_sched *q = qdisc_priv(sch);
-	struct sched_entry *entry;
 	ktime_t now, base, cycle;
 	s64 n;
 
-	base = ns_to_ktime(q->base_time);
-	cycle = 0;
-
-	/* Calculate the cycle_time, by summing all the intervals.
-	 */
-	list_for_each_entry(entry, &q->entries, list)
-		cycle = ktime_add_ns(cycle, entry->interval);
+	base = sched_base_time(sched);
+	now = q->get_time();
 
-	if (!cycle)
-		return base;
+	if (ktime_after(base, now)) {
+		*start = base;
+		return 0;
+	}
 
-	now = q->get_time();
+	cycle = get_cycle_time(sched);
 
-	if (ktime_after(base, now))
-		return base;
+	/* The qdisc is expected to have at least one sched_entry. Moreover,
+	 * any entry must have 'interval' > 0. Thus if the cycle time is zero,
+	 * something went really wrong. In that case, we should warn about this
	 * inconsistent state and return error.
+	 */
+	if (WARN_ON(!cycle))
+		return -EFAULT;
 
 	/* Schedule the start time for the beginning of the next
 	 * cycle.
 	 */
 	n = div64_s64(ktime_sub_ns(now, base), cycle);
-
-	return ktime_add_ns(base, (n + 1) * cycle);
+	*start = ktime_add_ns(base, (n + 1) * cycle);
+	return 0;
 }
 
-static void taprio_start_sched(struct Qdisc *sch, ktime_t start)
+static void setup_first_close_time(struct taprio_sched *q,
+				   struct sched_gate_list *sched, ktime_t base)
 {
-	struct taprio_sched *q = qdisc_priv(sch);
 	struct sched_entry *first;
-	unsigned long flags;
+	ktime_t cycle;
 
-	spin_lock_irqsave(&q->current_entry_lock, flags);
+	first = list_first_entry(&sched->entries,
+				 struct sched_entry, list);
+
+	cycle = get_cycle_time(sched);
 
-	first = list_first_entry(&q->entries, struct sched_entry,
-				 list);
+	/* FIXME: find a better place to do this */
+	sched->cycle_close_time = ktime_add_ns(base, cycle);
 
-	first->close_time = ktime_add_ns(start, first->interval);
-	atomic_set(&first->budget,
-		   (first->interval * 1000) / q->picos_per_byte);
+	first->close_time = ktime_add_ns(base, first->interval);
+	taprio_set_budget(q, first);
 	rcu_assign_pointer(q->current_entry, NULL);
+}
 
-	spin_unlock_irqrestore(&q->current_entry_lock, flags);
+static void taprio_start_sched(struct Qdisc *sch,
+			       ktime_t start, struct sched_gate_list *new)
+{
+	struct taprio_sched *q = qdisc_priv(sch);
+	ktime_t expires;
+
+	expires = hrtimer_get_expires(&q->advance_timer);
+	if (expires == 0)
+		expires = KTIME_MAX;
+
+	/* If the new schedule starts before the next expiration, we
+	 * reprogram it to the earliest one, so we change the admin
+	 * schedule to the operational one at the right time.
+	 */
+	start = min_t(ktime_t, start, expires);
 
 	hrtimer_start(&q->advance_timer, start, HRTIMER_MODE_ABS);
 }
 
+static void taprio_set_picos_per_byte(struct net_device *dev,
+				      struct taprio_sched *q)
+{
+	struct ethtool_link_ksettings ecmd;
+	int picos_per_byte = -1;
+
+	if (!__ethtool_get_link_ksettings(dev, &ecmd) &&
+	    ecmd.base.speed != SPEED_UNKNOWN)
+		picos_per_byte = div64_s64(NSEC_PER_SEC * 1000LL * 8,
+					   ecmd.base.speed * 1000 * 1000);
+
+	atomic64_set(&q->picos_per_byte, picos_per_byte);
+	netdev_dbg(dev, "taprio: set %s's picos_per_byte to: %lld, linkspeed: %d\n",
+		   dev->name, (long long)atomic64_read(&q->picos_per_byte),
+		   ecmd.base.speed);
+}
+
+static int taprio_dev_notifier(struct notifier_block *nb, unsigned long event,
+			       void *ptr)
+{
+	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+	struct net_device *qdev;
+	struct taprio_sched *q;
+	bool found = false;
+
+	ASSERT_RTNL();
+
+	if (event != NETDEV_UP && event != NETDEV_CHANGE)
+		return NOTIFY_DONE;
+
+	spin_lock(&taprio_list_lock);
+	list_for_each_entry(q, &taprio_list, taprio_list) {
+		qdev = qdisc_dev(q->root);
+		if (qdev == dev) {
+			found = true;
+			break;
+		}
+	}
+	spin_unlock(&taprio_list_lock);
+
+	if (found)
+		taprio_set_picos_per_byte(dev, q);
+
+	return NOTIFY_DONE;
+}
+
 static int taprio_change(struct Qdisc *sch, struct nlattr *opt,
 			 struct netlink_ext_ack *extack)
 {
 	struct nlattr *tb[TCA_TAPRIO_ATTR_MAX + 1] = { };
+	struct sched_gate_list *oper, *admin, *new_admin;
 	struct taprio_sched *q = qdisc_priv(sch);
 	struct net_device *dev = qdisc_dev(sch);
 	struct tc_mqprio_qopt *mqprio = NULL;
-	struct ethtool_link_ksettings ecmd;
-	int i, err, size;
-	s64 link_speed;
+	int i, err, clockid;
+	unsigned long flags;
 	ktime_t start;
 
-	err = nla_parse_nested(tb, TCA_TAPRIO_ATTR_MAX, opt,
-			       taprio_policy, extack);
+	err = nla_parse_nested_deprecated(tb, TCA_TAPRIO_ATTR_MAX, opt,
					  taprio_policy, extack);
 	if (err < 0)
 		return err;
 
-	err = -EINVAL;
 	if (tb[TCA_TAPRIO_ATTR_PRIOMAP])
 		mqprio = nla_data(tb[TCA_TAPRIO_ATTR_PRIOMAP]);
 
@@ -600,13 +731,78 @@ static int taprio_change(struct Qdisc *sch, struct nlattr *opt,
 	if (err < 0)
 		return err;
 
-	/* A schedule with less than one entry is an error */
-	size = parse_taprio_opt(tb, q, extack);
-	if (size < 0)
-		return size;
+	new_admin = kzalloc(sizeof(*new_admin), GFP_KERNEL);
+	if (!new_admin) {
+		NL_SET_ERR_MSG(extack, "Not enough memory for a new schedule");
+		return -ENOMEM;
+	}
+	INIT_LIST_HEAD(&new_admin->entries);
 
-	hrtimer_init(&q->advance_timer, q->clockid, HRTIMER_MODE_ABS);
-	q->advance_timer.function = advance_sched;
+	rcu_read_lock();
+	oper = rcu_dereference(q->oper_sched);
+	admin = rcu_dereference(q->admin_sched);
+	rcu_read_unlock();
+
+	if (mqprio && (oper || admin)) {
+		NL_SET_ERR_MSG(extack, "Changing the traffic mapping of a running schedule is not supported");
+		err = -ENOTSUPP;
+		goto free_sched;
+	}
+
+	err = parse_taprio_schedule(tb, new_admin, extack);
+	if (err < 0)
+		goto free_sched;
+
+	if (new_admin->num_entries == 0) {
+		NL_SET_ERR_MSG(extack, "There should be at least one entry in the schedule");
+		err = -EINVAL;
+		goto free_sched;
+	}
+
+	if (tb[TCA_TAPRIO_ATTR_SCHED_CLOCKID]) {
+		clockid = nla_get_s32(tb[TCA_TAPRIO_ATTR_SCHED_CLOCKID]);
+
+		/* We only support static clockids and we don't allow
+		 * for it to be modified after the first init.
+		 */
+		if (clockid < 0 ||
+		    (q->clockid != -1 && q->clockid != clockid)) {
+			NL_SET_ERR_MSG(extack, "Changing the 'clockid' of a running schedule is not supported");
+			err = -ENOTSUPP;
+			goto free_sched;
+		}
+
+		q->clockid = clockid;
+	}
+
+	if (q->clockid == -1 && !tb[TCA_TAPRIO_ATTR_SCHED_CLOCKID]) {
+		NL_SET_ERR_MSG(extack, "Specifying a 'clockid' is mandatory");
+		err = -EINVAL;
+		goto free_sched;
+	}
+
+	taprio_set_picos_per_byte(dev, q);
+
+	/* Protects against enqueue()/dequeue() */
+	spin_lock_bh(qdisc_lock(sch));
+
+	if (!hrtimer_active(&q->advance_timer)) {
+		hrtimer_init(&q->advance_timer, q->clockid, HRTIMER_MODE_ABS);
+		q->advance_timer.function = advance_sched;
+	}
+
+	if (mqprio) {
+		netdev_set_num_tc(dev, mqprio->num_tc);
+		for (i = 0; i < mqprio->num_tc; i++)
+			netdev_set_tc_queue(dev, i,
+					    mqprio->count[i],
+					    mqprio->offset[i]);
+
+		/* Always use supplied priority mappings */
+		for (i = 0; i < TC_BITMASK + 1; i++)
+			netdev_set_prio_tc_map(dev, i,
+					       mqprio->prio_tc_map[i]);
+	}
 
 	switch (q->clockid) {
 	case CLOCK_REALTIME:
@@ -622,65 +818,52 @@
 		q->get_time = ktime_get_clocktai;
 		break;
 	default:
-		return -ENOTSUPP;
+		NL_SET_ERR_MSG(extack, "Invalid 'clockid'");
+		err = -EINVAL;
+		goto unlock;
 	}
 
-	for (i = 0; i < dev->num_tx_queues; i++) {
-		struct netdev_queue *dev_queue;
-		struct Qdisc *qdisc;
-
-		dev_queue = netdev_get_tx_queue(dev, i);
-		qdisc = qdisc_create_dflt(dev_queue,
-					  &pfifo_qdisc_ops,
-					  TC_H_MAKE(TC_H_MAJ(sch->handle),
-						    TC_H_MIN(i + 1)),
-					  extack);
-		if (!qdisc)
-			return -ENOMEM;
+	err = taprio_get_start_time(sch, new_admin, &start);
+	if (err < 0) {
+		NL_SET_ERR_MSG(extack, "Internal error: failed get start time");
+		goto unlock;
+	}
 
-		if (i < dev->real_num_tx_queues)
-			qdisc_hash_add(qdisc, false);
+	setup_first_close_time(q, new_admin, start);
 
-		q->qdiscs[i] = qdisc;
-	}
+	/* Protects against advance_sched() */
+	spin_lock_irqsave(&q->current_entry_lock, flags);
 
-	if (mqprio) {
-		netdev_set_num_tc(dev, mqprio->num_tc);
-		for (i = 0; i < mqprio->num_tc; i++)
-			netdev_set_tc_queue(dev, i,
-					    mqprio->count[i],
-					    mqprio->offset[i]);
+	taprio_start_sched(sch, start, new_admin);
 
-		/* Always use supplied priority mappings */
-		for (i = 0; i < TC_BITMASK + 1; i++)
-			netdev_set_prio_tc_map(dev, i,
-					       mqprio->prio_tc_map[i]);
-	}
+	rcu_assign_pointer(q->admin_sched, new_admin);
+	if (admin)
+		call_rcu(&admin->rcu, taprio_free_sched_cb);
+	new_admin = NULL;
 
-	if (!__ethtool_get_link_ksettings(dev, &ecmd))
-		link_speed = ecmd.base.speed;
-	else
-		link_speed = SPEED_1000;
+	spin_unlock_irqrestore(&q->current_entry_lock, flags);
 
-	q->picos_per_byte = div64_s64(NSEC_PER_SEC * 1000LL * 8,
-				      link_speed * 1000 * 1000);
+	err = 0;
 
-	start = taprio_get_start_time(sch);
-	if (!start)
-		return 0;
+unlock:
+	spin_unlock_bh(qdisc_lock(sch));
 
-	taprio_start_sched(sch, start);
+free_sched:
+	kfree(new_admin);
 
-	return 0;
+	return err;
 }
 
 static void taprio_destroy(struct Qdisc *sch)
 {
 	struct taprio_sched *q = qdisc_priv(sch);
 	struct net_device *dev = qdisc_dev(sch);
-	struct sched_entry *entry, *n;
 	unsigned int i;
 
+	spin_lock(&taprio_list_lock);
+	list_del(&q->taprio_list);
+	spin_unlock(&taprio_list_lock);
+
 	hrtimer_cancel(&q->advance_timer);
 
 	if (q->qdiscs) {
@@ -693,10 +876,11 @@ static void taprio_destroy(struct Qdisc *sch)
 
 	netdev_set_num_tc(dev, 0);
 
-	list_for_each_entry_safe(entry, n, &q->entries, list) {
-		list_del(&entry->list);
-		kfree(entry);
-	}
+	if (q->oper_sched)
+		call_rcu(&q->oper_sched->rcu, taprio_free_sched_cb);
+
+	if (q->admin_sched)
+		call_rcu(&q->admin_sched->rcu, taprio_free_sched_cb);
 }
 
 static int taprio_init(struct Qdisc *sch, struct nlattr *opt,
@@ -704,12 +888,12 @@
 {
 	struct taprio_sched *q = qdisc_priv(sch);
 	struct net_device *dev = qdisc_dev(sch);
+	int i;
 
-	INIT_LIST_HEAD(&q->entries);
 	spin_lock_init(&q->current_entry_lock);
 
-	/* We may overwrite the configuration later */
 	hrtimer_init(&q->advance_timer, CLOCK_TAI, HRTIMER_MODE_ABS);
+	q->advance_timer.function = advance_sched;
 
 	q->root = sch;
 
@@ -735,6 +919,29 @@
 	if (!opt)
 		return -EINVAL;
 
+	spin_lock(&taprio_list_lock);
+	list_add(&q->taprio_list, &taprio_list);
+	spin_unlock(&taprio_list_lock);
+
+	for (i = 0; i < dev->num_tx_queues; i++) {
+		struct netdev_queue *dev_queue;
+		struct Qdisc *qdisc;
+
+		dev_queue = netdev_get_tx_queue(dev, i);
+		qdisc = qdisc_create_dflt(dev_queue,
+					  &pfifo_qdisc_ops,
+					  TC_H_MAKE(TC_H_MAJ(sch->handle),
+						    TC_H_MIN(i + 1)),
+					  extack);
+		if (!qdisc)
+			return -ENOMEM;
+
+		if (i < dev->real_num_tx_queues)
+			qdisc_hash_add(qdisc, false);
+
+		q->qdiscs[i] = qdisc;
+	}
+
 	return taprio_change(sch, opt, extack);
 }
 
@@ -781,7 +988,7 @@ static int dump_entry(struct sk_buff *msg,
 {
 	struct nlattr *item;
 
-	item = nla_nest_start(msg, TCA_TAPRIO_SCHED_ENTRY);
+	item = nla_nest_start_noflag(msg, TCA_TAPRIO_SCHED_ENTRY);
 	if (!item)
 		return -ENOSPC;
 
@@ -806,15 +1013,55 @@
 nla_put_failure:
 	return -1;
 }
 
+static int dump_schedule(struct sk_buff *msg,
+			 const struct sched_gate_list *root)
+{
+	struct nlattr *entry_list;
+	struct sched_entry *entry;
+
+	if (nla_put_s64(msg, TCA_TAPRIO_ATTR_SCHED_BASE_TIME,
+			root->base_time, TCA_TAPRIO_PAD))
+		return -1;
+
+	if (nla_put_s64(msg, TCA_TAPRIO_ATTR_SCHED_CYCLE_TIME,
+			root->cycle_time, TCA_TAPRIO_PAD))
+		return -1;
+
+	if (nla_put_s64(msg, TCA_TAPRIO_ATTR_SCHED_CYCLE_TIME_EXTENSION,
+			root->cycle_time_extension, TCA_TAPRIO_PAD))
+		return -1;
+
+	entry_list = nla_nest_start_noflag(msg,
+					   TCA_TAPRIO_ATTR_SCHED_ENTRY_LIST);
+	if (!entry_list)
+		goto error_nest;
+
+	list_for_each_entry(entry, &root->entries, list) {
+		if (dump_entry(msg, entry) < 0)
+			goto error_nest;
+	}
+
+	nla_nest_end(msg, entry_list);
+	return 0;
+
+error_nest:
+	nla_nest_cancel(msg, entry_list);
+	return -1;
+}
+
 static int taprio_dump(struct Qdisc *sch, struct sk_buff *skb)
 {
 	struct taprio_sched *q = qdisc_priv(sch);
 	struct net_device *dev = qdisc_dev(sch);
+	struct sched_gate_list *oper, *admin;
 	struct tc_mqprio_qopt opt = { 0 };
-	struct nlattr *nest, *entry_list;
-	struct sched_entry *entry;
+	struct nlattr *nest, *sched_nest;
 	unsigned int i;
 
+	rcu_read_lock();
+	oper = rcu_dereference(q->oper_sched);
+	admin = rcu_dereference(q->admin_sched);
+
 	opt.num_tc = netdev_get_num_tc(dev);
 	memcpy(opt.prio_tc_map, dev->prio_tc_map, sizeof(opt.prio_tc_map));
 
@@ -823,36 +1070,45 @@
 		opt.offset[i] = dev->tc_to_txq[i].offset;
 	}
 
-	nest = nla_nest_start(skb, TCA_OPTIONS);
+	nest = nla_nest_start_noflag(skb, TCA_OPTIONS);
 	if (!nest)
-		return -ENOSPC;
+		goto start_error;
 
 	if (nla_put(skb, TCA_TAPRIO_ATTR_PRIOMAP, sizeof(opt), &opt))
 		goto options_error;
 
-	if (nla_put_s64(skb, TCA_TAPRIO_ATTR_SCHED_BASE_TIME,
-			q->base_time, TCA_TAPRIO_PAD))
+	if (nla_put_s32(skb, TCA_TAPRIO_ATTR_SCHED_CLOCKID, q->clockid))
 		goto options_error;
 
-	if (nla_put_s32(skb, TCA_TAPRIO_ATTR_SCHED_CLOCKID, q->clockid))
+	if (oper && dump_schedule(skb, oper))
 		goto options_error;
 
-	entry_list = nla_nest_start(skb, TCA_TAPRIO_ATTR_SCHED_ENTRY_LIST);
-	if (!entry_list)
+	if (!admin)
+		goto done;
+
+	sched_nest = nla_nest_start_noflag(skb, TCA_TAPRIO_ATTR_ADMIN_SCHED);
+	if (!sched_nest)
 		goto options_error;
 
-	list_for_each_entry(entry, &q->entries, list) {
-		if (dump_entry(skb, entry) < 0)
-			goto options_error;
-	}
+	if (dump_schedule(skb, admin))
+		goto admin_error;
+
+	nla_nest_end(skb, sched_nest);
 
-	nla_nest_end(skb, entry_list);
+done:
+	rcu_read_unlock();
 
 	return nla_nest_end(skb, nest);
 
+admin_error:
+	nla_nest_cancel(skb, sched_nest);
+
 options_error:
 	nla_nest_cancel(skb, nest);
-	return -1;
+
+start_error:
+	rcu_read_unlock();
+	return -ENOSPC;
 }
 
 static struct Qdisc *taprio_leaf(struct Qdisc *sch, unsigned long cl)
@@ -939,6 +1195,7 @@ static struct Qdisc_ops taprio_qdisc_ops __read_mostly = {
 	.id		= "taprio",
 	.priv_size	= sizeof(struct taprio_sched),
 	.init		= taprio_init,
+	.change		= taprio_change,
 	.destroy	= taprio_destroy,
 	.peek		= taprio_peek,
 	.dequeue	= taprio_dequeue,
@@ -947,14 +1204,24 @@
 	.owner		= THIS_MODULE,
 };
 
+static struct notifier_block taprio_device_notifier = {
+	.notifier_call = taprio_dev_notifier,
+};
+
 static int __init taprio_module_init(void)
 {
+	int err = register_netdevice_notifier(&taprio_device_notifier);
+
+	if (err)
+		return err;
+
 	return register_qdisc(&taprio_qdisc_ops);
 }
 
 static void __exit taprio_module_exit(void)
 {
 	unregister_qdisc(&taprio_qdisc_ops);
+	unregister_netdevice_notifier(&taprio_device_notifier);
 }
 
 module_init(taprio_module_init);
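
A note on the unit arithmetic above: taprio_set_picos_per_byte() derives the per-byte wire time from the link speed that ethtool reports in Mbit/s (falling back to -1 when the speed is unknown, which dequeue() now catches with WARN_ONCE), and taprio_set_budget() converts a gate interval in nanoseconds into a byte budget. The following standalone userspace sketch mirrors both computations; it is illustrative only (plain int64_t in place of atomic64_t, and the helper names are ours, not the kernel's):

#include <stdio.h>
#include <stdint.h>

#define NSEC_PER_SEC 1000000000LL

/* speed is in Mbit/s, as in ethtool's ecmd.base.speed */
static int64_t picos_per_byte(int64_t speed_mbps)
{
	/* 1 byte = 8 bits; the factor of 1000 scales nanoseconds to picoseconds */
	return (NSEC_PER_SEC * 1000LL * 8) / (speed_mbps * 1000 * 1000);
}

/* bytes that fit into a gate interval, as in taprio_set_budget() */
static int64_t budget_bytes(int64_t interval_ns, int64_t ppb)
{
	return (interval_ns * 1000) / ppb;
}

int main(void)
{
	printf("1G:  %lld ps/byte\n", (long long)picos_per_byte(1000));   /* 8000 */
	printf("10G: %lld ps/byte\n", (long long)picos_per_byte(10000)); /* 800 */
	printf("budget(10us @ 1G) = %lld bytes\n",
	       (long long)budget_bytes(10000, picos_per_byte(1000)));   /* 1250 */
	return 0;
}

At 10Gbit/s a byte takes 800ps, i.e. less than a nanosecond, which is exactly why the qdisc keeps this value in picoseconds.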
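The dequeue() hunks only show the guard comparison; the computation of 'guard' itself lies outside the visible context. Assuming, consistently with length_to_duration(), that guard is the current time plus the frame's transmit duration, the gate test reduces to this sketch (hypothetical helper name, userspace types):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static bool fits_in_gate(int64_t now_ns, int64_t close_time_ns,
			 int len_bytes, int64_t picos_per_byte)
{
	/* transmit duration in nanoseconds, as in length_to_duration() */
	int64_t duration_ns = (len_bytes * picos_per_byte) / 1000;
	int64_t guard = now_ns + duration_ns;

	/* the kernel skips the queue when ktime_after(guard, close_time) */
	return guard <= close_time_ns;
}

int main(void)
{
	/* a 1500-byte frame at 1Gbit/s (8000 ps/byte) needs 12us of wire time */
	printf("%d\n", fits_in_gate(0, 12000, 1500, 8000)); /* 1: just fits */
	printf("%d\n", fits_in_gate(0, 11999, 1500, 8000)); /* 0: gate closes too soon */
	return 0;
}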
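should_change_schedules() decides when the admin (pending) schedule takes over from the oper (running) one: either the next close_time already reaches the admin base_time, or stretching the running cycle by cycle_time_extension would reach it. The same decision in a userspace sketch (hypothetical helper name, nanosecond int64_t in place of ktime_t):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static bool should_switch(int64_t admin_base, int64_t close_time,
			  int64_t extension)
{
	/* simple case: close_time already falls at or after the new base */
	if (admin_base <= close_time)
		return true;

	/* extension case: extending the running cycle crosses the new base */
	if (admin_base <= close_time + extension)
		return true;

	return false;
}

int main(void)
{
	/* admin base at 10ms: a 9.5ms close_time with 1ms extension switches */
	printf("%d\n", should_switch(10000000, 9500000, 1000000)); /* 1 */
	printf("%d\n", should_switch(10000000, 8000000, 1000000)); /* 0 */
	return 0;
}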
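Lastly, when base_time already lies in the past, taprio_get_start_time() rounds it up to the beginning of the next full cycle: n full cycles have elapsed since base, so the schedule starts at base + (n + 1) * cycle. A userspace sketch of that rounding (hypothetical helper name):

#include <stdint.h>
#include <stdio.h>

static int64_t next_cycle_start(int64_t now, int64_t base, int64_t cycle)
{
	int64_t n;

	if (base > now)	/* base still in the future: start right there */
		return base;

	n = (now - base) / cycle;	/* full cycles elapsed since base */
	return base + (n + 1) * cycle;
}

int main(void)
{
	/* base = 0, cycle = 1ms, now = 2.5ms -> the schedule starts at 3ms */
	printf("%lld\n", (long long)next_cycle_start(2500000, 0, 1000000));
	return 0;
}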