Diffstat (limited to 'kernel/rcu/tree.c')
-rw-r--r--	kernel/rcu/tree.c	657
1 file changed, 382 insertions(+), 275 deletions(-)
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index cf34a961821a..8e880c09ab59 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -144,14 +144,16 @@ static int rcu_scheduler_fully_active __read_mostly;
 
 static void rcu_report_qs_rnp(unsigned long mask, struct rcu_node *rnp,
 			      unsigned long gps, unsigned long flags);
-static void rcu_init_new_rnp(struct rcu_node *rnp_leaf);
-static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf);
 static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu);
 static void invoke_rcu_core(void);
 static void rcu_report_exp_rdp(struct rcu_data *rdp);
 static void sync_sched_exp_online_cleanup(int cpu);
 static void check_cb_ovld_locked(struct rcu_data *rdp, struct rcu_node *rnp);
 static bool rcu_rdp_is_offloaded(struct rcu_data *rdp);
+static bool rcu_rdp_cpu_online(struct rcu_data *rdp);
+static bool rcu_init_invoked(void);
+static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf);
+static void rcu_init_new_rnp(struct rcu_node *rnp_leaf);
 
 /*
  * rcuc/rcub/rcuop kthread realtime priority. The "rcuop"
@@ -215,27 +217,6 @@ EXPORT_SYMBOL_GPL(rcu_get_gp_kthreads_prio);
 #define PER_RCU_NODE_PERIOD 3	/* Number of grace periods between delays for debugging. */
 
 /*
- * Compute the mask of online CPUs for the specified rcu_node structure.
- * This will not be stable unless the rcu_node structure's ->lock is
- * held, but the bit corresponding to the current CPU will be stable
- * in most contexts.
- */
-static unsigned long rcu_rnp_online_cpus(struct rcu_node *rnp)
-{
-	return READ_ONCE(rnp->qsmaskinitnext);
-}
-
-/*
- * Is the CPU corresponding to the specified rcu_data structure online
- * from RCU's perspective?  This perspective is given by that structure's
- * ->qsmaskinitnext field rather than by the global cpu_online_mask.
- */
-static bool rcu_rdp_cpu_online(struct rcu_data *rdp)
-{
-	return !!(rdp->grpmask & rcu_rnp_online_cpus(rdp->mynode));
-}
-
-/*
  * Return true if an RCU grace period is in progress.  The READ_ONCE()s
  * permit this function to be invoked without holding the root rcu_node
  * structure's ->lock, but of course results can be subject to change.
@@ -734,46 +715,6 @@ void rcu_request_urgent_qs_task(struct task_struct *t)
 	smp_store_release(per_cpu_ptr(&rcu_data.rcu_urgent_qs, cpu), true);
 }
 
-#if defined(CONFIG_PROVE_RCU) && defined(CONFIG_HOTPLUG_CPU)
-
-/*
- * Is the current CPU online as far as RCU is concerned?
- *
- * Disable preemption to avoid false positives that could otherwise
- * happen due to the current CPU number being sampled, this task being
- * preempted, its old CPU being taken offline, resuming on some other CPU,
- * then determining that its old CPU is now offline.
- *
- * Disable checking if in an NMI handler because we cannot safely
- * report errors from NMI handlers anyway.  In addition, it is OK to use
- * RCU on an offline processor during initial boot, hence the check for
- * rcu_scheduler_fully_active.
- */
-bool rcu_lockdep_current_cpu_online(void)
-{
-	struct rcu_data *rdp;
-	bool ret = false;
-
-	if (in_nmi() || !rcu_scheduler_fully_active)
-		return true;
-	preempt_disable_notrace();
-	rdp = this_cpu_ptr(&rcu_data);
-	/*
-	 * Strictly, we care here about the case where the current CPU is
-	 * in rcu_cpu_starting() and thus has an excuse for rdp->grpmask
-	 * not being up to date.  So arch_spin_is_locked() might have a
-	 * false positive if it's held by some *other* CPU, but that's
-	 * OK because that just means a false *negative* on the warning.
-	 */
-	if (rcu_rdp_cpu_online(rdp) || arch_spin_is_locked(&rcu_state.ofl_lock))
-		ret = true;
-	preempt_enable_notrace();
-	return ret;
-}
-EXPORT_SYMBOL_GPL(rcu_lockdep_current_cpu_online);
-
-#endif /* #if defined(CONFIG_PROVE_RCU) && defined(CONFIG_HOTPLUG_CPU) */
-
 /*
  * When trying to report a quiescent state on behalf of some other CPU,
  * it is our responsibility to check for and handle potential overflow
@@ -925,6 +866,24 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
 			rdp->rcu_iw_gp_seq = rnp->gp_seq;
 			irq_work_queue_on(&rdp->rcu_iw, rdp->cpu);
 		}
+
+		if (rcu_cpu_stall_cputime && rdp->snap_record.gp_seq != rdp->gp_seq) {
+			int cpu = rdp->cpu;
+			struct rcu_snap_record *rsrp;
+			struct kernel_cpustat *kcsp;
+
+			kcsp = &kcpustat_cpu(cpu);
+
+			rsrp = &rdp->snap_record;
+			rsrp->cputime_irq     = kcpustat_field(kcsp, CPUTIME_IRQ, cpu);
+			rsrp->cputime_softirq = kcpustat_field(kcsp, CPUTIME_SOFTIRQ, cpu);
+			rsrp->cputime_system  = kcpustat_field(kcsp, CPUTIME_SYSTEM, cpu);
+			rsrp->nr_hardirqs = kstat_cpu_irqs_sum(rdp->cpu);
+			rsrp->nr_softirqs = kstat_cpu_softirqs_sum(rdp->cpu);
+			rsrp->nr_csw = nr_context_switches_cpu(rdp->cpu);
+			rsrp->jiffies = jiffies;
+			rsrp->gp_seq = rdp->gp_seq;
+		}
 	}
 
 	return 0;
@@ -1350,13 +1309,6 @@ static void rcu_strict_gp_boundary(void *unused)
 	invoke_rcu_core();
 }
 
-// Has rcu_init() been invoked?  This is used (for example) to determine
-// whether spinlocks may be acquired safely.
-static bool rcu_init_invoked(void)
-{
-	return !!rcu_state.n_online_cpus;
-}
-
 // Make the polled API aware of the beginning of a grace period.
 static void rcu_poll_gp_seq_start(unsigned long *snap)
 {
@@ -2092,92 +2044,6 @@ rcu_check_quiescent_state(struct rcu_data *rdp)
 }
 
 /*
- * Near the end of the offline process.  Trace the fact that this CPU
- * is going offline.
- */
-int rcutree_dying_cpu(unsigned int cpu)
-{
-	bool blkd;
-	struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
-	struct rcu_node *rnp = rdp->mynode;
-
-	if (!IS_ENABLED(CONFIG_HOTPLUG_CPU))
-		return 0;
-
-	blkd = !!(READ_ONCE(rnp->qsmask) & rdp->grpmask);
-	trace_rcu_grace_period(rcu_state.name, READ_ONCE(rnp->gp_seq),
-			       blkd ? TPS("cpuofl-bgp") : TPS("cpuofl"));
-	return 0;
-}
-
-/*
- * All CPUs for the specified rcu_node structure have gone offline,
- * and all tasks that were preempted within an RCU read-side critical
- * section while running on one of those CPUs have since exited their RCU
- * read-side critical section.  Some other CPU is reporting this fact with
- * the specified rcu_node structure's ->lock held and interrupts disabled.
- * This function therefore goes up the tree of rcu_node structures,
- * clearing the corresponding bits in the ->qsmaskinit fields.  Note that
- * the leaf rcu_node structure's ->qsmaskinit field has already been
- * updated.
- *
- * This function does check that the specified rcu_node structure has
- * all CPUs offline and no blocked tasks, so it is OK to invoke it
- * prematurely.  That said, invoking it after the fact will cost you
- * a needless lock acquisition.  So once it has done its work, don't
- * invoke it again.
- */
-static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf)
-{
-	long mask;
-	struct rcu_node *rnp = rnp_leaf;
-
-	raw_lockdep_assert_held_rcu_node(rnp_leaf);
-	if (!IS_ENABLED(CONFIG_HOTPLUG_CPU) ||
-	    WARN_ON_ONCE(rnp_leaf->qsmaskinit) ||
-	    WARN_ON_ONCE(rcu_preempt_has_tasks(rnp_leaf)))
-		return;
-	for (;;) {
-		mask = rnp->grpmask;
-		rnp = rnp->parent;
-		if (!rnp)
-			break;
-		raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */
-		rnp->qsmaskinit &= ~mask;
-		/* Between grace periods, so better already be zero! */
-		WARN_ON_ONCE(rnp->qsmask);
-		if (rnp->qsmaskinit) {
-			raw_spin_unlock_rcu_node(rnp);
-			/* irqs remain disabled. */
-			return;
-		}
-		raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled. */
-	}
-}
-
-/*
- * The CPU has been completely removed, and some other CPU is reporting
- * this fact from process context.  Do the remainder of the cleanup.
- * There can only be one CPU hotplug operation at a time, so no need for
- * explicit locking.
- */
-int rcutree_dead_cpu(unsigned int cpu)
-{
-	struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
-	struct rcu_node *rnp = rdp->mynode;  /* Outgoing CPU's rdp & rnp. */
-
-	if (!IS_ENABLED(CONFIG_HOTPLUG_CPU))
-		return 0;
-
-	WRITE_ONCE(rcu_state.n_online_cpus, rcu_state.n_online_cpus - 1);
-	/* Adjust any no-longer-needed kthreads. */
-	rcu_boost_kthread_setaffinity(rnp, -1);
-	// Stop-machine done, so allow nohz_full to disable tick.
-	tick_dep_clear(TICK_DEP_BIT_RCU);
-	return 0;
-}
-
-/*
  * Invoke any RCU callbacks that have made it to the end of their grace
  * period.  Throttle as specified by rdp->blimit.
  */
@@ -2209,7 +2075,7 @@ static void rcu_do_batch(struct rcu_data *rdp)
 	 */
 	rcu_nocb_lock_irqsave(rdp, flags);
 	WARN_ON_ONCE(cpu_is_offline(smp_processor_id()));
-	pending = rcu_segcblist_n_cbs(&rdp->cblist);
+	pending = rcu_segcblist_get_seglen(&rdp->cblist, RCU_DONE_TAIL);
 	div = READ_ONCE(rcu_divisor);
 	div = div < 0 ? 7 : div > sizeof(long) * 8 - 2 ? sizeof(long) * 8 - 2 : div;
 	bl = max(rdp->blimit, pending >> div);
@@ -2727,10 +2593,11 @@ static void check_cb_ovld(struct rcu_data *rdp)
 }
 
 static void
-__call_rcu_common(struct rcu_head *head, rcu_callback_t func, bool lazy)
+__call_rcu_common(struct rcu_head *head, rcu_callback_t func, bool lazy_in)
 {
 	static atomic_t doublefrees;
 	unsigned long flags;
+	bool lazy;
 	struct rcu_data *rdp;
 	bool was_alldone;
 
@@ -2755,6 +2622,7 @@ __call_rcu_common(struct rcu_head *head, rcu_callback_t func, bool lazy)
 	kasan_record_aux_stack_noalloc(head);
 	local_irq_save(flags);
 	rdp = this_cpu_ptr(&rcu_data);
+	lazy = lazy_in && !rcu_async_should_hurry();
 
 	/* Add the callback to our list. */
 	if (unlikely(!rcu_segcblist_is_enabled(&rdp->cblist))) {
@@ -2876,13 +2744,15 @@ EXPORT_SYMBOL_GPL(call_rcu);
 
 /**
  * struct kvfree_rcu_bulk_data - single block to store kvfree_rcu() pointers
+ * @list: List node. All blocks are linked between each other
+ * @gp_snap: Snapshot of RCU state for objects placed to this bulk
  * @nr_records: Number of active pointers in the array
- * @next: Next bulk object in the block chain
 * @records: Array of the kvfree_rcu() pointers
 */
 struct kvfree_rcu_bulk_data {
+	struct list_head list;
+	unsigned long gp_snap;
 	unsigned long nr_records;
-	struct kvfree_rcu_bulk_data *next;
 	void *records[];
 };
 
@@ -2898,26 +2768,28 @@ struct kvfree_rcu_bulk_data {
 * struct kfree_rcu_cpu_work - single batch of kfree_rcu() requests
 * @rcu_work: Let queue_rcu_work() invoke workqueue handler after grace period
 * @head_free: List of kfree_rcu() objects waiting for a grace period
- * @bkvhead_free: Bulk-List of kvfree_rcu() objects waiting for a grace period
+ * @bulk_head_free: Bulk-List of kvfree_rcu() objects waiting for a grace period
 * @krcp: Pointer to @kfree_rcu_cpu structure
 */
 struct kfree_rcu_cpu_work {
 	struct rcu_work rcu_work;
 	struct rcu_head *head_free;
-	struct kvfree_rcu_bulk_data *bkvhead_free[FREE_N_CHANNELS];
+	struct list_head bulk_head_free[FREE_N_CHANNELS];
 	struct kfree_rcu_cpu *krcp;
 };
 
 /**
 * struct kfree_rcu_cpu - batch up kfree_rcu() requests for RCU grace period
 * @head: List of kfree_rcu() objects not yet waiting for a grace period
- * @bkvhead: Bulk-List of kvfree_rcu() objects not yet waiting for a grace period
+ * @head_gp_snap: Snapshot of RCU state for objects placed to "@head"
+ * @bulk_head: Bulk-List of kvfree_rcu() objects not yet waiting for a grace period
 * @krw_arr: Array of batches of kfree_rcu() objects waiting for a grace period
 * @lock: Synchronize access to this structure
 * @monitor_work: Promote @head to @head_free after KFREE_DRAIN_JIFFIES
 * @initialized: The @rcu_work fields have been initialized
- * @count: Number of objects for which GP not started
+ * @head_count: Number of objects in rcu_head singular list
+ * @bulk_count: Number of objects in bulk-list
 * @bkvcache:
 *	A simple cache list that contains objects for reuse purpose.
 *	In order to save some per-cpu space the list is singular.
@@ -2935,13 +2807,20 @@ struct kfree_rcu_cpu_work {
 * the interactions with the slab allocators.
 */
 struct kfree_rcu_cpu {
+	// Objects queued on a linked list
+	// through their rcu_head structures.
 	struct rcu_head *head;
-	struct kvfree_rcu_bulk_data *bkvhead[FREE_N_CHANNELS];
+	unsigned long head_gp_snap;
+	atomic_t head_count;
+
+	// Objects queued on a bulk-list.
+	struct list_head bulk_head[FREE_N_CHANNELS];
+	atomic_t bulk_count[FREE_N_CHANNELS];
+
 	struct kfree_rcu_cpu_work krw_arr[KFREE_N_BATCHES];
 	raw_spinlock_t lock;
 	struct delayed_work monitor_work;
 	bool initialized;
-	int count;
 
 	struct delayed_work page_cache_work;
 	atomic_t backoff_page_cache_fill;
@@ -3029,29 +2908,87 @@ drain_page_cache(struct kfree_rcu_cpu *krcp)
 	return freed;
 }
 
+static void
+kvfree_rcu_bulk(struct kfree_rcu_cpu *krcp,
+	struct kvfree_rcu_bulk_data *bnode, int idx)
+{
+	unsigned long flags;
+	int i;
+
+	debug_rcu_bhead_unqueue(bnode);
+
+	rcu_lock_acquire(&rcu_callback_map);
+	if (idx == 0) { // kmalloc() / kfree().
+		trace_rcu_invoke_kfree_bulk_callback(
+			rcu_state.name, bnode->nr_records,
+			bnode->records);
+
+		kfree_bulk(bnode->nr_records, bnode->records);
+	} else { // vmalloc() / vfree().
+		for (i = 0; i < bnode->nr_records; i++) {
+			trace_rcu_invoke_kvfree_callback(
+				rcu_state.name, bnode->records[i], 0);
+
+			vfree(bnode->records[i]);
+		}
+	}
+	rcu_lock_release(&rcu_callback_map);
+
+	raw_spin_lock_irqsave(&krcp->lock, flags);
+	if (put_cached_bnode(krcp, bnode))
+		bnode = NULL;
+	raw_spin_unlock_irqrestore(&krcp->lock, flags);
+
+	if (bnode)
+		free_page((unsigned long) bnode);
+
+	cond_resched_tasks_rcu_qs();
+}
+
+static void
+kvfree_rcu_list(struct rcu_head *head)
+{
+	struct rcu_head *next;
+
+	for (; head; head = next) {
+		void *ptr = (void *) head->func;
+		unsigned long offset = (void *) head - ptr;
+
+		next = head->next;
+		debug_rcu_head_unqueue((struct rcu_head *)ptr);
+		rcu_lock_acquire(&rcu_callback_map);
+		trace_rcu_invoke_kvfree_callback(rcu_state.name, head, offset);
+
+		if (!WARN_ON_ONCE(!__is_kvfree_rcu_offset(offset)))
+			kvfree(ptr);
+
+		rcu_lock_release(&rcu_callback_map);
+		cond_resched_tasks_rcu_qs();
+	}
+}
+
 /*
  * This function is invoked in workqueue context after a grace period.
- * It frees all the objects queued on ->bkvhead_free or ->head_free.
+ * It frees all the objects queued on ->bulk_head_free or ->head_free.
  */
 static void kfree_rcu_work(struct work_struct *work)
 {
 	unsigned long flags;
-	struct kvfree_rcu_bulk_data *bkvhead[FREE_N_CHANNELS], *bnext;
-	struct rcu_head *head, *next;
+	struct kvfree_rcu_bulk_data *bnode, *n;
+	struct list_head bulk_head[FREE_N_CHANNELS];
+	struct rcu_head *head;
 	struct kfree_rcu_cpu *krcp;
 	struct kfree_rcu_cpu_work *krwp;
-	int i, j;
+	int i;
 
 	krwp = container_of(to_rcu_work(work),
-			    struct kfree_rcu_cpu_work, rcu_work);
+		struct kfree_rcu_cpu_work, rcu_work);
 	krcp = krwp->krcp;
 
 	raw_spin_lock_irqsave(&krcp->lock, flags);
 	// Channels 1 and 2.
-	for (i = 0; i < FREE_N_CHANNELS; i++) {
-		bkvhead[i] = krwp->bkvhead_free[i];
-		krwp->bkvhead_free[i] = NULL;
-	}
+	for (i = 0; i < FREE_N_CHANNELS; i++)
+		list_replace_init(&krwp->bulk_head_free[i], &bulk_head[i]);
 
 	// Channel 3.
 	head = krwp->head_free;
@@ -3060,39 +2997,9 @@ static void kfree_rcu_work(struct work_struct *work)
 
 	// Handle the first two channels.
 	for (i = 0; i < FREE_N_CHANNELS; i++) {
-		for (; bkvhead[i]; bkvhead[i] = bnext) {
-			bnext = bkvhead[i]->next;
-			debug_rcu_bhead_unqueue(bkvhead[i]);
-
-			rcu_lock_acquire(&rcu_callback_map);
-			if (i == 0) { // kmalloc() / kfree().
-				trace_rcu_invoke_kfree_bulk_callback(
-					rcu_state.name, bkvhead[i]->nr_records,
-					bkvhead[i]->records);
-
-				kfree_bulk(bkvhead[i]->nr_records,
-					bkvhead[i]->records);
-			} else { // vmalloc() / vfree().
-				for (j = 0; j < bkvhead[i]->nr_records; j++) {
-					trace_rcu_invoke_kvfree_callback(
-						rcu_state.name,
-						bkvhead[i]->records[j], 0);
-
-					vfree(bkvhead[i]->records[j]);
-				}
-			}
-			rcu_lock_release(&rcu_callback_map);
-
-			raw_spin_lock_irqsave(&krcp->lock, flags);
-			if (put_cached_bnode(krcp, bkvhead[i]))
-				bkvhead[i] = NULL;
-			raw_spin_unlock_irqrestore(&krcp->lock, flags);
-
-			if (bkvhead[i])
-				free_page((unsigned long) bkvhead[i]);
-
-			cond_resched_tasks_rcu_qs();
-		}
+		// Start from the tail page, so a GP is likely passed for it.
+		list_for_each_entry_safe(bnode, n, &bulk_head[i], list)
+			kvfree_rcu_bulk(krcp, bnode, i);
 	}
 
 	/*
@@ -3102,21 +3009,7 @@
 	 * queued on a linked list through their rcu_head structures.
 	 * This list is named "Channel 3".
 	 */
-	for (; head; head = next) {
-		unsigned long offset = (unsigned long)head->func;
-		void *ptr = (void *)head - offset;
-
-		next = head->next;
-		debug_rcu_head_unqueue((struct rcu_head *)ptr);
-		rcu_lock_acquire(&rcu_callback_map);
-		trace_rcu_invoke_kvfree_callback(rcu_state.name, head, offset);
-
-		if (!WARN_ON_ONCE(!__is_kvfree_rcu_offset(offset)))
-			kvfree(ptr);
-
-		rcu_lock_release(&rcu_callback_map);
-		cond_resched_tasks_rcu_qs();
-	}
+	kvfree_rcu_list(head);
 }
 
 static bool
@@ -3125,10 +3018,21 @@ need_offload_krc(struct kfree_rcu_cpu *krcp)
 	int i;
 
 	for (i = 0; i < FREE_N_CHANNELS; i++)
-		if (krcp->bkvhead[i])
+		if (!list_empty(&krcp->bulk_head[i]))
 			return true;
 
-	return !!krcp->head;
+	return !!READ_ONCE(krcp->head);
+}
+
+static int krc_count(struct kfree_rcu_cpu *krcp)
+{
+	int sum = atomic_read(&krcp->head_count);
+	int i;
+
+	for (i = 0; i < FREE_N_CHANNELS; i++)
+		sum += atomic_read(&krcp->bulk_count[i]);
+
+	return sum;
 }
 
 static void
@@ -3136,7 +3040,7 @@ schedule_delayed_monitor_work(struct kfree_rcu_cpu *krcp)
 {
 	long delay, delay_left;
 
-	delay = READ_ONCE(krcp->count) >= KVFREE_BULK_MAX_ENTR ? 1:KFREE_DRAIN_JIFFIES;
+	delay = krc_count(krcp) >= KVFREE_BULK_MAX_ENTR ? 1:KFREE_DRAIN_JIFFIES;
 	if (delayed_work_pending(&krcp->monitor_work)) {
 		delay_left = krcp->monitor_work.timer.expires - jiffies;
 		if (delay < delay_left)
@@ -3146,6 +3050,44 @@ schedule_delayed_monitor_work(struct kfree_rcu_cpu *krcp)
 	queue_delayed_work(system_wq, &krcp->monitor_work, delay);
 }
 
+static void
+kvfree_rcu_drain_ready(struct kfree_rcu_cpu *krcp)
+{
+	struct list_head bulk_ready[FREE_N_CHANNELS];
+	struct kvfree_rcu_bulk_data *bnode, *n;
+	struct rcu_head *head_ready = NULL;
+	unsigned long flags;
+	int i;
+
+	raw_spin_lock_irqsave(&krcp->lock, flags);
+	for (i = 0; i < FREE_N_CHANNELS; i++) {
+		INIT_LIST_HEAD(&bulk_ready[i]);
+
+		list_for_each_entry_safe_reverse(bnode, n, &krcp->bulk_head[i], list) {
+			if (!poll_state_synchronize_rcu(bnode->gp_snap))
+				break;
+
+			atomic_sub(bnode->nr_records, &krcp->bulk_count[i]);
+			list_move(&bnode->list, &bulk_ready[i]);
+		}
+	}
+
+	if (krcp->head && poll_state_synchronize_rcu(krcp->head_gp_snap)) {
+		head_ready = krcp->head;
+		atomic_set(&krcp->head_count, 0);
+		WRITE_ONCE(krcp->head, NULL);
+	}
+	raw_spin_unlock_irqrestore(&krcp->lock, flags);
+
+	for (i = 0; i < FREE_N_CHANNELS; i++) {
+		list_for_each_entry_safe(bnode, n, &bulk_ready[i], list)
+			kvfree_rcu_bulk(krcp, bnode, i);
+	}
+
+	if (head_ready)
+		kvfree_rcu_list(head_ready);
+}
+
 /*
  * This function is invoked after the KFREE_DRAIN_JIFFIES timeout.
  */
@@ -3156,26 +3098,31 @@ static void kfree_rcu_monitor(struct work_struct *work)
 	unsigned long flags;
 	int i, j;
 
+	// Drain ready for reclaim.
+	kvfree_rcu_drain_ready(krcp);
+
 	raw_spin_lock_irqsave(&krcp->lock, flags);
 
 	// Attempt to start a new batch.
 	for (i = 0; i < KFREE_N_BATCHES; i++) {
 		struct kfree_rcu_cpu_work *krwp = &(krcp->krw_arr[i]);
 
-		// Try to detach bkvhead or head and attach it over any
+		// Try to detach bulk_head or head and attach it over any
 		// available corresponding free channel. It can be that
 		// a previous RCU batch is in progress, it means that
 		// immediately to queue another one is not possible so
 		// in that case the monitor work is rearmed.
-		if ((krcp->bkvhead[0] && !krwp->bkvhead_free[0]) ||
-			(krcp->bkvhead[1] && !krwp->bkvhead_free[1]) ||
-				(krcp->head && !krwp->head_free)) {
+		if ((!list_empty(&krcp->bulk_head[0]) && list_empty(&krwp->bulk_head_free[0])) ||
+			(!list_empty(&krcp->bulk_head[1]) && list_empty(&krwp->bulk_head_free[1])) ||
+				(READ_ONCE(krcp->head) && !krwp->head_free)) {
+
 			// Channel 1 corresponds to the SLAB-pointer bulk path.
 			// Channel 2 corresponds to vmalloc-pointer bulk path.
 			for (j = 0; j < FREE_N_CHANNELS; j++) {
-				if (!krwp->bkvhead_free[j]) {
-					krwp->bkvhead_free[j] = krcp->bkvhead[j];
-					krcp->bkvhead[j] = NULL;
+				if (list_empty(&krwp->bulk_head_free[j])) {
+					atomic_set(&krcp->bulk_count[j], 0);
+					list_replace_init(&krcp->bulk_head[j],
+						&krwp->bulk_head_free[j]);
 				}
 			}
 
@@ -3183,11 +3130,10 @@ static void kfree_rcu_monitor(struct work_struct *work)
 			// objects queued on the linked list.
 			if (!krwp->head_free) {
 				krwp->head_free = krcp->head;
-				krcp->head = NULL;
+				atomic_set(&krcp->head_count, 0);
+				WRITE_ONCE(krcp->head, NULL);
 			}
 
-			WRITE_ONCE(krcp->count, 0);
-
 			// One work is per one batch, so there are three
 			// "free channels", the batch can handle. It can
 			// be that the work is in the pending state when
@@ -3197,6 +3143,8 @@ static void kfree_rcu_monitor(struct work_struct *work)
 		}
 	}
 
+	raw_spin_unlock_irqrestore(&krcp->lock, flags);
+
 	// If there is nothing to detach, it means that our job is
 	// successfully done here. In case of having at least one
 	// of the channels that is still busy we should rearm the
@@ -3204,8 +3152,6 @@ static void kfree_rcu_monitor(struct work_struct *work)
 	// still in progress.
 	if (need_offload_krc(krcp))
 		schedule_delayed_monitor_work(krcp);
-
-	raw_spin_unlock_irqrestore(&krcp->lock, flags);
 }
 
 static enum hrtimer_restart
@@ -3288,10 +3234,11 @@ add_ptr_to_bulk_krc_lock(struct kfree_rcu_cpu **krcp,
 		return false;
 
 	idx = !!is_vmalloc_addr(ptr);
+	bnode = list_first_entry_or_null(&(*krcp)->bulk_head[idx],
+		struct kvfree_rcu_bulk_data, list);
 
 	/* Check if a new block is required. */
-	if (!(*krcp)->bkvhead[idx] ||
-			(*krcp)->bkvhead[idx]->nr_records == KVFREE_BULK_MAX_ENTR) {
+	if (!bnode || bnode->nr_records == KVFREE_BULK_MAX_ENTR) {
 		bnode = get_cached_bnode(*krcp);
 		if (!bnode && can_alloc) {
 			krc_this_cpu_unlock(*krcp, *flags);
@@ -3315,17 +3262,15 @@ add_ptr_to_bulk_krc_lock(struct kfree_rcu_cpu **krcp,
 		if (!bnode)
 			return false;
 
-		/* Initialize the new block. */
+		// Initialize the new block and attach it.
 		bnode->nr_records = 0;
-		bnode->next = (*krcp)->bkvhead[idx];
-
-		/* Attach it to the head. */
-		(*krcp)->bkvhead[idx] = bnode;
+		list_add(&bnode->list, &(*krcp)->bulk_head[idx]);
 	}
 
-	/* Finally insert. */
-	(*krcp)->bkvhead[idx]->records
-		[(*krcp)->bkvhead[idx]->nr_records++] = ptr;
+	// Finally insert and update the GP for this page.
+	bnode->records[bnode->nr_records++] = ptr;
+	bnode->gp_snap = get_state_synchronize_rcu();
+	atomic_inc(&(*krcp)->bulk_count[idx]);
 
 	return true;
 }
@@ -3342,26 +3287,21 @@ add_ptr_to_bulk_krc_lock(struct kfree_rcu_cpu **krcp,
 * be free'd in workqueue context. This allows us to: batch requests together to
 * reduce the number of grace periods during heavy kfree_rcu()/kvfree_rcu() load.
 */
-void kvfree_call_rcu(struct rcu_head *head, rcu_callback_t func)
+void kvfree_call_rcu(struct rcu_head *head, void *ptr)
 {
 	unsigned long flags;
 	struct kfree_rcu_cpu *krcp;
 	bool success;
-	void *ptr;
 
-	if (head) {
-		ptr = (void *) head - (unsigned long) func;
-	} else {
-		/*
-		 * Please note there is a limitation for the head-less
-		 * variant, that is why there is a clear rule for such
-		 * objects: it can be used from might_sleep() context
-		 * only. For other places please embed an rcu_head to
-		 * your data.
-		 */
+	/*
+	 * Please note there is a limitation for the head-less
+	 * variant, that is why there is a clear rule for such
+	 * objects: it can be used from might_sleep() context
+	 * only. For other places please embed an rcu_head to
+	 * your data.
+	 */
+	if (!head)
 		might_sleep();
-		ptr = (unsigned long *) func;
-	}
 
 	// Queue the object but don't yet schedule the batch.
 	if (debug_rcu_head_queue(ptr)) {
@@ -3382,14 +3322,16 @@ void kvfree_call_rcu(struct rcu_head *head, rcu_callback_t func)
 			// Inline if kvfree_rcu(one_arg) call.
 			goto unlock_return;
 
-		head->func = func;
+		head->func = ptr;
 		head->next = krcp->head;
-		krcp->head = head;
+		WRITE_ONCE(krcp->head, head);
+		atomic_inc(&krcp->head_count);
+
+		// Take a snapshot for this krcp.
+		krcp->head_gp_snap = get_state_synchronize_rcu();
 		success = true;
 	}
 
-	WRITE_ONCE(krcp->count, krcp->count + 1);
-
 	// Set timer to drain after KFREE_DRAIN_JIFFIES.
 	if (rcu_scheduler_active == RCU_SCHEDULER_RUNNING)
 		schedule_delayed_monitor_work(krcp);
@@ -3420,7 +3362,7 @@ kfree_rcu_shrink_count(struct shrinker *shrink, struct shrink_control *sc)
 	for_each_possible_cpu(cpu) {
 		struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu);
 
-		count += READ_ONCE(krcp->count);
+		count += krc_count(krcp);
 		count += READ_ONCE(krcp->nr_bkv_objs);
 		atomic_set(&krcp->backoff_page_cache_fill, 1);
 	}
@@ -3437,7 +3379,7 @@ kfree_rcu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
 		int count;
 		struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu);
 
-		count = krcp->count;
+		count = krc_count(krcp);
 		count += drain_page_cache(krcp);
 		kfree_rcu_monitor(&krcp->monitor_work.work);
 
@@ -3461,15 +3403,12 @@ static struct shrinker kfree_rcu_shrinker = {
 void __init kfree_rcu_scheduler_running(void)
 {
 	int cpu;
-	unsigned long flags;
 
 	for_each_possible_cpu(cpu) {
 		struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu);
 
-		raw_spin_lock_irqsave(&krcp->lock, flags);
 		if (need_offload_krc(krcp))
 			schedule_delayed_monitor_work(krcp);
-		raw_spin_unlock_irqrestore(&krcp->lock, flags);
 	}
 }
 
@@ -3485,9 +3424,10 @@ void __init kfree_rcu_scheduler_running(void)
 */
 static int rcu_blocking_is_gp(void)
 {
-	if (rcu_scheduler_active != RCU_SCHEDULER_INACTIVE)
+	if (rcu_scheduler_active != RCU_SCHEDULER_INACTIVE) {
+		might_sleep();
 		return false;
-	might_sleep();  /* Check for RCU read-side critical section. */
+	}
 	return true;
 }
 
@@ -3711,7 +3651,9 @@ EXPORT_SYMBOL_GPL(start_poll_synchronize_rcu_full);
 * If @false is returned, it is the caller's responsibility to invoke this
 * function later on until it does return @true.  Alternatively, the caller
 * can explicitly wait for a grace period, for example, by passing @oldstate
- * to cond_synchronize_rcu() or by directly invoking synchronize_rcu().
+ * to either cond_synchronize_rcu() or cond_synchronize_rcu_expedited()
+ * on the one hand or by directly invoking either synchronize_rcu() or
+ * synchronize_rcu_expedited() on the other.
 *
 * Yes, this function does not take counter wrap into account.
 * But counter wrap is harmless.  If the counter wraps, we have waited for
@@ -3722,6 +3664,12 @@ EXPORT_SYMBOL_GPL(start_poll_synchronize_rcu_full);
 * completed.  Alternatively, they can use get_completed_synchronize_rcu()
 * to get a guaranteed-completed grace-period state.
 *
+ * In addition, because oldstate compresses the grace-period state for
+ * both normal and expedited grace periods into a single unsigned long,
+ * it can miss a grace period when synchronize_rcu() runs concurrently
+ * with synchronize_rcu_expedited().  If this is unacceptable, please
+ * instead use the _full() variant of these polling APIs.
+ *
 * This function provides the same memory-ordering guarantees that
 * would be provided by a synchronize_rcu() that was invoked at the call
 * to the function that provided @oldstate, and that returned at the end
@@ -4080,6 +4028,155 @@ retry:
 EXPORT_SYMBOL_GPL(rcu_barrier);
 
 /*
+ * Compute the mask of online CPUs for the specified rcu_node structure.
+ * This will not be stable unless the rcu_node structure's ->lock is
+ * held, but the bit corresponding to the current CPU will be stable
+ * in most contexts.
+ */
+static unsigned long rcu_rnp_online_cpus(struct rcu_node *rnp)
+{
+	return READ_ONCE(rnp->qsmaskinitnext);
+}
+
+/*
+ * Is the CPU corresponding to the specified rcu_data structure online
+ * from RCU's perspective?  This perspective is given by that structure's
+ * ->qsmaskinitnext field rather than by the global cpu_online_mask.
+ */
+static bool rcu_rdp_cpu_online(struct rcu_data *rdp)
+{
+	return !!(rdp->grpmask & rcu_rnp_online_cpus(rdp->mynode));
+}
+
+#if defined(CONFIG_PROVE_RCU) && defined(CONFIG_HOTPLUG_CPU)
+
+/*
+ * Is the current CPU online as far as RCU is concerned?
+ *
+ * Disable preemption to avoid false positives that could otherwise
+ * happen due to the current CPU number being sampled, this task being
+ * preempted, its old CPU being taken offline, resuming on some other CPU,
+ * then determining that its old CPU is now offline.
+ *
+ * Disable checking if in an NMI handler because we cannot safely
+ * report errors from NMI handlers anyway.  In addition, it is OK to use
+ * RCU on an offline processor during initial boot, hence the check for
+ * rcu_scheduler_fully_active.
+ */
+bool rcu_lockdep_current_cpu_online(void)
+{
+	struct rcu_data *rdp;
+	bool ret = false;
+
+	if (in_nmi() || !rcu_scheduler_fully_active)
+		return true;
+	preempt_disable_notrace();
+	rdp = this_cpu_ptr(&rcu_data);
+	/*
+	 * Strictly, we care here about the case where the current CPU is
+	 * in rcu_cpu_starting() and thus has an excuse for rdp->grpmask
+	 * not being up to date.  So arch_spin_is_locked() might have a
+	 * false positive if it's held by some *other* CPU, but that's
+	 * OK because that just means a false *negative* on the warning.
+	 */
+	if (rcu_rdp_cpu_online(rdp) || arch_spin_is_locked(&rcu_state.ofl_lock))
+		ret = true;
+	preempt_enable_notrace();
+	return ret;
+}
+EXPORT_SYMBOL_GPL(rcu_lockdep_current_cpu_online);
+
+#endif /* #if defined(CONFIG_PROVE_RCU) && defined(CONFIG_HOTPLUG_CPU) */
+
+// Has rcu_init() been invoked?  This is used (for example) to determine
+// whether spinlocks may be acquired safely.
+static bool rcu_init_invoked(void)
+{
+	return !!rcu_state.n_online_cpus;
+}
+
+/*
+ * Near the end of the offline process.  Trace the fact that this CPU
+ * is going offline.
+ */
+int rcutree_dying_cpu(unsigned int cpu)
+{
+	bool blkd;
+	struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
+	struct rcu_node *rnp = rdp->mynode;
+
+	if (!IS_ENABLED(CONFIG_HOTPLUG_CPU))
+		return 0;
+
+	blkd = !!(READ_ONCE(rnp->qsmask) & rdp->grpmask);
+	trace_rcu_grace_period(rcu_state.name, READ_ONCE(rnp->gp_seq),
+			       blkd ? TPS("cpuofl-bgp") : TPS("cpuofl"));
+	return 0;
+}
+
+/*
+ * All CPUs for the specified rcu_node structure have gone offline,
+ * and all tasks that were preempted within an RCU read-side critical
+ * section while running on one of those CPUs have since exited their RCU
+ * read-side critical section.  Some other CPU is reporting this fact with
+ * the specified rcu_node structure's ->lock held and interrupts disabled.
+ * This function therefore goes up the tree of rcu_node structures,
+ * clearing the corresponding bits in the ->qsmaskinit fields.  Note that
+ * the leaf rcu_node structure's ->qsmaskinit field has already been
+ * updated.
+ *
+ * This function does check that the specified rcu_node structure has
+ * all CPUs offline and no blocked tasks, so it is OK to invoke it
+ * prematurely.  That said, invoking it after the fact will cost you
+ * a needless lock acquisition.  So once it has done its work, don't
+ * invoke it again.
+ */
+static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf)
+{
+	long mask;
+	struct rcu_node *rnp = rnp_leaf;
+
+	raw_lockdep_assert_held_rcu_node(rnp_leaf);
+	if (!IS_ENABLED(CONFIG_HOTPLUG_CPU) ||
+	    WARN_ON_ONCE(rnp_leaf->qsmaskinit) ||
+	    WARN_ON_ONCE(rcu_preempt_has_tasks(rnp_leaf)))
+		return;
+	for (;;) {
+		mask = rnp->grpmask;
+		rnp = rnp->parent;
+		if (!rnp)
+			break;
+		raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */
+		rnp->qsmaskinit &= ~mask;
+		/* Between grace periods, so better already be zero! */
+		WARN_ON_ONCE(rnp->qsmask);
+		if (rnp->qsmaskinit) {
+			raw_spin_unlock_rcu_node(rnp);
+			/* irqs remain disabled. */
+			return;
+		}
+		raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled. */
+	}
+}
+
+/*
+ * The CPU has been completely removed, and some other CPU is reporting
+ * this fact from process context.  Do the remainder of the cleanup.
+ * There can only be one CPU hotplug operation at a time, so no need for
+ * explicit locking.
+ */
+int rcutree_dead_cpu(unsigned int cpu)
+{
+	if (!IS_ENABLED(CONFIG_HOTPLUG_CPU))
+		return 0;
+
+	WRITE_ONCE(rcu_state.n_online_cpus, rcu_state.n_online_cpus - 1);
+	// Stop-machine done, so allow nohz_full to disable tick.
+	tick_dep_clear(TICK_DEP_BIT_RCU);
+	return 0;
+}
+
+/*
  * Propagate ->qsinitmask bits up the rcu_node tree to account for the
  * first CPU in a given leaf rcu_node structure coming online.  The caller
  * must hold the corresponding leaf rcu_node ->lock with interrupts
@@ -4408,11 +4505,13 @@ static int rcu_pm_notify(struct notifier_block *self,
 	switch (action) {
 	case PM_HIBERNATION_PREPARE:
 	case PM_SUSPEND_PREPARE:
+		rcu_async_hurry();
 		rcu_expedite_gp();
 		break;
 	case PM_POST_HIBERNATION:
 	case PM_POST_SUSPEND:
 		rcu_unexpedite_gp();
+		rcu_async_relax();
 		break;
 	default:
 		break;
@@ -4766,7 +4865,7 @@ struct workqueue_struct *rcu_gp_wq;
 static void __init kfree_rcu_batch_init(void)
 {
 	int cpu;
-	int i;
+	int i, j;
 
 	/* Clamp it to [0:100] seconds interval. */
 	if (rcu_delay_page_cache_fill_msec < 0 ||
@@ -4786,8 +4885,14 @@ static void __init kfree_rcu_batch_init(void)
 		for (i = 0; i < KFREE_N_BATCHES; i++) {
 			INIT_RCU_WORK(&krcp->krw_arr[i].rcu_work, kfree_rcu_work);
 			krcp->krw_arr[i].krcp = krcp;
+
+			for (j = 0; j < FREE_N_CHANNELS; j++)
+				INIT_LIST_HEAD(&krcp->krw_arr[i].bulk_head_free[j]);
 		}
 
+		for (i = 0; i < FREE_N_CHANNELS; i++)
+			INIT_LIST_HEAD(&krcp->bulk_head[i]);
+
 		INIT_DELAYED_WORK(&krcp->monitor_work, kfree_rcu_monitor);
 		INIT_DELAYED_WORK(&krcp->page_cache_work, fill_page_cache_func);
 		krcp->initialized = true;
@@ -4838,6 +4943,8 @@ void __init rcu_init(void)
 	// Kick-start any polled grace periods that started early.
 	if (!(per_cpu_ptr(&rcu_data, cpu)->mynode->exp_seq_poll_rq & 0x1))
 		(void)start_poll_synchronize_rcu_expedited();
+
+	rcu_test_sync_prims();
 }
 
 #include "tree_stall.h"
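A note on the reclaim logic above: each bulk page now records a grace-period cookie from get_state_synchronize_rcu() in ->gp_snap, and kvfree_rcu_drain_ready() frees a page only once poll_state_synchronize_rcu() reports that grace period complete, so already-expired pages are reclaimed without queuing new callbacks. Below is a minimal sketch of the same cookie pattern outside tree.c; the deferred_obj type and its helpers are invented for illustration, not part of the patch.

#include <linux/rcupdate.h>
#include <linux/slab.h>

/* Illustrative only: a deferred-free slot driven by the polled GP API. */
struct deferred_obj {
	unsigned long gp_snap;	/* cookie from get_state_synchronize_rcu() */
	void *ptr;
};

static void deferred_obj_stash(struct deferred_obj *d, void *ptr)
{
	d->ptr = ptr;
	/* Snapshot the grace-period state; this does not start a GP itself. */
	d->gp_snap = get_state_synchronize_rcu();
}

static bool deferred_obj_try_free(struct deferred_obj *d)
{
	/* Reclaim only once a full grace period has elapsed since the snapshot. */
	if (!poll_state_synchronize_rcu(d->gp_snap))
		return false;	/* not yet; retry later, as the monitor work does */

	kfree(d->ptr);
	d->ptr = NULL;
	return true;
}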
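The kvfree_call_rcu() signature change means the caller (the kvfree_rcu() macro) now passes the pointer to free directly instead of encoding it as a callback offset, and the head-less single-argument form keeps its long-standing rule: because might_sleep() now runs unconditionally when head is NULL, it is valid only from sleepable context. A sketch of the two caller-side forms; struct my_node and both helper functions are illustrative assumptions:

#include <linux/rcupdate.h>
#include <linux/slab.h>

struct my_node {
	int key;
	struct rcu_head rh;	/* needed for the double-argument form */
};

static void my_node_release(struct my_node *p)
{
	/*
	 * Double-argument form: usable wherever call_rcu() is, e.g. under
	 * a spinlock, because the embedded rcu_head carries the request.
	 */
	kvfree_rcu(p, rh);
}

static void my_buf_release(void *buf)
{
	/*
	 * Single-argument (head-less) form: may need to allocate or to
	 * synchronize, so it is valid only from might_sleep() context.
	 */
	kvfree_rcu(buf);
}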
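Replacing the hand-rolled ->next chain with list_head is what lets the monitor detach a whole channel in O(1) via list_replace_init() and walk pages oldest-first with list_for_each_entry_safe_reverse(), since new pages are list_add()ed at the head. A self-contained sketch of that detach-then-drain pattern under assumed names (page_block, pending, drain_pending()):

#include <linux/gfp.h>
#include <linux/list.h>
#include <linux/spinlock.h>

struct page_block {		/* assumed to be backed by a whole page */
	struct list_head list;
	unsigned long nr_records;
};

static LIST_HEAD(pending);	/* filled by producers under 'lock' */
static DEFINE_SPINLOCK(lock);

static void queue_block(struct page_block *b)
{
	spin_lock(&lock);
	list_add(&b->list, &pending);	/* newest at the head, oldest at the tail */
	spin_unlock(&lock);
}

static void drain_pending(void)
{
	struct page_block *b, *n;
	LIST_HEAD(batch);

	/* Detach everything at once; producers see an empty list again. */
	spin_lock(&lock);
	list_replace_init(&pending, &batch);
	spin_unlock(&lock);

	/* Oldest entries sit at the tail, so walk in reverse to free them first. */
	list_for_each_entry_safe_reverse(b, n, &batch, list)
		free_page((unsigned long)b);
}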
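Finally, the lazy_in/lazy distinction in __call_rcu_common() plus the rcu_async_hurry()/rcu_async_relax() calls in the PM notifier give CONFIG_RCU_LAZY a global off switch around suspend and hibernation, so callbacks are not parked for seconds while the system is trying to quiesce. Callers can also opt out per invocation; a sketch assuming a my_obj type with an embedded rcu_head:

#include <linux/rcupdate.h>
#include <linux/slab.h>

struct my_obj {
	struct rcu_head rh;
	/* payload ... */
};

static void my_obj_free(struct rcu_head *rh)
{
	kfree(container_of(rh, struct my_obj, rh));
}

static void my_obj_release(struct my_obj *p, bool latency_sensitive)
{
	if (latency_sensitive)
		/* Bypass CONFIG_RCU_LAZY batching for this callback. */
		call_rcu_hurry(&p->rh, my_obj_free);
	else
		/* May be deferred for a while when the system is idle. */
		call_rcu(&p->rh, my_obj_free);
}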