/*
 * Read-Copy Update mechanism for mutual exclusion
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
 *
 * Copyright IBM Corporation, 2008
 *
 * Authors: Dipankar Sarma <dipankar@in.ibm.com>
 *	    Manfred Spraul <manfred@colorfullife.com>
 *	    Paul E. McKenney <paulmck@linux.vnet.ibm.com> Hierarchical version
 *
 * Based on the original work by Paul McKenney <paulmck@us.ibm.com>
 * and inputs from Rusty Russell, Andrea Arcangeli and Andi Kleen.
 *
 * For detailed explanation of Read-Copy Update mechanism see -
 *	Documentation/RCU
 */
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/spinlock.h>
#include <linux/smp.h>
#include <linux/rcupdate.h>
#include <linux/interrupt.h>
#include <linux/sched.h>
#include <linux/nmi.h>
#include <linux/atomic.h>
#include <linux/bitops.h>
#include <linux/export.h>
#include <linux/completion.h>
#include <linux/moduleparam.h>
#include <linux/percpu.h>
#include <linux/notifier.h>
#include <linux/cpu.h>
#include <linux/mutex.h>
#include <linux/time.h>
#include <linux/kernel_stat.h>
#include <linux/wait.h>
#include <linux/kthread.h>
#include <linux/prefetch.h>
#include <linux/delay.h>
#include <linux/stop_machine.h>
#include <linux/random.h>

#include "rcutree.h"
#include <trace/events/rcu.h>

#include "rcu.h"

/* Data structures. */

static struct lock_class_key rcu_node_class[RCU_NUM_LVLS];
static struct lock_class_key rcu_fqs_class[RCU_NUM_LVLS];

#define RCU_STATE_INITIALIZER(sname, cr) { \
	.level = { &sname##_state.node[0] }, \
	.call = cr, \
	.fqs_state = RCU_GP_IDLE, \
	.gpnum = 0UL - 300UL, \
	.completed = 0UL - 300UL, \
	.orphan_lock = __RAW_SPIN_LOCK_UNLOCKED(&sname##_state.orphan_lock), \
	.orphan_nxttail = &sname##_state.orphan_nxtlist, \
	.orphan_donetail = &sname##_state.orphan_donelist, \
	.barrier_mutex = __MUTEX_INITIALIZER(sname##_state.barrier_mutex), \
	.onoff_mutex = __MUTEX_INITIALIZER(sname##_state.onoff_mutex), \
	.name = #sname, \
}

struct rcu_state rcu_sched_state =
	RCU_STATE_INITIALIZER(rcu_sched, call_rcu_sched);
DEFINE_PER_CPU(struct rcu_data, rcu_sched_data);

struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh, call_rcu_bh);
DEFINE_PER_CPU(struct rcu_data, rcu_bh_data);

static struct rcu_state *rcu_state;
LIST_HEAD(rcu_struct_flavors);

/* Increase (but not decrease) the CONFIG_RCU_FANOUT_LEAF at boot time. */
static int rcu_fanout_leaf = CONFIG_RCU_FANOUT_LEAF;
module_param(rcu_fanout_leaf, int, 0444);
int rcu_num_lvls __read_mostly = RCU_NUM_LVLS;
static int num_rcu_lvl[] = {  /* Number of rcu_nodes at specified level. */
	NUM_RCU_LVL_0,
	NUM_RCU_LVL_1,
	NUM_RCU_LVL_2,
	NUM_RCU_LVL_3,
	NUM_RCU_LVL_4,
};
int rcu_num_nodes __read_mostly = NUM_RCU_NODES; /* Total # rcu_nodes in use. */
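
/*
 * Illustrative example (not part of the original file): with the 64-bit
 * defaults of CONFIG_RCU_FANOUT=64 and CONFIG_RCU_FANOUT_LEAF=16, a
 * kernel built for 256 CPUs gets a two-level tree: one root rcu_node
 * fanning out to 16 leaf rcu_node structures (17 rcu_node structures in
 * all), each leaf covering 16 CPUs.  Booting with a larger
 * rcu_fanout_leaf collapses wider systems into fewer leaves.
 */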

/*
 * The rcu_scheduler_active variable transitions from zero to one just
 * before the first task is spawned.  So when this variable is zero, RCU
 * can assume that there is but one task, allowing RCU to (for example)
 * optimize synchronize_sched() to a simple barrier().  When this variable
 * is one, RCU must actually do all the hard work required to detect real
 * grace periods.  This variable is also used to suppress boot-time false
 * positives from lockdep-RCU error checking.
 */
int rcu_scheduler_active __read_mostly;
EXPORT_SYMBOL_GPL(rcu_scheduler_active);

/*
 * The rcu_scheduler_fully_active variable transitions from zero to one
 * during the early_initcall() processing, which is after the scheduler
 * is capable of creating new tasks.  So RCU processing (for example,
 * creating tasks for RCU priority boosting) must be delayed until after
 * rcu_scheduler_fully_active transitions from zero to one.  We also
 * currently delay invocation of any RCU callbacks until after this point.
 *
 * It might later prove better for people registering RCU callbacks during
 * early boot to take responsibility for these callbacks, but one step at
 * a time.
 */
static int rcu_scheduler_fully_active __read_mostly;

#ifdef CONFIG_RCU_BOOST

/*
 * Control variables for per-CPU and per-rcu_node kthreads.  These
 * handle all flavors of RCU.
 */
static DEFINE_PER_CPU(struct task_struct *, rcu_cpu_kthread_task);
DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_status);
DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_loops);
DEFINE_PER_CPU(char, rcu_cpu_has_work);

#endif /* #ifdef CONFIG_RCU_BOOST */

static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu);
static void invoke_rcu_core(void);
static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp);

/*
 * Track the rcutorture test sequence number and the update version
 * number within a given test.  The rcutorture_testseq is incremented
 * on every rcutorture module load and unload, so has an odd value
 * when a test is running.  The rcutorture_vernum is set to zero
 * when rcutorture starts and is incremented on each rcutorture update.
 * These variables enable correlating rcutorture output with the
 * RCU tracing information.
 */
unsigned long rcutorture_testseq;
unsigned long rcutorture_vernum;

/*
 * Return true if an RCU grace period is in progress.  The ACCESS_ONCE()s
 * permit this function to be invoked without holding the root rcu_node
 * structure's ->lock, but of course results can be subject to change.
 */
static int rcu_gp_in_progress(struct rcu_state *rsp)
{
	return ACCESS_ONCE(rsp->completed) != ACCESS_ONCE(rsp->gpnum);
}
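
/*
 * Illustrative example (not part of the original file): ->gpnum is
 * incremented when a grace period starts and ->completed catches up to
 * ->gpnum when that grace period ends, so the comparison above reduces
 * to:
 *
 *	completed == gpnum	->  no grace period in progress.
 *	completed == gpnum - 1	->  a grace period is in progress.
 *
 * Note that RCU_STATE_INITIALIZER() starts both counters at 0UL - 300UL,
 * so the counters wrap within the first few hundred grace periods after
 * boot rather than only after a very long uptime.
 */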

/*
 * Note a quiescent state.  Because we do not need to know
 * how many quiescent states passed, just if there was at least
 * one since the start of the grace period, this just sets a flag.
 * The caller must have disabled preemption.
 */
void rcu_sched_qs(int cpu)
{
	struct rcu_data *rdp = &per_cpu(rcu_sched_data, cpu);

	if (rdp->passed_quiesce == 0)
		trace_rcu_grace_period("rcu_sched", rdp->gpnum, "cpuqs");
	rdp->passed_quiesce = 1;
}

void rcu_bh_qs(int cpu)
{
	struct rcu_data *rdp = &per_cpu(rcu_bh_data, cpu);

	if (rdp->passed_quiesce == 0)
		trace_rcu_grace_period("rcu_bh", rdp->gpnum, "cpuqs");
	rdp->passed_quiesce = 1;
}

/*
 * Note a context switch.  This is a quiescent state for RCU-sched,
 * and requires special handling for preemptible RCU.
 * The caller must have disabled preemption.
 */
void rcu_note_context_switch(int cpu)
{
	trace_rcu_utilization("Start context switch");
	rcu_sched_qs(cpu);
	rcu_preempt_note_context_switch(cpu);
	trace_rcu_utilization("End context switch");
}
EXPORT_SYMBOL_GPL(rcu_note_context_switch);

DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = {
	.dynticks_nesting = DYNTICK_TASK_EXIT_IDLE,
	.dynticks = ATOMIC_INIT(1),
};

static long blimit = 10;	/* Maximum callbacks per rcu_do_batch. */
static long qhimark = 10000;	/* If this many pending, ignore blimit. */
static long qlowmark = 100;	/* Once only this many pending, use blimit. */

module_param(blimit, long, 0444);
module_param(qhimark, long, 0444);
module_param(qlowmark, long, 0444);

static ulong jiffies_till_first_fqs = RCU_JIFFIES_TILL_FORCE_QS;
static ulong jiffies_till_next_fqs = RCU_JIFFIES_TILL_FORCE_QS;

module_param(jiffies_till_first_fqs, ulong, 0644);
module_param(jiffies_till_next_fqs, ulong, 0644);
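
/*
 * Illustrative usage note (not part of the original file): because the
 * above are module parameters of a built-in object, they can be set on
 * the kernel command line, for example (assuming the usual "rcutree."
 * prefix for this file):
 *
 *	rcutree.blimit=20 rcutree.jiffies_till_first_fqs=3
 *
 * The 0444 parameters are read-only after boot, while the 0644 ones can
 * also be changed later through /sys/module/rcutree/parameters/.
 */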

static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *));
static void force_quiescent_state(struct rcu_state *rsp);
static int rcu_pending(int cpu);

/*
 * Return the number of RCU-sched batches processed thus far for debug & stats.
 */
long rcu_batches_completed_sched(void)
{
	return rcu_sched_state.completed;
}
EXPORT_SYMBOL_GPL(rcu_batches_completed_sched);

/*
 * Return the number of RCU BH batches processed thus far for debug & stats.
 */
long rcu_batches_completed_bh(void)
{
	return rcu_bh_state.completed;
}
EXPORT_SYMBOL_GPL(rcu_batches_completed_bh);

/*
 * Force a quiescent state for RCU BH.
 */
void rcu_bh_force_quiescent_state(void)
{
	force_quiescent_state(&rcu_bh_state);
}
EXPORT_SYMBOL_GPL(rcu_bh_force_quiescent_state);

/*
 * Record the number of times rcutorture tests have been initiated and
 * terminated.  This information allows the debugfs tracing stats to be
 * correlated to the rcutorture messages, even when the rcutorture module
 * is being repeatedly loaded and unloaded.  In other words, we cannot
 * store this state in rcutorture itself.
 */
void rcutorture_record_test_transition(void)
{
	rcutorture_testseq++;
	rcutorture_vernum = 0;
}
EXPORT_SYMBOL_GPL(rcutorture_record_test_transition);

/*
 * Record the number of writer passes through the current rcutorture test.
 * This is also used to correlate debugfs tracing stats with the rcutorture
 * messages.
 */
void rcutorture_record_progress(unsigned long vernum)
{
	rcutorture_vernum++;
}
EXPORT_SYMBOL_GPL(rcutorture_record_progress);

/*
 * Force a quiescent state for RCU-sched.
 */
void rcu_sched_force_quiescent_state(void)
{
	force_quiescent_state(&rcu_sched_state);
}
EXPORT_SYMBOL_GPL(rcu_sched_force_quiescent_state);

/*
 * Does the CPU have callbacks ready to be invoked?
 */
static int
cpu_has_callbacks_ready_to_invoke(struct rcu_data *rdp)
{
	return &rdp->nxtlist != rdp->nxttail[RCU_DONE_TAIL] &&
	       rdp->nxttail[RCU_DONE_TAIL] != NULL;
}

/*
 * Does the current CPU require a not-yet-started grace period?
 * The caller must have disabled interrupts to prevent races with
 * normal callback registry.
 */
static int
cpu_needs_another_gp(struct rcu_state *rsp, struct rcu_data *rdp)
{
	int i;

	if (rcu_gp_in_progress(rsp))
		return 0;  /* No, a grace period is already in progress. */
	if (!rdp->nxttail[RCU_NEXT_TAIL])
		return 0;  /* No, this is a no-CBs (or offline) CPU. */
	if (*rdp->nxttail[RCU_NEXT_READY_TAIL])
		return 1;  /* Yes, this CPU has newly registered callbacks. */
	for (i = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++)
		if (rdp->nxttail[i - 1] != rdp->nxttail[i] &&
		    ULONG_CMP_LT(ACCESS_ONCE(rsp->completed),
				 rdp->nxtcompleted[i]))
			return 1;  /* Yes, CBs for future grace period. */
	return 0; /* No grace period needed. */
}

/*
 * Return the root node of the specified rcu_state structure.
 */
static struct rcu_node *rcu_get_root(struct rcu_state *rsp)
{
	return &rsp->node[0];
}

/*
 * rcu_eqs_enter_common - current CPU is moving towards extended quiescent state
 *
 * If the new value of the ->dynticks_nesting counter now is zero,
 * we really have entered idle, and must do the appropriate accounting.
 * The caller must have disabled interrupts.
 */
static void rcu_eqs_enter_common(struct rcu_dynticks *rdtp, long long oldval,
				bool user)
{
	trace_rcu_dyntick("Start", oldval, rdtp->dynticks_nesting);
	if (!user && !is_idle_task(current)) {
		struct task_struct *idle = idle_task(smp_processor_id());

		trace_rcu_dyntick("Error on entry: not idle task", oldval, 0);
		ftrace_dump(DUMP_ORIG);
		WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s",
			  current->pid, current->comm,
			  idle->pid, idle->comm); /* must be idle task! */
	}
	rcu_prepare_for_idle(smp_processor_id());
	/* CPUs seeing atomic_inc() must see prior RCU read-side crit sects */
	smp_mb__before_atomic_inc();  /* See above. */
	atomic_inc(&rdtp->dynticks);
	smp_mb__after_atomic_inc();  /* Force ordering with next sojourn. */
	WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1);

	/*
	 * It is illegal to enter an extended quiescent state while
	 * in an RCU read-side critical section.
	 */
	rcu_lockdep_assert(!lock_is_held(&rcu_lock_map),
			   "Illegal idle entry in RCU read-side critical section.");
	rcu_lockdep_assert(!lock_is_held(&rcu_bh_lock_map),
			   "Illegal idle entry in RCU-bh read-side critical section.");
	rcu_lockdep_assert(!lock_is_held(&rcu_sched_lock_map),
			   "Illegal idle entry in RCU-sched read-side critical section.");
}

/*
 * Enter an RCU extended quiescent state, which can be either the
 * idle loop or adaptive-tickless usermode execution.
 */
static void rcu_eqs_enter(bool user)
{
	long long oldval;
	struct rcu_dynticks *rdtp;

	rdtp = &__get_cpu_var(rcu_dynticks);
	oldval = rdtp->dynticks_nesting;
	WARN_ON_ONCE((oldval & DYNTICK_TASK_NEST_MASK) == 0);
	if ((oldval & DYNTICK_TASK_NEST_MASK) == DYNTICK_TASK_NEST_VALUE)
		rdtp->dynticks_nesting = 0;
	else
		rdtp->dynticks_nesting -= DYNTICK_TASK_NEST_VALUE;
	rcu_eqs_enter_common(rdtp, oldval, user);
}

/**
 * rcu_idle_enter - inform RCU that current CPU is entering idle
 *
 * Enter idle mode, in other words, -leave- the mode in which RCU
 * read-side critical sections can occur.  (Though RCU read-side
 * critical sections can occur in irq handlers in idle, a possibility
 * handled by irq_enter() and irq_exit().)
 *
 * We crowbar the ->dynticks_nesting field to zero to allow for
 * the possibility of usermode upcalls having messed up our count
 * of interrupt nesting level during the prior busy period.
 */
void rcu_idle_enter(void)
{
	unsigned long flags;

	local_irq_save(flags);
	rcu_eqs_enter(false);
	local_irq_restore(flags);
}
EXPORT_SYMBOL_GPL(rcu_idle_enter);

#ifdef CONFIG_RCU_USER_QS
/**
 * rcu_user_enter - inform RCU that we are resuming userspace.
 *
 * Enter RCU idle mode right before resuming userspace.  No use of RCU
 * is permitted between this call and rcu_user_exit(). This way the
 * CPU doesn't need to maintain the tick for RCU maintenance purposes
 * when the CPU runs in userspace.
 */
void rcu_user_enter(void)
{
	rcu_eqs_enter(true);
}

/**
 * rcu_user_enter_after_irq - inform RCU that we are going to resume userspace
 * after the current irq returns.
 *
 * This is similar to rcu_user_enter() but in the context of a non-nesting
 * irq. After this call, RCU enters into idle mode when the interrupt
 * returns.
 */
void rcu_user_enter_after_irq(void)
{
	unsigned long flags;
	struct rcu_dynticks *rdtp;

	local_irq_save(flags);
	rdtp = &__get_cpu_var(rcu_dynticks);
	/* Ensure this irq is interrupting a non-idle RCU state.  */
	WARN_ON_ONCE(!(rdtp->dynticks_nesting & DYNTICK_TASK_MASK));
	rdtp->dynticks_nesting = 1;
	local_irq_restore(flags);
}
#endif /* CONFIG_RCU_USER_QS */

/**
 * rcu_irq_exit - inform RCU that current CPU is exiting irq towards idle
 *
 * Exit from an interrupt handler, which might possibly result in entering
 * idle mode, in other words, leaving the mode in which read-side critical
 * sections can occur.
 *
 * This code assumes that the idle loop never does anything that might
 * result in unbalanced calls to irq_enter() and irq_exit().  If your
 * architecture violates this assumption, RCU will give you what you
 * deserve, good and hard.  But very infrequently and irreproducibly.
 *
 * Use things like work queues to work around this limitation.
 *
 * You have been warned.
 */
void rcu_irq_exit(void)
{
	unsigned long flags;
	long long oldval;
	struct rcu_dynticks *rdtp;

	local_irq_save(flags);
	rdtp = &__get_cpu_var(rcu_dynticks);
	oldval = rdtp->dynticks_nesting;
	rdtp->dynticks_nesting--;
	WARN_ON_ONCE(rdtp->dynticks_nesting < 0);
	if (rdtp->dynticks_nesting)
		trace_rcu_dyntick("--=", oldval, rdtp->dynticks_nesting);
	else
		rcu_eqs_enter_common(rdtp, oldval, true);
	local_irq_restore(flags);
}

/*
 * rcu_eqs_exit_common - current CPU moving away from extended quiescent state
 *
 * If the new value of the ->dynticks_nesting counter was previously zero,
 * we really have exited idle, and must do the appropriate accounting.
 * The caller must have disabled interrupts.
 */
static void rcu_eqs_exit_common(struct rcu_dynticks *rdtp, long long oldval,
			       int user)
{
	smp_mb__before_atomic_inc();  /* Force ordering w/previous sojourn. */
	atomic_inc(&rdtp->dynticks);
	/* CPUs seeing atomic_inc() must see later RCU read-side crit sects */
	smp_mb__after_atomic_inc();  /* See above. */
	WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1));
	rcu_cleanup_after_idle(smp_processor_id());
	trace_rcu_dyntick("End", oldval, rdtp->dynticks_nesting);
	if (!user && !is_idle_task(current)) {
		struct task_struct *idle = idle_task(smp_processor_id());

		trace_rcu_dyntick("Error on exit: not idle task",
				  oldval, rdtp->dynticks_nesting);
		ftrace_dump(DUMP_ORIG);
		WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s",
			  current->pid, current->comm,
			  idle->pid, idle->comm); /* must be idle task! */
	}
}

/*
 * Exit an RCU extended quiescent state, which can be either the
 * idle loop or adaptive-tickless usermode execution.
 */
static void rcu_eqs_exit(bool user)
{
	struct rcu_dynticks *rdtp;
	long long oldval;

	rdtp = &__get_cpu_var(rcu_dynticks);
	oldval = rdtp->dynticks_nesting;
	WARN_ON_ONCE(oldval < 0);
	if (oldval & DYNTICK_TASK_NEST_MASK)
		rdtp->dynticks_nesting += DYNTICK_TASK_NEST_VALUE;
	else
		rdtp->dynticks_nesting = DYNTICK_TASK_EXIT_IDLE;
	rcu_eqs_exit_common(rdtp, oldval, user);
}

/**
 * rcu_idle_exit - inform RCU that current CPU is leaving idle
 *
 * Exit idle mode, in other words, -enter- the mode in which RCU
 * read-side critical sections can occur.
 *
 * We crowbar the ->dynticks_nesting field to DYNTICK_TASK_NEST to
 * allow for the possibility of usermode upcalls messing up our count
 * of interrupt nesting level during the busy period that is just
 * now starting.
 */
void rcu_idle_exit(void)
{
	unsigned long flags;

	local_irq_save(flags);
	rcu_eqs_exit(false);
	local_irq_restore(flags);
}
EXPORT_SYMBOL_GPL(rcu_idle_exit);
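
/*
 * Illustrative sketch (not part of the original file): the shape of an
 * architecture's idle loop that rcu_idle_enter()/rcu_idle_exit() expect,
 * so that RCU treats the low-power wait as an extended quiescent state.
 * The arch_idle_wait() call below is a hypothetical stand-in for
 * whatever the architecture actually uses (hlt, wfi, and so on):
 *
 *	while (!need_resched()) {
 *		rcu_idle_enter();
 *		arch_idle_wait();
 *		rcu_idle_exit();
 *	}
 */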

#ifdef CONFIG_RCU_USER_QS
/**
 * rcu_user_exit - inform RCU that we are exiting userspace.
 *
 * Exit RCU idle mode while entering the kernel because it can
 * run a RCU read side critical section anytime.
 */
void rcu_user_exit(void)
{
	rcu_eqs_exit(true);
}

/**
 * rcu_user_exit_after_irq - inform RCU that we won't resume to userspace
 * idle mode after the current non-nesting irq returns.
 *
 * This is similar to rcu_user_exit() but in the context of an irq.
 * This is called when the irq has interrupted a userspace RCU idle mode
 * context. When the current non-nesting interrupt returns after this call,
 * the CPU won't restore the RCU idle mode.
 */
void rcu_user_exit_after_irq(void)
{
	unsigned long flags;
	struct rcu_dynticks *rdtp;

	local_irq_save(flags);
	rdtp = &__get_cpu_var(rcu_dynticks);
	/* Ensure we are interrupting an RCU idle mode. */
	WARN_ON_ONCE(rdtp->dynticks_nesting & DYNTICK_TASK_NEST_MASK);
	rdtp->dynticks_nesting += DYNTICK_TASK_EXIT_IDLE;
	local_irq_restore(flags);
}
#endif /* CONFIG_RCU_USER_QS */

/**
 * rcu_irq_enter - inform RCU that current CPU is entering irq away from idle
 *
 * Enter an interrupt handler, which might possibly result in exiting
 * idle mode, in other words, entering the mode in which read-side critical
 * sections can occur.
 *
 * Note that the Linux kernel is fully capable of entering an interrupt
 * handler that it never exits, for example when doing upcalls to
 * user mode!  This code assumes that the idle loop never does upcalls to
 * user mode.  If your architecture does do upcalls from the idle loop (or
 * does anything else that results in unbalanced calls to the irq_enter()
 * and irq_exit() functions), RCU will give you what you deserve, good
 * and hard.  But very infrequently and irreproducibly.
 *
 * Use things like work queues to work around this limitation.
 *
 * You have been warned.
 */
void rcu_irq_enter(void)
{
	unsigned long flags;
	struct rcu_dynticks *rdtp;
	long long oldval;

	local_irq_save(flags);
	rdtp = &__get_cpu_var(rcu_dynticks);
	oldval = rdtp->dynticks_nesting;
	rdtp->dynticks_nesting++;
	WARN_ON_ONCE(rdtp->dynticks_nesting == 0);
	if (oldval)
		trace_rcu_dyntick("++=", oldval, rdtp->dynticks_nesting);
	else
		rcu_eqs_exit_common(rdtp, oldval, true);
	local_irq_restore(flags);
}

/**
 * rcu_nmi_enter - inform RCU of entry to NMI context
 *
 * If the CPU was idle with dynamic ticks active, and there is no
 * irq handler running, this updates rdtp->dynticks_nmi to let the
 * RCU grace-period handling know that the CPU is active.
 */
void rcu_nmi_enter(void)
{
	struct rcu_dynticks *rdtp = &__get_cpu_var(rcu_dynticks);

	if (rdtp->dynticks_nmi_nesting == 0 &&
	    (atomic_read(&rdtp->dynticks) & 0x1))
		return;
	rdtp->dynticks_nmi_nesting++;
	smp_mb__before_atomic_inc();  /* Force delay from prior write. */
	atomic_inc(&rdtp->dynticks);
	/* CPUs seeing atomic_inc() must see later RCU read-side crit sects */
	smp_mb__after_atomic_inc();  /* See above. */
	WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1));
}

/**
 * rcu_nmi_exit - inform RCU of exit from NMI context
 *
 * If the CPU was idle with dynamic ticks active, and there is no
 * irq handler running, this updates rdtp->dynticks_nmi to let the
 * RCU grace-period handling know that the CPU is no longer active.
 */
void rcu_nmi_exit(void)
{
	struct rcu_dynticks *rdtp = &__get_cpu_var(rcu_dynticks);

	if (rdtp->dynticks_nmi_nesting == 0 ||
	    --rdtp->dynticks_nmi_nesting != 0)
		return;
	/* CPUs seeing atomic_inc() must see prior RCU read-side crit sects */
	smp_mb__before_atomic_inc();  /* See above. */
	atomic_inc(&rdtp->dynticks);
	smp_mb__after_atomic_inc();  /* Force delay to next write. */
	WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1);
}
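
/*
 * Illustrative sketch (not part of the original file): NMI handlers do
 * not normally call these two functions directly.  The architecture code
 * brackets the handler with nmi_enter()/nmi_exit(), which invoke
 * rcu_nmi_enter()/rcu_nmi_exit() so that RCU read-side critical sections
 * become legal inside the NMI:
 *
 *	nmi_enter();	// calls rcu_nmi_enter()
 *	...		// NMI work, possibly under rcu_read_lock()
 *	nmi_exit();	// calls rcu_nmi_exit()
 */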

/**
 * rcu_is_cpu_idle - see if RCU thinks that the current CPU is idle
 *
 * If the current CPU is in its idle loop and is neither in an interrupt
 * nor an NMI handler, return true.
 */
int rcu_is_cpu_idle(void)
{
	int ret;

	preempt_disable();
	ret = (atomic_read(&__get_cpu_var(rcu_dynticks).dynticks) & 0x1) == 0;
	preempt_enable();
	return ret;
}
EXPORT_SYMBOL(rcu_is_cpu_idle);

#if defined(CONFIG_PROVE_RCU) && defined(CONFIG_HOTPLUG_CPU)

/*
 * Is the current CPU online?  Disable preemption to avoid false positives
 * that could otherwise happen due to the current CPU number being sampled,
 * this task being preempted, its old CPU being taken offline, resuming
 * on some other CPU, then determining that its old CPU is now offline.
 * It is OK to use RCU on an offline processor during initial boot, hence
 * the check for rcu_scheduler_fully_active.  Note also that it is OK
 * for a CPU coming online to use RCU for one jiffy prior to marking itself
 * online in the cpu_online_mask.  Similarly, it is OK for a CPU going
 * offline to continue to use RCU for one jiffy after marking itself
 * offline in the cpu_online_mask.  This leniency is necessary given the
 * non-atomic nature of the online and offline processing, for example,
 * the fact that a CPU enters the scheduler after completing the CPU_DYING
 * notifiers.
 *
 * This is also why RCU internally marks CPUs online during the
 * CPU_UP_PREPARE phase and offline during the CPU_DEAD phase.
 *
 * Disable checking if in an NMI handler because we cannot safely report
 * errors from NMI handlers anyway.
 */
bool rcu_lockdep_current_cpu_online(void)
{
	struct rcu_data *rdp;
	struct rcu_node *rnp;
	bool ret;

	if (in_nmi())
		return 1;
	preempt_disable();
	rdp = &__get_cpu_var(rcu_sched_data);
	rnp = rdp->mynode;
	ret = (rdp->grpmask & rnp->qsmaskinit) ||
	      !rcu_scheduler_fully_active;
	preempt_enable();
	return ret;
}
EXPORT_SYMBOL_GPL(rcu_lockdep_current_cpu_online);
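
/*
 * Illustrative sketch (not part of the original file): a rough
 * paraphrase of how the lockdep-RCU helpers consult this function, in
 * the style of rcu_read_lock_held() and friends (not a verbatim copy):
 *
 *	if (!debug_lockdep_rcu_enabled())
 *		return 1;
 *	if (rcu_is_cpu_idle())
 *		return 0;
 *	if (!rcu_lockdep_current_cpu_online())
 *		return 0;
 *	return lock_is_held(&rcu_lock_map);
 */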

#endif /* #if defined(CONFIG_PROVE_RCU) && defined(CONFIG_HOTPLUG_CPU) */

/**
 * rcu_is_cpu_rrupt_from_idle - see if idle or immediately interrupted from idle
 *
 * If the current CPU is idle or running at a first-level (not nested)
 * interrupt from idle, return true.  The caller must have at least
 * disabled preemption.
 */
static int rcu_is_cpu_rrupt_from_idle(void)
{
	return __get_cpu_var(rcu_dynticks).dynticks_nesting <= 1;
}

/*
 * Snapshot the specified CPU's dynticks counter so that we can later
 * credit them with an implicit quiescent state.  Return 1 if this CPU
 * is in dynticks idle mode, which is an extended quiescent state.
 */
static int dyntick_save_progress_counter(struct rcu_data *rdp)
{
	rdp->dynticks_snap = atomic_add_return(0, &rdp->dynticks->dynticks);
	return (rdp->dynticks_snap & 0x1) == 0;
}

/*
 * Return true if the specified CPU has passed through a quiescent
 * state by virtue of being in or having passed through a dynticks
 * idle state since the last call to dyntick_save_progress_counter()
 * for this same CPU, or by virtue of having been offline.
 */
static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
{
	unsigned int curr;
	unsigned int snap;

	curr = (unsigned int)atomic_add_return(0, &rdp->dynticks->dynticks);
	snap = (unsigned int)rdp->dynticks_snap;

	/*
	 * If the CPU passed through or entered a dynticks idle phase with
	 * no active irq/NMI handlers, then we can safely pretend that the CPU
	 * already acknowledged the request to pass through a quiescent
	 * state.  Either way, that CPU cannot possibly be in an RCU
	 * read-side critical section that started before the beginning
	 * of the current RCU grace period.
	 */
	if ((curr & 0x1) == 0 || UINT_CMP_GE(curr, snap + 2)) {
		trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, "dti");
		rdp->dynticks_fqs++;
		return 1;
	}

	/*
	 * Check for the CPU being offline, but only if the grace period
	 * is old enough.  We don't need to worry about the CPU changing
	 * state: If we see it offline even once, it has been through a
	 * quiescent state.
	 *
	 * The reason for insisting that the grace period be at least
	 * one jiffy old is that CPUs that are not quite online and that
	 * have just gone offline can still execute RCU read-side critical
	 * sections.
	 */
	if (ULONG_CMP_GE(rdp->rsp->gp_start + 2, jiffies))
		return 0;  /* Grace period is not old enough. */
	barrier();
	if (cpu_is_offline(rdp->cpu)) {
		trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, "ofl");
		rdp->offline_fqs++;
		return 1;
	}
	return 0;
}
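
/*
 * Illustrative example (not part of the original file): how the two
 * functions above interpret a dynticks snapshot.  The low-order bit of
 * ->dynticks is zero while the CPU is in dynticks idle and one while it
 * is not, and the counter is incremented on every transition:
 *
 *	snap even		->  the CPU was idle when sampled, so
 *				    dyntick_save_progress_counter()
 *				    reports a quiescent state immediately.
 *	snap odd, curr == snap	->  still in the same non-idle sojourn,
 *				    no quiescent state yet.
 *	curr even, or
 *	curr >= snap + 2	->  the CPU is in or has passed through
 *				    dynticks idle, so a quiescent state
 *				    is reported.
 */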

static void record_gp_stall_check_time(struct rcu_state *rsp)
{
	rsp->gp_start = jiffies;
	rsp->jiffies_stall = jiffies + rcu_jiffies_till_stall_check();
}

/*
 * Dump stacks of all tasks running on stalled CPUs.  This is a fallback
 * for architectures that do not implement trigger_all_cpu_backtrace().
 * The NMI-triggered stack traces are more accurate because they are
 * printed by the target CPU.
 */
static void rcu_dump_cpu_stacks(struct rcu_state *rsp)
{
	int cpu;
	unsigned long flags;
	struct rcu_node *rnp;

	rcu_for_each_leaf_node(rsp, rnp) {
		raw_spin_lock_irqsave(&rnp->lock, flags);
		if (rnp->qsmask != 0) {
			for (cpu = 0; cpu <= rnp->grphi - rnp->grplo; cpu++)
				if (rnp->qsmask & (1UL << cpu))
					dump_cpu_task(rnp->grplo + cpu);
		}
		raw_spin_unlock_irqrestore(&rnp->lock, flags);
	}
}

static void print_other_cpu_stall(struct rcu_state *rsp)
{
	int cpu;
	long delta;
	unsigned long flags;
	int ndetected = 0;
	struct rcu_node *rnp = rcu_get_root(rsp);
	long totqlen = 0;

	/* Only let one CPU complain about others per time interval. */

	raw_spin_lock_irqsave(&rnp->lock, flags);
	delta = jiffies - rsp->jiffies_stall;
	if (delta < RCU_STALL_RAT_DELAY || !rcu_gp_in_progress(rsp)) {
		raw_spin_unlock_irqrestore(&rnp->lock, flags);
		return;
	}
	rsp->jiffies_stall = jiffies + 3 * rcu_jiffies_till_stall_check() + 3;
	raw_spin_unlock_irqrestore(&rnp->lock, flags);

	/*
	 * OK, time to rat on our buddy...
	 * See Documentation/RCU/stallwarn.txt for info on how to debug
	 * RCU CPU stall warnings.
	 */
	printk(KERN_ERR "INFO: %s detected stalls on CPUs/tasks:",
	       rsp->name);
	print_cpu_stall_info_begin();
	rcu_for_each_leaf_node(rsp, rnp) {
		raw_spin_lock_irqsave(&rnp->lock, flags);
		ndetected += rcu_print_task_stall(rnp);
		if (rnp->qsmask != 0) {
			for (cpu = 0; cpu <= rnp->grphi - rnp->grplo; cpu++)
				if (rnp->qsmask & (1UL << cpu)) {
					print_cpu_stall_info(rsp,
							     rnp->grplo + cpu);
					ndetected++;
				}
		}
		raw_spin_unlock_irqrestore(&rnp->lock, flags);
	}

	/*
	 * Now rat on any tasks that got kicked up to the root rcu_node
	 * due to CPU offlining.
	 */
	rnp = rcu_get_root(rsp);
	raw_spin_lock_irqsave(&rnp->lock, flags);
	ndetected += rcu_print_task_stall(rnp);
	raw_spin_unlock_irqrestore(&rnp->lock, flags);

	print_cpu_stall_info_end();
	for_each_possible_cpu(cpu)
		totqlen += per_cpu_ptr(rsp->rda, cpu)->qlen;
	pr_cont("(detected by %d, t=%ld jiffies, g=%lu, c=%lu, q=%lu)\n",
	       smp_processor_id(), (long)(jiffies - rsp->gp_start),
	       rsp->gpnum, rsp->completed, totqlen);
	if (ndetected == 0)
		printk(KERN_ERR "INFO: Stall ended before state dump start\n");
	else if (!trigger_all_cpu_backtrace())
		rcu_dump_cpu_stacks(rsp);

	/* Complain about tasks blocking the grace period. */

	rcu_print_detail_task_stall(rsp);

	force_quiescent_state(rsp);  /* Kick them all. */
}

static void print_cpu_stall(struct rcu_state *rsp)
{
	int cpu;
	unsigned long flags;
	struct rcu_node *rnp = rcu_get_root(rsp);
	long totqlen = 0;

	/*
	 * OK, time to rat on ourselves...
	 * See Documentation/RCU/stallwarn.txt for info on how to debug
	 * RCU CPU stall warnings.
	 */
	printk(KERN_ERR "INFO: %s self-detected stall on CPU", rsp->name);
	print_cpu_stall_info_begin();
	print_cpu_stall_info(rsp, smp_processor_id());
	print_cpu_stall_info_end();
	for_each_possible_cpu(cpu)
		totqlen += per_cpu_ptr(rsp->rda, cpu)->qlen;
	pr_cont(" (t=%lu jiffies g=%lu c=%lu q=%lu)\n",
		jiffies - rsp->gp_start, rsp->gpnum, rsp->completed, totqlen);
	if (!trigger_all_cpu_backtrace())
		dump_stack();

	raw_spin_lock_irqsave(&rnp->lock, flags);
	if (ULONG_CMP_GE(jiffies, rsp->jiffies_stall))
		rsp->jiffies_stall = jiffies +
				     3 * rcu_jiffies_till_stall_check() + 3;
	raw_spin_unlock_irqrestore(&rnp->lock, flags);

	set_need_resched();  /* kick ourselves to get things going. */
}

static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp)
{
	unsigned long j;
	unsigned long js;
	struct rcu_node *rnp;

	if (rcu_cpu_stall_suppress)
		return;
	j = ACCESS_ONCE(jiffies);
	js = ACCESS_ONCE(rsp->jiffies_stall);
	rnp = rdp->mynode;
	if (rcu_gp_in_progress(rsp) &&
	    (ACCESS_ONCE(rnp->qsmask) & rdp->grpmask) && ULONG_CMP_GE(j, js)) {

		/* We haven't checked in, so go dump stack. */
		print_cpu_stall(rsp);

	} else if (rcu_gp_in_progress(rsp) &&
		   ULONG_CMP_GE(j, js + RCU_STALL_RAT_DELAY)) {

		/* They had a few time units to dump stack, so complain. */
		print_other_cpu_stall(rsp);
	}
}

/**
 * rcu_cpu_stall_reset - prevent further stall warnings in current grace period
 *
 * Set the stall-warning timeout way off into the future, thus preventing
 * any RCU CPU stall-warning messages from appearing in the current set of
 * RCU grace periods.
 *
 * The caller must disable hard irqs.
 */
void rcu_cpu_stall_reset(void)
{
	struct rcu_state *rsp;

	for_each_rcu_flavor(rsp)
		rsp->jiffies_stall = jiffies + ULONG_MAX / 2;
}

/*
 * Update CPU-local rcu_data state to record the newly noticed grace period.
 * This is used both when we started the grace period and when we notice
 * that someone else started the grace period.  The caller must hold the
 * ->lock of the leaf rcu_node structure corresponding to the current CPU,
 *  and must have irqs disabled.
 */
static void __note_new_gpnum(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp)
{
	if (rdp->gpnum != rnp->gpnum) {
		/*
		 * If the current grace period is waiting for this CPU,
		 * set up to detect a quiescent state, otherwise don't
		 * go looking for one.
		 */
		rdp->gpnum = rnp->gpnum;
		trace_rcu_grace_period(rsp->name, rdp->gpnum, "cpustart");
		rdp->passed_quiesce = 0;
		rdp->qs_pending = !!(rnp->qsmask & rdp->grpmask);
		zero_cpu_stall_ticks(rdp);
	}
}

static void note_new_gpnum(struct rcu_state *rsp, struct rcu_data *rdp)
{
	unsigned long flags;
	struct rcu_node *rnp;

	local_irq_save(flags);
	rnp = rdp->mynode;
	if (rdp->gpnum == ACCESS_ONCE(rnp->gpnum) || /* outside lock. */
	    !raw_spin_trylock(&rnp->lock)) { /* irqs already off, so later. */
		local_irq_restore(flags);
		return;
	}
	__note_new_gpnum(rsp, rnp, rdp);
	raw_spin_unlock_irqrestore(&rnp->lock, flags);
}

/*
 * Did someone else start a new RCU grace period start since we last
 * checked?  Update local state appropriately if so.  Must be called
 * on the CPU corresponding to rdp.
 */
static int
check_for_new_grace_period(struct rcu_state *rsp, struct rcu_data *rdp)
{
	unsigned long flags;
	int ret = 0;

	local_irq_save(flags);
	if (rdp->gpnum != rsp->gpnum) {
		note_new_gpnum(rsp, rdp);
		ret = 1;
	}
	local_irq_restore(flags);
	return ret;
}

/*
 * Initialize the specified rcu_data structure's callback list to empty.
 */
static void init_callback_list(struct rcu_data *rdp)
{
	int i;

	rdp->nxtlist = NULL;
	for (i = 0; i < RCU_NEXT_SIZE; i++)
		rdp->nxttail[i] = &rdp->nxtlist;
	init_nocb_callback_list(rdp);
}
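
/*
 * Illustrative layout sketch (not part of the original file): right
 * after init_callback_list(), every tail pointer references ->nxtlist:
 *
 *	rdp->nxtlist = NULL
 *	rdp->nxttail[RCU_DONE_TAIL] ==
 *	rdp->nxttail[RCU_WAIT_TAIL] ==
 *	rdp->nxttail[RCU_NEXT_READY_TAIL] ==
 *	rdp->nxttail[RCU_NEXT_TAIL] == &rdp->nxtlist
 *
 * As callbacks are queued and grace periods advance, these four tail
 * pointers partition the single linked list into the done, waiting,
 * next-ready, and next sublists, in that order.
 */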

/*
 * Determine the value that ->completed will have at the end of the
 * next subsequent grace period.  This is used to tag callbacks so that
 * a CPU can invoke callbacks in a timely fashion even if that CPU has
 * been dyntick-idle for an extended period with callbacks under the
 * influence of RCU_FAST_NO_HZ.
 *
 * The caller must hold rnp->lock with interrupts disabled.
 */
static unsigned long rcu_cbs_completed(struct rcu_state *rsp,
				       struct rcu_node *rnp)
{
	/*
	 * If RCU is idle, we just wait for the next grace period.
	 * But we can only be sure that RCU is idle if we are looking
	 * at the root rcu_node structure -- otherwise, a new grace
	 * period might have started, but just not yet gotten around
	 * to initializing the current non-root rcu_node structure.
	 */
	if (rcu_get_root(rsp) == rnp && rnp->gpnum == rnp->completed)
		return rnp->completed + 1;

	/*
	 * Otherwise, wait for a possible partial grace period and
	 * then the subsequent full grace period.
	 */
	return rnp->completed + 2;
}
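
/*
 * Illustrative example (not part of the original file): if the root
 * rcu_node shows gpnum == completed == 5, RCU is idle, so a callback
 * queued now is guaranteed to be invokable once grace period 6 ends and
 * 6 is returned.  Seen from a non-root rcu_node, grace period 6 might
 * already be under way without this node knowing it yet, so the
 * conservative answer completed + 2 = 7 is returned instead.
 */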

/*
 * If there is room, assign a ->completed number to any callbacks on
 * this CPU that have not already been assigned.  Also accelerate any
 * callbacks that were previously assigned a ->completed number that has
 * since proven to be too conservative, which can happen if callbacks get
 * assigned a ->completed number while RCU is idle, but with reference to
 * a non-root rcu_node structure.  This function is idempotent, so it does
 * not hurt to call it repeatedly.
 *
 * The caller must hold rnp->lock with interrupts disabled.
 */
static void rcu_accelerate_cbs(struct rcu_state *rsp, struct rcu_node *rnp,
			       struct rcu_data *rdp)
{
	unsigned long c;
	int i;

	/* If the CPU has no callbacks, nothing to do. */
	if (!rdp->nxttail[RCU_NEXT_TAIL] || !*rdp->nxttail[RCU_DONE_TAIL])
		return;

	/*
	 * Starting from the sublist containing the callbacks most
	 * recently assigned a ->completed number and working down, find the
	 * first sublist that is not assignable to an upcoming grace period.
	 * Such a sublist has something in it (first two tests) and has
	 * a ->completed number assigned that will complete sooner than
	 * the ->completed number for newly arrived callbacks (last test).
	 *
	 * The key point is that any later sublist can be assigned the
	 * same ->completed number as the newly arrived callbacks, which
	 * means that the callbacks in any of these later sublist can be
	 * grouped into a single sublist, whether or not they have already
	 * been assigned a ->completed number.
	 */
	c = rcu_cbs_completed(rsp, rnp);
	for (i = RCU_NEXT_TAIL - 1; i > RCU_DONE_TAIL; i--)
		if (rdp->nxttail[i] != rdp->nxttail[i - 1] &&
		    !ULONG_CMP_GE(rdp->nxtcompleted[i], c))
			break;

	/*
	 * If there is no sublist for unassigned callbacks, leave.
	 * At the same time, advance "i" one sublist, so that "i" will
	 * index into the sublist where all the remaining callbacks should
	 * be grouped into.
	 */
	if (++i >= RCU_NEXT_TAIL)
		return;

	/*
	 * Assign all subsequent callbacks' ->completed number to the next
	 * full grace period and group them all in the sublist initially
	 * indexed by "i".
	 */
	for (; i <= RCU_NEXT_TAIL; i++) {
		rdp->nxttail[i] = rdp->nxttail[RCU_NEXT_TAIL];
		rdp->nxtcompleted[i] = c;
	}

	/* Trace depending on how much we were able to accelerate. */
	if (!*rdp->nxttail[RCU_WAIT_TAIL])
		trace_rcu_grace_period(rsp->name, rdp->gpnum, "AccWaitCB");
	else
		trace_rcu_grace_period(rsp->name, rdp->gpnum, "AccReadyCB");
}

/*
 * Move any callbacks whose grace period has completed to the
 * RCU_DONE_TAIL sublist, then compact the remaining sublists and
 * assign ->completed numbers to any callbacks in the RCU_NEXT_TAIL
 * sublist.  This function is idempotent, so it does not hurt to
 * invoke it repeatedly.  As long as it is not invoked -too- often...
 *
 * The caller must hold rnp->lock with interrupts disabled.
 */
static void rcu_advance_cbs(struct rcu_state *rsp, struct rcu_node *rnp,
			    struct rcu_data *rdp)
{
	int i, j;

	/* If the CPU has no callbacks, nothing to do. */
	if (!rdp->nxttail[RCU_NEXT_TAIL] || !*rdp->nxttail[RCU_DONE_TAIL])
		return;

	/*
	 * Find all callbacks whose ->completed numbers indicate that they
	 * are ready to invoke, and put them into the RCU_DONE_TAIL sublist.
	 */
	for (i = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++) {
		if (ULONG_CMP_LT(rnp->completed, rdp->nxtcompleted[i]))
			break;
		rdp->nxttail[RCU_DONE_TAIL] = rdp->nxttail[i];
	}
	/* Clean up any sublist tail pointers that were misordered above. */
	for (j = RCU_WAIT_TAIL; j < i; j++)
		rdp->nxttail[j] = rdp->nxttail[RCU_DONE_TAIL];

	/* Copy down callbacks to fill in empty sublists. */
	for (j = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++, j++) {
		if (rdp->nxttail[j] == rdp->nxttail[RCU_NEXT_TAIL])
			break;
		rdp->nxttail[j] = rdp->nxttail[i];
		rdp->nxtcompleted[j] = rdp->nxtcompleted[i];
	}

	/* Classify any remaining callbacks. */
	rcu_accelerate_cbs(rsp, rnp, rdp);
}

/*
 * Advance this CPU's callbacks, but only if the current grace period
 * has ended.  This may be called only from the CPU to whom the rdp
 * belongs.  In addition, the corresponding leaf rcu_node structure's
 * ->lock must be held by the caller, with irqs disabled.
 */
static void
__rcu_process_gp_end(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp)
{
	/* Did another grace period end? */
	if (rdp->completed == rnp->completed) {

		/* No, so just accelerate recent callbacks. */
		rcu_accelerate_cbs(rsp, rnp, rdp);

	} else {

		/* Advance callbacks. */
		rcu_advance_cbs(rsp, rnp, rdp);

		/* Remember that we saw this grace-period completion. */
		rdp->completed = rnp->completed;
		trace_rcu_grace_period(rsp->name, rdp->gpnum, "cpuend");

		/*
		 * If we were in an extended quiescent state, we may have
		 * missed some grace periods that other CPUs handled on
		 * our behalf. Catch up with this state to avoid noting
		 * spurious new grace periods.  If another grace period
		 * has started, then rnp->gpnum will have advanced, so
		 * we will detect this later on.  Of course, any quiescent
		 * states we found for the old GP are now invalid.
		 */
		if (ULONG_CMP_LT(rdp->gpnum, rdp->completed)) {
			rdp->gpnum = rdp->completed;
			rdp->passed_quiesce = 0;
		}

		/*
		 * If RCU does not need a quiescent state from this CPU,
		 * then make sure that this CPU doesn't go looking for one.
		 */
		if ((rnp->qsmask & rdp->grpmask) == 0)
			rdp->qs_pending = 0;
	}
}

/*
 * Advance this CPU's callbacks, but only if the current grace period
 * has ended.  This may be called only from the CPU to whom the rdp
 * belongs.
 */
static void
rcu_process_gp_end(struct rcu_state *rsp, struct rcu_data *rdp)
{
	unsigned long flags;
	struct rcu_node *rnp;

	local_irq_save(flags);
	rnp = rdp->mynode;
	if (rdp->completed == ACCESS_ONCE(rnp->completed) || /* outside lock. */
	    !raw_spin_trylock(&rnp->lock)) { /* irqs already off, so later. */
		local_irq_restore(flags);
		return;
	}
	__rcu_process_gp_end(rsp, rnp, rdp);
	raw_spin_unlock_irqrestore(&rnp->lock, flags);
}

/*
 * Do per-CPU grace-period initialization for running CPU.  The caller
 * must hold the lock of the leaf rcu_node structure corresponding to
 * this CPU.
 */
static void
rcu_start_gp_per_cpu(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp)
{
	/* Prior grace period ended, so advance callbacks for current CPU. */
	__rcu_process_gp_end(rsp, rnp, rdp);

	/* Set state so that this CPU will detect the next quiescent state. */
	__note_new_gpnum(rsp, rnp, rdp);
}

/*
 * Initialize a new grace period.
 */
static int rcu_gp_init(struct rcu_state *rsp)
{
	struct rcu_data *rdp;
	struct rcu_node *rnp = rcu_get_root(rsp);

	raw_spin_lock_irq(&rnp->lock);
	rsp->gp_flags = 0; /* Clear all flags: New grace period. */

	if (rcu_gp_in_progress(rsp)) {
		/* Grace period already in progress, don't start another.  */
		raw_spin_unlock_irq(&rnp->lock);
		return 0;
	}

	/* Advance to a new grace period and initialize state. */
	rsp->gpnum++;
	trace_rcu_grace_period(rsp->name, rsp->gpnum, "start");
	record_gp_stall_check_time(rsp);
	raw_spin_unlock_irq(&rnp->lock);

	/* Exclude any concurrent CPU-hotplug operations. */
	mutex_lock(&rsp->onoff_mutex);

	/*
	 * Set the quiescent-state-needed bits in all the rcu_node
	 * structures for all currently online CPUs in breadth-first order,
	 * starting from the root rcu_node structure, relying on the layout
	 * of the tree within the rsp->node[] array.  Note that other CPUs
	 * will access only the leaves of the hierarchy, thus seeing that no
	 * grace period is in progress, at least until the corresponding
	 * leaf node has been initialized.  In addition, we have excluded
	 * CPU-hotplug operations.
	 *
	 * The grace period cannot complete until the initialization
	 * process finishes, because this kthread handles both.
	 */
	rcu_for_each_node_breadth_first(rsp, rnp) {
		raw_spin_lock_irq(&rnp->lock);
		rdp = this_cpu_ptr(rsp->rda);
		rcu_preempt_check_blocked_tasks(rnp);
		rnp->qsmask = rnp->qsmaskinit;
		rnp->gpnum = rsp->gpnum;
		WARN_ON_ONCE(rnp->completed != rsp->completed);
		rnp->completed = rsp->completed;
		if (rnp == rdp->mynode)
			rcu_start_gp_per_cpu(rsp, rnp, rdp);
		rcu_preempt_boost_start_gp(rnp);
		trace_rcu_grace_period_init(rsp->name, rnp->gpnum,
					    rnp->level, rnp->grplo,
					    rnp->grphi, rnp->qsmask);
		raw_spin_unlock_irq(&rnp->lock);
#ifdef CONFIG_PROVE_RCU_DELAY
		if ((random32() % (rcu_num_nodes * 8)) == 0)
			schedule_timeout_uninterruptible(2);
#endif /* #ifdef CONFIG_PROVE_RCU_DELAY */
		cond_resched();
	}

	mutex_unlock(&rsp->onoff_mutex);
	return 1;
}

/*
 * Do one round of quiescent-state forcing.
 */
int rcu_gp_fqs(struct rcu_state *rsp, int fqs_state_in)
{
	int fqs_state = fqs_state_in;
	struct rcu_node *rnp = rcu_get_root(rsp);

	rsp->n_force_qs++;
	if (fqs_state == RCU_SAVE_DYNTICK) {
		/* Collect dyntick-idle snapshots. */
		force_qs_rnp(rsp, dyntick_save_progress_counter);
		fqs_state = RCU_FORCE_QS;
	} else {
		/* Handle dyntick-idle and offline CPUs. */
		force_qs_rnp(rsp, rcu_implicit_dynticks_qs);
	}
	/* Clear flag to prevent immediate re-entry. */
	if (ACCESS_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) {
		raw_spin_lock_irq(&rnp->lock);
		rsp->gp_flags &= ~RCU_GP_FLAG_FQS;
		raw_spin_unlock_irq(&rnp->lock);
	}
	return fqs_state;
}

/*
 * Clean up after the old grace period.
 */
static void rcu_gp_cleanup(struct rcu_state *rsp)
{
	unsigned long gp_duration;
	struct rcu_data *rdp;
	struct rcu_node *rnp = rcu_get_root(rsp);

	raw_spin_lock_irq(&rnp->lock);
	gp_duration = jiffies - rsp->gp_start;
	if (gp_duration > rsp->gp_max)
		rsp->gp_max = gp_duration;

	/*
	 * We know the grace period is complete, but to everyone else
	 * it appears to still be ongoing.  But it is also the case
	 * that to everyone else it looks like there is nothing that
	 * they can do to advance the grace period.  It is therefore
	 * safe for us to drop the lock in order to mark the grace
	 * period as completed in all of the rcu_node structures.
	 */
	raw_spin_unlock_irq(&rnp->lock);

	/*
	 * Propagate new ->completed value to rcu_node structures so
	 * that other CPUs don't have to wait until the start of the next
	 * grace period to process their callbacks.  This also avoids
	 * some nasty RCU grace-period initialization races by forcing
	 * the end of the current grace period to be completely recorded in
	 * all of the rcu_node structures before the beginning of the next
	 * grace period is recorded in any of the rcu_node structures.
	 */
	rcu_for_each_node_breadth_first(rsp, rnp) {
		raw_spin_lock_irq(&rnp->lock);
		rnp->completed = rsp->gpnum;
		raw_spin_unlock_irq(&rnp->lock);
		cond_resched();
	}
	rnp = rcu_get_root(rsp);
	raw_spin_lock_irq(&rnp->lock);

	rsp->completed = rsp->gpnum; /* Declare grace period done. */
	trace_rcu_grace_period(rsp->name, rsp->completed, "end");
	rsp->fqs_state = RCU_GP_IDLE;
	rdp = this_cpu_ptr(rsp->rda);
	if (cpu_needs_another_gp(rsp, rdp))
		rsp->gp_flags = 1;
	raw_spin_unlock_irq(&rnp->lock);
}

/*
 * Body of kthread that handles grace periods.
 */
static int __noreturn rcu_gp_kthread(void *arg)
{
	int fqs_state;
	unsigned long j;
	int ret;
	struct rcu_state *rsp = arg;
	struct rcu_node *rnp = rcu_get_root(rsp);

	for (;;) {

		/* Handle grace-period start. */
		for (;;) {
			wait_event_interruptible(rsp->gp_wq,
						 rsp->gp_flags &
						 RCU_GP_FLAG_INIT);
			if ((rsp->gp_flags & RCU_GP_FLAG_INIT) &&
			    rcu_gp_init(rsp))
				break;
			cond_resched();
			flush_signals(current);
		}

		/* Handle quiescent-state forcing. */
		fqs_state = RCU_SAVE_DYNTICK;
		j = jiffies_till_first_fqs;
		if (j > HZ) {
			j = HZ;
			jiffies_till_first_fqs = HZ;
		}
		for (;;) {
			rsp->jiffies_force_qs = jiffies + j;
			ret = wait_event_interruptible_timeout(rsp->gp_wq,
					(rsp->gp_flags & RCU_GP_FLAG_FQS) ||
					(!ACCESS_ONCE(rnp->qsmask) &&
					 !rcu_preempt_blocked_readers_cgp(rnp)),
					j);
			/* If grace period done, leave loop. */
			if (!ACCESS_ONCE(rnp->qsmask) &&
			    !rcu_preempt_blocked_readers_cgp(rnp))
				break;
			/* If time for quiescent-state forcing, do it. */
			if (ret == 0 || (rsp->gp_flags & RCU_GP_FLAG_FQS)) {
				fqs_state = rcu_gp_fqs(rsp, fqs_state);
				cond_resched();
			} else {
				/* Deal with stray signal. */
				cond_resched();
				flush_signals(current);
			}
			j = jiffies_till_next_fqs;
			if (j > HZ) {
				j = HZ;
				jiffies_till_next_fqs = HZ;
			} else if (j < 1) {
				j = 1;
				jiffies_till_next_fqs = 1;
			}
		}

		/* Handle grace-period end. */
		rcu_gp_cleanup(rsp);
	}
}

/*
 * Start a new RCU grace period if warranted, re-initializing the hierarchy
 * in preparation for detecting the next grace period.  The caller must hold
 * the root node's ->lock, which is released before return.  Hard irqs must
 * be disabled.
 *
 * Note that it is legal for a dying CPU (which is marked as offline) to
 * invoke this function.  This can happen when the dying CPU reports its
 * quiescent state.
 */
static void
rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
	__releases(rcu_get_root(rsp)->lock)
{
	struct rcu_data *rdp = this_cpu_ptr(rsp->rda);
	struct rcu_node *rnp = rcu_get_root(rsp);

	if (!rsp->gp_kthread ||
	    !cpu_needs_another_gp(rsp, rdp)) {
		/*
		 * Either we have not yet spawned the grace-period
		 * task, this CPU does not need another grace period,
		 * or a grace period is already in progress.
		 * Either way, don't start a new grace period.
		 */
		raw_spin_unlock_irqrestore(&rnp->lock, flags);
		return;
	}

	/*
	 * Because there is no grace period in progress right now,
	 * any callbacks we have up to this point will be satisfied
	 * by the next grace period.  So this is a good place to
	 * assign a grace period number to recently posted callbacks.
	 */
	rcu_accelerate_cbs(rsp, rnp, rdp);

	rsp->gp_flags = RCU_GP_FLAG_INIT;