/*
 *  kernel/sched/core.c
 *
 *  Core kernel scheduler code and related syscalls
 *
 *  Copyright (C) 1991-2002  Linus Torvalds
 */
#include <linux/sched.h>
#include <linux/cpuset.h>
#include <linux/delayacct.h>
#include <linux/init_task.h>
#include <linux/context_tracking.h>

#include <linux/blkdev.h>
#include <linux/kprobes.h>
#include <linux/mmu_context.h>
#include <linux/module.h>
#include <linux/nmi.h>
#include <linux/prefetch.h>
#include <linux/profile.h>
#include <linux/security.h>
#include <linux/syscalls.h>

#include <asm/switch_to.h>
#include <asm/tlb.h>
#ifdef CONFIG_PARAVIRT
#include <asm/paravirt.h>
#endif

#include "sched.h"
#include "../workqueue_internal.h"
#include "../smpboot.h"

#define CREATE_TRACE_POINTS
#include <trace/events/sched.h>

DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);

/*
 * Debugging: various feature bits
 */

#define SCHED_FEAT(name, enabled)	\
	(1UL << __SCHED_FEAT_##name) * enabled |

const_debug unsigned int sysctl_sched_features =
#include "features.h"
	0;

#undef SCHED_FEAT

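/*
 * Example (informational sketch): with the macro above, an entry such as
 * SCHED_FEAT(GENTLE_FAIR_SLEEPERS, true) in features.h expands to
 * (1UL << __SCHED_FEAT_GENTLE_FAIR_SLEEPERS) * true |, so the #include
 * builds sysctl_sched_features as a bitmask of all default-enabled bits.
 */
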
/*
 * Number of tasks to iterate in a single balance run.
 * Limited because this is done with IRQs disabled.
 */
const_debug unsigned int sysctl_sched_nr_migrate = 32;

/*
 * period over which we average the RT time consumption, measured
 * in ms.
 *
 * default: 1s
 */
const_debug unsigned int sysctl_sched_time_avg = MSEC_PER_SEC;

/*
 * period over which we measure -rt task CPU usage in us.
 * default: 1s
 */
unsigned int sysctl_sched_rt_period = 1000000;

__read_mostly int scheduler_running;

/*
 * part of the period that we allow rt tasks to run in us.
 * default: 0.95s
 */
int sysctl_sched_rt_runtime = 950000;

/* CPUs with isolated domains */
cpumask_var_t cpu_isolated_map;

/*
 * this_rq_lock - lock this runqueue and disable interrupts.
 */
static struct rq *this_rq_lock(void)
	__acquires(rq->lock)
{
	struct rq *rq;

	local_irq_disable();
	rq = this_rq();
	raw_spin_lock(&rq->lock);

	return rq;
}

/*
 * __task_rq_lock - lock the rq @p resides on.
 */
struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags *rf)
	__acquires(rq->lock)
{
	struct rq *rq;

	lockdep_assert_held(&p->pi_lock);

	for (;;) {
		rq = task_rq(p);
		raw_spin_lock(&rq->lock);
		if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) {
			rq_pin_lock(rq, rf);
			return rq;
		}
		raw_spin_unlock(&rq->lock);

		while (unlikely(task_on_rq_migrating(p)))
			cpu_relax();
	}
}

/*
 * task_rq_lock - lock p->pi_lock and lock the rq @p resides on.
 */
struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf)
	__acquires(p->pi_lock)
	__acquires(rq->lock)
{
	struct rq *rq;

	for (;;) {
		raw_spin_lock_irqsave(&p->pi_lock, rf->flags);
		rq = task_rq(p);
		raw_spin_lock(&rq->lock);
		/*
		 *	move_queued_task()		task_rq_lock()
		 *
		 *	ACQUIRE (rq->lock)
		 *	[S] ->on_rq = MIGRATING		[L] rq = task_rq()
		 *	WMB (__set_task_cpu())		ACQUIRE (rq->lock);
		 *	[S] ->cpu = new_cpu		[L] task_rq()
		 *					[L] ->on_rq
		 *	RELEASE (rq->lock)
		 *
		 * If we observe the old cpu in task_rq_lock, the acquire of
		 * the old rq->lock will fully serialize against the stores.
		 *
		 * If we observe the new CPU in task_rq_lock, the acquire will
		 * pair with the WMB to ensure we must then also see migrating.
		 */
		if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) {
			rq_pin_lock(rq, rf);
			return rq;
		}
		raw_spin_unlock(&rq->lock);
		raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags);

		while (unlikely(task_on_rq_migrating(p)))
			cpu_relax();
	}
}
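
/*
 * Usage sketch (informational): callers pair task_rq_lock() with
 * task_rq_unlock() around code that needs both p->pi_lock and the
 * task's rq->lock, e.g.:
 *
 *	struct rq_flags rf;
 *	struct rq *rq = task_rq_lock(p, &rf);
 *	...
 *	task_rq_unlock(rq, p, &rf);
 */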

/*
 * RQ-clock updating methods:
 */

static void update_rq_clock_task(struct rq *rq, s64 delta)
{
/*
 * In theory, the compiler should just see 0 here, and optimize out the call
 * to sched_rt_avg_update. But I don't trust it...
 */
#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING)
	s64 steal = 0, irq_delta = 0;
#endif
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
	irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time;

	/*
	 * Since irq_time is only updated on {soft,}irq_exit, we might run into
	 * this case when a previous update_rq_clock() happened inside a
	 * {soft,}irq region.
	 *
	 * When this happens, we stop ->clock_task and only update the
	 * prev_irq_time stamp to account for the part that fit, so that a next
	 * update will consume the rest. This ensures ->clock_task is
	 * monotonic.
	 *
	 * It does however cause some slight miss-attribution of {soft,}irq
	 * time, a more accurate solution would be to update the irq_time using
	 * the current rq->clock timestamp, except that would require using
	 * atomic ops.
	 */
	if (irq_delta > delta)
		irq_delta = delta;

	rq->prev_irq_time += irq_delta;
	delta -= irq_delta;
#endif
#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
	if (static_key_false((&paravirt_steal_rq_enabled))) {
		steal = paravirt_steal_clock(cpu_of(rq));
		steal -= rq->prev_steal_time_rq;

		if (unlikely(steal > delta))
			steal = delta;

		rq->prev_steal_time_rq += steal;
		delta -= steal;
	}
#endif

	rq->clock_task += delta;

#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING)
	if ((irq_delta + steal) && sched_feat(NONTASK_CAPACITY))
		sched_rt_avg_update(rq, irq_delta + steal);
#endif
}

void update_rq_clock(struct rq *rq)
{
	s64 delta;

	lockdep_assert_held(&rq->lock);

	if (rq->clock_update_flags & RQCF_ACT_SKIP)
		return;

#ifdef CONFIG_SCHED_DEBUG
	rq->clock_update_flags |= RQCF_UPDATED;
#endif
	delta = sched_clock_cpu(cpu_of(rq)) - rq->clock;
	if (delta < 0)
		return;
	rq->clock += delta;
	update_rq_clock_task(rq, delta);
}


#ifdef CONFIG_SCHED_HRTICK
/*
 * Use HR-timers to deliver accurate preemption points.
 */

static void hrtick_clear(struct rq *rq)
{
	if (hrtimer_active(&rq->hrtick_timer))
		hrtimer_cancel(&rq->hrtick_timer);
}

/*
 * High-resolution timer tick.
 * Runs from hardirq context with interrupts disabled.
 */
static enum hrtimer_restart hrtick(struct hrtimer *timer)
{
	struct rq *rq = container_of(timer, struct rq, hrtick_timer);

	WARN_ON_ONCE(cpu_of(rq) != smp_processor_id());

	raw_spin_lock(&rq->lock);
	update_rq_clock(rq);
	rq->curr->sched_class->task_tick(rq, rq->curr, 1);
	raw_spin_unlock(&rq->lock);

	return HRTIMER_NORESTART;
}

#ifdef CONFIG_SMP

static void __hrtick_restart(struct rq *rq)
{
	struct hrtimer *timer = &rq->hrtick_timer;

	hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED);
}

/*
 * called from hardirq (IPI) context
 */
static void __hrtick_start(void *arg)
{
	struct rq *rq = arg;

	raw_spin_lock(&rq->lock);
	__hrtick_restart(rq);
	rq->hrtick_csd_pending = 0;
	raw_spin_unlock(&rq->lock);
}

/*
 * Called to set the hrtick timer state.
 *
 * called with rq->lock held and irqs disabled
 */
void hrtick_start(struct rq *rq, u64 delay)
{
	struct hrtimer *timer = &rq->hrtick_timer;
	ktime_t time;
	s64 delta;

	/*
	 * Don't schedule slices shorter than 10000ns, that just
	 * doesn't make sense and can cause timer DoS.
	 */
	delta = max_t(s64, delay, 10000LL);
	time = ktime_add_ns(timer->base->get_time(), delta);

	hrtimer_set_expires(timer, time);

	if (rq == this_rq()) {
		__hrtick_restart(rq);
	} else if (!rq->hrtick_csd_pending) {
		smp_call_function_single_async(cpu_of(rq), &rq->hrtick_csd);
		rq->hrtick_csd_pending = 1;
	}
}

#else
/*
 * Called to set the hrtick timer state.
 *
 * called with rq->lock held and irqs disabled
 */
void hrtick_start(struct rq *rq, u64 delay)
{
	/*
	 * Don't schedule slices shorter than 10000ns, that just
	 * doesn't make sense. Rely on vruntime for fairness.
	 */
	delay = max_t(u64, delay, 10000LL);
	hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay),
		      HRTIMER_MODE_REL_PINNED);
}
#endif /* CONFIG_SMP */

static void init_rq_hrtick(struct rq *rq)
{
#ifdef CONFIG_SMP
	rq->hrtick_csd_pending = 0;

	rq->hrtick_csd.flags = 0;
	rq->hrtick_csd.func = __hrtick_start;
	rq->hrtick_csd.info = rq;
#endif

	hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
	rq->hrtick_timer.function = hrtick;
}
#else	/* CONFIG_SCHED_HRTICK */
static inline void hrtick_clear(struct rq *rq)
{
}

static inline void init_rq_hrtick(struct rq *rq)
{
}
#endif	/* CONFIG_SCHED_HRTICK */

/*
 * cmpxchg based fetch_or, macro so it works for different integer types
 */
#define fetch_or(ptr, mask)						\
	({								\
		typeof(ptr) _ptr = (ptr);				\
		typeof(mask) _mask = (mask);				\
		typeof(*_ptr) _old, _val = *_ptr;			\
									\
		for (;;) {						\
			_old = cmpxchg(_ptr, _val, _val | _mask);	\
			if (_old == _val)				\
				break;					\
			_val = _old;					\
		}							\
	_old;								\
})
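
/*
 * Example (informational sketch): fetch_or() returns the value the word
 * held *before* the bits in @mask were set, so a caller can tell whether
 * some other bit was already set at that point, e.g.:
 *
 *	old = fetch_or(&ti->flags, _TIF_NEED_RESCHED);
 *	if (!(old & _TIF_POLLING_NRFLAG))
 *		... an IPI is needed, the CPU is not polling ...
 *
 * which is the pattern set_nr_and_not_polling() below relies on.
 */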

#if defined(CONFIG_SMP) && defined(TIF_POLLING_NRFLAG)
/*
 * Atomically set TIF_NEED_RESCHED and test for TIF_POLLING_NRFLAG,
 * this avoids any races wrt polling state changes and thereby avoids
 * spurious IPIs.
 */
static bool set_nr_and_not_polling(struct task_struct *p)
{
	struct thread_info *ti = task_thread_info(p);
	return !(fetch_or(&ti->flags, _TIF_NEED_RESCHED) & _TIF_POLLING_NRFLAG);
}

/*
 * Atomically set TIF_NEED_RESCHED if TIF_POLLING_NRFLAG is set.
 *
 * If this returns true, then the idle task promises to call
 * sched_ttwu_pending() and reschedule soon.
 */
static bool set_nr_if_polling(struct task_struct *p)
{
	struct thread_info *ti = task_thread_info(p);
	typeof(ti->flags) old, val = READ_ONCE(ti->flags);

	for (;;) {
		if (!(val & _TIF_POLLING_NRFLAG))
			return false;
		if (val & _TIF_NEED_RESCHED)
			return true;
		old = cmpxchg(&ti->flags, val, val | _TIF_NEED_RESCHED);
		if (old == val)
			break;
		val = old;
	}
	return true;
}

#else
static bool set_nr_and_not_polling(struct task_struct *p)
{
	set_tsk_need_resched(p);
	return true;
}

#ifdef CONFIG_SMP
static bool set_nr_if_polling(struct task_struct *p)
{
	return false;
}
#endif
#endif

void wake_q_add(struct wake_q_head *head, struct task_struct *task)
{
	struct wake_q_node *node = &task->wake_q;

	/*
	 * Atomically grab the task, if ->wake_q is !nil already it means
	 * it's already queued (either by us or someone else) and will get the
	 * wakeup due to that.
	 *
	 * This cmpxchg() implies a full barrier, which pairs with the write
	 * barrier implied by the wakeup in wake_up_q().
	 */
	if (cmpxchg(&node->next, NULL, WAKE_Q_TAIL))
		return;

	get_task_struct(task);

	/*
	 * The head is context local, there can be no concurrency.
	 */
	*head->lastp = node;
	head->lastp = &node->next;
}

void wake_up_q(struct wake_q_head *head)
{
	struct wake_q_node *node = head->first;

	while (node != WAKE_Q_TAIL) {
		struct task_struct *task;

		task = container_of(node, struct task_struct, wake_q);
		BUG_ON(!task);
		/* Task can safely be re-inserted now: */
		node = node->next;
		task->wake_q.next = NULL;

		/*
		 * wake_up_process() implies a wmb() to pair with the queueing
		 * in wake_q_add() so as not to miss wakeups.
		 */
		wake_up_process(task);
		put_task_struct(task);
	}
}
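
/*
 * Usage sketch (informational; the DEFINE_WAKE_Q() name is assumed from
 * <linux/sched.h> of this era): wakeups are typically batched while a
 * lock is held and issued after it is dropped:
 *
 *	DEFINE_WAKE_Q(wake_q);
 *
 *	spin_lock(&some_lock);
 *	wake_q_add(&wake_q, p);
 *	spin_unlock(&some_lock);
 *
 *	wake_up_q(&wake_q);
 */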

/*
 * resched_curr - mark rq's current task 'to be rescheduled now'.
 *
 * On UP this means the setting of the need_resched flag, on SMP it
 * might also involve a cross-CPU call to trigger the scheduler on
 * the target CPU.
 */
void resched_curr(struct rq *rq)
{
	struct task_struct *curr = rq->curr;
	int cpu;

	lockdep_assert_held(&rq->lock);

	if (test_tsk_need_resched(curr))
		return;

	cpu = cpu_of(rq);

	if (cpu == smp_processor_id()) {
		set_tsk_need_resched(curr);
		set_preempt_need_resched();
		return;
	}

	if (set_nr_and_not_polling(curr))
		smp_send_reschedule(cpu);
	else
		trace_sched_wake_idle_without_ipi(cpu);
}

void resched_cpu(int cpu)
{
	struct rq *rq = cpu_rq(cpu);
	unsigned long flags;

	if (!raw_spin_trylock_irqsave(&rq->lock, flags))
		return;
	resched_curr(rq);
	raw_spin_unlock_irqrestore(&rq->lock, flags);
}

#ifdef CONFIG_SMP
#ifdef CONFIG_NO_HZ_COMMON
/*
 * In the semi idle case, use the nearest busy CPU for migrating timers
 * from an idle CPU.  This is good for power-savings.
 *
 * We don't do similar optimization for completely idle system, as
 * selecting an idle CPU will add more delays to the timers than intended
 * (as that CPU's timer base may not be uptodate wrt jiffies etc).
 */
int get_nohz_timer_target(void)
{
	int i, cpu = smp_processor_id();
	struct sched_domain *sd;

	if (!idle_cpu(cpu) && is_housekeeping_cpu(cpu))
		return cpu;

	rcu_read_lock();
	for_each_domain(cpu, sd) {
		for_each_cpu(i, sched_domain_span(sd)) {
			if (cpu == i)
				continue;

			if (!idle_cpu(i) && is_housekeeping_cpu(i)) {
				cpu = i;
				goto unlock;
			}
		}
	}

	if (!is_housekeeping_cpu(cpu))
		cpu = housekeeping_any_cpu();
unlock:
	rcu_read_unlock();
	return cpu;
}

/*
 * When add_timer_on() enqueues a timer into the timer wheel of an
 * idle CPU then this timer might expire before the next timer event
 * which is scheduled to wake up that CPU. In case of a completely
 * idle system the next event might even be infinite time into the
 * future. wake_up_idle_cpu() ensures that the CPU is woken up and
 * leaves the inner idle loop so the newly added timer is taken into
 * account when the CPU goes back to idle and evaluates the timer
 * wheel for the next timer event.
 */
static void wake_up_idle_cpu(int cpu)
{
	struct rq *rq = cpu_rq(cpu);

	if (cpu == smp_processor_id())
		return;

	if (set_nr_and_not_polling(rq->idle))
		smp_send_reschedule(cpu);
	else
		trace_sched_wake_idle_without_ipi(cpu);
}

static bool wake_up_full_nohz_cpu(int cpu)
{
	/*
	 * We just need the target to call irq_exit() and re-evaluate
	 * the next tick. The nohz full kick at least implies that.
	 * If needed we can still optimize that later with an
	 * empty IRQ.
	 */
	if (cpu_is_offline(cpu))
		return true;  /* Don't try to wake offline CPUs. */
	if (tick_nohz_full_cpu(cpu)) {
		if (cpu != smp_processor_id() ||
		    tick_nohz_tick_stopped())
			tick_nohz_full_kick_cpu(cpu);
		return true;
	}

	return false;
}

/*
 * Wake up the specified CPU.  If the CPU is going offline, it is the
 * caller's responsibility to deal with the lost wakeup, for example,
 * by hooking into the CPU_DEAD notifier like timers and hrtimers do.
 */
void wake_up_nohz_cpu(int cpu)
{
	if (!wake_up_full_nohz_cpu(cpu))
		wake_up_idle_cpu(cpu);
}

static inline bool got_nohz_idle_kick(void)
{
	int cpu = smp_processor_id();

	if (!test_bit(NOHZ_BALANCE_KICK, nohz_flags(cpu)))
		return false;

	if (idle_cpu(cpu) && !need_resched())
		return true;

	/*
	 * We can't run Idle Load Balance on this CPU for this time so we
	 * cancel it and clear NOHZ_BALANCE_KICK
	 */
	clear_bit(NOHZ_BALANCE_KICK, nohz_flags(cpu));
	return false;
}

#else /* CONFIG_NO_HZ_COMMON */

static inline bool got_nohz_idle_kick(void)
{
	return false;
}

#endif /* CONFIG_NO_HZ_COMMON */

#ifdef CONFIG_NO_HZ_FULL
bool sched_can_stop_tick(struct rq *rq)
{
	int fifo_nr_running;

	/* Deadline tasks, even if single, need the tick */
	if (rq->dl.dl_nr_running)
		return false;

	/*
	 * If there is more than one RR task, we need the tick to effect the
	 * actual RR behaviour.
	 */
	if (rq->rt.rr_nr_running) {
		if (rq->rt.rr_nr_running == 1)
			return true;
		else
			return false;
	}

	/*
	 * If there are no RR tasks, but FIFO tasks, we can skip the tick, no
	 * forced preemption between FIFO tasks.
	 */
	fifo_nr_running = rq->rt.rt_nr_running - rq->rt.rr_nr_running;
	if (fifo_nr_running)
		return true;

	/*
	 * If there are no DL,RR/FIFO tasks, there must only be CFS tasks left;
	 * if there's more than one we need the tick for involuntary
	 * preemption.
	 */
	if (rq->nr_running > 1)
		return false;

	return true;
}
#endif /* CONFIG_NO_HZ_FULL */

void sched_avg_update(struct rq *rq)
{
	s64 period = sched_avg_period();

	while ((s64)(rq_clock(rq) - rq->age_stamp) > period) {
		/*
		 * Inline assembly required to prevent the compiler
		 * optimising this loop into a divmod call.
		 * See __iter_div_u64_rem() for another example of this.
		 */
		asm("" : "+rm" (rq->age_stamp));
		rq->age_stamp += period;
		rq->rt_avg /= 2;
	}
}

#endif /* CONFIG_SMP */

#if defined(CONFIG_RT_GROUP_SCHED) || (defined(CONFIG_FAIR_GROUP_SCHED) && \
			(defined(CONFIG_SMP) || defined(CONFIG_CFS_BANDWIDTH)))
/*
 * Iterate task_group tree rooted at *from, calling @down when first entering a
 * node and @up when leaving it for the final time.
 *
 * Caller must hold rcu_lock or sufficient equivalent.
 */
int walk_tg_tree_from(struct task_group *from,
			     tg_visitor down, tg_visitor up, void *data)
{
	struct task_group *parent, *child;
	int ret;

	parent = from;

down:
	ret = (*down)(parent, data);
	if (ret)
		goto out;
	list_for_each_entry_rcu(child, &parent->children, siblings) {
		parent = child;
		goto down;

up:
		continue;
	}
	ret = (*up)(parent, data);
	if (ret || parent == from)
		goto out;

	child = parent;
	parent = parent->parent;
	if (parent)
		goto up;
out:
	return ret;
}
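
/*
 * Usage sketch (informational): a typical caller walks the whole
 * hierarchy from root_task_group under RCU, passing tg_nop() (below)
 * for the direction it does not care about, e.g.:
 *
 *	rcu_read_lock();
 *	walk_tg_tree_from(&root_task_group, tg_down_fn, tg_nop, data);
 *	rcu_read_unlock();
 *
 * where tg_down_fn is a hypothetical tg_visitor callback.
 */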

int tg_nop(struct task_group *tg, void *data)
{
	return 0;
}
#endif

static void set_load_weight(struct task_struct *p)
{
	int prio = p->static_prio - MAX_RT_PRIO;
	struct load_weight *load = &p->se.load;

	/*
	 * SCHED_IDLE tasks get minimal weight:
	 */
	if (idle_policy(p->policy)) {
		load->weight = scale_load(WEIGHT_IDLEPRIO);
		load->inv_weight = WMULT_IDLEPRIO;
		return;
	}

	load->weight = scale_load(sched_prio_to_weight[prio]);
	load->inv_weight = sched_prio_to_wmult[prio];
}
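
/*
 * Example (informational): for a SCHED_NORMAL task at nice 0,
 * p->static_prio is MAX_RT_PRIO + 20, so prio above indexes entry 20 of
 * sched_prio_to_weight[], i.e. the canonical nice-0 weight of 1024.
 */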

static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
{
	update_rq_clock(rq);
	if (!(flags & ENQUEUE_RESTORE))
		sched_info_queued(rq, p);
	p->sched_class->enqueue_task(rq, p, flags);
}

static inline void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
{
	update_rq_clock(rq);
	if (!(flags & DEQUEUE_SAVE))
		sched_info_dequeued(rq, p);
	p->sched_class->dequeue_task(rq, p, flags);
}

void activate_task(struct rq *rq, struct task_struct *p, int flags)
{
	if (task_contributes_to_load(p))
		rq->nr_uninterruptible--;

	enqueue_task(rq, p, flags);
}

void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
{
	if (task_contributes_to_load(p))
		rq->nr_uninterruptible++;

	dequeue_task(rq, p, flags);
}

void sched_set_stop_task(int cpu, struct task_struct *stop)
{
	struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 };
	struct task_struct *old_stop = cpu_rq(cpu)->stop;

	if (stop) {
		/*
		 * Make it appear like a SCHED_FIFO task, its something
		 * userspace knows about and won't get confused about.
		 *
		 * Also, it will make PI more or less work without too
		 * much confusion -- but then, stop work should not
		 * rely on PI working anyway.
		 */
		sched_setscheduler_nocheck(stop, SCHED_FIFO, &param);

		stop->sched_class = &stop_sched_class;
	}

	cpu_rq(cpu)->stop = stop;

	if (old_stop) {
		/*
		 * Reset it back to a normal scheduling class so that
		 * it can die in pieces.
		 */
		old_stop->sched_class = &rt_sched_class;
	}
}

/*
 * __normal_prio - return the priority that is based on the static prio
 */
static inline int __normal_prio(struct task_struct *p)
{
	return p->static_prio;
}

/*
 * Calculate the expected normal priority: i.e. priority
 * without taking RT-inheritance into account. Might be
 * boosted by interactivity modifiers. Changes upon fork,
 * setprio syscalls, and whenever the interactivity
 * estimator recalculates.
 */
static inline int normal_prio(struct task_struct *p)
{
	int prio;

	if (task_has_dl_policy(p))
		prio = MAX_DL_PRIO-1;
	else if (task_has_rt_policy(p))
		prio = MAX_RT_PRIO-1 - p->rt_priority;
	else
		prio = __normal_prio(p);
	return prio;
}

/*
 * Calculate the current priority, i.e. the priority
 * taken into account by the scheduler. This value might
 * be boosted by RT tasks, or might be boosted by
 * interactivity modifiers. Will be RT if the task got
 * RT-boosted. If not then it returns p->normal_prio.
 */
static int effective_prio(struct task_struct *p)
{
	p->normal_prio = normal_prio(p);
	/*
	 * If we are RT tasks or we were boosted to RT priority,
	 * keep the priority unchanged. Otherwise, update priority
	 * to the normal priority:
	 */
	if (!rt_prio(p->prio))
		return p->normal_prio;
	return p->prio;
}
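
/*
 * Worked example (informational): with the standard priority layout a
 * SCHED_FIFO task with rt_priority 10 gets prio = MAX_RT_PRIO-1 - 10 = 89,
 * while a SCHED_NORMAL task at nice 0 keeps its static_prio of 120;
 * lower numeric values mean higher effective priority here.
 */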

/**
 * task_curr - is this task currently executing on a CPU?
 * @p: the task in question.
 *
 * Return: 1 if the task is currently executing. 0 otherwise.
 */
inline int task_curr(const struct task_struct *p)
{
	return cpu_curr(task_cpu(p)) == p;
}

/*
 * switched_from, switched_to and prio_changed must _NOT_ drop rq->lock,
 * use the balance_callback list if you want balancing.
 *
 * this means any call to check_class_changed() must be followed by a call to
 * balance_callback().
 */
static inline void check_class_changed(struct rq *rq, struct task_struct *p,
				       const struct sched_class *prev_class,
				       int oldprio)
{
	if (prev_class != p->sched_class) {
		if (prev_class->switched_from)
			prev_class->switched_from(rq, p);

		p->sched_class->switched_to(rq, p);
	} else if (oldprio != p->prio || dl_task(p))
		p->sched_class->prio_changed(rq, p, oldprio);
}

void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
{
	const struct sched_class *class;

	if (p->sched_class == rq->curr->sched_class) {
		rq->curr->sched_class->check_preempt_curr(rq, p, flags);
	} else {
		for_each_class(class) {
			if (class == rq->curr->sched_class)
				break;
			if (class == p->sched_class) {
				resched_curr(rq);
				break;
			}
		}
	}

	/*
	 * A queue event has occurred, and we're going to schedule.  In
	 * this case, we can save a useless back to back clock update.
	 */
	if (task_on_rq_queued(rq->curr) && test_tsk_need_resched(rq->curr))
		rq_clock_skip_update(rq, true);
}

#ifdef CONFIG_SMP
/*
 * This is how migration works:
 *
 * 1) we invoke migration_cpu_stop() on the target CPU using
 *    stop_one_cpu().
 * 2) stopper starts to run (implicitly forcing the migrated thread
 *    off the CPU)
 * 3) it checks whether the migrated task is still in the wrong runqueue.
 * 4) if it's in the wrong runqueue then the migration thread removes
 *    it and puts it into the right queue.
 * 5) stopper completes and stop_one_cpu() returns and the migration
 *    is done.
 */

/*
 * move_queued_task - move a queued task to new rq.
 *
 * Returns (locked) new rq. Old rq's lock is released.
 */
static struct rq *move_queued_task(struct rq *rq, struct task_struct *p, int new_cpu)
{
	lockdep_assert_held(&rq->lock);

	p->on_rq = TASK_ON_RQ_MIGRATING;
	dequeue_task(rq, p, 0);
	set_task_cpu(p, new_cpu);
	raw_spin_unlock(&rq->lock);

	rq = cpu_rq(new_cpu);

	raw_spin_lock(&rq->lock);
	BUG_ON(task_cpu(p) != new_cpu);
	enqueue_task(rq, p, 0);
	p->on_rq = TASK_ON_RQ_QUEUED;
	check_preempt_curr(rq, p, 0);

	return rq;
}

struct migration_arg {
	struct task_struct *task;
	int dest_cpu;
};
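
/*
 * Usage sketch (informational): __set_cpus_allowed_ptr() below packages a
 * request as
 *
 *	struct migration_arg arg = { p, dest_cpu };
 *	stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
 *
 * so the stopper thread on the task's CPU performs the actual move.
 */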

/*
 * Move (not current) task off this CPU, onto the destination CPU. We're doing
 * this because either it can't run here any more (set_cpus_allowed()
 * away from this CPU, or CPU going down), or because we're
 * attempting to rebalance this task on exec (sched_exec).
 *
 * So we race with normal scheduler movements, but that's OK, as long
 * as the task is no longer on this CPU.
 */
static struct rq *__migrate_task(struct rq *rq, struct task_struct *p, int dest_cpu)
{
	if (unlikely(!cpu_active(dest_cpu)))
		return rq;

	/* Affinity changed (again). */
	if (!cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p)))
		return rq;

	rq = move_queued_task(rq, p, dest_cpu);

	return rq;
}

/*
 * migration_cpu_stop - this will be executed by a highprio stopper thread
 * and performs thread migration by bumping thread off CPU then
 * 'pushing' onto another runqueue.
 */
static int migration_cpu_stop(void *data)
{
	struct migration_arg *arg = data;
	struct task_struct *p = arg->task;
	struct rq *rq = this_rq();

	/*
	 * The original target CPU might have gone down and we might
	 * be on another CPU but it doesn't matter.
	 */
	local_irq_disable();
	/*
	 * We need to explicitly wake pending tasks before running
	 * __migrate_task() such that we will not miss enforcing cpus_allowed
	 * during wakeups, see set_cpus_allowed_ptr()'s TASK_WAKING test.
	 */
	sched_ttwu_pending();

	raw_spin_lock(&p->pi_lock);
	raw_spin_lock(&rq->lock);
	/*
	 * If task_rq(p) != rq, it cannot be migrated here, because we're
	 * holding rq->lock, if p->on_rq == 0 it cannot get enqueued because
	 * we're holding p->pi_lock.
	 */
	if (task_rq(p) == rq) {
		if (task_on_rq_queued(p))
			rq = __migrate_task(rq, p, arg->dest_cpu);
		else
			p->wake_cpu = arg->dest_cpu;
	}
	raw_spin_unlock(&rq->lock);
	raw_spin_unlock(&p->pi_lock);

	local_irq_enable();
	return 0;
}

/*
 * sched_class::set_cpus_allowed must do the below, but is not required to
 * actually call this function.
 */
void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask)
{
	cpumask_copy(&p->cpus_allowed, new_mask);
	p->nr_cpus_allowed = cpumask_weight(new_mask);
}

void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
{
	struct rq *rq = task_rq(p);
	bool queued, running;

	lockdep_assert_held(&p->pi_lock);

	queued = task_on_rq_queued(p);
	running = task_current(rq, p);

	if (queued) {
		/*
		 * Because __kthread_bind() calls this on blocked tasks without
		 * holding rq->lock.
		 */
		lockdep_assert_held(&rq->lock);
		dequeue_task(rq, p, DEQUEUE_SAVE);
	}
	if (running)
		put_prev_task(rq, p);

	p->sched_class->set_cpus_allowed(p, new_mask);

	if (queued)
		enqueue_task(rq, p, ENQUEUE_RESTORE);
	if (running)
		set_curr_task(rq, p);
}

/*
 * Change a given task's CPU affinity. Migrate the thread to a
 * proper CPU and schedule it away if the CPU it's executing on
 * is removed from the allowed bitmask.
 *
 * NOTE: the caller must have a valid reference to the task, the
 * task must not exit() & deallocate itself prematurely. The
 * call is not atomic; no spinlocks may be held.
 */
static int __set_cpus_allowed_ptr(struct task_struct *p,
				  const struct cpumask *new_mask, bool check)
{
	const struct cpumask *cpu_valid_mask = cpu_active_mask;
	unsigned int dest_cpu;
	struct rq_flags rf;
	struct rq *rq;
	int ret = 0;

	rq = task_rq_lock(p, &rf);
	update_rq_clock(rq);

	if (p->flags & PF_KTHREAD) {
		/*
		 * Kernel threads are allowed on online && !active CPUs
		 */
		cpu_valid_mask = cpu_online_mask;
	}

	/*
	 * Must re-check here, to close a race against __kthread_bind(),
	 * sched_setaffinity() is not guaranteed to observe the flag.
	 */
	if (check && (p->flags & PF_NO_SETAFFINITY)) {
		ret = -EINVAL;
		goto out;
	}

	if (cpumask_equal(&p->cpus_allowed, new_mask))
		goto out;

	if (!cpumask_intersects(new_mask, cpu_valid_mask)) {
		ret = -EINVAL;
		goto out;
	}

	do_set_cpus_allowed(p, new_mask);

	if (p->flags & PF_KTHREAD) {
		/*
		 * For kernel threads that do indeed end up on online &&
		 * !active we want to ensure they are strict per-CPU threads.
		 */
		WARN_ON(cpumask_intersects(new_mask, cpu_online_mask) &&
			!cpumask_intersects(new_mask, cpu_active_mask) &&
			p->nr_cpus_allowed != 1);
	}

	/* Can the task run on the task's current CPU? If so, we're done */
	if (cpumask_test_cpu(task_cpu(p), new_mask))
		goto out;

	dest_cpu = cpumask_any_and(cpu_valid_mask, new_mask);
	if (task_running(rq, p) || p->state == TASK_WAKING) {
		struct migration_arg arg = { p, dest_cpu };
		/* Need help from migration thread: drop lock and wait. */
		task_rq_unlock(rq, p, &rf);
		stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
		tlb_migrate_finish(p->mm);
		return 0;
	} else if (task_on_rq_queued(p)) {
		/*
		 * OK, since we're going to drop the lock immediately
		 * afterwards anyway.
		 */
		rq_unpin_lock(rq, &rf);
		rq = move_queued_task(rq, p, dest_cpu);
		rq_repin_lock(rq, &rf);
	}
out:
	task_rq_unlock(rq, p, &rf);

	return ret;
}

int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
{
	return __set_cpus_allowed_ptr(p, new_mask, false);
}
EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr);
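
/*
 * Usage sketch (informational): a caller that wants to pin a task to a
 * single CPU can do
 *
 *	set_cpus_allowed_ptr(p, cpumask_of(target_cpu));
 *
 * (target_cpu being a hypothetical CPU id); the task is migrated away if
 * it is currently running outside the new mask.
 */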

void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
{
#ifdef CONFIG_SCHED_DEBUG
	/*
	 * We should never call set_task_cpu() on a blocked task,
	 * ttwu() will sort out the placement.
	 */
	WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING &&
			!p->on_rq);

	/*
	 * Migrating fair class task must have p->on_rq = TASK_ON_RQ_MIGRATING,
	 * because schedstat_wait_{start,end} rebase migrating task's wait_start
	 * time relying on p->on_rq.
	 */
	WARN_ON_ONCE(p->state == TASK_RUNNING &&
		     p->sched_class == &fair_sched_class &&
		     (p->on_rq && !task_on_rq_migrating(p)));

#ifdef CONFIG_LOCKDEP
	/*
	 * The caller should hold either p->pi_lock or rq->lock, when changing
	 * a task's CPU. ->pi_lock for waking tasks, rq->lock for runnable tasks.
	 *
	 * sched_move_task() holds both and thus holding either pins the cgroup,
	 * see task_group().
	 *
	 * Furthermore, all task_rq users should acquire both locks, see
	 * task_rq_lock().
	 */
	WARN_ON_ONCE(debug_locks && !(lockdep_is_held(&p->pi_lock) ||
				      lockdep_is_held(&task_rq(p)->lock)));
#endif
#endif

	trace_sched_migrate_task(p, new_cpu);

	if (task_cpu(p) != new_cpu) {
		if (p->sched_class->migrate_task_rq)
			p->sched_class->migrate_task_rq(p);
		p->se.nr_migrations++;
		perf_event_task_migrate(p);
	}

	__set_task_cpu(p, new_cpu);
}

static void __migrate_swap_task(struct task_struct *p, int cpu)
{
	if (task_on_rq_queued(p)) {
		struct rq *src_rq, *dst_rq;

		src_rq =