Commit a39b8633 authored by Linus Torvalds's avatar Linus Torvalds
Browse files

Merge branch 'sched-core-for-linus' of...

Merge branch 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip

* 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip: (31 commits)
  sched: fix warning in fs/proc/base.c
  schedstat: consolidate per-task cpu runtime stats
  sched: use RCU variant of list traversal in for_each_leaf_rt_rq()
  sched, cpuacct: export percpu cpuacct cgroup stats
  sched, cpuacct: refactoring cpuusage_read / cpuusage_write
  sched: optimize update_curr()
  sched: fix wakeup preemption clock
  sched: add missing arch_update_cpu_topology() call
  sched: let arch_update_cpu_topology indicate if topology changed
  sched: idle_balance() does not call load_balance_newidle()
  sched: fix sd_parent_degenerate on non-numa smp machine
  sched: add uid information to sched_debug for CONFIG_USER_SCHED
  sched: move double_unlock_balance() higher
  sched: update comment for move_task_off_dead_cpu
  sched: fix inconsistency when redistribute per-cpu tg->cfs_rq shares
  sched/rt: removed unneeded defintion
  sched: add hierarchical accounting to cpu accounting controller
  sched: include group statistics in /proc/sched_debug
  sched: rename SCHED_NO_NO_OMIT_FRAME_POINTER => SCHED_OMIT_FRAME_POINTER
  sched: clean up SCHED_CPUMASK_ALLOC
  ...
parents b0f4b285 4e202284
CPU Accounting Controller
-------------------------
The CPU accounting controller is used to group tasks using cgroups and
account the CPU usage of these groups of tasks.
The CPU accounting controller supports multi-hierarchy groups. An accounting
group accumulates the CPU usage of all of its child groups and the tasks
directly present in its group.
Accounting groups can be created by first mounting the cgroup filesystem.
# mkdir /cgroups
# mount -t cgroup -ocpuacct none /cgroups
With the above step, the initial or the parent accounting group
becomes visible at /cgroups. At bootup, this group includes all the
tasks in the system. /cgroups/tasks lists the tasks in this cgroup.
/cgroups/cpuacct.usage gives the CPU time (in nanoseconds) obtained by
this group which is essentially the CPU time obtained by all the tasks
in the system.
New accounting groups can be created under the parent group /cgroups.
# cd /cgroups
# mkdir g1
# echo $$ > g1
The above steps create a new group g1 and move the current shell
process (bash) into it. CPU time consumed by this bash and its children
can be obtained from g1/cpuacct.usage and the same is accumulated in
/cgroups/cpuacct.usage also.
......@@ -8,7 +8,7 @@ Context switch
By default, the switch_to arch function is called with the runqueue
locked. This is usually not a problem unless switch_to may need to
take the runqueue lock. This is usually due to a wake up operation in
the context switch. See include/asm-ia64/system.h for an example.
the context switch. See arch/ia64/include/asm/system.h for an example.
To request the scheduler call switch_to with the runqueue unlocked,
you must `#define __ARCH_WANT_UNLOCKED_CTXSW` in a header file
......@@ -23,7 +23,7 @@ disabled. Interrupts may be enabled over the call if it is likely to
introduce a significant interrupt latency by adding the line
`#define __ARCH_WANT_INTERRUPTS_ON_CTXSW` in the same place as for
unlocked context switches. This define also implies
`__ARCH_WANT_UNLOCKED_CTXSW`. See include/asm-arm/system.h for an
`__ARCH_WANT_UNLOCKED_CTXSW`. See arch/arm/include/asm/system.h for an
example.
......
......@@ -99,7 +99,7 @@ config GENERIC_IOMAP
bool
default y
config SCHED_NO_NO_OMIT_FRAME_POINTER
config SCHED_OMIT_FRAME_POINTER
bool
default y
......
......@@ -273,7 +273,7 @@ config GENERIC_CALIBRATE_DELAY
bool
default y
config SCHED_NO_NO_OMIT_FRAME_POINTER
config SCHED_OMIT_FRAME_POINTER
bool
default y
......
......@@ -653,7 +653,7 @@ config GENERIC_CMOS_UPDATE
bool
default y
config SCHED_NO_NO_OMIT_FRAME_POINTER
config SCHED_OMIT_FRAME_POINTER
bool
default y
......
......@@ -141,7 +141,7 @@ config GENERIC_NVRAM
bool
default y if PPC32
config SCHED_NO_NO_OMIT_FRAME_POINTER
config SCHED_OMIT_FRAME_POINTER
bool
default y
......
......@@ -212,7 +212,7 @@ static void update_cpu_core_map(void)
cpu_core_map[cpu] = cpu_coregroup_map(cpu);
}
void arch_update_cpu_topology(void)
int arch_update_cpu_topology(void)
{
struct tl_info *info = tl_info;
struct sys_device *sysdev;
......@@ -221,7 +221,7 @@ void arch_update_cpu_topology(void)
if (!machine_has_topology) {
update_cpu_core_map();
topology_update_polarization_simple();
return;
return 0;
}
stsi(info, 15, 1, 2);
tl_to_cores(info);
......@@ -230,6 +230,7 @@ void arch_update_cpu_topology(void)
sysdev = get_cpu_sysdev(cpu);
kobject_uevent(&sysdev->kobj, KOBJ_CHANGE);
}
return 1;
}
static void topology_work_fn(struct work_struct *work)
......
......@@ -368,10 +368,10 @@ config X86_RDC321X
as R-8610-(G).
If you don't have one of these chips, you should say N here.
config SCHED_NO_NO_OMIT_FRAME_POINTER
config SCHED_OMIT_FRAME_POINTER
def_bool y
prompt "Single-depth WCHAN output"
depends on X86_32
depends on X86
help
Calculate simpler /proc/<PID>/wchan values. If this option
is disabled then wchan values will recurse back to the
......
......@@ -347,8 +347,8 @@ static int proc_pid_wchan(struct task_struct *task, char *buffer)
static int proc_pid_schedstat(struct task_struct *task, char *buffer)
{
return sprintf(buffer, "%llu %llu %lu\n",
task->sched_info.cpu_time,
task->sched_info.run_delay,
(unsigned long long)task->se.sum_exec_runtime,
(unsigned long long)task->sched_info.run_delay,
task->sched_info.pcount);
}
#endif
......
......@@ -23,7 +23,7 @@
*/
#if defined(CONFIG_FRAME_POINTER) || \
!defined(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER)
!defined(CONFIG_SCHED_OMIT_FRAME_POINTER)
#define M32R_PUSH_FP " push fp\n"
#define M32R_POP_FP " pop fp\n"
#else
......
......@@ -260,8 +260,6 @@ static inline int select_nohz_load_balancer(int cpu)
}
#endif
extern unsigned long rt_needs_cpu(int cpu);
/*
* Only dump TASK_* tasks. (0 for all tasks)
*/
......@@ -669,8 +667,7 @@ struct reclaim_state;
struct sched_info {
/* cumulative counters */
unsigned long pcount; /* # of times run on this cpu */
unsigned long long cpu_time, /* time spent on the cpu */
run_delay; /* time spent waiting on a runqueue */
unsigned long long run_delay; /* time spent waiting on a runqueue */
/* timestamps */
unsigned long long last_arrival,/* when we last ran on a cpu */
......@@ -2210,6 +2207,7 @@ extern void normalize_rt_tasks(void);
extern struct task_group init_task_group;
#ifdef CONFIG_USER_SCHED
extern struct task_group root_task_group;
extern void set_tg_uid(struct user_struct *user);
#endif
extern struct task_group *sched_create_group(struct task_group *parent);
......
......@@ -49,7 +49,7 @@
for_each_online_node(node) \
if (nr_cpus_node(node))
void arch_update_cpu_topology(void);
int arch_update_cpu_topology(void);
/* Conform to ACPI 2.0 SLIT distance definitions */
#define LOCAL_DISTANCE 10
......
......@@ -19,7 +19,6 @@ CFLAGS_REMOVE_mutex-debug.o = -pg
CFLAGS_REMOVE_rtmutex-debug.o = -pg
CFLAGS_REMOVE_cgroup-debug.o = -pg
CFLAGS_REMOVE_sched_clock.o = -pg
CFLAGS_REMOVE_sched.o = -pg
endif
obj-$(CONFIG_FREEZER) += freezer.o
......@@ -90,7 +89,7 @@ obj-$(CONFIG_FUNCTION_TRACER) += trace/
obj-$(CONFIG_TRACING) += trace/
obj-$(CONFIG_SMP) += sched_cpupri.o
ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y)
ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y)
# According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
# needed for x86 only. Why this used to be enabled for all architectures is beyond
# me. I suspect most platforms don't need this, but until we know that for sure
......
......@@ -127,7 +127,7 @@ int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk)
*/
t1 = tsk->sched_info.pcount;
t2 = tsk->sched_info.run_delay;
t3 = tsk->sched_info.cpu_time;
t3 = tsk->se.sum_exec_runtime;
d->cpu_count += t1;
......
......@@ -267,6 +267,10 @@ struct task_group {
struct cgroup_subsys_state css;
#endif
#ifdef CONFIG_USER_SCHED
uid_t uid;
#endif
#ifdef CONFIG_FAIR_GROUP_SCHED
/* schedulable entities of this group on each cpu */
struct sched_entity **se;
......@@ -292,6 +296,12 @@ struct task_group {
#ifdef CONFIG_USER_SCHED
/* Helper function to pass uid information to create_sched_user() */
void set_tg_uid(struct user_struct *user)
{
user->tg->uid = user->uid;
}
/*
* Root task group.
* Every UID task group (including init_task_group aka UID-0) will
......@@ -594,6 +604,8 @@ struct rq {
#ifdef CONFIG_SCHEDSTATS
/* latency stats */
struct sched_info rq_sched_info;
unsigned long long rq_cpu_time;
/* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? */
/* sys_sched_yield() stats */
unsigned int yld_exp_empty;
......@@ -711,45 +723,18 @@ static __read_mostly char *sched_feat_names[] = {
#undef SCHED_FEAT
static int sched_feat_open(struct inode *inode, struct file *filp)
{
filp->private_data = inode->i_private;
return 0;
}
static ssize_t
sched_feat_read(struct file *filp, char __user *ubuf,
size_t cnt, loff_t *ppos)
static int sched_feat_show(struct seq_file *m, void *v)
{
char *buf;
int r = 0;
int len = 0;
int i;
for (i = 0; sched_feat_names[i]; i++) {
len += strlen(sched_feat_names[i]);
len += 4;
}
buf = kmalloc(len + 2, GFP_KERNEL);
if (!buf)
return -ENOMEM;
for (i = 0; sched_feat_names[i]; i++) {
if (sysctl_sched_features & (1UL << i))
r += sprintf(buf + r, "%s ", sched_feat_names[i]);
else
r += sprintf(buf + r, "NO_%s ", sched_feat_names[i]);
if (!(sysctl_sched_features & (1UL << i)))
seq_puts(m, "NO_");
seq_printf(m, "%s ", sched_feat_names[i]);
}
seq_puts(m, "\n");
r += sprintf(buf + r, "\n");
WARN_ON(r >= len + 2);
r = simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
kfree(buf);
return r;
return 0;
}
static ssize_t
......@@ -794,10 +779,17 @@ sched_feat_write(struct file *filp, const char __user *ubuf,
return cnt;
}
static int sched_feat_open(struct inode *inode, struct file *filp)
{
return single_open(filp, sched_feat_show, NULL);
}
static struct file_operations sched_feat_fops = {
.open = sched_feat_open,
.read = sched_feat_read,
.write = sched_feat_write,
.open = sched_feat_open,
.write = sched_feat_write,
.read = seq_read,
.llseek = seq_lseek,
.release = single_release,
};
static __init int sched_init_debug(void)
......@@ -1482,27 +1474,13 @@ static void
update_group_shares_cpu(struct task_group *tg, int cpu,
unsigned long sd_shares, unsigned long sd_rq_weight)
{
int boost = 0;
unsigned long shares;
unsigned long rq_weight;
if (!tg->se[cpu])
return;
rq_weight = tg->cfs_rq[cpu]->load.weight;
/*
* If there are currently no tasks on the cpu pretend there is one of
* average load so that when a new task gets to run here it will not
* get delayed by group starvation.
*/
if (!rq_weight) {
boost = 1;
rq_weight = NICE_0_LOAD;
}
if (unlikely(rq_weight > sd_rq_weight))
rq_weight = sd_rq_weight;
rq_weight = tg->cfs_rq[cpu]->rq_weight;
/*
* \Sum shares * rq_weight
......@@ -1510,7 +1488,7 @@ update_group_shares_cpu(struct task_group *tg, int cpu,
* \Sum rq_weight
*
*/
shares = (sd_shares * rq_weight) / (sd_rq_weight + 1);
shares = (sd_shares * rq_weight) / sd_rq_weight;
shares = clamp_t(unsigned long, shares, MIN_SHARES, MAX_SHARES);
if (abs(shares - tg->se[cpu]->load.weight) >
......@@ -1519,11 +1497,7 @@ update_group_shares_cpu(struct task_group *tg, int cpu,
unsigned long flags;
spin_lock_irqsave(&rq->lock, flags);
/*
* record the actual number of shares, not the boosted amount.
*/
tg->cfs_rq[cpu]->shares = boost ? 0 : shares;
tg->cfs_rq[cpu]->rq_weight = rq_weight;
tg->cfs_rq[cpu]->shares = shares;
__set_se_shares(tg->se[cpu], shares);
spin_unlock_irqrestore(&rq->lock, flags);
......@@ -1537,13 +1511,23 @@ update_group_shares_cpu(struct task_group *tg, int cpu,
*/
static int tg_shares_up(struct task_group *tg, void *data)
{
unsigned long rq_weight = 0;
unsigned long weight, rq_weight = 0;
unsigned long shares = 0;
struct sched_domain *sd = data;
int i;
for_each_cpu_mask(i, sd->span) {
rq_weight += tg->cfs_rq[i]->load.weight;
/*
* If there are currently no tasks on the cpu pretend there
* is one of average load so that when a new task gets to
* run here it will not get delayed by group starvation.
*/
weight = tg->cfs_rq[i]->load.weight;
if (!weight)
weight = NICE_0_LOAD;
tg->cfs_rq[i]->rq_weight = weight;
rq_weight += weight;
shares += tg->cfs_rq[i]->shares;
}
......@@ -1553,9 +1537,6 @@ static int tg_shares_up(struct task_group *tg, void *data)
if (!sd->parent || !(sd->parent->flags & SD_LOAD_BALANCE))
shares = tg->shares;
if (!rq_weight)
rq_weight = cpus_weight(sd->span) * NICE_0_LOAD;
for_each_cpu_mask(i, sd->span)
update_group_shares_cpu(tg, i, shares, rq_weight);
......@@ -1620,6 +1601,39 @@ static inline void update_shares_locked(struct rq *rq, struct sched_domain *sd)
#endif
/*
* double_lock_balance - lock the busiest runqueue, this_rq is locked already.
*/
static int double_lock_balance(struct rq *this_rq, struct rq *busiest)
__releases(this_rq->lock)
__acquires(busiest->lock)
__acquires(this_rq->lock)
{
int ret = 0;
if (unlikely(!irqs_disabled())) {
/* printk() doesn't work good under rq->lock */
spin_unlock(&this_rq->lock);
BUG_ON(1);
}
if (unlikely(!spin_trylock(&busiest->lock))) {
if (busiest < this_rq) {
spin_unlock(&this_rq->lock);
spin_lock(&busiest->lock);
spin_lock_nested(&this_rq->lock, SINGLE_DEPTH_NESTING);
ret = 1;
} else
spin_lock_nested(&busiest->lock, SINGLE_DEPTH_NESTING);
}
return ret;
}
static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest)
__releases(busiest->lock)
{
spin_unlock(&busiest->lock);
lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_);
}
#endif
#ifdef CONFIG_FAIR_GROUP_SCHED
......@@ -2264,6 +2278,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
smp_wmb();
rq = task_rq_lock(p, &flags);
update_rq_clock(rq);
old_state = p->state;
if (!(old_state & state))
goto out;
......@@ -2321,7 +2336,6 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
schedstat_inc(p, se.nr_wakeups_local);
else
schedstat_inc(p, se.nr_wakeups_remote);
update_rq_clock(rq);
activate_task(rq, p, 1);
success = 1;
......@@ -2821,40 +2835,6 @@ static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
__release(rq2->lock);
}
/*
* double_lock_balance - lock the busiest runqueue, this_rq is locked already.
*/
static int double_lock_balance(struct rq *this_rq, struct rq *busiest)
__releases(this_rq->lock)
__acquires(busiest->lock)
__acquires(this_rq->lock)
{
int ret = 0;
if (unlikely(!irqs_disabled())) {
/* printk() doesn't work good under rq->lock */
spin_unlock(&this_rq->lock);
BUG_ON(1);
}
if (unlikely(!spin_trylock(&busiest->lock))) {
if (busiest < this_rq) {
spin_unlock(&this_rq->lock);
spin_lock(&busiest->lock);
spin_lock_nested(&this_rq->lock, SINGLE_DEPTH_NESTING);
ret = 1;
} else
spin_lock_nested(&busiest->lock, SINGLE_DEPTH_NESTING);
}
return ret;
}
static void double_unlock_balance(struct rq *this_rq, struct rq *busiest)
__releases(busiest->lock)
{
spin_unlock(&busiest->lock);
lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_);
}
/*
* If dest_cpu is allowed for this process, migrate the task to it.
* This is accomplished by forcing the cpu_allowed mask to only
......@@ -3716,7 +3696,7 @@ load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd,
static void idle_balance(int this_cpu, struct rq *this_rq)
{
struct sched_domain *sd;
int pulled_task = -1;
int pulled_task = 0;
unsigned long next_balance = jiffies + HZ;
cpumask_t tmpmask;
......@@ -6150,7 +6130,6 @@ static int __migrate_task_irq(struct task_struct *p, int src_cpu, int dest_cpu)
/*
* Figure out where task on dead CPU should go, use force if necessary.
* NOTE: interrupts should be disabled by the caller
*/
static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p)
{
......@@ -6662,28 +6641,6 @@ early_initcall(migration_init);
#ifdef CONFIG_SCHED_DEBUG
static inline const char *sd_level_to_string(enum sched_domain_level lvl)
{
switch (lvl) {
case SD_LV_NONE:
return "NONE";
case SD_LV_SIBLING:
return "SIBLING";
case SD_LV_MC:
return "MC";
case SD_LV_CPU:
return "CPU";
case SD_LV_NODE:
return "NODE";
case SD_LV_ALLNODES:
return "ALLNODES";
case SD_LV_MAX:
return "MAX";
}
return "MAX";
}
static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
cpumask_t *groupmask)
{
......@@ -6703,8 +6660,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
return -1;
}
printk(KERN_CONT "span %s level %s\n",
str, sd_level_to_string(sd->level));
printk(KERN_CONT "span %s level %s\n", str, sd->name);
if (!cpu_isset(cpu, sd->span)) {
printk(KERN_ERR "ERROR: domain->span does not contain "
......@@ -6840,6 +6796,8 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
SD_BALANCE_EXEC |
SD_SHARE_CPUPOWER |
SD_SHARE_PKG_RESOURCES);
if (nr_node_ids == 1)
pflags &= ~SD_SERIALIZE;
}
if (~cflags & pflags)
return 0;
......@@ -7360,13 +7318,21 @@ struct allmasks {
};
#if NR_CPUS > 128
#define SCHED_CPUMASK_ALLOC 1
#define SCHED_CPUMASK_FREE(v) kfree(v)
#define SCHED_CPUMASK_DECLARE(v) struct allmasks *v
#define SCHED_CPUMASK_DECLARE(v) struct allmasks *v
static inline void sched_cpumask_alloc(struct allmasks **masks)
{
*masks = kmalloc(sizeof(**masks), GFP_KERNEL);
}
static inline void sched_cpumask_free(struct allmasks *masks)
{
kfree(masks);
}
#else
#define SCHED_CPUMASK_ALLOC 0
#define SCHED_CPUMASK_FREE(v)
#define SCHED_CPUMASK_DECLARE(v) struct allmasks _v, *v = &_v
#define SCHED_CPUMASK_DECLARE(v) struct allmasks _v, *v = &_v
static inline void sched_cpumask_alloc(struct allmasks **masks)
{ }
static inline void sched_cpumask_free(struct allmasks *masks)
{ }
#endif
#define SCHED_CPUMASK_VAR(v, a) cpumask_t *v = (cpumask_t *) \
......@@ -7442,9 +7408,8 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
return -ENOMEM;
}
#if SCHED_CPUMASK_ALLOC
/* get space for all scratch cpumask variables */
allmasks = kmalloc(sizeof(*allmasks), GFP_KERNEL);
sched_cpumask_alloc(&allmasks);
if (!allmasks) {
printk(KERN_WARNING "Cannot alloc cpumask array\n");
kfree(rd);
......@@ -7453,7 +7418,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
#endif
return -ENOMEM;
}
#endif
tmpmask = (cpumask_t *)allmasks;
......@@ -7707,13 +7672,13 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
cpu_attach_domain(sd, rd, i);
}
SCHED_CPUMASK_FREE((void *)allmasks);
sched_cpumask_free(allmasks);
return 0;
#ifdef CONFIG_NUMA
error:
free_sched_groups(cpu_map, tmpmask);
SCHED_CPUMASK_FREE((void *)allmasks);
sched_cpumask_free(allmasks);
kfree(rd);
return -ENOMEM;
#endif
......@@ -7736,8 +7701,14 @@ static struct sched_domain_attr *dattr_cur;
*/
static cpumask_t fallback_doms;
void __attribute__((weak)) arch_update_cpu_topology(void)
/*
* arch_update_cpu_topology lets virtualized architectures update the
* cpu core maps. It is supposed to return 1 if the topology changed
* or 0 if it stayed the same.
*/
int __attribute__((weak)) arch_update_cpu_topology(void)
{
return 0;
}