/*
 * Performance events core code:
 *
 *  Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
 *  Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar
 *  Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
 *  Copyright    2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
 *
 * For licensing details see kernel-base/COPYING
 */

#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/cpu.h>
#include <linux/smp.h>
#include <linux/file.h>
#include <linux/poll.h>
#include <linux/sysfs.h>
#include <linux/dcache.h>
#include <linux/percpu.h>
#include <linux/ptrace.h>
#include <linux/vmstat.h>
#include <linux/hardirq.h>
#include <linux/rculist.h>
#include <linux/uaccess.h>
#include <linux/syscalls.h>
#include <linux/anon_inodes.h>
#include <linux/kernel_stat.h>
#include <linux/perf_event.h>

#include <asm/irq_regs.h>

/*
 * Each CPU has a list of per CPU events:
 */
DEFINE_PER_CPU(struct perf_cpu_context, perf_cpu_context);

int perf_max_events __read_mostly = 1;
static int perf_reserved_percpu __read_mostly;
static int perf_overcommit __read_mostly = 1;

static atomic_t nr_events __read_mostly;
static atomic_t nr_mmap_events __read_mostly;
static atomic_t nr_comm_events __read_mostly;
static atomic_t nr_task_events __read_mostly;

/*
 * perf event paranoia level:
 *  -1 - not paranoid at all
 *   0 - disallow raw tracepoint access for unpriv
 *   1 - disallow cpu events for unpriv
 *   2 - disallow kernel profiling for unpriv
 */
int sysctl_perf_event_paranoid __read_mostly = 1;

static inline bool perf_paranoid_tracepoint_raw(void)
{
	return sysctl_perf_event_paranoid > -1;
}

static inline bool perf_paranoid_cpu(void)
{
	return sysctl_perf_event_paranoid > 0;
}

static inline bool perf_paranoid_kernel(void)
{
	return sysctl_perf_event_paranoid > 1;
}

int sysctl_perf_event_mlock __read_mostly = 512; /* 'free' kb per user */

/*
 * max perf event sample rate
 */
int sysctl_perf_event_sample_rate __read_mostly = 100000;
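
/*
 * The three knobs above are exposed as sysctls (registered in
 * kernel/sysctl.c), so the values here are only boot-time defaults.
 * For example, an administrator can relax the paranoia check with:
 *
 *	# echo -1 > /proc/sys/kernel/perf_event_paranoid
 *	# echo 1024 > /proc/sys/kernel/perf_event_mlock_kb
 */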

static atomic64_t perf_event_id;

/*
 * Lock for (sysadmin-configurable) event reservations:
 */
static DEFINE_SPINLOCK(perf_resource_lock);

/*
 * Architecture provided APIs - weak aliases:
 */
extern __weak const struct pmu *hw_perf_event_init(struct perf_event *event)
{
	return NULL;
}

void __weak hw_perf_disable(void)		{ barrier(); }
void __weak hw_perf_enable(void)		{ barrier(); }

void __weak hw_perf_event_setup(int cpu)	{ barrier(); }
void __weak hw_perf_event_setup_online(int cpu)	{ barrier(); }

int __weak
hw_perf_group_sched_in(struct perf_event *group_leader,
	       struct perf_cpu_context *cpuctx,
	       struct perf_event_context *ctx, int cpu)
{
	return 0;
}
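
/*
 * group_sched_in() below interprets the return value of
 * hw_perf_group_sched_in() as follows: a positive value means the
 * architecture code scheduled the whole group itself, 0 (the weak
 * default above) means "fall back to the generic per-event path",
 * and a negative value is an error.
 */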

void __weak perf_event_print_debug(void)	{ }

static DEFINE_PER_CPU(int, perf_disable_count);
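/*
 * perf_disable()/perf_enable() calls nest; hw_perf_enable() is only
 * called again once this per-cpu disable count drops back to zero.
 */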

void __perf_disable(void)
{
	__get_cpu_var(perf_disable_count)++;
}

bool __perf_enable(void)
{
	return !--__get_cpu_var(perf_disable_count);
}

void perf_disable(void)
{
	__perf_disable();
	hw_perf_disable();
}

void perf_enable(void)
{
	if (__perf_enable())
		hw_perf_enable();
}

static void get_ctx(struct perf_event_context *ctx)
{
	WARN_ON(!atomic_inc_not_zero(&ctx->refcount));
}

static void free_ctx(struct rcu_head *head)
{
	struct perf_event_context *ctx;

	ctx = container_of(head, struct perf_event_context, rcu_head);
	kfree(ctx);
}

static void put_ctx(struct perf_event_context *ctx)
{
	if (atomic_dec_and_test(&ctx->refcount)) {
		if (ctx->parent_ctx)
			put_ctx(ctx->parent_ctx);
		if (ctx->task)
			put_task_struct(ctx->task);
		call_rcu(&ctx->rcu_head, free_ctx);
	}
}
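
/*
 * Contexts are freed via RCU (free_ctx above), so lock-free readers
 * that found a context under rcu_read_lock(), such as
 * perf_lock_task_context() below, never see it disappear from under
 * them.
 */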

static void unclone_ctx(struct perf_event_context *ctx)
{
	if (ctx->parent_ctx) {
		put_ctx(ctx->parent_ctx);
		ctx->parent_ctx = NULL;
	}
}

/*
 * If we inherit events we want to return the parent event id
 * to userspace.
 */
static u64 primary_event_id(struct perf_event *event)
{
	u64 id = event->id;

	if (event->parent)
		id = event->parent->id;

	return id;
}

/*
 * Get the perf_event_context for a task and lock it.
 * This has to cope with the fact that until it is locked,
 * the context could get moved to another task.
 */
static struct perf_event_context *
perf_lock_task_context(struct task_struct *task, unsigned long *flags)
{
	struct perf_event_context *ctx;

	rcu_read_lock();
 retry:
	ctx = rcu_dereference(task->perf_event_ctxp);
	if (ctx) {
		/*
		 * If this context is a clone of another, it might
		 * get swapped for another underneath us by
		 * perf_event_task_sched_out, though the
		 * rcu_read_lock() protects us from any context
		 * getting freed.  Lock the context and check if it
		 * got swapped before we could get the lock, and retry
		 * if so.  If we locked the right context, then it
		 * can't get swapped on us any more.
		 */
		spin_lock_irqsave(&ctx->lock, *flags);
		if (ctx != rcu_dereference(task->perf_event_ctxp)) {
			spin_unlock_irqrestore(&ctx->lock, *flags);
			goto retry;
		}

		if (!atomic_inc_not_zero(&ctx->refcount)) {
			spin_unlock_irqrestore(&ctx->lock, *flags);
			ctx = NULL;
		}
	}
	rcu_read_unlock();
	return ctx;
}

/*
 * Get the context for a task and increment its pin_count so it
 * can't get swapped to another task.  This also increments its
 * reference count so that the context can't get freed.
 */
static struct perf_event_context *perf_pin_task_context(struct task_struct *task)
{
	struct perf_event_context *ctx;
	unsigned long flags;

	ctx = perf_lock_task_context(task, &flags);
	if (ctx) {
		++ctx->pin_count;
		spin_unlock_irqrestore(&ctx->lock, flags);
	}
	return ctx;
}

static void perf_unpin_context(struct perf_event_context *ctx)
{
	unsigned long flags;

	spin_lock_irqsave(&ctx->lock, flags);
	--ctx->pin_count;
	spin_unlock_irqrestore(&ctx->lock, flags);
	put_ctx(ctx);
}
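
/*
 * perf_pin_task_context() and perf_unpin_context() pair up:
 * unpinning drops both the pin_count and the reference that was
 * taken when the context was pinned.
 */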

/*
 * Add an event to the lists for its context.
 * Must be called with ctx->mutex and ctx->lock held.
 */
static void
list_add_event(struct perf_event *event, struct perf_event_context *ctx)
{
	struct perf_event *group_leader = event->group_leader;

	/*
	 * Depending on whether it is a standalone or sibling event,
	 * add it straight to the context's event list, or to the group
	 * leader's sibling list:
	 */
	if (group_leader == event)
		list_add_tail(&event->group_entry, &ctx->group_list);
	else {
		list_add_tail(&event->group_entry, &group_leader->sibling_list);
		group_leader->nr_siblings++;
	}

	list_add_rcu(&event->event_entry, &ctx->event_list);
	ctx->nr_events++;
	if (event->attr.inherit_stat)
		ctx->nr_stat++;
}

/*
 * Remove an event from the lists for its context.
 * Must be called with ctx->mutex and ctx->lock held.
 */
static void
list_del_event(struct perf_event *event, struct perf_event_context *ctx)
{
	struct perf_event *sibling, *tmp;

	if (list_empty(&event->group_entry))
		return;
	ctx->nr_events--;
	if (event->attr.inherit_stat)
		ctx->nr_stat--;

	list_del_init(&event->group_entry);
	list_del_rcu(&event->event_entry);

	if (event->group_leader != event)
		event->group_leader->nr_siblings--;

	/*
	 * If this was a group event with sibling events then
	 * upgrade the siblings to singleton events by adding them
	 * to the context list directly:
	 */
	list_for_each_entry_safe(sibling, tmp, &event->sibling_list, group_entry) {

		list_move_tail(&sibling->group_entry, &ctx->group_list);
		sibling->group_leader = sibling;
	}
}

static void
event_sched_out(struct perf_event *event,
		  struct perf_cpu_context *cpuctx,
		  struct perf_event_context *ctx)
{
	if (event->state != PERF_EVENT_STATE_ACTIVE)
		return;

	event->state = PERF_EVENT_STATE_INACTIVE;
	if (event->pending_disable) {
		event->pending_disable = 0;
		event->state = PERF_EVENT_STATE_OFF;
	}
	event->tstamp_stopped = ctx->time;
	event->pmu->disable(event);
	event->oncpu = -1;

	if (!is_software_event(event))
		cpuctx->active_oncpu--;
	ctx->nr_active--;
	if (event->attr.exclusive || !cpuctx->active_oncpu)
		cpuctx->exclusive = 0;
}

static void
group_sched_out(struct perf_event *group_event,
		struct perf_cpu_context *cpuctx,
		struct perf_event_context *ctx)
{
	struct perf_event *event;

	if (group_event->state != PERF_EVENT_STATE_ACTIVE)
		return;

	event_sched_out(group_event, cpuctx, ctx);

	/*
	 * Schedule out siblings (if any):
	 */
	list_for_each_entry(event, &group_event->sibling_list, group_entry)
		event_sched_out(event, cpuctx, ctx);

	if (group_event->attr.exclusive)
		cpuctx->exclusive = 0;
}

/*
 * Cross CPU call to remove a performance event
 *
 * We disable the event on the hardware level first. After that we
 * remove it from the context list.
 */
static void __perf_event_remove_from_context(void *info)
{
	struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
	struct perf_event *event = info;
	struct perf_event_context *ctx = event->ctx;

	/*
	 * If this is a task context, we need to check whether it is
	 * the current task context of this cpu. If not it has been
	 * scheduled out before the smp call arrived.
	 */
	if (ctx->task && cpuctx->task_ctx != ctx)
		return;

	spin_lock(&ctx->lock);
	/*
	 * Protect the list operation against NMI by disabling the
	 * events on a global level.
	 */
	perf_disable();

	event_sched_out(event, cpuctx, ctx);

	list_del_event(event, ctx);

	if (!ctx->task) {
		/*
		 * Allow more per task events with respect to the
		 * reservation:
		 */
		cpuctx->max_pertask =
			min(perf_max_events - ctx->nr_events,
			    perf_max_events - perf_reserved_percpu);
	}

	perf_enable();
	spin_unlock(&ctx->lock);
}


/*
 * Remove the event from a task's (or a CPU's) list of events.
 *
 * Must be called with ctx->mutex held.
 *
 * CPU events are removed with an smp call. For task events we only
 * call when the task is on a CPU.
 *
 * If event->ctx is a cloned context, callers must make sure that
 * every task struct that event->ctx->task could possibly point to
 * remains valid.  This is OK when called from perf_release since
 * that only calls us on the top-level context, which can't be a clone.
 * When called from perf_event_exit_task, it's OK because the
 * context has been detached from its task.
 */
static void perf_event_remove_from_context(struct perf_event *event)
{
	struct perf_event_context *ctx = event->ctx;
	struct task_struct *task = ctx->task;

	if (!task) {
		/*
		 * Per cpu events are removed via an smp call and
		 * the removal is always successful.
		 */
		smp_call_function_single(event->cpu,
					 __perf_event_remove_from_context,
					 event, 1);
		return;
	}

retry:
	task_oncpu_function_call(task, __perf_event_remove_from_context,
				 event);

	spin_lock_irq(&ctx->lock);
	/*
	 * If the context is active we need to retry the smp call.
	 */
	if (ctx->nr_active && !list_empty(&event->group_entry)) {
		spin_unlock_irq(&ctx->lock);
		goto retry;
	}

	/*
	 * The lock prevents the context from being scheduled in, so
	 * we can remove the event safely if the call above did not
	 * succeed.
	 */
	if (!list_empty(&event->group_entry)) {
		list_del_event(event, ctx);
	}
	spin_unlock_irq(&ctx->lock);
}

static inline u64 perf_clock(void)
{
	return cpu_clock(smp_processor_id());
}

/*
 * Update the record of the current time in a context.
 */
static void update_context_time(struct perf_event_context *ctx)
{
	u64 now = perf_clock();

	ctx->time += now - ctx->timestamp;
	ctx->timestamp = now;
}

/*
 * Update the total_time_enabled and total_time_running fields for an event.
 */
static void update_event_times(struct perf_event *event)
{
	struct perf_event_context *ctx = event->ctx;
	u64 run_end;

	if (event->state < PERF_EVENT_STATE_INACTIVE ||
	    event->group_leader->state < PERF_EVENT_STATE_INACTIVE)
		return;

	event->total_time_enabled = ctx->time - event->tstamp_enabled;

	if (event->state == PERF_EVENT_STATE_INACTIVE)
		run_end = event->tstamp_stopped;
	else
		run_end = ctx->time;

	event->total_time_running = run_end - event->tstamp_running;
}

/*
 * Update total_time_enabled and total_time_running for all events in a group.
 */
static void update_group_times(struct perf_event *leader)
{
	struct perf_event *event;

	update_event_times(leader);
	list_for_each_entry(event, &leader->sibling_list, group_entry)
		update_event_times(event);
}

/*
 * Cross CPU call to disable a performance event
 */
static void __perf_event_disable(void *info)
{
	struct perf_event *event = info;
	struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
	struct perf_event_context *ctx = event->ctx;

	/*
	 * If this is a per-task event, need to check whether this
	 * event's task is the current task on this cpu.
	 */
	if (ctx->task && cpuctx->task_ctx != ctx)
		return;

	spin_lock(&ctx->lock);

	/*
	 * If the event is on, turn it off.
	 * If it is in error state, leave it in error state.
	 */
	if (event->state >= PERF_EVENT_STATE_INACTIVE) {
		update_context_time(ctx);
		update_group_times(event);
		if (event == event->group_leader)
			group_sched_out(event, cpuctx, ctx);
		else
			event_sched_out(event, cpuctx, ctx);
		event->state = PERF_EVENT_STATE_OFF;
	}

	spin_unlock(&ctx->lock);
}

/*
 * Disable an event.
 *
 * If event->ctx is a cloned context, callers must make sure that
 * every task struct that event->ctx->task could possibly point to
 * remains valid.  This condition is satisfied when called through
 * perf_event_for_each_child or perf_event_for_each because they
 * hold the top-level event's child_mutex, so any descendant that
 * goes to exit will block in sync_child_event.
 * When called from perf_pending_event it's OK because event->ctx
 * is the current context on this CPU and preemption is disabled,
 * hence we can't get into perf_event_task_sched_out for this context.
 */
static void perf_event_disable(struct perf_event *event)
{
	struct perf_event_context *ctx = event->ctx;
	struct task_struct *task = ctx->task;

	if (!task) {
		/*
		 * Disable the event on the cpu that it's on
		 */
		smp_call_function_single(event->cpu, __perf_event_disable,
					 event, 1);
		return;
	}

 retry:
	task_oncpu_function_call(task, __perf_event_disable, event);

	spin_lock_irq(&ctx->lock);
	/*
	 * If the event is still active, we need to retry the cross-call.
	 */
	if (event->state == PERF_EVENT_STATE_ACTIVE) {
		spin_unlock_irq(&ctx->lock);
		goto retry;
	}

	/*
	 * Since we have the lock this context can't be scheduled
	 * in, so we can change the state safely.
	 */
	if (event->state == PERF_EVENT_STATE_INACTIVE) {
		update_group_times(event);
		event->state = PERF_EVENT_STATE_OFF;
	}

	spin_unlock_irq(&ctx->lock);
}
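
/*
 * perf_event_remove_from_context(), perf_event_disable() and
 * perf_install_in_context() all follow the same pattern: fire a
 * cross-CPU call at the task, then take ctx->lock and re-check
 * whether the call actually did its work, falling back to modifying
 * the (not currently scheduled) context under the lock.
 */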

static int
event_sched_in(struct perf_event *event,
		 struct perf_cpu_context *cpuctx,
		 struct perf_event_context *ctx,
		 int cpu)
{
	if (event->state <= PERF_EVENT_STATE_OFF)
		return 0;

	event->state = PERF_EVENT_STATE_ACTIVE;
	event->oncpu = cpu;	/* TODO: put 'cpu' into cpuctx->cpu */
	/*
	 * The new state must be visible before we turn it on in the hardware:
	 */
	smp_wmb();

	if (event->pmu->enable(event)) {
		event->state = PERF_EVENT_STATE_INACTIVE;
		event->oncpu = -1;
		return -EAGAIN;
	}

	event->tstamp_running += ctx->time - event->tstamp_stopped;

	if (!is_software_event(event))
		cpuctx->active_oncpu++;
	ctx->nr_active++;

	if (event->attr.exclusive)
		cpuctx->exclusive = 1;

	return 0;
}

static int
group_sched_in(struct perf_event *group_event,
	       struct perf_cpu_context *cpuctx,
	       struct perf_event_context *ctx,
	       int cpu)
{
	struct perf_event *event, *partial_group;
	int ret;

	if (group_event->state == PERF_EVENT_STATE_OFF)
		return 0;

	ret = hw_perf_group_sched_in(group_event, cpuctx, ctx, cpu);
	if (ret)
		return ret < 0 ? ret : 0;

	if (event_sched_in(group_event, cpuctx, ctx, cpu))
		return -EAGAIN;

	/*
	 * Schedule in siblings as one group (if any):
	 */
	list_for_each_entry(event, &group_event->sibling_list, group_entry) {
		if (event_sched_in(event, cpuctx, ctx, cpu)) {
			partial_group = event;
			goto group_error;
		}
	}

	return 0;

group_error:
	/*
	 * Groups can be scheduled in as one unit only, so undo any
	 * partial group before returning:
	 */
	list_for_each_entry(event, &group_event->sibling_list, group_entry) {
		if (event == partial_group)
			break;
		event_sched_out(event, cpuctx, ctx);
	}
	event_sched_out(group_event, cpuctx, ctx);

	return -EAGAIN;
}

/*
 * Return 1 for a group consisting entirely of software events,
 * 0 if the group contains any hardware events.
 */
static int is_software_only_group(struct perf_event *leader)
{
	struct perf_event *event;

	if (!is_software_event(leader))
		return 0;

	list_for_each_entry(event, &leader->sibling_list, group_entry)
		if (!is_software_event(event))
			return 0;

	return 1;
}

/*
 * Work out whether we can put this event group on the CPU now.
 */
static int group_can_go_on(struct perf_event *event,
			   struct perf_cpu_context *cpuctx,
			   int can_add_hw)
{
	/*
	 * Groups consisting entirely of software events can always go on.
	 */
	if (is_software_only_group(event))
		return 1;
	/*
	 * If an exclusive group is already on, no other hardware
	 * events can go on.
	 */
	if (cpuctx->exclusive)
		return 0;
	/*
	 * If this group is exclusive and there are already
	 * events on the CPU, it can't go on.
	 */
	if (event->attr.exclusive && cpuctx->active_oncpu)
		return 0;
	/*
	 * Otherwise, try to add it if all previous groups were able
	 * to go on.
	 */
	return can_add_hw;
}
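
/*
 * add_event_to_ctx() below starts all three timestamps at the
 * context's current time, so an event's enabled/running accounting
 * begins at the moment it is added to its context.
 */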

static void add_event_to_ctx(struct perf_event *event,
			       struct perf_event_context *ctx)
{
	list_add_event(event, ctx);
	event->tstamp_enabled = ctx->time;
	event->tstamp_running = ctx->time;
	event->tstamp_stopped = ctx->time;
}

/*
 * Cross CPU call to install and enable a performance event
 *
 * Must be called with ctx->mutex held
 */
static void __perf_install_in_context(void *info)
{
	struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
	struct perf_event *event = info;
	struct perf_event_context *ctx = event->ctx;
	struct perf_event *leader = event->group_leader;
	int cpu = smp_processor_id();
	int err;

	/*
	 * If this is a task context, we need to check whether it is
	 * the current task context of this cpu. If not it has been
	 * scheduled out before the smp call arrived.
	 * Or possibly this is the right context but it isn't
	 * on this cpu because it had no events.
	 */
	if (ctx->task && cpuctx->task_ctx != ctx) {
		if (cpuctx->task_ctx || ctx->task != current)
			return;
		cpuctx->task_ctx = ctx;
	}

	spin_lock(&ctx->lock);
	ctx->is_active = 1;
	update_context_time(ctx);

	/*
	 * Protect the list operation against NMI by disabling the
	 * events on a global level. NOP for non NMI based events.
	 */
	perf_disable();

	add_event_to_ctx(event, ctx);

	/*
	 * Don't put the event on if it is disabled or if
	 * it is in a group and the group isn't on.
	 */
	if (event->state != PERF_EVENT_STATE_INACTIVE ||
	    (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE))
		goto unlock;

	/*
	 * An exclusive event can't go on if there are already active
	 * hardware events, and no hardware event can go on if there
	 * is already an exclusive event on.
	 */
	if (!group_can_go_on(event, cpuctx, 1))
		err = -EEXIST;
	else
		err = event_sched_in(event, cpuctx, ctx, cpu);

	if (err) {
		/*
		 * This event couldn't go on.  If it is in a group
		 * then we have to pull the whole group off.
		 * If the event group is pinned then put it in error state.
		 */
		if (leader != event)
			group_sched_out(leader, cpuctx, ctx);
		if (leader->attr.pinned) {
			update_group_times(leader);
			leader->state = PERF_EVENT_STATE_ERROR;
		}
	}

	if (!err && !ctx->task && cpuctx->max_pertask)
		cpuctx->max_pertask--;

 unlock:
	perf_enable();

	spin_unlock(&ctx->lock);
}

/*
 * Attach a performance event to a context
 *
 * First we add the event to the list with the hardware enable bit
 * in event->hw_config cleared.
 *
 * If the event is attached to a task which is on a CPU we use an smp
 * call to enable it in the task context. The task might have been
 * scheduled away, but we check this in the smp call again.
 *
 * Must be called with ctx->mutex held.
 */
static void
perf_install_in_context(struct perf_event_context *ctx,
			struct perf_event *event,
			int cpu)
{
	struct task_struct *task = ctx->task;

	if (!task) {
		/*
		 * Per cpu events are installed via an smp call and
		 * the install is always successful.
		 */
		smp_call_function_single(cpu, __perf_install_in_context,
					 event, 1);
		return;
	}

retry:
	task_oncpu_function_call(task, __perf_install_in_context,
				 event);

	spin_lock_irq(&ctx->lock);
	/*
	 * If the context is active we need to retry the smp call.
	 */
	if (ctx->is_active && list_empty(&event->group_entry)) {
		spin_unlock_irq(&ctx->lock);
		goto retry;
	}

	/*
	 * The lock prevents the context from being scheduled in, so
	 * we can add the event safely if the call above did not
	 * succeed.
	 */
	if (list_empty(&event->group_entry))
		add_event_to_ctx(event, ctx);
	spin_unlock_irq(&ctx->lock);
}

/*
 * Put an event into inactive state and update time fields.
 * Enabling the leader of a group effectively enables all
 * the group members that aren't explicitly disabled, so we
 * have to update their ->tstamp_enabled also.
 * Note: this works for group members as well as group leaders
 * since the non-leader members' sibling_lists will be empty.
 */
static void __perf_event_mark_enabled(struct perf_event *event,
					struct perf_event_context *ctx)
{
	struct perf_event *sub;

	event->state = PERF_EVENT_STATE_INACTIVE;
	event->tstamp_enabled = ctx->time - event->total_time_enabled;
	list_for_each_entry(sub, &event->sibling_list, group_entry)
		if (sub->state >= PERF_EVENT_STATE_INACTIVE)
			sub->tstamp_enabled =
				ctx->time - sub->total_time_enabled;
}

/*
 * Cross CPU call to enable a performance event
 */
static void __perf_event_enable(void *info)
{
	struct perf_event *event = info;
	struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
	struct perf_event_context *ctx = event->ctx;
	struct perf_event *leader = event->group_leader;
	int err;

	/*
	 * If this is a per-task event, need to check whether this
	 * event's task is the current task on this cpu.
	 */
	if (ctx->task && cpuctx->task_ctx != ctx) {
		if (cpuctx->task_ctx || ctx->task != current)
			return;
		cpuctx->task_ctx = ctx;
	}

	spin_lock(&ctx->lock);
	ctx->is_active = 1;
	update_context_time(ctx);