// SPDX-License-Identifier: GPL-2.0+
/*
 * User-space Probes (UProbes)
 *
 * Copyright (C) IBM Corporation, 2008-2012
 * Authors:
 *	Srikar Dronamraju
 *	Jim Keniston
 * Copyright (C) 2011-2012 Red Hat, Inc., Peter Zijlstra
 */

#include <linux/kernel.h>
#include <linux/highmem.h>
#include <linux/pagemap.h>	/* read_mapping_page */
#include <linux/slab.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/sched/coredump.h>
#include <linux/export.h>
#include <linux/rmap.h>		/* anon_vma_prepare */
#include <linux/mmu_notifier.h>	/* set_pte_at_notify */
#include <linux/swap.h>		/* try_to_free_swap */
#include <linux/ptrace.h>	/* user_enable_single_step */
#include <linux/kdebug.h>	/* notifier mechanism */
#include "../../mm/internal.h"	/* munlock_vma_page */
#include <linux/percpu-rwsem.h>
#include <linux/task_work.h>
#include <linux/shmem_fs.h>
#include <linux/khugepaged.h>

#include <linux/uprobes.h>

#define UINSNS_PER_PAGE			(PAGE_SIZE/UPROBE_XOL_SLOT_BYTES)
#define MAX_UPROBE_XOL_SLOTS		UINSNS_PER_PAGE

static struct rb_root uprobes_tree = RB_ROOT;
/*
 * allows us to skip the uprobe_mmap if there are no uprobe events active
 * at this time.  Probably a fine grained per inode count is better?
 */
#define no_uprobe_events()	RB_EMPTY_ROOT(&uprobes_tree)

static DEFINE_SPINLOCK(uprobes_treelock);	/* serialize rbtree access */

#define UPROBES_HASH_SZ	13
/* serialize uprobe->pending_list */
static struct mutex uprobes_mmap_mutex[UPROBES_HASH_SZ];
#define uprobes_mmap_hash(v)	(&uprobes_mmap_mutex[((unsigned long)(v)) % UPROBES_HASH_SZ])

DEFINE_STATIC_PERCPU_RWSEM(dup_mmap_sem);

/* Have a copy of original instruction */
#define UPROBE_COPY_INSN	0

struct uprobe {
	struct rb_node		rb_node;	/* node in the rb tree */
	refcount_t		ref;
	struct rw_semaphore	register_rwsem;
	struct rw_semaphore	consumer_rwsem;
	struct list_head	pending_list;
	struct uprobe_consumer	*consumers;
	struct inode		*inode;		/* Also hold a ref to inode */
	loff_t			offset;
	loff_t			ref_ctr_offset;
	unsigned long		flags;

	/*
	 * The generic code assumes that it has two members of unknown type
	 * owned by the arch-specific code:
	 *
	 * 	insn -	copy_insn() saves the original instruction here for
	 *		arch_uprobe_analyze_insn().
	 *
	 *	ixol -	potentially modified instruction to execute out of
	 *		line, copied to xol_area by xol_get_insn_slot().
	 */
	struct arch_uprobe	arch;
};

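/*
 * Pending reference-counter (SDT semaphore) updates: when the VMA that
 * holds a probe's counter is not yet mapped, the (uprobe, mm) pair is
 * parked on delayed_uprobe_list and applied once the mapping appears.
 */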
struct delayed_uprobe {
	struct list_head list;
	struct uprobe *uprobe;
	struct mm_struct *mm;
};

static DEFINE_MUTEX(delayed_uprobe_lock);
static LIST_HEAD(delayed_uprobe_list);

/*
 * Execute out of line area: anonymous executable mapping installed
 * by the probed task to execute the copy of the original instruction
 * mangled by set_swbp().
 *
 * On a breakpoint hit, thread contests for a slot.  It frees the
 * slot after singlestep. Currently a fixed number of slots are
 * allocated.
 */
struct xol_area {
	wait_queue_head_t 		wq;		/* if all slots are busy */
	atomic_t 			slot_count;	/* number of in-use slots */
	unsigned long 			*bitmap;	/* 0 = free slot */

	struct vm_special_mapping	xol_mapping;
	struct page 			*pages[2];
	/*
	 * We keep the vma's vm_start rather than a pointer to the vma
	 * itself.  The probed process or a naughty kernel module could make
	 * the vma go away, and we must handle that reasonably gracefully.
	 */
	unsigned long 			vaddr;		/* Page(s) of instruction slots */
};

/*
 * valid_vma: Verify if the specified vma is an executable vma
 * Relax restrictions while unregistering: vm_flags might have
 * changed after breakpoint was inserted.
 *	- is_register: indicates if we are in register context.
 *	- Return 1 if the specified virtual address is in an
 *	  executable vma.
 */
static bool valid_vma(struct vm_area_struct *vma, bool is_register)
{
	vm_flags_t flags = VM_HUGETLB | VM_MAYEXEC | VM_MAYSHARE;

	if (is_register)
		flags |= VM_WRITE;

	return vma->vm_file && (vma->vm_flags & flags) == VM_MAYEXEC;
}

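/*
 * Translate between a file offset and the user virtual address at which
 * that offset is mapped in @vma (and back).  For example, with
 * vm_start == 0x400000 and vm_pgoff == 0, file offset 0x1234 maps to
 * vaddr 0x401234.
 */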
static unsigned long offset_to_vaddr(struct vm_area_struct *vma, loff_t offset)
{
	return vma->vm_start + offset - ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
}

static loff_t vaddr_to_offset(struct vm_area_struct *vma, unsigned long vaddr)
{
	return ((loff_t)vma->vm_pgoff << PAGE_SHIFT) + (vaddr - vma->vm_start);
}

/**
 * __replace_page - replace page in vma by new page.
 * based on replace_page in mm/ksm.c
 *
 * @vma:      vma that holds the pte pointing to page
 * @addr:     address the old @page is mapped at
 * @old_page: the page we are replacing by new_page
 * @new_page: the modified page we replace page by
 *
 * If @new_page is NULL, only unmap @old_page.
 *
 * Returns 0 on success, negative error code otherwise.
 */
static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
				struct page *old_page, struct page *new_page)
{
	struct mm_struct *mm = vma->vm_mm;
	struct page_vma_mapped_walk pvmw = {
		.page = compound_head(old_page),
		.vma = vma,
		.address = addr,
	};
	int err;
	struct mmu_notifier_range range;

	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, addr,
				addr + PAGE_SIZE);

	if (new_page) {
		err = mem_cgroup_charge(new_page, vma->vm_mm, GFP_KERNEL);
		if (err)
			return err;
	}

	/* For try_to_free_swap() and munlock_vma_page() below */
	lock_page(old_page);

	mmu_notifier_invalidate_range_start(&range);
	err = -EAGAIN;
	if (!page_vma_mapped_walk(&pvmw))
		goto unlock;
	VM_BUG_ON_PAGE(addr != pvmw.address, old_page);

	if (new_page) {
		get_page(new_page);
		page_add_new_anon_rmap(new_page, vma, addr, false);
		lru_cache_add_active_or_unevictable(new_page, vma);
	} else
		/* no new page, just dec_mm_counter for old_page */
		dec_mm_counter(mm, MM_ANONPAGES);

	if (!PageAnon(old_page)) {
		dec_mm_counter(mm, mm_counter_file(old_page));
		inc_mm_counter(mm, MM_ANONPAGES);
	}

	flush_cache_page(vma, addr, pte_pfn(*pvmw.pte));
	ptep_clear_flush_notify(vma, addr, pvmw.pte);
	if (new_page)
		set_pte_at_notify(mm, addr, pvmw.pte,
				  mk_pte(new_page, vma->vm_page_prot));

	page_remove_rmap(old_page, false);
	if (!page_mapped(old_page))
		try_to_free_swap(old_page);
	page_vma_mapped_walk_done(&pvmw);

	if (vma->vm_flags & VM_LOCKED)
		munlock_vma_page(old_page);
	put_page(old_page);

	err = 0;
 unlock:
	mmu_notifier_invalidate_range_end(&range);
	unlock_page(old_page);
	return err;
}

/**
 * is_swbp_insn - check if instruction is breakpoint instruction.
 * @insn: instruction to be checked.
 * Default implementation of is_swbp_insn
 * Returns true if @insn is a breakpoint instruction.
 */
bool __weak is_swbp_insn(uprobe_opcode_t *insn)
{
	return *insn == UPROBE_SWBP_INSN;
}

/**
 * is_trap_insn - check if instruction is breakpoint instruction.
 * @insn: instruction to be checked.
 * Default implementation of is_trap_insn
 * Returns true if @insn is a breakpoint instruction.
 *
 * This function is needed for the case where an architecture has multiple
 * trap instructions (like powerpc).
 */
bool __weak is_trap_insn(uprobe_opcode_t *insn)
{
	return is_swbp_insn(insn);
}

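/*
 * copy_from_page() and copy_to_page() below move @len bytes between a
 * kernel buffer and the given offset within @page via a temporary kmap;
 * callers ensure the copy does not cross a page boundary.
 */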
static void copy_from_page(struct page *page, unsigned long vaddr, void *dst, int len)
{
	void *kaddr = kmap_atomic(page);
	memcpy(dst, kaddr + (vaddr & ~PAGE_MASK), len);
	kunmap_atomic(kaddr);
}

static void copy_to_page(struct page *page, unsigned long vaddr, const void *src, int len)
{
	void *kaddr = kmap_atomic(page);
	memcpy(kaddr + (vaddr & ~PAGE_MASK), src, len);
	kunmap_atomic(kaddr);
}

static int verify_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t *new_opcode)
{
	uprobe_opcode_t old_opcode;
	bool is_swbp;

	/*
	 * Note: We only check if the old_opcode is UPROBE_SWBP_INSN here.
	 * We do not check if it is any other 'trap variant' which could
	 * be conditional trap instruction such as the one powerpc supports.
	 *
	 * The logic is that we do not care if the underlying instruction
	 * is a trap variant; uprobes always wins over any other (gdb)
	 * breakpoint.
	 */
	copy_from_page(page, vaddr, &old_opcode, UPROBE_SWBP_INSN_SIZE);
	is_swbp = is_swbp_insn(&old_opcode);

	if (is_swbp_insn(new_opcode)) {
		if (is_swbp)		/* register: already installed? */
			return 0;
	} else {
		if (!is_swbp)		/* unregister: was it changed by us? */
			return 0;
	}

	return 1;
}

static struct delayed_uprobe *
delayed_uprobe_check(struct uprobe *uprobe, struct mm_struct *mm)
{
	struct delayed_uprobe *du;

	list_for_each_entry(du, &delayed_uprobe_list, list)
		if (du->uprobe == uprobe && du->mm == mm)
			return du;
	return NULL;
}

static int delayed_uprobe_add(struct uprobe *uprobe, struct mm_struct *mm)
{
	struct delayed_uprobe *du;

	if (delayed_uprobe_check(uprobe, mm))
		return 0;

	du  = kzalloc(sizeof(*du), GFP_KERNEL);
	if (!du)
		return -ENOMEM;

	du->uprobe = uprobe;
	du->mm = mm;
	list_add(&du->list, &delayed_uprobe_list);
	return 0;
}

static void delayed_uprobe_delete(struct delayed_uprobe *du)
{
	if (WARN_ON(!du))
		return;
	list_del(&du->list);
	kfree(du);
}

static void delayed_uprobe_remove(struct uprobe *uprobe, struct mm_struct *mm)
{
	struct list_head *pos, *q;
	struct delayed_uprobe *du;

	if (!uprobe && !mm)
		return;

	list_for_each_safe(pos, q, &delayed_uprobe_list) {
		du = list_entry(pos, struct delayed_uprobe, list);

		if (uprobe && du->uprobe != uprobe)
			continue;
		if (mm && du->mm != mm)
			continue;

		delayed_uprobe_delete(du);
	}
}

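/*
 * A "ref_ctr" (SDT semaphore) VMA must be a private, writable file
 * mapping of the probed inode that covers uprobe->ref_ctr_offset.
 */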
static bool valid_ref_ctr_vma(struct uprobe *uprobe,
			      struct vm_area_struct *vma)
{
	unsigned long vaddr = offset_to_vaddr(vma, uprobe->ref_ctr_offset);

	return uprobe->ref_ctr_offset &&
		vma->vm_file &&
		file_inode(vma->vm_file) == uprobe->inode &&
		(vma->vm_flags & (VM_WRITE|VM_SHARED)) == VM_WRITE &&
		vma->vm_start <= vaddr &&
		vma->vm_end > vaddr;
}

static struct vm_area_struct *
find_ref_ctr_vma(struct uprobe *uprobe, struct mm_struct *mm)
{
	struct vm_area_struct *tmp;

	for (tmp = mm->mmap; tmp; tmp = tmp->vm_next)
		if (valid_ref_ctr_vma(uprobe, tmp))
			return tmp;

	return NULL;
}

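/*
 * Adjust the reference counter that lives at @vaddr in @mm by @d (+1/-1),
 * pinning the page with get_user_pages_remote() so the update reaches the
 * probed process from any context.
 */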
static int
__update_ref_ctr(struct mm_struct *mm, unsigned long vaddr, short d)
{
	void *kaddr;
	struct page *page;
	struct vm_area_struct *vma;
	int ret;
	short *ptr;

	if (!vaddr || !d)
		return -EINVAL;

	ret = get_user_pages_remote(NULL, mm, vaddr, 1,
			FOLL_WRITE, &page, &vma, NULL);
	if (unlikely(ret <= 0)) {
		/*
		 * We are asking for 1 page. If get_user_pages_remote() fails,
		 * it may return 0, in that case we have to return error.
		 */
		return ret == 0 ? -EBUSY : ret;
	}

	kaddr = kmap_atomic(page);
	ptr = kaddr + (vaddr & ~PAGE_MASK);

	if (unlikely(*ptr + d < 0)) {
		pr_warn("ref_ctr going negative. vaddr: 0x%lx, "
			"curr val: %d, delta: %d\n", vaddr, *ptr, d);
		ret = -EINVAL;
		goto out;
	}

	*ptr += d;
	ret = 0;
out:
	kunmap_atomic(kaddr);
	put_page(page);
	return ret;
}

static void update_ref_ctr_warn(struct uprobe *uprobe,
				struct mm_struct *mm, short d)
{
	pr_warn("ref_ctr %s failed for inode: 0x%lx offset: "
		"0x%llx ref_ctr_offset: 0x%llx of mm: 0x%pK\n",
		d > 0 ? "increment" : "decrement", uprobe->inode->i_ino,
		(unsigned long long) uprobe->offset,
		(unsigned long long) uprobe->ref_ctr_offset, mm);
}

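/*
 * Update the counter if its VMA is already mapped; otherwise park the
 * (uprobe, mm) pair on delayed_uprobe_list so the increment can be
 * applied once the counter page is mapped.
 */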
static int update_ref_ctr(struct uprobe *uprobe, struct mm_struct *mm,
			  short d)
{
	struct vm_area_struct *rc_vma;
	unsigned long rc_vaddr;
	int ret = 0;

	rc_vma = find_ref_ctr_vma(uprobe, mm);

	if (rc_vma) {
		rc_vaddr = offset_to_vaddr(rc_vma, uprobe->ref_ctr_offset);
		ret = __update_ref_ctr(mm, rc_vaddr, d);
		if (ret)
			update_ref_ctr_warn(uprobe, mm, d);

		if (d > 0)
			return ret;
	}

	mutex_lock(&delayed_uprobe_lock);
	if (d > 0)
		ret = delayed_uprobe_add(uprobe, mm);
	else
		delayed_uprobe_remove(uprobe, mm);
	mutex_unlock(&delayed_uprobe_lock);

	return ret;
}

/*
 * NOTE:
 * Expect the breakpoint instruction to be the smallest size instruction for
 * the architecture. If an arch has variable length instruction and the
 * breakpoint instruction is not of the smallest length instruction
 * supported by that architecture then we need to modify is_trap_at_addr and
 * uprobe_write_opcode accordingly. This would never be a problem for archs
 * that have fixed length instructions.
 *
 * uprobe_write_opcode - write the opcode at a given virtual address.
 * @mm: the probed process address space.
 * @vaddr: the virtual address to store the opcode.
 * @opcode: opcode to be written at @vaddr.
 *
 * Called with mm->mmap_lock held for write.
 * Return 0 (success) or a negative errno.
 */
int uprobe_write_opcode(struct arch_uprobe *auprobe, struct mm_struct *mm,
			unsigned long vaddr, uprobe_opcode_t opcode)
{
	struct uprobe *uprobe;
	struct page *old_page, *new_page;
	struct vm_area_struct *vma;
	int ret, is_register, ref_ctr_updated = 0;
	bool orig_page_huge = false;
	unsigned int gup_flags = FOLL_FORCE;

	is_register = is_swbp_insn(&opcode);
	uprobe = container_of(auprobe, struct uprobe, arch);

retry:
	if (is_register)
		gup_flags |= FOLL_SPLIT_PMD;
	/* Read the page with vaddr into memory */
	ret = get_user_pages_remote(NULL, mm, vaddr, 1, gup_flags,
				    &old_page, &vma, NULL);
	if (ret <= 0)
		return ret;

	ret = verify_opcode(old_page, vaddr, &opcode);
	if (ret <= 0)
		goto put_old;

	if (WARN(!is_register && PageCompound(old_page),
		 "uprobe unregister should never work on compound page\n")) {
		ret = -EINVAL;
		goto put_old;
	}

	/* We are going to replace instruction, update ref_ctr. */
	if (!ref_ctr_updated && uprobe->ref_ctr_offset) {
		ret = update_ref_ctr(uprobe, mm, is_register ? 1 : -1);
		if (ret)
			goto put_old;

		ref_ctr_updated = 1;
	}

	ret = 0;
	if (!is_register && !PageAnon(old_page))
		goto put_old;

	ret = anon_vma_prepare(vma);
	if (ret)
		goto put_old;

	ret = -ENOMEM;
	new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, vaddr);
	if (!new_page)
		goto put_old;

	__SetPageUptodate(new_page);
	copy_highpage(new_page, old_page);
	copy_to_page(new_page, vaddr, &opcode, UPROBE_SWBP_INSN_SIZE);

	if (!is_register) {
		struct page *orig_page;
		pgoff_t index;

		VM_BUG_ON_PAGE(!PageAnon(old_page), old_page);

		index = vaddr_to_offset(vma, vaddr & PAGE_MASK) >> PAGE_SHIFT;
		orig_page = find_get_page(vma->vm_file->f_inode->i_mapping,
					  index);

		if (orig_page) {
			if (PageUptodate(orig_page) &&
			    pages_identical(new_page, orig_page)) {
				/* let go new_page */
				put_page(new_page);
				new_page = NULL;

				if (PageCompound(orig_page))
					orig_page_huge = true;
			}
			put_page(orig_page);
		}
	}

	ret = __replace_page(vma, vaddr, old_page, new_page);
	if (new_page)
		put_page(new_page);
put_old:
	put_page(old_page);

	if (unlikely(ret == -EAGAIN))
		goto retry;

	/* Revert back reference counter if instruction update failed. */
	if (ret && is_register && ref_ctr_updated)
		update_ref_ctr(uprobe, mm, -1);

	/* try collapse pmd for compound page */
	if (!ret && orig_page_huge)
		collapse_pte_mapped_thp(mm, vaddr);

	return ret;
}

/**
 * set_swbp - store breakpoint at a given address.
 * @auprobe: arch specific probepoint information.
 * @mm: the probed process address space.
 * @vaddr: the virtual address to insert the opcode.
 *
 * For mm @mm, store the breakpoint instruction at @vaddr.
 * Return 0 (success) or a negative errno.
 */
int __weak set_swbp(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr)
{
	return uprobe_write_opcode(auprobe, mm, vaddr, UPROBE_SWBP_INSN);
}

/**
 * set_orig_insn - Restore the original instruction.
 * @mm: the probed process address space.
 * @auprobe: arch specific probepoint information.
 * @vaddr: the virtual address to insert the opcode.
 *
 * For mm @mm, restore the original opcode (opcode) at @vaddr.
 * Return 0 (success) or a negative errno.
 */
int __weak
set_orig_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr)
{
	return uprobe_write_opcode(auprobe, mm, vaddr,
			*(uprobe_opcode_t *)&auprobe->insn);
}

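/* Grab an extra reference on @uprobe; the caller pairs it with put_uprobe(). */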
static struct uprobe *get_uprobe(struct uprobe *uprobe)
{
	refcount_inc(&uprobe->ref);
	return uprobe;
}

static void put_uprobe(struct uprobe *uprobe)
{
	if (refcount_dec_and_test(&uprobe->ref)) {
		/*
		 * If application munmap(exec_vma) before uprobe_unregister()
		 * gets called, we don't get a chance to remove uprobe from
		 * delayed_uprobe_list from remove_breakpoint(). Do it here.
		 */
		mutex_lock(&delayed_uprobe_lock);
		delayed_uprobe_remove(uprobe, NULL);
		mutex_unlock(&delayed_uprobe_lock);
		kfree(uprobe);
	}
}

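/*
 * Total order on uprobes by (inode, offset); returns -1, 0 or 1 like a
 * comparison callback, used by the rbtree lookup and insert paths below.
 */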
static int match_uprobe(struct uprobe *l, struct uprobe *r)
{
	if (l->inode < r->inode)
		return -1;

	if (l->inode > r->inode)
		return 1;

	if (l->offset < r->offset)
		return -1;

	if (l->offset > r->offset)
		return 1;

	return 0;
}

static struct uprobe *__find_uprobe(struct inode *inode, loff_t offset)
{
	struct uprobe u = { .inode = inode, .offset = offset };
	struct rb_node *n = uprobes_tree.rb_node;
	struct uprobe *uprobe;
	int match;

	while (n) {
		uprobe = rb_entry(n, struct uprobe, rb_node);
		match = match_uprobe(&u, uprobe);
		if (!match)
			return get_uprobe(uprobe);

		if (match < 0)
			n = n->rb_left;
		else
			n = n->rb_right;
	}
	return NULL;
}

/*
 * Find a uprobe corresponding to a given inode:offset
 * Acquires uprobes_treelock
 */
static struct uprobe *find_uprobe(struct inode *inode, loff_t offset)
{
	struct uprobe *uprobe;

	spin_lock(&uprobes_treelock);
	uprobe = __find_uprobe(inode, offset);
	spin_unlock(&uprobes_treelock);

	return uprobe;
}

static struct uprobe *__insert_uprobe(struct uprobe *uprobe)
{
	struct rb_node **p = &uprobes_tree.rb_node;
	struct rb_node *parent = NULL;
	struct uprobe *u;
	int match;

	while (*p) {
		parent = *p;
		u = rb_entry(parent, struct uprobe, rb_node);
		match = match_uprobe(uprobe, u);
		if (!match)
			return get_uprobe(u);

		if (match < 0)
			p = &parent->rb_left;
		else
			p = &parent->rb_right;

	}

	u = NULL;
	rb_link_node(&uprobe->rb_node, parent, p);
	rb_insert_color(&uprobe->rb_node, &uprobes_tree);
	/* get access + creation ref */
	refcount_set(&uprobe->ref, 2);

	return u;
}

/*
 * Acquire uprobes_treelock.
 * Matching uprobe already exists in rbtree;
 *	increment (access refcount) and return the matching uprobe.
 *
 * No matching uprobe; insert the uprobe in rb_tree;
 *	get a double refcount (access + creation) and return NULL.
 */
static struct uprobe *insert_uprobe(struct uprobe *uprobe)
{
	struct uprobe *u;

	spin_lock(&uprobes_treelock);
	u = __insert_uprobe(uprobe);
	spin_unlock(&uprobes_treelock);

	return u;
}

static void
ref_ctr_mismatch_warn(struct uprobe *cur_uprobe, struct uprobe *uprobe)
{
	pr_warn("ref_ctr_offset mismatch. inode: 0x%lx offset: 0x%llx "
		"ref_ctr_offset(old): 0x%llx ref_ctr_offset(new): 0x%llx\n",
		uprobe->inode->i_ino, (unsigned long long) uprobe->offset,
		(unsigned long long) cur_uprobe->ref_ctr_offset,
		(unsigned long long) uprobe->ref_ctr_offset);
}

static struct uprobe *alloc_uprobe(struct inode *inode, loff_t offset,
				   loff_t ref_ctr_offset)
{
	struct uprobe *uprobe, *cur_uprobe;

	uprobe = kzalloc(sizeof(struct uprobe), GFP_KERNEL);
	if (!uprobe)
		return NULL;

	uprobe->inode = inode;
	uprobe->offset = offset;
	uprobe->ref_ctr_offset = ref_ctr_offset;
	init_rwsem(&uprobe->register_rwsem);
	init_rwsem(&uprobe->consumer_rwsem);

	/* add to uprobes_tree, sorted on inode:offset */
	cur_uprobe = insert_uprobe(uprobe);
	/* a uprobe exists for this inode:offset combination */
	if (cur_uprobe) {
		if (cur_uprobe->ref_ctr_offset != uprobe->ref_ctr_offset) {
			ref_ctr_mismatch_warn(cur_uprobe, uprobe);
			put_uprobe(cur_uprobe);
			kfree(uprobe);
			return ERR_PTR(-EINVAL);
		}
		kfree(uprobe);
		uprobe = cur_uprobe;
	}

	return uprobe;
}

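/* Add @uc to @uprobe's consumer list; called with register_rwsem held. */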
static void consumer_add(struct uprobe *uprobe, struct uprobe_consumer *uc)
{
	down_write(&uprobe->consumer_rwsem);
	uc->next = uprobe->consumers;
	uprobe->consumers = uc;
	up_write(&uprobe->consumer_rwsem);
}

/*
 * For uprobe @uprobe, delete the consumer @uc.
 * Return true if the @uc is deleted successfully
 * or return false.
 */
static bool consumer_del(struct uprobe *uprobe, struct uprobe_consumer *uc)
{
	struct uprobe_consumer **con;
	bool ret = false;

	down_write(&uprobe->consumer_rwsem);
	for (con = &uprobe->consumers; *con; con = &(*con)->next) {
		if (*con == uc) {
			*con = uc->next;
			ret = true;
			break;
		}
	}
	up_write(&uprobe->consumer_rwsem);

	return ret;
}

static int __copy_insn(struct address_space *mapping, struct file *filp,
			void *insn, int nbytes, loff_t offset)
{
	struct page *page;
	/*
	 * Ensure that the page that has the original instruction is populated
	 * and in page-cache. If ->readpage == NULL it must be shmem_mapping(),
	 * see uprobe_register().
	 */
	if (mapping->a_ops->readpage)
		page = read_mapping_page(mapping, offset >> PAGE_SHIFT, filp);
	else
		page = shmem_read_mapping_page(mapping, offset >> PAGE_SHIFT);
	if (IS_ERR(page))
		return PTR_ERR(page);

	copy_from_page(page, offset, insn, nbytes);
	put_page(page);

	return 0;
}

static int copy_insn(struct uprobe *uprobe, struct file *filp)
{
	struct address_space *mapping = uprobe->inode->i_mapping;
	loff_t offs = uprobe->offset;
	void *insn = &uprobe->arch.insn;
	int size = sizeof(uprobe->arch.insn);
	int len, err = -EIO;

	/* Copy only available bytes, -EIO if nothing was read */
	do {
		if (offs >= i_size_read(uprobe->inode))
			break;

		len = min_t(int, size, PAGE_SIZE - (offs & ~PAGE_MASK));
		err = __copy_insn(mapping, filp, insn, len, offs);
		if (err)
			break;

		insn += len;
		offs += len;
		size -= len;
	} while (size);

	return err;
}

static int prepare_uprobe(struct uprobe *uprobe, struct file *file,
				struct mm_struct *mm, unsigned long vaddr)
{
	int ret = 0;

	if (test_bit(UPROBE_COPY_INSN, &uprobe->flags))
		return ret;

	/* TODO: move this into _register, until then we abuse this sem. */
	down_write(&uprobe->consumer_rwsem);
	if (test_bit(UPROBE_COPY_INSN, &uprobe->flags))
		goto out;

	ret = copy_insn(uprobe, file);
	if (ret)
		goto out;

	ret = -ENOTSUPP;
	if (is_trap_insn((uprobe_opcode_t *)&uprobe->arch.insn))
		goto out;

	ret = arch_uprobe_analyze_insn(&uprobe->arch, mm, vaddr);
	if (ret)
		goto out;

	smp_wmb(); /* pairs with the smp_rmb() in handle_swbp() */
	set_bit(UPROBE_COPY_INSN, &uprobe->flags);

 out:
	up_write(&uprobe->consumer_rwsem);

	return ret;
}

static inline bool consumer_filter(struct uprobe_consumer *uc,
				   enum uprobe_filter_ctx ctx, struct mm_struct *mm)
{
	return !uc->filter || uc->filter(uc, ctx, mm);
}

static bool filter_chain(struct uprobe *uprobe,
			 enum uprobe_filter_ctx ctx, struct mm_struct *mm)
{
	struct uprobe_consumer *uc;
	bool ret = false;

	down_read(&uprobe->consumer_rwsem);
	for (uc = uprobe->consumers; uc; uc = uc->next) {
		ret = consumer_filter(uc, ctx, mm);
		if (ret)
			break;
	}
	up_read(&uprobe->consumer_rwsem);

	return ret;
}

static int
install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm,
			struct vm_area_struct *vma, unsigned long vaddr)
{
	bool first_uprobe;
	int ret;

	ret = prepare_uprobe(uprobe, vma->vm_file, mm, vaddr);
	if (ret)
		return ret;

	/*
	 * set MMF_HAS_UPROBES in advance for uprobe_pre_sstep_notifier(),
	 * the task can hit this breakpoint right after __replace_page().
	 */
	first_uprobe = !test_bit(MMF_HAS_UPROBES, &mm->flags);
	if (first_uprobe)
		set_bit(MMF_HAS_UPROBES, &mm->flags);

	ret = set_swbp(&uprobe->arch, mm, vaddr);
	if (!ret)
		clear_bit(MMF_RECALC_UPROBES, &mm->flags);
	else if (first_uprobe)
		clear_bit(MMF_HAS_UPROBES, &mm->flags);

	return ret;
}

static int
remove_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, unsigned long vaddr)
{
	set_bit(MMF_RECALC_UPROBES, &mm->flags);
	return set_orig_insn(&uprobe->arch, mm, vaddr);
}

static inline bool uprobe_is_active(struct uprobe *uprobe)
{
	return !RB_EMPTY_NODE(&uprobe->rb_node);
}
/*
 * There could be threads that have already hit the breakpoint. They
 * will recheck the current insn and restart if find_uprobe() fails.
 * See find_active_uprobe().
 */
static void delete_uprobe(struct uprobe *uprobe)
{
	if (WARN_ON(!uprobe_is_active(uprobe)))
		return;

	spin_lock(&uprobes_treelock);
	rb_erase(&uprobe->rb_node, &uprobes_tree);
	spin_unlock(&uprobes_treelock);
	RB_CLEAR_NODE(&uprobe->rb_node); /* for uprobe_is_active() */
	put_uprobe(uprobe);
}

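/*
 * Snapshot of one mm that currently maps the probed range; chained up by
 * build_map_info() and consumed by register_for_each_vma().
 */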
struct map_info {
	struct map_info *next;
	struct mm_struct *mm;
	unsigned long vaddr;
};

static inline struct map_info *free_map_info(struct map_info *info)
{
	struct map_info *next = info->next;
	kfree(info);
	return next;
}

static struct map_info *
build_map_info(struct address_space *mapping, loff_t offset, bool is_register)
{
	unsigned long pgoff = offset >> PAGE_SHIFT;
	struct vm_area_struct *vma;
	struct map_info *curr = NULL;
	struct map_info *prev = NULL;
	struct map_info *info;
	int more = 0;

 again:
	i_mmap_lock_read(mapping);
	vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
		if (!valid_vma(vma, is_register))
			continue;

		if (!prev && !more) {
			/*
			 * Needs GFP_NOWAIT to avoid i_mmap_rwsem recursion through
			 * reclaim. This is optimistic, no harm done if it fails.
			 */
			prev = kmalloc(sizeof(struct map_info),
					GFP_NOWAIT | __GFP_NOMEMALLOC | __GFP_NOWARN);
			if (prev)
				prev->next = NULL;
		}
		if (!prev) {
			more++;
			continue;
		}

		if (!mmget_not_zero(vma->vm_mm))
			continue;

		info = prev;
		prev = prev->next;
		info->next = curr;
		curr = info;

		info->mm = vma->vm_mm;
		info->vaddr = offset_to_vaddr(vma, offset);
	}
	i_mmap_unlock_read(mapping);

	if (!more)
		goto out;

	prev = curr;
	while (curr) {
		mmput(curr->mm);
		curr = curr->next;
	}

	do {
		info = kmalloc(sizeof(struct map_info), GFP_KERNEL);
		if (!info) {
			curr = ERR_PTR(-ENOMEM);
			goto out;
		}
		info->next = prev;
		prev = info;
	} while (--more);

	goto again;
 out:
	while (prev)
		prev = free_map_info(prev);
	return curr;
}

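/*
 * Walk every mm that maps uprobe->inode at uprobe->offset and install
 * (@new != NULL) or remove the breakpoint, serialized against fork()
 * via dup_mmap_sem.
 */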
static int
register_for_each_vma(struct uprobe *uprobe, struct uprobe_consumer *new)
{
	bool is_register = !!new;
	struct map_info *info;
	int err = 0;

	percpu_down_write(&dup_mmap_sem);
	info = build_map_info(uprobe->inode->i_mapping,
					uprobe->offset, is_register);
	if (IS_ERR(info)) {
		err = PTR_ERR(info);
		goto out;
	}

	while (info) {
		struct mm_struct *mm = info->mm;
		struct vm_area_struct *vma;

		if (err && is_register)
			goto free;

		mmap_write_lock(mm);
		vma = find_vma(mm, info->vaddr);
		if (!vma || !valid_vma(vma, is_register) ||
		    file_inode(vma->vm_file) != uprobe->inode)
			goto unlock;

		if (vma->vm_start > info->vaddr ||
		    vaddr_to_offset(vma, info->vaddr) != uprobe->offset)
			goto unlock;

		if (is_register) {
			/* consult only the "caller", new consumer. */
			if (consumer_filter(new,
					UPROBE_FILTER_REGISTER, mm))
				err = install_breakpoint(uprobe, mm, vma, info->vaddr);
		} else if (test_bit(MMF_HAS_UPROBES, &mm->flags)) {
			if (!filter_chain(uprobe,
					UPROBE_FILTER_UNREGISTER, mm))
				err |= remove_breakpoint(uprobe, mm, info->vaddr);
		}

 unlock:
		mmap_write_unlock(mm);
 free:
		mmput(mm);
		info = free_map_info(info);
	}
 out:
	percpu_up_write(&dup_mmap_sem);
	return err;
}

static void
__uprobe_unregister(struct uprobe *uprobe, struct uprobe_consumer *uc)
{
	int err;

	if (WARN_ON(!consumer_del(uprobe, uc)))
		return;

	err = register_for_each_vma(uprobe, NULL);
	/* TODO : cant unregister? schedule a worker thread */
	if (!uprobe->consumers && !err)
		delete_uprobe(uprobe);
}

/*
 * uprobe_unregister - unregister an already registered probe.
 * @inode: the file in which the probe has to be removed.
 * @offset: offset from the start of the file.
 * @uc: identify which probe if multiple probes are colocated.
 */
void uprobe_unregister(struct inode *inode, loff_t offset, struct uprobe_consumer *uc)
{
	struct uprobe *uprobe;

	uprobe = find_uprobe(inode, offset);
	if (WARN_ON(!uprobe))
		return;

	down_write(&uprobe->register_rwsem);
	__uprobe_unregister(uprobe, uc);
	up_write(&uprobe->register_rwsem);
	put_uprobe(uprobe);
}
EXPORT_SYMBOL_GPL(uprobe_unregister);

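/*
 * Illustrative consumer sketch (not part of this file; my_handler is a
 * hypothetical callback):
 *
 *	static int my_handler(struct uprobe_consumer *self, struct pt_regs *regs)
 *	{
 *		return 0;
 *	}
 *	static struct uprobe_consumer uc = { .handler = my_handler };
 *
 *	err = uprobe_register(inode, offset, &uc);
 *	...
 *	uprobe_unregister(inode, offset, &uc);
 */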
/*
 * __uprobe_register - register a probe
 * @inode: the file in which the probe has to be placed.
 * @offset: offset from the start of the file.
 * @uc: information on howto handle the probe..